├── .coveragerc ├── docs ├── source │ ├── images │ │ └── merge_time_series.png │ ├── _templates │ │ └── autosummary │ │ │ ├── base.rst │ │ │ ├── module.rst │ │ │ └── class.rst │ ├── api.rst │ ├── index.rst │ ├── install.rst │ ├── conf.py │ └── release.rst ├── Makefile └── make.bat ├── zcollection ├── tests │ ├── __init__.py │ ├── test_sync.py │ ├── fixture.py │ ├── test_mathematics.py │ ├── fs.py │ ├── test_expression.py │ ├── cluster.py │ ├── test_dask_utils.py │ ├── data.py │ ├── s3.py │ ├── test_compressed_array.py │ ├── test_meta.py │ └── test_fs_utils.py ├── view │ └── tests │ │ └── __init__.py ├── collection │ ├── tests │ │ └── __init__.py │ └── callable_objects.py ├── indexing │ ├── tests │ │ ├── __init__.py │ │ └── test_abc.py │ └── __init__.py ├── merging │ ├── tests │ │ ├── __init__.py │ │ ├── test_merging.py │ │ └── test_time_series.py │ ├── time_series.py │ └── __init__.py ├── variable │ ├── tests │ │ ├── __init__.py │ │ ├── test_abc.py │ │ ├── data.py │ │ ├── test_delayed_array.py │ │ └── test_array.py │ ├── __init__.py │ └── array.py ├── partitioning │ ├── tests │ │ ├── __init__.py │ │ ├── test_registry.py │ │ ├── data.py │ │ └── test_sequence.py │ ├── __init__.py │ ├── registry.py │ ├── sequence.py │ └── date.py ├── convenience │ ├── __init__.py │ ├── view.py │ └── collection.py ├── mathematics.py ├── __init__.py ├── expression.py ├── sync.py ├── representation.py ├── dask_utils.py ├── type_hints.py └── fs_utils.py ├── examples ├── README.rst ├── ex_indexing.py └── ex_view.py ├── .vscode └── settings.json ├── readthedocs.yml ├── .github └── workflows │ ├── pre-commit.yml │ ├── pypipublish.yaml │ └── ci.yaml ├── conda ├── environment.yml └── meta.yaml ├── pyproject.toml ├── setup.py ├── conftest.py ├── LICENSE ├── .pre-commit-config.yaml ├── README.rst ├── setup.cfg └── .gitignore /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */tests/* 4 | zcollection/typing.py 5 | */pytest* 6 | -------------------------------------------------------------------------------- /docs/source/images/merge_time_series.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CNES/zcollection/HEAD/docs/source/images/merge_time_series.png -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/base.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. auto{{ objtype }}:: {{ objname }} 6 | -------------------------------------------------------------------------------- /zcollection/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/view/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | -------------------------------------------------------------------------------- /zcollection/collection/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/indexing/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/merging/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/variable/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Example Gallery 2 | =============== 3 | 4 | This gallery of examples shows a variety of relatively small snippets or 5 | examples of tasks that can be done with the ``zcollection`` package. 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "." 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.formatting.provider": "yapf" 8 | } 9 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | builder: html 5 | configuration: docs/source/conf.py 6 | 7 | python: 8 | install: 9 | - path: . 
10 | method: setuptools 11 | 12 | conda: 13 | environment: conda/environment.yml 14 | 15 | build: 16 | os: ubuntu-20.04 17 | tools: 18 | python: mambaforge-4.10 19 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | 7 | jobs: 8 | pre-commit: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-python@v3 13 | with: 14 | python-version: 3.11 15 | - uses: pre-commit/action@v3.0.0 16 | -------------------------------------------------------------------------------- /conda/environment.yml: -------------------------------------------------------------------------------- 1 | name: ZCollection 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - dask-core 7 | - distributed 8 | - furo 9 | - numcodecs 10 | - pyarrow 11 | - pypandoc 12 | - pytest 13 | - requests 14 | - s3fs 15 | - setuptools-scm 16 | - sphinx-gallery 17 | - sphinx-inline-tabs 18 | - xarray 19 | - zarr>=2.11.0 20 | -------------------------------------------------------------------------------- /zcollection/tests/test_sync.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing the sync module. 7 | ======================== 8 | """ 9 | from .. import sync 10 | 11 | 12 | def test_no_sync(): 13 | """Test the no_sync class.""" 14 | touch = False 15 | with sync.NoSync() as _: 16 | touch = True 17 | assert touch 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "wheel", "setuptools_scm>=6.2"] 3 | 4 | [tool.setuptools_scm] 5 | write_to = "zcollection/version.py" 6 | write_to_template = ''' 7 | # Copyright (c) 2023 CNES 8 | # 9 | # All rights reserved. Use of this source code is governed by a 10 | # BSD-style license that can be found in the LICENSE file. 11 | """ 12 | Get software version information 13 | ================================ 14 | """ 15 | __version__ = "{version}" 16 | ''' 17 | -------------------------------------------------------------------------------- /zcollection/convenience/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Convenience functions 7 | ===================== 8 | """ 9 | from .collection import create_collection, open_collection 10 | from .view import create_view, open_view 11 | 12 | __all__ = ( 13 | 'create_collection', 14 | 'open_collection', 15 | 'create_view', 16 | 'open_view', 17 | ) 18 | -------------------------------------------------------------------------------- /zcollection/variable/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Variables of a dataset. 
7 | ======================= 8 | """ 9 | from ..meta import Attribute 10 | from .abc import Variable, new_variable 11 | from .array import Array 12 | from .delayed_array import DelayedArray 13 | 14 | __all__ = ('Attribute', 'Variable', 'Array', 'DelayedArray', 'new_variable') 15 | -------------------------------------------------------------------------------- /zcollection/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Indexing a Collection. 7 | ====================== 8 | """ 9 | import warnings 10 | 11 | try: 12 | from .abc import Indexer, QueryDict, Scalar 13 | __all__ = ('Indexer', 'QueryDict', 'Scalar') 14 | except ImportError: # pragma: no cover 15 | warnings.warn( 16 | 'Install PyArrow to use the indexing capabilities of zcollection.') 17 | -------------------------------------------------------------------------------- /zcollection/tests/fixture.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixtures for the tests. 7 | ======================= 8 | """ 9 | from typing import Literal 10 | 11 | import pytest 12 | 13 | 14 | @pytest.fixture 15 | def dask_arrays() -> Literal[True]: 16 | """Load the data in Dask arrays.""" 17 | return True 18 | 19 | 20 | @pytest.fixture 21 | def numpy_arrays() -> Literal[False]: 22 | """Load the data in NumPy arrays.""" 23 | return False 24 | -------------------------------------------------------------------------------- /zcollection/tests/test_mathematics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Mathematics testing. 7 | ==================== 8 | """ 9 | from .. import mathematics 10 | 11 | 12 | def test_prod(): 13 | """Test the product of an iterable.""" 14 | assert mathematics.prod([]) == 1 15 | assert mathematics.prod([1]) == 1 16 | assert mathematics.prod([1, 2, 3]) == 6 17 | assert mathematics.prod([1, 2, 3, 4, 5]) == 120 18 | assert mathematics.prod([1, 2, 3, 4, 5, 6, 7, 8, 9]) == 362880 19 | assert mathematics.prod([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == 3628800 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /zcollection/mathematics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Mathematical functions. 7 | ======================= 8 | """ 9 | from typing import Iterable 10 | import functools 11 | import operator 12 | 13 | 14 | def prod(iterable: Iterable) -> int: 15 | """Return the product of all elements in the given iterable. 16 | 17 | Args: 18 | iterable: An iterable containing numeric values. 19 | 20 | Returns: 21 | The product of all elements in the iterable. If the iterable is empty, 22 | returns 1. 23 | """ 24 | return functools.reduce(operator.mul, iterable, 1) 25 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | with: 14 | submodules: 'true' 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: "3.x" 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools setuptools-scm twine 23 | - name: Build and publish 24 | run: | 25 | echo "[pypi]" > ~/.pypirc 26 | echo "username = __token__" >> ~/.pypirc 27 | echo "password = ${{ secrets.PYPI_PASSWORD }}" >> ~/.pypirc 28 | python setup.py sdist 29 | twine upload dist/* 30 | -------------------------------------------------------------------------------- /zcollection/variable/tests/test_abc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing interface for the variable module 7 | ========================================= 8 | """ 9 | import numpy 10 | 11 | from ..abc import not_equal 12 | 13 | 14 | def test_variable_not_equal() -> None: 15 | """Test if two values are different.""" 16 | assert not_equal(1, 2) is True 17 | assert not_equal(1, 1) is False 18 | assert not_equal(1, '1') is True 19 | assert not_equal(1, numpy.nan) is True 20 | assert not_equal(numpy.nan, numpy.nan) is False 21 | assert not_equal(numpy.nan, 1) is True 22 | assert not_equal(numpy.datetime64('NaT'), numpy.datetime64('NaT')) is False 23 | assert not_equal(numpy.datetime64('NaT'), 1) is True 24 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "zcollection" %} 2 | {% set version = "0.0" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | path: .. 10 | 11 | build: 12 | number: 0 13 | script: {{ PYTHON }} -m pip install . 
-vv --use-feature=in-tree-build 14 | skip: true # [linux32 or win32 or py<36] 15 | 16 | requirements: 17 | build: 18 | - python 19 | run: 20 | - dask 21 | - fsspec 22 | - numcodecs 23 | - numpy >=1.20 24 | - python 25 | - xarray 26 | - zarr 27 | test: 28 | requires: 29 | - pytest 30 | commands: 31 | - pytest --pyargs zcollection 32 | 33 | about: 34 | home: https://github.com/CNES/zcollection 35 | license: Proprietary 36 | license_family: Proprietary 37 | summary: 'Handle a collection of Zarr groups' 38 | doc_url: https://zcollection.readthedocs.io/en/latest/ 39 | dev_url: https://github.com/CNES/zcollection 40 | -------------------------------------------------------------------------------- /zcollection/partitioning/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Partitioning scheme. 7 | ==================== 8 | 9 | Entry point of the implemented partitioning schemes. 10 | 11 | * :py:class:`Sequence `: 12 | Partitioning a sequence of variables. 13 | * :py:class:`Date `: Partitioning a 14 | sequence of dates. 15 | 16 | .. class:: Partitioning 17 | 18 | Alias for :class:`zcollection.partitioning.abc.Partitioning`. 19 | """ 20 | from .abc import Partitioning 21 | from .date import Date 22 | from .registry import get_codecs, register_codec 23 | from .sequence import Sequence 24 | 25 | register_codec(Date) 26 | register_codec(Sequence) 27 | 28 | __all__ = ('Partitioning', 'Date', 'Sequence', 'get_codecs') 29 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """This script is the entry point for building, distributing and installing 6 | this module using distutils/setuptools.""" 7 | import pathlib 8 | 9 | import setuptools 10 | import setuptools.command.sdist 11 | 12 | # Working directory 13 | WORKING_DIRECTORY = pathlib.Path(__file__).parent.absolute() 14 | 15 | 16 | class SDist(setuptools.command.sdist.sdist): 17 | """Custom sdist command that copies the pytest configuration file into the 18 | package.""" 19 | user_options = setuptools.command.sdist.sdist.user_options 20 | 21 | def run(self): 22 | """Carry out the action.""" 23 | source = WORKING_DIRECTORY.joinpath('conftest.py') 24 | target = WORKING_DIRECTORY.joinpath('zcollection', 'conftest.py') 25 | source.rename(target) 26 | try: 27 | super().run() 28 | finally: 29 | target.rename(source) 30 | 31 | 32 | setuptools.setup(cmdclass={'sdist': SDist}) 33 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | {% block attributes -%} 5 | {% if attributes %} 6 | .. rubric:: {{ ('Modules Attributes') }} 7 | .. autosummary:: 8 | :toctree: 9 | {% for item in attributes %} 10 | {{ item }} 11 | {%- endfor %} 12 | {% endif -%} 13 | {% endblock -%} 14 | {% block classes -%} 15 | {% if classes %} 16 | .. rubric:: {{ ('Classes') }} 17 | .. autosummary:: 18 | :toctree: 19 | {% for item in classes %} 20 | {{ item }} 21 | {%- endfor %} 22 | {% endif -%} 23 | {% endblock -%} 24 | {% block exceptions -%} 25 | {% if exceptions %} 26 | .. rubric:: {{ ('Exceptions') }} 27 | .. autosummary:: 28 | :toctree: 29 | {% for item in exceptions %} 30 | {{ item }} 31 | {%- endfor %} 32 | {% endif -%} 33 | {% endblock -%} 34 | {% block functions -%} 35 | {% if functions %} 36 | .. rubric:: {{ ('Functions') }} 37 | .. autosummary:: 38 | :toctree: 39 | {% for item in functions %} 40 | {{ item }} 41 | {%- endfor %} 42 | {% endif -%} 43 | {% endblock -%} 44 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | ========== 7 | Test setup 8 | ========== 9 | """ 10 | 11 | 12 | def pytest_addoption(parser) -> None: 13 | """Add command line options to pytest.""" 14 | parser.addoption( 15 | '--s3', 16 | action='store_true', 17 | default=False, 18 | help='Enable tests on the local S3 server driven by minio. ' 19 | '(default: False)') 20 | parser.addoption( 21 | '--memory', 22 | action='store_true', 23 | default=False, 24 | help='Use a file system in memory instead of the local file system. ' 25 | '(default: False)') 26 | parser.addoption( 27 | '--threads_per_worker', 28 | action='store', 29 | default=None, 30 | type=int, 31 | help='Number of threads for each worker Dask. (default: the number of ' 32 | 'logical cores of the target platform).') 33 | parser.addoption( 34 | '--n_workers', 35 | action='store', 36 | default=None, 37 | type=int, 38 | help='Number of core for each worker Dask. (default: the number of ' 39 | 'cores of the target platform).') 40 | parser.addoption( 41 | '--processes', 42 | action='store_true', 43 | default=False, 44 | help='Whether to use processes or threads for Dask. 
(default: False)') 45 | -------------------------------------------------------------------------------- /zcollection/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Handle a collection of Zarr groups. 7 | =================================== 8 | """ 9 | from . import merging, partitioning 10 | from .collection import Collection 11 | from .collection.abc import Indexer, PartitionFilter, PartitionFilterCallback 12 | from .collection.callable_objects import ( 13 | MapCallable, 14 | PartitionCallable, 15 | UpdateCallable, 16 | ) 17 | from .convenience import ( 18 | create_collection, 19 | create_view, 20 | open_collection, 21 | open_view, 22 | ) 23 | from .dataset import Dataset, Expression 24 | from .meta import Attribute 25 | from .variable import Array, DelayedArray, Variable 26 | from .version import __version__ 27 | from .view import View, ViewReference, ViewUpdateCallable 28 | 29 | __all__ = ( 30 | '__version__', 31 | 'Array', 32 | 'Attribute', 33 | 'Collection', 34 | 'create_collection', 35 | 'create_view', 36 | 'Dataset', 37 | 'DelayedArray', 38 | 'Expression', 39 | 'Indexer', 40 | 'MapCallable', 41 | 'merging', 42 | 'open_collection', 43 | 'open_view', 44 | 'PartitionCallable', 45 | 'PartitionFilter', 46 | 'PartitionFilterCallback', 47 | 'partitioning', 48 | 'UpdateCallable', 49 | 'Variable', 50 | 'version', 51 | 'View', 52 | 'ViewReference', 53 | 'ViewUpdateCallable', 54 | ) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, CNES 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 2 2 | 3 | API Documentation 4 | ################# 5 | 6 | Partitioning 7 | ============ 8 | 9 | Handles the partitioning of the collection. 10 | 11 | .. autosummary:: 12 | :toctree: _generated/ 13 | 14 | zcollection.partitioning 15 | zcollection.partitioning.abc 16 | zcollection.partitioning.date 17 | zcollection.partitioning.registry 18 | zcollection.partitioning.sequence 19 | 20 | .. _merging_datasets: 21 | 22 | Merging of datasets 23 | =================== 24 | 25 | Merging of existing datasets in a partition. 26 | 27 | .. autosummary:: 28 | :toctree: _generated/ 29 | 30 | zcollection.merging 31 | zcollection.merging.time_series 32 | zcollection.merging.period 33 | 34 | Variable 35 | ======== 36 | 37 | Variables handled by the datasets. These objects manage access to the data 38 | stored in the collection. 39 | 40 | .. autosummary:: 41 | :toctree: _generated/ 42 | 43 | zcollection.variable.abc 44 | zcollection.variable.array 45 | zcollection.variable.delayed_array 46 | 47 | Collection 48 | ========== 49 | 50 | .. autosummary:: 51 | :toctree: _generated/ 52 | 53 | zcollection.collection 54 | zcollection.dask_utils 55 | zcollection.dataset 56 | zcollection.expression 57 | zcollection.fs_utils 58 | zcollection.meta 59 | zcollection.sync 60 | zcollection.type_hints 61 | zcollection.view 62 | 63 | Indexing 64 | ======== 65 | 66 | .. autosummary:: 67 | :toctree: _generated/ 68 | 69 | zcollection.indexing 70 | zcollection.indexing.abc 71 | 72 | Convenience functions 73 | ===================== 74 | 75 | .. autosummary:: 76 | :toctree: _generated/ 77 | 78 | zcollection.create_collection 79 | zcollection.create_view 80 | zcollection.open_collection 81 | zcollection.open_view 82 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/test_registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Test the registry module. 7 | ========================= 8 | """ 9 | from typing import Any 10 | 11 | import pytest 12 | 13 | from .. 
import registry 14 | 15 | 16 | def test_get_codecs() -> None: 17 | """Test the get_codecs function.""" 18 | with pytest.raises(ValueError): 19 | registry.get_codecs({'ID': 'foo'}) 20 | 21 | with pytest.raises(ValueError): 22 | registry.get_codecs({'id': 'foo'}) 23 | 24 | 25 | class MyCodec: 26 | """A dummy codec.""" 27 | ID = 'foo' 28 | 29 | __slots__ = ('attribute', ) 30 | 31 | def __init__(self, attribute) -> None: 32 | self.attribute: Any = attribute 33 | 34 | def get_config(self) -> dict: 35 | """Returns the configuration of the codec.""" 36 | return {'id': self.ID, 'attribute': self.attribute} 37 | 38 | @classmethod 39 | def from_config(cls, config: dict) -> 'MyCodec': 40 | """Creates an instance from the given configuration.""" 41 | return cls(config['attribute']) 42 | 43 | 44 | def test_register_codec() -> None: 45 | """Test the register_codec function.""" 46 | registry.register_codec(MyCodec, codec_id='foo') # type: ignore[arg-type] 47 | 48 | instance = MyCodec(12) 49 | 50 | other = registry.get_codecs(instance.get_config()) 51 | assert other.attribute == instance.attribute # type: ignore 52 | assert isinstance(other, MyCodec) 53 | 54 | with pytest.raises(ValueError): 55 | registry.register_codec( 56 | MyCodec, # type: ignore[arg-type] 57 | codec_id='foo') 58 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Make test data. 7 | =============== 8 | """ 9 | import numpy 10 | import xarray 11 | 12 | START_DATE = numpy.datetime64('2000-01-01', 'ns') 13 | END_DATE = numpy.datetime64('2000-06-30', 'ns') 14 | DELTA = numpy.timedelta64(72, 'h') 15 | 16 | 17 | def create_test_sequence(repeatability, number_of_measures, 18 | number_of_cycles) -> xarray.Dataset: 19 | """Creation of a data set for testing purposes.""" 20 | pass_number = numpy.hstack([ 21 | numpy.tile(ix + 1, number_of_measures) for j in range(number_of_cycles) 22 | for ix in range(repeatability) 23 | ]) 24 | cycle_number = numpy.hstack([ 25 | numpy.tile(ix + 1, repeatability * number_of_measures) 26 | for ix in range(number_of_cycles) 27 | ]) 28 | delta = numpy.timedelta64(24 // repeatability // 2, 'h') 29 | time: numpy.ndarray = numpy.arange(START_DATE, 30 | START_DATE + len(cycle_number) * delta, 31 | delta) 32 | observation = numpy.random.rand(cycle_number.size) # type: ignore 33 | return xarray.Dataset({ 34 | 'time': 35 | xarray.DataArray( 36 | time, 37 | dims=('num_lines', ), 38 | ), 39 | 'cycle_number': 40 | xarray.DataArray( 41 | cycle_number, 42 | dims=('num_lines', ), 43 | ), 44 | 'pass_number': 45 | xarray.DataArray( 46 | pass_number, 47 | dims=('num_lines', ), 48 | ), 49 | 'observation': 50 | xarray.DataArray( 51 | observation, 52 | dims=('num_lines', ), 53 | ), 54 | }) 55 | -------------------------------------------------------------------------------- /zcollection/expression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """ 6 | Handles the partition selection expressions 7 | =========================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, ClassVar 12 | import ast 13 | import dataclasses 14 | import types 15 | 16 | 17 | @dataclasses.dataclass 18 | class Expression: 19 | """Partitioning expressions. 20 | 21 | Args: 22 | expression: The expression to be evaluated 23 | Raises: 24 | NameError: If a variable is not defined. 25 | Example: 26 | >>> expr = Expression("year==2000 and month==1 and day in range(1, 12)") 27 | """ 28 | #: Compiled expression to be evaluated 29 | code: types.CodeType 30 | 31 | #: Known data members 32 | __slots__: tuple[str, ...] = ('code', ) 33 | 34 | #: The builtins that are allowed in the expression. 35 | BUILTINS: ClassVar[dict[str, Any]] = {'range': range} 36 | 37 | def __init__(self, expression: str) -> None: 38 | self.code = compile(ast.parse(expression, mode='eval'), ' ', 'eval') 39 | 40 | def __call__(self, variables: dict[str, Any]) -> Any: 41 | try: 42 | __locals: dict[str, Any] = { 43 | name: variables[name] 44 | for name in self.code.co_names if name not in self.BUILTINS 45 | } 46 | # pylint: disable=eval-used 47 | # The eval function is used here to evaluate a simple expression. 48 | # The only builtin functions allowed is the range function. 49 | return eval(self.code, {'__builtins__': self.BUILTINS}, __locals) 50 | # pylint: enable=eval-used 51 | except KeyError as err: 52 | raise NameError(f'name {err!s} is not defined') from err 53 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-docstring-first 7 | - id: check-json 8 | - id: check-toml 9 | - id: debug-statements 10 | - id: end-of-file-fixer 11 | - id: double-quote-string-fixer 12 | - id: mixed-line-ending 13 | args: [--fix=lf] 14 | exclude: docs/make.bat 15 | - id: trailing-whitespace 16 | exclude: conda/meta.yaml 17 | - repo: https://github.com/asottile/pyupgrade 18 | rev: "v3.15.0" 19 | hooks: 20 | - id: pyupgrade 21 | args: [--py38-plus] 22 | - repo: https://github.com/PyCQA/isort 23 | rev: 5.13.2 24 | hooks: 25 | - id: isort 26 | - repo: https://github.com/PyCQA/flake8 27 | rev: 7.0.0 28 | hooks: 29 | - id: flake8 30 | exclude: tests 31 | - repo: https://github.com/pre-commit/pygrep-hooks 32 | rev: "v1.10.0" 33 | hooks: 34 | - id: python-check-blanket-noqa 35 | - id: python-no-log-warn 36 | - id: rst-backticks 37 | - id: rst-directive-colons 38 | - id: rst-inline-touching-normal 39 | - repo: https://github.com/pre-commit/mirrors-yapf 40 | rev: v0.32.0 41 | hooks: 42 | - id: yapf 43 | additional_dependencies: 44 | - toml 45 | - repo: https://github.com/myint/docformatter 46 | rev: "v1.7.5" 47 | hooks: 48 | - id: docformatter 49 | - repo: https://github.com/codespell-project/codespell 50 | rev: "v2.2.6" 51 | hooks: 52 | - id: codespell 53 | - repo: https://github.com/pre-commit/mirrors-mypy 54 | rev: v1.8.0 55 | hooks: 56 | - id: mypy 57 | exclude: docs 58 | additional_dependencies: 59 | # Type stubs 60 | - types-requests 61 | - types-setuptools 62 | # Typed libraries 63 | - dask 64 | - numpy 65 | - pandas 66 | - pyarrow 67 | -------------------------------------------------------------------------------- /zcollection/partitioning/registry.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Registers the partitioning codecs. 7 | ================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any 12 | 13 | from . import abc 14 | 15 | #: A registry of all available partitioning codecs. 16 | CODEC_REGISTRY: dict[str, type[abc.Partitioning]] = {} 17 | 18 | 19 | def get_codecs(config: dict[str, Any]) -> abc.Partitioning: 20 | """Get the partitioning scheme for the given configuration. 21 | 22 | Args: 23 | config: A dictionary of the partitioning configuration parameters. 24 | 25 | Returns: 26 | The partitioning scheme. 27 | 28 | Raises: 29 | ValueError: If the requested codec is not defined. 30 | """ 31 | codec_id: Any | None = config.pop('id', None) 32 | if codec_id is None: 33 | raise ValueError(f'codec not available: {codec_id!r}') 34 | cls: type[abc.Partitioning] | None = CODEC_REGISTRY.get(codec_id, None) 35 | if cls is None: 36 | raise ValueError(f'codec not available: {codec_id!r}') 37 | return cls.from_config(config) 38 | 39 | 40 | def register_codec(cls: type[abc.Partitioning], 41 | *, 42 | codec_id: str | None = None) -> None: 43 | """Register a partitioning scheme. 44 | 45 | Args: 46 | cls: The partitioning scheme class. 47 | codec_id: The partitioning scheme identifier. 48 | 49 | Raises: 50 | ValueError: If the codec identifier is already registered. 51 | """ 52 | if codec_id is None: 53 | codec_id = cls.ID 54 | if codec_id is None: 55 | raise ValueError('codec identifier not defined') 56 | if codec_id in CODEC_REGISTRY: 57 | raise ValueError(f'codec already registered: {codec_id!r}') 58 | CODEC_REGISTRY[codec_id] = cls 59 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ZCollection 2 | =========== 3 | 4 | This project is a Python library allowing manipulating data partitioned into a 5 | **collection** of `Zarr `_ groups. 6 | 7 | This collection allows dividing a dataset into several partitions to facilitate 8 | acquisitions or updates made from new products. Possible data partitioning is: 9 | by **date** (hour, day, month, etc.) or by **sequence**. 10 | 11 | A collection partitioned by date, with a monthly resolution, may look like on 12 | the disk: 13 | 14 | .. code-block:: text 15 | 16 | collection/ 17 | ├── year=2022 18 | │ ├── month=01/ 19 | │ │ ├── time/ 20 | │ │ │ ├── 0.0 21 | │ │ │ ├── .zarray 22 | │ │ │ └── .zattrs 23 | │ │ ├── var1/ 24 | │ │ │ ├── 0.0 25 | │ │ │ ├── .zarray 26 | │ │ │ └── .zattrs 27 | │ │ ├── .zattrs 28 | │ │ ├── .zgroup 29 | │ │ └── .zmetadata 30 | │ └── month=02/ 31 | │ ├── time/ 32 | │ │ ├── 0.0 33 | │ │ ├── .zarray 34 | │ │ └── .zattrs 35 | │ ├── var1/ 36 | │ │ ├── 0.0 37 | │ │ ├── .zarray 38 | │ │ └── .zattrs 39 | │ ├── .zattrs 40 | │ ├── .zgroup 41 | │ └── .zmetadata 42 | └── .zcollection 43 | 44 | Partition updates can be set to overwrite existing data with new ones or to 45 | update them using different **strategies**. 46 | 47 | The `Dask library `_ handles the data to scale the treatments 48 | quickly. 49 | 50 | It is possible to create views on a reference collection, to add and modify 51 | variables contained in a reference collection, accessible in reading only. 
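Creating and reopening such a view only requires the reference collection and a
storage path. A minimal sketch using the convenience functions (the paths below
are placeholders):

.. code-block:: python

    import zcollection

    # Reference the read-only collection the view is built upon.
    view_ref = zcollection.ViewReference(
        partition_base_dir="/data/mycollection")

    # Create the view, then reopen it later to add or modify variables.
    view = zcollection.create_view("/home/user/myview", view_ref)
    view = zcollection.open_view("/home/user/myview")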
52 | 53 | This library can store data on POSIX, S3, or any other file system supported by 54 | the Python library `fsspec 55 | `_. Note, however, only POSIX 56 | and S3 file systems have been tested. 57 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | author = CNES/CLS 3 | author_email = fbriol@gmail.com 4 | classifiers = 5 | Development Status :: 5 - Production/Stable 6 | Intended Audience :: Science/Research 7 | Natural Language :: English 8 | Operating System :: MacOS 9 | Operating System :: Microsoft :: Windows 10 | Operating System :: POSIX 11 | Programming Language :: Python :: 3.8 12 | Programming Language :: Python :: 3.9 13 | Programming Language :: Python :: 3.10 14 | Programming Language :: Python :: 3.11 15 | Programming Language :: Python :: 3.12 16 | Topic :: Scientific/Engineering :: Physics 17 | description = Zarr Collection 18 | keywords = zarr, collection, xarray, dask 19 | license = BSD License 20 | license_files = LICENSE 21 | long_description = file: README.rst 22 | long_description_content_type = text/x-rst 23 | name = zcollection 24 | url = https://github.com/CNES/zcollection 25 | version = attr: zcollection.version.__version__ 26 | 27 | [options] 28 | include_package_data = True 29 | install_requires = 30 | dask >= 2022.8.0 31 | distributed 32 | fasteners 33 | fsspec 34 | numcodecs 35 | numpy>=1.20 36 | pandas 37 | xarray 38 | zarr>=2.11 39 | package_dir = 40 | = . 41 | packages = find: 42 | python_requires = >=3.8 43 | zip_safe = False 44 | 45 | [options.extras_require] 46 | test = 47 | pytest 48 | pytest-cov 49 | 50 | [options.package_data] 51 | * = *.json 52 | 53 | [flake8] 54 | exclude = docs,tests 55 | max-line-length = 80 56 | ignore = 57 | # Assigning lambda expression 58 | E731 59 | # Ambiguous variable names 60 | E741 61 | # line break before binary operator 62 | W503 63 | # line break after binary operator 64 | W504 65 | # whitespace before : 66 | E203 67 | 68 | [isort] 69 | combine_as_imports=True 70 | force_grid_wrap=0 71 | force_sort_within_sections=True 72 | force_to_top=typing 73 | include_trailing_comma=True 74 | line_length=80 75 | multi_line_output=3 76 | skip= 77 | build 78 | docs/source/conf.py 79 | 80 | [mypy] 81 | ignore_missing_imports=True 82 | exclude=tests 83 | -------------------------------------------------------------------------------- /zcollection/variable/tests/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Create test variables. 7 | ====================== 8 | """ 9 | import numpy 10 | import zarr 11 | 12 | from .. import Array, DelayedArray 13 | from ... import meta 14 | 15 | 16 | def array(name='var1', fill_value=0) -> Array: 17 | """Creates a test variable with the given name, fill value, dimensions, and 18 | attributes. 19 | 20 | Args: 21 | name: The name of the variable. 22 | fill_value: The fill value for uninitialized parts of the array. 23 | 24 | Returns: 25 | An Array object. 
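    Example:
        Hypothetical use from a test (attribute access shown for illustration
        only):

        >>> var = array(name='var2', fill_value=255)
        >>> var.name
        'var2'
        >>> var.dimensions
        ('x', 'y')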
26 | """ 27 | return Array(name=name, 28 | data=numpy.arange(10, dtype='int64').reshape(5, 2), 29 | dimensions=('x', 'y'), 30 | attrs=(meta.Attribute(name='attr', value=1), ), 31 | compressor=zarr.Blosc(cname='zstd', clevel=1), 32 | fill_value=fill_value, 33 | filters=(zarr.Delta('int64', 34 | 'int32'), zarr.Delta('int32', 'int32'))) 35 | 36 | 37 | def delayed_array(name='var1', fill_value=0) -> DelayedArray: 38 | """Create a delayed test variable with the given name, fill value, 39 | dimensions, and attributes. 40 | 41 | Args: 42 | name: The name of the variable. 43 | fill_value: The fill value for uninitialized parts of the array. 44 | 45 | Returns: 46 | A DelayedArray object representing a lazily-evaluated test variable. 47 | """ 48 | return DelayedArray(name=name, 49 | data=numpy.arange(10, dtype='int64').reshape(5, 2), 50 | dimensions=('x', 'y'), 51 | attrs=(meta.Attribute(name='attr', value=1), ), 52 | compressor=zarr.Blosc(cname='zstd', clevel=1), 53 | fill_value=fill_value, 54 | filters=(zarr.Delta('int64', 'int32'), 55 | zarr.Delta('int32', 'int32'))) 56 | -------------------------------------------------------------------------------- /zcollection/variable/tests/test_delayed_array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing variables 7 | ================= 8 | """ 9 | from typing import Any 10 | 11 | import dask.array.core 12 | import dask.array.ma 13 | import numpy 14 | import pytest 15 | 16 | # pylint: disable=unused-import # Need to import for fixtures 17 | from ...tests.cluster import dask_client, dask_cluster 18 | from ..delayed_array import _as_dask_array 19 | 20 | # pylint enable=unused-import 21 | 22 | 23 | def test_as_dask_array( 24 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 25 | ) -> None: 26 | """Test converting array like to a dask array.""" 27 | dask_array: dask.array.core.Array 28 | fill_value: Any 29 | np_array: numpy.ndarray 30 | 31 | np_array = numpy.arange(10) 32 | dask_array, fill_value = _as_dask_array(np_array) 33 | assert isinstance(dask_array, dask.array.core.Array) 34 | assert fill_value is None 35 | 36 | np_array = numpy.ma.masked_equal(np_array, 5) 37 | dask_array, fill_value = _as_dask_array(np_array) 38 | assert isinstance(dask_array, dask.array.core.Array) 39 | assert fill_value == 5 40 | 41 | dask_array, fill_value = _as_dask_array( 42 | dask.array.ma.masked_equal(np_array, 5)) 43 | assert isinstance(dask_array, dask.array.core.Array) 44 | assert fill_value == 5 45 | 46 | with pytest.raises(ValueError): 47 | _as_dask_array(numpy.ma.masked_equal(np_array, 5), fill_value=6) 48 | 49 | with pytest.raises(ValueError): 50 | _as_dask_array(numpy.ma.masked_equal( 51 | numpy.arange(numpy.datetime64(0, 'Y'), 52 | numpy.datetime64(10, 'Y'), 53 | dtype='M8[Y]'), numpy.datetime64(5, 'Y')), 54 | fill_value=numpy.datetime64('NaT')) 55 | 56 | dask_array, fill_value = _as_dask_array(numpy.ma.masked_equal( 57 | numpy.arange(numpy.datetime64(0, 'Y'), 58 | numpy.datetime64(10, 'Y'), 59 | dtype='M8[Y]'), numpy.datetime64('NaT')), 60 | fill_value=numpy.datetime64('NaT')) 61 | assert isinstance(dask_array, dask.array.core.Array) 62 | -------------------------------------------------------------------------------- /zcollection/sync.py: -------------------------------------------------------------------------------- 1 | # 
Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Synchronization of concurrent accesses 7 | ====================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Callable 12 | import abc 13 | import threading 14 | 15 | import fasteners 16 | 17 | 18 | class Sync(abc.ABC): # pragma: no cover 19 | """Interface of the classes handling the synchronization of concurrent 20 | accesses.""" 21 | 22 | @abc.abstractmethod 23 | def __enter__(self) -> bool: 24 | ... 25 | 26 | @abc.abstractmethod 27 | def __exit__(self, exc_type, exc_value, traceback) -> None: 28 | ... 29 | 30 | @abc.abstractmethod 31 | def is_locked(self) -> bool: 32 | """Returns True if the lock is acquired, False otherwise.""" 33 | 34 | 35 | class NoSync(Sync): 36 | """This class is used when the user does not want to synchronize accesses 37 | to the collection, in other words, when there is no concurrency.""" 38 | 39 | def __enter__(self) -> bool: 40 | return True 41 | 42 | def __exit__(self, exc_type, exc_value, traceback) -> None: 43 | """As this class does not perform any synchronization, this method has 44 | nothing to do.""" 45 | 46 | def is_locked(self) -> bool: 47 | """As this class does not perform any synchronization, this method 48 | always returns False.""" 49 | return False 50 | 51 | 52 | class ProcessSync(Sync): 53 | """This class is used when the user wants to synchronize accesses to the 54 | collection, in other words, when there is concurrency.""" 55 | 56 | def __init__(self, path: str) -> None: 57 | self.lock = fasteners.InterProcessLock(path) 58 | 59 | def __enter__(self) -> bool: 60 | return self.lock.acquire() 61 | 62 | def __exit__(self, exc_type, exc_value, traceback) -> None: 63 | try: 64 | self.lock.release() 65 | except threading.ThreadError: 66 | pass 67 | 68 | def __reduce__(self) -> tuple[Callable, tuple[str]]: 69 | return (ProcessSync, (str(self.lock.path), )) 70 | 71 | def is_locked(self) -> bool: 72 | """Returns True if the lock is acquired, False otherwise.""" 73 | return self.lock.exists() 74 | -------------------------------------------------------------------------------- /zcollection/variable/tests/test_array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """ 6 | Testing variables 7 | ================= 8 | """ 9 | from typing import Any 10 | 11 | import dask.array.core 12 | import dask.array.ma 13 | import numpy 14 | import pytest 15 | 16 | # pylint: disable=unused-import # Need to import for fixtures 17 | from ...tests.cluster import dask_client, dask_cluster 18 | # pylint enable=unused-import 19 | from ..array import _as_numpy_array 20 | 21 | 22 | def test_as_numpy_array(dask_client) -> None: 23 | """Test converting array like to a dask array.""" 24 | array: numpy.ndarray 25 | fill_value: Any 26 | np_array: numpy.ndarray 27 | 28 | np_array = numpy.arange(10) 29 | array, fill_value = _as_numpy_array(np_array) 30 | assert isinstance(array, numpy.ndarray) 31 | assert not isinstance(array, numpy.ma.MaskedArray) 32 | assert fill_value is None 33 | 34 | np_array = numpy.ma.masked_equal(np_array, 5) 35 | array, fill_value = _as_numpy_array(np_array) 36 | assert isinstance(array, numpy.ndarray) 37 | assert not isinstance(array, numpy.ma.MaskedArray) 38 | assert fill_value == 5 39 | 40 | array, fill_value = _as_numpy_array(dask.array.ma.masked_equal( 41 | np_array, 5)) 42 | assert isinstance(array, numpy.ndarray) 43 | assert not isinstance(array, numpy.ma.MaskedArray) 44 | assert fill_value == 5 45 | 46 | with pytest.raises(ValueError): 47 | _as_numpy_array(numpy.ma.masked_equal(np_array, 5), fill_value=6) 48 | 49 | with pytest.raises(ValueError): 50 | _as_numpy_array(numpy.ma.masked_equal( 51 | numpy.arange(numpy.datetime64(0, 'Y'), 52 | numpy.datetime64(10, 'Y'), 53 | dtype='M8[Y]'), numpy.datetime64(5, 'Y')), 54 | fill_value=numpy.datetime64('NaT')) 55 | 56 | array, fill_value = _as_numpy_array(numpy.ma.masked_equal( 57 | numpy.arange(numpy.datetime64(0, 'Y'), 58 | numpy.datetime64(10, 'Y'), 59 | dtype='M8[Y]'), numpy.datetime64('NaT')), 60 | fill_value=numpy.datetime64('NaT')) 61 | assert isinstance(array, numpy.ndarray) 62 | assert not isinstance(array, numpy.ma.MaskedArray) 63 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ZCollection 2 | =========== 3 | 4 | This project is a Python library manipulating data split into a 5 | :py:class:`collection ` of groups stored in 6 | `Zarr format `_. 7 | 8 | This collection allows dividing a dataset into several partitions to facilitate 9 | acquisitions or updates made from new products. Possible data partitioning is: 10 | by :py:class:`date ` (hour, day, month, 11 | etc.) or by :py:class:`sequence `. 12 | 13 | A collection partitioned by date, with a monthly resolution, may look like on 14 | the disk: :: 15 | 16 | collection/ 17 | ├── year=2022 18 | │ ├── month=01/ 19 | │ │ ├── time/ 20 | │ │ │ ├── 0.0 21 | │ │ │ ├── .zarray 22 | │ │ │ └── .zattrs 23 | │ │ ├── var1/ 24 | │ │ │ ├── 0.0 25 | │ │ │ ├── .zarray 26 | │ │ │ └── .zattrs 27 | │ │ ├── .zattrs 28 | │ │ ├── .zgroup 29 | │ │ └── .zmetadata 30 | │ └── month=02/ 31 | │ ├── time/ 32 | │ │ ├── 0.0 33 | │ │ ├── .zarray 34 | │ │ └── .zattrs 35 | │ ├── var1/ 36 | │ │ ├── 0.0 37 | │ │ ├── .zarray 38 | │ │ └── .zattrs 39 | │ ├── .zattrs 40 | │ ├── .zgroup 41 | │ └── .zmetadata 42 | └── .zcollection 43 | 44 | Partition updates can be set to overwrite existing data with new ones or to 45 | update them using different :py:mod:`strategies `. 46 | 47 | The `Dask library `_ handles the data to scale the treatments 48 | quickly. 
49 | 50 | It is possible to create views on a reference collection, to add and modify 51 | variables contained in a reference collection, accessible in reading only. 52 | 53 | This library can store data on POSIX, S3, or any other file system supported by 54 | the Python library `fsspec 55 | `_. Note, however, only POSIX 56 | and S3 file systems have been tested. 57 | 58 | .. toctree:: 59 | :maxdepth: 2 60 | :caption: Contents: 61 | 62 | install 63 | auto_examples/index.rst 64 | api 65 | release 66 | 67 | Indices and tables 68 | ================== 69 | 70 | * :ref:`genindex` 71 | * :ref:`modindex` 72 | * :ref:`search` 73 | -------------------------------------------------------------------------------- /zcollection/convenience/view.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Convenience functions 7 | ===================== 8 | """ 9 | from __future__ import annotations 10 | 11 | import fsspec 12 | 13 | from .. import collection, fs_utils, sync, view 14 | 15 | 16 | def create_view( 17 | path: str, 18 | view_ref: view.ViewReference, 19 | *, 20 | filesystem: fsspec.AbstractFileSystem | str | None = None, 21 | filters: collection.PartitionFilter = None, 22 | synchronizer: sync.Sync | None = None, 23 | ) -> view.View: 24 | """Create a new view. 25 | 26 | Args: 27 | path: View storage directory. 28 | view_ref: Access properties for the reference view. 29 | filesystem: The file system used to access the view. 30 | filters: The filters used to select the partitions of the reference 31 | view. If not provided, all partitions are selected. 32 | synchronizer: The synchronizer used to synchronize the view. 33 | 34 | Example: 35 | >>> view_ref = ViewReference( 36 | ... partition_base_dir="/data/mycollection") 37 | >>> view = create_view("/home/user/myview", view_ref) 38 | 39 | Returns: 40 | The created view. 41 | 42 | Raises: 43 | ValueError: If the path already exists. 44 | """ 45 | filesystem = fs_utils.get_fs(filesystem) 46 | if filesystem.exists(path): 47 | raise ValueError(f'path {path!r} already exists.') 48 | return view.View(path, 49 | view_ref, 50 | ds=None, 51 | filesystem=filesystem, 52 | filters=filters, 53 | synchronizer=synchronizer) 54 | 55 | 56 | def open_view( 57 | path: str, 58 | *, 59 | filesystem: fsspec.AbstractFileSystem | str | None = None, 60 | synchronizer: sync.Sync | None = None, 61 | ) -> view.View: 62 | """Open an existing view. 63 | 64 | Args: 65 | path: View storage directory. 66 | filesystem: The file system used to access the view. 67 | synchronizer: The synchronizer used to synchronize the view. 68 | 69 | Returns: 70 | The opened view. 71 | 72 | Example: 73 | >>> view = open_view("/home/user/myview") 74 | """ 75 | return view.View.from_config(path, 76 | filesystem=filesystem, 77 | synchronizer=synchronizer) 78 | -------------------------------------------------------------------------------- /zcollection/collection/callable_objects.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Callable objects. 
7 | ================= 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Callable, Protocol, Sequence 12 | 13 | from .. import dataset 14 | from ..type_hints import ArrayLike 15 | 16 | #: Function type to load and call a callback function of type 17 | #: :class:`PartitionCallable`. 18 | WrappedPartitionCallable = Callable[[Sequence[str]], None] 19 | 20 | 21 | #: pylint: disable=too-few-public-methods 22 | class PartitionCallable(Protocol): 23 | """Protocol for partition callables. 24 | 25 | A partition callable is a function that accepts a dataset and 26 | returns a result. 27 | """ 28 | 29 | @property 30 | def __name__(self) -> str: 31 | """Name of the callable.""" 32 | # pylint: disable=unnecessary-ellipsis 33 | # Make checker happy. 34 | ... 35 | # pylint: enable=unnecessary-ellipsis 36 | 37 | def __call__(self, zds: dataset.Dataset, *args, **kwargs) -> Any: 38 | """Call the partition function. 39 | 40 | Args: 41 | zds: Dataset to partition. 42 | *args: Positional arguments. 43 | **kwargs: Keyword arguments. 44 | 45 | Returns: 46 | Result of the partition function. 47 | """ 48 | 49 | 50 | #: Alias for :class:`PartitionCallable`. 51 | MapCallable = PartitionCallable 52 | 53 | 54 | class UpdateCallable(Protocol): 55 | """Protocol for update callables. 56 | 57 | A callable update is a function that accepts a data set and returns 58 | a dictionary of arrays to update. 59 | """ 60 | 61 | @property 62 | def __name__(self) -> str: 63 | """Name of the callable.""" 64 | # pylint: disable=unnecessary-ellipsis 65 | # Make checker happy. 66 | ... 67 | # pylint: enable=unnecessary-ellipsis 68 | 69 | def __call__(self, zds: dataset.Dataset, *args, 70 | **kwargs) -> dict[str, ArrayLike]: 71 | """Call the update function. 72 | 73 | Args: 74 | zds: Dataset to update. 75 | *args: Positional arguments. 76 | **kwargs: Keyword arguments. 77 | 78 | Returns: 79 | Dictionary of arrays to update. 80 | """ 81 | # pylint: disable=unnecessary-ellipsis 82 | # Mandatory to make Pylance happy. 83 | ... 84 | # pylint: enable=unnecessary-ellipsis 85 | -------------------------------------------------------------------------------- /zcollection/tests/fs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixture for testing the file system. 7 | ==================================== 8 | """ 9 | from typing import Any, Iterator 10 | import pathlib 11 | import tempfile 12 | 13 | import fsspec 14 | import fsspec.implementations.memory 15 | import pytest 16 | 17 | try: 18 | # pylint: disable=unused-import # Need to import for fixtures 19 | from .s3 import S3, s3, s3_base # type: ignore 20 | 21 | # pylint: disable=unused-import 22 | S3_IMPORT_EXCEPTION = None 23 | except ImportError as err: 24 | S3_IMPORT_EXCEPTION = str(err) 25 | 26 | 27 | def tempdir(tmpdir) -> pathlib.Path: 28 | """Create a temporary directory.""" 29 | return pathlib.Path(tempfile.mkdtemp(dir=str(tmpdir))) 30 | 31 | 32 | class Local: 33 | """Local files system.""" 34 | 35 | def __init__(self, tmpdir, protocol) -> None: 36 | #: The filesystem. 37 | self.fs: fsspec.AbstractFileSystem = fsspec.filesystem(protocol) 38 | #: The root directory. 39 | self.root = tempdir(pathlib.Path(tmpdir)) 40 | #: The collection directory. 41 | self.collection: pathlib.Path = self.root.joinpath('collection') 42 | #: The view directory. 
43 | self.view: pathlib.Path = self.root.joinpath('view') 44 | 45 | def __getattr__(self, name) -> Any: 46 | return getattr(self.fs, name) 47 | 48 | 49 | @pytest.fixture 50 | def local_fs(tmpdir, pytestconfig) -> Iterator[Local]: 51 | """Local filesystem.""" 52 | protocol: str = 'memory' if pytestconfig.getoption('memory') else 'file' 53 | instance = Local(tmpdir, protocol) 54 | yield instance 55 | try: 56 | # For the memory protocol we delete the written data to free the 57 | # memory. 58 | if isinstance(instance.fs, 59 | fsspec.implementations.memory.MemoryFileSystem): 60 | instance.fs.rm(str(instance.root), recursive=True) 61 | except FileNotFoundError: 62 | pass 63 | 64 | 65 | # pylint: disable=redefined-outer-name,function-redefined 66 | if S3_IMPORT_EXCEPTION is None: 67 | 68 | @pytest.fixture 69 | def s3_fs(s3) -> S3: # type: ignore[arg-type] 70 | """S3 filesystem.""" 71 | return S3(s3) # type: ignore 72 | else: 73 | 74 | @pytest.fixture 75 | def s3() -> None: 76 | """S3 filesystem.""" 77 | 78 | @pytest.fixture 79 | def s3_base() -> None: 80 | """S3 filesystem.""" 81 | 82 | @pytest.fixture 83 | def s3_fs(pytestconfig) -> None: 84 | """S3 filesystem.""" 85 | if pytestconfig.getoption('s3'): 86 | pytest.fail(f'Unable to test S3: {S3_IMPORT_EXCEPTION}') 87 | else: 88 | pytest.skip('S3 is disabled') 89 | 90 | 91 | # pylint: enable=redefined-outer-name,function-redefined 92 | -------------------------------------------------------------------------------- /zcollection/representation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Representation of dataset objects. 7 | ================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Iterable, Iterator, Sequence 12 | 13 | from .meta import Attribute 14 | 15 | 16 | def dimensions(dims: dict[str, int]) -> str: 17 | """Returns a string representation of the dimensions. 18 | 19 | Args: 20 | dims: A dictionary containing the dimensions. 21 | 22 | Returns: 23 | A string representation of the dimensions in the form of a tuple, where 24 | each element of the tuple is a string containing the dimension name and 25 | its corresponding value. 26 | """ 27 | return str(tuple(f'{name}: {value}' for name, value in dims.items())) 28 | 29 | 30 | def _maybe_truncate(obj: Any, max_size: int) -> str: 31 | """Truncate the string representation of an object to the given length. 32 | 33 | Args: 34 | obj: An object. 35 | max_size: The maximum length of the string representation. 36 | 37 | Returns: 38 | The string representation of the object, truncated to the given length 39 | if necessary. 40 | """ 41 | result = str(obj) 42 | if len(result) > max_size: 43 | return result[:max_size - 3] + '...' 44 | return result 45 | 46 | 47 | def pretty_print(obj: Any, num_characters: int = 120) -> str: 48 | """Return a pretty printed string representation of the given object. 49 | 50 | Args: 51 | obj: 52 | An object to be pretty printed. 53 | num_characters: 54 | An integer representing the maximum number of 55 | characters per line. 56 | 57 | Returns: 58 | A string representation of the object, pretty printed with a maximum of 59 | `num_characters` characters per line. 
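Example (an illustrative doctest added as a sketch of the truncation and padding behaviour): >>> pretty_print('a' * 130, num_characters=10) 'aaaaaaa...' >>> len(pretty_print('abc', num_characters=10)) 10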
60 | """ 61 | result: str = _maybe_truncate(obj, num_characters) 62 | return result + ' ' * max(num_characters - len(result), 0) 63 | 64 | 65 | def calculate_column_width(items: Iterable) -> int: 66 | """Calculate the maximum width of a column. 67 | 68 | Args: 69 | items: An iterable of items. 70 | 71 | Returns: 72 | The maximum width of a column. 73 | """ 74 | max_name: int = max(len(str(name)) for name in items) 75 | return max(max_name, 7) 76 | 77 | 78 | def attributes(attrs: Sequence[Attribute]) -> Iterator[str]: 79 | """Get the string representation of the attributes. 80 | 81 | Args: 82 | attrs: The attributes. 83 | 84 | Returns: 85 | The string representation of the attributes. 86 | """ 87 | width: int = calculate_column_width(item.name for item in attrs) 88 | for attr in attrs: 89 | name_str: str = f' {attr.name:<{width}s}' 90 | yield pretty_print(f'{name_str}: {attr.value!r}') 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # Dask worker space 142 | dask-worker-space/ 143 | 144 | # Autogenerated documentation files 145 | docs/source/_generated 146 | docs/source/auto_examples/ 147 | 148 | # Generated version file 149 | zcollection/version.py 150 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Required dependencies 5 | --------------------- 6 | 7 | - Python (3.8 or later) 8 | - setuptools 9 | - `dask `_ 10 | - `distributed `_ 11 | - `fsspec `_ 12 | - `numcodecs `_ 13 | - `numpy `_ 14 | - `pyarrow `_ 15 | - `xarray `_ 16 | - `zarr `_ 17 | 18 | .. note:: 19 | 20 | `pyarrow` is optional, but required if you want to use the indexing API. 21 | 22 | Instructions 23 | ------------ 24 | 25 | Installation via conda 26 | ###################### 27 | 28 | The easiest way to install the library is to use conda. To install the 29 | package from the conda-forge channel, run:: 30 | 31 | $ conda install -c conda-forge zcollection 32 | 33 | Installation via conda and sources 34 | ################################## 35 | 36 | It is possible to install the latest version from source. First, install the 37 | dependencies using conda:: 38 | 39 | $ conda install dask distributed fsspec numcodecs numpy pandas pyarrow xarray zarr 40 | 41 | Then, clone the repository:: 42 | 43 | $ git clone git@github.com:CNES/zcollection.git 44 | $ cd zcollection 45 | 46 | Finally, install the library using pip (it is possible to check out a different 47 | branch before installing):: 48 | 49 | $ pip install . 50 | 51 | Installation via pip 52 | #################### 53 | 54 | $ pip install zcollection 55 | 56 | Testing 57 | ------- 58 | 59 | To run the test suite after installing the library, install (via PyPI or 60 | conda) `pytest `__ and run ``pytest`` in the root 61 | directory of the cloned repository. 62 | 63 | The unit test process can be modified using options implemented for this 64 | project, in addition to the options provided by ``pytest``. The available user 65 | options are: 66 | 67 | - **s3**: Enable tests on the local S3 server driven by minio. (default: False) 68 | - **memory**: Use a file system in memory instead of the local file system. 69 | (default: False) 70 | - **threads_per_worker**: Number of threads for each Dask worker. 71 | (default: the number of logical cores of the target platform). 72 | - **n_workers**: Number of Dask workers. 73 | (default: the number of cores of the target platform). 74 | 75 | To run the tests using a local S3 server, driven by the ``minio`` software, 76 | it's necessary to install the following optional requirements: 77 | 78 | - `s3fs `_ 79 | - `requests `_ 80 | 81 | You will need to install the ``minio`` program.
You can find more information 82 | on this web `page `_. 83 | 84 | Documentation 85 | ------------- 86 | 87 | The documentation use sphinx and Google-style docstrings. To build the 88 | documentation, run ``make html`` in the ``docs`` directory. 89 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline }} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :show-inheritance: 7 | {% block methods %} 8 | 9 | {%- set attr = [] -%} 10 | {%- set meth = [] -%} 11 | {%- set private = [] -%} 12 | {%- set protected = [] -%} 13 | {%- set special = [] -%} 14 | {%- set inherited_meth = [] -%} 15 | {%- set skip = ['__abstractmethods__', 16 | '__annotations__', 17 | '__dict__', 18 | '__doc__', 19 | '__entries__', 20 | '__hash__', 21 | '__init__', 22 | '__members__', 23 | '__module__', 24 | '__slots__', 25 | '__slotnames__', 26 | '__weakref__'] -%} 27 | 28 | {%- for item in methods if not item in skip -%} 29 | {%- if item in inherited_members -%} 30 | {{ inherited_meth.append(item) or "" }} 31 | {%- else -%} 32 | {{ meth.append(item) or "" }} 33 | {%- endif -%} 34 | {%- endfor -%} 35 | 36 | {%- for item in members 37 | if not item in inherited_members and not item in skip -%} 38 | {%- if item.startswith('__') and item.endswith('__') -%} 39 | {{ special.append(item) or "" }} 40 | {%- elif item.startswith('__') -%} 41 | {{ private.append(item) or "" }} 42 | {%- elif item.startswith('_') -%} 43 | {{ protected.append(item) or "" }} 44 | {%- endif -%} 45 | {%- endfor %} 46 | 47 | {%- if attributes %} 48 | .. rubric:: {{ _('Attributes') }} 49 | .. autosummary:: 50 | :toctree: 51 | {% for item in attributes %} 52 | ~{{ name }}.{{ item }} 53 | {%- endfor %} 54 | {% endif -%} 55 | 56 | {%- if meth %} 57 | .. rubric:: {{ _('Public Methods') }} 58 | .. autosummary:: 59 | :toctree: 60 | {% for item in meth %} 61 | ~{{ name }}.{{ item }} 62 | {%- endfor %} 63 | {% endif -%} 64 | 65 | {%- if protected %} 66 | .. rubric:: {{ _('Protected Methods') }} 67 | .. autosummary:: 68 | :toctree: 69 | {% for item in protected %} 70 | ~{{ name }}.{{ item }} 71 | {%- endfor %} 72 | {% endif -%} 73 | 74 | {%- if private %} 75 | .. rubric:: {{ _('Private Methods') }} 76 | .. autosummary:: 77 | :toctree: 78 | {% for item in private %} 79 | ~{{ name }}.{{ item }} 80 | {%- endfor %} 81 | {% endif -%} 82 | 83 | {%- if special %} 84 | 85 | .. rubric:: {{ _('Special Methods') }} 86 | .. autosummary:: 87 | :toctree: 88 | {% for item in special %} 89 | ~{{ name }}.{{ item }} 90 | {%- endfor %} 91 | {%- endif -%} 92 | 93 | {%- if inherited_meth %} 94 | 95 | .. rubric:: {{ _('Inherited Methods') }} 96 | .. autosummary:: 97 | :toctree: 98 | {% for item in inherited_meth %} 99 | ~{{ name }}.{{ item }} 100 | {%- endfor %} 101 | {%- endif -%} 102 | 103 | {%- endblock -%} 104 | -------------------------------------------------------------------------------- /zcollection/convenience/collection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Convenience functions 7 | ===================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Literal 12 | 13 | import xarray 14 | 15 | from .. 
import collection, dataset, fs_utils, partitioning 16 | 17 | 18 | def create_collection( 19 | axis: str, 20 | ds: xarray.Dataset | dataset.Dataset, 21 | partition_handler: partitioning.Partitioning, 22 | partition_base_dir: str, 23 | **kwargs, 24 | ) -> collection.Collection: 25 | """Create a collection. 26 | 27 | Args: 28 | axis: The axis to use for the collection. 29 | ds: The dataset to use. 30 | partition_handler: The partitioning handler to use. 31 | partition_base_dir: The base directory to use for the partitions. 32 | **kwargs: Additional parameters are passed through to the constructor 33 | of the class :py:class:`Collection`. 34 | 35 | Example: 36 | >>> import xarray as xr 37 | >>> import zcollection 38 | >>> data = xr.Dataset({ 39 | ... "a": xr.DataArray([1, 2, 3]), 40 | ... "b": xr.DataArray([4, 5, 6]) 41 | ... }) 42 | >>> collection = zcollection.create_collection( 43 | ... "a", data, 44 | ... zcollection.partitioning.Sequence(("a", )), 45 | ... "/tmp/my_collection") 46 | 47 | Returns: 48 | The collection. 49 | 50 | Raises: 51 | ValueError: If the base directory already exists. 52 | """ 53 | filesystem = fs_utils.get_fs(kwargs.pop('filesystem', None)) 54 | if filesystem.exists(partition_base_dir): 55 | raise ValueError( 56 | f'The directory {partition_base_dir!r} already exists.') 57 | if isinstance(ds, xarray.Dataset): 58 | ds = dataset.Dataset.from_xarray(ds) 59 | return collection.Collection(axis, 60 | ds.metadata(), 61 | partition_handler, 62 | partition_base_dir, 63 | mode='w', 64 | filesystem=filesystem, 65 | **kwargs) 66 | 67 | 68 | # pylint: disable=redefined-builtin 69 | def open_collection(path: str, 70 | *, 71 | mode: Literal['r', 'w'] | None = None, 72 | **kwargs) -> collection.Collection: 73 | """Open a collection. 74 | 75 | Args: 76 | path: The path to the collection. 77 | mode: The mode to open the collection. 78 | **kwargs: Additional parameters are passed through the method 79 | :py:meth:`zcollection.collection.Collection.from_config`. 80 | Returns: 81 | The collection. 82 | 83 | Example: 84 | >>> import zcollection 85 | >>> collection = zcollection.open_collection( 86 | ... "/tmp/mycollection", mode="r") 87 | """ 88 | return collection.Collection.from_config(path, mode=mode, **kwargs) 89 | # pylint: enable=redefined-builtin 90 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: master 8 | 9 | jobs: 10 | linux: 11 | name: ${{ matrix.python-version }}-posix 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 15 14 | strategy: 15 | fail-fast: false 16 | max-parallel: 5 17 | matrix: 18 | python-version: ['3.8', '3.9', '3.10', '3.11'] 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Setup Miniconda 24 | uses: mamba-org/setup-micromamba@v1 25 | with: 26 | cache-downloads: true 27 | condarc: | 28 | channels: 29 | - conda-forge 30 | create-args: | 31 | python=${{ matrix.python-version }} 32 | environment-name: ZCollection 33 | environment-file: conda/environment.yml 34 | 35 | - name: Run Tests 36 | shell: bash -l {0} 37 | run: | 38 | python -m setuptools_scm 39 | if [[ ! 
-e zcollection/version.py ]]; then 40 | echo "__version__ = '$(git describe --tags --always)'" > zcollection/version.py 41 | fi 42 | pytest -v -ra --processes 43 | 44 | s3-fs: 45 | runs-on: ubuntu-latest 46 | timeout-minutes: 15 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v3 50 | 51 | - name: Install MinIO 52 | run: | 53 | mkdir -p /opt/minio/bin 54 | wget -nv -P /opt/minio/bin \ 55 | https://dl.min.io/server/minio/release/linux-amd64/minio 56 | chmod +x /opt/minio/bin/minio 57 | 58 | - name: Add MinIO To System Path 59 | run: | 60 | echo /opt/minio/bin >> $GITHUB_PATH 61 | 62 | - name: Setup Miniconda 63 | uses: mamba-org/setup-micromamba@v1 64 | with: 65 | cache-downloads: true 66 | condarc: | 67 | channels: 68 | - conda-forge 69 | create-args: | 70 | python=3.9 71 | environment-name: ZCollection 72 | environment-file: conda/environment.yml 73 | 74 | - name: Run Tests 75 | shell: bash -l {0} 76 | run: | 77 | python -m setuptools_scm 78 | python -m setuptools_scm 79 | if [[ ! -e zcollection/version.py ]]; then 80 | echo "__version__ = '$(git describe --tags --always)'" > zcollection/version.py 81 | fi 82 | pytest -v -ra --s3 --processes 83 | 84 | win: 85 | name: win 86 | runs-on: windows-2019 87 | timeout-minutes: 15 88 | 89 | steps: 90 | - name: Checkout 91 | uses: actions/checkout@v3 92 | 93 | - name: Setup Miniconda 94 | uses: mamba-org/setup-micromamba@v1 95 | with: 96 | cache-downloads: true 97 | condarc: | 98 | channels: 99 | - conda-forge 100 | create-args: | 101 | python=3.9 102 | environment-name: ZCollection 103 | environment-file: conda/environment.yml 104 | 105 | - name: Run Tests 106 | shell: bash -l {0} 107 | run: | 108 | python -m setuptools_scm 109 | if [[ ! -e zcollection/version.py ]]; then 110 | echo "__version__ = '$(git describe --tags --always)'" > zcollection/version.py 111 | fi 112 | pytest -v -ra --processes 113 | -------------------------------------------------------------------------------- /zcollection/dask_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Dask utilities 7 | ============== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Callable, Iterator, Sequence 12 | import itertools 13 | import uuid 14 | 15 | from dask.delayed import Delayed as dask_Delayed 16 | import dask.distributed 17 | import dask.highlevelgraph 18 | 19 | 20 | def dask_workers(client: dask.distributed.Client, 21 | cores_only: bool = False) -> int: 22 | """Return the number of dask workers available. 23 | 24 | Args: 25 | client: dask client 26 | cores_only: if True, only the number of cores is returned, 27 | otherwise the total number of threads is returned. 28 | 29 | Returns: 30 | number of dask workers 31 | 32 | Raises: 33 | ValueError: If no dask workers are available. 34 | """ 35 | result: int = len( 36 | client.ncores()) if cores_only else sum( # type: ignore[arg-type] 37 | item 38 | for item in client.nthreads().values()) # type: ignore[arg-type] 39 | if result == 0: 40 | raise RuntimeError('No dask workers available') 41 | return result 42 | 43 | 44 | def get_client() -> dask.distributed.Client: 45 | """Return the default dask client. 
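If no Dask client is already running, a local in-process client is created as a fallback (see the ``except`` branch below). Example (an illustrative sketch; whether an existing cluster is reused depends on the Dask runtime): >>> client = get_client() >>> client.close()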
46 | 47 | Returns: 48 | default dask client 49 | """ 50 | try: 51 | return dask.distributed.get_client() 52 | except ValueError: 53 | return dask.distributed.Client( 54 | processes=False, 55 | direct_to_workers=True, 56 | ) 57 | 58 | 59 | def split_sequence(sequence: Sequence[Any], 60 | sections: int | None = None) -> Iterator[Sequence[Any]]: 61 | """Split a sequence into sections. 62 | 63 | Args: 64 | sequence: The sequence to split. 65 | sections: The number of sections to split the sequence into. Default 66 | divides the sequence into n sections of one element. 67 | 68 | Returns: 69 | Iterator of sequences. 70 | """ 71 | sections = len(sequence) if sections is None else sections 72 | if sections <= 0: 73 | raise ValueError('The number of sections must be greater than zero.') 74 | length: int = len(sequence) 75 | sections = min(sections, length) 76 | 77 | size: int 78 | extras: int 79 | size, extras = divmod(length, sections) 80 | 81 | div = tuple( 82 | itertools.accumulate([0] + extras * [size + 1] + 83 | (sections - extras) * [size])) 84 | yield from (sequence[item:div[ix + 1]] for ix, item in enumerate(div[:-1])) 85 | 86 | 87 | def simple_delayed(name: str, func: Callable) -> dask_Delayed: 88 | """Create a simple delayed function. 89 | 90 | Args: 91 | name: name of the function 92 | func: function to be delayed 93 | 94 | Returns: 95 | delayed function 96 | """ 97 | name = f'{name}-{str(uuid.uuid4())}' 98 | return dask_Delayed( 99 | name, 100 | dask.highlevelgraph.HighLevelGraph({name: { 101 | name: func 102 | }}, {name: set()}), 103 | None, 104 | ) 105 | -------------------------------------------------------------------------------- /zcollection/tests/test_expression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Tests of the expression evaluation 7 | ================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | import timeit 12 | 13 | import numpy 14 | import pytest 15 | import xarray 16 | 17 | from .. 
import dataset 18 | from ..expression import Expression 19 | from ..partitioning import Date 20 | # pylint: disable=unused-import # Need to import for fixtures 21 | from .cluster import dask_client, dask_cluster 22 | 23 | # pylint enable=unused-import 24 | 25 | 26 | def make_dataset(num_samples: int | None = None) -> dataset.Dataset: 27 | """Creation of a data set for testing purposes.""" 28 | dates = numpy.arange(numpy.datetime64('2000-01-01', 'ns'), 29 | numpy.datetime64('2009-12-31', 'ns'), 30 | numpy.timedelta64(1, 'h')).astype('datetime64[ns]') 31 | if num_samples is not None: 32 | dates = dates[:num_samples + 1] 33 | observation = numpy.random.rand(dates.size) # type: ignore 34 | return dataset.Dataset.from_xarray( 35 | xarray.Dataset({ 36 | 'dates': 37 | xarray.DataArray(dates, dims=('num_lines', )), 38 | 'observation': 39 | xarray.DataArray(observation, dims=('num_lines', )) 40 | })) 41 | 42 | 43 | def test_expression() -> None: 44 | """Test of the creation of expressions.""" 45 | expr = Expression('a == b') 46 | assert expr({'a': 1, 'b': 1}) 47 | assert not expr({'a': 1, 'b': 2}) 48 | 49 | with pytest.raises(SyntaxError): 50 | Expression('a==') 51 | 52 | with pytest.raises(NameError): 53 | assert expr({'a': 1, 'c': 1}) 54 | 55 | 56 | def test_date_expression( 57 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 58 | ) -> None: 59 | """Test of expressions handling dates..""" 60 | zds = make_dataset(5 * 24) 61 | partitioning = Date(('dates', ), 'D') 62 | 63 | for partition, _ in partitioning.split_dataset(zds, 'num_lines'): 64 | variables = dict(partitioning.parse('/'.join(partition))) 65 | expr = Expression('year==2000') 66 | assert expr(variables) 67 | expr = Expression('year==2000 and month==1') 68 | assert expr(variables) 69 | expr = Expression('year==2000 and month==1 and day in range(1, 12)') 70 | assert expr(variables) 71 | 72 | 73 | def test_bench_expression( 74 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 75 | ) -> None: 76 | """Benchmark of expressions.""" 77 | partitioning = Date(('dates', ), 'D') 78 | zds = make_dataset() 79 | expr = Expression('year==2000 and month==1 and day in range(1, 12)') 80 | times = [] 81 | number = 100 82 | for partition, _ in partitioning.split_dataset(zds, 'num_lines'): 83 | variables = dict(partitioning.parse('/'.join(partition))) 84 | times.append( 85 | timeit.timeit('expr(variables)', 86 | globals={ 87 | 'expr': expr, 88 | 'variables': variables 89 | }, 90 | number=number)) 91 | 92 | assert sum(times) / (len(times) * number) < 1e-5 93 | -------------------------------------------------------------------------------- /zcollection/tests/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixtures for testing Dask clusters using the pytest. 
7 | ==================================================== 8 | """ 9 | from typing import Iterator 10 | import contextlib 11 | import logging 12 | import weakref 13 | 14 | import dask.config 15 | import dask.distributed 16 | import py 17 | import pytest 18 | 19 | 20 | @pytest.fixture() 21 | def dask_cluster( 22 | pytestconfig, 23 | tmpdir_factory, 24 | scope='session', # pylint: disable=unused-argument 25 | ) -> str: 26 | """Launch a Dask LocalCluster with a configurable number of workers.""" 27 | n_workers: int | None 28 | threads_per_worker: int | None 29 | processes: bool 30 | 31 | try: 32 | n_workers = int(pytestconfig.getoption('n_workers')) 33 | except TypeError: 34 | n_workers = None 35 | 36 | try: 37 | threads_per_worker = int(pytestconfig.getoption('threads_per_worker')) 38 | except TypeError: 39 | threads_per_worker = None 40 | 41 | try: 42 | processes = int(pytestconfig.getoption('processes')) == 1 43 | except TypeError: 44 | processes = False 45 | 46 | tmpdir: py.path.local = tmpdir_factory.getbasetemp() 47 | scheduler_file: py.path.local = tmpdir / 'scheduler.json' 48 | if scheduler_file.exists(): 49 | return str(scheduler_file) 50 | 51 | # Use the root path of the test session for the dask worker space 52 | dask_worker: py.path.local = tmpdir / 'dask_worker_space' 53 | dask.config.set(temporary_directory=str(dask_worker)) 54 | 55 | logging.info('Dask local cluster starting') 56 | cluster = dask.distributed.LocalCluster( 57 | protocol='tcp://', 58 | n_workers=n_workers, 59 | threads_per_worker=threads_per_worker, 60 | processes=processes, 61 | ) 62 | 63 | def teardown() -> None: 64 | """Stop the cluster and remove the scheduler file.""" 65 | if scheduler_file.exists(): 66 | scheduler_file.remove() 67 | 68 | weakref.finalize(cluster, teardown) 69 | 70 | # Make sure we can connect to the cluster. 71 | with dask.distributed.Client(cluster) as client: 72 | client.write_scheduler_file(scheduler_file) 73 | client.wait_for_workers(1) 74 | 75 | logging.info('Dask local cluster started') 76 | return str(scheduler_file) 77 | 78 | 79 | @contextlib.contextmanager 80 | def _scheduler_file( 81 | dask_cluster, # pylint: disable=redefined-outer-name 82 | ) -> Iterator[str]: 83 | """Get the scheduler used to connect to the cluster.""" 84 | yield dask_cluster 85 | 86 | 87 | @pytest.fixture() 88 | def dask_client( 89 | dask_cluster, # pylint: disable=redefined-outer-name 90 | ) -> Iterator[dask.distributed.Client]: 91 | """Connect a Dask client to the cluster.""" 92 | with _scheduler_file(dask_cluster) as scheduler_file: 93 | try: 94 | with dask.distributed.Client( 95 | scheduler_file=scheduler_file) as client: 96 | yield client 97 | except RuntimeError: 98 | # Ignore while the client is being stopped. 99 | pass 100 | -------------------------------------------------------------------------------- /zcollection/type_hints.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Type hints for the zcollection package. 7 | ======================================= 8 | 9 | .. rubric:: Type aliases 10 | 11 | .. py:data:: DType 12 | :canonical: DType 13 | 14 | Type of a numpy array. 15 | 16 | .. py:data:: DTypeLike 17 | :canonical: DTypeLike 18 | 19 | Type of a numpy array or a string. 20 | 21 | .. py:data:: NDArray 22 | :canonical: NDArray 23 | 24 | Type of a numpy array. 25 | 26 | .. 
py:data:: NDMaskedArray 27 | 28 | Type of a numpy array with a mask. 29 | """ 30 | from __future__ import annotations 31 | 32 | from typing import TYPE_CHECKING, Any, Protocol, TypeVar 33 | 34 | try: 35 | from types import GenericAlias # type: ignore[attr-defined] 36 | except ImportError: 37 | # pylint: disable=ungrouped-imports 38 | # For Python < 3.9 we use a backport of GenericAlias provided by 39 | # numpy 40 | # isort: off 41 | from numpy._typing._generic_alias import ( # type: ignore[no-redef] 42 | _GenericAlias as GenericAlias, ) 43 | # isort: on 44 | # pylint: enable=ungrouped-imports 45 | 46 | try: 47 | from typing_extensions import TypeAlias 48 | except ImportError: 49 | # pylint: disable=ungrouped-imports 50 | # TypeAlias is defined in typing starting from 3.10 51 | from typing import TypeAlias # type: ignore[attr-defined,no-redef] 52 | # pylint: enable=ungrouped-imports 53 | 54 | import numpy 55 | import numpy.typing 56 | 57 | # pylint: disable=invalid-name 58 | _DType_co = TypeVar('_DType_co', covariant=True, bound='numpy.dtype[Any]') 59 | _ScalarType_co = TypeVar('_ScalarType_co', bound=numpy.generic, covariant=True) 60 | # pylint: enable=invalid-name 61 | 62 | if TYPE_CHECKING: 63 | DType = numpy.dtype[_ScalarType_co] 64 | NDMaskedArray = numpy.ma.MaskedArray[Any, DType] # pragma: no cover 65 | else: 66 | DType = GenericAlias(numpy.dtype, (_ScalarType_co, )) 67 | NDMaskedArray = GenericAlias(numpy.ma.MaskedArray, (Any, DType)) 68 | 69 | NDArray: TypeAlias = numpy.typing.NDArray # pragma: no cover 70 | DTypeLike: TypeAlias = numpy.typing.DTypeLike # pragma: no cover 71 | 72 | 73 | class ArrayLike(Protocol[_DType_co]): 74 | """Protocol for array-like objects.""" 75 | 76 | def __array__(self) -> NDArray: 77 | ... 78 | 79 | @property 80 | def dtype(self) -> DType: 81 | """The data type of the array.""" 82 | # pylint: disable=unnecessary-ellipsis 83 | # Make checker happy. 84 | ... 85 | # pylint: enable=unnecessary-ellipsis 86 | 87 | @property 88 | def shape(self) -> tuple[int, ...]: 89 | """The shape of the array.""" 90 | # pylint: disable=unnecessary-ellipsis 91 | # Make checker happy. 92 | ... 93 | # pylint: enable=unnecessary-ellipsis 94 | 95 | @property 96 | def size(self) -> int: 97 | """The size of the array.""" 98 | # pylint: disable=unnecessary-ellipsis 99 | # Make checker happy. 100 | ... 101 | # pylint: enable=unnecessary-ellipsis 102 | 103 | def astype(self, dtype: DTypeLike) -> ArrayLike[_DType_co]: 104 | """Convert the array to a given type.""" 105 | # pylint: disable=unnecessary-ellipsis 106 | # Make checker happy. 107 | ... 108 | # pylint: enable=unnecessary-ellipsis 109 | -------------------------------------------------------------------------------- /zcollection/merging/tests/test_merging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Test merging. 7 | ============= 8 | """ 9 | import numpy 10 | import pytest 11 | import zarr 12 | 13 | from .. import _update_fs, merge_time_series, perform 14 | from ... 
import sync 15 | from ...tests import data 16 | # pylint: disable=unused-import # Need to import for fixtures 17 | from ...tests.cluster import dask_client, dask_cluster 18 | from ...tests.fixture import dask_arrays, numpy_arrays 19 | from ...tests.fs import local_fs 20 | 21 | # pylint: enable=unused-import 22 | 23 | 24 | class MyError(RuntimeError): 25 | """Custom error.""" 26 | 27 | 28 | class ThrowError(sync.Sync): 29 | """Throw an error when merging.""" 30 | 31 | def __enter__(self) -> bool: 32 | raise MyError('This is an error') 33 | 34 | def __exit__(self, exc_type, exc_value, traceback) -> None: 35 | ... 36 | 37 | def is_locked(self) -> bool: 38 | return False 39 | 40 | 41 | def test_update_fs( 42 | dask_client, # pylint: disable=redefined-outer-name 43 | local_fs, # pylint: disable=redefined-outer-name 44 | ) -> None: 45 | """Test the _update_fs function.""" 46 | generator = data.create_test_dataset(delayed=False) 47 | zds = next(generator) 48 | 49 | partition_folder = local_fs.root.joinpath('variable=1') 50 | 51 | zattrs = str(partition_folder.joinpath('.zattrs')) 52 | future = dask_client.submit(_update_fs, str(partition_folder), 53 | dask_client.scatter(zds), local_fs.fs) 54 | dask_client.gather(future) 55 | assert local_fs.exists(zattrs) 56 | 57 | local_fs.fs.rm(str(partition_folder), recursive=True) 58 | assert not local_fs.exists(zattrs) 59 | seen_exception = False 60 | try: 61 | future = dask_client.submit(_update_fs, 62 | str(partition_folder), 63 | dask_client.scatter(zds), 64 | local_fs.fs, 65 | synchronizer=ThrowError()) 66 | dask_client.gather(future) 67 | except MyError: 68 | seen_exception = True 69 | assert seen_exception 70 | assert not local_fs.exists(zattrs) 71 | 72 | 73 | @pytest.mark.parametrize('arrays_type', ['dask_arrays', 'numpy_arrays']) 74 | def test_perform( 75 | dask_client, # pylint: disable=redefined-outer-name 76 | local_fs, # pylint: disable=redefined-outer-name 77 | arrays_type, 78 | request, 79 | ) -> None: 80 | """Test the perform function.""" 81 | delayed = request.getfixturevalue(arrays_type) 82 | generator = data.create_test_dataset(delayed=delayed) 83 | zds = next(generator) 84 | 85 | path = str(local_fs.root.joinpath('variable=1')) 86 | 87 | future = dask_client.submit(_update_fs, path, dask_client.scatter(zds), 88 | local_fs.fs) 89 | dask_client.gather(future) 90 | 91 | future = dask_client.submit(perform, 92 | dask_client.scatter(zds), 93 | path, 94 | 'time', 95 | local_fs.fs, 96 | 'time', 97 | delayed=delayed, 98 | merge_callable=merge_time_series) 99 | dask_client.gather(future) 100 | 101 | zgroup = zarr.open_consolidated(local_fs.get_mapper(path)) 102 | assert numpy.all(zgroup['time'][...] == zds['time'].values) 103 | assert numpy.all(zgroup['var1'][...] == zds['var1'].values) 104 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import importlib.metadata 14 | import pathlib 15 | import sys 16 | 17 | HERE = pathlib.Path(__file__).absolute().parent 18 | 19 | # Insert the project root dir as the first element in the PYTHONPATH. 20 | sys.path.insert(0, str(HERE.parent.parent)) 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = 'zcollection' 25 | copyright = '(2022, CNES/CLS)' 26 | author = 'CLS' 27 | 28 | # The full version, including alpha/beta/rc tags 29 | try: 30 | release = importlib.metadata.version(project) 31 | except importlib.metadata.PackageNotFoundError: 32 | release = '0.0.0' 33 | version = '.'.join(release.split('.')[:2]) 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named "sphinx.ext.*") or your custom 39 | # ones. 40 | extensions = [ 41 | 'sphinx_inline_tabs', 42 | 'sphinx_gallery.gen_gallery', 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.intersphinx', 46 | 'sphinx.ext.mathjax', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.viewcode', 49 | ] 50 | 51 | autosummary_generate = True 52 | 53 | autodoc_typehints = 'description' 54 | autodoc_type_aliases = dict( 55 | ArrayLike='ArrayLike', 56 | DType='DType', 57 | DTypeLike='DTypeLike', 58 | Indexer='Indexer', 59 | NDArray='NDArray', 60 | NDMaskedArray='NDMaskedArray', 61 | PartitionCallback='PartitionCallback', 62 | QueryDict='QueryDict', 63 | Scalar='Scalar', 64 | ) 65 | 66 | numpydoc_class_members_toctree = True 67 | numpydoc_show_class_members = False 68 | 69 | # Add any paths that contain templates here, relative to this directory. 70 | templates_path = ['_templates'] 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This pattern also affects html_static_path and html_extra_path. 75 | exclude_patterns = [] 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'furo' 83 | html_title = 'ZCollection' 84 | 85 | # Add any paths that contain custom static files (such as style sheets) here, 86 | # relative to this directory. They are copied after the builtin static files, 87 | # so a file named "default.css" will overwrite the builtin "default.css". 
88 | html_static_path = ['_static'] 89 | 90 | intersphinx_mapping = { 91 | 'dask': ('https://docs.dask.org/en/latest/', None), 92 | 'fsspec': ('https://filesystem-spec.readthedocs.io/en/latest/', None), 93 | 'numpy': ('https://numpy.org/doc/stable/', None), 94 | 'python': ('https://docs.python.org/3/', None), 95 | 'xarray': ('https://docs.xarray.dev/en/stable/', None), 96 | 'zarr': ('https://zarr.readthedocs.io/en/stable', None), 97 | } 98 | 99 | # -- Extension configuration ------------------------------------------------- 100 | sphinx_gallery_conf = { 101 | 'examples_dirs': [HERE.parent.parent.joinpath('examples')], 102 | 'filename_pattern': r'[\\\/]ex_', 103 | 'pypandoc': False, 104 | } 105 | -------------------------------------------------------------------------------- /docs/source/release.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============= 3 | 4 | 2024.2.0 5 | -------- 6 | * Refactor merging module and improve temporary directory handling. 7 | * Bug fix: Add cache invalidation for updated partitions. 8 | 9 | 2024.1.0 10 | -------- 11 | * Bug fix: Partitions must not be sorted lexicographically. 12 | 13 | 2023.11.2 14 | --------- 15 | * Returns a list of added/deleted partitions. 16 | 17 | 2023.11.1 18 | --------- 19 | * Skip filesystem scans: Queries utilizing an indexer reuse known partitions for 20 | efficiency. 21 | 22 | 2023.11.0 23 | --------- 24 | * Evaluate an expression from a dataset. 25 | * Fix linter warning. 26 | * Validate partitions. 27 | 28 | 2023.10.0 29 | --------- 30 | * Merge time series with data gaps. 31 | * Fix Linux fork startup blocking test. 32 | * Correction of issues related to CI executions. 33 | * Added an option to specify the list of variables used by the callback 34 | function for updating a partition. 35 | * Classmethods removed from indexers. 36 | 37 | 2023.5.0 38 | -------- 39 | * Add missing copyrights. 40 | * Modularise code to reduce the number of lines per module. 41 | * Writing variables is limited to the worker being used. 42 | * Improve test coverage. 43 | * #9: Read the version attribute directly from the ``version.py`` module. 44 | * #8: Incomplete overlaps with more than one worker. 45 | * #7: Fix bug in the update method: if the user has selected multiple 46 | partitions, the selected variables must contain the updated variables. 47 | * #6: The parameter name for specifying the number of concurrent inserts is 48 | incorrect. 49 | * #3: Add a trim argument to the ``update`` method, like Dask's 50 | ``map_overlap``. 51 | * Update the documentation. 52 | * Refactor the code. 53 | * Loading data using Dask or Numpy. 54 | * Variable adds attributes to partitions. 55 | 56 | 2023.3.2 57 | -------- 58 | * Writing a partition with many variables is slow. 59 | * Writing metadata only in the collection's configuration. 60 | * Adding an inter-process lock. 61 | * If a variable has been modified since its initialization, the library throws a 62 | specific exception to warn the user. 63 | 64 | 2023.3.1 65 | -------- 66 | * Fixed a compatibility issue with fsspec 2023.3.0. 67 | 68 | 2023.3.0 69 | -------- 70 | * Apply an optional mask before querying an indexer. 71 | 72 | 2023.2.0 73 | -------- 74 | * Synchronize the view with the reference collection. 75 | * Support for Python 3.11. 76 | * Bug fixes. 77 | * Optimization of the insertion of new partitions. 78 | * Copy collection over different file systems. 79 | * Export Dataset to Zarr group.
80 | 81 | 2022.12.0/2022.12.1 82 | ------------------- 83 | 84 | Release on December 2, 2022 85 | 86 | * Write immutable variables of a dataset into a single group. 87 | * Possibility to update partitions using neighbor partitions (useful for 88 | filtering, for example). 89 | * Refactor methods overlapping partitions. 90 | * Update documentation. 91 | 92 | 2022.10.2/2022.10.1 93 | ------------------- 94 | 95 | Release on October 13, 2022 96 | 97 | * Add compatibility with Python 3.8. 98 | 99 | 2022.10.0 100 | --------- 101 | 102 | Release on October 7, 2022 103 | 104 | * Added an option to the method ``drop_partitions`` to drop partitions 105 | older than a specified time delta relative to the current time. 106 | 107 | 2022.8.0 108 | -------- 109 | 110 | Release on August 14, 2022 111 | 112 | * Support Python starting with 3.9. 113 | * Refactor convenience functions. 114 | * Refactor dataset & variables modules. 115 | * The indexer can return only the partition keys. 116 | * Optimization of dataset handling. 117 | * Bug fixes. 118 | 119 | 0.2 / 2020-04-04 120 | ---------------- 121 | 122 | Release on April 4, 2020 123 | 124 | * Installation from PyPi. 125 | * Unsigned integers are not handled. 126 | 127 | 0.1 / 2022-08-30 128 | ----------------- 129 | 130 | Release on March 30, 2020 131 | 132 | * First public version. 133 | -------------------------------------------------------------------------------- /zcollection/tests/test_dask_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing utilities 7 | ================= 8 | """ 9 | import dask.distributed 10 | import pytest 11 | 12 | from ..
import dask_utils 13 | # pylint: disable=unused-import # Need to import for fixtures 14 | from .cluster import dask_client, dask_cluster 15 | 16 | # pylint: disable=unused-import 17 | 18 | 19 | @pytest.mark.filterwarnings('ignore:Port \\d+ is already in use.*') 20 | def test_get_client_with_no_cluster(): 21 | """Test the get_client function with no cluster.""" 22 | with dask_utils.get_client() as client: 23 | assert isinstance(client, dask.distributed.Client) 24 | 25 | 26 | def test_get_client_with_cluster( 27 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 28 | ): 29 | """Test the get_client function with a cluster.""" 30 | with dask_utils.get_client() as client: 31 | assert isinstance(client, dask.distributed.Client) 32 | 33 | 34 | def test_dask_workers( 35 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 36 | ): 37 | """Test the dask_workers function.""" 38 | assert dask_utils.dask_workers(dask_client, cores_only=True) == len( 39 | dask_client.ncores()) # type: ignore 40 | assert dask_utils.dask_workers(dask_client, cores_only=False) == sum( 41 | item for item in dask_client.nthreads().values()) # type: ignore 42 | 43 | 44 | def test_split_sequence(): 45 | """Test the split_sequence function.""" 46 | assert list(dask_utils.split_sequence(list(range(10)), 2)) == [ 47 | [0, 1, 2, 3, 4], 48 | [5, 6, 7, 8, 9], 49 | ] 50 | assert list(dask_utils.split_sequence(list(range(10)), 3)) == [ 51 | [0, 1, 2, 3], 52 | [4, 5, 6], 53 | [7, 8, 9], 54 | ] 55 | assert list(dask_utils.split_sequence(list(range(10)), 4)) == [ 56 | [0, 1, 2], 57 | [3, 4, 5], 58 | [6, 7], 59 | [8, 9], 60 | ] 61 | assert list(dask_utils.split_sequence(list(range(10)), 5)) == [ 62 | [0, 1], 63 | [2, 3], 64 | [4, 5], 65 | [6, 7], 66 | [8, 9], 67 | ] 68 | assert list(dask_utils.split_sequence(list(range(10)), 6)) == [ 69 | [0, 1], 70 | [2, 3], 71 | [4, 5], 72 | [6, 7], 73 | [8], 74 | [9], 75 | ] 76 | assert list(dask_utils.split_sequence(list(range(10)), 7)) == [ 77 | [0, 1], 78 | [2, 3], 79 | [4, 5], 80 | [6], 81 | [7], 82 | [8], 83 | [9], 84 | ] 85 | assert list(dask_utils.split_sequence(list(range(10)), 8)) == [ 86 | [0, 1], 87 | [2, 3], 88 | [4], 89 | [5], 90 | [6], 91 | [7], 92 | [8], 93 | [9], 94 | ] 95 | assert list(dask_utils.split_sequence(list(range(10)), 9)) == [ 96 | [0, 1], 97 | [2], 98 | [3], 99 | [4], 100 | [5], 101 | [6], 102 | [7], 103 | [8], 104 | [9], 105 | ] 106 | assert list(dask_utils.split_sequence(list(range(10)), 10)) == [ 107 | [0], 108 | [1], 109 | [2], 110 | [3], 111 | [4], 112 | [5], 113 | [6], 114 | [7], 115 | [8], 116 | [9], 117 | ] 118 | assert list(dask_utils.split_sequence(list(range(10)), 11)) == [ 119 | [0], 120 | [1], 121 | [2], 122 | [3], 123 | [4], 124 | [5], 125 | [6], 126 | [7], 127 | [8], 128 | [9], 129 | ] 130 | assert list(dask_utils.split_sequence(list(range(10)))) == [ 131 | [0], 132 | [1], 133 | [2], 134 | [3], 135 | [4], 136 | [5], 137 | [6], 138 | [7], 139 | [8], 140 | [9], 141 | ] 142 | with pytest.raises(ValueError): 143 | list(dask_utils.split_sequence(list(range(10)), 0)) 144 | -------------------------------------------------------------------------------- /zcollection/tests/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """ 6 | Make test datasets 7 | ================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Iterator 12 | import itertools 13 | 14 | import numpy 15 | import zarr 16 | 17 | from .. import collection, dataset, partitioning 18 | from ..type_hints import NDArray 19 | 20 | #: First date of the test dataset. 21 | START_DATE = numpy.datetime64('2000-01-01', 'ns') 22 | #: Last date of the test dataset. 23 | END_DATE = numpy.datetime64('2000-06-30', 'ns') 24 | #: Delta between two dates. 25 | DELTA = numpy.timedelta64(72, 'h') 26 | #: Fill value. 27 | FILL_VALUE = 2147483647 28 | 29 | 30 | def make_dataset(dates: numpy.ndarray, 31 | measures: numpy.ndarray, 32 | fill_value: float | None = None, 33 | filters: tuple | None = None, 34 | delayed: bool = True) -> dataset.Dataset: 35 | """Create a dataset.""" 36 | array_class = (dataset.DelayedArray if delayed else dataset.Array) 37 | return dataset.Dataset( 38 | attrs=(dataset.Attribute(name='attr', value=1), ), 39 | variables=( 40 | array_class( 41 | name='time', 42 | data=dates, 43 | dimensions=('num_lines', ), 44 | attrs=(dataset.Attribute(name='attr', value=1), ), 45 | compressor=zarr.Blosc(), 46 | ), 47 | array_class(name='var1', 48 | data=measures, 49 | dimensions=('num_lines', 'num_pixels'), 50 | attrs=(dataset.Attribute(name='attr', value=1), ), 51 | fill_value=fill_value, 52 | filters=filters), 53 | array_class(name='var2', 54 | data=measures, 55 | dimensions=('num_lines', 'num_pixels'), 56 | attrs=(dataset.Attribute(name='attr', value=1), ), 57 | fill_value=fill_value, 58 | filters=filters), 59 | ), 60 | ) 61 | 62 | 63 | def create_test_dataset(delayed: bool = True) -> Iterator[dataset.Dataset]: 64 | """Create a temporal dataset.""" 65 | 66 | dates: NDArray = numpy.arange(START_DATE, END_DATE, DELTA) 67 | indices: NDArray = numpy.arange(0, len(dates)) 68 | 69 | for item in numpy.array_split(dates, 12): 70 | mask: NDArray = (dates >= item[0]) & (dates <= item[-1]) 71 | measures = numpy.vstack((indices[mask], ) * 25).T 72 | 73 | yield make_dataset(item, measures, delayed=delayed) 74 | 75 | 76 | def create_test_dataset_with_fillvalue( 77 | delayed: bool = True) -> Iterator[dataset.Dataset]: 78 | """Create a dataset with a fixed scale offset filter and fill values.""" 79 | 80 | dates: NDArray = numpy.arange(START_DATE, END_DATE, DELTA) 81 | measures: NDArray = numpy.arange(0, len(dates), dtype=numpy.float64) 82 | measures[measures % 2 == 0] = FILL_VALUE 83 | measures = numpy.vstack((measures, ) * 25).T * 1e-4 84 | 85 | yield make_dataset(dates, 86 | measures, 87 | delayed=delayed, 88 | fill_value=FILL_VALUE * 1e-4, 89 | filters=(zarr.FixedScaleOffset(scale=10000, 90 | offset=0, 91 | dtype=' collection.Collection: 98 | """Create a collection.""" 99 | zds: dataset.Dataset = next( 100 | create_test_dataset_with_fillvalue( 101 | delayed=delayed) if with_fillvalue else create_test_dataset( 102 | delayed=delayed)) 103 | zcollection = collection.Collection('time', 104 | zds.metadata(), 105 | partitioning.Date(('time', ), 'D'), 106 | str(tested_fs.collection), 107 | filesystem=tested_fs.fs) 108 | zcollection.insert(zds) 109 | return zcollection 110 | 111 | 112 | #: List of filesystems and datasets to test. 
113 | FILE_SYSTEM_DATASET = list( 114 | itertools.product([ 115 | 'local_fs', 116 | 's3_fs', 117 | ], [ 118 | create_test_dataset, 119 | create_test_dataset_with_fillvalue, 120 | ])) 121 | -------------------------------------------------------------------------------- /zcollection/tests/s3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixtures for testing S3 using the pytest and minio. 7 | =================================================== 8 | """ 9 | from typing import Iterator, Literal 10 | import os 11 | import pathlib 12 | import shlex 13 | import subprocess 14 | import time 15 | 16 | import botocore.client 17 | import botocore.session 18 | import pytest 19 | import requests 20 | import s3fs 21 | 22 | #: Listen port 23 | PORT = 5555 24 | #: Listen address 25 | ENDPOINT: str = f'127.0.0.1:{PORT}' 26 | #: URI for minio 27 | ENDPOINT_URI: str = f'http://{ENDPOINT}' 28 | #: Credential for minio 29 | CREDENTIAL = '25219d58-f6c6-11eb-922c-770d49cd18e4' 30 | 31 | 32 | def have_minio() -> Literal[True]: 33 | """Check if minio is available.""" 34 | try: 35 | subprocess.check_output(['minio', '--version']) 36 | return True 37 | except: 38 | raise ImportError('minio: command not found') from None 39 | 40 | 41 | have_minio() 42 | 43 | 44 | def is_minio_up(timeout: float) -> bool: 45 | """Check if minio server is up.""" 46 | try: 47 | response = requests.get(ENDPOINT_URI, timeout=timeout) 48 | if response.status_code == 403: 49 | return True 50 | except: # pylint: disable=bare-except 51 | pass 52 | return False 53 | 54 | 55 | def wait_for_minio_to_start(timeout: float) -> None: 56 | """Wait for the minio server to start.""" 57 | while timeout > 0: 58 | try: 59 | response = requests.get(ENDPOINT_URI, timeout=1) 60 | if response.status_code == 403: 61 | return 62 | except: # pylint: disable=bare-except 63 | pass 64 | timeout -= 0.1 65 | time.sleep(0.1) 66 | raise RuntimeError("minio server didn't start") 67 | 68 | 69 | @pytest.fixture() 70 | def s3_base(tmpdir, pytestconfig) -> Iterator[None]: 71 | """Launch minio server.""" 72 | if pytestconfig.getoption('s3') is False: 73 | pytest.skip('S3 disabled') 74 | if is_minio_up(timeout=1): 75 | raise RuntimeError('minio server already up') 76 | os.environ['MINIO_CACHE_AFTER'] = '1' 77 | os.environ['MINIO_CACHE'] = 'on' 78 | os.environ['MINIO_ROOT_PASSWORD'] = CREDENTIAL 79 | os.environ['MINIO_ROOT_USER'] = CREDENTIAL 80 | # pylint: disable=consider-using-with 81 | process = subprocess.Popen( 82 | shlex.split(f'minio server --quiet --address {ENDPOINT} ' 83 | f"--console-address :{PORT+1} '{tmpdir!s}'")) 84 | 85 | try: 86 | wait_for_minio_to_start(timeout=30) 87 | yield 88 | finally: 89 | process.terminate() 90 | process.wait() 91 | # pylint: enable=consider-using-with 92 | 93 | 94 | def make_bucket(name) -> None: 95 | """Create a bucket.""" 96 | session: botocore.session.Session = botocore.session.get_session() 97 | client = session.create_client( 98 | 's3', 99 | aws_access_key_id=CREDENTIAL, 100 | aws_secret_access_key=CREDENTIAL, 101 | endpoint_url=ENDPOINT_URI, 102 | region_name='us-east-1', 103 | config=botocore.client.Config(signature_version='s3v4')) 104 | client.create_bucket(Bucket=name, ACL='public-read') 105 | 106 | 107 | # pylint: disable=redefined-outer-name, unused-argument # pytest fixture 108 | @pytest.fixture() 109 | def 
s3(s3_base) -> Iterator[s3fs.core.S3FileSystem]: 110 | """Create a S3 file system instance.""" 111 | s3fs.core.S3FileSystem.clear_instance_cache() 112 | fs = s3fs.core.S3FileSystem(anon=False, 113 | key=CREDENTIAL, 114 | secret=CREDENTIAL, 115 | client_kwargs={'endpoint_url': ENDPOINT_URI}) 116 | fs.invalidate_cache() 117 | yield fs 118 | # pylint: enable=redefined-outer-name, unused-argument 119 | 120 | 121 | class S3Path(type(pathlib.Path())): # type: ignore[misc] 122 | """Handle S3 path on multiple platforms.""" 123 | 124 | def __str__(self) -> str: 125 | return super().__str__().replace('\\', '/') 126 | 127 | 128 | class S3: 129 | """S3 filesystem.""" 130 | #: Bucket ID 131 | ID = 0 132 | 133 | # pylint: disable=redefined-outer-name # pytest fixture 134 | def __init__(self, s3: s3fs.core.S3FileSystem) -> None: 135 | name: str = f'bucket{S3.ID}' 136 | S3.ID += 1 137 | make_bucket(name) 138 | self.collection: S3Path = S3Path(name).joinpath('collection') 139 | self.view: S3Path = S3Path(name).joinpath('view') 140 | self.fs: s3fs.core.S3FileSystem = s3 141 | 142 | # pylint: enable=redefined-outer-name 143 | -------------------------------------------------------------------------------- /zcollection/fs_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | File system tools 7 | ================= 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Iterator, Sequence 12 | import os 13 | 14 | import fsspec 15 | 16 | #: Path separator 17 | SEPARATOR = '/' 18 | 19 | 20 | def join_path(*args: str) -> str: 21 | """Join path elements.""" 22 | return SEPARATOR.join(args) 23 | 24 | 25 | def normalize_path(fs: fsspec.AbstractFileSystem, path: str) -> str: 26 | """Normalize the path. 27 | 28 | Args: 29 | fs: file system object 30 | path: path to test 31 | 32 | Returns: 33 | Normalized path. 34 | """ 35 | # pylint: disable=protected-access 36 | # There is no public method to perform this operation. 37 | path = fs._strip_protocol(path) # type: ignore[return-value] 38 | # pylint: enable=protected-access 39 | if path == '': 40 | path = fs.sep 41 | if fs.protocol in ('file', 'memory'): 42 | return os.path.normpath(path) 43 | return path 44 | 45 | 46 | def get_fs( 47 | filesystem: fsspec.AbstractFileSystem | str | None = None 48 | ) -> fsspec.AbstractFileSystem: 49 | """Return the file system object from the input. 50 | 51 | Args: 52 | filesystem: file system object or file system name 53 | 54 | Returns: 55 | File system object. 56 | 57 | Example: 58 | >>> from fsspec.implementations.local import LocalFileSystem 59 | >>> get_fs("hdfs") 60 | >>> get_fs(LocalFileSystem("/tmp/swot")) 61 | """ 62 | filesystem = filesystem or 'file' 63 | return (fsspec.filesystem(filesystem) 64 | if isinstance(filesystem, str) else filesystem) 65 | 66 | 67 | def fs_walk( 68 | fs: fsspec.AbstractFileSystem, 69 | path: str, 70 | sort: bool = False, 71 | ) -> Iterator[tuple[str, list[str], list[str]]]: 72 | """Return the list of files and directories in a directory. 73 | 74 | Args: 75 | fs: file system object 76 | path: path to the directory 77 | sort: if True, the list of files and directories is sorted 78 | alphabetically 79 | 80 | Returns: 81 | Iterator of (path, directories, files). 
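Example (an illustrative sketch; the directory ``/tmp/swot`` is only a placeholder): >>> import fsspec >>> fs = fsspec.filesystem('file') >>> for root, dirs, files in fs_walk(fs, '/tmp/swot', sort=True): ... print(root, dirs, files)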
82 | """ 83 | dirs: list[str] 84 | files: list[str] 85 | 86 | dirs, files = [], [] 87 | try: 88 | listing: list[dict[str, Any]] = fs.ls(path, detail=True) 89 | except (FileNotFoundError, OSError): 90 | yield '', [], [] 91 | return 92 | 93 | for is_dir, name in ((info['type'] == 'directory', info['name']) 94 | for info in listing): 95 | # each info name must be at least [path]/part , but here 96 | # we check also for names like [path]/part/ 97 | dirs.append(name) if is_dir else files.append( 98 | name.rsplit(SEPARATOR, 1)[-1]) 99 | 100 | def sort_sequence(sequence: list[str]) -> list[str]: 101 | """Sort the sequence if the user wishes.""" 102 | return list(sorted(sequence)) if sort else sequence 103 | 104 | dirs = sort_sequence(dirs) 105 | yield path.rstrip(SEPARATOR), dirs, sort_sequence(files) 106 | 107 | for item in dirs: 108 | yield from fs_walk(fs, item, sort=sort) 109 | 110 | 111 | def copy_file( 112 | source: str, 113 | target: str, 114 | fs_source: fsspec.AbstractFileSystem, 115 | fs_target: fsspec.AbstractFileSystem, 116 | ) -> None: 117 | """Copy a file from one location to another. 118 | 119 | Args: 120 | source: The name of the source file. 121 | target: The name of the target file. 122 | fs_source: The file system that the source file is stored on. 123 | fs_target: The file system that the target file is stored on. 124 | """ 125 | with fs_source.open(source, 'rb') as source_stream: 126 | with fs_target.open(target, 'wb') as target_stream: 127 | target_stream.write(source_stream.read()) # type: ignore[arg-type] 128 | 129 | 130 | def copy_files( 131 | source: Sequence[str], 132 | target: str, 133 | fs_source: fsspec.AbstractFileSystem, 134 | fs_target: fsspec.AbstractFileSystem, 135 | ) -> None: 136 | """Copy a list of files from one location to another. 137 | 138 | Args: 139 | source: The names of the source files. 140 | target: The name of the target directory. 141 | fs_source: The file system that the source files are stored on. 142 | fs_target: The file system that the target directory is stored on. 143 | """ 144 | tuple( 145 | map( 146 | lambda path: copy_file(path, 147 | join_path(target, os.path.basename(path)), 148 | fs_source, fs_target), source)) 149 | 150 | 151 | def copy_tree( 152 | source: str, 153 | target: str, 154 | fs_source: fsspec.AbstractFileSystem, 155 | fs_target: fsspec.AbstractFileSystem, 156 | ) -> None: 157 | """Copy a directory tree from one location to another. 158 | 159 | Args: 160 | source: The name of the source directory. 161 | target: The name of the target directory. 162 | fs_source: The file system that the source directory is stored on. 163 | fs_target: The file system that the target directory is stored on. 164 | 165 | Raises: 166 | ValueError: If the target already exists. 167 | """ 168 | if fs_target.exists(target): 169 | raise ValueError(f'Target {target} already exists') 170 | fs_target.mkdir(target) 171 | for root, dirs, files in tuple(fs_walk(fs_source, source)): 172 | for name in files: 173 | source_path: str = join_path(root, name) 174 | copy_file(source_path, 175 | join_path(target, os.path.relpath(source_path, source)), 176 | fs_source, fs_target) 177 | for source_path in dirs: 178 | fs_target.mkdir( 179 | join_path(target, os.path.relpath(source_path, source))) 180 | -------------------------------------------------------------------------------- /zcollection/merging/time_series.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. 
Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Merging a time series 7 | ===================== 8 | """ 9 | import numpy 10 | 11 | from . import period 12 | from .. import dataset 13 | from ..type_hints import NDArray 14 | 15 | 16 | def _merge_time_series( 17 | existing_ds: dataset.Dataset, 18 | inserted_ds: dataset.Dataset, 19 | axis: str, 20 | partitioning_dim: str, 21 | ) -> dataset.Dataset: 22 | """Merge two time series together. 23 | 24 | See :func:`merge_time_series` for 25 | details. 26 | """ 27 | existing_axis: NDArray = existing_ds.variables[axis].values 28 | inserted_axis: NDArray = inserted_ds.variables[axis].values 29 | existing_period = period.Period(existing_axis.min(), 30 | existing_axis.max(), 31 | within=True) 32 | inserted_period = period.Period(inserted_axis.min(), 33 | inserted_axis.max(), 34 | within=True) 35 | 36 | relation: period.PeriodRelation = inserted_period.get_relation( 37 | existing_period) 38 | 39 | # The new piece is located before the existing data. 40 | if relation.is_before(): 41 | return inserted_ds.concat(existing_ds, partitioning_dim) 42 | 43 | # The new piece is located after the existing data. 44 | if relation.is_after(): 45 | return existing_ds.concat(inserted_ds, partitioning_dim) 46 | 47 | # The new piece replaces the old one. 48 | if relation.contains(): 49 | return inserted_ds 50 | 51 | intersection: period.Period = inserted_period.intersection(existing_period) 52 | 53 | # The new piece is located before, but there is an overlap 54 | # between the two datasets. 55 | if relation.is_before_overlapping(): 56 | # pylint: disable=comparison-with-callable 57 | indices = numpy.where( 58 | # comparison between ndarray and datetime64 59 | existing_axis > intersection.end())[0] # type: ignore 60 | # pylint: enable=comparison-with-callable 61 | return inserted_ds.concat( 62 | existing_ds.isel({partitioning_dim: indices}), partitioning_dim) 63 | 64 | # The new piece is located after, but there is an overlap 65 | # between the two datasets. 66 | if relation.is_after_overlapping(): 67 | # pylint: disable=comparison-with-callable 68 | indices = numpy.where( 69 | # comparison between ndarray and datetime64 70 | existing_axis < intersection.begin)[0] # type: ignore 71 | # pylint: enable=comparison-with-callable 72 | return existing_ds.isel({ 73 | partitioning_dim: indices 74 | }).concat(inserted_ds, partitioning_dim) 75 | 76 | assert relation.is_inside() 77 | # comparison between ndarray and datetime64 78 | index = numpy.where(existing_axis < intersection.begin)[0] # type: ignore 79 | before: dataset.Dataset = existing_ds.isel( 80 | {partitioning_dim: slice(0, index[-1] + 1, None)}) 81 | 82 | # pylint: disable=comparison-with-callable 83 | # comparison between ndarray and datetime64 84 | index = numpy.where(existing_axis > intersection.end())[0] # type: ignore 85 | # pylint: enable=comparison-with-callable 86 | after: dataset.Dataset = existing_ds.isel( 87 | {partitioning_dim: slice(index[0], index[-1] + 1, None)}) 88 | 89 | return before.concat((inserted_ds, after), partitioning_dim) 90 | 91 | 92 | def merge_time_series( 93 | existing_ds: dataset.Dataset, 94 | inserted_ds: dataset.Dataset, 95 | axis: str, 96 | partitioning_dim: str, 97 | **kwargs, 98 | ) -> dataset.Dataset: 99 | """Merge two time series together. 100 | 101 | Replaces only the intersection between the existing dataset and the new one, 102 | and keeps the existing records that have not been updated.
103 | 104 | The following figure illustrates the implemented algorithm. Column ``A`` 105 | represents the new data and column ``B``, the data already present. The 106 | different cells in the columns represent the hours on the day of the 107 | measurements. The merge result is shown in column ``C``. It contains the 108 | measurements of the column ``A`` or column ``B`` if column ``A`` does not 109 | replace them. 110 | 111 | .. figure:: ../images/merge_time_series.svg 112 | :align: center 113 | :width: 50% 114 | 115 | Args: 116 | existing_ds: The existing dataset. 117 | inserted_ds: The inserted dataset. 118 | axis: The axis to merge on. 119 | partitioning_dim: The name of the partitioning dimension. 120 | kwargs: 121 | tolerance: This parameter sets the tolerance level for detecting 122 | data gaps in the inserted axis dataset. If set to ``None``, 123 | the algorithm will not check for data gaps in the inserted 124 | dataset. 125 | 126 | Returns: 127 | The merged dataset. 128 | """ 129 | tolerance = kwargs.get('tolerance', None) 130 | index: NDArray 131 | 132 | # Check if the inserted dataset contains data gaps. 133 | if tolerance is not None: 134 | inserted_axis: NDArray = inserted_ds.variables[axis].values 135 | delta: NDArray = numpy.concatenate( 136 | (numpy.array([0]), numpy.diff(numpy.roll(inserted_axis, 0)))) 137 | index = numpy.concatenate( 138 | (numpy.array([0], numpy.int64), numpy.where(delta > tolerance)[0], 139 | numpy.array([inserted_axis.size], numpy.int64))) 140 | else: 141 | index = numpy.array([], dtype=numpy.int64) 142 | 143 | if index.size > 1: 144 | # Split the inserted dataset into several datasets between the data 145 | # gaps. 146 | for ix in range(len(index) - 1): 147 | existing_ds = _merge_time_series( 148 | existing_ds, 149 | inserted_ds.isel( 150 | {partitioning_dim: slice(index[ix], index[ix + 1], None)}), 151 | axis, partitioning_dim) 152 | return existing_ds 153 | return _merge_time_series(existing_ds, inserted_ds, axis, partitioning_dim) 154 | -------------------------------------------------------------------------------- /zcollection/partitioning/sequence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Partitioning a sequence of variables 7 | ==================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, ClassVar, Iterator 12 | 13 | import dask.array.core 14 | import dask.array.routines 15 | import numpy 16 | 17 | from . import abc 18 | from ..type_hints import ArrayLike, NDArray 19 | 20 | 21 | def _is_monotonic(arr: NDArray) -> bool: 22 | """Check if the array is monotonic. 23 | 24 | The matrix will be sorted in the reverse order of the partitioning keys 25 | (column in the matrix). If the order of the matrix is unchanged, the 26 | different partitioning columns are monotonic. 27 | 28 | Args: 29 | arr: The array to check. 30 | 31 | Returns: 32 | True if the array is monotonic, False otherwise. 33 | """ 34 | # `reversed` because `numpy.lexsort` wants the most significant key last. 
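# For example, the matrix [[1, 1], [1, 2], [2, 1]] gives the lexsort order (0, 1, 2): the rows are already in order, so the keys are monotonic.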
35 | values: list[NDArray] = [ 36 | arr[:, ix] for ix in reversed(range(arr.shape[1])) 37 | ] 38 | sort_order: NDArray = numpy.lexsort(numpy.array(values)) 39 | return numpy.all(abc.difference(sort_order) > 0) # type: ignore 40 | 41 | 42 | def _unique(arr: ArrayLike, is_delayed: bool) -> tuple[NDArray, NDArray]: 43 | """Return unique elements and their indices. 44 | 45 | Args: 46 | arr: Array of elements. 47 | is_delayed: If True, the array is delayed. 48 | Returns: 49 | Tuple of unique elements and their indices. 50 | """ 51 | index: NDArray 52 | indices: NDArray 53 | 54 | if is_delayed: 55 | index, indices = abc.unique(arr) # type: ignore[arg-type] 56 | if not _is_monotonic(index): 57 | raise ValueError('index is not monotonic') 58 | return index, indices 59 | return abc.unique_and_check_monotony(arr) 60 | 61 | 62 | class Sequence(abc.Partitioning): 63 | """Initialize a partitioning scheme for a sequence of variables. 64 | 65 | A sequence is a combination of variables constituting unique monotonic keys. 66 | For example, the orbit number (``cycle``) and the half-orbit number 67 | (``pass``) of a satellite. 68 | 69 | Args: 70 | variables: A list of strings representing the variables to be used for 71 | partitioning. 72 | dtype: An optional sequence of strings representing the data type used 73 | to store variable values in a binary representation without data 74 | loss. Must be one of the following allowed data types: ``int8``, 75 | ``int16``, ``int32``, ``int64``, ``uint8``, ``uint16``, ``uint32``, 76 | ``uint64``. If not provided, defaults to ``int64`` for all 77 | variables. 78 | 79 | Raises: 80 | ValueError: If the periodicity is not valid. 81 | 82 | Example: 83 | >>> partitioning = Sequence(["a", "b", "c"], (None, 10, 10)) 84 | """ 85 | #: The ID of the partitioning scheme. 86 | ID: ClassVar[str] = 'Sequence' 87 | 88 | # pylint: disable=arguments-differ 89 | # False positive: `self` is used in the signature. 90 | @staticmethod 91 | def _split(variables: dict[str, ArrayLike]) -> Iterator[abc.Partition]: 92 | """Split the variables constituting the partitioning into partitioning 93 | schemes.""" 94 | index: NDArray 95 | indices: NDArray 96 | matrix: dask.array.core.Array | NDArray 97 | 98 | # Determine if the variables are handled by Dask. 99 | is_delayed: bool = any( 100 | isinstance(item, dask.array.core.Array) 101 | for item in variables.values()) 102 | 103 | # Combines the arrays of variable values into a transposed matrix. 104 | matrix = dask.array.routines.vstack(tuple( 105 | variables.values())).transpose() if is_delayed else numpy.vstack( 106 | tuple(variables.values())).transpose() 107 | if matrix.dtype.kind not in 'iu': 108 | raise TypeError('The variables must be integer') 109 | 110 | index, indices = _unique(matrix, is_delayed) # type: ignore[arg-type] 111 | indices = abc.concatenate_item(indices, matrix.shape[0]) 112 | 113 | fields = tuple(variables.keys()) 114 | # pylint: disable=unnecessary-lambda-assignment 115 | # We want to reference a lambda function, not assign it to a variable. 
116 | if len(fields) == 1: 117 | concat: Any = lambda fields, keys: (fields + keys, ) 118 | else: 119 | concat = lambda fields, keys: tuple(zip(fields, keys)) 120 | # pylint: enable=unnecessary-lambda-assignment 121 | 122 | return ((concat(fields, 123 | tuple(item)), slice(start, indices[ix + 1], None)) 124 | for item, (ix, start) in zip(index, enumerate(indices[:-1]))) 125 | # pylint: enable=arguments-differ 126 | 127 | def encode( 128 | self, 129 | partition: tuple[tuple[str, int], ...], 130 | ) -> tuple[int, ...]: 131 | """Encode a partitioning scheme to the handled values. 132 | 133 | Args: 134 | partition: The partitioning scheme to be encoded. 135 | 136 | Returns: 137 | The encoded partitioning scheme. 138 | 139 | Example: 140 | >>> partitioning = Sequence(["a", "b", "c"]) 141 | >>> fields = partitioning.parse("a=100/b=10/c=1") 142 | >>> fields 143 | (('a', 100), ('b', 10), ('c', 1)) 144 | >>> partitioning.encode(fields) 145 | (100, 10, 1) 146 | """ 147 | return tuple(value 148 | for _, value in self.parse(self.join(partition, '/'))) 149 | 150 | def decode(self, values: tuple[int, ...]) -> tuple[tuple[str, int], ...]: 151 | """Decode a partitioning scheme. 152 | 153 | Args: 154 | values: The encoded partitioning scheme. 155 | 156 | Returns: 157 | The decoded partitioning scheme. 158 | 159 | Example: 160 | >>> partitioning = Sequence(["a", "b", "c"]) 161 | >>> partitioning.decode((100, 10, 1)) 162 | (('a', 100), ('b', 10), ('c', 1)) 163 | """ 164 | return tuple( 165 | (key, value) for key, value in zip(self.variables, values)) 166 | -------------------------------------------------------------------------------- /zcollection/merging/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Handle merging of datasets of a partition. 7 | ========================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Protocol 12 | import hashlib 13 | import shutil 14 | 15 | import fsspec 16 | import fsspec.implementations.local 17 | import zarr.storage 18 | 19 | from zcollection import fs_utils 20 | 21 | from .. import dataset, storage, sync 22 | from .time_series import merge_time_series 23 | 24 | __all__ = ('MergeCallable', 'perform', 'merge_time_series') 25 | 26 | 27 | #: pylint: disable=too-few-public-methods,duplicate-code 28 | class MergeCallable(Protocol): 29 | """Protocol to merge datasets stored in a partition. 30 | 31 | A merge callable is a function that accepts an existing dataset 32 | present in a partition, a new dataset to merge, the partitioning 33 | dimension and the axis to merge on. It returns the merged dataset. 34 | """ 35 | 36 | def __call__( 37 | self, 38 | existing_ds: dataset.Dataset, 39 | inserted_ds: dataset.Dataset, 40 | axis: str, 41 | partitioning_dim: str, 42 | **kwargs, 43 | ) -> dataset.Dataset: # pylint: disable=duplicate-code 44 | """Call the partition function. 45 | 46 | Args: 47 | existing_ds: The existing dataset. 48 | inserted_ds: The inserted dataset. 49 | axis: The axis to merge on. 50 | partitioning_dim: The partitioning dimension. 51 | **kwargs: Additional keyword arguments. 52 | 53 | Returns: 54 | The merged dataset. 55 | """ 56 | # pylint: disable=unnecessary-ellipsis 57 | # Ellipsis is necessary to make the function signature match the 58 | # protocol. 59 | ... 
# pragma: no cover 60 | # pylint: enable=unnecessary-ellipsis 61 | 62 | #: pylint: enable=too-few-public-methods,duplicate-code 63 | 64 | 65 | def _rename( 66 | fs: fsspec.AbstractFileSystem, 67 | source: str, 68 | dest: str, 69 | ) -> None: 70 | """Rename a directory on a file system. 71 | 72 | Args: 73 | fs: The file system. 74 | source: The source directory. 75 | dest: The destination directory. 76 | """ 77 | if isinstance(fs, fsspec.implementations.local.LocalFileSystem): 78 | # The fsspec implementation of the local file system copies the source 79 | # directory to the destination directory and then removes the source 80 | # directory. This is not efficient, so we use the shutil 81 | # implementation to rename the directory instead. 82 | shutil.rmtree(dest, ignore_errors=True) 83 | shutil.move(source, dest) 84 | return 85 | 86 | fs.rm(dest, recursive=True) 87 | fs.mv(source, dest, recursive=True) 88 | 89 | 90 | def _extract_root_dirname(dirname: str, sep: str) -> str: 91 | """Extracts the root directory name from a partition name.""" 92 | parts = filter(lambda x: '=' not in x, dirname.split(sep)) 93 | return sep.join(parts) 94 | 95 | 96 | def _update_fs( 97 | dirname: str, 98 | zds: dataset.Dataset, 99 | fs: fsspec.AbstractFileSystem, 100 | *, 101 | synchronizer: sync.Sync | None = None, 102 | ) -> None: 103 | """Updates a dataset stored in a partition. 104 | 105 | Args: 106 | dirname: The name of the partition. 107 | zds: The dataset to update. 108 | fs: The file system that the partition is stored on. 109 | synchronizer: The instance handling access to critical resources. 110 | """ 111 | # Building a temporary directory to store the new data. The name of the 112 | # temporary directory is the hash of the partition name. 113 | temp: str = fs_utils.join_path( 114 | _extract_root_dirname(dirname, fs.sep), 115 | hashlib.sha256(dirname.encode()).hexdigest()) 116 | if fs.exists(temp): 117 | fs.rm(temp, recursive=True) 118 | 119 | # Initializing Zarr group 120 | zarr.storage.init_group(store=fs.get_mapper(temp)) 121 | 122 | # Writing new data. 123 | try: 124 | # The synchronization is done by the caller. 125 | storage.write_zarr_group(zds, temp, fs, synchronizer or sync.NoSync()) 126 | except Exception: 127 | # The "write_zarr_group" method throws the exception once all scheduled 128 | # tasks are finished. So here we can safely delete the temporary directory. 129 | fs.rm(temp, recursive=True) 130 | raise 131 | 132 | # Rename the existing entry on the file system 133 | _rename(fs, temp, dirname) 134 | 135 | 136 | def perform( 137 | ds_inserted: dataset.Dataset, 138 | dirname: str, 139 | axis: str, 140 | fs: fsspec.AbstractFileSystem, 141 | partitioning_dim: str, 142 | *, 143 | delayed: bool = True, 144 | merge_callable: MergeCallable | None, 145 | synchronizer: sync.Sync | None = None, 146 | **kwargs, 147 | ) -> None: 148 | """Merges a new dataset with an existing partition. 149 | 150 | Args: 151 | ds_inserted: The dataset to merge. 152 | dirname: The name of the partition. 153 | axis: The axis to merge on. 154 | fs: The file system on which the partition is stored. 155 | partitioning_dim: The partitioning dimension. 156 | delayed: If True, the existing dataset is loaded lazily. Defaults to 157 | True. 158 | merge_callable: The merge callable. If None, the inserted dataset 159 | overwrites the existing dataset stored in the partition. 160 | Defaults to None. 161 | synchronizer: The instance handling access to critical resources. 162 | Defaults to None.
163 | **kwargs: Additional keyword arguments are passed through to the merge 164 | callable. 165 | """ 166 | if merge_callable is None: 167 | zds = ds_inserted 168 | else: 169 | ds = storage.open_zarr_group(dirname, fs, delayed=delayed) 170 | # The dataset read from the partition does not contain the insertion 171 | # properties. These properties might be lost in the merge_callable 172 | # depending on which dataset is used. 173 | ds.copy_properties(ds=ds_inserted) 174 | zds = merge_callable(ds, ds_inserted, axis, partitioning_dim, **kwargs) 175 | _update_fs(dirname, zds, fs, synchronizer=synchronizer) 176 | -------------------------------------------------------------------------------- /zcollection/tests/test_compressed_array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """Tests for :class:`zcollection.compressed_array.CompressedArray`.""" 6 | from typing import Any 7 | 8 | import dask.array.core 9 | import dask.array.creation 10 | import dask.array.random 11 | import dask.array.reductions 12 | import dask.array.routines 13 | import dask.array.ufunc 14 | import dask.array.utils 15 | import numpy 16 | import pytest 17 | 18 | from ..compressed_array import CompressedArray 19 | # pylint: disable=unused-import # Need to import for fixtures 20 | from .cluster import dask_client, dask_cluster 21 | 22 | # pylint: enable=unused-import 23 | 24 | # pylint: disable=unnecessary-lambda # We keep the lambdas for readability 25 | #: Functions to test 26 | functions = [ 27 | lambda x: x, 28 | dask.array.ufunc.expm1, 29 | lambda x: 2 * x, 30 | lambda x: x / 2, 31 | lambda x: x**2, 32 | lambda x: x + x, 33 | lambda x: x * x, 34 | lambda x: x[0], 35 | lambda x: x[:, 1], 36 | lambda x: x[:1, :, 1:3], 37 | lambda x: x.T, 38 | lambda x: dask.array.routines.transpose(x, (1, 2, 0)), 39 | dask.array.reductions.nanmean, 40 | lambda x: dask.array.reductions.nanmean(x, axis=1), 41 | dask.array.reductions.nanmax, 42 | dask.array.reductions.nanmin, 43 | dask.array.reductions.nanprod, 44 | dask.array.reductions.nanstd, 45 | dask.array.reductions.nanvar, 46 | dask.array.reductions.nansum, 47 | lambda x: dask.array.reductions.median(x, axis=0), 48 | dask.array.reductions.nanargmax, 49 | dask.array.reductions.nanargmin, 50 | lambda x: dask.array.reductions.nancumprod(x, axis=0), 51 | lambda x: dask.array.reductions.nancumsum(x, axis=0), 52 | lambda x: x.sum(), 53 | lambda x: x.moment(order=0), 54 | lambda x: x.mean(), 55 | lambda x: x.mean(axis=1), 56 | lambda x: x.std(), 57 | lambda x: x.std(axis=1), 58 | lambda x: x.var(), 59 | lambda x: x.var(axis=1), 60 | lambda x: x.dot(numpy.arange(x.shape[-1])), 61 | lambda x: x.dot(numpy.eye(x.shape[-1])), 62 | lambda x: dask.array.routines.tensordot( 63 | x, numpy.ones(x.shape[:2]), axes=[(0, 1), 64 | (0, 1)]), # type: ignore[arg-type] 65 | lambda x: x.sum(axis=0), 66 | lambda x: x.max(axis=0), 67 | lambda x: x.min(axis=0), 68 | lambda x: x.sum(axis=(1, 2)), 69 | lambda x: x.astype(numpy.complex128), 70 | lambda x: x.map_blocks(lambda x: x * 2), 71 | lambda x: x.map_overlap( 72 | lambda x: x * 2, depth=0, trim=True, boundary='none'), 73 | lambda x: x.map_overlap( 74 | lambda x: x * 2, depth=0, trim=False, boundary='none'), 75 | lambda x: x.round(1), 76 | lambda x: x.reshape((x.shape[0] * x.shape[1], x.shape[2])), 77 | abs, 78 | lambda x: x > 0.5, 79 | lambda x: x.rechunk((4, 4, 4)), 80 | lambda x: 
x.rechunk((2, 2, 1)), 81 | numpy.isneginf, 82 | numpy.isposinf, 83 | ] 84 | # pylint: enable=unnecessary-lambda 85 | 86 | 87 | @pytest.mark.filterwarnings( 88 | 'ignore:Casting complex values to real discards the imaginary part') 89 | @pytest.mark.parametrize('func', functions) 90 | def test_basic( 91 | func, 92 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 93 | ) -> None: 94 | """Test basic functionality.""" 95 | values: numpy.ndarray = numpy.random.random((2, 3, 4)) 96 | arr: dask.array.core.Array = dask.array.core.from_array( 97 | CompressedArray(values), chunks='auto') 98 | compressed_array: numpy.ndarray = func(arr).compute() 99 | array: numpy.ndarray = func(dask.array.core.from_array(values)).compute() 100 | assert compressed_array.shape == array.shape 101 | assert numpy.allclose(compressed_array, array) 102 | 103 | 104 | def test_metadata( 105 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 106 | ) -> None: 107 | """Test metadata.""" 108 | y: dask.array.core.Array = dask.array.random.random((10, 10), 109 | chunks=(5, 5)) 110 | z = CompressedArray(y.compute()) 111 | y = y.map_blocks(CompressedArray) # type: ignore[assignment] 112 | 113 | # pylint: disable=protected-access 114 | assert isinstance(y._meta, numpy.ndarray) 115 | assert isinstance((y + 1)._meta, numpy.ndarray) 116 | assert isinstance(y[:5, ::2]._meta, numpy.ndarray) 117 | assert isinstance( 118 | y.rechunk((2, 2))._meta, # type: ignore[arg-type] 119 | numpy.ndarray) 120 | assert isinstance((y - z), numpy.ndarray) 121 | assert isinstance(y.persist()._meta, numpy.ndarray) 122 | # pylint: enable=protected-access 123 | 124 | 125 | def test_from_delayed_meta( 126 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 127 | ) -> None: 128 | """Test from_delayed with meta.""" 129 | 130 | def f() -> CompressedArray: 131 | return CompressedArray(numpy.eye(3)) 132 | 133 | d: Any = dask.delayed(f)() # type: ignore 134 | x: dask.array.core.Array = dask.array.core.from_delayed( 135 | d, shape=(3, 3), meta=CompressedArray(numpy.eye(1))) 136 | assert numpy.all(x.compute() == f()[...]) # type: ignore 137 | 138 | 139 | def test_from_array( 140 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 141 | ) -> None: 142 | """Test from_array.""" 143 | x = CompressedArray(numpy.eye(10)) 144 | d: dask.array.core.Array = dask.array.core.from_array( 145 | x, chunks=(5, 5)) # type: ignore[arg-type] 146 | 147 | # pylint: disable=protected-access 148 | assert isinstance(d._meta, numpy.ndarray) 149 | # pylint: enable=protected-access 150 | assert isinstance(d.compute(), numpy.ndarray) 151 | assert numpy.allclose(d.compute(), x) 152 | 153 | 154 | def test_map_blocks( 155 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 156 | ) -> None: 157 | """Test map_blocks.""" 158 | x: dask.array.core.Array = dask.array.creation.eye( 159 | 10, chunks=5) # type: ignore[arg-type] 160 | y: dask.array.core.Array = x.map_blocks( 161 | CompressedArray) # type: ignore[arg-type] 162 | # pylint: disable=protected-access 163 | assert isinstance(y._meta, numpy.ndarray) 164 | # pylint: enable=protected-access 165 | assert numpy.allclose(y.compute(), x.compute()) 166 | 167 | 168 | def test_compressed_masked_array( 169 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 170 | ) -> None: 171 | """Test CompressedMaskedArray.""" 172 | x: dask.array.core.Array = dask.array.creation.eye( 173 | 10, chunks=5) # type: ignore[arg-type] 174 | y: dask.array.core.Array = 
x.map_blocks( 175 | CompressedArray, fill_value=0) # type: ignore[arg-type] 176 | # assert isinstance(y._meta, CompressedArray) 177 | assert isinstance(y[...].compute(), numpy.ma.MaskedArray) 178 | assert isinstance(y.compute(), numpy.ma.MaskedArray) 179 | assert y.mean().compute() == 1 180 | assert y.min().compute() == 1 181 | assert y.max().compute() == 1 182 | assert y.sum().compute() == 10 183 | assert y.std().compute() == 0 184 | assert (y * 2).mean().compute() == 2 185 | -------------------------------------------------------------------------------- /examples/ex_indexing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Indexing a Collection. 3 | ====================== 4 | 5 | In this example, we will see how to index a collection. 6 | """ 7 | from typing import Iterator, List, Optional, Tuple, Union 8 | import pathlib 9 | import pprint 10 | 11 | import dask.distributed 12 | import fsspec 13 | import numpy 14 | 15 | import zcollection 16 | import zcollection.indexing 17 | import zcollection.partitioning.tests.data 18 | 19 | # %% 20 | # Initialization of the environment 21 | # --------------------------------- 22 | fs = fsspec.filesystem('memory') 23 | cluster = dask.distributed.LocalCluster(processes=False) 24 | client = dask.distributed.Client(cluster) 25 | 26 | # %% 27 | # A collection can be indexed. This allows quick access to the data without 28 | # having to browse the entire dataset. 29 | # 30 | # Creating the test collection. 31 | # ----------------------------- 32 | # 33 | # For this latest example, we will index another data set. This one contains 34 | # measurements of a fictitious satellite on several half-orbits. 35 | zds: zcollection.Dataset = zcollection.Dataset.from_xarray( 36 | zcollection.partitioning.tests.data.create_test_sequence(5, 20, 10)) 37 | print(zds) 38 | 39 | # %% 40 | collection: zcollection.Collection = zcollection.create_collection( 41 | 'time', 42 | zds, 43 | zcollection.partitioning.Date(('time', ), 'M'), 44 | partition_base_dir='/one_other_collection', 45 | filesystem=fs) 46 | collection.insert(zds, merge_callable=zcollection.merging.merge_time_series) 47 | 48 | # %% 49 | # Here we have created a collection partitioned by month. 50 | pprint.pprint(fs.listdir('/one_other_collection/year=2000')) 51 | 52 | 53 | # %% 54 | # Class to implement 55 | # ------------------ 56 | # 57 | # The idea of the implementation is to calculate for each visited partition, the 58 | # slice of data that has a constant quantity. In our example, we will rely on 59 | # the cycle and pass number information. The first method we will implement is 60 | # the detection of these constant parts of two vectors containing the cycle and 61 | # pass number. 62 | def split_half_orbit( 63 | cycle_number: numpy.ndarray, 64 | pass_number: numpy.ndarray, 65 | ) -> Iterator[Tuple[int, int]]: 66 | """Calculate the indexes of the start and stop of each half-orbit. 67 | 68 | Args: 69 | pass_number: Pass numbers. 70 | Returns: 71 | Iterator of start and stop indexes. 
72 | """ 73 | assert pass_number.shape == cycle_number.shape 74 | pass_idx = numpy.where(numpy.roll(pass_number, 1) != pass_number)[0] 75 | cycle_idx = numpy.where(numpy.roll(cycle_number, 1) != cycle_number)[0] 76 | 77 | half_orbit = numpy.unique( 78 | numpy.concatenate( 79 | (pass_idx, cycle_idx, numpy.array([pass_number.size], 80 | dtype='int64')))) 81 | del pass_idx, cycle_idx 82 | 83 | yield from tuple(zip(half_orbit[:-1], half_orbit[1:])) 84 | 85 | 86 | # %% 87 | # Now we will compute these constant parts from a dataset contained in a 88 | # partition. 89 | def _half_orbit( 90 | zds: zcollection.Dataset, 91 | *args, 92 | dtype: numpy.dtype | None = None, 93 | **kwargs, 94 | ) -> numpy.ndarray: 95 | """Return the indexes of the start and stop of each half-orbit. 96 | 97 | Args: 98 | ds: Datasets stored in a partition to be indexed. 99 | Returns: 100 | Dictionary of start and stop indexes for each half-orbit. 101 | """ 102 | pass_number_varname = kwargs.pop('pass_number', 'pass_number') 103 | cycle_number_varname = kwargs.pop('cycle_number', 'cycle_number') 104 | pass_number = zds.variables[pass_number_varname].values 105 | cycle_number = zds.variables[cycle_number_varname].values 106 | 107 | generator = (( 108 | i0, 109 | i1, 110 | cycle_number[i0], 111 | pass_number[i0], 112 | ) for i0, i1 in split_half_orbit(cycle_number, pass_number)) 113 | 114 | return numpy.fromiter(generator, dtype) 115 | 116 | 117 | # %% 118 | # Finally, we implement our indexing class. The base class 119 | # (:py:class:`zcollection.indexing.Indexer`) 120 | # implements the index update and the associated queries. 121 | class HalfOrbitIndexer(zcollection.indexing.Indexer): 122 | """Index collection by half-orbit.""" 123 | #: Column name of the cycle number. 124 | CYCLE_NUMBER = 'cycle_number' 125 | 126 | #: Column name of the pass number. 127 | PASS_NUMBER = 'pass_number' 128 | 129 | def dtype(self, /, **kwargs) -> List[Tuple[str, str]]: 130 | """Return the columns of the index. 131 | 132 | Returns: 133 | A tuple of (name, type) pairs. 134 | """ 135 | return super().dtype() + [ 136 | (self.CYCLE_NUMBER, 'uint16'), 137 | (self.PASS_NUMBER, 'uint16'), 138 | ] 139 | 140 | @classmethod 141 | def create( 142 | cls, 143 | path: Union[pathlib.Path, str], 144 | zds: zcollection.Collection, 145 | filesystem: Optional[fsspec.AbstractFileSystem] = None, 146 | **kwargs, 147 | ) -> 'HalfOrbitIndexer': 148 | """Create a new index. 149 | 150 | Args: 151 | path: The path to the index. 152 | ds: The collection to be indexed. 153 | filesystem: The filesystem to use. 154 | Returns: 155 | The created index. 156 | """ 157 | return super()._create(path, 158 | zds, 159 | meta=dict(attribute=b'value'), 160 | filesystem=filesystem) # type: ignore 161 | 162 | def update( 163 | self, 164 | zds: zcollection.Collection, 165 | partition_size: Optional[int] = None, 166 | npartitions: Optional[int] = None, 167 | **kwargs, 168 | ) -> None: 169 | """Update the index. 170 | 171 | Args: 172 | ds: New data stored in the collection to be indexed. 173 | partition_size: The length of each bag partition. 174 | npartitions: The number of desired bag partitions. 175 | cycle_number: The name of the cycle number variable stored in the 176 | collection. Defaults to "cycle_number". 177 | pass_number: The name of the pass number variable stored in the 178 | collection. Defaults to "pass_number". 
179 | """ 180 | super()._update(zds, 181 | _half_orbit, 182 | partition_size, 183 | npartitions, 184 | dtype=self.dtype(), 185 | **kwargs) 186 | 187 | 188 | # %% 189 | # Using the index 190 | # --------------- 191 | # 192 | # Now we can create our index and fill it. 193 | indexer: HalfOrbitIndexer = HalfOrbitIndexer.create('/index.parquet', 194 | collection, 195 | filesystem=fs) 196 | indexer.update(collection) 197 | 198 | # The following command allows us to view the information stored in our index: 199 | # the first and last indexes of the partition associated with the registered 200 | # half-orbit number and the identifier of the indexed partition. 201 | indexer.table.to_pandas() 202 | 203 | # %% 204 | # This index can now be used to load a part of a collection. 205 | selection: zcollection.Dataset | None = collection.load( 206 | indexer=indexer.query(dict(pass_number=[1, 2])), 207 | delayed=False, 208 | ) 209 | assert selection is not None 210 | selection.to_xarray() 211 | 212 | # %% 213 | # Close the local cluster to avoid printing warning messages in the other 214 | # examples. 215 | client.close() 216 | cluster.close() 217 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/test_sequence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Test partitioning by sequence. 7 | ============================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Iterator 12 | import pickle 13 | 14 | import dask.array.core 15 | import numpy 16 | import pytest 17 | import xarray 18 | 19 | from zcollection.type_hints import ArrayLike 20 | 21 | from . import data 22 | from .. import Sequence, get_codecs 23 | from ... 
import dataset 24 | # pylint: disable=unused-import # Need to import for fixtures 25 | from ...tests.cluster import dask_client, dask_cluster 26 | 27 | # pylint: enable=unused-import # Need to import for fixtures 28 | 29 | 30 | def test_construction() -> None: 31 | """Test the sequence constructor.""" 32 | assert isinstance(Sequence(('a', 'b')), Sequence) 33 | assert len(Sequence(('a', 'b'))) == 2 34 | with pytest.raises(ValueError): 35 | Sequence(('a', 'b'), (0, )) # type: ignore 36 | with pytest.raises(ValueError): 37 | Sequence((), ()) 38 | with pytest.raises(ValueError): 39 | Sequence(('a', 'b'), dtype=('c', 'd')) 40 | with pytest.raises(ValueError): 41 | Sequence(('a', 'b'), dtype=('float32', 'int32')) 42 | with pytest.raises(TypeError): 43 | Sequence(('a', 'b'), dtype='int32') 44 | partitioning = Sequence(('a', 'b')) 45 | partition_keys = partitioning.parse('a=1/b=2') 46 | assert partitioning.encode(partition_keys) == (1, 2) 47 | with pytest.raises(ValueError): 48 | partitioning.encode((('A', 1), ('b', 2))) 49 | assert partitioning.decode((1, 2)) == (('a', 1), ('b', 2)) 50 | assert partition_keys == (('a', 1), ('b', 2)) 51 | with pytest.raises(ValueError): 52 | partitioning.parse('a=1/b=2/c=3') 53 | with pytest.raises(ValueError): 54 | partitioning.parse('field=1') 55 | 56 | 57 | @pytest.mark.parametrize('delayed', [False, True]) 58 | def test_split_dataset( 59 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 60 | delayed: bool, 61 | ) -> None: 62 | """Test the split_dataset method.""" 63 | repeatability = 5 64 | xds = data.create_test_sequence(repeatability, 20, 10) 65 | partitioning = Sequence(('cycle_number', 'pass_number')) 66 | 67 | cycle_number = 1 68 | pass_number = 1 69 | 70 | assert partitioning.dtype() == ( 71 | ('cycle_number', 'int64'), 72 | ('pass_number', 'int64'), 73 | ) 74 | 75 | # Build the test dataset 76 | zds = dataset.Dataset.from_xarray(xds) 77 | if not delayed: 78 | zds = zds.compute() 79 | 80 | iterator = partitioning.split_dataset(zds, 'num_lines') 81 | assert isinstance(iterator, Iterator) 82 | 83 | for partition, indexer in iterator: 84 | subset = zds.isel(indexer) 85 | expected = (f'cycle_number={cycle_number}', 86 | f'pass_number={pass_number}') 87 | assert expected == partition 88 | assert numpy.all( 89 | xds.where((xds.cycle_number == cycle_number) 90 | & (xds.pass_number == pass_number), 91 | drop=True).observation == 92 | subset.variables['observation'].array) 93 | 94 | partition_keys = partitioning.parse('/'.join(partition)) 95 | assert partition_keys == (('cycle_number', cycle_number), 96 | ('pass_number', pass_number)) 97 | assert partitioning.decode( 98 | partitioning.encode(partition_keys)) == partition_keys 99 | assert partitioning.join(partition_keys, '/') == '/'.join(partition) 100 | 101 | pass_number += 1 102 | if pass_number > repeatability: 103 | pass_number = 1 104 | cycle_number += 1 105 | 106 | xds['cycle_number'] = xarray.DataArray(numpy.array( 107 | [xds['cycle_number'].values] * 2).T, 108 | dims=('num_lines', 'nump_pixels')) 109 | zds = dataset.Dataset.from_xarray(xds) 110 | if not delayed: 111 | zds = zds.compute() 112 | with pytest.raises(ValueError): 113 | list(partitioning.split_dataset(zds, 'num_lines')) 114 | 115 | 116 | def test_config() -> None: 117 | """Test the configuration of the Sequence class.""" 118 | partitioning = Sequence(('cycle_number', 'pass_number')) 119 | config = partitioning.get_config() 120 | partitioning = get_codecs(config) # type: ignore[assignment] 121 | assert 
isinstance(partitioning, Sequence) 122 | 123 | 124 | def test_pickle() -> None: 125 | """Test the pickling of the Date class.""" 126 | partitioning = Sequence(('cycle_number', 'pass_number')) 127 | other = pickle.loads(pickle.dumps(partitioning)) 128 | assert isinstance(other, Sequence) 129 | assert other.variables == ('cycle_number', 'pass_number') 130 | 131 | 132 | # pylint: disable=protected-access 133 | @pytest.mark.parametrize('delayed', [False, True]) 134 | def test_multiple_sequence( 135 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 136 | delayed: bool, 137 | ) -> None: 138 | """Test the creation of a sequence with multiple variables.""" 139 | arrays = { 140 | '_a': numpy.array([], dtype='i8'), 141 | '_b': numpy.array([], dtype='i8'), 142 | '_c': numpy.array([], dtype='i8') 143 | } 144 | for _a in range(5): 145 | for _b in range(5): 146 | arrays['_a'] = numpy.concatenate( 147 | (arrays['_a'], numpy.full((5, ), _a, dtype='i8'))) 148 | arrays['_b'] = numpy.concatenate( 149 | (arrays['_b'], numpy.full((5, ), _b, dtype='i8'))) 150 | arrays['_c'] = numpy.concatenate( 151 | (arrays['_c'], numpy.arange(5, dtype='i8'))) 152 | partitioning = Sequence(('_a', '_b', '_c')) 153 | chunks: str = (10, ) # type: ignore[assignment] 154 | if delayed: 155 | variables: dict[str, ArrayLike] = { # type: ignore[assignment] 156 | '_a': dask.array.core.from_array(arrays['_a'], chunks=chunks), 157 | '_b': dask.array.core.from_array(arrays['_b'], chunks=chunks), 158 | '_c': dask.array.core.from_array(arrays['_c'], chunks=chunks) 159 | } 160 | else: 161 | variables = arrays # type: ignore[assignment] 162 | _a = 0 163 | _b = 0 164 | _c = 0 165 | for idx, item in enumerate( 166 | partitioning._split(variables)): # type: ignore[arg-type] 167 | assert item[0] == (('_a', _a), ('_b', _b), ('_c', _c)) 168 | _c += 1 169 | if _c > 4: 170 | _c = 0 171 | _b += 1 172 | if _b > 4: 173 | _b = 0 174 | _a += 1 175 | assert item[1] == slice(idx, idx + 1) 176 | 177 | numpy.random.shuffle(arrays['_c']) 178 | variables['_c'] = dask.array.core.from_array( # type: ignore[assignment] 179 | arrays['_c'], chunks=chunks) if delayed else arrays['_c'] 180 | 181 | with pytest.raises(ValueError): 182 | list(partitioning._split(variables)) # type: ignore[arg-type] 183 | 184 | del variables['_c'] 185 | del variables['_b'] 186 | partitioning = Sequence(('_a', '_b', '_c')) 187 | 188 | _a = 0 189 | for idx, item in enumerate( 190 | partitioning._split(variables)): # type: ignore[arg-type] 191 | assert item[0] == (('_a', _a), ) 192 | _a += 1 193 | assert item[1] == slice(idx * 25, idx * 25 + 25) 194 | # pylint: enable=protected-access 195 | 196 | 197 | def test_values_must_be_integer() -> None: 198 | """Test that the values must be integer.""" 199 | values = numpy.arange(0, 100, dtype='f8') 200 | partitioning = Sequence(('values', )) 201 | # pylint: disable=protected-access 202 | with pytest.raises(TypeError): 203 | list(partitioning._split({'values': values})) 204 | # pylint: enable=protected-access 205 | -------------------------------------------------------------------------------- /zcollection/merging/tests/test_time_series.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """Test the time series merging.""" 6 | import copy 7 | 8 | import numpy 9 | 10 | from .. 
import time_series 11 | from ...tests import data 12 | # pylint: disable=unused-import # Need to import for fixtures 13 | from ...tests.cluster import dask_client, dask_cluster 14 | from ...type_hints import NDArray 15 | 16 | # pylint: enable=unused-import # Need to import for fixtures 17 | 18 | 19 | def test_merge_disjoint( 20 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 21 | ) -> None: 22 | """Test the update of two disjoint time series.""" 23 | generator = data.create_test_dataset() 24 | zds0 = next(generator) 25 | zds1 = next(generator) 26 | 27 | zds = time_series.merge_time_series(zds1, zds0, 'time', 'num_lines') 28 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 29 | zds0.variables['time'].values, zds1.variables['time'].values))) 30 | 31 | zds = time_series.merge_time_series(zds0, zds1, 'time', 'num_lines') 32 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 33 | zds0.variables['time'].values, zds1.variables['time'].values))) 34 | 35 | zds = time_series.merge_time_series(zds0, zds0, 'time', 'num_lines') 36 | assert numpy.all( 37 | zds.variables['time'].values == zds0.variables['time'].values) 38 | 39 | 40 | def test_merge_intersection( 41 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 42 | ) -> None: 43 | """Test the update of two intersecting time series.""" 44 | generator = data.create_test_dataset() 45 | zds0 = next(generator) 46 | # ds0.variables["time"].values => numpy.array([ 47 | # "2000-01-01T00:00:00.000000", "2000-01-04T00:00:00.000000", 48 | # "2000-01-07T00:00:00.000000", "2000-01-10T00:00:00.000000", 49 | # "2000-01-13T00:00:00.000000", "2000-01-16T00:00:00.000000" 50 | # ]) 51 | zds1 = next(generator) 52 | # ds1.variables["time"].values => numpy.array([ 53 | # "2000-01-19T00:00:00.000000", "2000-01-22T00:00:00.000000", 54 | # "2000-01-25T00:00:00.000000", "2000-01-28T00:00:00.000000", 55 | # "2000-01-31T00:00:00.000000"]) 56 | 57 | existing_zds = zds1 58 | new_zds = copy.deepcopy(zds0) 59 | new_zds.variables['time'] = zds0.variables['time'].duplicate( 60 | zds0.variables['time'].values + numpy.timedelta64(9, 'D')) 61 | 62 | zds = time_series.merge_time_series(existing_zds, new_zds, 'time', 63 | 'num_lines') 64 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 65 | zds0.variables['time'].values[3:], zds1.variables['time'].values[:]))) 66 | 67 | existing_zds = zds0 68 | new_zds = copy.deepcopy(zds1) 69 | new_zds.variables['time'] = zds1.variables['time'].duplicate( 70 | zds1.variables['time'].values - numpy.timedelta64(9, 'D')) 71 | zds = time_series.merge_time_series(existing_zds, new_zds, 'time', 72 | 'num_lines') 73 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 74 | zds0.variables['time'].values[:], zds1.variables['time'].values[:2]))) 75 | 76 | existing_zds = zds0 77 | new_zds = zds0.isel({'num_lines': slice(1, -1)}) 78 | new_zds.variables['var1'] = new_zds.variables['var1'].duplicate( 79 | new_zds.variables['var1'].values + 100) 80 | zds = time_series.merge_time_series(existing_zds, new_zds, 'time', 81 | 'num_lines') 82 | assert numpy.all(zds.variables['var1'].values == numpy.concatenate(( 83 | zds0.variables['var1'].values[:1], 84 | zds0.variables['var1'].values[1:-1] + 100, 85 | zds0.variables['var1'].values[-1:]))) 86 | 87 | 88 | def test_intersection_with_tolerance() -> None: 89 | """Test the update of two intersecting time series with a data gap.""" 90 | axis: NDArray = numpy.arange(numpy.datetime64('2000-01-01', 'ns'), 91 | 
numpy.datetime64('2000-01-01T23:59:59', 'ns'), 92 | numpy.timedelta64(1, 's')) 93 | measures = numpy.vstack((numpy.arange(axis.size), ) * 25).T 94 | zds0 = data.make_dataset(axis, measures, delayed=False) 95 | 96 | dates: NDArray = numpy.arange( 97 | numpy.datetime64('2000-01-01T10:00:00', 'ns'), 98 | numpy.datetime64('2000-01-01T14:59:59', 'ns'), 99 | numpy.timedelta64(1, 's')) 100 | 101 | # Create a gap in the data by removing the data between 11:00 and 13:00 102 | mask = (dates > numpy.datetime64('2000-01-01T11:00:00', 'ns')) & ( 103 | dates < numpy.datetime64('2000-01-01T13:00:00', 'ns')) 104 | dates = dates[~mask] 105 | measures = numpy.vstack((numpy.full(dates.size, -1), ) * 25).T 106 | zds1 = data.make_dataset(dates, measures, delayed=False) 107 | 108 | # Merge the two datasets with a tolerance of 1 minute to keep the 109 | # data gap in the existing dataset. 110 | zds_gap_filled = time_series.merge_time_series(zds0, 111 | zds1, 112 | 'time', 113 | 'num_lines', 114 | tolerance=numpy.timedelta64( 115 | 1, 'm')) 116 | # Merge the two datasets without a tolerance. The data gap is 117 | # kept and stored in the new dataset. 118 | zds_with_gap = time_series.merge_time_series( 119 | zds0, 120 | zds1, 121 | 'time', 122 | 'num_lines', 123 | ) 124 | assert zds_with_gap.time.size == zds0.time.size - mask.sum() 125 | 126 | mask = (axis > numpy.datetime64('2000-01-01T11:00:00', 'ns')) & ( 127 | axis < numpy.datetime64('2000-01-01T13:00:00', 'ns')) 128 | assert numpy.all(zds_gap_filled.variables['time'].values == 129 | zds0.variables['time'].values) 130 | assert numpy.all((zds_gap_filled.variables['var1'].values[:, 0] < 0 131 | ).sum() == zds1.dimensions['num_lines']) 132 | 133 | # Create gaps in the data by removing the data between 11:00 to 13:00 134 | # 15:00 to 17:00 and 19:00 to 21:00 135 | mask = (axis > numpy.datetime64('2000-01-01T11:00:00', 'ns')) & ( 136 | axis < numpy.datetime64('2000-01-01T13:00:00', 'ns')) 137 | mask |= (axis > numpy.datetime64('2000-01-01T15:00:00', 'ns')) & ( 138 | axis < numpy.datetime64('2000-01-01T17:00:00', 'ns')) 139 | mask |= (axis > numpy.datetime64('2000-01-01T19:00:00', 'ns')) & ( 140 | axis < numpy.datetime64('2000-01-01T21:00:00', 'ns')) 141 | 142 | dates = axis[~mask] 143 | 144 | measures = numpy.vstack((numpy.full(dates.size, -1), ) * 25).T 145 | zds1 = data.make_dataset(dates, measures, delayed=False) 146 | 147 | # Merge the two datasets with a tolerance of 1 minute to keep the 148 | # data gaps in the existing dataset. 149 | zds_gap_filled = time_series.merge_time_series(zds0, 150 | zds1, 151 | 'time', 152 | 'num_lines', 153 | tolerance=numpy.timedelta64( 154 | 1, 'm')) 155 | # Merge the two datasets without a tolerance. The data gaps are 156 | # kept and stored in the new dataset. 157 | zds_with_gap = time_series.merge_time_series( 158 | zds0, 159 | zds1, 160 | 'time', 161 | 'num_lines', 162 | ) 163 | 164 | assert numpy.all(zds_gap_filled.variables['time'].values == 165 | zds0.variables['time'].values) 166 | assert zds_with_gap.time.size == zds0.time.size - mask.sum() 167 | assert numpy.all((zds_gap_filled.variables['var1'].values[:, 0] < 0 168 | ).sum() == zds1.dimensions['num_lines']) 169 | -------------------------------------------------------------------------------- /zcollection/partitioning/date.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. 
Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Partitioning by date 7 | ==================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, ClassVar, Iterator, Sequence 12 | import datetime 13 | 14 | import dask.array.core 15 | import numpy 16 | 17 | from . import abc 18 | from ..type_hints import ArrayLike, NDArray 19 | 20 | #: Numpy time units 21 | RESOLUTION = ('Y', 'M', 'D', 'h', 'm', 's') 22 | 23 | #: Numpy time unit meanings 24 | UNITS = ('year', 'month', 'day', 'hour', 'minute', 'second') 25 | 26 | #: Data type for time units 27 | DATA_TYPES = ('uint16', 'uint8', 'uint8', 'uint8', 'uint8', 'uint8') 28 | 29 | #: Time separation units 30 | SEPARATORS: dict[str, str] = { 31 | 'year': '-', 32 | 'month': '-', 33 | 'day': 'T', 34 | 'hour': ':', 35 | 'minute': ':', 36 | 'second': '.' 37 | } 38 | 39 | 40 | def _unique(arr: ArrayLike, is_delayed: bool) -> tuple[NDArray, NDArray]: 41 | """Return unique elements and their indices. 42 | 43 | Args: 44 | arr: Array of elements. 45 | is_delayed: If True, the array is delayed. 46 | Returns: 47 | Tuple of unique elements and their indices. 48 | Raises: 49 | ValueError: If the array is not monotonic. 50 | """ 51 | index: NDArray 52 | indices: NDArray 53 | 54 | if is_delayed: 55 | index, indices = abc.unique(arr) # type: ignore[arg-type] 56 | # We don't use here the function `numpy.diff` but `abc.difference` for 57 | # optimization purposes. 58 | if not numpy.all( 59 | abc.difference(index.view(numpy.int64)) >= 0): # type: ignore 60 | raise ValueError('index is not monotonic') 61 | return index, indices 62 | return abc.unique_and_check_monotony(arr) 63 | 64 | 65 | class Date(abc.Partitioning): 66 | """Initialize a partitioning scheme based on dates. 67 | 68 | Args: 69 | variables: A list of strings representing the variables to be used for 70 | partitioning. 71 | resolution: Time resolution of the partitioning. Must be in 72 | :data:`RESOLUTION`. 73 | 74 | Raises: 75 | ValueError: If the resolution is not in the list of supported 76 | resolutions or if the partitioning is not performed on a one 77 | dimensional variable. 78 | 79 | Example: 80 | >>> partitioning = Date(variables=("time", ), resolution="Y") 81 | """ 82 | __slots__ = ('_attrs', '_index', 'resolution') 83 | 84 | #: The ID of the partitioning scheme 85 | ID: ClassVar[str] = 'Date' 86 | 87 | def __init__(self, variables: Sequence[str], resolution: str) -> None: 88 | if len(variables) != 1: 89 | raise ValueError( 90 | 'Partitioning on dates is performed on a single variable.') 91 | if resolution not in RESOLUTION: 92 | raise ValueError('resolution must be in: ' + ', '.join(RESOLUTION)) 93 | index: int = RESOLUTION.index(resolution) + 1 94 | 95 | #: The time resolution of the partitioning 96 | self.resolution: str = resolution 97 | #: The time parts used for the partitioning 98 | self._attrs: tuple[str, ...] = UNITS[:index + 1] 99 | #: The indices of the time parts used for the partitioning 100 | self._index = tuple(range(index)) 101 | super().__init__(variables, 102 | tuple(DATA_TYPES[ix] for ix in self._index)) 103 | 104 | def _keys(self) -> Sequence[str]: 105 | """Return the keys of the partitioning scheme.""" 106 | return tuple(UNITS[ix] for ix in self._index) 107 | 108 | # pylint: disable=arguments-differ 109 | # False positive: the base method is static. 
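# For example, with resolution 'D', numpy.datetime64('2020-01-02') is mapped to the partition ('year=2020', 'month=01', 'day=02').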
110 | def _partition( # type: ignore[override] 111 | self, 112 | selection: tuple[tuple[str, Any], ...], 113 | ) -> tuple[str, ...]: 114 | """Return the partitioning scheme for the given selection.""" 115 | datetime64: NDArray = selection[0][1] 116 | py_datetime: datetime.datetime = datetime64.astype('M8[s]').item() 117 | return tuple(UNITS[ix] + '=' + 118 | f'{getattr(py_datetime, self._attrs[ix]):02d}' 119 | for ix in self._index) 120 | # pylint: enable=arguments-differ 121 | 122 | def _split( 123 | self, 124 | variables: dict[str, ArrayLike], 125 | ) -> Iterator[abc.Partition]: 126 | """Return the partitioning scheme for the given variables.""" 127 | index: NDArray 128 | indices: NDArray 129 | name: str 130 | values: ArrayLike 131 | 132 | # Determine if the variables are handled by Dask. 133 | is_delayed: bool = any( 134 | isinstance(value, dask.array.core.Array) 135 | for value in variables.values()) 136 | name, values = tuple(variables.items())[0] 137 | 138 | if not numpy.issubdtype(values.dtype, numpy.dtype('datetime64')): 139 | raise TypeError('values must be a datetime64 array') 140 | 141 | index, indices = _unique( 142 | values.astype(f'datetime64[{self.resolution}]'), is_delayed) 143 | indices = abc.concatenate_item(indices, values.size) 144 | 145 | return ((((name, date), ), slice(start, indices[ix + 1], None)) 146 | for date, (ix, start) in zip(index, enumerate(indices[:-1]))) 147 | 148 | @staticmethod 149 | def _stringify(partition: tuple[tuple[str, int], ...]) -> str: 150 | """Return a string representation of the partitioning scheme.""" 151 | string = ''.join(f'{value:02d}' + SEPARATORS[item] 152 | for item, value in partition) 153 | if string[-1] in SEPARATORS.values(): 154 | string = string[:-1] 155 | return string 156 | 157 | @staticmethod 158 | def join(partition_scheme: tuple[tuple[str, int], ...], sep: str) -> str: 159 | """Join a partitioning scheme. 160 | 161 | Args: 162 | partition_scheme: The partitioning scheme to be joined. 163 | sep: The separator to be used. 164 | 165 | Returns: 166 | The joined partitioning scheme. 167 | 168 | Example: 169 | >>> partitioning = Date(variables=("time", ), resolution="D") 170 | >>> partitioning.join((("year", 2020), ("month", 1), ("day", 1)), 171 | ... "/") 172 | 'year=2020/month=01/day=01' 173 | """ 174 | return sep.join(f'{k}={v:02d}' for k, v in partition_scheme) 175 | 176 | def encode( 177 | self, 178 | partition: tuple[tuple[str, int], ...], 179 | ) -> tuple[Any, ...]: 180 | """Encode a partitioning scheme. 181 | 182 | Args: 183 | partition: The partitioning scheme to be encoded. 184 | 185 | Returns: 186 | The encoded partitioning scheme. 187 | 188 | Example: 189 | >>> partitioning = Date(variables=("time", ), resolution="D") 190 | >>> fields = partitioning.parse("year=2020/month=01/day=01") 191 | >>> fields 192 | (("year", 2020), ("month", 1), ("day", 1)) 193 | >>> partitioning.encode(fields) 194 | (numpy.datetime64('2020-01-01'),) 195 | """ 196 | return (numpy.datetime64(self._stringify(partition)), ) 197 | 198 | def decode( 199 | self, 200 | values: tuple[Any, ...], 201 | ) -> tuple[tuple[str, int], ...]: 202 | """Decode a partitioning scheme. 203 | 204 | Args: 205 | values: The partitioning scheme to be decoded. 206 | 207 | Returns: 208 | The decoded partitioning scheme. 
209 | 210 | Example: 211 | >>> partitioning = Date(variables=("time", ), resolution="D") 212 | >>> partitioning.decode((numpy.datetime64('2020-01-01'), )) 213 | (("year", 2020), ("month", 1), ("day", 1)) 214 | """ 215 | datetime64: NDArray = values[0] 216 | py_datetime: datetime.datetime = datetime64.astype('M8[s]').item() 217 | return tuple((UNITS[ix], getattr(py_datetime, self._attrs[ix])) 218 | for ix in self._index) 219 | -------------------------------------------------------------------------------- /zcollection/variable/array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | In memory variable arrays. 7 | ========================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Sequence 12 | 13 | import dask.array.core 14 | import dask.array.ma 15 | import numcodecs.abc 16 | import numpy 17 | import zarr 18 | 19 | from ..meta import Attribute 20 | from ..type_hints import ArrayLike, NDArray, NDMaskedArray 21 | from .abc import Variable, concat, new_variable, not_equal 22 | 23 | 24 | def _as_numpy_array( 25 | arr: Any, 26 | *, 27 | fill_value: Any | None = None, 28 | ) -> tuple[NDArray, Any]: 29 | """Convert an array-like object to a numpy array. 30 | 31 | Args: 32 | arr: An array-like object. 33 | fill_value: The fill value. 34 | 35 | Returns: 36 | If the data provided is a masked array, the functions return the array 37 | with masked data replaced by its fill value and the fill value of the 38 | offered masked array. Otherwise, the provided array and fill value. 39 | """ 40 | result: NDArray = numpy.asanyarray(arr) 41 | if isinstance(result, numpy.ma.MaskedArray): 42 | if fill_value is not None and not_equal(fill_value, result.fill_value): 43 | raise ValueError( 44 | f'The fill value {fill_value!r} does not match the fill value ' 45 | f'{result.fill_value!r} of the array.') 46 | return numpy.ma.filled(result, result.fill_value), result.fill_value 47 | return result, fill_value 48 | 49 | 50 | class Array(Variable): 51 | """Access to the chunked data using numpy arrays. 52 | 53 | Args: 54 | name: Name of the variable 55 | data: Variable data 56 | dimensions: Variable dimensions 57 | attrs: Variable attributes 58 | compressor: Compression codec 59 | fill_value: Value to use for uninitialized values 60 | filters: Filters to apply before writing data to disk 61 | """ 62 | 63 | def __init__(self, 64 | name: str, 65 | data: ArrayLike[Any], 66 | dimensions: Sequence[str], 67 | *, 68 | attrs: Sequence[Attribute] | None = None, 69 | compressor: numcodecs.abc.Codec | None = None, 70 | fill_value: Any | None = None, 71 | filters: Sequence[numcodecs.abc.Codec] | None = None) -> None: 72 | array: NDArray 73 | array, fill_value = _as_numpy_array(data, fill_value=fill_value) 74 | super().__init__( 75 | name, 76 | array, 77 | dimensions, 78 | attrs=attrs, 79 | compressor=compressor, 80 | fill_value=fill_value, 81 | filters=filters, 82 | ) 83 | 84 | @property 85 | def data(self) -> dask.array.core.Array: 86 | """Return the numpy array wrapped in a dask array. If the variable has 87 | a fill value, the result is a masked array where masked values are 88 | equal to the fill value. 89 | 90 | Returns: 91 | The dask array 92 | 93 | .. 
seealso:: 94 | 95 | :meth:`Variable.array` 96 | """ 97 | if self.fill_value is None: 98 | return dask.array.core.from_array(self.array) 99 | return dask.array.ma.masked_equal(self.array, self.fill_value) 100 | 101 | @property 102 | def values(self) -> NDArray | NDMaskedArray: 103 | """Return the variable data as a numpy array. 104 | 105 | .. note:: 106 | 107 | If the variable has a fill value, the result is a masked array where 108 | masked values are equal to the fill value. 109 | 110 | Returns: 111 | The variable data 112 | """ 113 | return self.array if self.fill_value is None else numpy.ma.masked_equal( 114 | self.array, self.fill_value) 115 | 116 | @values.setter 117 | def values(self, data: Any) -> None: 118 | """Defines the underlying numpy array. If the data provided is a masked 119 | array, it's converted to an array, where the masked values are replaced 120 | by its fill value, and its fill value becomes the new fill value of 121 | this instance. Otherwise, the underlying array is defined as the new 122 | data and the fill value is set to None. 123 | 124 | Args: 125 | data: The new data to use 126 | 127 | Raises: 128 | ValueError: If the shape of the data does not match the shape of 129 | the stored data. 130 | """ 131 | if len(data.shape) != len(self.dimensions): 132 | raise ValueError('data shape does not match variable dimensions') 133 | self.array, self.fill_value = _as_numpy_array( 134 | data, fill_value=self.fill_value) 135 | 136 | def persist(self, **_) -> Array: 137 | """Persist the variable data into memory. 138 | 139 | Returns: 140 | The variable 141 | """ 142 | return self 143 | 144 | def compute(self, **_) -> NDArray | NDMaskedArray: 145 | """Return the variable data as a numpy array. 146 | 147 | .. note:: 148 | 149 | If the variable has a fill value, the result is a masked array where 150 | masked values are equal to the fill value. 151 | """ 152 | return self.values 153 | 154 | def fill(self) -> Array: 155 | """Fill the variable with the fill value. If the variable has no fill 156 | value, this method does nothing. 157 | 158 | Returns: 159 | The variable. 160 | """ 161 | if self.fill_value is not None: 162 | self.array = numpy.full_like(self.array, self.fill_value) 163 | return self 164 | 165 | @classmethod 166 | def from_zarr(cls, array: zarr.Array, name: str, dimension: str, 167 | **kwargs) -> Array: 168 | """Create a new variable from a zarr array. 169 | 170 | Args: 171 | array: The zarr array 172 | name: Name of the variable 173 | dimension: Name of the attribute that defines the dimensions of the 174 | variable 175 | **kwargs: Additional arguments. These arguments are ignored, but 176 | they are accepted to be compatible with the base class. 177 | 178 | Returns: 179 | The variable 180 | """ 181 | attrs = tuple( 182 | Attribute(k, v) for k, v in array.attrs.items() if k != dimension) 183 | return new_variable(cls, 184 | name=name, 185 | array=array[...], 186 | dimensions=array.attrs[dimension], 187 | attrs=attrs, 188 | compressor=array.compressor, 189 | fill_value=array.fill_value, 190 | filters=tuple(array.filters or ())) 191 | 192 | def concat(self, other: Array | Sequence[Array], dim: str) -> Array: 193 | """Concatenate this variable with another variable or a list of 194 | variables along a dimension. 195 | 196 | Args: 197 | other: Variable or list of variables to concatenate with this 198 | variable. 199 | dim: Dimension to concatenate along. 200 | 201 | Returns: 202 | New variable. 
203 | 204 | Raises: 205 | ValueError: if the variables provided is an empty sequence or if 206 | any item in the sequence is not an instance of :class:`Array`. 207 | """ 208 | return concat(self, other, numpy.concatenate, dim) 209 | 210 | def __getitem__(self, key: Any) -> Any: 211 | """Get a slice of the variable. 212 | 213 | Args: 214 | key: Slice or index to use. 215 | Returns: 216 | The variable slice. 217 | """ 218 | return (self.array[key] if self.fill_value is None else 219 | numpy.ma.masked_equal(self.array[key], self.fill_value)) 220 | 221 | def rechunk(self, **_) -> Array: 222 | """Rechunk the variable. 223 | 224 | Returns: 225 | The variable. 226 | """ 227 | return self 228 | -------------------------------------------------------------------------------- /zcollection/indexing/tests/test_abc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """Test the base class for indexing.""" 6 | from __future__ import annotations 7 | 8 | from typing import Iterator 9 | import pathlib 10 | 11 | import fsspec 12 | import numpy 13 | import pyarrow 14 | import pytest 15 | 16 | from .. import abc 17 | from ... import collection, convenience, dataset, partitioning 18 | from ...partitioning.tests import data 19 | # pylint: disable=unused-import # Need to import for fixtures 20 | from ...tests.cluster import dask_client, dask_cluster 21 | from ...tests.fs import local_fs 22 | # pylint: enable=unused-import 23 | from ...type_hints import NDArray 24 | 25 | 26 | def split_half_orbit( 27 | cycle_number: numpy.ndarray, 28 | pass_number: numpy.ndarray, 29 | ) -> Iterator[tuple[int, int]]: 30 | """Calculate the indexes of the start and stop of each half-orbit. 31 | 32 | Args: 33 | pass_number: Pass numbers. 34 | 35 | Returns: 36 | Iterator of start and stop indexes. 37 | """ 38 | assert pass_number.shape == cycle_number.shape 39 | pass_idx = numpy.where(numpy.roll(pass_number, 1) != pass_number)[0] 40 | cycle_idx = numpy.where(numpy.roll(cycle_number, 1) != cycle_number)[0] 41 | 42 | half_orbit = numpy.unique( 43 | numpy.concatenate( 44 | (pass_idx, cycle_idx, numpy.array([pass_number.size], 45 | dtype='int64')))) 46 | del pass_idx, cycle_idx 47 | 48 | yield from tuple(zip(half_orbit[:-1], half_orbit[1:])) 49 | 50 | 51 | # pylint: disable=unused-argument,invalid-name 52 | # The signature of the function must follow the signature of 53 | # zcollection.PartitionCallable 54 | def _half_orbit( 55 | zds: dataset.Dataset, 56 | *args, 57 | dtype: numpy.dtype | None = None, 58 | **kwargs, 59 | ) -> NDArray: 60 | """Return the indexes of the start and stop of each half-orbit. 61 | 62 | Args: 63 | zds: Datasets stored in a partition to be indexed. 64 | 65 | Returns: 66 | Dictionary of start and stop indexes for each half-orbit. 
67 | """ 68 | pass_number_varname = kwargs.pop('pass_number', 'pass_number') 69 | cycle_number_varname = kwargs.pop('cycle_number', 'cycle_number') 70 | pass_number = zds.variables[pass_number_varname].values 71 | cycle_number = zds.variables[cycle_number_varname].values 72 | 73 | generator = (( 74 | i0, 75 | i1, 76 | cycle_number[i0], 77 | pass_number[i0], 78 | ) for i0, i1 in split_half_orbit(cycle_number, pass_number)) 79 | 80 | return numpy.fromiter( # type: ignore 81 | generator, dtype) 82 | 83 | 84 | class HalfOrbitIndexer(abc.Indexer): 85 | """Index SWOT collection by half-orbit.""" 86 | #: Column name of the cycle number. 87 | CYCLE_NUMBER = 'cycle_number' 88 | 89 | #: Column name of the pass number. 90 | PASS_NUMBER = 'pass_number' 91 | 92 | def dtype(self, /, **kwargs) -> list[tuple[str, str]]: 93 | """Return the columns of the index. 94 | 95 | Returns: 96 | A tuple of (name, type) pairs. 97 | """ 98 | return super().dtype() + [ 99 | (self.CYCLE_NUMBER, 'uint16'), 100 | (self.PASS_NUMBER, 'uint16'), 101 | ] 102 | 103 | @classmethod 104 | def create( 105 | cls, 106 | path: pathlib.Path | str, 107 | zds: collection.Collection, 108 | *, 109 | filesystem: fsspec.AbstractFileSystem | None = None, 110 | **kwargs, 111 | ) -> HalfOrbitIndexer: 112 | """Create a new index. 113 | 114 | Args: 115 | path: The path to the index. 116 | zds: The collection to be indexed. 117 | filesystem: The filesystem to use. 118 | 119 | Returns: 120 | The created index. 121 | """ 122 | return super()._create(path, 123 | zds, 124 | meta={'attribute': b'value'}, 125 | filesystem=filesystem) # type: ignore 126 | 127 | def update( 128 | self, 129 | zds: collection.Collection, 130 | *, 131 | partition_size: int | None = None, 132 | npartitions: int | None = None, 133 | **kwargs, 134 | ) -> None: 135 | """Update the index. 136 | 137 | Args: 138 | zds: New data stored in the collection to be indexed. 139 | partition_size: The length of each bag partition. 140 | npartitions: The number of desired bag partitions. 141 | cycle_number: The name of the cycle number variable stored in the 142 | collection. Defaults to "cycle_number". 143 | pass_number: The name of the pass number variable stored in the 144 | collection. Defaults to "pass_number". 
145 | """ 146 | super()._update(zds, 147 | _half_orbit, 148 | partition_size, 149 | npartitions, 150 | dtype=self.dtype(), 151 | **kwargs) 152 | 153 | 154 | def test_indexer( 155 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 156 | local_fs, # pylint: disable=redefined-outer-name 157 | ): 158 | """Test the base class of the indexer.""" 159 | ds = dataset.Dataset.from_xarray(data.create_test_sequence(5, 20, 10)) 160 | 161 | zcollection = convenience.create_collection( 162 | 'time', 163 | ds, 164 | partitioning.Date(('time', ), 'M'), 165 | partition_base_dir=str(local_fs.collection), 166 | filesystem=local_fs.fs) 167 | zcollection.insert(ds, merge_callable=collection.merging.merge_time_series) 168 | 169 | indexer = HalfOrbitIndexer.create(str( 170 | local_fs.collection.joinpath('index.parquet')), 171 | zcollection, 172 | filesystem=local_fs.fs) 173 | 174 | # Index not yet created 175 | with pytest.raises(ValueError): 176 | _ = indexer.table 177 | 178 | assert indexer.dtype() == [('start', 'int64'), ('stop', 'int64'), 179 | ('cycle_number', 'uint16'), 180 | ('pass_number', 'uint16')] 181 | indexer.update(zcollection) 182 | assert isinstance(indexer.table, pyarrow.Table) 183 | 184 | selection = zcollection.load(indexer=indexer.query({'cycle_number': 2})) 185 | assert selection is not None 186 | assert set(selection.variables['cycle_number'].values) == {2} 187 | 188 | with pytest.raises(ValueError): 189 | indexer.query({'cycle_number': 3}, logical_op='X') 190 | 191 | with pytest.raises(ValueError): 192 | indexer.query({'X': 3}) 193 | 194 | # Updating the index should not change the indexer. 195 | indexer.update(zcollection) 196 | other = zcollection.load(indexer=indexer.query({'cycle_number': 2})) 197 | assert other is not None 198 | assert numpy.all( 199 | other['observation'].values == selection['observation'].values) 200 | 201 | selection = zcollection.load( 202 | indexer=indexer.query({'cycle_number': [2, 4]})) 203 | assert selection is not None 204 | assert set(selection.variables['cycle_number'].values) == {2, 4} 205 | 206 | selection = zcollection.load(indexer=indexer.query({ 207 | 'cycle_number': [2, 4], 208 | 'pass_number': 1 209 | })) 210 | assert selection is not None 211 | assert set(selection.variables['cycle_number'].values) == {2, 4} 212 | assert set(selection.variables['pass_number'].values) == {1} 213 | 214 | selection = zcollection.load(indexer=indexer.query({ 215 | 'cycle_number': [2, 4], 216 | 'pass_number': [1, 5] 217 | })) 218 | assert selection is not None 219 | assert set(selection.variables['cycle_number'].values) == {2, 4} 220 | assert set(selection.variables['pass_number'].values) == {1, 5} 221 | 222 | indexer = HalfOrbitIndexer.open(str( 223 | local_fs.collection.joinpath('index.parquet')), 224 | filesystem=local_fs.fs) 225 | assert indexer.meta == {'attribute': b'value'} 226 | selection = zcollection.load(indexer=indexer.query({ 227 | 'cycle_number': [2, 4], 228 | 'pass_number': [1, 5] 229 | })) 230 | assert selection is not None 231 | assert set(selection.variables['cycle_number'].values) == {2, 4} 232 | assert set(selection.variables['pass_number'].values) == {1, 5} 233 | 234 | indices = tuple( 235 | indexer.query({'cycle_number': [2, 4]}, only_partition_keys=False)) 236 | assert tuple(item[0] for item in indices[0][0]) == ( 237 | 'cycle_number', 238 | 'pass_number', 239 | 'year', 240 | 'month', 241 | ) 242 | 243 | indexer = HalfOrbitIndexer('', filesystem=fsspec.filesystem('memory')) 244 | assert indexer.query({'cycle_number': [2, 4]}) == 
() 245 | -------------------------------------------------------------------------------- /examples/ex_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Overview of a View. 3 | =================== 4 | 5 | This section outlines the steps required to get started with the main features 6 | of a ``View``. 7 | """ 8 | from typing import Iterator 9 | import pprint 10 | 11 | import dask.distributed 12 | import fsspec 13 | import numpy 14 | 15 | import zcollection 16 | import zcollection.tests.data 17 | 18 | 19 | # %% 20 | # Initialization of the environment 21 | # --------------------------------- 22 | # 23 | # As in the example of handling 24 | # :ref:`collections `, we will create 25 | # the test environment and a collection. 26 | def create_dataset() -> zcollection.Dataset: 27 | """Create a dataset to record.""" 28 | generator: Iterator[zcollection.Dataset] = \ 29 | zcollection.tests.data.create_test_dataset_with_fillvalue() 30 | return next(generator) 31 | 32 | 33 | cluster = dask.distributed.LocalCluster(processes=False) 34 | client = dask.distributed.Client(cluster) 35 | 36 | zds: zcollection.Dataset | None = create_dataset() 37 | assert zds is not None 38 | fs: fsspec.AbstractFileSystem = fsspec.filesystem('memory') 39 | collection: zcollection.Collection = zcollection.create_collection( 40 | 'time', 41 | zds, 42 | zcollection.partitioning.Date(('time', ), resolution='M'), 43 | '/view_reference', 44 | filesystem=fs) 45 | collection.insert(zds, merge_callable=zcollection.merging.merge_time_series) 46 | 47 | # %% 48 | # Creation of views 49 | # ----------------- 50 | # 51 | # A :py:class:`view` allows you to extend a collection 52 | # (:py:class:`a view reference`) that you are 53 | # not allowed to modify. 54 | view: zcollection.View = zcollection.create_view( 55 | '/my_view', 56 | zcollection.view.ViewReference('/view_reference', fs), 57 | filesystem=fs) 58 | 59 | # %% 60 | # .. note:: 61 | # 62 | # The created view can be accessed using the following command :: 63 | # 64 | # >>> view = zcollection.open_view("/my_view", filesystem=fs) 65 | # 66 | # Editing variables 67 | # ----------------- 68 | # When the view is created, it has no data of its own, it uses all the 69 | # partitions defined in the reference view. You can select the partitions used 70 | # from the reference collection by specifying the keyword argument ``filters`` 71 | # during the creation of the view. 72 | pprint.pprint(fs.listdir('/my_view')) 73 | 74 | # %% 75 | # It's not yet possible to read data from the view, as it does not yet have any 76 | # data. To minimize the risk of mismatches with the reference view, the data 77 | # present in the view drives the range of data that can be read. 78 | try: 79 | view.load() 80 | except ValueError as err: 81 | print(err) 82 | 83 | # %% 84 | # Such a state of the view is not very interesting. But it is possible to 85 | # :py:meth:`add` and modify variables in 86 | # order to enhance the view. 87 | var3_template: zcollection.meta.Variable = zds.metadata().variables['var2'] 88 | var3_template.name = 'var3' 89 | view.add_variable(var3_template) 90 | del var3_template 91 | 92 | # %% 93 | # This step creates all necessary partitions for the new variable. 94 | pprint.pprint(fs.listdir('/my_view/year=2000')) 95 | 96 | # %% 97 | # The new variable is not initialized. 
98 | zds = view.load() 99 | assert zds is not None 100 | zds.variables['var3'].values 101 | 102 | # %% 103 | # The same principle used by the collection allows you to 104 | # :py:meth:`update` the variables. 105 | view.update( 106 | lambda ds: dict(var3=ds['var1'].values * 0 + 1)) # type: ignore[arg-type] 107 | 108 | # %% 109 | # Like the :py:meth:`update` method 110 | # of the collection, the update method of the view allows selecting the 111 | # neighboring partitions with the keyword argument ``depth``. 112 | 113 | # %% 114 | zds = view.load() 115 | assert zds is not None 116 | var3: numpy.ndarray = zds['var3'].values 117 | print(var3) 118 | 119 | # %% 120 | # **Warning**: The variables of the reference collection cannot be edited. 121 | try: 122 | view.update( 123 | lambda ds: dict(var2=ds['var2'].values * 0)) # type: ignore[arg-type] 124 | except ValueError as exc: 125 | print(str(exc)) 126 | 127 | 128 | # %% 129 | # Sync the view with the reference 130 | # -------------------------------- 131 | # The view can no longer be read if the number of elements in the reference 132 | # collection and in the view differ. To avoid this problem, the view 133 | # is automatically synchronized when it is opened. The data already present in 134 | # the view are kept only if the reference collection has been extended (new 135 | # data appended after the existing data). The existing arrays in the view are 136 | # resized and filled with the defined fill values. If you want to know which 137 | # partitions have been synchronized, use the following data flow: open the 138 | # view without synchronizing it (``resync=False``), then call the ``sync`` 139 | # method of the view class to obtain a filter that selects all the partitions 140 | # that have been modified. 141 | # 142 | # Let's illustrate this data flow with an example. 143 | # 144 | # First, we create a utility function to resize a dataset. 145 | def resize(ds: zcollection.Dataset, dim: str, 146 | size: int) -> zcollection.Dataset: 147 | """Resize a dataset.""" 148 | 149 | def new_shape( 150 | var: zcollection.Variable, 151 | selected_dim: str, 152 | new_size: int, 153 | ) -> tuple[int, ...]: 154 | """Compute the new shape of a variable.""" 155 | return tuple(new_size if dim == selected_dim else size 156 | for dim, size in zip(var.dimensions, var.shape)) 157 | 158 | return zcollection.Dataset([ 159 | zcollection.Array( 160 | name, 161 | numpy.resize(var.array.compute(), new_shape(var, dim, size)), 162 | var.dimensions, 163 | attrs=var.attrs, 164 | compressor=var.compressor, 165 | fill_value=var.fill_value, 166 | filters=var.filters, 167 | ) for name, var in ds.variables.items() 168 | ]) 169 | 170 | 171 | # %% 172 | # We then modify the last partition of the reference collection. We start by 173 | # opening the reference collection and loading the last partition. 174 | collection = zcollection.open_collection('/view_reference', 175 | filesystem=fs, 176 | mode='w') 177 | zds = collection.load( 178 | filters=lambda keys: keys['month'] == 6 and keys['year'] == 2000) 179 | assert zds is not None 180 | 181 | # %% 182 | # We create a new time variable, resize the dataset and insert the new time 183 | # values. 
184 | time: numpy.ndarray = numpy.arange( 185 | numpy.datetime64('2000-06-01T00:00:00'), 186 | numpy.datetime64('2000-06-30T23:59:59'), 187 | numpy.timedelta64(1, 'h'), 188 | ) 189 | zds = resize(zds, 'num_lines', len(time)) 190 | zds['time'].values = time 191 | 192 | # %% 193 | # Finally, we update the partition in the reference collection. 194 | collection.insert(zds) 195 | 196 | # %% 197 | # Now we cannot load the view, because the shape of the last partition is no 198 | # longer consistent between the reference collection and the view. 199 | try: 200 | view.load() 201 | except ValueError as err: 202 | print(err) 203 | 204 | # %% 205 | # We call the ``sync`` method to resynchronize the view. 206 | filters = view.sync() 207 | 208 | # %% 209 | # The method returns a callable that can be used to filter the partitions that 210 | # have been synchronized. You can use this information to perform an 211 | # :py:meth:`update` of the view on the 212 | # synchronized partitions: :: 213 | # 214 | # view.update( 215 | # lambda ds: dict(var3=ds['var1'].values * 0 + 1), 216 | # filters=filters) 217 | # 218 | print(tuple(view.partitions(filters=filters))) 219 | 220 | # %% 221 | # The view is now synchronized and can be loaded. 222 | zds = view.load() 223 | assert zds is not None 224 | zds.variables['var3'].values 225 | 226 | # %% 227 | # Map a function over the view 228 | # ---------------------------- 229 | # It's possible to map a function over the partitions of the view. 230 | for partition, array in view.map(lambda ds: ( # type: ignore[arg-type] 231 | ds['var1'].values + ds['var2'].values)).compute(): 232 | print(f' * partition = {partition}: mean = {array.mean()}') 233 | 234 | # %% 235 | # .. seealso:: 236 | # 237 | # See the :py:meth:`map_overlap` method 238 | # to apply a function over the partitions of the view while also selecting 239 | # the neighboring partitions. 240 | # 241 | # Drop a variable 242 | # ---------------- 243 | # A method allows you to 244 | # :py:meth:`drop_variable` variables from 245 | # the view. 246 | view.drop_variable('var3') 247 | try: 248 | view.load() 249 | except ValueError as err: 250 | # The view no longer has any variable of its own. 251 | print(err) 252 | 253 | # %% 254 | # **Warning**: The variables of the reference collection cannot be dropped. 255 | try: 256 | view.drop_variable('var2') 257 | except ValueError as exc: 258 | print(str(exc)) 259 | 260 | # %% 261 | # Close the local cluster to avoid printing warning messages in the other 262 | # examples. 263 | client.close() 264 | cluster.close() 265 | -------------------------------------------------------------------------------- /zcollection/tests/test_meta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Metadata testing. 7 | ================= 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any 12 | import json 13 | import pathlib 14 | import pickle 15 | 16 | import numpy 17 | import pytest 18 | import zarr.codecs 19 | 20 | from ..
import meta 21 | 22 | 23 | def test_attribute() -> None: 24 | """Test attribute creation.""" 25 | att = meta.Attribute('a', 23.4) 26 | assert isinstance(att, meta.Attribute) 27 | assert att.name == 'a' 28 | assert att.value == 23.4 29 | assert str(att) == "Attribute('a', 23.4)" 30 | # pylint: disable=comparison-with-itself 31 | assert att == att 32 | assert (att == 'X') is False 33 | assert att != meta.Attribute('a', '23.4') 34 | assert isinstance(meta.Attribute.from_config(att.get_config()), 35 | meta.Attribute) 36 | 37 | att = meta.Attribute('a', numpy.arange(10)) 38 | assert att == meta.Attribute('a', numpy.arange(10)) 39 | 40 | att = meta.Attribute('a', numpy.datetime64('2000-01-01', 'us')) 41 | assert att == att 42 | # pylint: enable=comparison-with-itself 43 | 44 | 45 | def test_dimension() -> None: 46 | """Test dimension creation.""" 47 | dim = meta.Dimension('a', 12) 48 | assert isinstance(dim, meta.Dimension) 49 | assert dim.name == 'a' 50 | assert dim.value == 12 51 | assert str(dim) == "Dimension('a', 12)" 52 | # pylint: disable=comparison-with-itself 53 | assert dim == dim 54 | # pylint: enable=comparison-with-itself 55 | assert dim != meta.Dimension('a', 11) 56 | assert isinstance(meta.Dimension.from_config(dim.get_config()), 57 | meta.Dimension) 58 | 59 | 60 | def test_variable() -> None: 61 | """Test variable creation.""" 62 | var = meta.Variable('a', 63 | numpy.dtype('int16'), 64 | dimensions=('a', ), 65 | attrs=(meta.Attribute('x', 12), ), 66 | compressor=zarr.codecs.Zlib(), 67 | filters=(zarr.codecs.Delta(numpy.float64, numpy.int16), 68 | zarr.codecs.FixedScaleOffset( 69 | 0, 1, numpy.int16))) 70 | assert isinstance(var, meta.Variable) 71 | assert str(var) == "Variable('a')" 72 | # pylint: disable=comparison-with-itself 73 | assert var == var 74 | assert (var == 2) is False 75 | other: meta.Variable = meta.Variable.from_config(var.get_config()) 76 | assert var == other 77 | other.name = 'x' 78 | assert var != other 79 | # pylint: enable=comparison-with-itself 80 | 81 | 82 | def test_dataset() -> None: 83 | """Test dataset creation.""" 84 | root: pathlib.Path = pathlib.Path(__file__).parent 85 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 86 | first: dict[str, Any] = json.load(stream) 87 | with root.joinpath('second_dataset.json').open(encoding='utf-8') as stream: 88 | second: dict[str, Any] = json.load(stream) 89 | ds: meta.Dataset = meta.Dataset.from_config(first) 90 | other: meta.Dataset = meta.Dataset.from_config(second) 91 | assert ds == other 92 | assert (ds == 2) is False 93 | assert (ds != other) is False 94 | ds.dimensions = ds.dimensions + ('dummy', ) 95 | assert ds != other 96 | 97 | 98 | def test_select_variables() -> None: 99 | """Test select_variables.""" 100 | root: pathlib.Path = pathlib.Path(__file__).parent 101 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 102 | config: dict[str, Any] = json.load(stream) 103 | ds: meta.Dataset = meta.Dataset.from_config(config) 104 | variables: set[str] = ds.select_variables(('longitude', 'latitude')) 105 | assert variables == {'longitude', 'latitude'} 106 | variables = ds.select_variables(drop_variables=('longitude', 'latitude')) 107 | assert set(variables) & {'longitude', 'latitude'} == set() 108 | variables = ds.select_variables(keep_variables=('longitude', 'latitude', 109 | 'time'), 110 | drop_variables=('time', )) 111 | assert variables == {'longitude', 'latitude'} 112 | 113 | 114 | def test_search_same_dimensions_as() -> None: 115 | """Test 
search_same_dimensions_as.""" 116 | root: pathlib.Path = pathlib.Path(__file__).parent 117 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 118 | first: dict[str, Any] = json.load(stream) 119 | ds: meta.Dataset = meta.Dataset.from_config(first) 120 | other: meta.Variable = ds.search_same_dimensions_as( 121 | ds.variables['simulated_error_karin']) 122 | assert other.dimensions == ds.variables['simulated_error_karin'].dimensions 123 | 124 | other = meta.Variable.from_config(other.get_config()) 125 | other.dimensions = other.dimensions + ('dummy', ) 126 | with pytest.raises(ValueError): 127 | ds.search_same_dimensions_as(other) 128 | 129 | 130 | def test_pickle() -> None: 131 | """Test pickling.""" 132 | root: pathlib.Path = pathlib.Path(__file__).parent 133 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 134 | data: dict[str, Any] = json.load(stream) 135 | ds: meta.Dataset = meta.Dataset.from_config(data) 136 | other: meta.Dataset = pickle.loads(pickle.dumps(ds)) 137 | assert ds == other 138 | 139 | 140 | def test_missing_variables() -> None: 141 | """Test missing_variables.""" 142 | root: pathlib.Path = pathlib.Path(__file__).parent 143 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 144 | data: dict[str, Any] = json.load(stream) 145 | ds: meta.Dataset = meta.Dataset.from_config(data) 146 | other: meta.Dataset = pickle.loads(pickle.dumps(ds)) 147 | 148 | assert len(ds.missing_variables(other)) == 0 149 | 150 | del other.variables['cross_track_distance'] 151 | del other.variables['cycle_number'] 152 | 153 | assert set(ds.missing_variables(other)) == { 154 | 'cross_track_distance', 'cycle_number' 155 | } 156 | 157 | other.variables['XXX'] = other.variables['longitude'] 158 | other.variables['XXX'].name = 'XXX' 159 | with pytest.raises(ValueError): 160 | ds.missing_variables(other) 161 | 162 | 163 | def test_add_variable() -> None: 164 | """Test adding a variable.""" 165 | ds = meta.Dataset(('x', 'y'), []) 166 | ds.add_variable(meta.Variable('a', numpy.float64, dimensions=('x', 'y'))) 167 | 168 | with pytest.raises(ValueError): 169 | ds.add_variable( 170 | meta.Variable('a', numpy.float64, dimensions=('x', 'y'))) 171 | 172 | ds.add_variable(meta.Variable('b', numpy.float64, dimensions=('x', ))) 173 | ds.add_variable(meta.Variable('c', numpy.float64, dimensions=('y', ))) 174 | 175 | with pytest.raises(ValueError): 176 | ds.add_variable( 177 | meta.Variable('d', numpy.float64, dimensions=('a', 'y'))) 178 | 179 | with pytest.raises(ValueError): 180 | ds.add_variable( 181 | meta.Variable('e', numpy.float64, dimensions=('a', 'b'))) 182 | 183 | with pytest.raises(ValueError): 184 | ds.add_variable(meta.Variable('f', numpy.float64, dimensions=('a', ))) 185 | 186 | ds.add_variable(meta.Variable('g', numpy.float64)) 187 | 188 | 189 | def test_select_variables_by_dims() -> None: 190 | """Test select_variable_by_dims.""" 191 | ds = meta.Dataset(('a', 'b', 'x', 'y'), []) 192 | ds.add_variable(meta.Variable('a', numpy.float64, dimensions=('x', 'y'))) 193 | ds.add_variable(meta.Variable('b', numpy.float64, dimensions=('x', ))) 194 | ds.add_variable(meta.Variable('c', numpy.float64, dimensions=('y', ))) 195 | ds.add_variable(meta.Variable('d', numpy.float64, dimensions=('a', 'y'))) 196 | ds.add_variable(meta.Variable('e', numpy.float64, dimensions=('a', 'b'))) 197 | ds.add_variable(meta.Variable('f', numpy.float64, dimensions=('a', ))) 198 | ds.add_variable(meta.Variable('g', numpy.float64)) 199 | 200 | assert 
ds.select_variables_by_dims(('x', 'y')) == {'a', 'b', 'c', 'd'} 201 | assert ds.select_variables_by_dims(('x', )) == {'a', 'b'} 202 | assert ds.select_variables_by_dims(('y', )) == {'a', 'c', 'd'} 203 | assert ds.select_variables_by_dims(('a', 'y')) == {'a', 'c', 'd', 'e', 'f'} 204 | assert ds.select_variables_by_dims(('a', 'b')) == {'d', 'e', 'f'} 205 | assert ds.select_variables_by_dims(('a', )) == {'d', 'e', 'f'} 206 | assert ds.select_variables_by_dims(()) == {'g'} 207 | assert ds.select_variables_by_dims(('z', )) == set() 208 | 209 | assert ds.select_variables_by_dims(('x', 'y'), 210 | predicate=False) == {'e', 'f', 'g'} 211 | assert ds.select_variables_by_dims( 212 | ('x', ), predicate=False) == {'c', 'd', 'e', 'f', 'g'} 213 | assert ds.select_variables_by_dims( 214 | ('y', ), predicate=False) == {'b', 'e', 'f', 'g'} 215 | assert ds.select_variables_by_dims(('a', 'y'), 216 | predicate=False) == {'b', 'g'} 217 | assert ds.select_variables_by_dims( 218 | ('a', 'b'), predicate=False) == {'a', 'b', 'c', 'g'} 219 | assert ds.select_variables_by_dims( 220 | ('a', ), predicate=False) == {'a', 'b', 'c', 'g'} 221 | assert ds.select_variables_by_dims( 222 | (), predicate=False) == {'a', 'b', 'c', 'd', 'e', 'f'} 223 | assert ds.select_variables_by_dims( 224 | ('z', ), predicate=False) == {'a', 'b', 'c', 'd', 'e', 'f', 'g'} 225 | -------------------------------------------------------------------------------- /zcollection/tests/test_fs_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing utilities 7 | ================= 8 | """ 9 | from typing import Any 10 | import os 11 | import pathlib 12 | import platform 13 | 14 | import fsspec 15 | import fsspec.implementations.local 16 | 17 | from .. import fs_utils 18 | # pylint: disable=unused-import # Need to import for fixtures 19 | from .cluster import dask_client, dask_cluster 20 | 21 | # pylint: disable=unused-import 22 | 23 | #: Test data 24 | TEXT = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam porta 25 | turpis dictum, porta tellus eu, convallis mi. Integer at placerat diam. Donec in 26 | various neque. Morbi sed nisi finibus, mattis velit non, pulvinar metus. Duis 27 | feugiat diam eget augue posuere, nec aliquam dolor tristique. Aliquam a dolor 28 | vel ante sagittis dictum vel at dolor. Suspendisse velit dolor, vestibulum eget 29 | aliquet ut, imperdiet at justo. Nullam sit amet suscipit orci, bibendum sagittis 30 | orci. Aliquam mattis feugiat rutrum. Vivamus fermentum ex non mauris faucibus 31 | vehicula. Donec odio lacus, viverra et hendrerit eu, mollis eget mauris. Duis 32 | suscipit, velit nec finibus ullamcorper, nisi lorem fermentum tellus, ut viverra 33 | nunc lorem ut odio. Duis eget ligula maximus, venenatis nulla a, commodo dolor. 34 | Aenean justo sapien, mollis aliquam vestibulum id, suscipit a ligula. Phasellus 35 | porta arcu erat, elementum faucibus leo auctor vel. 
Integer vel pharetra leo.''' 36 | 37 | 38 | def test_join_path() -> None: 39 | """Test the join_path function.""" 40 | assert fs_utils.join_path('a', 'b', 'c') == 'a/b/c' 41 | assert fs_utils.join_path('a', 'b', 'c', 'd') == 'a/b/c/d' 42 | assert fs_utils.join_path('a', 'b', 'c', 'd', 'e') == 'a/b/c/d/e' 43 | assert fs_utils.join_path('a', 'b', 'c', 'd', 'e', 'f') == 'a/b/c/d/e/f' 44 | 45 | 46 | def test_get_fs() -> None: 47 | """Test the get_fs function.""" 48 | fs = fs_utils.get_fs('file') 49 | assert isinstance(fs, fsspec.implementations.local.LocalFileSystem) 50 | fs = fs_utils.get_fs() 51 | assert isinstance(fs, fsspec.implementations.local.LocalFileSystem) 52 | 53 | 54 | def test_fs_walk(tmpdir) -> None: 55 | """Test the fs_walk function.""" 56 | item: Any 57 | 58 | for idx, item in enumerate([ 59 | ('year=2014', 'month=5'), 60 | ('year=2014', 'month=5', 'day=2'), 61 | ('year=2014', 'month=5', 'day=1'), 62 | ('year=2014', 'month=5', 'day=3'), 63 | ('year=2014', 'month=4'), 64 | ('year=2014', 'month=4', 'day=16'), 65 | ('year=2014', 'month=4', 'day=24'), 66 | ('year=2014', 'month=4', 'day=27'), 67 | ('year=2014', 'month=4', 'day=20'), 68 | ('year=2014', 'month=4', 'day=29'), 69 | ('year=2014', 'month=4', 'day=14'), 70 | ('year=2014', 'month=4', 'day=25'), 71 | ('year=2014', 'month=4', 'day=19'), 72 | ('year=2014', 'month=4', 'day=12'), 73 | ('year=2014', 'month=4', 'day=23'), 74 | ('year=2014', 'month=4', 'day=17'), 75 | ('year=2014', 'month=4', 'day=28'), 76 | ('year=2014', 'month=4', 'day=13'), 77 | ('year=2014', 'month=4', 'day=21'), 78 | ('year=2014', 'month=4', 'day=15'), 79 | ('year=2014', 'month=4', 'day=18'), 80 | ('year=2014', 'month=4', 'day=26'), 81 | ('year=2014', 'month=4', 'day=22'), 82 | ('year=2014', 'month=4', 'day=30'), 83 | ]): 84 | path = pathlib.Path(tmpdir).joinpath(*item) 85 | path.mkdir(parents=True, exist_ok=False) 86 | if 'day' in item[-1]: 87 | with path.joinpath(f'file_{idx}.txt').open(mode='w', 88 | encoding='utf-8'): 89 | ... 90 | 91 | fs = fs_utils.get_fs() 92 | listing1 = [] 93 | for root, _dirs, files in fs_utils.fs_walk(fs, str(tmpdir), sort=True): 94 | for item in files: 95 | listing1.append(fs.sep.join([root, item])) 96 | 97 | listing2 = [] 98 | for root, _dirs, files in fs_utils.fs_walk(fs, str(tmpdir), sort=False): 99 | for item in files: 100 | listing2.append(fs.sep.join([root, item])) 101 | 102 | assert listing1 == sorted(listing2) 103 | 104 | assert list( 105 | fs_utils.fs_walk(fs, 106 | str(pathlib.Path(tmpdir).joinpath('inexistent')), 107 | sort=True)) == [('', [], [])] 108 | 109 | 110 | def test_normalize_path() -> None: 111 | """Test the normalize_path function.""" 112 | fs = fsspec.filesystem('file') 113 | root = str(pathlib.Path('/').resolve()) 114 | if platform.system() == 'Windows': 115 | # fsspec returns only the drive letter for the root path. 
116 | root = root.replace('\\', '') 117 | 118 | def istrcmp(str1, str2): 119 | """Case insensitive string comparison.""" 120 | if platform.system() == 'Windows': 121 | str1 = str1.replace('\\', '/') 122 | str2 = str2.replace('\\', '/') 123 | return str1.lower() == str2.lower() 124 | 125 | assert istrcmp(fs_utils.normalize_path(fs, '/'), root) 126 | assert istrcmp(fs_utils.normalize_path(fs, './foo'), 127 | str(pathlib.Path('.').resolve() / 'foo')) 128 | 129 | fs = fsspec.filesystem('memory') 130 | assert fs_utils.normalize_path(fs, '/') == os.path.sep 131 | assert fs_utils.normalize_path(fs, './foo') == f'{os.path.sep}foo' 132 | 133 | fs = fsspec.filesystem('s3') 134 | assert fs_utils.normalize_path(fs, '/') == '/' 135 | assert fs_utils.normalize_path(fs, './foo') == './foo' 136 | 137 | 138 | def test_copy_file(tmpdir) -> None: 139 | """Test the copy file across different file systems.""" 140 | fs_source = fsspec.filesystem('file') 141 | fs_target = fsspec.filesystem('memory') 142 | path = str(tmpdir / 'foo.txt') 143 | with fs_source.open(path, mode='wb', encoding='utf-8') as stream: 144 | stream.write(TEXT.encode('utf-8')) 145 | fs_utils.copy_file(path, 'foo.txt', fs_source, fs_target) 146 | 147 | assert fs_target.cat('foo.txt').decode('utf-8') == TEXT 148 | 149 | 150 | def test_copy_files(tmpdir) -> None: 151 | """Test the copy files across different file systems.""" 152 | source = tmpdir / 'source' 153 | target = tmpdir / 'target' 154 | fs_source = fsspec.filesystem('file') 155 | fs_target = fsspec.filesystem('file') 156 | fs_source.mkdir(source) 157 | fs_target.mkdir(target) 158 | paths = [ 159 | str(source / item) for item in ( 160 | 'foo.txt', 161 | 'bar.txt', 162 | 'baz.txt', 163 | ) 164 | ] 165 | for path in paths: 166 | with fs_source.open(path, mode='wb', encoding='utf-8') as stream: 167 | stream.write(TEXT.encode('utf-8')) 168 | fs_utils.copy_files(paths, str(target), fs_source, fs_target) 169 | 170 | for item in fs_target.ls(str(target)): 171 | assert fs_target.cat(item).decode('utf-8') == TEXT 172 | 173 | 174 | def test_copy_tree(tmpdir) -> None: 175 | """Test the copy tree across different file systems.""" 176 | item: Any 177 | fs_source = fsspec.filesystem('file') 178 | fs_target = fsspec.filesystem('memory') 179 | 180 | for idx, item in enumerate([ 181 | ('year=2014', 'month=5'), 182 | ('year=2014', 'month=5', 'day=2'), 183 | ('year=2014', 'month=5', 'day=1'), 184 | ('year=2014', 'month=5', 'day=3'), 185 | ('year=2014', 'month=4'), 186 | ('year=2014', 'month=4', 'day=16'), 187 | ('year=2014', 'month=4', 'day=24'), 188 | ('year=2014', 'month=4', 'day=27'), 189 | ('year=2014', 'month=4', 'day=20'), 190 | ('year=2014', 'month=4', 'day=29'), 191 | ('year=2014', 'month=4', 'day=14'), 192 | ('year=2014', 'month=4', 'day=25'), 193 | ('year=2014', 'month=4', 'day=19'), 194 | ('year=2014', 'month=4', 'day=12'), 195 | ('year=2014', 'month=4', 'day=23'), 196 | ('year=2014', 'month=4', 'day=17'), 197 | ('year=2014', 'month=4', 'day=28'), 198 | ('year=2014', 'month=4', 'day=13'), 199 | ('year=2014', 'month=4', 'day=21'), 200 | ('year=2014', 'month=4', 'day=15'), 201 | ('year=2014', 'month=4', 'day=18'), 202 | ('year=2014', 'month=4', 'day=26'), 203 | ('year=2014', 'month=4', 'day=22'), 204 | ('year=2014', 'month=4', 'day=30'), 205 | ]): 206 | path = fs_utils.join_path(str(tmpdir), *item) 207 | fs_source.makedirs(path, exist_ok=False) 208 | if 'day' in item[-1]: 209 | with fs_source.open(fs_utils.join_path(path, f'file_{idx}.txt'), 210 | mode='wb', 211 | encoding='utf-8') as stream: 212 
| stream.write(TEXT.encode('utf-8')) 213 | 214 | fs_utils.copy_tree(str(tmpdir), '/tree', fs_source, fs_target) 215 | 216 | for root, dirs, files in fs_utils.fs_walk(fs_target, '/tree'): 217 | for item in files: 218 | assert fs_target.cat(fs_utils.join_path( 219 | root, item)).decode('utf-8') == TEXT 220 | for item in dirs: 221 | item = item.replace('\\', '/') 222 | parts = item.replace('/tree/', '').split(fs_target.sep) 223 | assert parts[0] == 'year=2014' 224 | if len(parts) > 1: 225 | assert parts[1] in ['month=4', 'month=5'] 226 | if len(parts) > 2: 227 | assert 'day=' in parts[2] 228 | if len(parts) > 3: 229 | assert 'file_' in parts[3] 230 | assert parts[3].endswith('.txt') 231 | --------------------------------------------------------------------------------
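A minimal sketch (the ``/tmp/partitions`` source path is hypothetical, and this snippet is not part of the repository) showing how the ``fs_utils`` helpers exercised by the tests above can be combined to mirror a local directory tree into an in-memory filesystem:

import fsspec

from zcollection import fs_utils

source_fs = fs_utils.get_fs()            # defaults to the local filesystem
target_fs = fsspec.filesystem('memory')  # in-memory target filesystem

source = '/tmp/partitions'               # hypothetical directory to mirror
fs_utils.copy_tree(source, '/mirror', source_fs, target_fs)

# Walk the mirrored tree in sorted order and print every copied file.
for root, _dirs, files in fs_utils.fs_walk(target_fs, '/mirror', sort=True):
    for name in files:
        print(fs_utils.join_path(root, name))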