├── .coveragerc ├── docs ├── source │ ├── images │ │ └── merge_time_series.png │ ├── _templates │ │ └── autosummary │ │ │ ├── base.rst │ │ │ ├── module.rst │ │ │ └── class.rst │ ├── api.rst │ ├── index.rst │ ├── install.rst │ ├── conf.py │ └── release.rst ├── Makefile └── make.bat ├── zcollection ├── tests │ ├── __init__.py │ ├── test_sync.py │ ├── fixture.py │ ├── test_mathematics.py │ ├── fs.py │ ├── test_expression.py │ ├── cluster.py │ ├── test_dask_utils.py │ ├── data.py │ ├── s3.py │ ├── test_compressed_array.py │ ├── test_meta.py │ └── test_fs_utils.py ├── view │ └── tests │ │ └── __init__.py ├── collection │ ├── tests │ │ └── __init__.py │ └── callable_objects.py ├── indexing │ ├── tests │ │ ├── __init__.py │ │ └── test_abc.py │ └── __init__.py ├── merging │ ├── tests │ │ ├── __init__.py │ │ ├── test_merging.py │ │ └── test_time_series.py │ ├── time_series.py │ └── __init__.py ├── variable │ ├── tests │ │ ├── __init__.py │ │ ├── test_abc.py │ │ ├── data.py │ │ ├── test_delayed_array.py │ │ └── test_array.py │ ├── __init__.py │ └── array.py ├── partitioning │ ├── tests │ │ ├── __init__.py │ │ ├── test_registry.py │ │ ├── data.py │ │ └── test_sequence.py │ ├── __init__.py │ ├── registry.py │ ├── sequence.py │ └── date.py ├── convenience │ ├── __init__.py │ ├── view.py │ └── collection.py ├── mathematics.py ├── __init__.py ├── expression.py ├── sync.py ├── representation.py ├── dask_utils.py ├── type_hints.py └── fs_utils.py ├── examples ├── README.rst ├── ex_indexing.py └── ex_view.py ├── .vscode └── settings.json ├── readthedocs.yml ├── .github └── workflows │ ├── pre-commit.yml │ ├── pypipublish.yaml │ └── ci.yaml ├── conda ├── environment.yml └── meta.yaml ├── pyproject.toml ├── setup.py ├── conftest.py ├── LICENSE ├── .pre-commit-config.yaml ├── README.rst ├── setup.cfg └── .gitignore /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */tests/* 4 | zcollection/typing.py 5 | */pytest* 6 | -------------------------------------------------------------------------------- /docs/source/images/merge_time_series.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CNES/zcollection/HEAD/docs/source/images/merge_time_series.png -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/base.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. auto{{ objtype }}:: {{ objname }} 6 | -------------------------------------------------------------------------------- /zcollection/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/view/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | -------------------------------------------------------------------------------- /zcollection/collection/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/indexing/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/merging/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/variable/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Example Gallery 2 | =============== 3 | 4 | This gallery of examples shows a variety of relatively small snippets or 5 | examples of tasks that can be done with the ``zcollection`` package. 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "." 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.formatting.provider": "yapf" 8 | } 9 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | builder: html 5 | configuration: docs/source/conf.py 6 | 7 | python: 8 | install: 9 | - path: . 
10 | method: setuptools 11 | 12 | conda: 13 | environment: conda/environment.yml 14 | 15 | build: 16 | os: ubuntu-20.04 17 | tools: 18 | python: mambaforge-4.10 19 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | 7 | jobs: 8 | pre-commit: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-python@v3 13 | with: 14 | python-version: 3.11 15 | - uses: pre-commit/action@v3.0.0 16 | -------------------------------------------------------------------------------- /conda/environment.yml: -------------------------------------------------------------------------------- 1 | name: ZCollection 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - dask-core 7 | - distributed 8 | - furo 9 | - numcodecs 10 | - pyarrow 11 | - pypandoc 12 | - pytest 13 | - requests 14 | - s3fs 15 | - setuptools-scm 16 | - sphinx-gallery 17 | - sphinx-inline-tabs 18 | - xarray 19 | - zarr>=2.11.0 20 | -------------------------------------------------------------------------------- /zcollection/tests/test_sync.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing the sync module. 7 | ======================== 8 | """ 9 | from .. import sync 10 | 11 | 12 | def test_no_sync(): 13 | """Test the no_sync class.""" 14 | touch = False 15 | with sync.NoSync() as _: 16 | touch = True 17 | assert touch 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "wheel", "setuptools_scm>=6.2"] 3 | 4 | [tool.setuptools_scm] 5 | write_to = "zcollection/version.py" 6 | write_to_template = ''' 7 | # Copyright (c) 2023 CNES 8 | # 9 | # All rights reserved. Use of this source code is governed by a 10 | # BSD-style license that can be found in the LICENSE file. 11 | """ 12 | Get software version information 13 | ================================ 14 | """ 15 | __version__ = "{version}" 16 | ''' 17 | -------------------------------------------------------------------------------- /zcollection/convenience/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Convenience functions 7 | ===================== 8 | """ 9 | from .collection import create_collection, open_collection 10 | from .view import create_view, open_view 11 | 12 | __all__ = ( 13 | 'create_collection', 14 | 'open_collection', 15 | 'create_view', 16 | 'open_view', 17 | ) 18 | -------------------------------------------------------------------------------- /zcollection/variable/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Variables of a dataset. 
7 | ======================= 8 | """ 9 | from ..meta import Attribute 10 | from .abc import Variable, new_variable 11 | from .array import Array 12 | from .delayed_array import DelayedArray 13 | 14 | __all__ = ('Attribute', 'Variable', 'Array', 'DelayedArray', 'new_variable') 15 | -------------------------------------------------------------------------------- /zcollection/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Indexing a Collection. 7 | ====================== 8 | """ 9 | import warnings 10 | 11 | try: 12 | from .abc import Indexer, QueryDict, Scalar 13 | __all__ = ('Indexer', 'QueryDict', 'Scalar') 14 | except ImportError: # pragma: no cover 15 | warnings.warn( 16 | 'Install PyArrow to use the indexing capabilities of zcollection.') 17 | -------------------------------------------------------------------------------- /zcollection/tests/fixture.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixtures for the tests. 7 | ======================= 8 | """ 9 | from typing import Literal 10 | 11 | import pytest 12 | 13 | 14 | @pytest.fixture 15 | def dask_arrays() -> Literal[True]: 16 | """Load the data in Dask arrays.""" 17 | return True 18 | 19 | 20 | @pytest.fixture 21 | def numpy_arrays() -> Literal[False]: 22 | """Load the data in NumPy arrays.""" 23 | return False 24 | -------------------------------------------------------------------------------- /zcollection/tests/test_mathematics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Mathematics testing. 7 | ==================== 8 | """ 9 | from .. import mathematics 10 | 11 | 12 | def test_prod(): 13 | """Test the product of an iterable.""" 14 | assert mathematics.prod([]) == 1 15 | assert mathematics.prod([1]) == 1 16 | assert mathematics.prod([1, 2, 3]) == 6 17 | assert mathematics.prod([1, 2, 3, 4, 5]) == 120 18 | assert mathematics.prod([1, 2, 3, 4, 5, 6, 7, 8, 9]) == 362880 19 | assert mathematics.prod([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == 3628800 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /zcollection/mathematics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Mathematical functions. 7 | ======================= 8 | """ 9 | from typing import Iterable 10 | import functools 11 | import operator 12 | 13 | 14 | def prod(iterable: Iterable) -> int: 15 | """Return the product of all elements in the given iterable. 16 | 17 | Args: 18 | iterable: An iterable containing numeric values. 19 | 20 | Returns: 21 | The product of all elements in the iterable. If the iterable is empty, 22 | returns 1. 23 | """ 24 | return functools.reduce(operator.mul, iterable, 1) 25 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yaml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | with: 14 | submodules: 'true' 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: "3.x" 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools setuptools-scm twine 23 | - name: Build and publish 24 | run: | 25 | echo "[pypi]" > ~/.pypirc 26 | echo "username = __token__" >> ~/.pypirc 27 | echo "password = ${{ secrets.PYPI_PASSWORD }}" >> ~/.pypirc 28 | python setup.py sdist 29 | twine upload dist/* 30 | -------------------------------------------------------------------------------- /zcollection/variable/tests/test_abc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing interface for the variable module 7 | ========================================= 8 | """ 9 | import numpy 10 | 11 | from ..abc import not_equal 12 | 13 | 14 | def test_variable_not_equal() -> None: 15 | """Test if two values are different.""" 16 | assert not_equal(1, 2) is True 17 | assert not_equal(1, 1) is False 18 | assert not_equal(1, '1') is True 19 | assert not_equal(1, numpy.nan) is True 20 | assert not_equal(numpy.nan, numpy.nan) is False 21 | assert not_equal(numpy.nan, 1) is True 22 | assert not_equal(numpy.datetime64('NaT'), numpy.datetime64('NaT')) is False 23 | assert not_equal(numpy.datetime64('NaT'), 1) is True 24 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "zcollection" %} 2 | {% set version = "0.0" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | path: .. 10 | 11 | build: 12 | number: 0 13 | script: {{ PYTHON }} -m pip install . 
-vv --use-feature=in-tree-build 14 | skip: true # [linux32 or win32 or py<36] 15 | 16 | requirements: 17 | build: 18 | - python 19 | run: 20 | - dask 21 | - fsspec 22 | - numcodecs 23 | - numpy >=1.20 24 | - python 25 | - xarray 26 | - zarr 27 | test: 28 | requires: 29 | - pytest 30 | commands: 31 | - pytest --pyargs zcollection 32 | 33 | about: 34 | home: https://github.com/CNES/zcollection 35 | license: Proprietary 36 | license_family: Proprietary 37 | summary: 'Handle a collection of Zarr groups' 38 | doc_url: https://zcollection.readthedocs.io/en/latest/ 39 | dev_url: https://github.com/CNES/zcollection 40 | -------------------------------------------------------------------------------- /zcollection/partitioning/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Partitioning scheme. 7 | ==================== 8 | 9 | Entry point of the implemented partitioning schemes. 10 | 11 | * :py:class:`Sequence `: 12 | Partitioning a sequence of variables. 13 | * :py:class:`Date `: Partitioning a 14 | sequence of dates. 15 | 16 | .. class:: Partitioning 17 | 18 | Alias for :class:`zcollection.partitioning.abc.Partitioning`. 19 | """ 20 | from .abc import Partitioning 21 | from .date import Date 22 | from .registry import get_codecs, register_codec 23 | from .sequence import Sequence 24 | 25 | register_codec(Date) 26 | register_codec(Sequence) 27 | 28 | __all__ = ('Partitioning', 'Date', 'Sequence', 'get_codecs') 29 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """This script is the entry point for building, distributing and installing 6 | this module using distutils/setuptools.""" 7 | import pathlib 8 | 9 | import setuptools 10 | import setuptools.command.sdist 11 | 12 | # Working directory 13 | WORKING_DIRECTORY = pathlib.Path(__file__).parent.absolute() 14 | 15 | 16 | class SDist(setuptools.command.sdist.sdist): 17 | """Custom sdist command that copies the pytest configuration file into the 18 | package.""" 19 | user_options = setuptools.command.sdist.sdist.user_options 20 | 21 | def run(self): 22 | """Carry out the action.""" 23 | source = WORKING_DIRECTORY.joinpath('conftest.py') 24 | target = WORKING_DIRECTORY.joinpath('zcollection', 'conftest.py') 25 | source.rename(target) 26 | try: 27 | super().run() 28 | finally: 29 | target.rename(source) 30 | 31 | 32 | setuptools.setup(cmdclass={'sdist': SDist}) 33 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | {% block attributes -%} 5 | {% if attributes %} 6 | .. rubric:: {{ ('Modules Attributes') }} 7 | .. autosummary:: 8 | :toctree: 9 | {% for item in attributes %} 10 | {{ item }} 11 | {%- endfor %} 12 | {% endif -%} 13 | {% endblock -%} 14 | {% block classes -%} 15 | {% if classes %} 16 | .. rubric:: {{ ('Classes') }} 17 | .. autosummary:: 18 | :toctree: 19 | {% for item in classes %} 20 | {{ item }} 21 | {%- endfor %} 22 | {% endif -%} 23 | {% endblock -%} 24 | {% block exceptions -%} 25 | {% if exceptions %} 26 | .. rubric:: {{ ('Exceptions') }} 27 | .. autosummary:: 28 | :toctree: 29 | {% for item in exceptions %} 30 | {{ item }} 31 | {%- endfor %} 32 | {% endif -%} 33 | {% endblock -%} 34 | {% block functions -%} 35 | {% if functions %} 36 | .. rubric:: {{ ('Functions') }} 37 | .. autosummary:: 38 | :toctree: 39 | {% for item in functions %} 40 | {{ item }} 41 | {%- endfor %} 42 | {% endif -%} 43 | {% endblock -%} 44 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | ========== 7 | Test setup 8 | ========== 9 | """ 10 | 11 | 12 | def pytest_addoption(parser) -> None: 13 | """Add command line options to pytest.""" 14 | parser.addoption( 15 | '--s3', 16 | action='store_true', 17 | default=False, 18 | help='Enable tests on the local S3 server driven by minio. ' 19 | '(default: False)') 20 | parser.addoption( 21 | '--memory', 22 | action='store_true', 23 | default=False, 24 | help='Use a file system in memory instead of the local file system. ' 25 | '(default: False)') 26 | parser.addoption( 27 | '--threads_per_worker', 28 | action='store', 29 | default=None, 30 | type=int, 31 | help='Number of threads for each worker Dask. (default: the number of ' 32 | 'logical cores of the target platform).') 33 | parser.addoption( 34 | '--n_workers', 35 | action='store', 36 | default=None, 37 | type=int, 38 | help='Number of core for each worker Dask. (default: the number of ' 39 | 'cores of the target platform).') 40 | parser.addoption( 41 | '--processes', 42 | action='store_true', 43 | default=False, 44 | help='Whether to use processes or threads for Dask. 
(default: False)') 45 | -------------------------------------------------------------------------------- /zcollection/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Handle a collection of Zarr groups. 7 | =================================== 8 | """ 9 | from . import merging, partitioning 10 | from .collection import Collection 11 | from .collection.abc import Indexer, PartitionFilter, PartitionFilterCallback 12 | from .collection.callable_objects import ( 13 | MapCallable, 14 | PartitionCallable, 15 | UpdateCallable, 16 | ) 17 | from .convenience import ( 18 | create_collection, 19 | create_view, 20 | open_collection, 21 | open_view, 22 | ) 23 | from .dataset import Dataset, Expression 24 | from .meta import Attribute 25 | from .variable import Array, DelayedArray, Variable 26 | from .version import __version__ 27 | from .view import View, ViewReference, ViewUpdateCallable 28 | 29 | __all__ = ( 30 | '__version__', 31 | 'Array', 32 | 'Attribute', 33 | 'Collection', 34 | 'create_collection', 35 | 'create_view', 36 | 'Dataset', 37 | 'DelayedArray', 38 | 'Expression', 39 | 'Indexer', 40 | 'MapCallable', 41 | 'merging', 42 | 'open_collection', 43 | 'open_view', 44 | 'PartitionCallable', 45 | 'PartitionFilter', 46 | 'PartitionFilterCallback', 47 | 'partitioning', 48 | 'UpdateCallable', 49 | 'Variable', 50 | 'version', 51 | 'View', 52 | 'ViewReference', 53 | 'ViewUpdateCallable', 54 | ) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, CNES 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | :tocdepth: 2 2 | 3 | API Documentation 4 | ################# 5 | 6 | Partitioning 7 | ============ 8 | 9 | Handles the partitioning of the collection. 10 | 11 | .. autosummary:: 12 | :toctree: _generated/ 13 | 14 | zcollection.partitioning 15 | zcollection.partitioning.abc 16 | zcollection.partitioning.date 17 | zcollection.partitioning.registry 18 | zcollection.partitioning.sequence 19 | 20 | .. _merging_datasets: 21 | 22 | Merging of datasets 23 | =================== 24 | 25 | Merging of existing datasets in a partition. 26 | 27 | .. autosummary:: 28 | :toctree: _generated/ 29 | 30 | zcollection.merging 31 | zcollection.merging.time_series 32 | zcollection.merging.period 33 | 34 | Variable 35 | ======== 36 | 37 | Variables handled by the datasets. These objects manage access to the data 38 | stored in the collection. 39 | 40 | .. autosummary:: 41 | :toctree: _generated/ 42 | 43 | zcollection.variable.abc 44 | zcollection.variable.array 45 | zcollection.variable.delayed_array 46 | 47 | Collection 48 | ========== 49 | 50 | .. autosummary:: 51 | :toctree: _generated/ 52 | 53 | zcollection.collection 54 | zcollection.dask_utils 55 | zcollection.dataset 56 | zcollection.expression 57 | zcollection.fs_utils 58 | zcollection.meta 59 | zcollection.sync 60 | zcollection.type_hints 61 | zcollection.view 62 | 63 | Indexing 64 | ======== 65 | 66 | .. autosummary:: 67 | :toctree: _generated/ 68 | 69 | zcollection.indexing 70 | zcollection.indexing.abc 71 | 72 | Convenience functions 73 | ===================== 74 | 75 | .. autosummary:: 76 | :toctree: _generated/ 77 | 78 | zcollection.create_collection 79 | zcollection.create_view 80 | zcollection.open_collection 81 | zcollection.open_view 82 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/test_registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Test the registry module. 7 | ========================= 8 | """ 9 | from typing import Any 10 | 11 | import pytest 12 | 13 | from .. 
import registry 14 | 15 | 16 | def test_get_codecs() -> None: 17 | """Test the get_codecs function.""" 18 | with pytest.raises(ValueError): 19 | registry.get_codecs({'ID': 'foo'}) 20 | 21 | with pytest.raises(ValueError): 22 | registry.get_codecs({'id': 'foo'}) 23 | 24 | 25 | class MyCodec: 26 | """A dummy codec.""" 27 | ID = 'foo' 28 | 29 | __slots__ = ('attribute', ) 30 | 31 | def __init__(self, attribute) -> None: 32 | self.attribute: Any = attribute 33 | 34 | def get_config(self) -> dict: 35 | """Returns the configuration of the codec.""" 36 | return {'id': self.ID, 'attribute': self.attribute} 37 | 38 | @classmethod 39 | def from_config(cls, config: dict) -> 'MyCodec': 40 | """Creates an instance from the given configuration.""" 41 | return cls(config['attribute']) 42 | 43 | 44 | def test_register_codec() -> None: 45 | """Test the register_codec function.""" 46 | registry.register_codec(MyCodec, codec_id='foo') # type: ignore[arg-type] 47 | 48 | instance = MyCodec(12) 49 | 50 | other = registry.get_codecs(instance.get_config()) 51 | assert other.attribute == instance.attribute # type: ignore 52 | assert isinstance(other, MyCodec) 53 | 54 | with pytest.raises(ValueError): 55 | registry.register_codec( 56 | MyCodec, # type: ignore[arg-type] 57 | codec_id='foo') 58 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Make test data. 7 | =============== 8 | """ 9 | import numpy 10 | import xarray 11 | 12 | START_DATE = numpy.datetime64('2000-01-01', 'ns') 13 | END_DATE = numpy.datetime64('2000-06-30', 'ns') 14 | DELTA = numpy.timedelta64(72, 'h') 15 | 16 | 17 | def create_test_sequence(repeatability, number_of_measures, 18 | number_of_cycles) -> xarray.Dataset: 19 | """Creation of a data set for testing purposes.""" 20 | pass_number = numpy.hstack([ 21 | numpy.tile(ix + 1, number_of_measures) for j in range(number_of_cycles) 22 | for ix in range(repeatability) 23 | ]) 24 | cycle_number = numpy.hstack([ 25 | numpy.tile(ix + 1, repeatability * number_of_measures) 26 | for ix in range(number_of_cycles) 27 | ]) 28 | delta = numpy.timedelta64(24 // repeatability // 2, 'h') 29 | time: numpy.ndarray = numpy.arange(START_DATE, 30 | START_DATE + len(cycle_number) * delta, 31 | delta) 32 | observation = numpy.random.rand(cycle_number.size) # type: ignore 33 | return xarray.Dataset({ 34 | 'time': 35 | xarray.DataArray( 36 | time, 37 | dims=('num_lines', ), 38 | ), 39 | 'cycle_number': 40 | xarray.DataArray( 41 | cycle_number, 42 | dims=('num_lines', ), 43 | ), 44 | 'pass_number': 45 | xarray.DataArray( 46 | pass_number, 47 | dims=('num_lines', ), 48 | ), 49 | 'observation': 50 | xarray.DataArray( 51 | observation, 52 | dims=('num_lines', ), 53 | ), 54 | }) 55 | -------------------------------------------------------------------------------- /zcollection/expression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """ 6 | Handles the partition selection expressions 7 | =========================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, ClassVar 12 | import ast 13 | import dataclasses 14 | import types 15 | 16 | 17 | @dataclasses.dataclass 18 | class Expression: 19 | """Partitioning expressions. 20 | 21 | Args: 22 | expression: The expression to be evaluated 23 | Raises: 24 | NameError: If a variable is not defined. 25 | Example: 26 | >>> expr = Expression("year==2000 and month==1 and day in range(1, 12)") 27 | """ 28 | #: Compiled expression to be evaluated 29 | code: types.CodeType 30 | 31 | #: Known data members 32 | __slots__: tuple[str, ...] = ('code', ) 33 | 34 | #: The builtins that are allowed in the expression. 35 | BUILTINS: ClassVar[dict[str, Any]] = {'range': range} 36 | 37 | def __init__(self, expression: str) -> None: 38 | self.code = compile(ast.parse(expression, mode='eval'), ' ', 'eval') 39 | 40 | def __call__(self, variables: dict[str, Any]) -> Any: 41 | try: 42 | __locals: dict[str, Any] = { 43 | name: variables[name] 44 | for name in self.code.co_names if name not in self.BUILTINS 45 | } 46 | # pylint: disable=eval-used 47 | # The eval function is used here to evaluate a simple expression. 48 | # The only builtin functions allowed is the range function. 49 | return eval(self.code, {'__builtins__': self.BUILTINS}, __locals) 50 | # pylint: enable=eval-used 51 | except KeyError as err: 52 | raise NameError(f'name {err!s} is not defined') from err 53 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-case-conflict 6 | - id: check-docstring-first 7 | - id: check-json 8 | - id: check-toml 9 | - id: debug-statements 10 | - id: end-of-file-fixer 11 | - id: double-quote-string-fixer 12 | - id: mixed-line-ending 13 | args: [--fix=lf] 14 | exclude: docs/make.bat 15 | - id: trailing-whitespace 16 | exclude: conda/meta.yaml 17 | - repo: https://github.com/asottile/pyupgrade 18 | rev: "v3.15.0" 19 | hooks: 20 | - id: pyupgrade 21 | args: [--py38-plus] 22 | - repo: https://github.com/PyCQA/isort 23 | rev: 5.13.2 24 | hooks: 25 | - id: isort 26 | - repo: https://github.com/PyCQA/flake8 27 | rev: 7.0.0 28 | hooks: 29 | - id: flake8 30 | exclude: tests 31 | - repo: https://github.com/pre-commit/pygrep-hooks 32 | rev: "v1.10.0" 33 | hooks: 34 | - id: python-check-blanket-noqa 35 | - id: python-no-log-warn 36 | - id: rst-backticks 37 | - id: rst-directive-colons 38 | - id: rst-inline-touching-normal 39 | - repo: https://github.com/pre-commit/mirrors-yapf 40 | rev: v0.32.0 41 | hooks: 42 | - id: yapf 43 | additional_dependencies: 44 | - toml 45 | - repo: https://github.com/myint/docformatter 46 | rev: "v1.7.5" 47 | hooks: 48 | - id: docformatter 49 | - repo: https://github.com/codespell-project/codespell 50 | rev: "v2.2.6" 51 | hooks: 52 | - id: codespell 53 | - repo: https://github.com/pre-commit/mirrors-mypy 54 | rev: v1.8.0 55 | hooks: 56 | - id: mypy 57 | exclude: docs 58 | additional_dependencies: 59 | # Type stubs 60 | - types-requests 61 | - types-setuptools 62 | # Typed libraries 63 | - dask 64 | - numpy 65 | - pandas 66 | - pyarrow 67 | -------------------------------------------------------------------------------- /zcollection/partitioning/registry.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Registers the partitioning codecs. 7 | ================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any 12 | 13 | from . import abc 14 | 15 | #: A registry of all available partitioning codecs. 16 | CODEC_REGISTRY: dict[str, type[abc.Partitioning]] = {} 17 | 18 | 19 | def get_codecs(config: dict[str, Any]) -> abc.Partitioning: 20 | """Get the partitioning scheme for the given configuration. 21 | 22 | Args: 23 | config: A dictionary of the partitioning configuration parameters. 24 | 25 | Returns: 26 | The partitioning scheme. 27 | 28 | Raises: 29 | ValueError: If the requested codec is not defined. 30 | """ 31 | codec_id: Any | None = config.pop('id', None) 32 | if codec_id is None: 33 | raise ValueError(f'codec not available: {codec_id!r}') 34 | cls: type[abc.Partitioning] | None = CODEC_REGISTRY.get(codec_id, None) 35 | if cls is None: 36 | raise ValueError(f'codec not available: {codec_id!r}') 37 | return cls.from_config(config) 38 | 39 | 40 | def register_codec(cls: type[abc.Partitioning], 41 | *, 42 | codec_id: str | None = None) -> None: 43 | """Register a partitioning scheme. 44 | 45 | Args: 46 | cls: The partitioning scheme class. 47 | codec_id: The partitioning scheme identifier. 48 | 49 | Raises: 50 | ValueError: If the codec identifier is already registered. 51 | """ 52 | if codec_id is None: 53 | codec_id = cls.ID 54 | if codec_id is None: 55 | raise ValueError('codec identifier not defined') 56 | if codec_id in CODEC_REGISTRY: 57 | raise ValueError(f'codec already registered: {codec_id!r}') 58 | CODEC_REGISTRY[codec_id] = cls 59 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ZCollection 2 | =========== 3 | 4 | This project is a Python library allowing manipulating data partitioned into a 5 | **collection** of `Zarr `_ groups. 6 | 7 | This collection allows dividing a dataset into several partitions to facilitate 8 | acquisitions or updates made from new products. Possible data partitioning is: 9 | by **date** (hour, day, month, etc.) or by **sequence**. 10 | 11 | A collection partitioned by date, with a monthly resolution, may look like on 12 | the disk: 13 | 14 | .. code-block:: text 15 | 16 | collection/ 17 | ├── year=2022 18 | │ ├── month=01/ 19 | │ │ ├── time/ 20 | │ │ │ ├── 0.0 21 | │ │ │ ├── .zarray 22 | │ │ │ └── .zattrs 23 | │ │ ├── var1/ 24 | │ │ │ ├── 0.0 25 | │ │ │ ├── .zarray 26 | │ │ │ └── .zattrs 27 | │ │ ├── .zattrs 28 | │ │ ├── .zgroup 29 | │ │ └── .zmetadata 30 | │ └── month=02/ 31 | │ ├── time/ 32 | │ │ ├── 0.0 33 | │ │ ├── .zarray 34 | │ │ └── .zattrs 35 | │ ├── var1/ 36 | │ │ ├── 0.0 37 | │ │ ├── .zarray 38 | │ │ └── .zattrs 39 | │ ├── .zattrs 40 | │ ├── .zgroup 41 | │ └── .zmetadata 42 | └── .zcollection 43 | 44 | Partition updates can be set to overwrite existing data with new ones or to 45 | update them using different **strategies**. 46 | 47 | The `Dask library `_ handles the data to scale the treatments 48 | quickly. 49 | 50 | It is possible to create views on a reference collection, to add and modify 51 | variables contained in a reference collection, accessible in reading only. 
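Creating and reopening such a view only requires the reference collection and a
storage path. A minimal sketch using the convenience functions (the paths below
are placeholders):

.. code-block:: python

    import zcollection

    # Reference the read-only collection the view is built upon.
    view_ref = zcollection.ViewReference(
        partition_base_dir="/data/mycollection")

    # Create the view, then reopen it later to add or modify variables.
    view = zcollection.create_view("/home/user/myview", view_ref)
    view = zcollection.open_view("/home/user/myview")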
52 | 53 | This library can store data on POSIX, S3, or any other file system supported by 54 | the Python library `fsspec 55 | `_. Note, however, only POSIX 56 | and S3 file systems have been tested. 57 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | author = CNES/CLS 3 | author_email = fbriol@gmail.com 4 | classifiers = 5 | Development Status :: 5 - Production/Stable 6 | Intended Audience :: Science/Research 7 | Natural Language :: English 8 | Operating System :: MacOS 9 | Operating System :: Microsoft :: Windows 10 | Operating System :: POSIX 11 | Programming Language :: Python :: 3.8 12 | Programming Language :: Python :: 3.9 13 | Programming Language :: Python :: 3.10 14 | Programming Language :: Python :: 3.11 15 | Programming Language :: Python :: 3.12 16 | Topic :: Scientific/Engineering :: Physics 17 | description = Zarr Collection 18 | keywords = zarr, collection, xarray, dask 19 | license = BSD License 20 | license_files = LICENSE 21 | long_description = file: README.rst 22 | long_description_content_type = text/x-rst 23 | name = zcollection 24 | url = https://github.com/CNES/zcollection 25 | version = attr: zcollection.version.__version__ 26 | 27 | [options] 28 | include_package_data = True 29 | install_requires = 30 | dask >= 2022.8.0 31 | distributed 32 | fasteners 33 | fsspec 34 | numcodecs 35 | numpy>=1.20 36 | pandas 37 | xarray 38 | zarr>=2.11 39 | package_dir = 40 | = . 41 | packages = find: 42 | python_requires = >=3.8 43 | zip_safe = False 44 | 45 | [options.extras_require] 46 | test = 47 | pytest 48 | pytest-cov 49 | 50 | [options.package_data] 51 | * = *.json 52 | 53 | [flake8] 54 | exclude = docs,tests 55 | max-line-length = 80 56 | ignore = 57 | # Assigning lambda expression 58 | E731 59 | # Ambiguous variable names 60 | E741 61 | # line break before binary operator 62 | W503 63 | # line break after binary operator 64 | W504 65 | # whitespace before : 66 | E203 67 | 68 | [isort] 69 | combine_as_imports=True 70 | force_grid_wrap=0 71 | force_sort_within_sections=True 72 | force_to_top=typing 73 | include_trailing_comma=True 74 | line_length=80 75 | multi_line_output=3 76 | skip= 77 | build 78 | docs/source/conf.py 79 | 80 | [mypy] 81 | ignore_missing_imports=True 82 | exclude=tests 83 | -------------------------------------------------------------------------------- /zcollection/variable/tests/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Create test variables. 7 | ====================== 8 | """ 9 | import numpy 10 | import zarr 11 | 12 | from .. import Array, DelayedArray 13 | from ... import meta 14 | 15 | 16 | def array(name='var1', fill_value=0) -> Array: 17 | """Creates a test variable with the given name, fill value, dimensions, and 18 | attributes. 19 | 20 | Args: 21 | name: The name of the variable. 22 | fill_value: The fill value for uninitialized parts of the array. 23 | 24 | Returns: 25 | An Array object. 
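    Example:
        Hypothetical use from a test (attribute access shown for illustration
        only):

        >>> var = array(name='var2', fill_value=255)
        >>> var.name
        'var2'
        >>> var.dimensions
        ('x', 'y')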
26 | """ 27 | return Array(name=name, 28 | data=numpy.arange(10, dtype='int64').reshape(5, 2), 29 | dimensions=('x', 'y'), 30 | attrs=(meta.Attribute(name='attr', value=1), ), 31 | compressor=zarr.Blosc(cname='zstd', clevel=1), 32 | fill_value=fill_value, 33 | filters=(zarr.Delta('int64', 34 | 'int32'), zarr.Delta('int32', 'int32'))) 35 | 36 | 37 | def delayed_array(name='var1', fill_value=0) -> DelayedArray: 38 | """Create a delayed test variable with the given name, fill value, 39 | dimensions, and attributes. 40 | 41 | Args: 42 | name: The name of the variable. 43 | fill_value: The fill value for uninitialized parts of the array. 44 | 45 | Returns: 46 | A DelayedArray object representing a lazily-evaluated test variable. 47 | """ 48 | return DelayedArray(name=name, 49 | data=numpy.arange(10, dtype='int64').reshape(5, 2), 50 | dimensions=('x', 'y'), 51 | attrs=(meta.Attribute(name='attr', value=1), ), 52 | compressor=zarr.Blosc(cname='zstd', clevel=1), 53 | fill_value=fill_value, 54 | filters=(zarr.Delta('int64', 'int32'), 55 | zarr.Delta('int32', 'int32'))) 56 | -------------------------------------------------------------------------------- /zcollection/variable/tests/test_delayed_array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing variables 7 | ================= 8 | """ 9 | from typing import Any 10 | 11 | import dask.array.core 12 | import dask.array.ma 13 | import numpy 14 | import pytest 15 | 16 | # pylint: disable=unused-import # Need to import for fixtures 17 | from ...tests.cluster import dask_client, dask_cluster 18 | from ..delayed_array import _as_dask_array 19 | 20 | # pylint enable=unused-import 21 | 22 | 23 | def test_as_dask_array( 24 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 25 | ) -> None: 26 | """Test converting array like to a dask array.""" 27 | dask_array: dask.array.core.Array 28 | fill_value: Any 29 | np_array: numpy.ndarray 30 | 31 | np_array = numpy.arange(10) 32 | dask_array, fill_value = _as_dask_array(np_array) 33 | assert isinstance(dask_array, dask.array.core.Array) 34 | assert fill_value is None 35 | 36 | np_array = numpy.ma.masked_equal(np_array, 5) 37 | dask_array, fill_value = _as_dask_array(np_array) 38 | assert isinstance(dask_array, dask.array.core.Array) 39 | assert fill_value == 5 40 | 41 | dask_array, fill_value = _as_dask_array( 42 | dask.array.ma.masked_equal(np_array, 5)) 43 | assert isinstance(dask_array, dask.array.core.Array) 44 | assert fill_value == 5 45 | 46 | with pytest.raises(ValueError): 47 | _as_dask_array(numpy.ma.masked_equal(np_array, 5), fill_value=6) 48 | 49 | with pytest.raises(ValueError): 50 | _as_dask_array(numpy.ma.masked_equal( 51 | numpy.arange(numpy.datetime64(0, 'Y'), 52 | numpy.datetime64(10, 'Y'), 53 | dtype='M8[Y]'), numpy.datetime64(5, 'Y')), 54 | fill_value=numpy.datetime64('NaT')) 55 | 56 | dask_array, fill_value = _as_dask_array(numpy.ma.masked_equal( 57 | numpy.arange(numpy.datetime64(0, 'Y'), 58 | numpy.datetime64(10, 'Y'), 59 | dtype='M8[Y]'), numpy.datetime64('NaT')), 60 | fill_value=numpy.datetime64('NaT')) 61 | assert isinstance(dask_array, dask.array.core.Array) 62 | -------------------------------------------------------------------------------- /zcollection/sync.py: -------------------------------------------------------------------------------- 1 | # 
Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Synchronization of concurrent accesses 7 | ====================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Callable 12 | import abc 13 | import threading 14 | 15 | import fasteners 16 | 17 | 18 | class Sync(abc.ABC): # pragma: no cover 19 | """Interface of the classes handling the synchronization of concurrent 20 | accesses.""" 21 | 22 | @abc.abstractmethod 23 | def __enter__(self) -> bool: 24 | ... 25 | 26 | @abc.abstractmethod 27 | def __exit__(self, exc_type, exc_value, traceback) -> None: 28 | ... 29 | 30 | @abc.abstractmethod 31 | def is_locked(self) -> bool: 32 | """Returns True if the lock is acquired, False otherwise.""" 33 | 34 | 35 | class NoSync(Sync): 36 | """This class is used when the user does not want to synchronize accesses 37 | to the collection, in other words, when there is no concurrency.""" 38 | 39 | def __enter__(self) -> bool: 40 | return True 41 | 42 | def __exit__(self, exc_type, exc_value, traceback) -> None: 43 | """As this class does not perform any synchronization, this method has 44 | nothing to do.""" 45 | 46 | def is_locked(self) -> bool: 47 | """As this class does not perform any synchronization, this method 48 | always returns False.""" 49 | return False 50 | 51 | 52 | class ProcessSync(Sync): 53 | """This class is used when the user wants to synchronize accesses to the 54 | collection, in other words, when there is concurrency.""" 55 | 56 | def __init__(self, path: str) -> None: 57 | self.lock = fasteners.InterProcessLock(path) 58 | 59 | def __enter__(self) -> bool: 60 | return self.lock.acquire() 61 | 62 | def __exit__(self, exc_type, exc_value, traceback) -> None: 63 | try: 64 | self.lock.release() 65 | except threading.ThreadError: 66 | pass 67 | 68 | def __reduce__(self) -> tuple[Callable, tuple[str]]: 69 | return (ProcessSync, (str(self.lock.path), )) 70 | 71 | def is_locked(self) -> bool: 72 | """Returns True if the lock is acquired, False otherwise.""" 73 | return self.lock.exists() 74 | -------------------------------------------------------------------------------- /zcollection/variable/tests/test_array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """ 6 | Testing variables 7 | ================= 8 | """ 9 | from typing import Any 10 | 11 | import dask.array.core 12 | import dask.array.ma 13 | import numpy 14 | import pytest 15 | 16 | # pylint: disable=unused-import # Need to import for fixtures 17 | from ...tests.cluster import dask_client, dask_cluster 18 | # pylint enable=unused-import 19 | from ..array import _as_numpy_array 20 | 21 | 22 | def test_as_numpy_array(dask_client) -> None: 23 | """Test converting array like to a dask array.""" 24 | array: numpy.ndarray 25 | fill_value: Any 26 | np_array: numpy.ndarray 27 | 28 | np_array = numpy.arange(10) 29 | array, fill_value = _as_numpy_array(np_array) 30 | assert isinstance(array, numpy.ndarray) 31 | assert not isinstance(array, numpy.ma.MaskedArray) 32 | assert fill_value is None 33 | 34 | np_array = numpy.ma.masked_equal(np_array, 5) 35 | array, fill_value = _as_numpy_array(np_array) 36 | assert isinstance(array, numpy.ndarray) 37 | assert not isinstance(array, numpy.ma.MaskedArray) 38 | assert fill_value == 5 39 | 40 | array, fill_value = _as_numpy_array(dask.array.ma.masked_equal( 41 | np_array, 5)) 42 | assert isinstance(array, numpy.ndarray) 43 | assert not isinstance(array, numpy.ma.MaskedArray) 44 | assert fill_value == 5 45 | 46 | with pytest.raises(ValueError): 47 | _as_numpy_array(numpy.ma.masked_equal(np_array, 5), fill_value=6) 48 | 49 | with pytest.raises(ValueError): 50 | _as_numpy_array(numpy.ma.masked_equal( 51 | numpy.arange(numpy.datetime64(0, 'Y'), 52 | numpy.datetime64(10, 'Y'), 53 | dtype='M8[Y]'), numpy.datetime64(5, 'Y')), 54 | fill_value=numpy.datetime64('NaT')) 55 | 56 | array, fill_value = _as_numpy_array(numpy.ma.masked_equal( 57 | numpy.arange(numpy.datetime64(0, 'Y'), 58 | numpy.datetime64(10, 'Y'), 59 | dtype='M8[Y]'), numpy.datetime64('NaT')), 60 | fill_value=numpy.datetime64('NaT')) 61 | assert isinstance(array, numpy.ndarray) 62 | assert not isinstance(array, numpy.ma.MaskedArray) 63 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ZCollection 2 | =========== 3 | 4 | This project is a Python library manipulating data split into a 5 | :py:class:`collection ` of groups stored in 6 | `Zarr format `_. 7 | 8 | This collection allows dividing a dataset into several partitions to facilitate 9 | acquisitions or updates made from new products. Possible data partitioning is: 10 | by :py:class:`date ` (hour, day, month, 11 | etc.) or by :py:class:`sequence `. 12 | 13 | A collection partitioned by date, with a monthly resolution, may look like on 14 | the disk: :: 15 | 16 | collection/ 17 | ├── year=2022 18 | │ ├── month=01/ 19 | │ │ ├── time/ 20 | │ │ │ ├── 0.0 21 | │ │ │ ├── .zarray 22 | │ │ │ └── .zattrs 23 | │ │ ├── var1/ 24 | │ │ │ ├── 0.0 25 | │ │ │ ├── .zarray 26 | │ │ │ └── .zattrs 27 | │ │ ├── .zattrs 28 | │ │ ├── .zgroup 29 | │ │ └── .zmetadata 30 | │ └── month=02/ 31 | │ ├── time/ 32 | │ │ ├── 0.0 33 | │ │ ├── .zarray 34 | │ │ └── .zattrs 35 | │ ├── var1/ 36 | │ │ ├── 0.0 37 | │ │ ├── .zarray 38 | │ │ └── .zattrs 39 | │ ├── .zattrs 40 | │ ├── .zgroup 41 | │ └── .zmetadata 42 | └── .zcollection 43 | 44 | Partition updates can be set to overwrite existing data with new ones or to 45 | update them using different :py:mod:`strategies `. 46 | 47 | The `Dask library `_ handles the data to scale the treatments 48 | quickly. 
49 | 50 | It is possible to create views on a reference collection, to add and modify 51 | variables contained in a reference collection, accessible in reading only. 52 | 53 | This library can store data on POSIX, S3, or any other file system supported by 54 | the Python library `fsspec 55 | `_. Note, however, only POSIX 56 | and S3 file systems have been tested. 57 | 58 | .. toctree:: 59 | :maxdepth: 2 60 | :caption: Contents: 61 | 62 | install 63 | auto_examples/index.rst 64 | api 65 | release 66 | 67 | Indices and tables 68 | ================== 69 | 70 | * :ref:`genindex` 71 | * :ref:`modindex` 72 | * :ref:`search` 73 | -------------------------------------------------------------------------------- /zcollection/convenience/view.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Convenience functions 7 | ===================== 8 | """ 9 | from __future__ import annotations 10 | 11 | import fsspec 12 | 13 | from .. import collection, fs_utils, sync, view 14 | 15 | 16 | def create_view( 17 | path: str, 18 | view_ref: view.ViewReference, 19 | *, 20 | filesystem: fsspec.AbstractFileSystem | str | None = None, 21 | filters: collection.PartitionFilter = None, 22 | synchronizer: sync.Sync | None = None, 23 | ) -> view.View: 24 | """Create a new view. 25 | 26 | Args: 27 | path: View storage directory. 28 | view_ref: Access properties for the reference view. 29 | filesystem: The file system used to access the view. 30 | filters: The filters used to select the partitions of the reference 31 | view. If not provided, all partitions are selected. 32 | synchronizer: The synchronizer used to synchronize the view. 33 | 34 | Example: 35 | >>> view_ref = ViewReference( 36 | ... partition_base_dir="/data/mycollection") 37 | >>> view = create_view("/home/user/myview", view_ref) 38 | 39 | Returns: 40 | The created view. 41 | 42 | Raises: 43 | ValueError: If the path already exists. 44 | """ 45 | filesystem = fs_utils.get_fs(filesystem) 46 | if filesystem.exists(path): 47 | raise ValueError(f'path {path!r} already exists.') 48 | return view.View(path, 49 | view_ref, 50 | ds=None, 51 | filesystem=filesystem, 52 | filters=filters, 53 | synchronizer=synchronizer) 54 | 55 | 56 | def open_view( 57 | path: str, 58 | *, 59 | filesystem: fsspec.AbstractFileSystem | str | None = None, 60 | synchronizer: sync.Sync | None = None, 61 | ) -> view.View: 62 | """Open an existing view. 63 | 64 | Args: 65 | path: View storage directory. 66 | filesystem: The file system used to access the view. 67 | synchronizer: The synchronizer used to synchronize the view. 68 | 69 | Returns: 70 | The opened view. 71 | 72 | Example: 73 | >>> view = open_view("/home/user/myview") 74 | """ 75 | return view.View.from_config(path, 76 | filesystem=filesystem, 77 | synchronizer=synchronizer) 78 | -------------------------------------------------------------------------------- /zcollection/collection/callable_objects.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Callable objects. 
7 | ================= 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Callable, Protocol, Sequence 12 | 13 | from .. import dataset 14 | from ..type_hints import ArrayLike 15 | 16 | #: Function type to load and call a callback function of type 17 | #: :class:`PartitionCallable`. 18 | WrappedPartitionCallable = Callable[[Sequence[str]], None] 19 | 20 | 21 | #: pylint: disable=too-few-public-methods 22 | class PartitionCallable(Protocol): 23 | """Protocol for partition callables. 24 | 25 | A partition callable is a function that accepts a dataset and 26 | returns a result. 27 | """ 28 | 29 | @property 30 | def __name__(self) -> str: 31 | """Name of the callable.""" 32 | # pylint: disable=unnecessary-ellipsis 33 | # Make checker happy. 34 | ... 35 | # pylint: enable=unnecessary-ellipsis 36 | 37 | def __call__(self, zds: dataset.Dataset, *args, **kwargs) -> Any: 38 | """Call the partition function. 39 | 40 | Args: 41 | zds: Dataset to partition. 42 | *args: Positional arguments. 43 | **kwargs: Keyword arguments. 44 | 45 | Returns: 46 | Result of the partition function. 47 | """ 48 | 49 | 50 | #: Alias for :class:`PartitionCallable`. 51 | MapCallable = PartitionCallable 52 | 53 | 54 | class UpdateCallable(Protocol): 55 | """Protocol for update callables. 56 | 57 | A callable update is a function that accepts a data set and returns 58 | a dictionary of arrays to update. 59 | """ 60 | 61 | @property 62 | def __name__(self) -> str: 63 | """Name of the callable.""" 64 | # pylint: disable=unnecessary-ellipsis 65 | # Make checker happy. 66 | ... 67 | # pylint: enable=unnecessary-ellipsis 68 | 69 | def __call__(self, zds: dataset.Dataset, *args, 70 | **kwargs) -> dict[str, ArrayLike]: 71 | """Call the update function. 72 | 73 | Args: 74 | zds: Dataset to update. 75 | *args: Positional arguments. 76 | **kwargs: Keyword arguments. 77 | 78 | Returns: 79 | Dictionary of arrays to update. 80 | """ 81 | # pylint: disable=unnecessary-ellipsis 82 | # Mandatory to make Pylance happy. 83 | ... 84 | # pylint: enable=unnecessary-ellipsis 85 | -------------------------------------------------------------------------------- /zcollection/tests/fs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixture for testing the file system. 7 | ==================================== 8 | """ 9 | from typing import Any, Iterator 10 | import pathlib 11 | import tempfile 12 | 13 | import fsspec 14 | import fsspec.implementations.memory 15 | import pytest 16 | 17 | try: 18 | # pylint: disable=unused-import # Need to import for fixtures 19 | from .s3 import S3, s3, s3_base # type: ignore 20 | 21 | # pylint: disable=unused-import 22 | S3_IMPORT_EXCEPTION = None 23 | except ImportError as err: 24 | S3_IMPORT_EXCEPTION = str(err) 25 | 26 | 27 | def tempdir(tmpdir) -> pathlib.Path: 28 | """Create a temporary directory.""" 29 | return pathlib.Path(tempfile.mkdtemp(dir=str(tmpdir))) 30 | 31 | 32 | class Local: 33 | """Local files system.""" 34 | 35 | def __init__(self, tmpdir, protocol) -> None: 36 | #: The filesystem. 37 | self.fs: fsspec.AbstractFileSystem = fsspec.filesystem(protocol) 38 | #: The root directory. 39 | self.root = tempdir(pathlib.Path(tmpdir)) 40 | #: The collection directory. 41 | self.collection: pathlib.Path = self.root.joinpath('collection') 42 | #: The view directory. 
43 | self.view: pathlib.Path = self.root.joinpath('view') 44 | 45 | def __getattr__(self, name) -> Any: 46 | return getattr(self.fs, name) 47 | 48 | 49 | @pytest.fixture 50 | def local_fs(tmpdir, pytestconfig) -> Iterator[Local]: 51 | """Local filesystem.""" 52 | protocol: str = 'memory' if pytestconfig.getoption('memory') else 'file' 53 | instance = Local(tmpdir, protocol) 54 | yield instance 55 | try: 56 | # For the memory protocol we delete the written data to free the 57 | # memory. 58 | if isinstance(instance.fs, 59 | fsspec.implementations.memory.MemoryFileSystem): 60 | instance.fs.rm(str(instance.root), recursive=True) 61 | except FileNotFoundError: 62 | pass 63 | 64 | 65 | # pylint: disable=redefined-outer-name,function-redefined 66 | if S3_IMPORT_EXCEPTION is None: 67 | 68 | @pytest.fixture 69 | def s3_fs(s3) -> S3: # type: ignore[arg-type] 70 | """S3 filesystem.""" 71 | return S3(s3) # type: ignore 72 | else: 73 | 74 | @pytest.fixture 75 | def s3() -> None: 76 | """S3 filesystem.""" 77 | 78 | @pytest.fixture 79 | def s3_base() -> None: 80 | """S3 filesystem.""" 81 | 82 | @pytest.fixture 83 | def s3_fs(pytestconfig) -> None: 84 | """S3 filesystem.""" 85 | if pytestconfig.getoption('s3'): 86 | pytest.fail(f'Unable to test S3: {S3_IMPORT_EXCEPTION}') 87 | else: 88 | pytest.skip('S3 is disabled') 89 | 90 | 91 | # pylint: enable=redefined-outer-name,function-redefined 92 | -------------------------------------------------------------------------------- /zcollection/representation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Representation of dataset objects. 7 | ================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Iterable, Iterator, Sequence 12 | 13 | from .meta import Attribute 14 | 15 | 16 | def dimensions(dims: dict[str, int]) -> str: 17 | """Returns a string representation of the dimensions. 18 | 19 | Args: 20 | dims: A dictionary containing the dimensions. 21 | 22 | Returns: 23 | A string representation of the dimensions in the form of a tuple, where 24 | each element of the tuple is a string containing the dimension name and 25 | its corresponding value. 26 | """ 27 | return str(tuple(f'{name}: {value}' for name, value in dims.items())) 28 | 29 | 30 | def _maybe_truncate(obj: Any, max_size: int) -> str: 31 | """Truncate the string representation of an object to the given length. 32 | 33 | Args: 34 | obj: An object. 35 | max_size: The maximum length of the string representation. 36 | 37 | Returns: 38 | The string representation of the object, truncated to the given length 39 | if necessary. 40 | """ 41 | result = str(obj) 42 | if len(result) > max_size: 43 | return result[:max_size - 3] + '...' 44 | return result 45 | 46 | 47 | def pretty_print(obj: Any, num_characters: int = 120) -> str: 48 | """Return a pretty printed string representation of the given object. 49 | 50 | Args: 51 | obj: 52 | An object to be pretty printed. 53 | num_characters: 54 | An integer representing the maximum number of 55 | characters per line. 56 | 57 | Returns: 58 | A string representation of the object, pretty printed with a maximum of 59 | `num_characters` characters per line. 
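Example (an illustrative doctest added as a sketch of the truncation and padding behaviour): >>> pretty_print('a' * 130, num_characters=10) 'aaaaaaa...' >>> len(pretty_print('abc', num_characters=10)) 10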
60 | """ 61 | result: str = _maybe_truncate(obj, num_characters) 62 | return result + ' ' * max(num_characters - len(result), 0) 63 | 64 | 65 | def calculate_column_width(items: Iterable) -> int: 66 | """Calculate the maximum width of a column. 67 | 68 | Args: 69 | items: An iterable of items. 70 | 71 | Returns: 72 | The maximum width of a column. 73 | """ 74 | max_name: int = max(len(str(name)) for name in items) 75 | return max(max_name, 7) 76 | 77 | 78 | def attributes(attrs: Sequence[Attribute]) -> Iterator[str]: 79 | """Get the string representation of the attributes. 80 | 81 | Args: 82 | attrs: The attributes. 83 | 84 | Returns: 85 | The string representation of the attributes. 86 | """ 87 | width: int = calculate_column_width(item.name for item in attrs) 88 | for attr in attrs: 89 | name_str: str = f' {attr.name:<{width}s}' 90 | yield pretty_print(f'{name_str}: {attr.value!r}') 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # Dask worker space 142 | dask-worker-space/ 143 | 144 | # Autogenerated documentation files 145 | docs/source/_generated 146 | docs/source/auto_examples/ 147 | 148 | # Generated version file 149 | zcollection/version.py 150 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Required dependencies 5 | --------------------- 6 | 7 | - Python (3.8 or later) 8 | - setuptools 9 | - `dask `_ 10 | - `distributed `_ 11 | - `fsspec `_ 12 | - `numcodecs `_ 13 | - `numpy `_ 14 | - `pyarrow `_ 15 | - `xarray `_ 16 | - `zarr `_ 17 | 18 | .. note:: 19 | 20 | `pyarrow` is optional, but required if you want to use the indexing API. 21 | 22 | Instructions 23 | ------------ 24 | 25 | Installation via conda 26 | ###################### 27 | 28 | The easiest way to install the library is to use conda. To install the 29 | package from the conda-forge channel, run:: 30 | 31 | $ conda install -c conda-forge zcollection 32 | 33 | Installation via conda and sources 34 | ################################## 35 | 36 | It is possible to install the latest version from source. First, install the 37 | dependencies using conda:: 38 | 39 | $ conda install dask distributed fsspec numcodecs numpy pandas pyarrow xarray zarr 40 | 41 | Then, clone the repository:: 42 | 43 | $ git clone git@github.com:CNES/zcollection.git 44 | $ cd zcollection 45 | 46 | Finally, install the library using pip (it is possible to check out a different 47 | branch before installing):: 48 | 49 | $ pip install . 50 | 51 | Installation via pip 52 | #################### 53 | 54 | $ pip install zcollection 55 | 56 | Testing 57 | ------- 58 | 59 | To run the test suite after installing the library, install (via PyPI or 60 | conda) `pytest `__ and run ``pytest`` in the root 61 | directory of the cloned repository. 62 | 63 | The unit test process can be modified using options implemented for this 64 | project, in addition to the options provided by ``pytest``. The available user 65 | options are: 66 | 67 | - **s3**: Enable tests on the local S3 server driven by minio. (default: False) 68 | - **memory**: Use a file system in memory instead of the local file system. 69 | (default: False) 70 | - **threads_per_worker**: Number of threads for each Dask worker. 71 | (default: the number of logical cores of the target platform). 72 | - **n_workers**: Number of Dask workers. 73 | (default: the number of cores of the target platform). 74 | 75 | To run the tests using a local S3 server, driven by the ``minio`` software, 76 | it's necessary to install the following optional requirements: 77 | 78 | - `s3fs `_ 79 | - `requests `_ 80 | 81 | You will need to install the ``minio`` program.
You can find more information 82 | on this web `page `_. 83 | 84 | Documentation 85 | ------------- 86 | 87 | The documentation use sphinx and Google-style docstrings. To build the 88 | documentation, run ``make html`` in the ``docs`` directory. 89 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline }} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :show-inheritance: 7 | {% block methods %} 8 | 9 | {%- set attr = [] -%} 10 | {%- set meth = [] -%} 11 | {%- set private = [] -%} 12 | {%- set protected = [] -%} 13 | {%- set special = [] -%} 14 | {%- set inherited_meth = [] -%} 15 | {%- set skip = ['__abstractmethods__', 16 | '__annotations__', 17 | '__dict__', 18 | '__doc__', 19 | '__entries__', 20 | '__hash__', 21 | '__init__', 22 | '__members__', 23 | '__module__', 24 | '__slots__', 25 | '__slotnames__', 26 | '__weakref__'] -%} 27 | 28 | {%- for item in methods if not item in skip -%} 29 | {%- if item in inherited_members -%} 30 | {{ inherited_meth.append(item) or "" }} 31 | {%- else -%} 32 | {{ meth.append(item) or "" }} 33 | {%- endif -%} 34 | {%- endfor -%} 35 | 36 | {%- for item in members 37 | if not item in inherited_members and not item in skip -%} 38 | {%- if item.startswith('__') and item.endswith('__') -%} 39 | {{ special.append(item) or "" }} 40 | {%- elif item.startswith('__') -%} 41 | {{ private.append(item) or "" }} 42 | {%- elif item.startswith('_') -%} 43 | {{ protected.append(item) or "" }} 44 | {%- endif -%} 45 | {%- endfor %} 46 | 47 | {%- if attributes %} 48 | .. rubric:: {{ _('Attributes') }} 49 | .. autosummary:: 50 | :toctree: 51 | {% for item in attributes %} 52 | ~{{ name }}.{{ item }} 53 | {%- endfor %} 54 | {% endif -%} 55 | 56 | {%- if meth %} 57 | .. rubric:: {{ _('Public Methods') }} 58 | .. autosummary:: 59 | :toctree: 60 | {% for item in meth %} 61 | ~{{ name }}.{{ item }} 62 | {%- endfor %} 63 | {% endif -%} 64 | 65 | {%- if protected %} 66 | .. rubric:: {{ _('Protected Methods') }} 67 | .. autosummary:: 68 | :toctree: 69 | {% for item in protected %} 70 | ~{{ name }}.{{ item }} 71 | {%- endfor %} 72 | {% endif -%} 73 | 74 | {%- if private %} 75 | .. rubric:: {{ _('Private Methods') }} 76 | .. autosummary:: 77 | :toctree: 78 | {% for item in private %} 79 | ~{{ name }}.{{ item }} 80 | {%- endfor %} 81 | {% endif -%} 82 | 83 | {%- if special %} 84 | 85 | .. rubric:: {{ _('Special Methods') }} 86 | .. autosummary:: 87 | :toctree: 88 | {% for item in special %} 89 | ~{{ name }}.{{ item }} 90 | {%- endfor %} 91 | {%- endif -%} 92 | 93 | {%- if inherited_meth %} 94 | 95 | .. rubric:: {{ _('Inherited Methods') }} 96 | .. autosummary:: 97 | :toctree: 98 | {% for item in inherited_meth %} 99 | ~{{ name }}.{{ item }} 100 | {%- endfor %} 101 | {%- endif -%} 102 | 103 | {%- endblock -%} 104 | -------------------------------------------------------------------------------- /zcollection/convenience/collection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Convenience functions 7 | ===================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Literal 12 | 13 | import xarray 14 | 15 | from .. 
import collection, dataset, fs_utils, partitioning 16 | 17 | 18 | def create_collection( 19 | axis: str, 20 | ds: xarray.Dataset | dataset.Dataset, 21 | partition_handler: partitioning.Partitioning, 22 | partition_base_dir: str, 23 | **kwargs, 24 | ) -> collection.Collection: 25 | """Create a collection. 26 | 27 | Args: 28 | axis: The axis to use for the collection. 29 | ds: The dataset to use. 30 | partition_handler: The partitioning handler to use. 31 | partition_base_dir: The base directory to use for the partitions. 32 | **kwargs: Additional parameters are passed through to the constructor 33 | of the class :py:class:`Collection`. 34 | 35 | Example: 36 | >>> import xarray as xr 37 | >>> import zcollection 38 | >>> data = xr.Dataset({ 39 | ... "a": xr.DataArray([1, 2, 3]), 40 | ... "b": xr.DataArray([4, 5, 6]) 41 | ... }) 42 | >>> collection = zcollection.create_collection( 43 | ... "a", data, 44 | ... zcollection.partitioning.Sequence(("a", )), 45 | ... "/tmp/my_collection") 46 | 47 | Returns: 48 | The collection. 49 | 50 | Raises: 51 | ValueError: If the base directory already exists. 52 | """ 53 | filesystem = fs_utils.get_fs(kwargs.pop('filesystem', None)) 54 | if filesystem.exists(partition_base_dir): 55 | raise ValueError( 56 | f'The directory {partition_base_dir!r} already exists.') 57 | if isinstance(ds, xarray.Dataset): 58 | ds = dataset.Dataset.from_xarray(ds) 59 | return collection.Collection(axis, 60 | ds.metadata(), 61 | partition_handler, 62 | partition_base_dir, 63 | mode='w', 64 | filesystem=filesystem, 65 | **kwargs) 66 | 67 | 68 | # pylint: disable=redefined-builtin 69 | def open_collection(path: str, 70 | *, 71 | mode: Literal['r', 'w'] | None = None, 72 | **kwargs) -> collection.Collection: 73 | """Open a collection. 74 | 75 | Args: 76 | path: The path to the collection. 77 | mode: The mode to open the collection. 78 | **kwargs: Additional parameters are passed through the method 79 | :py:meth:`zcollection.collection.Collection.from_config`. 80 | Returns: 81 | The collection. 82 | 83 | Example: 84 | >>> import zcollection 85 | >>> collection = zcollection.open_collection( 86 | ... "/tmp/mycollection", mode="r") 87 | """ 88 | return collection.Collection.from_config(path, mode=mode, **kwargs) 89 | # pylint: enable=redefined-builtin 90 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: master 8 | 9 | jobs: 10 | linux: 11 | name: ${{ matrix.python-version }}-posix 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 15 14 | strategy: 15 | fail-fast: false 16 | max-parallel: 5 17 | matrix: 18 | python-version: ['3.8', '3.9', '3.10', '3.11'] 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Setup Miniconda 24 | uses: mamba-org/setup-micromamba@v1 25 | with: 26 | cache-downloads: true 27 | condarc: | 28 | channels: 29 | - conda-forge 30 | create-args: | 31 | python=${{ matrix.python-version }} 32 | environment-name: ZCollection 33 | environment-file: conda/environment.yml 34 | 35 | - name: Run Tests 36 | shell: bash -l {0} 37 | run: | 38 | python -m setuptools_scm 39 | if [[ ! 
-e zcollection/version.py ]]; then 40 | echo "__version__ = '$(git describe --tags --always)'" > zcollection/version.py 41 | fi 42 | pytest -v -ra --processes 43 | 44 | s3-fs: 45 | runs-on: ubuntu-latest 46 | timeout-minutes: 15 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v3 50 | 51 | - name: Install MinIO 52 | run: | 53 | mkdir -p /opt/minio/bin 54 | wget -nv -P /opt/minio/bin \ 55 | https://dl.min.io/server/minio/release/linux-amd64/minio 56 | chmod +x /opt/minio/bin/minio 57 | 58 | - name: Add MinIO To System Path 59 | run: | 60 | echo /opt/minio/bin >> $GITHUB_PATH 61 | 62 | - name: Setup Miniconda 63 | uses: mamba-org/setup-micromamba@v1 64 | with: 65 | cache-downloads: true 66 | condarc: | 67 | channels: 68 | - conda-forge 69 | create-args: | 70 | python=3.9 71 | environment-name: ZCollection 72 | environment-file: conda/environment.yml 73 | 74 | - name: Run Tests 75 | shell: bash -l {0} 76 | run: | 77 | python -m setuptools_scm 78 | python -m setuptools_scm 79 | if [[ ! -e zcollection/version.py ]]; then 80 | echo "__version__ = '$(git describe --tags --always)'" > zcollection/version.py 81 | fi 82 | pytest -v -ra --s3 --processes 83 | 84 | win: 85 | name: win 86 | runs-on: windows-2019 87 | timeout-minutes: 15 88 | 89 | steps: 90 | - name: Checkout 91 | uses: actions/checkout@v3 92 | 93 | - name: Setup Miniconda 94 | uses: mamba-org/setup-micromamba@v1 95 | with: 96 | cache-downloads: true 97 | condarc: | 98 | channels: 99 | - conda-forge 100 | create-args: | 101 | python=3.9 102 | environment-name: ZCollection 103 | environment-file: conda/environment.yml 104 | 105 | - name: Run Tests 106 | shell: bash -l {0} 107 | run: | 108 | python -m setuptools_scm 109 | if [[ ! -e zcollection/version.py ]]; then 110 | echo "__version__ = '$(git describe --tags --always)'" > zcollection/version.py 111 | fi 112 | pytest -v -ra --processes 113 | -------------------------------------------------------------------------------- /zcollection/dask_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Dask utilities 7 | ============== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Callable, Iterator, Sequence 12 | import itertools 13 | import uuid 14 | 15 | from dask.delayed import Delayed as dask_Delayed 16 | import dask.distributed 17 | import dask.highlevelgraph 18 | 19 | 20 | def dask_workers(client: dask.distributed.Client, 21 | cores_only: bool = False) -> int: 22 | """Return the number of dask workers available. 23 | 24 | Args: 25 | client: dask client 26 | cores_only: if True, only the number of cores is returned, 27 | otherwise the total number of threads is returned. 28 | 29 | Returns: 30 | number of dask workers 31 | 32 | Raises: 33 | ValueError: If no dask workers are available. 34 | """ 35 | result: int = len( 36 | client.ncores()) if cores_only else sum( # type: ignore[arg-type] 37 | item 38 | for item in client.nthreads().values()) # type: ignore[arg-type] 39 | if result == 0: 40 | raise RuntimeError('No dask workers available') 41 | return result 42 | 43 | 44 | def get_client() -> dask.distributed.Client: 45 | """Return the default dask client. 
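If no Dask client is already running, a local in-process client is created as a fallback (see the ``except`` branch below). Example (an illustrative sketch; whether an existing cluster is reused depends on the Dask runtime): >>> client = get_client() >>> client.close()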
46 | 47 | Returns: 48 | default dask client 49 | """ 50 | try: 51 | return dask.distributed.get_client() 52 | except ValueError: 53 | return dask.distributed.Client( 54 | processes=False, 55 | direct_to_workers=True, 56 | ) 57 | 58 | 59 | def split_sequence(sequence: Sequence[Any], 60 | sections: int | None = None) -> Iterator[Sequence[Any]]: 61 | """Split a sequence into sections. 62 | 63 | Args: 64 | sequence: The sequence to split. 65 | sections: The number of sections to split the sequence into. Default 66 | divides the sequence into n sections of one element. 67 | 68 | Returns: 69 | Iterator of sequences. 70 | """ 71 | sections = len(sequence) if sections is None else sections 72 | if sections <= 0: 73 | raise ValueError('The number of sections must be greater than zero.') 74 | length: int = len(sequence) 75 | sections = min(sections, length) 76 | 77 | size: int 78 | extras: int 79 | size, extras = divmod(length, sections) 80 | 81 | div = tuple( 82 | itertools.accumulate([0] + extras * [size + 1] + 83 | (sections - extras) * [size])) 84 | yield from (sequence[item:div[ix + 1]] for ix, item in enumerate(div[:-1])) 85 | 86 | 87 | def simple_delayed(name: str, func: Callable) -> dask_Delayed: 88 | """Create a simple delayed function. 89 | 90 | Args: 91 | name: name of the function 92 | func: function to be delayed 93 | 94 | Returns: 95 | delayed function 96 | """ 97 | name = f'{name}-{str(uuid.uuid4())}' 98 | return dask_Delayed( 99 | name, 100 | dask.highlevelgraph.HighLevelGraph({name: { 101 | name: func 102 | }}, {name: set()}), 103 | None, 104 | ) 105 | -------------------------------------------------------------------------------- /zcollection/tests/test_expression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Tests of the expression evaluation 7 | ================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | import timeit 12 | 13 | import numpy 14 | import pytest 15 | import xarray 16 | 17 | from .. 
import dataset 18 | from ..expression import Expression 19 | from ..partitioning import Date 20 | # pylint: disable=unused-import # Need to import for fixtures 21 | from .cluster import dask_client, dask_cluster 22 | 23 | # pylint enable=unused-import 24 | 25 | 26 | def make_dataset(num_samples: int | None = None) -> dataset.Dataset: 27 | """Creation of a data set for testing purposes.""" 28 | dates = numpy.arange(numpy.datetime64('2000-01-01', 'ns'), 29 | numpy.datetime64('2009-12-31', 'ns'), 30 | numpy.timedelta64(1, 'h')).astype('datetime64[ns]') 31 | if num_samples is not None: 32 | dates = dates[:num_samples + 1] 33 | observation = numpy.random.rand(dates.size) # type: ignore 34 | return dataset.Dataset.from_xarray( 35 | xarray.Dataset({ 36 | 'dates': 37 | xarray.DataArray(dates, dims=('num_lines', )), 38 | 'observation': 39 | xarray.DataArray(observation, dims=('num_lines', )) 40 | })) 41 | 42 | 43 | def test_expression() -> None: 44 | """Test of the creation of expressions.""" 45 | expr = Expression('a == b') 46 | assert expr({'a': 1, 'b': 1}) 47 | assert not expr({'a': 1, 'b': 2}) 48 | 49 | with pytest.raises(SyntaxError): 50 | Expression('a==') 51 | 52 | with pytest.raises(NameError): 53 | assert expr({'a': 1, 'c': 1}) 54 | 55 | 56 | def test_date_expression( 57 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 58 | ) -> None: 59 | """Test of expressions handling dates..""" 60 | zds = make_dataset(5 * 24) 61 | partitioning = Date(('dates', ), 'D') 62 | 63 | for partition, _ in partitioning.split_dataset(zds, 'num_lines'): 64 | variables = dict(partitioning.parse('/'.join(partition))) 65 | expr = Expression('year==2000') 66 | assert expr(variables) 67 | expr = Expression('year==2000 and month==1') 68 | assert expr(variables) 69 | expr = Expression('year==2000 and month==1 and day in range(1, 12)') 70 | assert expr(variables) 71 | 72 | 73 | def test_bench_expression( 74 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 75 | ) -> None: 76 | """Benchmark of expressions.""" 77 | partitioning = Date(('dates', ), 'D') 78 | zds = make_dataset() 79 | expr = Expression('year==2000 and month==1 and day in range(1, 12)') 80 | times = [] 81 | number = 100 82 | for partition, _ in partitioning.split_dataset(zds, 'num_lines'): 83 | variables = dict(partitioning.parse('/'.join(partition))) 84 | times.append( 85 | timeit.timeit('expr(variables)', 86 | globals={ 87 | 'expr': expr, 88 | 'variables': variables 89 | }, 90 | number=number)) 91 | 92 | assert sum(times) / (len(times) * number) < 1e-5 93 | -------------------------------------------------------------------------------- /zcollection/tests/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixtures for testing Dask clusters using the pytest. 
7 | ==================================================== 8 | """ 9 | from typing import Iterator 10 | import contextlib 11 | import logging 12 | import weakref 13 | 14 | import dask.config 15 | import dask.distributed 16 | import py 17 | import pytest 18 | 19 | 20 | @pytest.fixture() 21 | def dask_cluster( 22 | pytestconfig, 23 | tmpdir_factory, 24 | scope='session', # pylint: disable=unused-argument 25 | ) -> str: 26 | """Launch a Dask LocalCluster with a configurable number of workers.""" 27 | n_workers: int | None 28 | threads_per_worker: int | None 29 | processes: bool 30 | 31 | try: 32 | n_workers = int(pytestconfig.getoption('n_workers')) 33 | except TypeError: 34 | n_workers = None 35 | 36 | try: 37 | threads_per_worker = int(pytestconfig.getoption('threads_per_worker')) 38 | except TypeError: 39 | threads_per_worker = None 40 | 41 | try: 42 | processes = int(pytestconfig.getoption('processes')) == 1 43 | except TypeError: 44 | processes = False 45 | 46 | tmpdir: py.path.local = tmpdir_factory.getbasetemp() 47 | scheduler_file: py.path.local = tmpdir / 'scheduler.json' 48 | if scheduler_file.exists(): 49 | return str(scheduler_file) 50 | 51 | # Use the root path of the test session for the dask worker space 52 | dask_worker: py.path.local = tmpdir / 'dask_worker_space' 53 | dask.config.set(temporary_directory=str(dask_worker)) 54 | 55 | logging.info('Dask local cluster starting') 56 | cluster = dask.distributed.LocalCluster( 57 | protocol='tcp://', 58 | n_workers=n_workers, 59 | threads_per_worker=threads_per_worker, 60 | processes=processes, 61 | ) 62 | 63 | def teardown() -> None: 64 | """Stop the cluster and remove the scheduler file.""" 65 | if scheduler_file.exists(): 66 | scheduler_file.remove() 67 | 68 | weakref.finalize(cluster, teardown) 69 | 70 | # Make sure we can connect to the cluster. 71 | with dask.distributed.Client(cluster) as client: 72 | client.write_scheduler_file(scheduler_file) 73 | client.wait_for_workers(1) 74 | 75 | logging.info('Dask local cluster started') 76 | return str(scheduler_file) 77 | 78 | 79 | @contextlib.contextmanager 80 | def _scheduler_file( 81 | dask_cluster, # pylint: disable=redefined-outer-name 82 | ) -> Iterator[str]: 83 | """Get the scheduler used to connect to the cluster.""" 84 | yield dask_cluster 85 | 86 | 87 | @pytest.fixture() 88 | def dask_client( 89 | dask_cluster, # pylint: disable=redefined-outer-name 90 | ) -> Iterator[dask.distributed.Client]: 91 | """Connect a Dask client to the cluster.""" 92 | with _scheduler_file(dask_cluster) as scheduler_file: 93 | try: 94 | with dask.distributed.Client( 95 | scheduler_file=scheduler_file) as client: 96 | yield client 97 | except RuntimeError: 98 | # Ignore while the client is being stopped. 99 | pass 100 | -------------------------------------------------------------------------------- /zcollection/type_hints.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Type hints for the zcollection package. 7 | ======================================= 8 | 9 | .. rubric:: Type aliases 10 | 11 | .. py:data:: DType 12 | :canonical: DType 13 | 14 | Type of a numpy array. 15 | 16 | .. py:data:: DTypeLike 17 | :canonical: DTypeLike 18 | 19 | Type of a numpy array or a string. 20 | 21 | .. py:data:: NDArray 22 | :canonical: NDArray 23 | 24 | Type of a numpy array. 25 | 26 | .. 
py:data:: NDMaskedArray 27 | 28 | Type of a numpy array with a mask. 29 | """ 30 | from __future__ import annotations 31 | 32 | from typing import TYPE_CHECKING, Any, Protocol, TypeVar 33 | 34 | try: 35 | from types import GenericAlias # type: ignore[attr-defined] 36 | except ImportError: 37 | # pylint: disable=ungrouped-imports 38 | # For Python < 3.9 we use a backport of GenericAlias provided by 39 | # numpy 40 | # isort: off 41 | from numpy._typing._generic_alias import ( # type: ignore[no-redef] 42 | _GenericAlias as GenericAlias, ) 43 | # isort: on 44 | # pylint: enable=ungrouped-imports 45 | 46 | try: 47 | from typing_extensions import TypeAlias 48 | except ImportError: 49 | # pylint: disable=ungrouped-imports 50 | # TypeAlias is defined in typing starting from 3.10 51 | from typing import TypeAlias # type: ignore[attr-defined,no-redef] 52 | # pylint: enable=ungrouped-imports 53 | 54 | import numpy 55 | import numpy.typing 56 | 57 | # pylint: disable=invalid-name 58 | _DType_co = TypeVar('_DType_co', covariant=True, bound='numpy.dtype[Any]') 59 | _ScalarType_co = TypeVar('_ScalarType_co', bound=numpy.generic, covariant=True) 60 | # pylint: enable=invalid-name 61 | 62 | if TYPE_CHECKING: 63 | DType = numpy.dtype[_ScalarType_co] 64 | NDMaskedArray = numpy.ma.MaskedArray[Any, DType] # pragma: no cover 65 | else: 66 | DType = GenericAlias(numpy.dtype, (_ScalarType_co, )) 67 | NDMaskedArray = GenericAlias(numpy.ma.MaskedArray, (Any, DType)) 68 | 69 | NDArray: TypeAlias = numpy.typing.NDArray # pragma: no cover 70 | DTypeLike: TypeAlias = numpy.typing.DTypeLike # pragma: no cover 71 | 72 | 73 | class ArrayLike(Protocol[_DType_co]): 74 | """Protocol for array-like objects.""" 75 | 76 | def __array__(self) -> NDArray: 77 | ... 78 | 79 | @property 80 | def dtype(self) -> DType: 81 | """The data type of the array.""" 82 | # pylint: disable=unnecessary-ellipsis 83 | # Make checker happy. 84 | ... 85 | # pylint: enable=unnecessary-ellipsis 86 | 87 | @property 88 | def shape(self) -> tuple[int, ...]: 89 | """The shape of the array.""" 90 | # pylint: disable=unnecessary-ellipsis 91 | # Make checker happy. 92 | ... 93 | # pylint: enable=unnecessary-ellipsis 94 | 95 | @property 96 | def size(self) -> int: 97 | """The size of the array.""" 98 | # pylint: disable=unnecessary-ellipsis 99 | # Make checker happy. 100 | ... 101 | # pylint: enable=unnecessary-ellipsis 102 | 103 | def astype(self, dtype: DTypeLike) -> ArrayLike[_DType_co]: 104 | """Convert the array to a given type.""" 105 | # pylint: disable=unnecessary-ellipsis 106 | # Make checker happy. 107 | ... 108 | # pylint: enable=unnecessary-ellipsis 109 | -------------------------------------------------------------------------------- /zcollection/merging/tests/test_merging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Test merging. 7 | ============= 8 | """ 9 | import numpy 10 | import pytest 11 | import zarr 12 | 13 | from .. import _update_fs, merge_time_series, perform 14 | from ... 
import sync 15 | from ...tests import data 16 | # pylint: disable=unused-import # Need to import for fixtures 17 | from ...tests.cluster import dask_client, dask_cluster 18 | from ...tests.fixture import dask_arrays, numpy_arrays 19 | from ...tests.fs import local_fs 20 | 21 | # pylint: enable=unused-import 22 | 23 | 24 | class MyError(RuntimeError): 25 | """Custom error.""" 26 | 27 | 28 | class ThrowError(sync.Sync): 29 | """Throw an error when merging.""" 30 | 31 | def __enter__(self) -> bool: 32 | raise MyError('This is an error') 33 | 34 | def __exit__(self, exc_type, exc_value, traceback) -> None: 35 | ... 36 | 37 | def is_locked(self) -> bool: 38 | return False 39 | 40 | 41 | def test_update_fs( 42 | dask_client, # pylint: disable=redefined-outer-name 43 | local_fs, # pylint: disable=redefined-outer-name 44 | ) -> None: 45 | """Test the _update_fs function.""" 46 | generator = data.create_test_dataset(delayed=False) 47 | zds = next(generator) 48 | 49 | partition_folder = local_fs.root.joinpath('variable=1') 50 | 51 | zattrs = str(partition_folder.joinpath('.zattrs')) 52 | future = dask_client.submit(_update_fs, str(partition_folder), 53 | dask_client.scatter(zds), local_fs.fs) 54 | dask_client.gather(future) 55 | assert local_fs.exists(zattrs) 56 | 57 | local_fs.fs.rm(str(partition_folder), recursive=True) 58 | assert not local_fs.exists(zattrs) 59 | seen_exception = False 60 | try: 61 | future = dask_client.submit(_update_fs, 62 | str(partition_folder), 63 | dask_client.scatter(zds), 64 | local_fs.fs, 65 | synchronizer=ThrowError()) 66 | dask_client.gather(future) 67 | except MyError: 68 | seen_exception = True 69 | assert seen_exception 70 | assert not local_fs.exists(zattrs) 71 | 72 | 73 | @pytest.mark.parametrize('arrays_type', ['dask_arrays', 'numpy_arrays']) 74 | def test_perform( 75 | dask_client, # pylint: disable=redefined-outer-name 76 | local_fs, # pylint: disable=redefined-outer-name 77 | arrays_type, 78 | request, 79 | ) -> None: 80 | """Test the perform function.""" 81 | delayed = request.getfixturevalue(arrays_type) 82 | generator = data.create_test_dataset(delayed=delayed) 83 | zds = next(generator) 84 | 85 | path = str(local_fs.root.joinpath('variable=1')) 86 | 87 | future = dask_client.submit(_update_fs, path, dask_client.scatter(zds), 88 | local_fs.fs) 89 | dask_client.gather(future) 90 | 91 | future = dask_client.submit(perform, 92 | dask_client.scatter(zds), 93 | path, 94 | 'time', 95 | local_fs.fs, 96 | 'time', 97 | delayed=delayed, 98 | merge_callable=merge_time_series) 99 | dask_client.gather(future) 100 | 101 | zgroup = zarr.open_consolidated(local_fs.get_mapper(path)) 102 | assert numpy.all(zgroup['time'][...] == zds['time'].values) 103 | assert numpy.all(zgroup['var1'][...] == zds['var1'].values) 104 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import importlib.metadata 14 | import pathlib 15 | import sys 16 | 17 | HERE = pathlib.Path(__file__).absolute().parent 18 | 19 | # Insert the project root dir as the first element in the PYTHONPATH. 20 | sys.path.insert(0, str(HERE.parent.parent)) 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = 'zcollection' 25 | copyright = '(2022, CNES/CLS)' 26 | author = 'CLS' 27 | 28 | # The full version, including alpha/beta/rc tags 29 | try: 30 | release = importlib.metadata.version(project) 31 | except importlib.metadata.PackageNotFoundError: 32 | release = '0.0.0' 33 | version = '.'.join(release.split('.')[:2]) 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named "sphinx.ext.*") or your custom 39 | # ones. 40 | extensions = [ 41 | 'sphinx_inline_tabs', 42 | 'sphinx_gallery.gen_gallery', 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.intersphinx', 46 | 'sphinx.ext.mathjax', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.viewcode', 49 | ] 50 | 51 | autosummary_generate = True 52 | 53 | autodoc_typehints = 'description' 54 | autodoc_type_aliases = dict( 55 | ArrayLike='ArrayLike', 56 | DType='DType', 57 | DTypeLike='DTypeLike', 58 | Indexer='Indexer', 59 | NDArray='NDArray', 60 | NDMaskedArray='NDMaskedArray', 61 | PartitionCallback='PartitionCallback', 62 | QueryDict='QueryDict', 63 | Scalar='Scalar', 64 | ) 65 | 66 | numpydoc_class_members_toctree = True 67 | numpydoc_show_class_members = False 68 | 69 | # Add any paths that contain templates here, relative to this directory. 70 | templates_path = ['_templates'] 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This pattern also affects html_static_path and html_extra_path. 75 | exclude_patterns = [] 76 | 77 | # -- Options for HTML output ------------------------------------------------- 78 | 79 | # The theme to use for HTML and HTML Help pages. See the documentation for 80 | # a list of builtin themes. 81 | # 82 | html_theme = 'furo' 83 | html_title = 'ZCollection' 84 | 85 | # Add any paths that contain custom static files (such as style sheets) here, 86 | # relative to this directory. They are copied after the builtin static files, 87 | # so a file named "default.css" will overwrite the builtin "default.css". 
88 | html_static_path = ['_static'] 89 | 90 | intersphinx_mapping = { 91 | 'dask': ('https://docs.dask.org/en/latest/', None), 92 | 'fsspec': ('https://filesystem-spec.readthedocs.io/en/latest/', None), 93 | 'numpy': ('https://numpy.org/doc/stable/', None), 94 | 'python': ('https://docs.python.org/3/', None), 95 | 'xarray': ('https://docs.xarray.dev/en/stable/', None), 96 | 'zarr': ('https://zarr.readthedocs.io/en/stable', None), 97 | } 98 | 99 | # -- Extension configuration ------------------------------------------------- 100 | sphinx_gallery_conf = { 101 | 'examples_dirs': [HERE.parent.parent.joinpath('examples')], 102 | 'filename_pattern': r'[\\\/]ex_', 103 | 'pypandoc': False, 104 | } 105 | -------------------------------------------------------------------------------- /docs/source/release.rst: -------------------------------------------------------------------------------- 1 | Release notes 2 | ============= 3 | 4 | 2024.2.0 5 | -------- 6 | * Refactor merging module and improve temporary directory handling. 7 | * Bug fix: Add cache invalidation for updated partitions. 8 | 9 | 2024.1.0 10 | -------- 11 | * Bug fix: Partitions must not be sorted lexicographically. 12 | 13 | 2023.11.2 14 | --------- 15 | * Returns a list of added/deleted partitions. 16 | 17 | 2023.11.1 18 | --------- 19 | * Skip filesystem scans: Queries utilizing an indexer reuse known partitions for 20 | efficiency. 21 | 22 | 2023.11.0 23 | --------- 24 | * Evaluate an expression from a dataset. 25 | * Fix linter warning. 26 | * Validate partitions. 27 | 28 | 2023.10.0 29 | --------- 30 | * Merge time series with data gaps. 31 | * Fix Linux fork startup blocking test. 32 | * Correction of issues related to CI executions. 33 | * Added an option to specify the list of variables used by the callback 34 | function for updating a partition. 35 | * Classmethods removed from indexers. 36 | 37 | 2023.5.0 38 | -------- 39 | * Add missing copyrights. 40 | * Modularise code to reduce the number of lines per module. 41 | * Writing variables is limited to the worker being used. 42 | * Improve test coverage. 43 | * #9: Read the version attribute directly from the ``version.py`` module. 44 | * #8: Incomplete overlaps with more than one worker. 45 | * #7: Fix bug in the update method: if the user has selected multiple 46 | partitions, the selected variables must contain the updated variables. 47 | * #6: The parameter name for specifying the number of concurrent inserts is 48 | incorrect. 49 | * #3: Add a trim argument to the ``update`` method, like Dask's 50 | ``map_overlap``. 51 | * Update the documentation. 52 | * Refactor the code. 53 | * Loading data using Dask or Numpy. 54 | * Variable adds attributes to partitions. 55 | 56 | 2023.3.2 57 | -------- 58 | * Writing a partition with many variables is slow. 59 | * Writing metadata only in the collection's configuration. 60 | * Adding an inter-process lock. 61 | * If a variable has been modified since its initialization, the library throws a 62 | specific exception to warn the user. 63 | 64 | 2023.3.1 65 | -------- 66 | * Fixed a compatibility issue with fsspec 2023.3.0. 67 | 68 | 2023.3.0 69 | -------- 70 | * Apply an optional mask before querying an indexer. 71 | 72 | 2023.2.0 73 | -------- 74 | * Synchronize the view with the reference collection. 75 | * Support for Python 3.11. 76 | * Bug fixes. 77 | * Optimization of the insertion of new partitions. 78 | * Copy collection over different file systems. 79 | * Export Dataset to Zarr group.
80 | 81 | 2022.12.0/2022.12.1 82 | ------------------- 83 | 84 | Release on December 2, 2022 85 | 86 | * Write immutable variables of a dataset into a single group. 87 | * Possibility to update partitions using neighbor partitions (useful for 88 | filtering, for example). 89 | * Refactor methods overlapping partitions. 90 | * Update documentation. 91 | 92 | 2022.10.2/2022.10.1 93 | ------------------- 94 | 95 | Release on October 13, 2022 96 | 97 | * Add compatibility with Python 3.8. 98 | 99 | 2022.10.0 100 | --------- 101 | 102 | Release on October 7, 2022 103 | 104 | * Added an option to the method ``drop_partitions`` to drop partitions 105 | older than a specified time delta relative to the current time. 106 | 107 | 2022.8.0 108 | -------- 109 | 110 | Release on August 14, 2022 111 | 112 | * Support Python starting with 3.9. 113 | * Refactor convenience functions. 114 | * Refactor dataset & variables modules. 115 | * The indexer can return only the partition keys. 116 | * Optimization of dataset handling. 117 | * Bug fixes. 118 | 119 | 0.2 / 2020-04-04 120 | ---------------- 121 | 122 | Release on April 4, 2020 123 | 124 | * Installation from PyPi. 125 | * Unsigned integers are not handled. 126 | 127 | 0.1 / 2022-08-30 128 | ----------------- 129 | 130 | Release on March 30, 2020 131 | 132 | * First public version. 133 | -------------------------------------------------------------------------------- /zcollection/tests/test_dask_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing utilities 7 | ================= 8 | """ 9 | import dask.distributed 10 | import pytest 11 | 12 | from ..
import dask_utils 13 | # pylint: disable=unused-import # Need to import for fixtures 14 | from .cluster import dask_client, dask_cluster 15 | 16 | # pylint: disable=unused-import 17 | 18 | 19 | @pytest.mark.filterwarnings('ignore:Port \\d+ is already in use.*') 20 | def test_get_client_with_no_cluster(): 21 | """Test the get_client function with no cluster.""" 22 | with dask_utils.get_client() as client: 23 | assert isinstance(client, dask.distributed.Client) 24 | 25 | 26 | def test_get_client_with_cluster( 27 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 28 | ): 29 | """Test the get_client function with a cluster.""" 30 | with dask_utils.get_client() as client: 31 | assert isinstance(client, dask.distributed.Client) 32 | 33 | 34 | def test_dask_workers( 35 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 36 | ): 37 | """Test the dask_workers function.""" 38 | assert dask_utils.dask_workers(dask_client, cores_only=True) == len( 39 | dask_client.ncores()) # type: ignore 40 | assert dask_utils.dask_workers(dask_client, cores_only=False) == sum( 41 | item for item in dask_client.nthreads().values()) # type: ignore 42 | 43 | 44 | def test_split_sequence(): 45 | """Test the split_sequence function.""" 46 | assert list(dask_utils.split_sequence(list(range(10)), 2)) == [ 47 | [0, 1, 2, 3, 4], 48 | [5, 6, 7, 8, 9], 49 | ] 50 | assert list(dask_utils.split_sequence(list(range(10)), 3)) == [ 51 | [0, 1, 2, 3], 52 | [4, 5, 6], 53 | [7, 8, 9], 54 | ] 55 | assert list(dask_utils.split_sequence(list(range(10)), 4)) == [ 56 | [0, 1, 2], 57 | [3, 4, 5], 58 | [6, 7], 59 | [8, 9], 60 | ] 61 | assert list(dask_utils.split_sequence(list(range(10)), 5)) == [ 62 | [0, 1], 63 | [2, 3], 64 | [4, 5], 65 | [6, 7], 66 | [8, 9], 67 | ] 68 | assert list(dask_utils.split_sequence(list(range(10)), 6)) == [ 69 | [0, 1], 70 | [2, 3], 71 | [4, 5], 72 | [6, 7], 73 | [8], 74 | [9], 75 | ] 76 | assert list(dask_utils.split_sequence(list(range(10)), 7)) == [ 77 | [0, 1], 78 | [2, 3], 79 | [4, 5], 80 | [6], 81 | [7], 82 | [8], 83 | [9], 84 | ] 85 | assert list(dask_utils.split_sequence(list(range(10)), 8)) == [ 86 | [0, 1], 87 | [2, 3], 88 | [4], 89 | [5], 90 | [6], 91 | [7], 92 | [8], 93 | [9], 94 | ] 95 | assert list(dask_utils.split_sequence(list(range(10)), 9)) == [ 96 | [0, 1], 97 | [2], 98 | [3], 99 | [4], 100 | [5], 101 | [6], 102 | [7], 103 | [8], 104 | [9], 105 | ] 106 | assert list(dask_utils.split_sequence(list(range(10)), 10)) == [ 107 | [0], 108 | [1], 109 | [2], 110 | [3], 111 | [4], 112 | [5], 113 | [6], 114 | [7], 115 | [8], 116 | [9], 117 | ] 118 | assert list(dask_utils.split_sequence(list(range(10)), 11)) == [ 119 | [0], 120 | [1], 121 | [2], 122 | [3], 123 | [4], 124 | [5], 125 | [6], 126 | [7], 127 | [8], 128 | [9], 129 | ] 130 | assert list(dask_utils.split_sequence(list(range(10)))) == [ 131 | [0], 132 | [1], 133 | [2], 134 | [3], 135 | [4], 136 | [5], 137 | [6], 138 | [7], 139 | [8], 140 | [9], 141 | ] 142 | with pytest.raises(ValueError): 143 | list(dask_utils.split_sequence(list(range(10)), 0)) 144 | -------------------------------------------------------------------------------- /zcollection/tests/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 
5 | """ 6 | Make test datasets 7 | ================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Iterator 12 | import itertools 13 | 14 | import numpy 15 | import zarr 16 | 17 | from .. import collection, dataset, partitioning 18 | from ..type_hints import NDArray 19 | 20 | #: First date of the test dataset. 21 | START_DATE = numpy.datetime64('2000-01-01', 'ns') 22 | #: Last date of the test dataset. 23 | END_DATE = numpy.datetime64('2000-06-30', 'ns') 24 | #: Delta between two dates. 25 | DELTA = numpy.timedelta64(72, 'h') 26 | #: Fill value. 27 | FILL_VALUE = 2147483647 28 | 29 | 30 | def make_dataset(dates: numpy.ndarray, 31 | measures: numpy.ndarray, 32 | fill_value: float | None = None, 33 | filters: tuple | None = None, 34 | delayed: bool = True) -> dataset.Dataset: 35 | """Create a dataset.""" 36 | array_class = (dataset.DelayedArray if delayed else dataset.Array) 37 | return dataset.Dataset( 38 | attrs=(dataset.Attribute(name='attr', value=1), ), 39 | variables=( 40 | array_class( 41 | name='time', 42 | data=dates, 43 | dimensions=('num_lines', ), 44 | attrs=(dataset.Attribute(name='attr', value=1), ), 45 | compressor=zarr.Blosc(), 46 | ), 47 | array_class(name='var1', 48 | data=measures, 49 | dimensions=('num_lines', 'num_pixels'), 50 | attrs=(dataset.Attribute(name='attr', value=1), ), 51 | fill_value=fill_value, 52 | filters=filters), 53 | array_class(name='var2', 54 | data=measures, 55 | dimensions=('num_lines', 'num_pixels'), 56 | attrs=(dataset.Attribute(name='attr', value=1), ), 57 | fill_value=fill_value, 58 | filters=filters), 59 | ), 60 | ) 61 | 62 | 63 | def create_test_dataset(delayed: bool = True) -> Iterator[dataset.Dataset]: 64 | """Create a temporal dataset.""" 65 | 66 | dates: NDArray = numpy.arange(START_DATE, END_DATE, DELTA) 67 | indices: NDArray = numpy.arange(0, len(dates)) 68 | 69 | for item in numpy.array_split(dates, 12): 70 | mask: NDArray = (dates >= item[0]) & (dates <= item[-1]) 71 | measures = numpy.vstack((indices[mask], ) * 25).T 72 | 73 | yield make_dataset(item, measures, delayed=delayed) 74 | 75 | 76 | def create_test_dataset_with_fillvalue( 77 | delayed: bool = True) -> Iterator[dataset.Dataset]: 78 | """Create a dataset with a fixed scale offset filter and fill values.""" 79 | 80 | dates: NDArray = numpy.arange(START_DATE, END_DATE, DELTA) 81 | measures: NDArray = numpy.arange(0, len(dates), dtype=numpy.float64) 82 | measures[measures % 2 == 0] = FILL_VALUE 83 | measures = numpy.vstack((measures, ) * 25).T * 1e-4 84 | 85 | yield make_dataset(dates, 86 | measures, 87 | delayed=delayed, 88 | fill_value=FILL_VALUE * 1e-4, 89 | filters=(zarr.FixedScaleOffset(scale=10000, 90 | offset=0, 91 | dtype=' collection.Collection: 98 | """Create a collection.""" 99 | zds: dataset.Dataset = next( 100 | create_test_dataset_with_fillvalue( 101 | delayed=delayed) if with_fillvalue else create_test_dataset( 102 | delayed=delayed)) 103 | zcollection = collection.Collection('time', 104 | zds.metadata(), 105 | partitioning.Date(('time', ), 'D'), 106 | str(tested_fs.collection), 107 | filesystem=tested_fs.fs) 108 | zcollection.insert(zds) 109 | return zcollection 110 | 111 | 112 | #: List of filesystems and datasets to test. 
113 | FILE_SYSTEM_DATASET = list( 114 | itertools.product([ 115 | 'local_fs', 116 | 's3_fs', 117 | ], [ 118 | create_test_dataset, 119 | create_test_dataset_with_fillvalue, 120 | ])) 121 | -------------------------------------------------------------------------------- /zcollection/tests/s3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Fixtures for testing S3 using the pytest and minio. 7 | =================================================== 8 | """ 9 | from typing import Iterator, Literal 10 | import os 11 | import pathlib 12 | import shlex 13 | import subprocess 14 | import time 15 | 16 | import botocore.client 17 | import botocore.session 18 | import pytest 19 | import requests 20 | import s3fs 21 | 22 | #: Listen port 23 | PORT = 5555 24 | #: Listen address 25 | ENDPOINT: str = f'127.0.0.1:{PORT}' 26 | #: URI for minio 27 | ENDPOINT_URI: str = f'http://{ENDPOINT}' 28 | #: Credential for minio 29 | CREDENTIAL = '25219d58-f6c6-11eb-922c-770d49cd18e4' 30 | 31 | 32 | def have_minio() -> Literal[True]: 33 | """Check if minio is available.""" 34 | try: 35 | subprocess.check_output(['minio', '--version']) 36 | return True 37 | except: 38 | raise ImportError('minio: command not found') from None 39 | 40 | 41 | have_minio() 42 | 43 | 44 | def is_minio_up(timeout: float) -> bool: 45 | """Check if minio server is up.""" 46 | try: 47 | response = requests.get(ENDPOINT_URI, timeout=timeout) 48 | if response.status_code == 403: 49 | return True 50 | except: # pylint: disable=bare-except 51 | pass 52 | return False 53 | 54 | 55 | def wait_for_minio_to_start(timeout: float) -> None: 56 | """Wait for the minio server to start.""" 57 | while timeout > 0: 58 | try: 59 | response = requests.get(ENDPOINT_URI, timeout=1) 60 | if response.status_code == 403: 61 | return 62 | except: # pylint: disable=bare-except 63 | pass 64 | timeout -= 0.1 65 | time.sleep(0.1) 66 | raise RuntimeError("minio server didn't start") 67 | 68 | 69 | @pytest.fixture() 70 | def s3_base(tmpdir, pytestconfig) -> Iterator[None]: 71 | """Launch minio server.""" 72 | if pytestconfig.getoption('s3') is False: 73 | pytest.skip('S3 disabled') 74 | if is_minio_up(timeout=1): 75 | raise RuntimeError('minio server already up') 76 | os.environ['MINIO_CACHE_AFTER'] = '1' 77 | os.environ['MINIO_CACHE'] = 'on' 78 | os.environ['MINIO_ROOT_PASSWORD'] = CREDENTIAL 79 | os.environ['MINIO_ROOT_USER'] = CREDENTIAL 80 | # pylint: disable=consider-using-with 81 | process = subprocess.Popen( 82 | shlex.split(f'minio server --quiet --address {ENDPOINT} ' 83 | f"--console-address :{PORT+1} '{tmpdir!s}'")) 84 | 85 | try: 86 | wait_for_minio_to_start(timeout=30) 87 | yield 88 | finally: 89 | process.terminate() 90 | process.wait() 91 | # pylint: enable=consider-using-with 92 | 93 | 94 | def make_bucket(name) -> None: 95 | """Create a bucket.""" 96 | session: botocore.session.Session = botocore.session.get_session() 97 | client = session.create_client( 98 | 's3', 99 | aws_access_key_id=CREDENTIAL, 100 | aws_secret_access_key=CREDENTIAL, 101 | endpoint_url=ENDPOINT_URI, 102 | region_name='us-east-1', 103 | config=botocore.client.Config(signature_version='s3v4')) 104 | client.create_bucket(Bucket=name, ACL='public-read') 105 | 106 | 107 | # pylint: disable=redefined-outer-name, unused-argument # pytest fixture 108 | @pytest.fixture() 109 | def 
s3(s3_base) -> Iterator[s3fs.core.S3FileSystem]: 110 | """Create a S3 file system instance.""" 111 | s3fs.core.S3FileSystem.clear_instance_cache() 112 | fs = s3fs.core.S3FileSystem(anon=False, 113 | key=CREDENTIAL, 114 | secret=CREDENTIAL, 115 | client_kwargs={'endpoint_url': ENDPOINT_URI}) 116 | fs.invalidate_cache() 117 | yield fs 118 | # pylint: enable=redefined-outer-name, unused-argument 119 | 120 | 121 | class S3Path(type(pathlib.Path())): # type: ignore[misc] 122 | """Handle S3 path on multiple platforms.""" 123 | 124 | def __str__(self) -> str: 125 | return super().__str__().replace('\\', '/') 126 | 127 | 128 | class S3: 129 | """S3 filesystem.""" 130 | #: Bucket ID 131 | ID = 0 132 | 133 | # pylint: disable=redefined-outer-name # pytest fixture 134 | def __init__(self, s3: s3fs.core.S3FileSystem) -> None: 135 | name: str = f'bucket{S3.ID}' 136 | S3.ID += 1 137 | make_bucket(name) 138 | self.collection: S3Path = S3Path(name).joinpath('collection') 139 | self.view: S3Path = S3Path(name).joinpath('view') 140 | self.fs: s3fs.core.S3FileSystem = s3 141 | 142 | # pylint: enable=redefined-outer-name 143 | -------------------------------------------------------------------------------- /zcollection/fs_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | File system tools 7 | ================= 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Iterator, Sequence 12 | import os 13 | 14 | import fsspec 15 | 16 | #: Path separator 17 | SEPARATOR = '/' 18 | 19 | 20 | def join_path(*args: str) -> str: 21 | """Join path elements.""" 22 | return SEPARATOR.join(args) 23 | 24 | 25 | def normalize_path(fs: fsspec.AbstractFileSystem, path: str) -> str: 26 | """Normalize the path. 27 | 28 | Args: 29 | fs: file system object 30 | path: path to test 31 | 32 | Returns: 33 | Normalized path. 34 | """ 35 | # pylint: disable=protected-access 36 | # There is no public method to perform this operation. 37 | path = fs._strip_protocol(path) # type: ignore[return-value] 38 | # pylint: enable=protected-access 39 | if path == '': 40 | path = fs.sep 41 | if fs.protocol in ('file', 'memory'): 42 | return os.path.normpath(path) 43 | return path 44 | 45 | 46 | def get_fs( 47 | filesystem: fsspec.AbstractFileSystem | str | None = None 48 | ) -> fsspec.AbstractFileSystem: 49 | """Return the file system object from the input. 50 | 51 | Args: 52 | filesystem: file system object or file system name 53 | 54 | Returns: 55 | File system object. 56 | 57 | Example: 58 | >>> from fsspec.implementations.local import LocalFileSystem 59 | >>> get_fs("hdfs") 60 | >>> get_fs(LocalFileSystem("/tmp/swot")) 61 | """ 62 | filesystem = filesystem or 'file' 63 | return (fsspec.filesystem(filesystem) 64 | if isinstance(filesystem, str) else filesystem) 65 | 66 | 67 | def fs_walk( 68 | fs: fsspec.AbstractFileSystem, 69 | path: str, 70 | sort: bool = False, 71 | ) -> Iterator[tuple[str, list[str], list[str]]]: 72 | """Return the list of files and directories in a directory. 73 | 74 | Args: 75 | fs: file system object 76 | path: path to the directory 77 | sort: if True, the list of files and directories is sorted 78 | alphabetically 79 | 80 | Returns: 81 | Iterator of (path, directories, files). 
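Example (an illustrative sketch; the directory ``/tmp/swot`` is only a placeholder): >>> import fsspec >>> fs = fsspec.filesystem('file') >>> for root, dirs, files in fs_walk(fs, '/tmp/swot', sort=True): ... print(root, dirs, files)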
82 | """ 83 | dirs: list[str] 84 | files: list[str] 85 | 86 | dirs, files = [], [] 87 | try: 88 | listing: list[dict[str, Any]] = fs.ls(path, detail=True) 89 | except (FileNotFoundError, OSError): 90 | yield '', [], [] 91 | return 92 | 93 | for is_dir, name in ((info['type'] == 'directory', info['name']) 94 | for info in listing): 95 | # each info name must be at least [path]/part , but here 96 | # we check also for names like [path]/part/ 97 | dirs.append(name) if is_dir else files.append( 98 | name.rsplit(SEPARATOR, 1)[-1]) 99 | 100 | def sort_sequence(sequence: list[str]) -> list[str]: 101 | """Sort the sequence if the user wishes.""" 102 | return list(sorted(sequence)) if sort else sequence 103 | 104 | dirs = sort_sequence(dirs) 105 | yield path.rstrip(SEPARATOR), dirs, sort_sequence(files) 106 | 107 | for item in dirs: 108 | yield from fs_walk(fs, item, sort=sort) 109 | 110 | 111 | def copy_file( 112 | source: str, 113 | target: str, 114 | fs_source: fsspec.AbstractFileSystem, 115 | fs_target: fsspec.AbstractFileSystem, 116 | ) -> None: 117 | """Copy a file from one location to another. 118 | 119 | Args: 120 | source: The name of the source file. 121 | target: The name of the target file. 122 | fs_source: The file system that the source file is stored on. 123 | fs_target: The file system that the target file is stored on. 124 | """ 125 | with fs_source.open(source, 'rb') as source_stream: 126 | with fs_target.open(target, 'wb') as target_stream: 127 | target_stream.write(source_stream.read()) # type: ignore[arg-type] 128 | 129 | 130 | def copy_files( 131 | source: Sequence[str], 132 | target: str, 133 | fs_source: fsspec.AbstractFileSystem, 134 | fs_target: fsspec.AbstractFileSystem, 135 | ) -> None: 136 | """Copy a list of files from one location to another. 137 | 138 | Args: 139 | source: The names of the source files. 140 | target: The name of the target directory. 141 | fs_source: The file system that the source files are stored on. 142 | fs_target: The file system that the target directory is stored on. 143 | """ 144 | tuple( 145 | map( 146 | lambda path: copy_file(path, 147 | join_path(target, os.path.basename(path)), 148 | fs_source, fs_target), source)) 149 | 150 | 151 | def copy_tree( 152 | source: str, 153 | target: str, 154 | fs_source: fsspec.AbstractFileSystem, 155 | fs_target: fsspec.AbstractFileSystem, 156 | ) -> None: 157 | """Copy a directory tree from one location to another. 158 | 159 | Args: 160 | source: The name of the source directory. 161 | target: The name of the target directory. 162 | fs_source: The file system that the source directory is stored on. 163 | fs_target: The file system that the target directory is stored on. 164 | 165 | Raises: 166 | ValueError: If the target already exists. 167 | """ 168 | if fs_target.exists(target): 169 | raise ValueError(f'Target {target} already exists') 170 | fs_target.mkdir(target) 171 | for root, dirs, files in tuple(fs_walk(fs_source, source)): 172 | for name in files: 173 | source_path: str = join_path(root, name) 174 | copy_file(source_path, 175 | join_path(target, os.path.relpath(source_path, source)), 176 | fs_source, fs_target) 177 | for source_path in dirs: 178 | fs_target.mkdir( 179 | join_path(target, os.path.relpath(source_path, source))) 180 | -------------------------------------------------------------------------------- /zcollection/merging/time_series.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. 
Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Merging a time series 7 | ===================== 8 | """ 9 | import numpy 10 | 11 | from . import period 12 | from .. import dataset 13 | from ..type_hints import NDArray 14 | 15 | 16 | def _merge_time_series( 17 | existing_ds: dataset.Dataset, 18 | inserted_ds: dataset.Dataset, 19 | axis: str, 20 | partitioning_dim: str, 21 | ) -> dataset.Dataset: 22 | """Merge two time series together. 23 | 24 | See :func:`merge_time_series` for 25 | details. 26 | """ 27 | existing_axis: NDArray = existing_ds.variables[axis].values 28 | inserted_axis: NDArray = inserted_ds.variables[axis].values 29 | existing_period = period.Period(existing_axis.min(), 30 | existing_axis.max(), 31 | within=True) 32 | inserted_period = period.Period(inserted_axis.min(), 33 | inserted_axis.max(), 34 | within=True) 35 | 36 | relation: period.PeriodRelation = inserted_period.get_relation( 37 | existing_period) 38 | 39 | # The new piece is located before the existing data. 40 | if relation.is_before(): 41 | return inserted_ds.concat(existing_ds, partitioning_dim) 42 | 43 | # The new piece is located after the existing data. 44 | if relation.is_after(): 45 | return existing_ds.concat(inserted_ds, partitioning_dim) 46 | 47 | # The new piece replaces the old one. 48 | if relation.contains(): 49 | return inserted_ds 50 | 51 | intersection: period.Period = inserted_period.intersection(existing_period) 52 | 53 | # The new piece is located before, but there is an overlap 54 | # between the two datasets. 55 | if relation.is_before_overlapping(): 56 | # pylint: disable=comparison-with-callable 57 | indices = numpy.where( 58 | # comparison between ndarray and datetime64 59 | existing_axis > intersection.end())[0] # type: ignore 60 | # pylint: enable=comparison-with-callable 61 | return inserted_ds.concat( 62 | existing_ds.isel({partitioning_dim: indices}), partitioning_dim) 63 | 64 | # The new piece is located after, but there is an overlap 65 | # between the two datasets. 66 | if relation.is_after_overlapping(): 67 | # pylint: disable=comparison-with-callable 68 | indices = numpy.where( 69 | # comparison between ndarray and datetime64 70 | existing_axis < intersection.begin)[0] # type: ignore 71 | # pylint: enable=comparison-with-callable 72 | return existing_ds.isel({ 73 | partitioning_dim: indices 74 | }).concat(inserted_ds, partitioning_dim) 75 | 76 | assert relation.is_inside() 77 | # comparison between ndarray and datetime64 78 | index = numpy.where(existing_axis < intersection.begin)[0] # type: ignore 79 | before: dataset.Dataset = existing_ds.isel( 80 | {partitioning_dim: slice(0, index[-1] + 1, None)}) 81 | 82 | # pylint: disable=comparison-with-callable 83 | # comparison between ndarray and datetime64 84 | index = numpy.where(existing_axis > intersection.end())[0] # type: ignore 85 | # pylint: enable=comparison-with-callable 86 | after: dataset.Dataset = existing_ds.isel( 87 | {partitioning_dim: slice(index[0], index[-1] + 1, None)}) 88 | 89 | return before.concat((inserted_ds, after), partitioning_dim) 90 | 91 | 92 | def merge_time_series( 93 | existing_ds: dataset.Dataset, 94 | inserted_ds: dataset.Dataset, 95 | axis: str, 96 | partitioning_dim: str, 97 | **kwargs, 98 | ) -> dataset.Dataset: 99 | """Merge two time series together. 100 | 101 | Replaces only the intersection between the existing dataset and the new one, 102 | and keeps the existing records that have not been updated.
103 | 104 | The following figure illustrates the implemented algorithm. Column ``A`` 105 | represents the new data and column ``B``, the data already present. The 106 | different cells in the columns represent the hours on the day of the 107 | measurements. The merge result is shown in column ``C``. It contains the 108 | measurements of the column ``A`` or column ``B`` if column ``A`` does not 109 | replace them. 110 | 111 | .. figure:: ../images/merge_time_series.svg 112 | :align: center 113 | :width: 50% 114 | 115 | Args: 116 | existing_ds: The existing dataset. 117 | inserted_ds: The inserted dataset. 118 | axis: The axis to merge on. 119 | partitioning_dim: The name of the partitioning dimension. 120 | kwargs: 121 | tolerance: This parameter sets the tolerance level for detecting 122 | data gaps in the inserted axis dataset. If set to ``None``, 123 | the algorithm will not check for data gaps in the inserted 124 | dataset. 125 | 126 | Returns: 127 | The merged dataset. 128 | """ 129 | tolerance = kwargs.get('tolerance', None) 130 | index: NDArray 131 | 132 | # Check if the inserted dataset contains data gaps. 133 | if tolerance is not None: 134 | inserted_axis: NDArray = inserted_ds.variables[axis].values 135 | delta: NDArray = numpy.concatenate( 136 | (numpy.array([0]), numpy.diff(numpy.roll(inserted_axis, 0)))) 137 | index = numpy.concatenate( 138 | (numpy.array([0], numpy.int64), numpy.where(delta > tolerance)[0], 139 | numpy.array([inserted_axis.size], numpy.int64))) 140 | else: 141 | index = numpy.array([], dtype=numpy.int64) 142 | 143 | if index.size > 1: 144 | # Split the inserted dataset into several datasets between the data 145 | # gaps. 146 | for ix in range(len(index) - 1): 147 | existing_ds = _merge_time_series( 148 | existing_ds, 149 | inserted_ds.isel( 150 | {partitioning_dim: slice(index[ix], index[ix + 1], None)}), 151 | axis, partitioning_dim) 152 | return existing_ds 153 | return _merge_time_series(existing_ds, inserted_ds, axis, partitioning_dim) 154 | -------------------------------------------------------------------------------- /zcollection/partitioning/sequence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Partitioning a sequence of variables 7 | ==================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, ClassVar, Iterator 12 | 13 | import dask.array.core 14 | import dask.array.routines 15 | import numpy 16 | 17 | from . import abc 18 | from ..type_hints import ArrayLike, NDArray 19 | 20 | 21 | def _is_monotonic(arr: NDArray) -> bool: 22 | """Check if the array is monotonic. 23 | 24 | The matrix will be sorted in the reverse order of the partitioning keys 25 | (column in the matrix). If the order of the matrix is unchanged, the 26 | different partitioning columns are monotonic. 27 | 28 | Args: 29 | arr: The array to check. 30 | 31 | Returns: 32 | True if the array is monotonic, False otherwise. 33 | """ 34 | # `reversed` because `numpy.lexsort` wants the most significant key last. 
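# For example, the matrix [[1, 1], [1, 2], [2, 1]] gives the lexsort order (0, 1, 2): the rows are already in order, so the keys are monotonic.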
35 | values: list[NDArray] = [ 36 | arr[:, ix] for ix in reversed(range(arr.shape[1])) 37 | ] 38 | sort_order: NDArray = numpy.lexsort(numpy.array(values)) 39 | return numpy.all(abc.difference(sort_order) > 0) # type: ignore 40 | 41 | 42 | def _unique(arr: ArrayLike, is_delayed: bool) -> tuple[NDArray, NDArray]: 43 | """Return unique elements and their indices. 44 | 45 | Args: 46 | arr: Array of elements. 47 | is_delayed: If True, the array is delayed. 48 | Returns: 49 | Tuple of unique elements and their indices. 50 | """ 51 | index: NDArray 52 | indices: NDArray 53 | 54 | if is_delayed: 55 | index, indices = abc.unique(arr) # type: ignore[arg-type] 56 | if not _is_monotonic(index): 57 | raise ValueError('index is not monotonic') 58 | return index, indices 59 | return abc.unique_and_check_monotony(arr) 60 | 61 | 62 | class Sequence(abc.Partitioning): 63 | """Initialize a partitioning scheme for a sequence of variables. 64 | 65 | A sequence is a combination of variables constituting unique monotonic keys. 66 | For example, the orbit number (``cycle``) and the half-orbit number 67 | (``pass``) of a satellite. 68 | 69 | Args: 70 | variables: A list of strings representing the variables to be used for 71 | partitioning. 72 | dtype: An optional sequence of strings representing the data type used 73 | to store variable values in a binary representation without data 74 | loss. Must be one of the following allowed data types: ``int8``, 75 | ``int16``, ``int32``, ``int64``, ``uint8``, ``uint16``, ``uint32``, 76 | ``uint64``. If not provided, defaults to ``int64`` for all 77 | variables. 78 | 79 | Raises: 80 | ValueError: If the periodicity is not valid. 81 | 82 | Example: 83 | >>> partitioning = Sequence(["a", "b", "c"], (None, 10, 10)) 84 | """ 85 | #: The ID of the partitioning scheme. 86 | ID: ClassVar[str] = 'Sequence' 87 | 88 | # pylint: disable=arguments-differ 89 | # False positive: `self` is used in the signature. 90 | @staticmethod 91 | def _split(variables: dict[str, ArrayLike]) -> Iterator[abc.Partition]: 92 | """Split the variables constituting the partitioning into partitioning 93 | schemes.""" 94 | index: NDArray 95 | indices: NDArray 96 | matrix: dask.array.core.Array | NDArray 97 | 98 | # Determine if the variables are handled by Dask. 99 | is_delayed: bool = any( 100 | isinstance(item, dask.array.core.Array) 101 | for item in variables.values()) 102 | 103 | # Combines the arrays of variable values into a transposed matrix. 104 | matrix = dask.array.routines.vstack(tuple( 105 | variables.values())).transpose() if is_delayed else numpy.vstack( 106 | tuple(variables.values())).transpose() 107 | if matrix.dtype.kind not in 'iu': 108 | raise TypeError('The variables must be integer') 109 | 110 | index, indices = _unique(matrix, is_delayed) # type: ignore[arg-type] 111 | indices = abc.concatenate_item(indices, matrix.shape[0]) 112 | 113 | fields = tuple(variables.keys()) 114 | # pylint: disable=unnecessary-lambda-assignment 115 | # We want to reference a lambda function, not assign it to a variable. 
116 | if len(fields) == 1: 117 | concat: Any = lambda fields, keys: (fields + keys, ) 118 | else: 119 | concat = lambda fields, keys: tuple(zip(fields, keys)) 120 | # pylint: enable=unnecessary-lambda-assignment 121 | 122 | return ((concat(fields, 123 | tuple(item)), slice(start, indices[ix + 1], None)) 124 | for item, (ix, start) in zip(index, enumerate(indices[:-1]))) 125 | # pylint: enable=arguments-differ 126 | 127 | def encode( 128 | self, 129 | partition: tuple[tuple[str, int], ...], 130 | ) -> tuple[int, ...]: 131 | """Encode a partitioning scheme to the handled values. 132 | 133 | Args: 134 | partition: The partitioning scheme to be encoded. 135 | 136 | Returns: 137 | The encoded partitioning scheme. 138 | 139 | Example: 140 | >>> partitioning = Sequence(["a", "b", "c"]) 141 | >>> fields = partitioning.parse("a=100/b=10/c=1") 142 | >>> fields 143 | (('a', 100), ('b', 10), ('c', 1)) 144 | >>> partitioning.encode(fields) 145 | (100, 10, 1) 146 | """ 147 | return tuple(value 148 | for _, value in self.parse(self.join(partition, '/'))) 149 | 150 | def decode(self, values: tuple[int, ...]) -> tuple[tuple[str, int], ...]: 151 | """Decode a partitioning scheme. 152 | 153 | Args: 154 | values: The encoded partitioning scheme. 155 | 156 | Returns: 157 | The decoded partitioning scheme. 158 | 159 | Example: 160 | >>> partitioning = Sequence(["a", "b", "c"]) 161 | >>> partitioning.decode((100, 10, 1)) 162 | (('a', 100), ('b', 10), ('c', 1)) 163 | """ 164 | return tuple( 165 | (key, value) for key, value in zip(self.variables, values)) 166 | -------------------------------------------------------------------------------- /zcollection/merging/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Handle merging of datasets of a partition. 7 | ========================================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Protocol 12 | import hashlib 13 | import shutil 14 | 15 | import fsspec 16 | import fsspec.implementations.local 17 | import zarr.storage 18 | 19 | from zcollection import fs_utils 20 | 21 | from .. import dataset, storage, sync 22 | from .time_series import merge_time_series 23 | 24 | __all__ = ('MergeCallable', 'perform', 'merge_time_series') 25 | 26 | 27 | #: pylint: disable=too-few-public-methods,duplicate-code 28 | class MergeCallable(Protocol): 29 | """Protocol to merge datasets stored in a partition. 30 | 31 | A merge callable is a function that accepts an existing dataset 32 | present in a partition, a new dataset to merge, the partitioning 33 | dimension and the axis to merge on. It returns the merged dataset. 34 | """ 35 | 36 | def __call__( 37 | self, 38 | existing_ds: dataset.Dataset, 39 | inserted_ds: dataset.Dataset, 40 | axis: str, 41 | partitioning_dim: str, 42 | **kwargs, 43 | ) -> dataset.Dataset: # pylint: disable=duplicate-code 44 | """Call the partition function. 45 | 46 | Args: 47 | existing_ds: The existing dataset. 48 | inserted_ds: The inserted dataset. 49 | axis: The axis to merge on. 50 | partitioning_dim: The partitioning dimension. 51 | **kwargs: Additional keyword arguments. 52 | 53 | Returns: 54 | The merged dataset. 55 | """ 56 | # pylint: disable=unnecessary-ellipsis 57 | # Ellipsis is necessary to make the function signature match the 58 | # protocol. 59 | ... 
# pragma: no cover 60 | # pylint: enable=unnecessary-ellipsis 61 | 62 | #: pylint: enable=too-few-public-methods,duplicate-code 63 | 64 | 65 | def _rename( 66 | fs: fsspec.AbstractFileSystem, 67 | source: str, 68 | dest: str, 69 | ) -> None: 70 | """Rename a directory on a file system. 71 | 72 | Args: 73 | fs: The file system. 74 | source: The source directory. 75 | dest: The destination directory. 76 | """ 77 | if isinstance(fs, fsspec.implementations.local.LocalFileSystem): 78 | # The fsspec implementation of the local file system copies the source 79 | # directory to the destination directory and then removes the source 80 | # directory. This is not efficient, so we use the shutil 81 | # implementation to rename the directory instead. 82 | shutil.rmtree(dest, ignore_errors=True) 83 | shutil.move(source, dest) 84 | return 85 | 86 | fs.rm(dest, recursive=True) 87 | fs.mv(source, dest, recursive=True) 88 | 89 | 90 | def _extract_root_dirname(dirname: str, sep: str) -> str: 91 | """Extracts the root directory name from a partition name.""" 92 | parts = filter(lambda x: '=' not in x, dirname.split(sep)) 93 | return sep.join(parts) 94 | 95 | 96 | def _update_fs( 97 | dirname: str, 98 | zds: dataset.Dataset, 99 | fs: fsspec.AbstractFileSystem, 100 | *, 101 | synchronizer: sync.Sync | None = None, 102 | ) -> None: 103 | """Updates a dataset stored in a partition. 104 | 105 | Args: 106 | dirname: The name of the partition. 107 | zds: The dataset to update. 108 | fs: The file system that the partition is stored on. 109 | synchronizer: The instance handling access to critical resources. 110 | """ 111 | # Building a temporary directory to store the new data. The name of the 112 | # temporary directory is the hash of the partition name. 113 | temp: str = fs_utils.join_path( 114 | _extract_root_dirname(dirname, fs.sep), 115 | hashlib.sha256(dirname.encode()).hexdigest()) 116 | if fs.exists(temp): 117 | fs.rm(temp, recursive=True) 118 | 119 | # Initializing Zarr group 120 | zarr.storage.init_group(store=fs.get_mapper(temp)) 121 | 122 | # Writing new data. 123 | try: 124 | # The synchronization is done by the caller. 125 | storage.write_zarr_group(zds, temp, fs, synchronizer or sync.NoSync()) 126 | except Exception: 127 | # The "write_zarr_group" method throws the exception once all scheduled 128 | # tasks are finished. So here we can safely delete the temporary directory. 129 | fs.rm(temp, recursive=True) 130 | raise 131 | 132 | # Rename the existing entry on the file system 133 | _rename(fs, temp, dirname) 134 | 135 | 136 | def perform( 137 | ds_inserted: dataset.Dataset, 138 | dirname: str, 139 | axis: str, 140 | fs: fsspec.AbstractFileSystem, 141 | partitioning_dim: str, 142 | *, 143 | delayed: bool = True, 144 | merge_callable: MergeCallable | None, 145 | synchronizer: sync.Sync | None = None, 146 | **kwargs, 147 | ) -> None: 148 | """Merges a new dataset with an existing partition. 149 | 150 | Args: 151 | ds_inserted: The dataset to merge. 152 | dirname: The name of the partition. 153 | axis: The axis to merge on. 154 | fs: The file system on which the partition is stored. 155 | partitioning_dim: The partitioning dimension. 156 | delayed: If True, the existing dataset is loaded lazily. Defaults to 157 | True. 158 | merge_callable: The merge callable. If None, the inserted dataset 159 | overwrites the existing dataset stored in the partition. 160 | Defaults to None. 161 | synchronizer: The instance handling access to critical resources. 162 | Defaults to None.
163 | **kwargs: Additional keyword arguments are passed through to the merge 164 | callable. 165 | """ 166 | if merge_callable is None: 167 | zds = ds_inserted 168 | else: 169 | ds = storage.open_zarr_group(dirname, fs, delayed=delayed) 170 | # The dataset read from the partition does not contain the insertion 171 | # properties. These properties might be lost in the merge_callable 172 | # depending on which dataset is used. 173 | ds.copy_properties(ds=ds_inserted) 174 | zds = merge_callable(ds, ds_inserted, axis, partitioning_dim, **kwargs) 175 | _update_fs(dirname, zds, fs, synchronizer=synchronizer) 176 | -------------------------------------------------------------------------------- /zcollection/tests/test_compressed_array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """Tests for :class:`zcollection.compressed_array.CompressedArray`.""" 6 | from typing import Any 7 | 8 | import dask.array.core 9 | import dask.array.creation 10 | import dask.array.random 11 | import dask.array.reductions 12 | import dask.array.routines 13 | import dask.array.ufunc 14 | import dask.array.utils 15 | import numpy 16 | import pytest 17 | 18 | from ..compressed_array import CompressedArray 19 | # pylint: disable=unused-import # Need to import for fixtures 20 | from .cluster import dask_client, dask_cluster 21 | 22 | # pylint: enable=unused-import 23 | 24 | # pylint: disable=unnecessary-lambda # We keep the lambdas for readability 25 | #: Functions to test 26 | functions = [ 27 | lambda x: x, 28 | dask.array.ufunc.expm1, 29 | lambda x: 2 * x, 30 | lambda x: x / 2, 31 | lambda x: x**2, 32 | lambda x: x + x, 33 | lambda x: x * x, 34 | lambda x: x[0], 35 | lambda x: x[:, 1], 36 | lambda x: x[:1, :, 1:3], 37 | lambda x: x.T, 38 | lambda x: dask.array.routines.transpose(x, (1, 2, 0)), 39 | dask.array.reductions.nanmean, 40 | lambda x: dask.array.reductions.nanmean(x, axis=1), 41 | dask.array.reductions.nanmax, 42 | dask.array.reductions.nanmin, 43 | dask.array.reductions.nanprod, 44 | dask.array.reductions.nanstd, 45 | dask.array.reductions.nanvar, 46 | dask.array.reductions.nansum, 47 | lambda x: dask.array.reductions.median(x, axis=0), 48 | dask.array.reductions.nanargmax, 49 | dask.array.reductions.nanargmin, 50 | lambda x: dask.array.reductions.nancumprod(x, axis=0), 51 | lambda x: dask.array.reductions.nancumsum(x, axis=0), 52 | lambda x: x.sum(), 53 | lambda x: x.moment(order=0), 54 | lambda x: x.mean(), 55 | lambda x: x.mean(axis=1), 56 | lambda x: x.std(), 57 | lambda x: x.std(axis=1), 58 | lambda x: x.var(), 59 | lambda x: x.var(axis=1), 60 | lambda x: x.dot(numpy.arange(x.shape[-1])), 61 | lambda x: x.dot(numpy.eye(x.shape[-1])), 62 | lambda x: dask.array.routines.tensordot( 63 | x, numpy.ones(x.shape[:2]), axes=[(0, 1), 64 | (0, 1)]), # type: ignore[arg-type] 65 | lambda x: x.sum(axis=0), 66 | lambda x: x.max(axis=0), 67 | lambda x: x.min(axis=0), 68 | lambda x: x.sum(axis=(1, 2)), 69 | lambda x: x.astype(numpy.complex128), 70 | lambda x: x.map_blocks(lambda x: x * 2), 71 | lambda x: x.map_overlap( 72 | lambda x: x * 2, depth=0, trim=True, boundary='none'), 73 | lambda x: x.map_overlap( 74 | lambda x: x * 2, depth=0, trim=False, boundary='none'), 75 | lambda x: x.round(1), 76 | lambda x: x.reshape((x.shape[0] * x.shape[1], x.shape[2])), 77 | abs, 78 | lambda x: x > 0.5, 79 | lambda x: x.rechunk((4, 4, 4)), 80 | lambda x: 
x.rechunk((2, 2, 1)), 81 | numpy.isneginf, 82 | numpy.isposinf, 83 | ] 84 | # pylint: enable=unnecessary-lambda 85 | 86 | 87 | @pytest.mark.filterwarnings( 88 | 'ignore:Casting complex values to real discards the imaginary part') 89 | @pytest.mark.parametrize('func', functions) 90 | def test_basic( 91 | func, 92 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 93 | ) -> None: 94 | """Test basic functionality.""" 95 | values: numpy.ndarray = numpy.random.random((2, 3, 4)) 96 | arr: dask.array.core.Array = dask.array.core.from_array( 97 | CompressedArray(values), chunks='auto') 98 | compressed_array: numpy.ndarray = func(arr).compute() 99 | array: numpy.ndarray = func(dask.array.core.from_array(values)).compute() 100 | assert compressed_array.shape == array.shape 101 | assert numpy.allclose(compressed_array, array) 102 | 103 | 104 | def test_metadata( 105 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 106 | ) -> None: 107 | """Test metadata.""" 108 | y: dask.array.core.Array = dask.array.random.random((10, 10), 109 | chunks=(5, 5)) 110 | z = CompressedArray(y.compute()) 111 | y = y.map_blocks(CompressedArray) # type: ignore[assignment] 112 | 113 | # pylint: disable=protected-access 114 | assert isinstance(y._meta, numpy.ndarray) 115 | assert isinstance((y + 1)._meta, numpy.ndarray) 116 | assert isinstance(y[:5, ::2]._meta, numpy.ndarray) 117 | assert isinstance( 118 | y.rechunk((2, 2))._meta, # type: ignore[arg-type] 119 | numpy.ndarray) 120 | assert isinstance((y - z), numpy.ndarray) 121 | assert isinstance(y.persist()._meta, numpy.ndarray) 122 | # pylint: enable=protected-access 123 | 124 | 125 | def test_from_delayed_meta( 126 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 127 | ) -> None: 128 | """Test from_delayed with meta.""" 129 | 130 | def f() -> CompressedArray: 131 | return CompressedArray(numpy.eye(3)) 132 | 133 | d: Any = dask.delayed(f)() # type: ignore 134 | x: dask.array.core.Array = dask.array.core.from_delayed( 135 | d, shape=(3, 3), meta=CompressedArray(numpy.eye(1))) 136 | assert numpy.all(x.compute() == f()[...]) # type: ignore 137 | 138 | 139 | def test_from_array( 140 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 141 | ) -> None: 142 | """Test from_array.""" 143 | x = CompressedArray(numpy.eye(10)) 144 | d: dask.array.core.Array = dask.array.core.from_array( 145 | x, chunks=(5, 5)) # type: ignore[arg-type] 146 | 147 | # pylint: disable=protected-access 148 | assert isinstance(d._meta, numpy.ndarray) 149 | # pylint: enable=protected-access 150 | assert isinstance(d.compute(), numpy.ndarray) 151 | assert numpy.allclose(d.compute(), x) 152 | 153 | 154 | def test_map_blocks( 155 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 156 | ) -> None: 157 | """Test map_blocks.""" 158 | x: dask.array.core.Array = dask.array.creation.eye( 159 | 10, chunks=5) # type: ignore[arg-type] 160 | y: dask.array.core.Array = x.map_blocks( 161 | CompressedArray) # type: ignore[arg-type] 162 | # pylint: disable=protected-access 163 | assert isinstance(y._meta, numpy.ndarray) 164 | # pylint: enable=protected-access 165 | assert numpy.allclose(y.compute(), x.compute()) 166 | 167 | 168 | def test_compressed_masked_array( 169 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 170 | ) -> None: 171 | """Test CompressedMaskedArray.""" 172 | x: dask.array.core.Array = dask.array.creation.eye( 173 | 10, chunks=5) # type: ignore[arg-type] 174 | y: dask.array.core.Array = 
x.map_blocks( 175 | CompressedArray, fill_value=0) # type: ignore[arg-type] 176 | # assert isinstance(y._meta, CompressedArray) 177 | assert isinstance(y[...].compute(), numpy.ma.MaskedArray) 178 | assert isinstance(y.compute(), numpy.ma.MaskedArray) 179 | assert y.mean().compute() == 1 180 | assert y.min().compute() == 1 181 | assert y.max().compute() == 1 182 | assert y.sum().compute() == 10 183 | assert y.std().compute() == 0 184 | assert (y * 2).mean().compute() == 2 185 | -------------------------------------------------------------------------------- /examples/ex_indexing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Indexing a Collection. 3 | ====================== 4 | 5 | In this example, we will see how to index a collection. 6 | """ 7 | from typing import Iterator, List, Optional, Tuple, Union 8 | import pathlib 9 | import pprint 10 | 11 | import dask.distributed 12 | import fsspec 13 | import numpy 14 | 15 | import zcollection 16 | import zcollection.indexing 17 | import zcollection.partitioning.tests.data 18 | 19 | # %% 20 | # Initialization of the environment 21 | # --------------------------------- 22 | fs = fsspec.filesystem('memory') 23 | cluster = dask.distributed.LocalCluster(processes=False) 24 | client = dask.distributed.Client(cluster) 25 | 26 | # %% 27 | # A collection can be indexed. This allows quick access to the data without 28 | # having to browse the entire dataset. 29 | # 30 | # Creating the test collection. 31 | # ----------------------------- 32 | # 33 | # For this latest example, we will index another data set. This one contains 34 | # measurements of a fictitious satellite on several half-orbits. 35 | zds: zcollection.Dataset = zcollection.Dataset.from_xarray( 36 | zcollection.partitioning.tests.data.create_test_sequence(5, 20, 10)) 37 | print(zds) 38 | 39 | # %% 40 | collection: zcollection.Collection = zcollection.create_collection( 41 | 'time', 42 | zds, 43 | zcollection.partitioning.Date(('time', ), 'M'), 44 | partition_base_dir='/one_other_collection', 45 | filesystem=fs) 46 | collection.insert(zds, merge_callable=zcollection.merging.merge_time_series) 47 | 48 | # %% 49 | # Here we have created a collection partitioned by month. 50 | pprint.pprint(fs.listdir('/one_other_collection/year=2000')) 51 | 52 | 53 | # %% 54 | # Class to implement 55 | # ------------------ 56 | # 57 | # The idea of the implementation is to calculate for each visited partition, the 58 | # slice of data that has a constant quantity. In our example, we will rely on 59 | # the cycle and pass number information. The first method we will implement is 60 | # the detection of these constant parts of two vectors containing the cycle and 61 | # pass number. 62 | def split_half_orbit( 63 | cycle_number: numpy.ndarray, 64 | pass_number: numpy.ndarray, 65 | ) -> Iterator[Tuple[int, int]]: 66 | """Calculate the indexes of the start and stop of each half-orbit. 67 | 68 | Args: 69 | pass_number: Pass numbers. 70 | Returns: 71 | Iterator of start and stop indexes. 
72 | """ 73 | assert pass_number.shape == cycle_number.shape 74 | pass_idx = numpy.where(numpy.roll(pass_number, 1) != pass_number)[0] 75 | cycle_idx = numpy.where(numpy.roll(cycle_number, 1) != cycle_number)[0] 76 | 77 | half_orbit = numpy.unique( 78 | numpy.concatenate( 79 | (pass_idx, cycle_idx, numpy.array([pass_number.size], 80 | dtype='int64')))) 81 | del pass_idx, cycle_idx 82 | 83 | yield from tuple(zip(half_orbit[:-1], half_orbit[1:])) 84 | 85 | 86 | # %% 87 | # Now we will compute these constant parts from a dataset contained in a 88 | # partition. 89 | def _half_orbit( 90 | zds: zcollection.Dataset, 91 | *args, 92 | dtype: numpy.dtype | None = None, 93 | **kwargs, 94 | ) -> numpy.ndarray: 95 | """Return the indexes of the start and stop of each half-orbit. 96 | 97 | Args: 98 | ds: Datasets stored in a partition to be indexed. 99 | Returns: 100 | Dictionary of start and stop indexes for each half-orbit. 101 | """ 102 | pass_number_varname = kwargs.pop('pass_number', 'pass_number') 103 | cycle_number_varname = kwargs.pop('cycle_number', 'cycle_number') 104 | pass_number = zds.variables[pass_number_varname].values 105 | cycle_number = zds.variables[cycle_number_varname].values 106 | 107 | generator = (( 108 | i0, 109 | i1, 110 | cycle_number[i0], 111 | pass_number[i0], 112 | ) for i0, i1 in split_half_orbit(cycle_number, pass_number)) 113 | 114 | return numpy.fromiter(generator, dtype) 115 | 116 | 117 | # %% 118 | # Finally, we implement our indexing class. The base class 119 | # (:py:class:`zcollection.indexing.Indexer`) 120 | # implements the index update and the associated queries. 121 | class HalfOrbitIndexer(zcollection.indexing.Indexer): 122 | """Index collection by half-orbit.""" 123 | #: Column name of the cycle number. 124 | CYCLE_NUMBER = 'cycle_number' 125 | 126 | #: Column name of the pass number. 127 | PASS_NUMBER = 'pass_number' 128 | 129 | def dtype(self, /, **kwargs) -> List[Tuple[str, str]]: 130 | """Return the columns of the index. 131 | 132 | Returns: 133 | A tuple of (name, type) pairs. 134 | """ 135 | return super().dtype() + [ 136 | (self.CYCLE_NUMBER, 'uint16'), 137 | (self.PASS_NUMBER, 'uint16'), 138 | ] 139 | 140 | @classmethod 141 | def create( 142 | cls, 143 | path: Union[pathlib.Path, str], 144 | zds: zcollection.Collection, 145 | filesystem: Optional[fsspec.AbstractFileSystem] = None, 146 | **kwargs, 147 | ) -> 'HalfOrbitIndexer': 148 | """Create a new index. 149 | 150 | Args: 151 | path: The path to the index. 152 | ds: The collection to be indexed. 153 | filesystem: The filesystem to use. 154 | Returns: 155 | The created index. 156 | """ 157 | return super()._create(path, 158 | zds, 159 | meta=dict(attribute=b'value'), 160 | filesystem=filesystem) # type: ignore 161 | 162 | def update( 163 | self, 164 | zds: zcollection.Collection, 165 | partition_size: Optional[int] = None, 166 | npartitions: Optional[int] = None, 167 | **kwargs, 168 | ) -> None: 169 | """Update the index. 170 | 171 | Args: 172 | ds: New data stored in the collection to be indexed. 173 | partition_size: The length of each bag partition. 174 | npartitions: The number of desired bag partitions. 175 | cycle_number: The name of the cycle number variable stored in the 176 | collection. Defaults to "cycle_number". 177 | pass_number: The name of the pass number variable stored in the 178 | collection. Defaults to "pass_number". 
179 | """ 180 | super()._update(zds, 181 | _half_orbit, 182 | partition_size, 183 | npartitions, 184 | dtype=self.dtype(), 185 | **kwargs) 186 | 187 | 188 | # %% 189 | # Using the index 190 | # --------------- 191 | # 192 | # Now we can create our index and fill it. 193 | indexer: HalfOrbitIndexer = HalfOrbitIndexer.create('/index.parquet', 194 | collection, 195 | filesystem=fs) 196 | indexer.update(collection) 197 | 198 | # The following command allows us to view the information stored in our index: 199 | # the first and last indexes of the partition associated with the registered 200 | # half-orbit number and the identifier of the indexed partition. 201 | indexer.table.to_pandas() 202 | 203 | # %% 204 | # This index can now be used to load a part of a collection. 205 | selection: zcollection.Dataset | None = collection.load( 206 | indexer=indexer.query(dict(pass_number=[1, 2])), 207 | delayed=False, 208 | ) 209 | assert selection is not None 210 | selection.to_xarray() 211 | 212 | # %% 213 | # Close the local cluster to avoid printing warning messages in the other 214 | # examples. 215 | client.close() 216 | cluster.close() 217 | -------------------------------------------------------------------------------- /zcollection/partitioning/tests/test_sequence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Test partitioning by sequence. 7 | ============================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Iterator 12 | import pickle 13 | 14 | import dask.array.core 15 | import numpy 16 | import pytest 17 | import xarray 18 | 19 | from zcollection.type_hints import ArrayLike 20 | 21 | from . import data 22 | from .. import Sequence, get_codecs 23 | from ... 
import dataset 24 | # pylint: disable=unused-import # Need to import for fixtures 25 | from ...tests.cluster import dask_client, dask_cluster 26 | 27 | # pylint: enable=unused-import # Need to import for fixtures 28 | 29 | 30 | def test_construction() -> None: 31 | """Test the sequence constructor.""" 32 | assert isinstance(Sequence(('a', 'b')), Sequence) 33 | assert len(Sequence(('a', 'b'))) == 2 34 | with pytest.raises(ValueError): 35 | Sequence(('a', 'b'), (0, )) # type: ignore 36 | with pytest.raises(ValueError): 37 | Sequence((), ()) 38 | with pytest.raises(ValueError): 39 | Sequence(('a', 'b'), dtype=('c', 'd')) 40 | with pytest.raises(ValueError): 41 | Sequence(('a', 'b'), dtype=('float32', 'int32')) 42 | with pytest.raises(TypeError): 43 | Sequence(('a', 'b'), dtype='int32') 44 | partitioning = Sequence(('a', 'b')) 45 | partition_keys = partitioning.parse('a=1/b=2') 46 | assert partitioning.encode(partition_keys) == (1, 2) 47 | with pytest.raises(ValueError): 48 | partitioning.encode((('A', 1), ('b', 2))) 49 | assert partitioning.decode((1, 2)) == (('a', 1), ('b', 2)) 50 | assert partition_keys == (('a', 1), ('b', 2)) 51 | with pytest.raises(ValueError): 52 | partitioning.parse('a=1/b=2/c=3') 53 | with pytest.raises(ValueError): 54 | partitioning.parse('field=1') 55 | 56 | 57 | @pytest.mark.parametrize('delayed', [False, True]) 58 | def test_split_dataset( 59 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 60 | delayed: bool, 61 | ) -> None: 62 | """Test the split_dataset method.""" 63 | repeatability = 5 64 | xds = data.create_test_sequence(repeatability, 20, 10) 65 | partitioning = Sequence(('cycle_number', 'pass_number')) 66 | 67 | cycle_number = 1 68 | pass_number = 1 69 | 70 | assert partitioning.dtype() == ( 71 | ('cycle_number', 'int64'), 72 | ('pass_number', 'int64'), 73 | ) 74 | 75 | # Build the test dataset 76 | zds = dataset.Dataset.from_xarray(xds) 77 | if not delayed: 78 | zds = zds.compute() 79 | 80 | iterator = partitioning.split_dataset(zds, 'num_lines') 81 | assert isinstance(iterator, Iterator) 82 | 83 | for partition, indexer in iterator: 84 | subset = zds.isel(indexer) 85 | expected = (f'cycle_number={cycle_number}', 86 | f'pass_number={pass_number}') 87 | assert expected == partition 88 | assert numpy.all( 89 | xds.where((xds.cycle_number == cycle_number) 90 | & (xds.pass_number == pass_number), 91 | drop=True).observation == 92 | subset.variables['observation'].array) 93 | 94 | partition_keys = partitioning.parse('/'.join(partition)) 95 | assert partition_keys == (('cycle_number', cycle_number), 96 | ('pass_number', pass_number)) 97 | assert partitioning.decode( 98 | partitioning.encode(partition_keys)) == partition_keys 99 | assert partitioning.join(partition_keys, '/') == '/'.join(partition) 100 | 101 | pass_number += 1 102 | if pass_number > repeatability: 103 | pass_number = 1 104 | cycle_number += 1 105 | 106 | xds['cycle_number'] = xarray.DataArray(numpy.array( 107 | [xds['cycle_number'].values] * 2).T, 108 | dims=('num_lines', 'nump_pixels')) 109 | zds = dataset.Dataset.from_xarray(xds) 110 | if not delayed: 111 | zds = zds.compute() 112 | with pytest.raises(ValueError): 113 | list(partitioning.split_dataset(zds, 'num_lines')) 114 | 115 | 116 | def test_config() -> None: 117 | """Test the configuration of the Sequence class.""" 118 | partitioning = Sequence(('cycle_number', 'pass_number')) 119 | config = partitioning.get_config() 120 | partitioning = get_codecs(config) # type: ignore[assignment] 121 | assert 
isinstance(partitioning, Sequence) 122 | 123 | 124 | def test_pickle() -> None: 125 | """Test the pickling of the Date class.""" 126 | partitioning = Sequence(('cycle_number', 'pass_number')) 127 | other = pickle.loads(pickle.dumps(partitioning)) 128 | assert isinstance(other, Sequence) 129 | assert other.variables == ('cycle_number', 'pass_number') 130 | 131 | 132 | # pylint: disable=protected-access 133 | @pytest.mark.parametrize('delayed', [False, True]) 134 | def test_multiple_sequence( 135 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 136 | delayed: bool, 137 | ) -> None: 138 | """Test the creation of a sequence with multiple variables.""" 139 | arrays = { 140 | '_a': numpy.array([], dtype='i8'), 141 | '_b': numpy.array([], dtype='i8'), 142 | '_c': numpy.array([], dtype='i8') 143 | } 144 | for _a in range(5): 145 | for _b in range(5): 146 | arrays['_a'] = numpy.concatenate( 147 | (arrays['_a'], numpy.full((5, ), _a, dtype='i8'))) 148 | arrays['_b'] = numpy.concatenate( 149 | (arrays['_b'], numpy.full((5, ), _b, dtype='i8'))) 150 | arrays['_c'] = numpy.concatenate( 151 | (arrays['_c'], numpy.arange(5, dtype='i8'))) 152 | partitioning = Sequence(('_a', '_b', '_c')) 153 | chunks: str = (10, ) # type: ignore[assignment] 154 | if delayed: 155 | variables: dict[str, ArrayLike] = { # type: ignore[assignment] 156 | '_a': dask.array.core.from_array(arrays['_a'], chunks=chunks), 157 | '_b': dask.array.core.from_array(arrays['_b'], chunks=chunks), 158 | '_c': dask.array.core.from_array(arrays['_c'], chunks=chunks) 159 | } 160 | else: 161 | variables = arrays # type: ignore[assignment] 162 | _a = 0 163 | _b = 0 164 | _c = 0 165 | for idx, item in enumerate( 166 | partitioning._split(variables)): # type: ignore[arg-type] 167 | assert item[0] == (('_a', _a), ('_b', _b), ('_c', _c)) 168 | _c += 1 169 | if _c > 4: 170 | _c = 0 171 | _b += 1 172 | if _b > 4: 173 | _b = 0 174 | _a += 1 175 | assert item[1] == slice(idx, idx + 1) 176 | 177 | numpy.random.shuffle(arrays['_c']) 178 | variables['_c'] = dask.array.core.from_array( # type: ignore[assignment] 179 | arrays['_c'], chunks=chunks) if delayed else arrays['_c'] 180 | 181 | with pytest.raises(ValueError): 182 | list(partitioning._split(variables)) # type: ignore[arg-type] 183 | 184 | del variables['_c'] 185 | del variables['_b'] 186 | partitioning = Sequence(('_a', '_b', '_c')) 187 | 188 | _a = 0 189 | for idx, item in enumerate( 190 | partitioning._split(variables)): # type: ignore[arg-type] 191 | assert item[0] == (('_a', _a), ) 192 | _a += 1 193 | assert item[1] == slice(idx * 25, idx * 25 + 25) 194 | # pylint: enable=protected-access 195 | 196 | 197 | def test_values_must_be_integer() -> None: 198 | """Test that the values must be integer.""" 199 | values = numpy.arange(0, 100, dtype='f8') 200 | partitioning = Sequence(('values', )) 201 | # pylint: disable=protected-access 202 | with pytest.raises(TypeError): 203 | list(partitioning._split({'values': values})) 204 | # pylint: enable=protected-access 205 | -------------------------------------------------------------------------------- /zcollection/merging/tests/test_time_series.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """Test the time series merging.""" 6 | import copy 7 | 8 | import numpy 9 | 10 | from .. 
import time_series 11 | from ...tests import data 12 | # pylint: disable=unused-import # Need to import for fixtures 13 | from ...tests.cluster import dask_client, dask_cluster 14 | from ...type_hints import NDArray 15 | 16 | # pylint: enable=unused-import # Need to import for fixtures 17 | 18 | 19 | def test_merge_disjoint( 20 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 21 | ) -> None: 22 | """Test the update of two disjoint time series.""" 23 | generator = data.create_test_dataset() 24 | zds0 = next(generator) 25 | zds1 = next(generator) 26 | 27 | zds = time_series.merge_time_series(zds1, zds0, 'time', 'num_lines') 28 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 29 | zds0.variables['time'].values, zds1.variables['time'].values))) 30 | 31 | zds = time_series.merge_time_series(zds0, zds1, 'time', 'num_lines') 32 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 33 | zds0.variables['time'].values, zds1.variables['time'].values))) 34 | 35 | zds = time_series.merge_time_series(zds0, zds0, 'time', 'num_lines') 36 | assert numpy.all( 37 | zds.variables['time'].values == zds0.variables['time'].values) 38 | 39 | 40 | def test_merge_intersection( 41 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 42 | ) -> None: 43 | """Test the update of two intersecting time series.""" 44 | generator = data.create_test_dataset() 45 | zds0 = next(generator) 46 | # ds0.variables["time"].values => numpy.array([ 47 | # "2000-01-01T00:00:00.000000", "2000-01-04T00:00:00.000000", 48 | # "2000-01-07T00:00:00.000000", "2000-01-10T00:00:00.000000", 49 | # "2000-01-13T00:00:00.000000", "2000-01-16T00:00:00.000000" 50 | # ]) 51 | zds1 = next(generator) 52 | # ds1.variables["time"].values => numpy.array([ 53 | # "2000-01-19T00:00:00.000000", "2000-01-22T00:00:00.000000", 54 | # "2000-01-25T00:00:00.000000", "2000-01-28T00:00:00.000000", 55 | # "2000-01-31T00:00:00.000000"]) 56 | 57 | existing_zds = zds1 58 | new_zds = copy.deepcopy(zds0) 59 | new_zds.variables['time'] = zds0.variables['time'].duplicate( 60 | zds0.variables['time'].values + numpy.timedelta64(9, 'D')) 61 | 62 | zds = time_series.merge_time_series(existing_zds, new_zds, 'time', 63 | 'num_lines') 64 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 65 | zds0.variables['time'].values[3:], zds1.variables['time'].values[:]))) 66 | 67 | existing_zds = zds0 68 | new_zds = copy.deepcopy(zds1) 69 | new_zds.variables['time'] = zds1.variables['time'].duplicate( 70 | zds1.variables['time'].values - numpy.timedelta64(9, 'D')) 71 | zds = time_series.merge_time_series(existing_zds, new_zds, 'time', 72 | 'num_lines') 73 | assert numpy.all(zds.variables['time'].values == numpy.concatenate(( 74 | zds0.variables['time'].values[:], zds1.variables['time'].values[:2]))) 75 | 76 | existing_zds = zds0 77 | new_zds = zds0.isel({'num_lines': slice(1, -1)}) 78 | new_zds.variables['var1'] = new_zds.variables['var1'].duplicate( 79 | new_zds.variables['var1'].values + 100) 80 | zds = time_series.merge_time_series(existing_zds, new_zds, 'time', 81 | 'num_lines') 82 | assert numpy.all(zds.variables['var1'].values == numpy.concatenate(( 83 | zds0.variables['var1'].values[:1], 84 | zds0.variables['var1'].values[1:-1] + 100, 85 | zds0.variables['var1'].values[-1:]))) 86 | 87 | 88 | def test_intersection_with_tolerance() -> None: 89 | """Test the update of two intersecting time series with a data gap.""" 90 | axis: NDArray = numpy.arange(numpy.datetime64('2000-01-01', 'ns'), 91 | 
numpy.datetime64('2000-01-01T23:59:59', 'ns'), 92 | numpy.timedelta64(1, 's')) 93 | measures = numpy.vstack((numpy.arange(axis.size), ) * 25).T 94 | zds0 = data.make_dataset(axis, measures, delayed=False) 95 | 96 | dates: NDArray = numpy.arange( 97 | numpy.datetime64('2000-01-01T10:00:00', 'ns'), 98 | numpy.datetime64('2000-01-01T14:59:59', 'ns'), 99 | numpy.timedelta64(1, 's')) 100 | 101 | # Create a gap in the data by removing the data between 11:00 and 13:00 102 | mask = (dates > numpy.datetime64('2000-01-01T11:00:00', 'ns')) & ( 103 | dates < numpy.datetime64('2000-01-01T13:00:00', 'ns')) 104 | dates = dates[~mask] 105 | measures = numpy.vstack((numpy.full(dates.size, -1), ) * 25).T 106 | zds1 = data.make_dataset(dates, measures, delayed=False) 107 | 108 | # Merge the two datasets with a tolerance of 1 minute to keep the 109 | # data gap in the existing dataset. 110 | zds_gap_filled = time_series.merge_time_series(zds0, 111 | zds1, 112 | 'time', 113 | 'num_lines', 114 | tolerance=numpy.timedelta64( 115 | 1, 'm')) 116 | # Merge the two datasets without a tolerance. The data gap is 117 | # kept and stored in the new dataset. 118 | zds_with_gap = time_series.merge_time_series( 119 | zds0, 120 | zds1, 121 | 'time', 122 | 'num_lines', 123 | ) 124 | assert zds_with_gap.time.size == zds0.time.size - mask.sum() 125 | 126 | mask = (axis > numpy.datetime64('2000-01-01T11:00:00', 'ns')) & ( 127 | axis < numpy.datetime64('2000-01-01T13:00:00', 'ns')) 128 | assert numpy.all(zds_gap_filled.variables['time'].values == 129 | zds0.variables['time'].values) 130 | assert numpy.all((zds_gap_filled.variables['var1'].values[:, 0] < 0 131 | ).sum() == zds1.dimensions['num_lines']) 132 | 133 | # Create gaps in the data by removing the data between 11:00 to 13:00 134 | # 15:00 to 17:00 and 19:00 to 21:00 135 | mask = (axis > numpy.datetime64('2000-01-01T11:00:00', 'ns')) & ( 136 | axis < numpy.datetime64('2000-01-01T13:00:00', 'ns')) 137 | mask |= (axis > numpy.datetime64('2000-01-01T15:00:00', 'ns')) & ( 138 | axis < numpy.datetime64('2000-01-01T17:00:00', 'ns')) 139 | mask |= (axis > numpy.datetime64('2000-01-01T19:00:00', 'ns')) & ( 140 | axis < numpy.datetime64('2000-01-01T21:00:00', 'ns')) 141 | 142 | dates = axis[~mask] 143 | 144 | measures = numpy.vstack((numpy.full(dates.size, -1), ) * 25).T 145 | zds1 = data.make_dataset(dates, measures, delayed=False) 146 | 147 | # Merge the two datasets with a tolerance of 1 minute to keep the 148 | # data gaps in the existing dataset. 149 | zds_gap_filled = time_series.merge_time_series(zds0, 150 | zds1, 151 | 'time', 152 | 'num_lines', 153 | tolerance=numpy.timedelta64( 154 | 1, 'm')) 155 | # Merge the two datasets without a tolerance. The data gaps are 156 | # kept and stored in the new dataset. 157 | zds_with_gap = time_series.merge_time_series( 158 | zds0, 159 | zds1, 160 | 'time', 161 | 'num_lines', 162 | ) 163 | 164 | assert numpy.all(zds_gap_filled.variables['time'].values == 165 | zds0.variables['time'].values) 166 | assert zds_with_gap.time.size == zds0.time.size - mask.sum() 167 | assert numpy.all((zds_gap_filled.variables['var1'].values[:, 0] < 0 168 | ).sum() == zds1.dimensions['num_lines']) 169 | -------------------------------------------------------------------------------- /zcollection/partitioning/date.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. 
Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Partitioning by date 7 | ==================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, ClassVar, Iterator, Sequence 12 | import datetime 13 | 14 | import dask.array.core 15 | import numpy 16 | 17 | from . import abc 18 | from ..type_hints import ArrayLike, NDArray 19 | 20 | #: Numpy time units 21 | RESOLUTION = ('Y', 'M', 'D', 'h', 'm', 's') 22 | 23 | #: Numpy time unit meanings 24 | UNITS = ('year', 'month', 'day', 'hour', 'minute', 'second') 25 | 26 | #: Data type for time units 27 | DATA_TYPES = ('uint16', 'uint8', 'uint8', 'uint8', 'uint8', 'uint8') 28 | 29 | #: Time separation units 30 | SEPARATORS: dict[str, str] = { 31 | 'year': '-', 32 | 'month': '-', 33 | 'day': 'T', 34 | 'hour': ':', 35 | 'minute': ':', 36 | 'second': '.' 37 | } 38 | 39 | 40 | def _unique(arr: ArrayLike, is_delayed: bool) -> tuple[NDArray, NDArray]: 41 | """Return unique elements and their indices. 42 | 43 | Args: 44 | arr: Array of elements. 45 | is_delayed: If True, the array is delayed. 46 | Returns: 47 | Tuple of unique elements and their indices. 48 | Raises: 49 | ValueError: If the array is not monotonic. 50 | """ 51 | index: NDArray 52 | indices: NDArray 53 | 54 | if is_delayed: 55 | index, indices = abc.unique(arr) # type: ignore[arg-type] 56 | # We don't use here the function `numpy.diff` but `abc.difference` for 57 | # optimization purposes. 58 | if not numpy.all( 59 | abc.difference(index.view(numpy.int64)) >= 0): # type: ignore 60 | raise ValueError('index is not monotonic') 61 | return index, indices 62 | return abc.unique_and_check_monotony(arr) 63 | 64 | 65 | class Date(abc.Partitioning): 66 | """Initialize a partitioning scheme based on dates. 67 | 68 | Args: 69 | variables: A list of strings representing the variables to be used for 70 | partitioning. 71 | resolution: Time resolution of the partitioning. Must be in 72 | :data:`RESOLUTION`. 73 | 74 | Raises: 75 | ValueError: If the resolution is not in the list of supported 76 | resolutions or if the partitioning is not performed on a one 77 | dimensional variable. 78 | 79 | Example: 80 | >>> partitioning = Date(variables=("time", ), resolution="Y") 81 | """ 82 | __slots__ = ('_attrs', '_index', 'resolution') 83 | 84 | #: The ID of the partitioning scheme 85 | ID: ClassVar[str] = 'Date' 86 | 87 | def __init__(self, variables: Sequence[str], resolution: str) -> None: 88 | if len(variables) != 1: 89 | raise ValueError( 90 | 'Partitioning on dates is performed on a single variable.') 91 | if resolution not in RESOLUTION: 92 | raise ValueError('resolution must be in: ' + ', '.join(RESOLUTION)) 93 | index: int = RESOLUTION.index(resolution) + 1 94 | 95 | #: The time resolution of the partitioning 96 | self.resolution: str = resolution 97 | #: The time parts used for the partitioning 98 | self._attrs: tuple[str, ...] = UNITS[:index + 1] 99 | #: The indices of the time parts used for the partitioning 100 | self._index = tuple(range(index)) 101 | super().__init__(variables, 102 | tuple(DATA_TYPES[ix] for ix in self._index)) 103 | 104 | def _keys(self) -> Sequence[str]: 105 | """Return the keys of the partitioning scheme.""" 106 | return tuple(UNITS[ix] for ix in self._index) 107 | 108 | # pylint: disable=arguments-differ 109 | # False positive: the base method is static. 
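# For example, with resolution 'D', numpy.datetime64('2020-01-02') is mapped to the partition ('year=2020', 'month=01', 'day=02').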
110 | def _partition( # type: ignore[override] 111 | self, 112 | selection: tuple[tuple[str, Any], ...], 113 | ) -> tuple[str, ...]: 114 | """Return the partitioning scheme for the given selection.""" 115 | datetime64: NDArray = selection[0][1] 116 | py_datetime: datetime.datetime = datetime64.astype('M8[s]').item() 117 | return tuple(UNITS[ix] + '=' + 118 | f'{getattr(py_datetime, self._attrs[ix]):02d}' 119 | for ix in self._index) 120 | # pylint: enable=arguments-differ 121 | 122 | def _split( 123 | self, 124 | variables: dict[str, ArrayLike], 125 | ) -> Iterator[abc.Partition]: 126 | """Return the partitioning scheme for the given variables.""" 127 | index: NDArray 128 | indices: NDArray 129 | name: str 130 | values: ArrayLike 131 | 132 | # Determine if the variables are handled by Dask. 133 | is_delayed: bool = any( 134 | isinstance(value, dask.array.core.Array) 135 | for value in variables.values()) 136 | name, values = tuple(variables.items())[0] 137 | 138 | if not numpy.issubdtype(values.dtype, numpy.dtype('datetime64')): 139 | raise TypeError('values must be a datetime64 array') 140 | 141 | index, indices = _unique( 142 | values.astype(f'datetime64[{self.resolution}]'), is_delayed) 143 | indices = abc.concatenate_item(indices, values.size) 144 | 145 | return ((((name, date), ), slice(start, indices[ix + 1], None)) 146 | for date, (ix, start) in zip(index, enumerate(indices[:-1]))) 147 | 148 | @staticmethod 149 | def _stringify(partition: tuple[tuple[str, int], ...]) -> str: 150 | """Return a string representation of the partitioning scheme.""" 151 | string = ''.join(f'{value:02d}' + SEPARATORS[item] 152 | for item, value in partition) 153 | if string[-1] in SEPARATORS.values(): 154 | string = string[:-1] 155 | return string 156 | 157 | @staticmethod 158 | def join(partition_scheme: tuple[tuple[str, int], ...], sep: str) -> str: 159 | """Join a partitioning scheme. 160 | 161 | Args: 162 | partition_scheme: The partitioning scheme to be joined. 163 | sep: The separator to be used. 164 | 165 | Returns: 166 | The joined partitioning scheme. 167 | 168 | Example: 169 | >>> partitioning = Date(variables=("time", ), resolution="D") 170 | >>> partitioning.join((("year", 2020), ("month", 1), ("day", 1)), 171 | ... "/") 172 | 'year=2020/month=01/day=01' 173 | """ 174 | return sep.join(f'{k}={v:02d}' for k, v in partition_scheme) 175 | 176 | def encode( 177 | self, 178 | partition: tuple[tuple[str, int], ...], 179 | ) -> tuple[Any, ...]: 180 | """Encode a partitioning scheme. 181 | 182 | Args: 183 | partition: The partitioning scheme to be encoded. 184 | 185 | Returns: 186 | The encoded partitioning scheme. 187 | 188 | Example: 189 | >>> partitioning = Date(variables=("time", ), resolution="D") 190 | >>> fields = partitioning.parse("year=2020/month=01/day=01") 191 | >>> fields 192 | (("year", 2020), ("month", 1), ("day", 1)) 193 | >>> partitioning.encode(fields) 194 | (numpy.datetime64('2020-01-01'),) 195 | """ 196 | return (numpy.datetime64(self._stringify(partition)), ) 197 | 198 | def decode( 199 | self, 200 | values: tuple[Any, ...], 201 | ) -> tuple[tuple[str, int], ...]: 202 | """Decode a partitioning scheme. 203 | 204 | Args: 205 | values: The partitioning scheme to be decoded. 206 | 207 | Returns: 208 | The decoded partitioning scheme. 
209 | 210 | Example: 211 | >>> partitioning = Date(variables=("time", ), resolution="D") 212 | >>> partitioning.decode((numpy.datetime64('2020-01-01'), )) 213 | (("year", 2020), ("month", 1), ("day", 1)) 214 | """ 215 | datetime64: NDArray = values[0] 216 | py_datetime: datetime.datetime = datetime64.astype('M8[s]').item() 217 | return tuple((UNITS[ix], getattr(py_datetime, self._attrs[ix])) 218 | for ix in self._index) 219 | -------------------------------------------------------------------------------- /zcollection/variable/array.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | In memory variable arrays. 7 | ========================== 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any, Sequence 12 | 13 | import dask.array.core 14 | import dask.array.ma 15 | import numcodecs.abc 16 | import numpy 17 | import zarr 18 | 19 | from ..meta import Attribute 20 | from ..type_hints import ArrayLike, NDArray, NDMaskedArray 21 | from .abc import Variable, concat, new_variable, not_equal 22 | 23 | 24 | def _as_numpy_array( 25 | arr: Any, 26 | *, 27 | fill_value: Any | None = None, 28 | ) -> tuple[NDArray, Any]: 29 | """Convert an array-like object to a numpy array. 30 | 31 | Args: 32 | arr: An array-like object. 33 | fill_value: The fill value. 34 | 35 | Returns: 36 | If the data provided is a masked array, the functions return the array 37 | with masked data replaced by its fill value and the fill value of the 38 | offered masked array. Otherwise, the provided array and fill value. 39 | """ 40 | result: NDArray = numpy.asanyarray(arr) 41 | if isinstance(result, numpy.ma.MaskedArray): 42 | if fill_value is not None and not_equal(fill_value, result.fill_value): 43 | raise ValueError( 44 | f'The fill value {fill_value!r} does not match the fill value ' 45 | f'{result.fill_value!r} of the array.') 46 | return numpy.ma.filled(result, result.fill_value), result.fill_value 47 | return result, fill_value 48 | 49 | 50 | class Array(Variable): 51 | """Access to the chunked data using numpy arrays. 52 | 53 | Args: 54 | name: Name of the variable 55 | data: Variable data 56 | dimensions: Variable dimensions 57 | attrs: Variable attributes 58 | compressor: Compression codec 59 | fill_value: Value to use for uninitialized values 60 | filters: Filters to apply before writing data to disk 61 | """ 62 | 63 | def __init__(self, 64 | name: str, 65 | data: ArrayLike[Any], 66 | dimensions: Sequence[str], 67 | *, 68 | attrs: Sequence[Attribute] | None = None, 69 | compressor: numcodecs.abc.Codec | None = None, 70 | fill_value: Any | None = None, 71 | filters: Sequence[numcodecs.abc.Codec] | None = None) -> None: 72 | array: NDArray 73 | array, fill_value = _as_numpy_array(data, fill_value=fill_value) 74 | super().__init__( 75 | name, 76 | array, 77 | dimensions, 78 | attrs=attrs, 79 | compressor=compressor, 80 | fill_value=fill_value, 81 | filters=filters, 82 | ) 83 | 84 | @property 85 | def data(self) -> dask.array.core.Array: 86 | """Return the numpy array wrapped in a dask array. If the variable has 87 | a fill value, the result is a masked array where masked values are 88 | equal to the fill value. 89 | 90 | Returns: 91 | The dask array 92 | 93 | .. 
seealso:: 94 | 95 | :meth:`Variable.array` 96 | """ 97 | if self.fill_value is None: 98 | return dask.array.core.from_array(self.array) 99 | return dask.array.ma.masked_equal(self.array, self.fill_value) 100 | 101 | @property 102 | def values(self) -> NDArray | NDMaskedArray: 103 | """Return the variable data as a numpy array. 104 | 105 | .. note:: 106 | 107 | If the variable has a fill value, the result is a masked array where 108 | masked values are equal to the fill value. 109 | 110 | Returns: 111 | The variable data 112 | """ 113 | return self.array if self.fill_value is None else numpy.ma.masked_equal( 114 | self.array, self.fill_value) 115 | 116 | @values.setter 117 | def values(self, data: Any) -> None: 118 | """Defines the underlying numpy array. If the data provided is a masked 119 | array, it's converted to an array, where the masked values are replaced 120 | by its fill value, and its fill value becomes the new fill value of 121 | this instance. Otherwise, the underlying array is defined as the new 122 | data and the fill value is set to None. 123 | 124 | Args: 125 | data: The new data to use 126 | 127 | Raises: 128 | ValueError: If the shape of the data does not match the shape of 129 | the stored data. 130 | """ 131 | if len(data.shape) != len(self.dimensions): 132 | raise ValueError('data shape does not match variable dimensions') 133 | self.array, self.fill_value = _as_numpy_array( 134 | data, fill_value=self.fill_value) 135 | 136 | def persist(self, **_) -> Array: 137 | """Persist the variable data into memory. 138 | 139 | Returns: 140 | The variable 141 | """ 142 | return self 143 | 144 | def compute(self, **_) -> NDArray | NDMaskedArray: 145 | """Return the variable data as a numpy array. 146 | 147 | .. note:: 148 | 149 | If the variable has a fill value, the result is a masked array where 150 | masked values are equal to the fill value. 151 | """ 152 | return self.values 153 | 154 | def fill(self) -> Array: 155 | """Fill the variable with the fill value. If the variable has no fill 156 | value, this method does nothing. 157 | 158 | Returns: 159 | The variable. 160 | """ 161 | if self.fill_value is not None: 162 | self.array = numpy.full_like(self.array, self.fill_value) 163 | return self 164 | 165 | @classmethod 166 | def from_zarr(cls, array: zarr.Array, name: str, dimension: str, 167 | **kwargs) -> Array: 168 | """Create a new variable from a zarr array. 169 | 170 | Args: 171 | array: The zarr array 172 | name: Name of the variable 173 | dimension: Name of the attribute that defines the dimensions of the 174 | variable 175 | **kwargs: Additional arguments. These arguments are ignored, but 176 | they are accepted to be compatible with the base class. 177 | 178 | Returns: 179 | The variable 180 | """ 181 | attrs = tuple( 182 | Attribute(k, v) for k, v in array.attrs.items() if k != dimension) 183 | return new_variable(cls, 184 | name=name, 185 | array=array[...], 186 | dimensions=array.attrs[dimension], 187 | attrs=attrs, 188 | compressor=array.compressor, 189 | fill_value=array.fill_value, 190 | filters=tuple(array.filters or ())) 191 | 192 | def concat(self, other: Array | Sequence[Array], dim: str) -> Array: 193 | """Concatenate this variable with another variable or a list of 194 | variables along a dimension. 195 | 196 | Args: 197 | other: Variable or list of variables to concatenate with this 198 | variable. 199 | dim: Dimension to concatenate along. 200 | 201 | Returns: 202 | New variable. 
203 | 204 | Raises: 205 | ValueError: if the variables provided is an empty sequence or if 206 | any item in the sequence is not an instance of :class:`Array`. 207 | """ 208 | return concat(self, other, numpy.concatenate, dim) 209 | 210 | def __getitem__(self, key: Any) -> Any: 211 | """Get a slice of the variable. 212 | 213 | Args: 214 | key: Slice or index to use. 215 | Returns: 216 | The variable slice. 217 | """ 218 | return (self.array[key] if self.fill_value is None else 219 | numpy.ma.masked_equal(self.array[key], self.fill_value)) 220 | 221 | def rechunk(self, **_) -> Array: 222 | """Rechunk the variable. 223 | 224 | Returns: 225 | The variable. 226 | """ 227 | return self 228 | -------------------------------------------------------------------------------- /zcollection/indexing/tests/test_abc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """Test the base class for indexing.""" 6 | from __future__ import annotations 7 | 8 | from typing import Iterator 9 | import pathlib 10 | 11 | import fsspec 12 | import numpy 13 | import pyarrow 14 | import pytest 15 | 16 | from .. import abc 17 | from ... import collection, convenience, dataset, partitioning 18 | from ...partitioning.tests import data 19 | # pylint: disable=unused-import # Need to import for fixtures 20 | from ...tests.cluster import dask_client, dask_cluster 21 | from ...tests.fs import local_fs 22 | # pylint: enable=unused-import 23 | from ...type_hints import NDArray 24 | 25 | 26 | def split_half_orbit( 27 | cycle_number: numpy.ndarray, 28 | pass_number: numpy.ndarray, 29 | ) -> Iterator[tuple[int, int]]: 30 | """Calculate the indexes of the start and stop of each half-orbit. 31 | 32 | Args: 33 | pass_number: Pass numbers. 34 | 35 | Returns: 36 | Iterator of start and stop indexes. 37 | """ 38 | assert pass_number.shape == cycle_number.shape 39 | pass_idx = numpy.where(numpy.roll(pass_number, 1) != pass_number)[0] 40 | cycle_idx = numpy.where(numpy.roll(cycle_number, 1) != cycle_number)[0] 41 | 42 | half_orbit = numpy.unique( 43 | numpy.concatenate( 44 | (pass_idx, cycle_idx, numpy.array([pass_number.size], 45 | dtype='int64')))) 46 | del pass_idx, cycle_idx 47 | 48 | yield from tuple(zip(half_orbit[:-1], half_orbit[1:])) 49 | 50 | 51 | # pylint: disable=unused-argument,invalid-name 52 | # The signature of the function must follow the signature of 53 | # zcollection.PartitionCallable 54 | def _half_orbit( 55 | zds: dataset.Dataset, 56 | *args, 57 | dtype: numpy.dtype | None = None, 58 | **kwargs, 59 | ) -> NDArray: 60 | """Return the indexes of the start and stop of each half-orbit. 61 | 62 | Args: 63 | zds: Datasets stored in a partition to be indexed. 64 | 65 | Returns: 66 | Dictionary of start and stop indexes for each half-orbit. 
67 | """ 68 | pass_number_varname = kwargs.pop('pass_number', 'pass_number') 69 | cycle_number_varname = kwargs.pop('cycle_number', 'cycle_number') 70 | pass_number = zds.variables[pass_number_varname].values 71 | cycle_number = zds.variables[cycle_number_varname].values 72 | 73 | generator = (( 74 | i0, 75 | i1, 76 | cycle_number[i0], 77 | pass_number[i0], 78 | ) for i0, i1 in split_half_orbit(cycle_number, pass_number)) 79 | 80 | return numpy.fromiter( # type: ignore 81 | generator, dtype) 82 | 83 | 84 | class HalfOrbitIndexer(abc.Indexer): 85 | """Index SWOT collection by half-orbit.""" 86 | #: Column name of the cycle number. 87 | CYCLE_NUMBER = 'cycle_number' 88 | 89 | #: Column name of the pass number. 90 | PASS_NUMBER = 'pass_number' 91 | 92 | def dtype(self, /, **kwargs) -> list[tuple[str, str]]: 93 | """Return the columns of the index. 94 | 95 | Returns: 96 | A tuple of (name, type) pairs. 97 | """ 98 | return super().dtype() + [ 99 | (self.CYCLE_NUMBER, 'uint16'), 100 | (self.PASS_NUMBER, 'uint16'), 101 | ] 102 | 103 | @classmethod 104 | def create( 105 | cls, 106 | path: pathlib.Path | str, 107 | zds: collection.Collection, 108 | *, 109 | filesystem: fsspec.AbstractFileSystem | None = None, 110 | **kwargs, 111 | ) -> HalfOrbitIndexer: 112 | """Create a new index. 113 | 114 | Args: 115 | path: The path to the index. 116 | zds: The collection to be indexed. 117 | filesystem: The filesystem to use. 118 | 119 | Returns: 120 | The created index. 121 | """ 122 | return super()._create(path, 123 | zds, 124 | meta={'attribute': b'value'}, 125 | filesystem=filesystem) # type: ignore 126 | 127 | def update( 128 | self, 129 | zds: collection.Collection, 130 | *, 131 | partition_size: int | None = None, 132 | npartitions: int | None = None, 133 | **kwargs, 134 | ) -> None: 135 | """Update the index. 136 | 137 | Args: 138 | zds: New data stored in the collection to be indexed. 139 | partition_size: The length of each bag partition. 140 | npartitions: The number of desired bag partitions. 141 | cycle_number: The name of the cycle number variable stored in the 142 | collection. Defaults to "cycle_number". 143 | pass_number: The name of the pass number variable stored in the 144 | collection. Defaults to "pass_number". 
145 | """ 146 | super()._update(zds, 147 | _half_orbit, 148 | partition_size, 149 | npartitions, 150 | dtype=self.dtype(), 151 | **kwargs) 152 | 153 | 154 | def test_indexer( 155 | dask_client, # pylint: disable=redefined-outer-name,unused-argument 156 | local_fs, # pylint: disable=redefined-outer-name 157 | ): 158 | """Test the base class of the indexer.""" 159 | ds = dataset.Dataset.from_xarray(data.create_test_sequence(5, 20, 10)) 160 | 161 | zcollection = convenience.create_collection( 162 | 'time', 163 | ds, 164 | partitioning.Date(('time', ), 'M'), 165 | partition_base_dir=str(local_fs.collection), 166 | filesystem=local_fs.fs) 167 | zcollection.insert(ds, merge_callable=collection.merging.merge_time_series) 168 | 169 | indexer = HalfOrbitIndexer.create(str( 170 | local_fs.collection.joinpath('index.parquet')), 171 | zcollection, 172 | filesystem=local_fs.fs) 173 | 174 | # Index not yet created 175 | with pytest.raises(ValueError): 176 | _ = indexer.table 177 | 178 | assert indexer.dtype() == [('start', 'int64'), ('stop', 'int64'), 179 | ('cycle_number', 'uint16'), 180 | ('pass_number', 'uint16')] 181 | indexer.update(zcollection) 182 | assert isinstance(indexer.table, pyarrow.Table) 183 | 184 | selection = zcollection.load(indexer=indexer.query({'cycle_number': 2})) 185 | assert selection is not None 186 | assert set(selection.variables['cycle_number'].values) == {2} 187 | 188 | with pytest.raises(ValueError): 189 | indexer.query({'cycle_number': 3}, logical_op='X') 190 | 191 | with pytest.raises(ValueError): 192 | indexer.query({'X': 3}) 193 | 194 | # Updating the index should not change the indexer. 195 | indexer.update(zcollection) 196 | other = zcollection.load(indexer=indexer.query({'cycle_number': 2})) 197 | assert other is not None 198 | assert numpy.all( 199 | other['observation'].values == selection['observation'].values) 200 | 201 | selection = zcollection.load( 202 | indexer=indexer.query({'cycle_number': [2, 4]})) 203 | assert selection is not None 204 | assert set(selection.variables['cycle_number'].values) == {2, 4} 205 | 206 | selection = zcollection.load(indexer=indexer.query({ 207 | 'cycle_number': [2, 4], 208 | 'pass_number': 1 209 | })) 210 | assert selection is not None 211 | assert set(selection.variables['cycle_number'].values) == {2, 4} 212 | assert set(selection.variables['pass_number'].values) == {1} 213 | 214 | selection = zcollection.load(indexer=indexer.query({ 215 | 'cycle_number': [2, 4], 216 | 'pass_number': [1, 5] 217 | })) 218 | assert selection is not None 219 | assert set(selection.variables['cycle_number'].values) == {2, 4} 220 | assert set(selection.variables['pass_number'].values) == {1, 5} 221 | 222 | indexer = HalfOrbitIndexer.open(str( 223 | local_fs.collection.joinpath('index.parquet')), 224 | filesystem=local_fs.fs) 225 | assert indexer.meta == {'attribute': b'value'} 226 | selection = zcollection.load(indexer=indexer.query({ 227 | 'cycle_number': [2, 4], 228 | 'pass_number': [1, 5] 229 | })) 230 | assert selection is not None 231 | assert set(selection.variables['cycle_number'].values) == {2, 4} 232 | assert set(selection.variables['pass_number'].values) == {1, 5} 233 | 234 | indices = tuple( 235 | indexer.query({'cycle_number': [2, 4]}, only_partition_keys=False)) 236 | assert tuple(item[0] for item in indices[0][0]) == ( 237 | 'cycle_number', 238 | 'pass_number', 239 | 'year', 240 | 'month', 241 | ) 242 | 243 | indexer = HalfOrbitIndexer('', filesystem=fsspec.filesystem('memory')) 244 | assert indexer.query({'cycle_number': [2, 4]}) == 
() 245 | -------------------------------------------------------------------------------- /examples/ex_view.py: -------------------------------------------------------------------------------- 1 | """ 2 | Overview of a View. 3 | =================== 4 | 5 | This section outlines the steps required to get started with the main features 6 | of a ``View``. 7 | """ 8 | from typing import Iterator 9 | import pprint 10 | 11 | import dask.distributed 12 | import fsspec 13 | import numpy 14 | 15 | import zcollection 16 | import zcollection.tests.data 17 | 18 | 19 | # %% 20 | # Initialization of the environment 21 | # --------------------------------- 22 | # 23 | # As in the example of handling 24 | # :ref:`collections `, we will create 25 | # the test environment and a collection. 26 | def create_dataset() -> zcollection.Dataset: 27 | """Create a dataset to record.""" 28 | generator: Iterator[zcollection.Dataset] = \ 29 | zcollection.tests.data.create_test_dataset_with_fillvalue() 30 | return next(generator) 31 | 32 | 33 | cluster = dask.distributed.LocalCluster(processes=False) 34 | client = dask.distributed.Client(cluster) 35 | 36 | zds: zcollection.Dataset | None = create_dataset() 37 | assert zds is not None 38 | fs: fsspec.AbstractFileSystem = fsspec.filesystem('memory') 39 | collection: zcollection.Collection = zcollection.create_collection( 40 | 'time', 41 | zds, 42 | zcollection.partitioning.Date(('time', ), resolution='M'), 43 | '/view_reference', 44 | filesystem=fs) 45 | collection.insert(zds, merge_callable=zcollection.merging.merge_time_series) 46 | 47 | # %% 48 | # Creation of views 49 | # ----------------- 50 | # 51 | # A :py:class:`view` allows you to extend a collection 52 | # (:py:class:`a view reference`) that you are 53 | # not allowed to modify. 54 | view: zcollection.View = zcollection.create_view( 55 | '/my_view', 56 | zcollection.view.ViewReference('/view_reference', fs), 57 | filesystem=fs) 58 | 59 | # %% 60 | # .. note:: 61 | # 62 | # The created view can be accessed using the following command :: 63 | # 64 | # >>> view = zcollection.open_view("/my_view", filesystem=fs) 65 | # 66 | # Editing variables 67 | # ----------------- 68 | # When the view is created, it has no data of its own, it uses all the 69 | # partitions defined in the reference view. You can select the partitions used 70 | # from the reference collection by specifying the keyword argument ``filters`` 71 | # during the creation of the view. 72 | pprint.pprint(fs.listdir('/my_view')) 73 | 74 | # %% 75 | # It's not yet possible to read data from the view, as it does not yet have any 76 | # data. To minimize the risk of mismatches with the reference view, the data 77 | # present in the view drives the range of data that can be read. 78 | try: 79 | view.load() 80 | except ValueError as err: 81 | print(err) 82 | 83 | # %% 84 | # Such a state of the view is not very interesting. But it is possible to 85 | # :py:meth:`add` and modify variables in 86 | # order to enhance the view. 87 | var3_template: zcollection.meta.Variable = zds.metadata().variables['var2'] 88 | var3_template.name = 'var3' 89 | view.add_variable(var3_template) 90 | del var3_template 91 | 92 | # %% 93 | # This step creates all necessary partitions for the new variable. 94 | pprint.pprint(fs.listdir('/my_view/year=2000')) 95 | 96 | # %% 97 | # The new variable is not initialized. 
98 | zds = view.load() 99 | assert zds is not None 100 | zds.variables['var3'].values 101 | 102 | # %% 103 | # The same principle used by the collection allows you to 104 | # :py:meth:`update` the variables. 105 | view.update( 106 | lambda ds: dict(var3=ds['var1'].values * 0 + 1)) # type: ignore[arg-type] 107 | 108 | # %% 109 | # Like the :py:meth:`update` method 110 | # of the collection, the update method of the view allows selecting the 111 | # neighboring partitions with the keyword argument ``depth``. 112 | 113 | # %% 114 | zds = view.load() 115 | assert zds is not None 116 | var3: numpy.ndarray = zds['var3'].values 117 | print(var3) 118 | 119 | # %% 120 | # **Warning**: The variables of the reference collection cannot be edited. 121 | try: 122 | view.update( 123 | lambda ds: dict(var2=ds['var2'].values * 0)) # type: ignore[arg-type] 124 | except ValueError as exc: 125 | print(str(exc)) 126 | 127 | 128 | # %% 129 | # Sync the view with the reference 130 | # -------------------------------- 131 | # The view can no longer be read if the number of elements in the reference 132 | # collection and in the view differ. To avoid this problem, the view 133 | # is automatically synchronized when it is opened. The data already present in 134 | # the view are kept only if the reference collection has been extended (new 135 | # data appended after the existing data). The existing arrays in the view are 136 | # resized and filled with the defined fill values. If you want to know which 137 | # partitions have been synchronized, use the following data flow: open the 138 | # view without synchronizing it (``resync=False``), then call the ``sync`` 139 | # method of the view class to obtain a filter that selects all the partitions 140 | # that have been modified. 141 | # 142 | # Let's illustrate this data flow with an example. 143 | # 144 | # First, we create a utility function to resize a dataset. 145 | def resize(ds: zcollection.Dataset, dim: str, 146 | size: int) -> zcollection.Dataset: 147 | """Resize a dataset.""" 148 | 149 | def new_shape( 150 | var: zcollection.Variable, 151 | selected_dim: str, 152 | new_size: int, 153 | ) -> tuple[int, ...]: 154 | """Compute the new shape of a variable.""" 155 | return tuple(new_size if dim == selected_dim else size 156 | for dim, size in zip(var.dimensions, var.shape)) 157 | 158 | return zcollection.Dataset([ 159 | zcollection.Array( 160 | name, 161 | numpy.resize(var.array.compute(), new_shape(var, dim, size)), 162 | var.dimensions, 163 | attrs=var.attrs, 164 | compressor=var.compressor, 165 | fill_value=var.fill_value, 166 | filters=var.filters, 167 | ) for name, var in ds.variables.items() 168 | ]) 169 | 170 | 171 | # %% 172 | # We then modify the last partition of the reference collection. We start by 173 | # opening the reference collection and loading the last partition. 174 | collection = zcollection.open_collection('/view_reference', 175 | filesystem=fs, 176 | mode='w') 177 | zds = collection.load( 178 | filters=lambda keys: keys['month'] == 6 and keys['year'] == 2000) 179 | assert zds is not None 180 | 181 | # %% 182 | # We create a new time variable, resize the dataset and insert the new time 183 | # values. 
184 | time: numpy.ndarray = numpy.arange( 185 | numpy.datetime64('2000-06-01T00:00:00'), 186 | numpy.datetime64('2000-06-30T23:59:59'), 187 | numpy.timedelta64(1, 'h'), 188 | ) 189 | zds = resize(zds, 'num_lines', len(time)) 190 | zds['time'].values = time 191 | 192 | # %% 193 | # Finally, we update the partition in the reference collection. 194 | collection.insert(zds) 195 | 196 | # %% 197 | # Now we cannot load the view, because the shape of the last partition is no 198 | # longer consistent between the reference collection and the view. 199 | try: 200 | view.load() 201 | except ValueError as err: 202 | print(err) 203 | 204 | # %% 205 | # We call the ``sync`` method to resynchronize the view. 206 | filters = view.sync() 207 | 208 | # %% 209 | # The method returns a callable that can be used to filter the partitions that 210 | # have been synchronized. You can use this information to perform an 211 | # :py:meth:`update` of the view on the 212 | # synchronized partitions: :: 213 | # 214 | # view.update( 215 | # lambda ds: dict(var3=ds['var1'].values * 0 + 1), 216 | # filters=filters) 217 | # 218 | print(tuple(view.partitions(filters=filters))) 219 | 220 | # %% 221 | # The view is now synchronized and can be loaded. 222 | zds = view.load() 223 | assert zds is not None 224 | zds.variables['var3'].values 225 | 226 | # %% 227 | # Map a function over the view 228 | # ---------------------------- 229 | # It's possible to map a function over the partitions of the view. 230 | for partition, array in view.map(lambda ds: ( # type: ignore[arg-type] 231 | ds['var1'].values + ds['var2'].values)).compute(): 232 | print(f' * partition = {partition}: mean = {array.mean()}') 233 | 234 | # %% 235 | # .. seealso:: 236 | # 237 | # See the :py:meth:`map_overlap` method 238 | # to apply a function over the partitions of the view while also selecting 239 | # the neighboring partitions. 240 | # 241 | # Drop a variable 242 | # ---------------- 243 | # A method allows you to 244 | # :py:meth:`drop_variable` variables from 245 | # the view. 246 | view.drop_variable('var3') 247 | try: 248 | view.load() 249 | except ValueError as err: 250 | # The view no longer has any variable of its own. 251 | print(err) 252 | 253 | # %% 254 | # **Warning**: The variables of the reference collection cannot be dropped. 255 | try: 256 | view.drop_variable('var2') 257 | except ValueError as exc: 258 | print(str(exc)) 259 | 260 | # %% 261 | # Close the local cluster to avoid printing warning messages in the other 262 | # examples. 263 | client.close() 264 | cluster.close() 265 | -------------------------------------------------------------------------------- /zcollection/tests/test_meta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Metadata testing. 7 | ================= 8 | """ 9 | from __future__ import annotations 10 | 11 | from typing import Any 12 | import json 13 | import pathlib 14 | import pickle 15 | 16 | import numpy 17 | import pytest 18 | import zarr.codecs 19 | 20 | from ..
import meta 21 | 22 | 23 | def test_attribute() -> None: 24 | """Test attribute creation.""" 25 | att = meta.Attribute('a', 23.4) 26 | assert isinstance(att, meta.Attribute) 27 | assert att.name == 'a' 28 | assert att.value == 23.4 29 | assert str(att) == "Attribute('a', 23.4)" 30 | # pylint: disable=comparison-with-itself 31 | assert att == att 32 | assert (att == 'X') is False 33 | assert att != meta.Attribute('a', '23.4') 34 | assert isinstance(meta.Attribute.from_config(att.get_config()), 35 | meta.Attribute) 36 | 37 | att = meta.Attribute('a', numpy.arange(10)) 38 | assert att == meta.Attribute('a', numpy.arange(10)) 39 | 40 | att = meta.Attribute('a', numpy.datetime64('2000-01-01', 'us')) 41 | assert att == att 42 | # pylint: enable=comparison-with-itself 43 | 44 | 45 | def test_dimension() -> None: 46 | """Test dimension creation.""" 47 | dim = meta.Dimension('a', 12) 48 | assert isinstance(dim, meta.Dimension) 49 | assert dim.name == 'a' 50 | assert dim.value == 12 51 | assert str(dim) == "Dimension('a', 12)" 52 | # pylint: disable=comparison-with-itself 53 | assert dim == dim 54 | # pylint: enable=comparison-with-itself 55 | assert dim != meta.Dimension('a', 11) 56 | assert isinstance(meta.Dimension.from_config(dim.get_config()), 57 | meta.Dimension) 58 | 59 | 60 | def test_variable() -> None: 61 | """Test variable creation.""" 62 | var = meta.Variable('a', 63 | numpy.dtype('int16'), 64 | dimensions=('a', ), 65 | attrs=(meta.Attribute('x', 12), ), 66 | compressor=zarr.codecs.Zlib(), 67 | filters=(zarr.codecs.Delta(numpy.float64, numpy.int16), 68 | zarr.codecs.FixedScaleOffset( 69 | 0, 1, numpy.int16))) 70 | assert isinstance(var, meta.Variable) 71 | assert str(var) == "Variable('a')" 72 | # pylint: disable=comparison-with-itself 73 | assert var == var 74 | assert (var == 2) is False 75 | other: meta.Variable = meta.Variable.from_config(var.get_config()) 76 | assert var == other 77 | other.name = 'x' 78 | assert var != other 79 | # pylint: enable=comparison-with-itself 80 | 81 | 82 | def test_dataset() -> None: 83 | """Test dataset creation.""" 84 | root: pathlib.Path = pathlib.Path(__file__).parent 85 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 86 | first: dict[str, Any] = json.load(stream) 87 | with root.joinpath('second_dataset.json').open(encoding='utf-8') as stream: 88 | second: dict[str, Any] = json.load(stream) 89 | ds: meta.Dataset = meta.Dataset.from_config(first) 90 | other: meta.Dataset = meta.Dataset.from_config(second) 91 | assert ds == other 92 | assert (ds == 2) is False 93 | assert (ds != other) is False 94 | ds.dimensions = ds.dimensions + ('dummy', ) 95 | assert ds != other 96 | 97 | 98 | def test_select_variables() -> None: 99 | """Test select_variables.""" 100 | root: pathlib.Path = pathlib.Path(__file__).parent 101 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 102 | config: dict[str, Any] = json.load(stream) 103 | ds: meta.Dataset = meta.Dataset.from_config(config) 104 | variables: set[str] = ds.select_variables(('longitude', 'latitude')) 105 | assert variables == {'longitude', 'latitude'} 106 | variables = ds.select_variables(drop_variables=('longitude', 'latitude')) 107 | assert set(variables) & {'longitude', 'latitude'} == set() 108 | variables = ds.select_variables(keep_variables=('longitude', 'latitude', 109 | 'time'), 110 | drop_variables=('time', )) 111 | assert variables == {'longitude', 'latitude'} 112 | 113 | 114 | def test_search_same_dimensions_as() -> None: 115 | """Test 
search_same_dimensions_as.""" 116 | root: pathlib.Path = pathlib.Path(__file__).parent 117 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 118 | first: dict[str, Any] = json.load(stream) 119 | ds: meta.Dataset = meta.Dataset.from_config(first) 120 | other: meta.Variable = ds.search_same_dimensions_as( 121 | ds.variables['simulated_error_karin']) 122 | assert other.dimensions == ds.variables['simulated_error_karin'].dimensions 123 | 124 | other = meta.Variable.from_config(other.get_config()) 125 | other.dimensions = other.dimensions + ('dummy', ) 126 | with pytest.raises(ValueError): 127 | ds.search_same_dimensions_as(other) 128 | 129 | 130 | def test_pickle() -> None: 131 | """Test pickling.""" 132 | root: pathlib.Path = pathlib.Path(__file__).parent 133 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 134 | data: dict[str, Any] = json.load(stream) 135 | ds: meta.Dataset = meta.Dataset.from_config(data) 136 | other: meta.Dataset = pickle.loads(pickle.dumps(ds)) 137 | assert ds == other 138 | 139 | 140 | def test_missing_variables() -> None: 141 | """Test missing_variables.""" 142 | root: pathlib.Path = pathlib.Path(__file__).parent 143 | with root.joinpath('first_dataset.json').open(encoding='utf-8') as stream: 144 | data: dict[str, Any] = json.load(stream) 145 | ds: meta.Dataset = meta.Dataset.from_config(data) 146 | other: meta.Dataset = pickle.loads(pickle.dumps(ds)) 147 | 148 | assert len(ds.missing_variables(other)) == 0 149 | 150 | del other.variables['cross_track_distance'] 151 | del other.variables['cycle_number'] 152 | 153 | assert set(ds.missing_variables(other)) == { 154 | 'cross_track_distance', 'cycle_number' 155 | } 156 | 157 | other.variables['XXX'] = other.variables['longitude'] 158 | other.variables['XXX'].name = 'XXX' 159 | with pytest.raises(ValueError): 160 | ds.missing_variables(other) 161 | 162 | 163 | def test_add_variable() -> None: 164 | """Test adding a variable.""" 165 | ds = meta.Dataset(('x', 'y'), []) 166 | ds.add_variable(meta.Variable('a', numpy.float64, dimensions=('x', 'y'))) 167 | 168 | with pytest.raises(ValueError): 169 | ds.add_variable( 170 | meta.Variable('a', numpy.float64, dimensions=('x', 'y'))) 171 | 172 | ds.add_variable(meta.Variable('b', numpy.float64, dimensions=('x', ))) 173 | ds.add_variable(meta.Variable('c', numpy.float64, dimensions=('y', ))) 174 | 175 | with pytest.raises(ValueError): 176 | ds.add_variable( 177 | meta.Variable('d', numpy.float64, dimensions=('a', 'y'))) 178 | 179 | with pytest.raises(ValueError): 180 | ds.add_variable( 181 | meta.Variable('e', numpy.float64, dimensions=('a', 'b'))) 182 | 183 | with pytest.raises(ValueError): 184 | ds.add_variable(meta.Variable('f', numpy.float64, dimensions=('a', ))) 185 | 186 | ds.add_variable(meta.Variable('g', numpy.float64)) 187 | 188 | 189 | def test_select_variables_by_dims() -> None: 190 | """Test select_variable_by_dims.""" 191 | ds = meta.Dataset(('a', 'b', 'x', 'y'), []) 192 | ds.add_variable(meta.Variable('a', numpy.float64, dimensions=('x', 'y'))) 193 | ds.add_variable(meta.Variable('b', numpy.float64, dimensions=('x', ))) 194 | ds.add_variable(meta.Variable('c', numpy.float64, dimensions=('y', ))) 195 | ds.add_variable(meta.Variable('d', numpy.float64, dimensions=('a', 'y'))) 196 | ds.add_variable(meta.Variable('e', numpy.float64, dimensions=('a', 'b'))) 197 | ds.add_variable(meta.Variable('f', numpy.float64, dimensions=('a', ))) 198 | ds.add_variable(meta.Variable('g', numpy.float64)) 199 | 200 | assert 
ds.select_variables_by_dims(('x', 'y')) == {'a', 'b', 'c', 'd'} 201 | assert ds.select_variables_by_dims(('x', )) == {'a', 'b'} 202 | assert ds.select_variables_by_dims(('y', )) == {'a', 'c', 'd'} 203 | assert ds.select_variables_by_dims(('a', 'y')) == {'a', 'c', 'd', 'e', 'f'} 204 | assert ds.select_variables_by_dims(('a', 'b')) == {'d', 'e', 'f'} 205 | assert ds.select_variables_by_dims(('a', )) == {'d', 'e', 'f'} 206 | assert ds.select_variables_by_dims(()) == {'g'} 207 | assert ds.select_variables_by_dims(('z', )) == set() 208 | 209 | assert ds.select_variables_by_dims(('x', 'y'), 210 | predicate=False) == {'e', 'f', 'g'} 211 | assert ds.select_variables_by_dims( 212 | ('x', ), predicate=False) == {'c', 'd', 'e', 'f', 'g'} 213 | assert ds.select_variables_by_dims( 214 | ('y', ), predicate=False) == {'b', 'e', 'f', 'g'} 215 | assert ds.select_variables_by_dims(('a', 'y'), 216 | predicate=False) == {'b', 'g'} 217 | assert ds.select_variables_by_dims( 218 | ('a', 'b'), predicate=False) == {'a', 'b', 'c', 'g'} 219 | assert ds.select_variables_by_dims( 220 | ('a', ), predicate=False) == {'a', 'b', 'c', 'g'} 221 | assert ds.select_variables_by_dims( 222 | (), predicate=False) == {'a', 'b', 'c', 'd', 'e', 'f'} 223 | assert ds.select_variables_by_dims( 224 | ('z', ), predicate=False) == {'a', 'b', 'c', 'd', 'e', 'f', 'g'} 225 | -------------------------------------------------------------------------------- /zcollection/tests/test_fs_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 CNES 2 | # 3 | # All rights reserved. Use of this source code is governed by a 4 | # BSD-style license that can be found in the LICENSE file. 5 | """ 6 | Testing utilities 7 | ================= 8 | """ 9 | from typing import Any 10 | import os 11 | import pathlib 12 | import platform 13 | 14 | import fsspec 15 | import fsspec.implementations.local 16 | 17 | from .. import fs_utils 18 | # pylint: disable=unused-import # Need to import for fixtures 19 | from .cluster import dask_client, dask_cluster 20 | 21 | # pylint: disable=unused-import 22 | 23 | #: Test data 24 | TEXT = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam porta 25 | turpis dictum, porta tellus eu, convallis mi. Integer at placerat diam. Donec in 26 | various neque. Morbi sed nisi finibus, mattis velit non, pulvinar metus. Duis 27 | feugiat diam eget augue posuere, nec aliquam dolor tristique. Aliquam a dolor 28 | vel ante sagittis dictum vel at dolor. Suspendisse velit dolor, vestibulum eget 29 | aliquet ut, imperdiet at justo. Nullam sit amet suscipit orci, bibendum sagittis 30 | orci. Aliquam mattis feugiat rutrum. Vivamus fermentum ex non mauris faucibus 31 | vehicula. Donec odio lacus, viverra et hendrerit eu, mollis eget mauris. Duis 32 | suscipit, velit nec finibus ullamcorper, nisi lorem fermentum tellus, ut viverra 33 | nunc lorem ut odio. Duis eget ligula maximus, venenatis nulla a, commodo dolor. 34 | Aenean justo sapien, mollis aliquam vestibulum id, suscipit a ligula. Phasellus 35 | porta arcu erat, elementum faucibus leo auctor vel. 
Integer vel pharetra leo.''' 36 | 37 | 38 | def test_join_path() -> None: 39 | """Test the join_path function.""" 40 | assert fs_utils.join_path('a', 'b', 'c') == 'a/b/c' 41 | assert fs_utils.join_path('a', 'b', 'c', 'd') == 'a/b/c/d' 42 | assert fs_utils.join_path('a', 'b', 'c', 'd', 'e') == 'a/b/c/d/e' 43 | assert fs_utils.join_path('a', 'b', 'c', 'd', 'e', 'f') == 'a/b/c/d/e/f' 44 | 45 | 46 | def test_get_fs() -> None: 47 | """Test the get_fs function.""" 48 | fs = fs_utils.get_fs('file') 49 | assert isinstance(fs, fsspec.implementations.local.LocalFileSystem) 50 | fs = fs_utils.get_fs() 51 | assert isinstance(fs, fsspec.implementations.local.LocalFileSystem) 52 | 53 | 54 | def test_fs_walk(tmpdir) -> None: 55 | """Test the fs_walk function.""" 56 | item: Any 57 | 58 | for idx, item in enumerate([ 59 | ('year=2014', 'month=5'), 60 | ('year=2014', 'month=5', 'day=2'), 61 | ('year=2014', 'month=5', 'day=1'), 62 | ('year=2014', 'month=5', 'day=3'), 63 | ('year=2014', 'month=4'), 64 | ('year=2014', 'month=4', 'day=16'), 65 | ('year=2014', 'month=4', 'day=24'), 66 | ('year=2014', 'month=4', 'day=27'), 67 | ('year=2014', 'month=4', 'day=20'), 68 | ('year=2014', 'month=4', 'day=29'), 69 | ('year=2014', 'month=4', 'day=14'), 70 | ('year=2014', 'month=4', 'day=25'), 71 | ('year=2014', 'month=4', 'day=19'), 72 | ('year=2014', 'month=4', 'day=12'), 73 | ('year=2014', 'month=4', 'day=23'), 74 | ('year=2014', 'month=4', 'day=17'), 75 | ('year=2014', 'month=4', 'day=28'), 76 | ('year=2014', 'month=4', 'day=13'), 77 | ('year=2014', 'month=4', 'day=21'), 78 | ('year=2014', 'month=4', 'day=15'), 79 | ('year=2014', 'month=4', 'day=18'), 80 | ('year=2014', 'month=4', 'day=26'), 81 | ('year=2014', 'month=4', 'day=22'), 82 | ('year=2014', 'month=4', 'day=30'), 83 | ]): 84 | path = pathlib.Path(tmpdir).joinpath(*item) 85 | path.mkdir(parents=True, exist_ok=False) 86 | if 'day' in item[-1]: 87 | with path.joinpath(f'file_{idx}.txt').open(mode='w', 88 | encoding='utf-8'): 89 | ... 90 | 91 | fs = fs_utils.get_fs() 92 | listing1 = [] 93 | for root, _dirs, files in fs_utils.fs_walk(fs, str(tmpdir), sort=True): 94 | for item in files: 95 | listing1.append(fs.sep.join([root, item])) 96 | 97 | listing2 = [] 98 | for root, _dirs, files in fs_utils.fs_walk(fs, str(tmpdir), sort=False): 99 | for item in files: 100 | listing2.append(fs.sep.join([root, item])) 101 | 102 | assert listing1 == sorted(listing2) 103 | 104 | assert list( 105 | fs_utils.fs_walk(fs, 106 | str(pathlib.Path(tmpdir).joinpath('inexistent')), 107 | sort=True)) == [('', [], [])] 108 | 109 | 110 | def test_normalize_path() -> None: 111 | """Test the normalize_path function.""" 112 | fs = fsspec.filesystem('file') 113 | root = str(pathlib.Path('/').resolve()) 114 | if platform.system() == 'Windows': 115 | # fsspec returns only the drive letter for the root path. 
116 | root = root.replace('\\', '') 117 | 118 | def istrcmp(str1, str2): 119 | """Case insensitive string comparison.""" 120 | if platform.system() == 'Windows': 121 | str1 = str1.replace('\\', '/') 122 | str2 = str2.replace('\\', '/') 123 | return str1.lower() == str2.lower() 124 | 125 | assert istrcmp(fs_utils.normalize_path(fs, '/'), root) 126 | assert istrcmp(fs_utils.normalize_path(fs, './foo'), 127 | str(pathlib.Path('.').resolve() / 'foo')) 128 | 129 | fs = fsspec.filesystem('memory') 130 | assert fs_utils.normalize_path(fs, '/') == os.path.sep 131 | assert fs_utils.normalize_path(fs, './foo') == f'{os.path.sep}foo' 132 | 133 | fs = fsspec.filesystem('s3') 134 | assert fs_utils.normalize_path(fs, '/') == '/' 135 | assert fs_utils.normalize_path(fs, './foo') == './foo' 136 | 137 | 138 | def test_copy_file(tmpdir) -> None: 139 | """Test the copy file across different file systems.""" 140 | fs_source = fsspec.filesystem('file') 141 | fs_target = fsspec.filesystem('memory') 142 | path = str(tmpdir / 'foo.txt') 143 | with fs_source.open(path, mode='wb', encoding='utf-8') as stream: 144 | stream.write(TEXT.encode('utf-8')) 145 | fs_utils.copy_file(path, 'foo.txt', fs_source, fs_target) 146 | 147 | assert fs_target.cat('foo.txt').decode('utf-8') == TEXT 148 | 149 | 150 | def test_copy_files(tmpdir) -> None: 151 | """Test the copy files across different file systems.""" 152 | source = tmpdir / 'source' 153 | target = tmpdir / 'target' 154 | fs_source = fsspec.filesystem('file') 155 | fs_target = fsspec.filesystem('file') 156 | fs_source.mkdir(source) 157 | fs_target.mkdir(target) 158 | paths = [ 159 | str(source / item) for item in ( 160 | 'foo.txt', 161 | 'bar.txt', 162 | 'baz.txt', 163 | ) 164 | ] 165 | for path in paths: 166 | with fs_source.open(path, mode='wb', encoding='utf-8') as stream: 167 | stream.write(TEXT.encode('utf-8')) 168 | fs_utils.copy_files(paths, str(target), fs_source, fs_target) 169 | 170 | for item in fs_target.ls(str(target)): 171 | assert fs_target.cat(item).decode('utf-8') == TEXT 172 | 173 | 174 | def test_copy_tree(tmpdir) -> None: 175 | """Test the copy tree across different file systems.""" 176 | item: Any 177 | fs_source = fsspec.filesystem('file') 178 | fs_target = fsspec.filesystem('memory') 179 | 180 | for idx, item in enumerate([ 181 | ('year=2014', 'month=5'), 182 | ('year=2014', 'month=5', 'day=2'), 183 | ('year=2014', 'month=5', 'day=1'), 184 | ('year=2014', 'month=5', 'day=3'), 185 | ('year=2014', 'month=4'), 186 | ('year=2014', 'month=4', 'day=16'), 187 | ('year=2014', 'month=4', 'day=24'), 188 | ('year=2014', 'month=4', 'day=27'), 189 | ('year=2014', 'month=4', 'day=20'), 190 | ('year=2014', 'month=4', 'day=29'), 191 | ('year=2014', 'month=4', 'day=14'), 192 | ('year=2014', 'month=4', 'day=25'), 193 | ('year=2014', 'month=4', 'day=19'), 194 | ('year=2014', 'month=4', 'day=12'), 195 | ('year=2014', 'month=4', 'day=23'), 196 | ('year=2014', 'month=4', 'day=17'), 197 | ('year=2014', 'month=4', 'day=28'), 198 | ('year=2014', 'month=4', 'day=13'), 199 | ('year=2014', 'month=4', 'day=21'), 200 | ('year=2014', 'month=4', 'day=15'), 201 | ('year=2014', 'month=4', 'day=18'), 202 | ('year=2014', 'month=4', 'day=26'), 203 | ('year=2014', 'month=4', 'day=22'), 204 | ('year=2014', 'month=4', 'day=30'), 205 | ]): 206 | path = fs_utils.join_path(str(tmpdir), *item) 207 | fs_source.makedirs(path, exist_ok=False) 208 | if 'day' in item[-1]: 209 | with fs_source.open(fs_utils.join_path(path, f'file_{idx}.txt'), 210 | mode='wb', 211 | encoding='utf-8') as stream: 212 
| stream.write(TEXT.encode('utf-8')) 213 | 214 | fs_utils.copy_tree(str(tmpdir), '/tree', fs_source, fs_target) 215 | 216 | for root, dirs, files in fs_utils.fs_walk(fs_target, '/tree'): 217 | for item in files: 218 | assert fs_target.cat(fs_utils.join_path( 219 | root, item)).decode('utf-8') == TEXT 220 | for item in dirs: 221 | item = item.replace('\\', '/') 222 | parts = item.replace('/tree/', '').split(fs_target.sep) 223 | assert parts[0] == 'year=2014' 224 | if len(parts) > 1: 225 | assert parts[1] in ['month=4', 'month=5'] 226 | if len(parts) > 2: 227 | assert 'day=' in parts[2] 228 | if len(parts) > 3: 229 | assert 'file_' in parts[3] 230 | assert parts[3].endswith('.txt') 231 | --------------------------------------------------------------------------------
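A minimal sketch (the ``/tmp/partitions`` source path is hypothetical, and this snippet is not part of the repository) showing how the ``fs_utils`` helpers exercised by the tests above can be combined to mirror a local directory tree into an in-memory filesystem:

import fsspec

from zcollection import fs_utils

source_fs = fs_utils.get_fs()            # defaults to the local filesystem
target_fs = fsspec.filesystem('memory')  # in-memory target filesystem

source = '/tmp/partitions'               # hypothetical directory to mirror
fs_utils.copy_tree(source, '/mirror', source_fs, target_fs)

# Walk the mirrored tree in sorted order and print every copied file.
for root, _dirs, files in fs_utils.fs_walk(target_fs, '/mirror', sort=True):
    for name in files:
        print(fs_utils.join_path(root, name))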