├── .github ├── PULL_REQUEST_TEMPLATE.md ├── codecov.yml └── workflows │ ├── main.yml │ ├── release.yml │ ├── typing.yml │ └── upstream.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── LICENSE ├── README.md ├── citation.cff ├── conftest.py ├── docs ├── Makefile ├── _static │ └── custom.css ├── api.rst ├── conf.py ├── contributing.md ├── core_team_guide.md ├── custom_readers.md ├── data_structures.md ├── examples.md ├── faq.md ├── index.md ├── installation.md ├── make.bat ├── releases.rst └── usage.md ├── examples ├── append │ └── noaa-cdr-sst.ipynb ├── coiled │ └── terraclimate.ipynb ├── mursst-icechunk-with-lithops │ ├── Dockerfile │ ├── README.md │ ├── __init__.py │ ├── cli.py │ ├── config.py │ ├── ec2_for_lithops_runtime │ │ ├── 00-create-security-group.sh │ │ ├── 01-launch-ec2.sh │ │ ├── 02-setup-ec2-role.sh │ │ ├── 03-setup-ec2.sh │ │ └── README.md │ ├── helpers.py │ ├── lithops.yaml │ ├── lithops_functions.py │ ├── main.py │ ├── models.py │ ├── repo.py │ ├── requirements.txt │ ├── url_utils.py │ ├── virtual_datasets.py │ └── zarr_operations.py └── virtualizarr-with-lithops │ ├── Dockerfile_virtualizarr │ ├── README.md │ ├── lithops.yaml │ ├── requirements.txt │ └── virtualizarr-with-lithops.py ├── pyproject.toml └── virtualizarr ├── __init__.py ├── accessor.py ├── backend.py ├── codecs.py ├── manifests ├── __init__.py ├── array.py ├── array_api.py ├── group.py ├── manifest.py ├── store.py └── utils.py ├── parallel.py ├── py.typed ├── readers ├── __init__.py ├── api.py ├── dmrpp.py ├── fits.py ├── hdf │ ├── __init__.py │ ├── filters.py │ └── hdf.py ├── hdf5.py ├── kerchunk.py ├── netcdf3.py ├── tiff.py └── zarr.py ├── tests ├── __init__.py ├── conftest.py ├── test_backend.py ├── test_codecs.py ├── test_integration.py ├── test_manifests │ ├── __init__.py │ ├── test_array.py │ ├── test_group.py │ ├── test_manifest.py │ └── test_store.py ├── test_readers │ ├── __init__.py │ ├── conftest.py │ ├── test_dmrpp.py │ ├── test_fits.py │ ├── test_hdf │ │ ├── test_hdf.py │ │ ├── test_hdf_filters.py │ │ ├── test_hdf_integration.py │ │ └── test_hdf_manifest_store.py │ ├── test_kerchunk.py │ ├── test_netcdf3.py │ └── test_zarr.py ├── test_utils.py ├── test_writers │ ├── __init__.py │ ├── conftest.py │ ├── test_icechunk.py │ └── test_kerchunk.py └── test_xarray.py ├── translators ├── __init__.py └── kerchunk.py ├── types ├── __init__.py ├── general.py └── kerchunk.py ├── utils.py ├── vendor ├── __init__.py └── zarr │ ├── __init__.py │ └── core │ ├── __init__.py │ ├── common.py │ └── metadata.py ├── writers ├── __init__.py ├── icechunk.py └── kerchunk.py └── xarray.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [ ] Closes #xxxx 4 | - [ ] Tests added 5 | - [ ] Tests passing 6 | - [ ] Full type hint coverage 7 | - [ ] Changes are documented in `docs/releases.rst` 8 | - [ ] New functions/methods are listed in `api.rst` 9 | - [ ] New functionality has documentation 10 | -------------------------------------------------------------------------------- /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 75 6 | # See https://json.schemastore.org/codecov.json 7 | threshold: "0.1%" 8 | patch: 9 | default: 10 | target: 75 11 | comment: 12 | layout: "diff, files" 13 | behavior: default 14 | require_changes: true # if true: only post the comment if coverage changes 15 | branches: # branch names 
that can post comment 16 | - "main" 17 | - "develop" 18 | ignore: 19 | - "conftest.py" 20 | - "virtualizarr/tests" # ignore folders and all its contents 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" , "develop"] 6 | paths-ignore: 7 | - 'docs/**' 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | paths-ignore: 11 | - 'docs/**' 12 | schedule: 13 | - cron: "0 0 * * *" 14 | 15 | concurrency: 16 | group: ${{ github.workflow }}-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | test: 21 | name: ${{ matrix.environment }}-build 22 | runs-on: ubuntu-latest 23 | defaults: 24 | run: 25 | shell: bash -l {0} 26 | strategy: 27 | matrix: 28 | environment: [test-py311, test-py312, min-deps, minio] 29 | steps: 30 | - uses: actions/checkout@v4 31 | - uses: prefix-dev/setup-pixi@v0.8.3 32 | with: 33 | pixi-version: v0.41.4 34 | environments: ${{ matrix.environment }} 35 | 36 | - name: List installed libraries 37 | run: | 38 | pixi install --environment ${{ matrix.environment }} 39 | pixi list --environment ${{ matrix.environment }} 40 | 41 | - name: Running Tests 42 | run: | 43 | pixi run -e ${{ matrix.environment }} run-tests-xml-cov 44 | 45 | - name: Upload code coverage to Codecov 46 | uses: codecov/codecov-action@v3.1.4 47 | with: 48 | file: ./coverage.xml 49 | flags: unittests 50 | env_vars: OS,PYTHON 51 | name: codecov-umbrella 52 | fail_ci_if_error: false 53 | token: ${{ secrets.CODECOV_TOKEN }} 54 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build distribution 2 | on: 3 | release: 4 | types: 5 | - published 6 | # push: 7 | # branches: [ "main" ] 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | 11 | permissions: 12 | contents: read 13 | 14 | env: 15 | PIP_ROOT_USER_ACTION: ignore 16 | 17 | jobs: 18 | build-artifacts: 19 | runs-on: ubuntu-latest 20 | if: github.repository == 'zarr-developers/VirtualiZarr' 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | - uses: actions/setup-python@v5.0.0 26 | name: Install Python 27 | with: 28 | python-version: "3.12" 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install setuptools setuptools-scm wheel twine check-manifest 33 | git clean -xdf 34 | git restore -SW . 35 | - name: Build tarball and wheels 36 | run: | 37 | python -m build --sdist --wheel . 
38 | - name: Check built artifacts 39 | run: | 40 | python -m twine check --strict dist/* 41 | pwd 42 | if [ -f dist/virtualizarr-unknown.tar.gz ]; then 43 | echo "❌ INVALID VERSION NUMBER" 44 | exit 1 45 | else 46 | echo "✅ Looks good" 47 | fi 48 | - uses: actions/upload-artifact@v4 49 | with: 50 | name: releases 51 | path: dist 52 | 53 | test-built-dist: 54 | needs: build-artifacts 55 | runs-on: ubuntu-latest 56 | environment: 57 | name: test-release 58 | url: https://test.pypi.org/p/virtualizarr 59 | permissions: 60 | id-token: write 61 | steps: 62 | - uses: actions/setup-python@v5.0.0 63 | name: Install Python 64 | with: 65 | python-version: "3.12" 66 | - uses: actions/download-artifact@v4 67 | with: 68 | name: releases 69 | path: dist 70 | - name: List contents of built dist 71 | run: | 72 | ls -ltrh 73 | ls -ltrh dist 74 | - name: Verify the built dist/wheel is valid 75 | run: | 76 | python -m pip install --upgrade pip 77 | python -m pip install dist/virtualizarr*.whl 78 | python -c "import virtualizarr; print(virtualizarr.__version__)" 79 | - name: Publish package to TestPyPI 80 | if: github.event_name == 'release' 81 | uses: pypa/gh-action-pypi-publish@v1.8.14 82 | with: 83 | repository-url: https://test.pypi.org/legacy/ 84 | # verbose: true 85 | 86 | upload-to-pypi: 87 | needs: test-built-dist 88 | if: github.event_name == 'release' 89 | runs-on: ubuntu-latest 90 | environment: 91 | name: release 92 | url: https://pypi.org/p/virtualizarr 93 | permissions: 94 | id-token: write 95 | steps: 96 | - uses: actions/download-artifact@v4 97 | with: 98 | name: releases 99 | path: dist 100 | - name: Publish package to PyPI 101 | uses: pypa/gh-action-pypi-publish@v1.8.14 102 | -------------------------------------------------------------------------------- /.github/workflows/typing.yml: -------------------------------------------------------------------------------- 1 | name: Typing 2 | 3 | on: 4 | push: 5 | branches: [ "main" , "develop"] 6 | paths-ignore: 7 | - 'docs/**' 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | paths-ignore: 11 | - 'docs/**' 12 | schedule: 13 | - cron: "0 0 * * *" 14 | 15 | concurrency: 16 | group: ${{ github.workflow }}-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | env: 20 | PIP_ROOT_USER_ACTION: ignore 21 | 22 | jobs: 23 | mypy: 24 | name: mypy 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v4 28 | - uses: prefix-dev/setup-pixi@v0.8.3 29 | with: 30 | pixi-version: v0.41.4 31 | - name: List installed libraries 32 | run: | 33 | pixi install --environment test 34 | pixi list --environment test 35 | - name: Type check 36 | run: | 37 | pixi run -e test run-mypy 38 | -------------------------------------------------------------------------------- /.github/workflows/upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream 2 | 3 | on: 4 | push: 5 | branches: [ "main" , "develop"] 6 | paths-ignore: 7 | - 'docs/**' 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | types: [ labeled ] 11 | paths-ignore: 12 | - 'docs/**' 13 | schedule: 14 | - cron: "0 0 * * *" 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | test-upstream: 22 | name: ${{ matrix.environment }}-build 23 | if: | 24 | github.event_name == 'push' || 25 | github.event_name == 'schedule' || 26 | (github.event_name == 'pull_request' && github.event.label.name == 'test-upstream') 27 | runs-on: ubuntu-latest 28 | defaults: 29 | run: 
30 | shell: bash -l {0} 31 | strategy: 32 | matrix: 33 | environment: [upstream] 34 | steps: 35 | - uses: actions/checkout@v4 36 | - uses: prefix-dev/setup-pixi@v0.8.3 37 | with: 38 | pixi-version: v0.41.4 39 | environments: ${{ matrix.environment }} 40 | 41 | - name: List installed libraries 42 | run: | 43 | pixi install --environment ${{ matrix.environment }} 44 | pixi list --environment ${{ matrix.environment }} 45 | 46 | - name: Running Tests 47 | run: | 48 | pixi run -e ${{ matrix.environment }} run-tests-xml-cov 49 | 50 | - name: Upload code coverage to Codecov 51 | uses: codecov/codecov-action@v3.1.4 52 | with: 53 | file: ./coverage.xml 54 | flags: unittests 55 | env_vars: OS,PYTHON 56 | name: codecov-umbrella 57 | fail_ci_if_error: false 58 | token: ${{ secrets.CODECOV_TOKEN }} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | virtualizarr/_version.py 162 | docs/generated/ 163 | docs/jupyter_execute/ 164 | examples/ 165 | 166 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode 167 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode 168 | 169 | ### VisualStudioCode ### 170 | .vscode 171 | 172 | # Local History for Visual Studio Code 173 | .history/ 174 | 175 | # Built Visual Studio Code Extensions 176 | *.vsix 177 | 178 | ### VisualStudioCode Patch ### 179 | # Ignore all local history of files 180 | .history 181 | .ionide 182 | 183 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode 184 | 185 | # Pixi folder 186 | .pixi/ 187 | 188 | # python virtual environment 189 | .venv 190 | venv 191 | 192 | # Pixi lock file (because it changes with every upstream commit) 193 | pixi.lock 194 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # https://pre-commit.com/ 2 | ci: 3 | autoupdate_schedule: monthly 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v5.0.0 7 | hooks: 8 | - id: trailing-whitespace 9 | - id: end-of-file-fixer 10 | - id: check-yaml 11 | 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | # Ruff version. 14 | rev: v0.11.8 15 | hooks: 16 | # Run the linter. 17 | - id: ruff 18 | args: [ --fix ] 19 | # Run the formatter. 
20 | - id: ruff-format 21 | - repo: https://github.com/citation-file-format/cff-converter-python 22 | rev: ebf0b5e44d67f8beaa1cd13a0d0393ea04c6058d 23 | hooks: 24 | - id: validate-cff 25 | - repo: https://github.com/codespell-project/codespell 26 | rev: v2.4.1 27 | hooks: 28 | - id: codespell 29 | args: ["-L", "fo,ihs,kake,te", "-S", "fixture"] 30 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | # this ensures a viable `mamba` is on `$PATH`` 9 | python: mambaforge-latest 10 | commands: 11 | - mamba install -c conda-forge -c nodefaults pixi 12 | - pixi install --environment docs 13 | - pixi run build-docs 14 | - pixi run readthedocs 15 | 16 | # Build documentation in the doc/ directory with Sphinx 17 | sphinx: 18 | configuration: docs/conf.py 19 | fail_on_warning: true 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VirtualiZarr 2 | 3 | [![CI](https://github.com/zarr-developers/VirtualiZarr/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/zarr-developers/VirtualiZarr/actions?query=workflow%3ACI) 4 | [![Code coverage](https://codecov.io/gh/zarr-developers/VirtualiZarr/branch/main/graph/badge.svg?flag=unittests)](https://codecov.io/gh/zarr-developers/VirtualiZarr) 5 | [![Docs](https://readthedocs.org/projects/virtualizarr/badge/?version=latest)](https://virtualizarr.readthedocs.io/en/latest/) 6 | [![Linted and Formatted with Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 7 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 8 | [![pre-commit Enabled](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://pre-commit.com/) 9 | [![Apache 2.0 License](https://img.shields.io/badge/license-Apache%202-cb2533.svg)](https://www.apache.org/licenses/LICENSE-2.0) 10 | [![Python Versions](https://img.shields.io/python/required-version-toml?tomlFilePath=https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/main/pyproject.toml&logo=Python&logoColor=gold&label=Python)](https://docs.python.org) 11 | [![slack](https://img.shields.io/badge/slack-virtualizarr-purple.svg?logo=slack)](https://join.slack.com/t/earthmover-community/shared_invite/zt-32to7398i-HorUXmzPzyy9U87yLxweIA) 12 | [![Latest Release](https://img.shields.io/github/v/release/zarr-developers/VirtualiZarr)](https://github.com/zarr-developers/VirtualiZarr/releases) 13 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/virtualizarr?label=pypi%7Cdownloads)](https://pypistats.org/packages/virtualizarr) 14 | [![Conda - Downloads](https://img.shields.io/conda/d/conda-forge/virtualizarr 15 | )](https://anaconda.org/conda-forge/virtualizarr) 16 | 17 | 18 | 19 | ## Cloud-Optimize your Scientific Data as a Virtual Zarr Datacube, using Xarray syntax. 20 | 21 | The best way to distribute large scientific datasets is via the Cloud, in [Cloud-Optimized formats](https://guide.cloudnativegeo.org/) [^1]. But often this data is stuck in archival pre-Cloud file formats such as netCDF. 
22 | 23 | **VirtualiZarr[^2] makes it easy to create "Virtual" Zarr datacubes, allowing performant access to archival data as if it were in the Cloud-Optimized [Zarr format](https://zarr.dev/), _without duplicating any data_.** 24 | 25 | Please see the [documentation](https://virtualizarr.readthedocs.io/en/stable/index.html). 26 | 27 | ### Features 28 | 29 | * Create virtual references pointing to bytes inside an archival file with [`open_virtual_dataset`](https://virtualizarr.readthedocs.io/en/latest/usage.html#opening-files-as-virtual-datasets). 30 | * Supports a [range of archival file formats](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare), including netCDF4 and HDF5, and has a pluggable system for supporting new formats. 31 | * [Combine data from multiple files](https://virtualizarr.readthedocs.io/en/latest/usage.html#combining-virtual-datasets) into one larger datacube using [xarray's combining functions](https://docs.xarray.dev/en/stable/user-guide/combining.html), such as [`xarray.concat`](https://docs.xarray.dev/en/stable/generated/xarray.concat.html). 32 | * Commit the virtual references to storage either using the [Kerchunk references](https://fsspec.github.io/kerchunk/spec.html) specification or the [Icechunk](https://icechunk.io/) transactional storage engine. 33 | * Users access the virtual datacube simply as a single zarr-compatible store using [`xarray.open_zarr`](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html). 34 | 35 | ### Inspired by Kerchunk 36 | 37 | VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [Kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk but in a zarr-native way, and with a familiar array-like API. 38 | 39 | You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. 40 | 41 | ### Development Status and Roadmap 42 | 43 | VirtualiZarr version 1 (mostly) achieves [feature parity](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) with kerchunk's logic for combining datasets, providing an easier way to manipulate kerchunk references in memory and generate kerchunk reference files on disk. 44 | 45 | Future VirtualiZarr development will focus on generalizing and upstreaming useful concepts into the Zarr specification, the Zarr-Python library, Xarray, and possibly some new packages. 46 | 47 | We have a lot of ideas, including: 48 | - [Zarr v3 support](https://github.com/zarr-developers/VirtualiZarr/issues/17) 49 | - [Zarr-native on-disk chunk manifest format](https://github.com/zarr-developers/zarr-specs/issues/287) 50 | - ["Virtual concatenation"](https://github.com/zarr-developers/zarr-specs/issues/288) of separate Zarr arrays 51 | - ManifestArrays as an [intermediate layer in-memory](https://github.com/zarr-developers/VirtualiZarr/issues/71) in Zarr-Python 52 | - [Separating CF-related Codecs from xarray](https://github.com/zarr-developers/VirtualiZarr/issues/68#issuecomment-2197682388) 53 | - [Generating references without kerchunk](https://github.com/zarr-developers/VirtualiZarr/issues/78) 54 | 55 | If you see other opportunities then we would love to hear your ideas! 
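### Quick example

To make the workflow described under Features concrete, here is a minimal sketch adapted from the Usage section of the documentation. The `/my/files*.nc` glob is just a placeholder for your own archival files, and the final step assumes the kerchunk optional dependency is installed:

```python
import glob

import xarray as xr

from virtualizarr import open_virtual_dataset

# create a virtual dataset (references + metadata only) for each archival file
virtual_datasets = [
    open_virtual_dataset(filepath)
    for filepath in glob.glob('/my/files*.nc')
]

# combine the virtual datasets into one larger virtual datacube using xarray
virtual_ds = xr.combine_nested(virtual_datasets, concat_dim=['time'])

# persist the virtual references, here as a kerchunk-format JSON reference file
virtual_ds.virtualize.to_kerchunk('combined.json', format='json')

# later, open the virtual datacube as if it were a normal Zarr store
ds = xr.open_dataset('combined.json', engine='kerchunk', chunks={})
```

No data is copied at any point; the reference file is simply an on-disk lookup table that points back into the original netCDF files.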
56 | 57 | ### Talks and Presentations 58 | 59 | - 2024/11/21 - MET Office Architecture Guild - Tom Nicholas - [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-talk-at-met-office) 60 | - 2024/11/13 - Cloud-Native Geospatial conference - Raphael Hagen - [Slides](https://decks.carbonplan.org/cloud-native-geo/11-13-24) 61 | - 2024/07/24 - ESIP Meeting - Sean Harkins - [Event](https://2024julyesipmeeting.sched.com/event/1eVP6) / [Recording](https://youtu.be/T6QAwJIwI3Q?t=3689) 62 | - 2024/05/15 - Pangeo showcase - Tom Nicholas - [Event](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127/2) / [Recording](https://youtu.be/ioxgzhDaYiE) / [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-create-virtual-zarr-stores-using-xarray-syntax) 63 | 64 | ### Credits 65 | 66 | This package was originally developed by [Tom Nicholas](https://github.com/TomNicholas) whilst working at [[C]Worthy](https://cworthy.org), who deserve credit for allowing him to prioritise a generalizable open-source solution to the dataset virtualization problem. VirtualiZarr is now a community-owned multi-stakeholder project. 67 | 68 | ### Licence 69 | 70 | Apache 2.0 71 | 72 | ### References 73 | 74 | [^1]: [_Cloud-Native Repositories for Big Scientific Data_, Abernathey et. al., _Computing in Science & Engineering_.](https://ieeexplore.ieee.org/abstract/document/9354557) 75 | 76 | [^2]: (Pronounced "Virtual-Eye-Zarr" - like "virtualizer" but more piratey 🦜) 77 | -------------------------------------------------------------------------------- /citation.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: "VirtualiZarr" 4 | abstract: "Create virtual Zarr stores for cloud-friendly access to archival data, using familiar xarray syntax." 5 | license: Apache-2.0 6 | repository-code: "https://github.com/zarr-developers/VirtualiZarr" 7 | authors: 8 | - family-names: "Nicholas" 9 | given-names: "Thomas" 10 | orcid: "https://orcid.org/0000-0002-2176-0530" 11 | - family-names: "Hagen" 12 | given-names: "Norland" 13 | orcid: "https://orcid.org/0000-0000-0000-0000" 14 | - family-names: "Harkins" 15 | given-names: "Sean" 16 | orcid: "https://orcid.org/0000-0000-0000-0000" 17 | - family-names: "Barciauskas" 18 | given-names: "Aimee" 19 | orcid: "https://orcid.org/0000-0002-3158-9554" 20 | - family-names: "Jones" 21 | given-names: "Max" 22 | orcid: "https://orcid.org/0000-0003-0180-8928" 23 | - family-names: "Signell" 24 | given-names: "Julia" 25 | orcid: "https://orcid.org/0000-0002-4120-3192" 26 | - family-names: "Nag" 27 | given-names: "Ayush" 28 | orcid: "https://orcid.org/0009-0008-1790-597X" 29 | - family-names: "Hidalgo" 30 | given-names: "Gustavo" 31 | orcid: "https://orcid.org/0000-0000-0000-0000" 32 | - family-names: "Augspurger" 33 | given-names: "Tom" 34 | orcid: "https://orcid.org/0000-0002-8136-7087" 35 | - family-names: "Abernathey" 36 | given-names: "Ryan" 37 | orcid: "https://orcid.org/0000-0001-5999-4917" 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .bd-sidebar-primary { 2 | display: none; !important; 3 | } 4 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | API Reference 3 | ############# 4 | 5 | .. currentmodule:: virtualizarr 6 | 7 | VirtualiZarr has a small API surface, because most of the complexity is handled by xarray functions like ``xarray.concat`` and ``xarray.merge``. 8 | Users can use xarray for every step apart from reading and serializing virtual references. 9 | 10 | User API 11 | ======== 12 | 13 | Reading 14 | ------- 15 | 16 | .. currentmodule:: virtualizarr.backend 17 | .. autosummary:: 18 | :nosignatures: 19 | :toctree: generated/ 20 | 21 | open_virtual_dataset 22 | 23 | Serialization 24 | ------------- 25 | 26 | .. currentmodule:: virtualizarr.accessor 27 | .. autosummary:: 28 | :nosignatures: 29 | :toctree: generated/ 30 | 31 | VirtualiZarrDatasetAccessor.to_kerchunk 32 | VirtualiZarrDatasetAccessor.to_icechunk 33 | VirtualiZarrDataTreeAccessor.to_icechunk 34 | 35 | Information 36 | ----------- 37 | 38 | .. currentmodule:: virtualizarr.accessor 39 | .. autosummary:: 40 | :nosignatures: 41 | :toctree: generated/ 42 | 43 | VirtualiZarrDatasetAccessor.nbytes 44 | 45 | Rewriting 46 | --------- 47 | 48 | .. currentmodule:: virtualizarr.accessor 49 | .. autosummary:: 50 | :nosignatures: 51 | :toctree: generated/ 52 | 53 | VirtualiZarrDatasetAccessor.rename_paths 54 | 55 | Developer API 56 | ============= 57 | 58 | If you want to write a new reader to create virtual references pointing to a custom file format, you will need to use VirtualiZarr's internal classes. 59 | 60 | Manifests 61 | --------- 62 | 63 | VirtualiZarr uses these classes to store virtual references internally. 64 | 65 | .. currentmodule:: virtualizarr.manifests 66 | .. autosummary:: 67 | :nosignatures: 68 | :toctree: generated/ 69 | 70 | ChunkManifest 71 | ManifestArray 72 | 73 | 74 | Array API 75 | --------- 76 | 77 | VirtualiZarr's :py:class:`~virtualizarr.ManifestArray` objects support a limited subset of the Python Array API standard in :py:mod:`virtualizarr.manifests.array_api`. 78 | 79 | .. currentmodule:: virtualizarr.manifests.array_api 80 | .. autosummary:: 81 | :nosignatures: 82 | :toctree: generated/ 83 | 84 | concatenate 85 | stack 86 | expand_dims 87 | broadcast_to 88 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "virtualizarr" 10 | copyright = "2024, Thomas Nicholas" 11 | author = "Thomas Nicholas" 12 | 13 | # -- General configuration --------------------------------------------------- 14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 15 | 16 | 17 | extensions = [ 18 | "myst_nb", 19 | "sphinx.ext.autodoc", 20 | "sphinx.ext.autosummary", 21 | "sphinx.ext.extlinks", 22 | "sphinx.ext.intersphinx", 23 | "sphinx_copybutton", 24 | "sphinx_togglebutton", 25 | "sphinx_design", 26 | "sphinx.ext.napoleon", 27 | ] 28 | 29 | extlinks = { 30 | "issue": ("https://github.com/zarr-developers/virtualizarr/issues/%s", "GH%s"), 31 | "pull": ("https://github.com/zarr-developers/virtualizarr/pull/%s", "PR%s"), 32 | "discussion": ("https://github.com/zarr-developers/virtualizarr/discussions/%s", "D%s"), 33 | } 34 | 35 | # Example configuration for intersphinx: refer to the Python standard library. 36 | # use in refs e.g: 37 | # :ref:`comparison manual ` 38 | intersphinx_mapping = { 39 | "python": ("https://docs.python.org/3/", None), 40 | "numpy": ("https://numpy.org/doc/stable/", None), 41 | "zarr": ("https://zarr.readthedocs.io/en/stable/", None), 42 | "xarray": ("https://docs.xarray.dev/en/stable/", None), 43 | "obstore": ("https://developmentseed.org/obstore/latest/", None), 44 | } 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ["_templates"] 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # The language for content autogenerated by Sphinx. Refer to documentation 53 | # for a list of supported languages. 54 | # 55 | # This is also used if you do content translation via gettext catalogs. 56 | # Usually you set "language" from the command line for these cases. 57 | language = "en" 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This patterns also effect to html_static_path and html_extra_path 62 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 63 | 64 | # The name of the Pygments (syntax highlighting) style to use. 65 | pygments_style = "sphinx" 66 | 67 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
68 | todo_include_todos = False 69 | 70 | # -- Myst Options ------------------------------------------------- 71 | 72 | myst_heading_anchors = 3 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 76 | 77 | html_theme = "pydata_sphinx_theme" 78 | html_theme_options = { 79 | "use_edit_page_button": True, 80 | "icon_links": [ 81 | { 82 | "name": "GitHub", 83 | "url": "https://github.com/zarr-developers/VirtualiZarr", 84 | "icon": "fa-brands fa-github", 85 | "type": "fontawesome", 86 | }, 87 | ] 88 | } 89 | html_title = "VirtualiZarr" 90 | html_context = { 91 | "github_user": "zarr-developers", 92 | "github_repo": "VirtualiZarr", 93 | "github_version": "main", 94 | "doc_path": "docs", 95 | } 96 | 97 | # remove sidebar, see GH issue #82 98 | html_css_files = [ 99 | 'custom.css', 100 | ] 101 | 102 | # html_logo = "_static/_future_logo.png" 103 | 104 | html_static_path = ["_static"] 105 | 106 | 107 | # issues 108 | # dark mode/lm switch 109 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome and encouraged! We ask only that all contributors follow the [Zarr Developers Code of Conduct](https://github.com/zarr-developers/.github/blob/main/CODE_OF_CONDUCT.md). 4 | 5 | ## Contributing code 6 | 7 | Before opening a PR to contribute code you should check that your changes work by running the test suite locally. 8 | 9 | ```{important} 10 | :name: dependencies 11 | We use [pixi](https://pixi.sh/latest/) to manage dependencies, which you'll want to install to get started. 12 | ``` 13 | 14 | Run tests with the `pixi run --environment test run-tests` command. Some tests require downloading files over the network. 15 | Use the `run-tests-no-network` task if you want to run tests faster or have no internet access: 16 | 17 | ```bash 18 | # Run all tests 19 | pixi run --environment test run-tests 20 | # Skip tests that require a network connection 21 | pixi run --environment test run-tests-no-network 22 | ``` 23 | 24 | You can also run tests in other environments: 25 | 26 | ```bash 27 | pixi run --environment min-deps run-tests # Test with the minimal set of dependencies installed 28 | pixi run --environment upstream run-tests # Test with unreleased versions of upstream libraries 29 | # List which versions are installed in the `min-deps` environment 30 | pixi list --environment min-deps 31 | ``` 32 | 33 | Further, the `pytest-cov` plugin is a test dependency, so you can generate a test 34 | coverage report locally, if you wish (CI will automatically do so). 
Here are some 35 | examples: 36 | 37 | ```bash 38 | pixi run --environment test run-tests-cov # Terminal report showing missing coverage 39 | pixi run --environment test run-tests-html-cov # HTML report written to htmlcov/index.html 40 | ``` 41 | 42 | Rather than using pixi tasks (essentially aliases for running commands in a given shell), you can explicitly start 43 | a shell within a given environment and execute `pytest` (or other commands) directly: 44 | 45 | ```bash 46 | # Start a shell within the environment 47 | pixi shell --environment test 48 | # Run the tests 49 | pytest virtualizarr 50 | # Exit the shell 51 | exit 52 | ``` 53 | 54 | If you run into issues with the development environment, here are some recommended steps: 55 | - Update pixi using `pixi self-update` and then retry the development workflow. 56 | - Clean up environments using `pixi clean` and then retry the development workflow. 57 | - Manually find and clean the cache dir listed in `pixi info` and then retry the development workflow. 58 | - Ask for help in the [VirtualiZarr channel of the Earthmover community slack](https://earthmover-community.slack.com/archives/C08EXCE8ZQX). 59 | 60 | ### Code standards 61 | 62 | #### Pre-commit 63 | 64 | All code must conform to the PEP8 standard. `VirtualiZarr` uses a set of `pre-commit` hooks and the `pre-commit` bot to format, type-check, and prettify the codebase. `pre-commit` can be installed locally by running: 65 | 66 | ``` 67 | python -m pip install pre-commit 68 | ``` 69 | The hooks can be installed locally by running: 70 | 71 | ``` 72 | pre-commit install 73 | ``` 74 | 75 | This runs the checks every time a commit is created locally. These checks will also run on every commit pushed to an open PR, resulting in some automatic styling fixes by the `pre-commit` bot. By default the checks only run on the files modified by a commit, but they can be triggered for all files by running: 76 | 77 | ``` 78 | pre-commit run --all-files 79 | ``` 80 | 81 | If you would like to skip the failing checks and push the code for further discussion, use the `--no-verify` option with `git commit`. 82 | 83 | #### Private functions 84 | 85 | `VirtualiZarr` uses the following convention for private functions: 86 | 87 | - Functions are preceded with an `_` (single underscore) if they should only be used within that module and may change at any time. 88 | - Functions without a preceding `_` (single underscore) are treated as relatively stable by the rest of the codebase, but not for public use (i.e. they are stable developer API). 89 | - Public functions are documented in the fully public API and should follow the backwards-compatibility expectations of Effective Effort Versioning. 90 | 91 | ## Contributing documentation 92 | 93 | Whilst the CI will build the updated documentation for each PR, it can also be useful to check that the documentation has rendered as expected by building it locally. 94 | 95 | ### Build the documentation locally 96 | 97 | ```bash 98 | pixi install --environment docs 99 | pixi run build-docs 100 | ``` 101 | Pixi can also be used to serve a continuously updating version of the documentation during development at [http://0.0.0.0:8000/](http://0.0.0.0:8000/).
102 | This can be done by navigating to [http://0.0.0.0:8000/](http://0.0.0.0:8000/) in your browser after running: 103 | 104 | ```bash 105 | pixi run serve-docs 106 | ``` 107 | 108 | ### Access the documentation locally 109 | 110 | Open `docs/_build/html/index.html` in a web browser (on MacOS you can do this from the terminal using `open docs/_build/html/index.html`). 111 | 112 | ## Making a release 113 | 114 | Anyone with commit privileges to the repository can issue a release, and you should feel free to issue a release at any point in time when all the CI tests on `main` are passing. 115 | 116 | 1. Decide on the release version number for the new release, following the [EffVer](https://jacobtomlinson.dev/effver/) versioning scheme (e.g., releasing v0.2.0 as the next release after v0.1.0 denotes that “some small effort may be required to make sure this version works for you”). 117 | 2. Write a high-level summary of the changes in this release, and write it into the release notes in `docs/releases.rst`. Create and merge a PR which adds the summary and also changes the release notes to say today's date and the version number of the new release. Don't add the blank template for future releases yet. 118 | 3. Navigate to the [https://github.com/zarr-developers/virtualizarr/releases](https://github.com/zarr-developers/virtualizarr/releases) releases page. 119 | 4. Select 'Draft a new release'. 120 | 5. Select 'Choose a tag', then 'Create a new tag' 121 | 6. Enter the name for the new tag (i.e. the release version number). 122 | 7. Click 'Generate Release Notes' to draft notes based on merged pull requests, and paste the same release summary you wrote earlier at the top. 123 | 8. Edit the draft release notes for consistency. 124 | 9. Select 'Publish' to publish the release. This should automatically upload the new release to [PyPI](https://pypi.org/project/virtualizarr/) and [conda-forge](https://anaconda.org/conda-forge/virtualizarr). 125 | 10. Check that this has run successfully (PyPI should show the new version number very quickly, but conda-forge might take several hours). 126 | 11. Create and merge a PR to add a new empty section to the `docs/releases.rst` for the next release in the future. See [this commit](https://github.com/zarr-developers/VirtualiZarr/commit/e3912f08e22f2e3230af6eb1a2aacb5728822fa1) for an example (you can assume the next release will be numbered `vX.Y.Z+1`, but the number doesn't actually matter). 127 | 12. (Optional) Advertise the release on social media 📣 128 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | The following examples demonstrate the use of VirtualiZarr to create virtual datasets of various kinds: 4 | 5 | 1. [Appending new daily NOAA SST data to Icechunk](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/append/noaa-cdr-sst.ipynb) 6 | 2. [Parallel reference generation using Coiled Functions](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/coiled/terraclimate.ipynb) 7 | 3. [Serverless parallel reference generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/virtualizarr-with-lithops) 8 | 4. 
[MUR SST Virtual and Zarr Icechunk Store Generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/mursst-icechunk-with-lithops) 9 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # VirtualiZarr 2 | 3 | **Create virtual Zarr stores for cloud-friendly access to archival data, using familiar xarray syntax.** 4 | 5 | The best way to distribute large scientific datasets is via the Cloud, in [Cloud-Optimized formats](https://guide.cloudnativegeo.org/) [^1]. But often this data is stuck in archival pre-Cloud file formats such as netCDF. 6 | 7 | **VirtualiZarr[^2] makes it easy to create "Virtual" Zarr stores, allowing performant access to archival data as if it were in the Cloud-Optimized [Zarr format](https://zarr.dev/), _without duplicating any data_.** 8 | 9 | ## Motivation 10 | 11 | "Virtualized data" solves an incredibly important problem: accessing big archival datasets via a cloud-optimized pattern, but without copying or modifying the original data in any way. This is a win-win-win for users, data engineers, and data providers. Users see fast-opening zarr-compliant stores that work performantly with libraries like xarray and dask, data engineers can provide this speed by adding a lightweight virtualization layer on top of existing data (without having to ask anyone's permission), and data providers don't have to change anything about their archival files for them to be used in a cloud-optimized way. 12 | 13 | VirtualiZarr aims to make the creation of cloud-optimized virtualized zarr data from existing scientific data as easy as possible. 14 | 15 | ## Features 16 | 17 | * Create virtual references pointing to bytes inside an archival file with [`open_virtual_dataset`](https://virtualizarr.readthedocs.io/en/latest/usage.html#opening-files-as-virtual-datasets), 18 | * Supports a [range of archival file formats](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare), including netCDF4 and HDF5, 19 | * [Combine data from multiple files](https://virtualizarr.readthedocs.io/en/latest/usage.html#combining-virtual-datasets) into one larger store using [xarray's combining functions](https://docs.xarray.dev/en/stable/user-guide/combining.html), such as [`xarray.concat`](https://docs.xarray.dev/en/stable/generated/xarray.concat.html), 20 | * Commit the virtual references to storage either using the [Kerchunk references](https://fsspec.github.io/kerchunk/spec.html) specification or the [Icechunk](https://icechunk.io/) transactional storage engine. 21 | * Users access the virtual dataset using [`xarray.open_dataset`](https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html#xarray.open_dataset). 22 | 23 | ## Inspired by Kerchunk 24 | 25 | VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [Kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk but in a zarr-native way, and with a familiar array-like API. 26 | 27 | You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
28 | 29 | ## Usage 30 | 31 | Creating the virtual store looks very similar to how we normally open data with xarray: 32 | 33 | ```python 34 | from virtualizarr import open_virtual_dataset 35 | 36 | virtual_datasets = [ 37 | open_virtual_dataset(filepath) 38 | for filepath in glob.glob('/my/files*.nc') 39 | ] 40 | 41 | # this Dataset wraps a bunch of virtual ManifestArray objects directly 42 | virtual_ds = xr.combine_nested(virtual_datasets, concat_dim=['time']) 43 | 44 | # cache the combined dataset pattern to disk, in this case using the existing kerchunk specification for reference files 45 | virtual_ds.virtualize.to_kerchunk('combined.json', format='json') 46 | ``` 47 | 48 | Now you can open your shiny new Zarr store instantly: 49 | 50 | ```python 51 | ds = xr.open_dataset('combined.json', engine='kerchunk', chunks={}) # normal xarray.Dataset object, wrapping dask/numpy arrays etc. 52 | ``` 53 | 54 | No data has been loaded or copied in this process, we have merely created an on-disk lookup table that points xarray into the specific parts of the original netCDF files when it needs to read each chunk. 55 | 56 | See the [Usage docs page](#usage) for more details. 57 | 58 | ## Talks and Presentations 59 | 60 | - 2024/11/21 - MET Office Architecture Guild - Tom Nicholas - [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-talk-at-met-office) 61 | - 2024/11/13 - Cloud-Native Geospatial conference - Raphael Hagen - [Slides](https://decks.carbonplan.org/cloud-native-geo/11-13-24) 62 | - 2024/07/24 - ESIP Meeting - Sean Harkins - [Event](https://2024julyesipmeeting.sched.com/event/1eVP6) / [Recording](https://youtu.be/T6QAwJIwI3Q?t=3689) 63 | - 2024/05/15 - Pangeo showcase - Tom Nicholas - [Event](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127/2) / [Recording](https://youtu.be/ioxgzhDaYiE) / [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-create-virtual-zarr-stores-using-xarray-syntax) 64 | 65 | ## Credits 66 | 67 | This package was originally developed by [Tom Nicholas](https://github.com/TomNicholas) whilst working at [[C]Worthy](https://cworthy.org), who deserve credit for allowing him to prioritise a generalizable open-source solution to the dataset virtualization problem. VirtualiZarr is now a community-owned multi-stakeholder project. 68 | 69 | ## Licence 70 | 71 | Apache 2.0 72 | 73 | ## Pages 74 | 75 | ```{toctree} 76 | :maxdepth: 2 77 | 78 | self 79 | installation 80 | usage 81 | examples 82 | faq 83 | api 84 | data_structures 85 | custom_readers 86 | releases 87 | contributing 88 | core_team_guide 89 | ``` 90 | 91 | ## References 92 | 93 | [^1]: [_Cloud-Native Repositories for Big Scientific Data_, Abernathey et. al., _Computing in Science & Engineering_.](https://ieeexplore.ieee.org/abstract/document/9354557) 94 | 95 | [^2]: (Pronounced like "virtualizer" but more piratey 🦜) 96 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | VirtualiZarr is available on PyPI via pip: 4 | 5 | ```shell 6 | pip install virtualizarr 7 | ``` 8 | 9 | and on conda-forge: 10 | 11 | ```shell 12 | conda install -c conda-forge virtualizarr 13 | ``` 14 | 15 | ## Optional dependencies 16 | 17 | VirtualiZarr has many optional dependencies, split into those for reading various file formats, and those for writing virtual references out to different formats. 
18 | 19 | Optional dependencies can be installed in groups via pip. For example to read HDF files and write virtual references to icechunk you could install all necessary dependencies via: 20 | 21 | ```shell 22 | pip install "virtualizarr[hdf, icechunk]" 23 | ``` 24 | 25 | The full list of optional dependencies can be seen in the `pyproject.toml` file: 26 | 27 | ```{literalinclude} ../pyproject.toml 28 | :start-at: "[project.optional-dependencies]" 29 | :end-before: "# Dependency sets under dependencies-groups are NOT available via PyPI" 30 | ``` 31 | 32 | The compound groups allow you to install multiple sets of dependencies at once, e.g. install every file reader via 33 | 34 | ```shell 35 | pip install "virtualizarr[all_readers]" 36 | ``` 37 | 38 | The basic `pip install virtualizarr` will only install the minimal required dependencies, and so may not be particularly useful on its own. 39 | 40 | ## Install Test Dependencies 41 | 42 | For local development you will want to install the test dependencies so that you can run all the tests in the test suite: 43 | 44 | ```shell 45 | pip install '-e .[test]' 46 | ``` 47 | 48 | ## Install Docs Dependencies 49 | 50 | To build the documentation locally you will need further dependencies: 51 | 52 | ```shell 53 | pip install '-e .[docs]' 54 | ``` 55 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/coiled/terraclimate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Virtualizarr and Coiled - Building a virtual dataset of Terraclimate\n", 8 | "\n", 9 | "This notebook is an example of using Virtualizarr together with the Python distributed processing framework [Coiled](https://www.coiled.io/) to generate references using [serverless functions](https://docs.coiled.io/user_guide/functions.html). \n", 10 | "- **Note:** running this notebook requires a coiled account.\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## The dataset\n", 18 | "For this example, we are going to create a virtual zarr store from the [Terraclimate](https://www.climatologylab.org/terraclimate.html) dataset. Terraclimate is a monthly dataset spanning 66 years and containing 14 climate and water balance variables. It is made up of 924 individual NetCDF4 files. 
When represented as an Xarray dataset, it is over 1TB in size." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Parallelizing `virtualizarr` reference generation with coiled serverless functions\n", 26 | "Coiled serverless functions allow us to easily spin up hundreds of small compute instances, which are great for individual file reference generation. We were able to process 924 netCDF files into a 1TB virtual xarray dataset in 9 minutes for ~$0.24." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Installation and environment\n", 34 | "\n", 35 | "You should install the Python requirements in a clean virtual environment of your choice. Each coiled serverless function will reuse this environment, so it's best to start with a clean slate.\n", 36 | "\n", 37 | "```bash\n", 38 | "pip install 'virtualizarr['icechunk','hdf']' coiled ipykernel bokeh\n", 39 | "```" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Imports\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "import coiled\n", 56 | "import icechunk\n", 57 | "import numpy as np\n", 58 | "import xarray as xr\n", 59 | "\n", 60 | "from virtualizarr import open_virtual_dataset" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Create the Terraclimate variable and year url combinations \n", 68 | "`14 variables * 66 years = 924 NetCDF files`\n", 69 | "\n", 70 | "\n", 71 | "\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "tvars = [\n", 81 | " \"aet\",\n", 82 | " \"def\",\n", 83 | " \"pet\",\n", 84 | " \"ppt\",\n", 85 | " \"q\",\n", 86 | " \"soil\",\n", 87 | " \"srad\",\n", 88 | " \"swe\",\n", 89 | " \"tmax\",\n", 90 | " \"tmin\",\n", 91 | " \"vap\",\n", 92 | " \"ws\",\n", 93 | " \"vpd\",\n", 94 | " \"PDSI\",\n", 95 | "]\n", 96 | "min_year = 1958\n", 97 | "max_year = 2023\n", 98 | "time_list = np.arange(min_year, max_year + 1, 1)\n", 99 | "\n", 100 | "combinations = [\n", 101 | " f\"https://climate.northwestknowledge.net/TERRACLIMATE-DATA/TerraClimate_{var}_{year}.nc\"\n", 102 | " for year in time_list\n", 103 | " for var in tvars\n", 104 | "]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Define the coiled serverless function\n", 112 | "\n", 113 | "### Serverless function setup notes:\n", 114 | "- This coiled function is tailored to AWS\n", 115 | "- `vm_type=[\"t4g.small\"]` - This is a small instance, you shouldn't need large machines for reference generation\n", 116 | "- `spot_policy=\"spot_with_fallback\"` is cheaper, but might have unintended consequences\n", 117 | "- `arm=True` uses VMs with ARM architecture, which is cheaper\n", 118 | "- `idle_timeout=\"10 minutes\"` workers will shut down after 10 minutes of inactivity \n", 119 | "- `n_workers=[100, 300]` adaptive scaling between 100 & 300 workers\n", 120 | "- `name` [optional] if you want to keep track of your cluster in the coiled dashboard\n", 121 | "\n", 122 | "More details can be found in the [serverless function API](https://docs.coiled.io/user_guide/functions.html#api)." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "@coiled.function(\n", 132 | " region=\"us-west-2\",\n", 133 | " vm_type=[\"t4g.small\"],\n", 134 | " spot_policy=\"spot_with_fallback\",\n", 135 | " arm=True,\n", 136 | " idle_timeout=\"10 minutes\",\n", 137 | " n_workers=[10, 100],\n", 138 | " name=\"parallel_reference_generation\",\n", 139 | ")\n", 140 | "def process(filename):\n", 141 | " vds = open_virtual_dataset(\n", 142 | " filename,\n", 143 | " decode_times=True,\n", 144 | " loadable_variables=[\"time\", \"lat\", \"lon\", \"crs\"],\n", 145 | " )\n", 146 | " return vds\n", 147 | "\n", 148 | "\n", 149 | "# process.map distributes out the input file urls to coiled functions\n", 150 | "# retires=10 allows for individual task retires, which can be useful for inconsistent server behavior\n", 151 | "results = process.map(combinations[0:2], retries=10)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "\n", 159 | "## Combine references into virtual dataset" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# extract generator values into a list\n", 169 | "vds_list = [result for result in results]\n", 170 | "\n", 171 | "# combine individual refs into a virtual Xarray dataset\n", 172 | "mds = xr.combine_by_coords(\n", 173 | " vds_list, coords=\"minimal\", compat=\"override\", combine_attrs=\"drop\"\n", 174 | ")\n", 175 | "\n", 176 | "mds" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "print(str(\"{0:.2f}\".format(mds.nbytes / 1e12)), \" TB\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Save the virtual dataset to Icechunk\n", 193 | "\n", 194 | "Now that we have this virtual dataset, we can write it to Icechunk. \n", 195 | "\n", 196 | "In this example we're creating a local icechunk store, but you could configure it for cloud storage." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "local_storage_conifg = icechunk.local_filesystem_storage(\"./terraclimate\")\n", 206 | "repo = icechunk.Repository.open_or_create(local_storage_conifg)\n", 207 | "session = repo.writable_session(\"main\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "mds.virtualize.to_icechunk(store=session.store)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Open the Icechunk store with Xarray\n", 224 | "\n", 225 | "**Warning:** Calling `to_zarr` on this dataset will try to write out 1TB of data.\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "combined_ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "combined_ds" 244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.12.8" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 2 268 | } 269 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use AWS Lambda base image for Python 3.11 2 | FROM public.ecr.aws/lambda/python:3.11 3 | 4 | ARG FUNCTION_DIR 5 | 6 | # Set working directory 7 | WORKDIR /var/task 8 | 9 | # Update system libraries and install necessary utilities 10 | RUN yum update -y && \ 11 | yum install -y wget unzip tar gzip git && \ 12 | yum clean all 13 | 14 | # Install uv package manager and move it to /usr/local/bin 15 | RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ 16 | mv ~/.local/bin/uv /usr/local/bin/uv && \ 17 | chmod +x /usr/local/bin/uv 18 | 19 | # Verify uv installation 20 | RUN uv --version 21 | 22 | RUN uv pip install --upgrade pip wheel six setuptools --system \ 23 | && uv pip install --upgrade --no-cache-dir --system \ 24 | awslambdaric \ 25 | boto3 \ 26 | redis \ 27 | httplib2 \ 28 | requests \ 29 | numpy \ 30 | scipy \ 31 | pandas \ 32 | pika \ 33 | kafka-python \ 34 | cloudpickle \ 35 | ps-mem \ 36 | tblib \ 37 | psutil 38 | 39 | # Set environment variables for Lambda 40 | ENV PYTHONPATH="/var/lang/lib/python3.11/site-packages:${FUNCTION_DIR}" 41 | 42 | # Copy and install dependencies from requirements.txt using uv 43 | COPY requirements.txt /tmp/requirements.txt 44 | RUN uv pip install --no-cache-dir -r /tmp/requirements.txt --system 45 | 46 | # Copy application code 47 | COPY lithops_lambda.zip ${FUNCTION_DIR} 48 | RUN unzip lithops_lambda.zip \ 49 | && rm lithops_lambda.zip \ 50 | && mkdir handler \ 51 | && touch handler/__init__.py \ 52 | && mv entry_point.py handler/ 53 | 54 | # Set Lambda entry point 55 | CMD [ "handler.entry_point.lambda_handler" ] 56 | 
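# Note: this image is what the example registers as its Lithops Lambda runtime, e.g. via
# `lithops runtime build -b aws_lambda -f Dockerfile vz-runtime` (see README.md in this directory).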
-------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/README.md: -------------------------------------------------------------------------------- 1 | # Lithops Package for MUR SST Data Processing 2 | 3 | This package provides functionality for processing MUR SST (Multi-scale Ultra-high Resolution Sea Surface Temperature) data using [Lithops](https://lithops-cloud.github.io/), a framework for serverless computing. 4 | 5 | ## Environment + Lithops Setup 6 | 7 | 1. Set up a Python environment. The example below uses [`uv`](https://docs.astral.sh/uv/), but other environment managers should work as well: 8 | 9 | ```sh 10 | uv venv virtualizarr-lithops --python 3.11 11 | source virtualizarr-lithops/bin/activate 12 | uv pip install -r requirements.txt 13 | ``` 14 | 15 | 2. Follow the [AWS Lambda Configuration](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html#configuration) instructions, unless you already have an appropriate AWS IAM role to use. 16 | 17 | 3. Follow the [AWS Credential setup](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html#aws-credential-setup) instructions. 18 | 19 | 4. Check, and modify as necessary, the compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html) in `lithops.yaml`. 20 | 21 | 22 | 5. Build the Lithops Lambda runtime if it does not exist in your target AWS environment. 23 | ```bash 24 | export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml 25 | lithops runtime build -b aws_lambda -f Dockerfile vz-runtime 26 | ``` 27 | 28 | For various reasons, you may want to build the Lambda runtime on EC2 (for example, Docker can be a resource hog locally, and pushing to ECR is faster from within AWS). If you wish to use EC2, please see the scripts in `ec2_for_lithops_runtime/` in this directory. 29 | 30 | > [!IMPORTANT] 31 | > If the runtime was created with a different IAM identity, an appropriate `user_id` will need to be included in the lithops configuration under `aws_lambda`. 32 | 33 | > [!TIP] 34 | > You can configure the AWS Lambda architecture via the `architecture` key under `aws_lambda` in the lithops configuration file. 35 | 36 | 37 | 6. 
(Optional) To rebuild the Lithops Lambda runtime image, delete the existing one: 38 | 39 | ```bash 40 | lithops runtime delete -b aws_lambda -d virtualizarr-runtime 41 | ``` 42 | 43 | ## Package Structure 44 | 45 | The package is organized into the following modules: 46 | 47 | - `__init__.py`: Package initialization and exports 48 | - `config.py`: Configuration settings and constants 49 | - `models.py`: Data models and structures 50 | - `url_utils.py`: URL generation and file listing 51 | - `repo.py`: Icechunk repository management 52 | - `virtual_datasets.py`: Virtual dataset operations 53 | - `zarr_operations.py`: Zarr array operations 54 | - `helpers.py`: Data helpers 55 | - `lithops_functions.py`: Lithops execution wrappers 56 | - `cli.py`: Command-line interface 57 | 58 | ## Usage 59 | 60 | ### Command-line Interface 61 | 62 | The package provides a command-line interface for running various functions: 63 | 64 | ```bash 65 | python main.py [options] 66 | ``` 67 | 68 | Available functions: 69 | 70 | - `write_to_icechunk`: Write data to Icechunk 71 | - `check_data_store_access`: Check access to the data store 72 | - `calc_icechunk_store_mean`: Calculate the mean of the Icechunk store 73 | - `calc_original_files_mean`: Calculate the mean of the original files 74 | - `list_installed_packages`: List installed packages 75 | 76 | Options: 77 | 78 | - `--start_date`: Start date for data processing (YYYY-MM-DD) 79 | - `--end_date`: End date for data processing (YYYY-MM-DD) 80 | - `--append_dim`: Append dimension for writing to Icechunk 81 | 82 | ### Examples 83 | 84 | #### Writing Data to Icechunk 85 | 86 | ```bash 87 | python main.py write_to_icechunk --start_date 2022-01-01 --end_date 2022-01-02 88 | ``` 89 | 90 | #### Calculating the Mean of the Icechunk Store 91 | 92 | ```bash 93 | python main.py calc_icechunk_store_mean --start_date 2022-01-01 --end_date 2022-01-31 94 | ``` 95 | 96 | #### Checking Data Store Access 97 | 98 | ```bash 99 | python main.py check_data_store_access 100 | ``` 101 | 102 | ## Programmatic Usage 103 | 104 | You can also use the package programmatically: 105 | 106 | ```python 107 | from lithops_functions import write_to_icechunk 108 | 109 | # Write data to Icechunk 110 | write_to_icechunk(start_date="2022-01-01", end_date="2022-01-31") 111 | ``` 112 | 113 | ## Testing 114 | 115 | To test the package, you can use the provided test functions: 116 | 117 | ```bash 118 | python main.py check_data_store_access 119 | ``` 120 | 121 | This will verify that the package can access the data store. 122 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lithops package for MUR SST data processing import 3 | 4 | This package provides functionality for processing MUR SST data using Lithops, 5 | a framework for serverless computing import 6 | """ 7 | 8 | from . import ( 9 | config, 10 | data_processing, 11 | lithops_functions, 12 | models, 13 | repo, 14 | url_utils, 15 | virtual_datasets, 16 | ) 17 | 18 | __all__ = [ 19 | "config", 20 | "data_processing", 21 | "lithops_functions", 22 | "models", 23 | "repo", 24 | "url_utils", 25 | "virtual_datasets", 26 | ] 27 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface. 
3 | 4 | This module provides a command-line interface for the package. 5 | """ 6 | 7 | import argparse 8 | 9 | from lithops_functions import ( 10 | lithops_calc_icechunk_store_mean, 11 | lithops_calc_original_files_mean, 12 | lithops_check_data_store_access, 13 | lithops_list_installed_packages, 14 | write_to_icechunk, 15 | ) 16 | 17 | 18 | def parse_args(): 19 | """ 20 | Parse command-line arguments. 21 | 22 | Returns: 23 | The parsed arguments 24 | """ 25 | parser = argparse.ArgumentParser(description="Run lithops functions.") 26 | parser.add_argument( 27 | "function", 28 | choices=[ 29 | "write_to_icechunk", 30 | "check_data_store_access", 31 | "calc_icechunk_store_mean", 32 | "calc_original_files_mean", 33 | "list_installed_packages", 34 | ], 35 | help="The function to run.", 36 | ) 37 | parser.add_argument( 38 | "--start_date", 39 | type=str, 40 | help="Start date for data processing (YYYY-MM-DD).", 41 | ) 42 | parser.add_argument( 43 | "--end_date", 44 | type=str, 45 | help="End date for data processing (YYYY-MM-DD).", 46 | ) 47 | parser.add_argument( 48 | "--append_dim", 49 | type=str, 50 | help="Append dimension for writing to icechunk.", 51 | ) 52 | return parser.parse_args() 53 | 54 | 55 | def main(): 56 | """ 57 | Main entry point for the command-line interface. 58 | """ 59 | args = parse_args() 60 | start_date = args.start_date 61 | end_date = args.end_date 62 | append_dim = args.append_dim 63 | 64 | if args.function == "write_to_icechunk": 65 | write_to_icechunk( 66 | start_date=start_date, end_date=end_date, append_dim=append_dim 67 | ) 68 | elif args.function == "check_data_store_access": 69 | lithops_check_data_store_access() 70 | elif args.function == "calc_icechunk_store_mean": 71 | lithops_calc_icechunk_store_mean(start_date=start_date, end_date=end_date) 72 | elif args.function == "calc_original_files_mean": 73 | lithops_calc_original_files_mean(start_date=start_date, end_date=end_date) 74 | elif args.function == "list_installed_packages": 75 | lithops_list_installed_packages() 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration settings for MUR SST data processing. 3 | 4 | This module contains all the configuration settings and constants used 5 | throughout the package. 
6 | """ 7 | 8 | import fsspec 9 | 10 | # S3 filesystem for reading data 11 | fs_read = fsspec.filesystem("s3", anon=False, skip_instance_cache=True) 12 | 13 | # Data source configuration 14 | base_url = "s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1" 15 | data_vars = ["analysed_sst", "analysis_error", "mask", "sea_ice_fraction"] 16 | drop_vars = ["dt_1km_data", "sst_anomaly"] 17 | 18 | # Storage configuration 19 | bucket = "nasa-eodc-scratch" 20 | store_name = "MUR-JPL-L4-GLOB-v4.1-virtual-v1" 21 | directory = "test" 22 | 23 | # Spatial subset configuration 24 | lat_slice = slice(48.5, 48.7) 25 | lon_slice = slice(-124.7, -124.5) 26 | 27 | # Date range processing dictionary 28 | date_process_dict = { 29 | ("2002-06-30", "2003-09-10"): "virtual_dataset", 30 | ("2003-09-11", "2003-09-11"): "zarr", 31 | ("2003-09-12", "2021-02-19"): "virtual_dataset", 32 | ("2021-02-20", "2021-02-21"): "zarr", 33 | ("2021-02-22", "2021-12-23"): "virtual_dataset", 34 | ("2021-12-24", "2022-01-26"): "zarr", 35 | ("2022-01-27", "2022-11-08"): "virtual_dataset", 36 | ("2022-11-09", "2022-11-09"): "zarr", 37 | ("2022-11-10", "2023-02-23"): "virtual_dataset", 38 | ("2023-02-24", "2023-02-28"): "zarr", 39 | ("2023-03-01", "2023-04-21"): "virtual_dataset", 40 | ("2023-04-22", "2023-04-22"): "zarr", 41 | ("2023-04-23", "2023-09-03"): "virtual_dataset", 42 | } 43 | 44 | zarr_concurrency = 4 45 | 46 | mursst_var_chunks = { 47 | "analysed_sst": {"time": 1, "lat": 1023, "lon": 2047}, 48 | "analysis_error": {"time": 1, "lat": 1023, "lon": 2047}, 49 | "mask": {"time": 1, "lat": 1447, "lon": 2895}, 50 | "sea_ice_fraction": {"time": 1, "lat": 1447, "lon": 2895}, 51 | } 52 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/00-create-security-group.sh: -------------------------------------------------------------------------------- 1 | export SECURITY_GROUP_NAME=XXX 2 | export VPC_ID=XXX 3 | aws ec2 create-security-group --group-name $SG_GROUP_NAME --description "security group for ithops runtime builder ec2" --vpc-id $VPC_ID 4 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/01-launch-ec2.sh: -------------------------------------------------------------------------------- 1 | # look up the group id created 2 | export SECURITY_GROUP_ID=XXX 3 | export YOUR_IP=$(curl -s https://checkip.amazonaws.com) 4 | export AMI_ID=ami-027951e78de46a00e 5 | export SSH_KEY_NAME=XXX 6 | aws ec2 authorize-security-group-ingress --group-id $SECURITY_GROUP_ID --ip-permissions '{"IpProtocol":"tcp","FromPort":22,"ToPort":22,"IpRanges":[{"CidrIp":"'$YOUR_IP'/32"}]}' 7 | aws ec2 run-instances --image-id $AMI_ID \ 8 | --instance-type "t3.medium" --key-name $SSH_KEY_NAME \ 9 | --block-device-mappings '{"DeviceName":"/dev/xvda","Ebs":{"Encrypted":false,"DeleteOnTermination":true,"Iops":3000,"SnapshotId":"snap-01783d80c688baa0f","VolumeSize":30,"VolumeType":"gp3","Throughput":125}}' \ 10 | --network-interfaces '{"AssociatePublicIpAddress":true,"DeviceIndex":0,"Groups":["'$SECURITY_GROUP_ID'"]}' \ 11 | --credit-specification '{"CpuCredits":"unlimited"}' \ 12 | --metadata-options '{"HttpEndpoint":"enabled","HttpPutResponseHopLimit":2,"HttpTokens":"required"}' \ 13 | --private-dns-name-options '{"HostnameType":"ip-name","EnableResourceNameDnsARecord":true,"EnableResourceNameDnsAAAARecord":false}' \ 14 | --count "1" 15 | 
-------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/02-setup-ec2-role.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set variables 4 | ROLE_NAME="EC2_Lithops_Lambda_Builder" 5 | INSTANCE_ID=XXX # Replace with your EC2 instance ID 6 | POLICY_NAME="EC2LithopsLambdaPolicy" 7 | REGION=XXX 8 | 9 | # Step 1: Create the IAM role 10 | aws iam create-role --role-name $ROLE_NAME \ 11 | --assume-role-policy-document '{ 12 | "Version": "2012-10-17", 13 | "Statement": [ 14 | { 15 | "Effect": "Allow", 16 | "Principal": { "Service": "ec2.amazonaws.com" }, 17 | "Action": "sts:AssumeRole" 18 | } 19 | ] 20 | }' > /dev/null 21 | 22 | echo "✅ IAM Role '$ROLE_NAME' created." 23 | 24 | # Step 2: Attach necessary policies 25 | aws iam put-role-policy --role-name $ROLE_NAME --policy-name $POLICY_NAME \ 26 | --policy-document '{ 27 | "Version": "2012-10-17", 28 | "Statement": [ 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "ecr:GetAuthorizationToken", 33 | "ecr:BatchCheckLayerAvailability", 34 | "ecr:CompleteLayerUpload", 35 | "ecr:UploadLayerPart", 36 | "ecr:InitiateLayerUpload", 37 | "ecr:PutImage", 38 | "ecr:BatchGetImage", 39 | "lambda:CreateFunction", 40 | "lambda:UpdateFunctionCode", 41 | "s3:GetObject", 42 | "s3:ListBucket", 43 | "ecr:CreateRepository" 44 | ], 45 | "Resource": "*" 46 | } 47 | ] 48 | }' > /dev/null 49 | 50 | echo "✅ IAM policy attached to role '$ROLE_NAME'." 51 | 52 | # Step 3: Create an Instance Profile and associate with the role 53 | aws iam create-instance-profile --instance-profile-name $ROLE_NAME > /dev/null 54 | aws iam add-role-to-instance-profile --instance-profile-name $ROLE_NAME --role-name $ROLE_NAME 55 | 56 | echo "✅ Instance profile '$ROLE_NAME' created and role attached." 57 | 58 | # Step 4: Attach the IAM role to the running EC2 instance 59 | aws ec2 associate-iam-instance-profile --instance-id $INSTANCE_ID \ 60 | --iam-instance-profile Name=$ROLE_NAME > /dev/null 61 | 62 | echo "✅ IAM role '$ROLE_NAME' attached to instance '$INSTANCE_ID'." 63 | 64 | # Step 5: Confirm the role is attached 65 | echo "🔄 Waiting for role to be active..." 66 | sleep 10 67 | aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[*].Instances[*].IamInstanceProfile" --output json 68 | 69 | echo "✅ Done! The EC2 instance now has the necessary permissions." 70 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/03-setup-ec2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit on error 4 | set -e 5 | 6 | echo "Updating system packages..." 7 | sudo yum update -y 8 | 9 | echo "Installing Python 3 and pip..." 10 | sudo yum install -y python3 python3-pip 11 | 12 | echo "Installing Docker..." 13 | sudo yum install -y docker git 14 | 15 | echo "Starting Docker service..." 16 | sudo systemctl start docker 17 | sudo systemctl enable docker 18 | 19 | echo "Adding current user to Docker group..." 20 | sudo usermod -aG docker $USER 21 | 22 | echo "Installing uv package manager..." 23 | curl -LsSf https://astral.sh/uv/install.sh | sh 24 | 25 | echo "Verifying installations..." 26 | python3 --version 27 | pip3 --version 28 | docker --version 29 | uv --version 30 | 31 | echo "Setup complete! Please log out and log back in to apply Docker group changes." 
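# The remaining commands assume you have logged out and back in (or run `newgrp docker`),
# so that the Docker group membership added above is active before building the runtime.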
32 | 33 | # lithops environment setup 34 | git clone https://github.com/zarr-developers/Virtualizarr 35 | cd Virtualizarr/ 36 | cd examples/mursst-icechunk-with-lithops/ 37 | uv venv virtualizarr-lithops 38 | source virtualizarr-lithops/bin/activate 39 | uv pip install -r requirements.txt 40 | export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml 41 | lithops runtime build -b aws_lambda -f Dockerfile virtualizarr-runtime 42 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/README.md: -------------------------------------------------------------------------------- 1 | # Launch and use an EC2 instance for building the Lithops Lambda runtime 2 | 3 | The scripts in this directory will help you launch and set up an EC2 instance so that you can build and push a Lithops Lambda runtime. 4 | 5 | You will need AWS console and CLI access. 6 | 7 | Steps: 8 | 9 | 1. Access the AWS console to create an SSH key in AWS that you can associate with the EC2 instance when launching. 10 | 2. Add a `SECURITY_GROUP_NAME` of your choosing and an appropriate `VPC_ID` to `00-create-security-group.sh` and execute that script. 11 | 3. Add the `SECURITY_GROUP_ID` and other required variables to `01-launch-ec2.sh` and execute that script. 12 | 4. Add the `INSTANCE_ID` to `02-setup-ec2-role.sh` and execute that script. 13 | 5. SSH into the instance and execute the commands in `03-setup-ec2.sh`. 14 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers. 3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import xarray as xr 8 | from config import date_process_dict, lat_slice, lon_slice 9 | from repo import open_or_create_repo 10 | from zarr_operations import configure_zarr 11 | 12 | 13 | def xarray_open_icechunk(open_or_create_repo_func: callable = open_or_create_repo): 14 | """ 15 | Open an Icechunk repository as an xarray Dataset. 16 | 17 | Args: 18 | open_or_create_repo_func: Function to open or create a repository 19 | 20 | Returns: 21 | An xarray Dataset 22 | """ 23 | # Configure Zarr for optimal performance 24 | configure_zarr() 25 | repo = open_or_create_repo_func() 26 | session = repo.readonly_session("main") 27 | return xr.open_dataset( 28 | session.store, consolidated=False, zarr_format=3, engine="zarr" 29 | ) 30 | 31 | 32 | def open_and_read_data( 33 | file: str, lat_slice_arg: slice = lat_slice, lon_slice_arg: slice = lon_slice 34 | ): 35 | """ 36 | Open and read data from a file. 37 | 38 | Args: 39 | file: The file to open 40 | lat_slice_arg: The latitude slice 41 | lon_slice_arg: The longitude slice 42 | 43 | Returns: 44 | The data values 45 | """ 46 | from config import fs_read 47 | 48 | ds = xr.open_dataset(fs_read.open(file), chunks={}) 49 | return ds.analysed_sst.sel(lat=lat_slice_arg, lon=lon_slice_arg).values 50 | 51 | 52 | def get_mean(values: np.ndarray): 53 | """ 54 | Calculate the mean of an array. 
55 | 56 | Args: 57 | values: The array to calculate the mean of 58 | 59 | Returns: 60 | The mean value 61 | """ 62 | return np.nanmean(values) 63 | 64 | 65 | # Convert dictionary to a Pandas DataFrame with IntervalIndex 66 | interval_df = pd.DataFrame( 67 | [ 68 | { 69 | "interval": pd.Interval( 70 | pd.Timestamp(start), pd.Timestamp(end), closed="both" 71 | ), 72 | "label": label, 73 | } 74 | for (start, end), label in date_process_dict.items() 75 | ] 76 | ) 77 | 78 | 79 | def find_label_for_range(date_str1, date_str2, df=interval_df): 80 | """ 81 | Find the corresponding label for two dates. 82 | 83 | Args: 84 | date_str1: The first date in YYYY-MM-DD format 85 | date_str2: The second date in YYYY-MM-DD format 86 | df: The DataFrame with intervals and labels 87 | 88 | Returns: 89 | The label for the date range 90 | """ 91 | date1, date2 = pd.Timestamp(date_str1), pd.Timestamp(date_str2) 92 | 93 | # Find intervals where both dates are contained 94 | match = df[ 95 | df["interval"].apply(lambda interval: date1 in interval and date2 in interval) 96 | ] 97 | if match.empty: 98 | raise ValueError( 99 | f"No matching interval found for dates {date_str1} and {date_str2}" 100 | ) 101 | 102 | return match["label"].iloc[0] if not match.empty else None 103 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/lithops.yaml: -------------------------------------------------------------------------------- 1 | lithops: 2 | backend: aws_lambda 3 | storage: aws_s3 4 | data_limit: False # Max (iter)data size (in MB). Set to False for unlimited size. 5 | 6 | aws: 7 | region: us-west-2 8 | 9 | aws_lambda: 10 | execution_role: arn:aws:iam::CHANGE_ME:role/veda-data-reader-dev 11 | runtime: vz-runtime:latest 12 | runtime_memory: 10240 13 | # user_id: kuf3 # if the runtime was created with a different IAM identity, this user id will need to be included 14 | 15 | aws_s3: 16 | bucket: arn:aws:s3:::nasa-eodc-lithops 17 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Main entry point for the lithops package. 4 | 5 | This script provides a simple interface for running the package from the command line. 6 | """ 7 | 8 | from cli import main 9 | 10 | if __name__ == "__main__": 11 | main() 12 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data models for MUR SST data processing. 3 | 4 | This module contains data structures used throughout the package. 5 | """ 6 | 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class Task: 12 | """ 13 | Represents a data processing task. 14 | 15 | Attributes: 16 | var: The variable name to process 17 | dt: The datetime string 18 | time_idx: The time index in the array 19 | """ 20 | 21 | var: str 22 | dt: str 23 | time_idx: int 24 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/repo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Icechunk repository management. 3 | 4 | This module contains functions for creating and managing Icechunk repositories. 
5 | """ 6 | 7 | import boto3 8 | import icechunk 9 | from config import bucket, directory, store_name 10 | 11 | 12 | def open_or_create_repo(): 13 | """ 14 | Open or create an Icechunk repository. 15 | 16 | Returns: 17 | An Icechunk repository object 18 | """ 19 | # Config for repo storage 20 | session = boto3.Session() 21 | 22 | # Get the credentials from the session 23 | credentials = session.get_credentials() 24 | 25 | # Extract the actual key, secret, and token 26 | creds = credentials.get_frozen_credentials() 27 | storage_config = icechunk.s3_storage( 28 | bucket=bucket, 29 | prefix=f"{directory}/{store_name}", 30 | region="us-west-2", 31 | access_key_id=creds.access_key, 32 | secret_access_key=creds.secret_key, 33 | session_token=creds.token, 34 | ) 35 | 36 | # Config for repo 37 | repo_config = icechunk.RepositoryConfig.default() 38 | repo_config.set_virtual_chunk_container( 39 | icechunk.VirtualChunkContainer( 40 | "s3", "s3://", icechunk.s3_store(region="us-west-2") 41 | ) 42 | ) 43 | 44 | # Config for repo virtual chunk credentials 45 | virtual_chunk_creds = icechunk.containers_credentials( 46 | s3=icechunk.s3_credentials(anonymous=False) 47 | ) 48 | 49 | repo = icechunk.Repository.open_or_create( 50 | storage=storage_config, 51 | config=repo_config, 52 | virtual_chunk_credentials=virtual_chunk_creds, 53 | ) 54 | return repo 55 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/requirements.txt: -------------------------------------------------------------------------------- 1 | xarray>=2025.1.0 2 | h5netcdf 3 | h5py 4 | pandas 5 | s3fs 6 | boto3==1.35.99 7 | dask 8 | distributed 9 | lithops 10 | git+https://github.com/zarr-developers/virtualizarr.git@main[icechunk] 11 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/url_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL utilities for MUR SST data processing. 3 | 4 | This module contains functions for generating URLs and listing files. 5 | """ 6 | 7 | import datetime 8 | from typing import List 9 | 10 | import pandas as pd 11 | from config import base_url 12 | 13 | 14 | def make_url(date: datetime) -> str: 15 | """ 16 | Create an S3 URL for a specific datetime. 17 | 18 | Args: 19 | date: The datetime to create a URL for 20 | 21 | Returns: 22 | The S3 URL for the specified datetime 23 | """ 24 | date_string = date.strftime("%Y%m%d") + "090000" 25 | components = [ 26 | base_url, 27 | f"{date_string}-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc", 28 | ] 29 | return "/".join(components) 30 | 31 | 32 | def list_mur_sst_files(start_date: str, end_date: str, dmrpp: bool = True) -> List[str]: 33 | """ 34 | List all files in S3 with a certain date prefix. 
35 | 36 | Args: 37 | start_date: The start date in YYYY-MM-DD format 38 | end_date: The end date in YYYY-MM-DD format 39 | dmrpp: Whether to return DMR++ URLs (default: True) 40 | 41 | Returns: 42 | A list of S3 URLs for the specified date range 43 | """ 44 | dates = pd.date_range(start=start_date, end=end_date, freq="1D") 45 | netcdf_urls = [make_url(date) for date in dates] 46 | if not dmrpp: 47 | return netcdf_urls 48 | return [f + ".dmrpp" for f in netcdf_urls] 49 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/virtual_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Virtual dataset operations. 3 | 4 | This module contains functions for working with virtual datasets. 5 | """ 6 | 7 | import xarray as xr 8 | from config import drop_vars 9 | from repo import open_or_create_repo 10 | 11 | from virtualizarr import open_virtual_dataset 12 | 13 | 14 | def map_open_virtual_dataset(uri, open_args: dict = {}): 15 | """ 16 | Map function to open virtual datasets. 17 | 18 | Args: 19 | uri: The URI of the virtual dataset 20 | 21 | Returns: 22 | A virtual dataset 23 | """ 24 | vds = open_virtual_dataset( 25 | uri, 26 | indexes={}, 27 | **open_args, 28 | ) 29 | return vds.drop_vars(drop_vars, errors="ignore") 30 | 31 | 32 | def concat_virtual_datasets(results): 33 | """ 34 | Reduce to concat virtual datasets. 35 | 36 | Args: 37 | results: A list of virtual datasets 38 | 39 | Returns: 40 | A concatenated virtual dataset 41 | """ 42 | combined_vds = xr.concat( 43 | results, 44 | dim="time", 45 | coords="minimal", 46 | compat="override", 47 | combine_attrs="override", 48 | ) 49 | return combined_vds 50 | 51 | 52 | def write_virtual_results_to_icechunk( 53 | virtual_ds, start_date: str, end_date: str, append_dim: str = None 54 | ): 55 | """ 56 | Write virtual dataset results to IceChunk. 57 | 58 | Args: 59 | virtual_ds: The virtual dataset to write 60 | start_date: The start date in YYYY-MM-DD format 61 | end_date: The end date in YYYY-MM-DD format 62 | append_dim: The dimension to append to (optional) 63 | 64 | Returns: 65 | The commit ID 66 | """ 67 | repo = open_or_create_repo() 68 | session = repo.writable_session("main") 69 | 70 | # Check if store is already populated 71 | with session.allow_pickling(): 72 | if append_dim: 73 | # Only use append_dim if store already has data 74 | virtual_ds.virtualize.to_icechunk(session.store, append_dim=append_dim) 75 | else: 76 | # If we can't check or there's an error, assume store is empty 77 | virtual_ds.virtualize.to_icechunk(session.store) 78 | 79 | return session.commit(f"Commit data {start_date} to {end_date}") 80 | 81 | 82 | def concat_and_write_virtual_datasets( 83 | results, start_date: str, end_date: str, append_dim: str = None 84 | ): 85 | """ 86 | Reduce to concat virtual datasets and write to icechunk. 
87 | 88 | Args: 89 | results: A list of virtual datasets 90 | start_date: The start date in YYYY-MM-DD format 91 | end_date: The end date in YYYY-MM-DD format 92 | append_dim: The dimension to append to (optional) 93 | 94 | Returns: 95 | The commit ID 96 | """ 97 | combined_vds = concat_virtual_datasets(results) 98 | return write_virtual_results_to_icechunk( 99 | combined_vds, start_date, end_date, append_dim 100 | ) 101 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/zarr_operations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Zarr operations. 3 | 4 | This module contains functions for working with Zarr arrays. 5 | """ 6 | 7 | from typing import cast 8 | 9 | import icechunk 10 | import numpy as np 11 | import pandas as pd 12 | import xarray as xr 13 | import zarr 14 | from config import mursst_var_chunks, zarr_concurrency 15 | from models import Task 16 | 17 | 18 | def resize_data_array(var_name: str, session: icechunk.Session, n_timesteps: int): 19 | """ 20 | Resize a data variable array. 21 | 22 | Args: 23 | var_name: The name of the variable to resize 24 | session: The IceChunk session 25 | n_timesteps: The number of timesteps to add 26 | 27 | Returns: 28 | The updated session 29 | """ 30 | group = zarr.group(store=session.store, overwrite=False) 31 | current_shape = group[var_name].shape 32 | group[var_name].resize((current_shape[0] + n_timesteps,) + current_shape[1:]) 33 | return session 34 | 35 | 36 | def handle_time_dimension(session: icechunk.Session, start_date: str, end_date: str): 37 | """ 38 | Handle time dimension and return datetime-index pairs. 39 | 40 | Args: 41 | session: The Icechunk session 42 | start_date: The start date in YYYY-MM-DD format 43 | end_date: The end date in YYYY-MM-DD format 44 | 45 | Returns: 46 | A tuple containing the updated session and a list of datetime-index pairs 47 | """ 48 | group = zarr.group(store=session.store, overwrite=False) 49 | dt_index = pd.date_range(start=start_date, end=end_date, freq="1D") 50 | n_timesteps = len(dt_index) 51 | current_time_length = group["time"].shape[0] 52 | 53 | # Resize time array 54 | group["time"].resize((current_time_length + n_timesteps,)) 55 | 56 | # Update time values 57 | reference_date = pd.Timestamp("1981-01-01 00:00:00") 58 | dt_index_seconds_since_1981 = (dt_index - reference_date).total_seconds() 59 | group["time"][-n_timesteps:] = np.int32(dt_index_seconds_since_1981) 60 | 61 | # Return list of (datetime, index) pairs 62 | return ( 63 | session, 64 | [(dt, current_time_length + idx) for idx, dt in enumerate(dt_index)], 65 | ) 66 | 67 | 68 | def write_data_to_zarr(task: Task, session: icechunk.Session, ds: xr.Dataset): 69 | """ 70 | Write data to Zarr array. 71 | 72 | Args: 73 | task: The task containing variable, datetime, and time index 74 | session: The Icechunk session 75 | ds: The xarray Dataset containing the data 76 | 77 | Returns: 78 | The updated session 79 | """ 80 | group = zarr.group(store=session.store, overwrite=False) 81 | var, dt, time_idx = task.var, task.dt, task.time_idx 82 | data_array = ds[var].sel(time=dt) 83 | current_array = cast(zarr.Array, group[var]) 84 | # where we actually write the data 85 | current_array[time_idx, :, :] = data_array.values 86 | return session 87 | 88 | 89 | def configure_zarr(): 90 | """ 91 | Configure Zarr settings for optimal performance. 
92 | """ 93 | zarr.config.set( 94 | { 95 | "async": {"concurrency": zarr_concurrency, "timeout": None}, 96 | "threading": {"max_workers": None}, 97 | } 98 | ) 99 | 100 | 101 | def map_open_files(file: str): 102 | """ 103 | Map function to open files. 104 | 105 | Args: 106 | file: The file to open 107 | 108 | Returns: 109 | An opened file object 110 | """ 111 | from config import fs_read 112 | 113 | return fs_read.open(file) 114 | 115 | 116 | def xarray_open_mfdataset(files: list[str]): 117 | """ 118 | Open multiple files as an xarray Dataset. 119 | 120 | Args: 121 | files: A list of file objects 122 | 123 | Returns: 124 | An xarray Dataset 125 | """ 126 | from config import drop_vars 127 | 128 | ds = xr.open_mfdataset( 129 | files, mask_and_scale=False, drop_variables=drop_vars, chunks={} 130 | ) 131 | for var, chunks in mursst_var_chunks.items(): 132 | ds[var] = ds[var].chunk(chunks) 133 | return ds 134 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/Dockerfile_virtualizarr: -------------------------------------------------------------------------------- 1 | # Python 3.11 2 | FROM python:3.11-slim-buster 3 | 4 | 5 | RUN apt-get update \ 6 | # Install aws-lambda-cpp build dependencies 7 | && apt-get install -y \ 8 | g++ \ 9 | make \ 10 | cmake \ 11 | unzip \ 12 | # cleanup package lists, they are not used anymore in this image 13 | && rm -rf /var/lib/apt/lists/* \ 14 | && apt-cache search linux-headers-generic 15 | 16 | ARG FUNCTION_DIR="/function" 17 | 18 | # Copy function code 19 | RUN mkdir -p ${FUNCTION_DIR} 20 | 21 | # Update pip 22 | # NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648 23 | # using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py 24 | # due to s3fs dependency 25 | RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ 26 | && pip install --upgrade --no-cache-dir --ignore-installed \ 27 | awslambdaric \ 28 | botocore==1.29.76 \ 29 | boto3==1.26.76 \ 30 | redis \ 31 | httplib2 \ 32 | requests \ 33 | numpy \ 34 | scipy \ 35 | pandas \ 36 | pika \ 37 | kafka-python \ 38 | cloudpickle \ 39 | ps-mem \ 40 | tblib 41 | 42 | # Set working directory to function root directory 43 | WORKDIR ${FUNCTION_DIR} 44 | 45 | # Add Lithops 46 | COPY lithops_lambda.zip ${FUNCTION_DIR} 47 | RUN unzip lithops_lambda.zip \ 48 | && rm lithops_lambda.zip \ 49 | && mkdir handler \ 50 | && touch handler/__init__.py \ 51 | && mv entry_point.py handler/ 52 | 53 | # Put your dependencies here, using RUN pip install... or RUN apt install... 54 | 55 | COPY requirements.txt requirements.txt 56 | RUN pip install --no-cache-dir -r requirements.txt 57 | 58 | ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] 59 | CMD [ "handler.entry_point.lambda_handler" ] 60 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/README.md: -------------------------------------------------------------------------------- 1 | # Generate a virtual zarr dataset using lithops 2 | 3 | This example walks through how to create a virtual dataset from a collection of 4 | netCDF files on s3 using lithops to open each file in parallel then concatenate 5 | them into a single virtual dataset. 6 | 7 | ## Credits 8 | Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook 9 | by norlandrhagen. 10 | 11 | Please, contribute improvements. 12 | 13 | 14 | 15 | 1. 
Set up a Python environment 16 | ```bash 17 | conda create --name virtualizarr-lithops -y python=3.11 18 | conda activate virtualizarr-lithops 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | 2. Configure compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html). 23 | The configuration in `lithops.yaml` uses AWS Lambda for [compute](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html) and AWS S3 for [storage](https://lithops-cloud.github.io/docs/source/storage_config/aws_s3.html). 24 | To use those backends, simply edit `lithops.yaml` with your `bucket` and `execution_role`. 25 | 26 | 3. Build a Lithops runtime image for VirtualiZarr 27 | ```bash 28 | export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml 29 | lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime 30 | ``` 31 | 32 | 4. Run the script 33 | ```bash 34 | python virtualizarr-with-lithops.py 35 | ``` 36 | 37 | ## Cleaning up 38 | To rebuild the Lithops image, delete the existing one by running 39 | ```bash 40 | lithops runtime delete -b aws_lambda -d virtualizarr-runtime 41 | ``` 42 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/lithops.yaml: -------------------------------------------------------------------------------- 1 | lithops: 2 | backend: aws_lambda 3 | storage: aws_s3 4 | 5 | aws: 6 | region: us-west-2 7 | 8 | aws_lambda: 9 | execution_role: arn:aws:iam::807615458658:role/lambdaLithopsExecutionRole 10 | runtime: virtualizarr-runtime 11 | runtime_memory: 2000 12 | 13 | aws_s3: 14 | bucket: arn:aws:s3:::cubed-thodson-temp 15 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/requirements.txt: -------------------------------------------------------------------------------- 1 | boto 2 | cftime 3 | h5py 4 | kerchunk 5 | lithops 6 | s3fs 7 | virtualizarr 8 | xarray 9 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py: -------------------------------------------------------------------------------- 1 | # Use lithops to create a virtual dataset from a collection of netCDF files on s3. 2 | # 3 | # Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook 4 | # by norlandrhagen. 5 | # 6 | # Please, contribute improvements. 7 | 8 | import fsspec 9 | import lithops 10 | import xarray as xr 11 | 12 | from virtualizarr import open_virtual_dataset 13 | 14 | # To demonstrate this workflow, we will use a collection of netCDF files from the WRF-SE-AK-AR5 project. 
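# NOTE: anon=True assumes the bucket allows anonymous (public) access; use credentialed access for private buckets.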
15 | fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) 16 | files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") 17 | file_pattern = sorted(["s3://" + f for f in files_paths]) 18 | 19 | # optionally, truncate file_pattern while debugging 20 | # file_pattern = file_pattern[:4] 21 | 22 | print(f"{len(file_pattern)} file paths were retrieved.") 23 | 24 | 25 | def map_references(fil): 26 | """Map function to open virtual datasets.""" 27 | vds = open_virtual_dataset( 28 | fil, 29 | indexes={}, 30 | loadable_variables=["Time"], 31 | cftime_variables=["Time"], 32 | ) 33 | return vds 34 | 35 | 36 | def reduce_references(results): 37 | """Reduce to concat virtual datasets.""" 38 | combined_vds = xr.combine_nested( 39 | results, 40 | concat_dim=["Time"], 41 | coords="minimal", 42 | compat="override", 43 | ) 44 | return combined_vds 45 | 46 | 47 | fexec = lithops.FunctionExecutor(config_file="lithops.yaml") 48 | 49 | futures = fexec.map_reduce( 50 | map_references, 51 | file_pattern, 52 | reduce_references, 53 | spawn_reducer=100, 54 | ) 55 | 56 | ds = futures.get_result() 57 | 58 | # write out the virtual dataset to a kerchunk json 59 | ds.virtualize.to_kerchunk("combined.json", format="json") 60 | -------------------------------------------------------------------------------- /virtualizarr/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version as _version 2 | 3 | from virtualizarr.accessor import ( 4 | VirtualiZarrDatasetAccessor, 5 | VirtualiZarrDataTreeAccessor, 6 | ) 7 | from virtualizarr.backend import open_virtual_dataset, open_virtual_mfdataset 8 | from virtualizarr.manifests import ChunkManifest, ManifestArray 9 | 10 | try: 11 | __version__ = _version("virtualizarr") 12 | except Exception: 13 | # Local copy or not installed with setuptools. 14 | # Disable minimum version checks on downstream libraries. 15 | __version__ = "9999" 16 | 17 | __all__ = [ 18 | "ChunkManifest", 19 | "ManifestArray", 20 | "VirtualiZarrDatasetAccessor", 21 | "VirtualiZarrDataTreeAccessor", 22 | "open_virtual_dataset", 23 | "open_virtual_mfdataset", 24 | ] 25 | -------------------------------------------------------------------------------- /virtualizarr/codecs.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, Tuple, Union 2 | 3 | import numpy as np 4 | import zarr 5 | from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec 6 | from zarr.abc.codec import Codec as ZarrCodec 7 | from zarr.core.codec_pipeline import BatchedCodecPipeline 8 | from zarr.core.metadata.v3 import ArrayV3Metadata 9 | 10 | if TYPE_CHECKING: 11 | from .manifests.array import ManifestArray 12 | 13 | CodecPipeline = Tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] 14 | 15 | DeconstructedCodecPipeline = tuple[ 16 | tuple[ArrayArrayCodec, ...], # Array-to-array transformations 17 | ArrayBytesCodec | None, # Array-to-bytes conversion 18 | tuple[BytesBytesCodec, ...], # Bytes-to-bytes transformations 19 | ] 20 | 21 | 22 | def numcodec_config_to_configurable(num_codec: dict) -> dict: 23 | """ 24 | Convert a numcodecs codec into a zarr v3 configurable. 25 | """ 26 | if num_codec["id"].startswith("numcodecs."): 27 | return num_codec 28 | 29 | num_codec_copy = num_codec.copy() 30 | name = "numcodecs." 
+ num_codec_copy.pop("id") 31 | return {"name": name, "configuration": num_codec_copy} 32 | 33 | 34 | def extract_codecs( 35 | codecs: CodecPipeline, 36 | ) -> DeconstructedCodecPipeline: 37 | """Extracts various codec types.""" 38 | arrayarray_codecs: tuple[ArrayArrayCodec, ...] = () 39 | arraybytes_codec: ArrayBytesCodec | None = None 40 | bytesbytes_codecs: tuple[BytesBytesCodec, ...] = () 41 | for codec in codecs: 42 | if isinstance(codec, ArrayArrayCodec): 43 | arrayarray_codecs += (codec,) 44 | if isinstance(codec, ArrayBytesCodec): 45 | arraybytes_codec = codec 46 | if isinstance(codec, BytesBytesCodec): 47 | bytesbytes_codecs += (codec,) 48 | return (arrayarray_codecs, arraybytes_codec, bytesbytes_codecs) 49 | 50 | 51 | def convert_to_codec_pipeline( 52 | dtype: np.dtype, 53 | codecs: list[dict] | None = [], 54 | ) -> BatchedCodecPipeline: 55 | """ 56 | Convert list of codecs to valid BatchedCodecPipeline. 57 | 58 | Parameters 59 | ---------- 60 | dtype : np.dtype 61 | codecs: list[dict] | None 62 | 63 | Returns 64 | ------- 65 | BatchedCodecPipeline 66 | """ 67 | from zarr.core.array import _get_default_chunk_encoding_v3 68 | from zarr.registry import get_codec_class 69 | 70 | zarr_codecs: tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] = () 71 | if codecs and len(codecs) > 0: 72 | zarr_codecs = tuple( 73 | get_codec_class(codec["name"]).from_dict(codec) for codec in codecs 74 | ) 75 | 76 | # It would be nice to use zarr.core.codec_pipeline.codecs_from_list here but that function requires 77 | # array array codecs and array bytes codecs to already be present in the list and in the correct order. 78 | arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs) 79 | 80 | if arraybytes_codec is None: 81 | arraybytes_codec = _get_default_chunk_encoding_v3(dtype)[1] 82 | 83 | codec_pipeline = BatchedCodecPipeline( 84 | array_array_codecs=arrayarray_codecs, 85 | array_bytes_codec=arraybytes_codec, 86 | bytes_bytes_codecs=bytesbytes_codecs, 87 | batch_size=1, 88 | ) 89 | 90 | return codec_pipeline 91 | 92 | 93 | def get_codec_config(codec: ZarrCodec) -> dict[str, Any]: 94 | """ 95 | Extract configuration from a codec, handling both zarr-python and numcodecs codecs. 96 | """ 97 | 98 | if hasattr(codec, "codec_config"): 99 | return codec.codec_config 100 | elif hasattr(codec, "get_config"): 101 | return codec.get_config() 102 | elif hasattr(codec, "_zstd_codec"): 103 | # related issue: https://github.com/zarr-developers/VirtualiZarr/issues/514 104 | # very silly workaround. codec.to_dict for zstd gives: 105 | # {'name': 'zstd', 'configuration': {'level': 0, 'checksum': False}} 106 | # which when passed through ArrayV2Metadata -> numcodecs.get_codec gives the error: 107 | # *** numcodecs.errors.UnknownCodecError: codec not available: 'None' 108 | # if codec._zstd_codec.get_config() : {'id': 'zstd', 'level': 0, 'checksum': False} 109 | # is passed to numcodecs.get_codec. It works fine. 110 | return codec._zstd_codec.get_config() 111 | elif hasattr(codec, "to_dict"): 112 | return codec.to_dict() 113 | else: 114 | raise ValueError(f"Unable to parse codec configuration: {codec}") 115 | 116 | 117 | def get_codecs(array: Union["ManifestArray", "zarr.Array"]) -> CodecPipeline: 118 | """ 119 | Get the zarr v3 codec pipeline for either a ManifestArray or a Zarr Array. 120 | 121 | Parameters 122 | ---------- 123 | array : Union[ManifestArray, Array] 124 | The input array, either ManifestArray or Zarr Array. 
125 | 126 | Returns 127 | ------- 128 | CodecPipeline 129 | A tuple of zarr v3 codecs representing the codec pipeline. 130 | 131 | Raises 132 | ------ 133 | ValueError 134 | If the array type is unsupported or the array's metadata is not in zarr v3 format. 135 | """ 136 | if not isinstance(array.metadata, ArrayV3Metadata): 137 | raise ValueError( 138 | "Only zarr v3 format arrays are supported. Please convert your array to v3 format." 139 | ) 140 | 141 | return array.metadata.codecs 142 | -------------------------------------------------------------------------------- /virtualizarr/manifests/__init__.py: -------------------------------------------------------------------------------- 1 | # Note: This directory is named "manifests" rather than "manifest". 2 | # This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring. 3 | 4 | from virtualizarr.manifests.array import ManifestArray # type: ignore # noqa 5 | from virtualizarr.manifests.group import ManifestGroup # type: ignore # noqa 6 | from virtualizarr.manifests.manifest import ChunkEntry, ChunkManifest # type: ignore # noqa 7 | from virtualizarr.manifests.store import ManifestStore, ObjectStoreRegistry # type: ignore # noqa 8 | -------------------------------------------------------------------------------- /virtualizarr/manifests/group.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import textwrap 4 | from typing import Iterator, Mapping 5 | 6 | import xarray as xr 7 | from zarr.core.group import GroupMetadata 8 | 9 | from virtualizarr.manifests import ManifestArray 10 | 11 | 12 | class ManifestGroup( 13 | Mapping[str, "ManifestArray | ManifestGroup"], 14 | ): 15 | """ 16 | Immutable representation of a single virtual zarr group. 17 | """ 18 | 19 | _members: Mapping[str, "ManifestArray | ManifestGroup"] 20 | _metadata: GroupMetadata 21 | 22 | def __init__( 23 | self, 24 | arrays: Mapping[str, ManifestArray] | None = None, 25 | groups: Mapping[str, "ManifestGroup"] | None = None, 26 | attributes: dict | None = None, 27 | ) -> None: 28 | """ 29 | Create a ManifestGroup containing ManifestArrays and/or sub-groups, as well as any group-level metadata. 30 | 31 | Parameters 32 | ---------- 33 | arrays : Mapping[str, ManifestArray], optional 34 | ManifestArray objects to represent virtual zarr arrays. 35 | groups : Mapping[str, ManifestGroup], optional 36 | ManifestGroup objects to represent virtual zarr subgroups. 37 | attributes : dict, optional 38 | Zarr attributes to add as zarr group metadata. 39 | """ 40 | self._metadata = GroupMetadata(attributes=attributes) 41 | 42 | _arrays: Mapping[str, ManifestArray] = {} if arrays is None else arrays 43 | 44 | if groups: 45 | # TODO add support for nested groups 46 | raise NotImplementedError 47 | else: 48 | _groups: Mapping[str, ManifestGroup] = {} if groups is None else groups 49 | 50 | for name, arr in _arrays.items(): 51 | if not isinstance(arr, ManifestArray): 52 | raise TypeError( 53 | f"ManifestGroup can only wrap ManifestArray objects, but array {name} passed is of type {type(arr)}" 54 | ) 55 | 56 | # TODO type check groups passed 57 | 58 | # TODO check that all arrays have the same shapes or dimensions? 
59 | # Technically that's allowed by the zarr model, so we should theoretically only check that upon converting to xarray 60 | 61 | colliding_names = set(_arrays.keys()).intersection(set(_groups.keys())) 62 | if colliding_names: 63 | raise ValueError( 64 | f"Some names collide as they are present in both the array and group keys: {colliding_names}" 65 | ) 66 | 67 | self._members = {**_arrays, **_groups} 68 | 69 | @property 70 | def metadata(self) -> GroupMetadata: 71 | """Zarr group metadata.""" 72 | return self._metadata 73 | 74 | @property 75 | def arrays(self) -> dict[str, ManifestArray]: 76 | """ManifestArrays contained in this group.""" 77 | return {k: v for k, v in self._members.items() if isinstance(v, ManifestArray)} 78 | 79 | @property 80 | def groups(self) -> dict[str, "ManifestGroup"]: 81 | """Subgroups contained in this group.""" 82 | return {k: v for k, v in self._members.items() if isinstance(v, ManifestGroup)} 83 | 84 | def __getitem__(self, path: str) -> "ManifestArray | ManifestGroup": 85 | """Obtain a group member.""" 86 | if "/" in path: 87 | raise ValueError( 88 | f"ManifestGroup.__getitem__ can only be used to get immediate subgroups and subarrays, but received multi-part path {path}" 89 | ) 90 | 91 | return self._members[path] 92 | 93 | def __iter__(self) -> Iterator[str]: 94 | return iter(self._members.keys()) 95 | 96 | def __len__(self) -> int: 97 | return len(self._members) 98 | 99 | def __repr__(self) -> str: 100 | return textwrap.dedent( 101 | f""" 102 | ManifestGroup( 103 | arrays={self.arrays}, 104 | groups={self.groups}, 105 | metadata={self.metadata}, 106 | ) 107 | """ 108 | ) 109 | 110 | def to_virtual_dataset(self) -> xr.Dataset: 111 | """ 112 | Create a "virtual" xarray.Dataset containing the contents of one zarr group. 113 | 114 | All variables in the returned Dataset will be "virtual", i.e. they will wrap ManifestArray objects. 115 | """ 116 | 117 | from virtualizarr.xarray import construct_fully_virtual_dataset 118 | 119 | # The xarray data model stores coordinate names outside of the arbitrary extra metadata it can store on a Dataset, 120 | # so to avoid that information being duplicated we strip it from the zarr group attributes before storing it. 
121 | metadata_dict = self.metadata.to_dict() 122 | attributes = metadata_dict["attributes"] 123 | coord_names = attributes.pop("coordinates", []) 124 | 125 | virtual_vars = { 126 | name: marr.to_virtual_variable() for name, marr in self.arrays.items() 127 | } 128 | 129 | return construct_fully_virtual_dataset( 130 | virtual_vars=virtual_vars, 131 | coord_names=coord_names, 132 | attrs=attributes, 133 | ) 134 | -------------------------------------------------------------------------------- /virtualizarr/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/py.typed -------------------------------------------------------------------------------- /virtualizarr/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from virtualizarr.readers.dmrpp import DMRPPVirtualBackend 2 | from virtualizarr.readers.fits import FITSVirtualBackend 3 | from virtualizarr.readers.hdf import HDFVirtualBackend 4 | from virtualizarr.readers.hdf5 import HDF5VirtualBackend 5 | from virtualizarr.readers.kerchunk import KerchunkVirtualBackend 6 | from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend 7 | from virtualizarr.readers.tiff import TIFFVirtualBackend 8 | from virtualizarr.readers.zarr import ( 9 | ZarrVirtualBackend, 10 | ) 11 | 12 | __all__ = [ 13 | "DMRPPVirtualBackend", 14 | "FITSVirtualBackend", 15 | "HDFVirtualBackend", 16 | "HDF5VirtualBackend", 17 | "KerchunkVirtualBackend", 18 | "NetCDF3VirtualBackend", 19 | "TIFFVirtualBackend", 20 | "ZarrVirtualBackend", 21 | ] 22 | -------------------------------------------------------------------------------- /virtualizarr/readers/api.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from collections.abc import Iterable, Mapping 3 | from typing import Optional 4 | 5 | import xarray as xr 6 | 7 | 8 | class VirtualBackend(ABC): 9 | @staticmethod 10 | def open_virtual_dataset( 11 | filepath: str, 12 | group: str | None = None, 13 | drop_variables: Iterable[str] | None = None, 14 | loadable_variables: Iterable[str] | None = None, 15 | decode_times: bool | None = None, 16 | indexes: Mapping[str, xr.Index] | None = None, 17 | virtual_backend_kwargs: Optional[dict] = None, 18 | reader_options: Optional[dict] = None, 19 | ) -> xr.Dataset: 20 | raise NotImplementedError() 21 | 22 | @staticmethod 23 | def open_virtual_datatree( 24 | path: str, 25 | group: str | None = None, 26 | drop_variables: Iterable[str] | None = None, 27 | loadable_variables: Iterable[str] | None = None, 28 | decode_times: bool | None = None, 29 | indexes: Mapping[str, xr.Index] | None = None, 30 | virtual_backend_kwargs: Optional[dict] = None, 31 | reader_options: Optional[dict] = None, 32 | ) -> xr.DataTree: 33 | raise NotImplementedError() 34 | -------------------------------------------------------------------------------- /virtualizarr/readers/fits.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | from xarray import Dataset, Index 5 | 6 | from virtualizarr.readers.api import ( 7 | VirtualBackend, 8 | ) 9 | from virtualizarr.translators.kerchunk import ( 10 | extract_group, 11 | virtual_vars_and_metadata_from_kerchunk_refs, 12 | ) 13 | from virtualizarr.types.kerchunk import KerchunkStoreRefs 
14 | from virtualizarr.xarray import construct_fully_virtual_dataset 15 | 16 | 17 | class FITSVirtualBackend(VirtualBackend): 18 | @staticmethod 19 | def open_virtual_dataset( 20 | filepath: str, 21 | group: str | None = None, 22 | drop_variables: Iterable[str] | None = None, 23 | loadable_variables: Iterable[str] | None = None, 24 | decode_times: bool | None = None, 25 | indexes: Mapping[str, Index] | None = None, 26 | virtual_backend_kwargs: Optional[dict] = None, 27 | reader_options: Optional[dict] = None, 28 | ) -> Dataset: 29 | from kerchunk.fits import process_file 30 | 31 | if virtual_backend_kwargs: 32 | raise NotImplementedError( 33 | "FITS reader does not understand any virtual_backend_kwargs" 34 | ) 35 | 36 | _drop_vars: list[Hashable] = ( 37 | [] if drop_variables is None else list(drop_variables) 38 | ) 39 | 40 | # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 41 | refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) 42 | 43 | # both group=None and group='' mean to read root group 44 | if group: 45 | refs = extract_group(refs, group) 46 | 47 | # TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly 48 | if loadable_variables or indexes: 49 | raise NotImplementedError( 50 | "Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS" 51 | ) 52 | 53 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 54 | refs, 55 | fs_root=Path.cwd().as_uri(), 56 | ) 57 | 58 | vds = construct_fully_virtual_dataset( 59 | virtual_vars=virtual_vars, 60 | coord_names=coord_names, 61 | attrs=attrs, 62 | ) 63 | 64 | return vds.drop_vars(_drop_vars) 65 | -------------------------------------------------------------------------------- /virtualizarr/readers/hdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .hdf import ( 2 | HDFVirtualBackend, 3 | ) 4 | 5 | __all__ = [ 6 | "HDFVirtualBackend", 7 | ] 8 | -------------------------------------------------------------------------------- /virtualizarr/readers/hdf/filters.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | from typing import TYPE_CHECKING, List, Tuple, TypedDict, Union 5 | 6 | import numcodecs.registry as registry 7 | import numpy as np 8 | from numcodecs.abc import Codec 9 | from numcodecs.fixedscaleoffset import FixedScaleOffset 10 | from xarray.coding.variables import _choose_float_dtype 11 | 12 | from virtualizarr.utils import soft_import 13 | 14 | h5py = soft_import("h5py", "For reading hdf files", strict=False) 15 | 16 | 17 | if TYPE_CHECKING: 18 | from h5py import Dataset 19 | 20 | 21 | hdf5plugin = soft_import( 22 | "hdf5plugin", "For reading hdf files with filters", strict=False 23 | ) 24 | imagecodecs = soft_import( 25 | "imagecodecs", "For reading hdf files with filters", strict=False 26 | ) 27 | 28 | _non_standard_filters = { 29 | "gzip": "zlib", 30 | "lzf": "imagecodecs_lzf", 31 | } 32 | 33 | _hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} 34 | 35 | 36 | @dataclasses.dataclass 37 | class BloscProperties: 38 | blocksize: int 39 | clevel: int 40 | shuffle: int 41 | cname: str 42 | 43 | def __post_init__(self): 44 | blosc_compressor_codes = { 45 | value: key 46 | for key, value in 
hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() 47 | } 48 | self.cname = blosc_compressor_codes[self.cname] 49 | 50 | 51 | @dataclasses.dataclass 52 | class ZstdProperties: 53 | level: int 54 | 55 | 56 | @dataclasses.dataclass 57 | class ShuffleProperties: 58 | elementsize: int 59 | 60 | 61 | @dataclasses.dataclass 62 | class ZlibProperties: 63 | level: int 64 | 65 | 66 | class CFCodec(TypedDict): 67 | target_dtype: np.dtype 68 | codec: Codec 69 | 70 | 71 | def _filter_to_codec( 72 | filter_id: str, filter_properties: Union[int, None, Tuple] = None 73 | ) -> Codec: 74 | """ 75 | Convert an h5py filter to an equivalent numcodec 76 | 77 | Parameters 78 | ---------- 79 | filter_id: str 80 | An h5py filter id code. 81 | filter_properties : int or None or Tuple 82 | A single or Tuple of h5py filter configuration codes. 83 | 84 | Returns 85 | ------- 86 | A numcodec codec 87 | """ 88 | id_int = None 89 | id_str = None 90 | try: 91 | id_int = int(filter_id) 92 | except ValueError: 93 | id_str = filter_id 94 | conf = {} 95 | if id_str: 96 | if id_str in _non_standard_filters.keys(): 97 | id = _non_standard_filters[id_str] 98 | else: 99 | id = id_str 100 | if id == "zlib": 101 | zlib_props = ZlibProperties(level=filter_properties) # type: ignore 102 | conf = dataclasses.asdict(zlib_props) 103 | if id == "shuffle" and isinstance(filter_properties, tuple): 104 | shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) 105 | conf = dataclasses.asdict(shuffle_props) 106 | conf["id"] = id # type: ignore[assignment] 107 | if id_int: 108 | filter = hdf5plugin.get_filters(id_int)[0] 109 | id = filter.filter_name 110 | if id in _hdf5plugin_imagecodecs.keys(): 111 | id = _hdf5plugin_imagecodecs[id] 112 | if id == "blosc" and isinstance(filter_properties, tuple): 113 | blosc_fields = [field.name for field in dataclasses.fields(BloscProperties)] 114 | blosc_props = BloscProperties( 115 | **{k: v for k, v in zip(blosc_fields, filter_properties[-4:])} 116 | ) 117 | conf = dataclasses.asdict(blosc_props) 118 | if id == "zstd" and isinstance(filter_properties, tuple): 119 | zstd_props = ZstdProperties(level=filter_properties[0]) 120 | conf = dataclasses.asdict(zstd_props) 121 | conf["id"] = id 122 | codec = registry.get_codec(conf) 123 | return codec 124 | 125 | 126 | def cfcodec_from_dataset(dataset: Dataset) -> Codec | None: 127 | """ 128 | Converts select h5py dataset CF convention attrs to CFCodec 129 | 130 | Parameters 131 | ---------- 132 | dataset: h5py.Dataset 133 | An h5py dataset. 134 | 135 | Returns 136 | ------- 137 | CFCodec 138 | A CFCodec. 
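
    Notes
    -----
    Only the CF ``scale_factor`` and ``add_offset`` attributes are inspected.
    When neither is present (i.e. scale 1 and offset 0) there is nothing to
    decode and ``None`` is returned; otherwise a ``numcodecs.FixedScaleOffset``
    codec is constructed whose floating-point target dtype is chosen with
    xarray's ``_choose_float_dtype``.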
139 | """ 140 | attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} 141 | mapping = {} 142 | if "scale_factor" in attributes: 143 | try: 144 | scale_factor = attributes["scale_factor"][0] 145 | except IndexError: 146 | scale_factor = attributes["scale_factor"] 147 | mapping["scale_factor"] = float(1 / scale_factor) 148 | else: 149 | mapping["scale_factor"] = 1 150 | if "add_offset" in attributes: 151 | try: 152 | offset = attributes["add_offset"][0] 153 | except IndexError: 154 | offset = attributes["add_offset"] 155 | mapping["add_offset"] = float(offset) 156 | else: 157 | mapping["add_offset"] = 0 158 | if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: 159 | float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) 160 | target_dtype = np.dtype(float_dtype) 161 | codec = FixedScaleOffset( 162 | offset=mapping["add_offset"], 163 | scale=mapping["scale_factor"], 164 | dtype=target_dtype, 165 | astype=dataset.dtype, 166 | ) 167 | cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) 168 | return cfcodec 169 | else: 170 | return None 171 | 172 | 173 | def codecs_from_dataset(dataset: Dataset) -> List[Codec]: 174 | """ 175 | Extracts a list of numcodecs from an h5py dataset 176 | 177 | Parameters 178 | ---------- 179 | dataset: h5py.Dataset 180 | An h5py dataset. 181 | 182 | Returns 183 | ------- 184 | list 185 | A list of numcodecs codecs. 186 | """ 187 | codecs = [] 188 | for filter_id, filter_properties in dataset._filters.items(): 189 | codec = _filter_to_codec(filter_id, filter_properties) 190 | codecs.append(codec) 191 | return codecs 192 | -------------------------------------------------------------------------------- /virtualizarr/readers/hdf5.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | from xarray import Dataset, Index 5 | 6 | from virtualizarr.readers.api import VirtualBackend 7 | from virtualizarr.translators.kerchunk import ( 8 | extract_group, 9 | virtual_vars_and_metadata_from_kerchunk_refs, 10 | ) 11 | from virtualizarr.xarray import ( 12 | construct_fully_virtual_dataset, 13 | construct_virtual_dataset, 14 | ) 15 | 16 | 17 | class HDF5VirtualBackend(VirtualBackend): 18 | @staticmethod 19 | def open_virtual_dataset( 20 | filepath: str, 21 | group: str | None = None, 22 | drop_variables: Iterable[str] | None = None, 23 | loadable_variables: Iterable[str] | None = None, 24 | decode_times: bool | None = None, 25 | indexes: Mapping[str, Index] | None = None, 26 | virtual_backend_kwargs: Optional[dict] = None, 27 | reader_options: Optional[dict] = None, 28 | ) -> Dataset: 29 | from kerchunk.hdf import SingleHdf5ToZarr 30 | 31 | if virtual_backend_kwargs: 32 | raise NotImplementedError( 33 | "HDF5 reader does not understand any virtual_backend_kwargs" 34 | ) 35 | 36 | _drop_vars: list[Hashable] = ( 37 | [] if drop_variables is None else list(drop_variables) 38 | ) 39 | 40 | refs = SingleHdf5ToZarr( 41 | filepath, inline_threshold=0, **reader_options 42 | ).translate() 43 | 44 | # both group=None and group='' mean to read root group 45 | if group: 46 | refs = extract_group(refs, group) 47 | 48 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 49 | refs, 50 | fs_root=Path.cwd().as_uri(), 51 | ) 52 | 53 | fully_virtual_dataset = construct_fully_virtual_dataset( 54 | virtual_vars=virtual_vars, 55 | coord_names=coord_names, 56 | attrs=attrs, 57 | ) 58 | 59 | vds = 
construct_virtual_dataset( 60 | fully_virtual_ds=fully_virtual_dataset, 61 | filepath=filepath, 62 | group=group, 63 | loadable_variables=loadable_variables, 64 | reader_options=reader_options, 65 | indexes=indexes, 66 | decode_times=decode_times, 67 | ) 68 | 69 | return vds.drop_vars(_drop_vars) 70 | -------------------------------------------------------------------------------- /virtualizarr/readers/kerchunk.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | import ujson 5 | from xarray import Dataset, Index 6 | 7 | from virtualizarr.readers.api import VirtualBackend 8 | from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs 9 | from virtualizarr.types.kerchunk import ( 10 | KerchunkStoreRefs, 11 | ) 12 | from virtualizarr.utils import _FsspecFSFromFilepath 13 | 14 | 15 | class KerchunkVirtualBackend(VirtualBackend): 16 | @staticmethod 17 | def open_virtual_dataset( 18 | filepath: str, 19 | group: str | None = None, 20 | drop_variables: Iterable[str] | None = None, 21 | loadable_variables: Iterable[str] | None = None, 22 | decode_times: bool | None = None, 23 | indexes: Mapping[str, Index] | None = None, 24 | virtual_backend_kwargs: Optional[dict] = None, 25 | reader_options: Optional[dict] = None, 26 | ) -> Dataset: 27 | """Reads existing kerchunk references (in JSON or parquet) format.""" 28 | 29 | if virtual_backend_kwargs is None: 30 | virtual_backend_kwargs = {} 31 | 32 | _drop_vars: list[Hashable] = ( 33 | [] if drop_variables is None else list(drop_variables) 34 | ) 35 | 36 | fs_root = virtual_backend_kwargs.pop("fs_root", None) 37 | 38 | if virtual_backend_kwargs: 39 | raise NotImplementedError( 40 | f"Kerchunk reader does not understand any of the virtual_backend_kwargs {virtual_backend_kwargs}" 41 | ) 42 | 43 | if group: 44 | raise NotImplementedError() 45 | 46 | if loadable_variables or indexes or decode_times: 47 | raise NotImplementedError() 48 | 49 | # TODO: whilst this keeps backwards-compatible behaviour for the `loadable_variables`` kwarg, 50 | # it probably has to change, see https://github.com/zarr-developers/VirtualiZarr/pull/477/#issuecomment-2744448626 51 | if loadable_variables is None or indexes is None: 52 | warnings.warn( 53 | "The default value of the `loadable_variables` kwarg may attempt to load data from the referenced virtual chunks." 54 | "As this is unlikely to be the desired behaviour when opening a Kerchunk file, `loadable_variables` has been overridden, and set to `loadable_variables=[]`." 55 | "To silence this warning pass `loadable_variables` explicitly.", 56 | UserWarning, 57 | ) 58 | loadable_variables = [] 59 | indexes = {} 60 | 61 | fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) 62 | 63 | # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. 64 | if fs.filepath.endswith(".parquet") and fs.fs.isfile( 65 | f"{fs.filepath}/.zmetadata" 66 | ): 67 | from fsspec.implementations.reference import LazyReferenceMapper 68 | 69 | lrm = LazyReferenceMapper(filepath, fs.fs) 70 | 71 | # build reference dict from KV pairs in LazyReferenceMapper 72 | # is there a better / more performant way to extract this? 
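                # The mapper behaves like a flat {key: reference} mapping in the
                # same shape as the "refs" section of a kerchunk JSON file, e.g.
                # (illustrative keys only)
                # {".zgroup": "...", "air/.zarray": "...", "air/0.0": [url, offset, length]},
                # so materialising it eagerly gives a plain in-memory refs dict.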
73 | array_refs = {k: lrm[k] for k in lrm.keys()} 74 | 75 | full_reference = {"refs": array_refs} 76 | 77 | vds = dataset_from_kerchunk_refs( 78 | KerchunkStoreRefs(full_reference), fs_root=fs_root 79 | ) 80 | 81 | # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': 82 | # https://fsspec.github.io/kerchunk/spec.html 83 | elif fs.read_bytes(9).startswith(b'{"version'): 84 | with fs.open_file() as of: 85 | refs = ujson.load(of) 86 | 87 | vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root) 88 | 89 | else: 90 | raise ValueError( 91 | "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" 92 | ) 93 | 94 | # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict 95 | return vds.drop_vars(_drop_vars) 96 | -------------------------------------------------------------------------------- /virtualizarr/readers/netcdf3.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | from xarray import Dataset, Index 5 | 6 | from virtualizarr.readers.api import VirtualBackend 7 | from virtualizarr.translators.kerchunk import ( 8 | virtual_vars_and_metadata_from_kerchunk_refs, 9 | ) 10 | from virtualizarr.xarray import ( 11 | construct_fully_virtual_dataset, 12 | construct_virtual_dataset, 13 | ) 14 | 15 | 16 | class NetCDF3VirtualBackend(VirtualBackend): 17 | @staticmethod 18 | def open_virtual_dataset( 19 | filepath: str, 20 | group: str | None = None, 21 | drop_variables: Iterable[str] | None = None, 22 | loadable_variables: Iterable[str] | None = None, 23 | decode_times: bool | None = None, 24 | indexes: Mapping[str, Index] | None = None, 25 | virtual_backend_kwargs: Optional[dict] = None, 26 | reader_options: Optional[dict] = None, 27 | ) -> Dataset: 28 | from kerchunk.netCDF3 import NetCDF3ToZarr 29 | 30 | if virtual_backend_kwargs: 31 | raise NotImplementedError( 32 | "netcdf3 reader does not understand any virtual_backend_kwargs" 33 | ) 34 | 35 | _drop_vars: list[Hashable] = ( 36 | [] if drop_variables is None else list(drop_variables) 37 | ) 38 | 39 | refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() 40 | 41 | # both group=None and group='' mean to read root group 42 | if group: 43 | raise ValueError( 44 | "group kwarg passed, but netCDF3 files can't have multiple groups!" 
45 | ) 46 | 47 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 48 | refs, 49 | fs_root=Path.cwd().as_uri(), 50 | ) 51 | 52 | fully_virtual_dataset = construct_fully_virtual_dataset( 53 | virtual_vars=virtual_vars, 54 | coord_names=coord_names, 55 | attrs=attrs, 56 | ) 57 | 58 | vds = construct_virtual_dataset( 59 | fully_virtual_ds=fully_virtual_dataset, 60 | filepath=filepath, 61 | group=group, 62 | loadable_variables=loadable_variables, 63 | reader_options=reader_options, 64 | indexes=indexes, 65 | decode_times=decode_times, 66 | ) 67 | 68 | return vds.drop_vars(_drop_vars) 69 | -------------------------------------------------------------------------------- /virtualizarr/readers/tiff.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | from typing import Hashable, Iterable, Mapping, Optional 4 | 5 | from xarray import Dataset, Index 6 | 7 | from virtualizarr.readers.api import VirtualBackend 8 | from virtualizarr.translators.kerchunk import ( 9 | extract_group, 10 | virtual_vars_and_metadata_from_kerchunk_refs, 11 | ) 12 | from virtualizarr.types.kerchunk import KerchunkStoreRefs 13 | from virtualizarr.xarray import ( 14 | construct_fully_virtual_dataset, 15 | construct_virtual_dataset, 16 | ) 17 | 18 | 19 | class TIFFVirtualBackend(VirtualBackend): 20 | @staticmethod 21 | def open_virtual_dataset( 22 | filepath: str, 23 | group: str | None = None, 24 | drop_variables: Iterable[str] | None = None, 25 | loadable_variables: Iterable[str] | None = None, 26 | decode_times: bool | None = None, 27 | indexes: Mapping[str, Index] | None = None, 28 | virtual_backend_kwargs: Optional[dict] = None, 29 | reader_options: Optional[dict] = None, 30 | ) -> Dataset: 31 | if virtual_backend_kwargs: 32 | raise NotImplementedError( 33 | "TIFF reader does not understand any virtual_backend_kwargs" 34 | ) 35 | 36 | from kerchunk.tiff import tiff_to_zarr 37 | 38 | if reader_options is None: 39 | reader_options = {} 40 | 41 | reader_options.pop("storage_options", {}) 42 | warnings.warn( 43 | "storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr", 44 | UserWarning, 45 | ) 46 | 47 | _drop_vars: list[Hashable] = ( 48 | [] if drop_variables is None else list(drop_variables) 49 | ) 50 | 51 | # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 52 | refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)}) 53 | 54 | # both group=None and group='' mean to read root group 55 | if group: 56 | refs = extract_group(refs, group) 57 | 58 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 59 | refs, 60 | fs_root=Path.cwd().as_uri(), 61 | ) 62 | 63 | fully_virtual_dataset = construct_fully_virtual_dataset( 64 | virtual_vars=virtual_vars, 65 | coord_names=coord_names, 66 | attrs=attrs, 67 | ) 68 | 69 | vds = construct_virtual_dataset( 70 | fully_virtual_ds=fully_virtual_dataset, 71 | filepath=filepath, 72 | group=group, 73 | loadable_variables=loadable_variables, 74 | reader_options=reader_options, 75 | indexes=indexes, 76 | decode_times=decode_times, 77 | ) 78 | 79 | return vds.drop_vars(_drop_vars) 80 | -------------------------------------------------------------------------------- /virtualizarr/readers/zarr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import 
asyncio 4 | from pathlib import Path # noqa 5 | from typing import ( 6 | Any, 7 | Hashable, 8 | Iterable, 9 | Mapping, 10 | Optional, 11 | ) 12 | 13 | import numpy as np 14 | from xarray import Dataset, Index 15 | from zarr.api.asynchronous import open_group as open_group_async 16 | from zarr.core.metadata import ArrayV3Metadata 17 | 18 | from virtualizarr.manifests import ( 19 | ChunkManifest, 20 | ManifestArray, 21 | ManifestGroup, 22 | ManifestStore, 23 | ) 24 | from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri # noqa 25 | from virtualizarr.readers.api import VirtualBackend 26 | from virtualizarr.vendor.zarr.core.common import _concurrent_map 27 | 28 | FillValueT = bool | str | float | int | list | None 29 | 30 | ZARR_DEFAULT_FILL_VALUE: dict[str, FillValueT] = { 31 | # numpy dtypes's hierarchy lets us avoid checking for all the widths 32 | # https://numpy.org/doc/stable/reference/arrays.scalars.html 33 | np.dtype("bool").kind: False, 34 | np.dtype("int").kind: 0, 35 | np.dtype("float").kind: 0.0, 36 | np.dtype("complex").kind: [0.0, 0.0], 37 | np.dtype("datetime64").kind: 0, 38 | } 39 | 40 | 41 | import zarr 42 | 43 | 44 | async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, filepath: str) -> dict: 45 | """Create a dictionary to pass into ChunkManifest __init__""" 46 | 47 | # TODO: For when we want to support reading V2 we should parse the /c/ and "/" between chunks 48 | if zarr_array.shape == (): 49 | # If we have a scalar array `c` 50 | # https://zarr-specs.readthedocs.io/en/latest/v3/chunk-key-encodings/default/index.html#description 51 | 52 | prefix = zarr_array.name.lstrip("/") + "/c" 53 | prefix_keys = [(prefix,)] 54 | _lengths = [await zarr_array.store.getsize("c")] 55 | _dict_keys = ["c"] 56 | _paths = [filepath + "/" + _dict_keys[0]] 57 | 58 | else: 59 | prefix = zarr_array.name.lstrip("/") + "/c/" 60 | prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] 61 | _lengths = await _concurrent_map(prefix_keys, zarr_array.store.getsize) 62 | chunk_keys = [x[0].split(prefix)[1] for x in prefix_keys] 63 | _dict_keys = [key.replace("/", ".") for key in chunk_keys] 64 | _paths = [filepath + "/" + prefix + key for key in chunk_keys] 65 | 66 | _offsets = [0] * len(_lengths) 67 | return { 68 | key: {"path": path, "offset": offset, "length": length} 69 | for key, path, offset, length in zip( 70 | _dict_keys, 71 | _paths, 72 | _offsets, 73 | _lengths, 74 | ) 75 | } 76 | 77 | 78 | async def build_chunk_manifest( 79 | zarr_array: zarr.AsyncArray, filepath: str 80 | ) -> ChunkManifest: 81 | """Build a ChunkManifest from a dictionary""" 82 | chunk_map = await get_chunk_mapping_prefix(zarr_array=zarr_array, filepath=filepath) 83 | return ChunkManifest(chunk_map) 84 | 85 | 86 | def get_metadata(zarr_array: zarr.AsyncArray[Any]) -> ArrayV3Metadata: 87 | fill_value = zarr_array.metadata.fill_value 88 | if fill_value is not None: 89 | fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] 90 | 91 | zarr_format = zarr_array.metadata.zarr_format 92 | 93 | if zarr_format == 2: 94 | # TODO: Once we want to support V2, we will have to deconstruct the 95 | # zarr_array codecs etc. 
and reconstruct them with create_v3_array_metadata 96 | raise NotImplementedError("Reading Zarr V2 currently not supported.") 97 | 98 | elif zarr_format == 3: 99 | return zarr_array.metadata 100 | 101 | else: 102 | raise NotImplementedError("Zarr format is not recognized as v2 or v3.") 103 | 104 | 105 | async def _construct_manifest_array(zarr_array: zarr.AsyncArray[Any], filepath: str): 106 | array_metadata = get_metadata(zarr_array=zarr_array) 107 | 108 | chunk_manifest = await build_chunk_manifest(zarr_array, filepath=filepath) 109 | return ManifestArray(metadata=array_metadata, chunkmanifest=chunk_manifest) 110 | 111 | 112 | async def _construct_manifest_group( 113 | filepath: str, 114 | *, 115 | reader_options: Optional[dict] = None, 116 | drop_variables: str | Iterable[str] | None = None, 117 | group: str | None = None, 118 | ): 119 | reader_options = reader_options or {} 120 | zarr_group = await open_group_async( 121 | filepath, 122 | storage_options=reader_options.get("storage_options"), 123 | path=group, 124 | mode="r", 125 | ) 126 | 127 | zarr_array_keys = [key async for key in zarr_group.array_keys()] 128 | 129 | _drop_vars: list[Hashable] = [] if drop_variables is None else list(drop_variables) 130 | 131 | zarr_arrays = await asyncio.gather( 132 | *[zarr_group.getitem(var) for var in zarr_array_keys if var not in _drop_vars] 133 | ) 134 | 135 | manifest_arrays = await asyncio.gather( 136 | *[ 137 | _construct_manifest_array(zarr_array=array, filepath=filepath) # type: ignore[arg-type] 138 | for array in zarr_arrays 139 | ] 140 | ) 141 | 142 | manifest_dict = { 143 | array.basename: result for array, result in zip(zarr_arrays, manifest_arrays) 144 | } 145 | return ManifestGroup(manifest_dict, attributes=zarr_group.attrs) 146 | 147 | 148 | def _construct_manifest_store( 149 | filepath: str, 150 | *, 151 | reader_options: Optional[dict] = None, 152 | drop_variables: str | Iterable[str] | None = None, 153 | group: str | None = None, 154 | ) -> ManifestStore: 155 | import asyncio 156 | 157 | manifest_group = asyncio.run( 158 | _construct_manifest_group( 159 | filepath=filepath, 160 | group=group, 161 | drop_variables=drop_variables, 162 | reader_options=reader_options, 163 | ) 164 | ) 165 | return ManifestStore(manifest_group) 166 | 167 | 168 | class ZarrVirtualBackend(VirtualBackend): 169 | @staticmethod 170 | def open_virtual_dataset( 171 | filepath: str, 172 | group: str | None = None, 173 | drop_variables: str | Iterable[str] | None = None, 174 | loadable_variables: Iterable[str] | None = None, 175 | decode_times: bool | None = None, 176 | indexes: Mapping[str, Index] | None = None, 177 | virtual_backend_kwargs: Optional[dict] = None, 178 | reader_options: Optional[dict] = None, 179 | ) -> Dataset: 180 | filepath = validate_and_normalize_path_to_uri( 181 | filepath, fs_root=Path.cwd().as_uri() 182 | ) 183 | 184 | manifest_store = _construct_manifest_store( 185 | filepath=filepath, 186 | group=group, 187 | drop_variables=drop_variables, 188 | reader_options=reader_options, 189 | ) 190 | 191 | ds = manifest_store.to_virtual_dataset( 192 | loadable_variables=loadable_variables, 193 | decode_times=decode_times, 194 | indexes=indexes, 195 | ) 196 | return ds 197 | -------------------------------------------------------------------------------- /virtualizarr/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from packaging.version import Version 5 | 6 | from virtualizarr.readers import 
HDF5VirtualBackend 7 | from virtualizarr.readers.hdf import HDFVirtualBackend 8 | 9 | requires_network = pytest.mark.network 10 | requires_minio = pytest.mark.minio 11 | 12 | 13 | def _importorskip( 14 | modname: str, minversion: str | None = None 15 | ) -> tuple[bool, pytest.MarkDecorator]: 16 | try: 17 | mod = importlib.import_module(modname) 18 | has = True 19 | if minversion is not None: 20 | v = getattr(mod, "__version__", "999") 21 | if Version(v) < Version(minversion): 22 | raise ImportError("Minimum version not satisfied") 23 | except ImportError: 24 | has = False 25 | 26 | reason = f"requires {modname}" 27 | if minversion is not None: 28 | reason += f">={minversion}" 29 | func = pytest.mark.skipif(not has, reason=reason) 30 | return has, func 31 | 32 | 33 | has_astropy, requires_astropy = _importorskip("astropy") 34 | has_icechunk, requires_icechunk = _importorskip("icechunk") 35 | has_kerchunk, requires_kerchunk = _importorskip("kerchunk") 36 | has_fastparquet, requires_fastparquet = _importorskip("fastparquet") 37 | has_s3fs, requires_s3fs = _importorskip("s3fs") 38 | has_lithops, requires_lithops = _importorskip("lithops") 39 | has_scipy, requires_scipy = _importorskip("scipy") 40 | has_tifffile, requires_tifffile = _importorskip("tifffile") 41 | has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") 42 | has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") 43 | has_zarr_python, requires_zarr_python = _importorskip("zarr") 44 | has_dask, requires_dask = _importorskip("dask") 45 | has_obstore, requires_obstore = _importorskip("obstore") 46 | 47 | parametrize_over_hdf_backends = pytest.mark.parametrize( 48 | "hdf_backend", 49 | [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend], 50 | ) 51 | -------------------------------------------------------------------------------- /virtualizarr/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def container(): 9 | import docker 10 | 11 | client = docker.from_env() 12 | port = 9000 13 | minio_container = client.containers.run( 14 | "quay.io/minio/minio", 15 | "server /data", 16 | detach=True, 17 | ports={f"{port}/tcp": port}, 18 | environment={ 19 | "MINIO_ACCESS_KEY": "minioadmin", 20 | "MINIO_SECRET_KEY": "minioadmin", 21 | }, 22 | ) 23 | time.sleep(3) # give it time to boot 24 | # enter 25 | yield { 26 | "port": port, 27 | "endpoint": f"http://localhost:{port}", 28 | "username": "minioadmin", 29 | "password": "minioadmin", 30 | } 31 | # exit 32 | minio_container.stop() 33 | minio_container.remove() 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | def minio_bucket(container): 38 | # Setup with guidance from https://medium.com/@sant1/using-minio-with-docker-and-python-cbbad397cb5d 39 | from minio import Minio 40 | 41 | bucket = "my-bucket" 42 | filename = "test.nc" 43 | # Initialize MinIO client 44 | client = Minio( 45 | "localhost:9000", 46 | access_key=container["username"], 47 | secret_key=container["password"], 48 | secure=False, 49 | ) 50 | client.make_bucket(bucket) 51 | policy = { 52 | "Version": "2012-10-17", 53 | "Statement": [ 54 | { 55 | "Effect": "Allow", 56 | "Principal": {"AWS": "*"}, 57 | "Action": ["s3:GetBucketLocation", "s3:ListBucket"], 58 | "Resource": "arn:aws:s3:::my-bucket", 59 | }, 60 | { 61 | "Effect": "Allow", 62 | "Principal": {"AWS": "*"}, 63 | "Action": [ 64 | "s3:GetObject", 65 | 
"s3:GetObjectRetention", 66 | "s3:GetObjectLegalHold", 67 | ], 68 | "Resource": "arn:aws:s3:::my-bucket/*", 69 | }, 70 | ], 71 | } 72 | client.set_bucket_policy(bucket, json.dumps(policy)) 73 | yield { 74 | "port": container["port"], 75 | "endpoint": container["endpoint"], 76 | "username": container["username"], 77 | "password": container["password"], 78 | "bucket": bucket, 79 | "file": filename, 80 | "client": client, 81 | } 82 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_codecs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from zarr.codecs import BytesCodec 4 | from zarr.core.codec_pipeline import BatchedCodecPipeline 5 | from zarr.registry import get_codec_class 6 | 7 | from conftest import ( 8 | ARRAYBYTES_CODEC, 9 | BLOSC_CODEC, 10 | DELTA_CODEC, 11 | ZLIB_CODEC, 12 | ) 13 | from virtualizarr.codecs import ( 14 | convert_to_codec_pipeline, 15 | extract_codecs, 16 | get_codec_config, 17 | get_codecs, 18 | ) 19 | 20 | 21 | class TestGetCodecs: 22 | """Test the get_codecs function.""" 23 | 24 | def test_manifest_array_zarr_v3_default(self, manifest_array): 25 | """Test get_codecs with ManifestArray using default v3 codec.""" 26 | test_manifest_array = manifest_array(codecs=None) 27 | actual_codecs = get_codecs(test_manifest_array) 28 | expected_codecs = tuple([BytesCodec(endian="little")]) 29 | assert actual_codecs == expected_codecs 30 | 31 | def test_manifest_array_with_codecs(self, manifest_array): 32 | """Test get_codecs with ManifestArray using multiple v3 codecs.""" 33 | test_codecs = [DELTA_CODEC, ARRAYBYTES_CODEC, BLOSC_CODEC] 34 | manifest_array = manifest_array(codecs=test_codecs) 35 | actual_codecs = get_codecs(manifest_array) 36 | assert actual_codecs == tuple( 37 | [ 38 | get_codec_class(codec["name"])(**codec["configuration"]) 39 | for codec in test_codecs 40 | ] 41 | ) 42 | 43 | def test_zarr_v3_default_codecs(self, zarr_array): 44 | """Test get_codecs with Zarr array using default v3 codec.""" 45 | zarr_array = zarr_array() 46 | actual_codecs = get_codecs(zarr_array) 47 | assert isinstance(actual_codecs[0], BytesCodec) 48 | 49 | def test_zarr_v3_with_codecs(self, zarr_array): 50 | """Test get_codecs with Zarr array using multiple v3 codecs.""" 51 | test_codecs = [DELTA_CODEC, ARRAYBYTES_CODEC, BLOSC_CODEC] 52 | zarr_array = zarr_array(codecs=test_codecs) 53 | actual_codecs = get_codecs(zarr_array) 54 | assert actual_codecs == tuple( 55 | [ 56 | get_codec_class(codec["name"])(**codec["configuration"]) 57 | for codec in test_codecs 58 | ] 59 | ) 60 | 61 | def test_zarr_v2_error(self, zarr_array): 62 | """Test that using v2 format raises an error.""" 63 | zarr_array = zarr_array(zarr_format=2) 64 | with pytest.raises( 65 | ValueError, 66 | match="Only zarr v3 format arrays are supported. 
Please convert your array to v3 format.", 67 | ): 68 | get_codecs(zarr_array) 69 | 70 | 71 | class TestConvertToCodecPipeline: 72 | """Test the convert_to_codec_pipeline function.""" 73 | 74 | @pytest.mark.parametrize( 75 | "input_codecs,expected_pipeline", 76 | [ 77 | # Case 1: No codecs - should result in just BytesCodec 78 | ( 79 | None, 80 | BatchedCodecPipeline( 81 | array_array_codecs=(), 82 | array_bytes_codec=BytesCodec(endian="little"), 83 | bytes_bytes_codecs=(), 84 | batch_size=1, 85 | ), 86 | ), 87 | # Case 2: Delta codec - should result in DeltaCodec + BytesCodec 88 | ( 89 | [DELTA_CODEC], 90 | BatchedCodecPipeline( 91 | array_array_codecs=( 92 | get_codec_class("numcodecs.delta").from_dict(DELTA_CODEC), # type: ignore[arg-type] 93 | ), 94 | array_bytes_codec=BytesCodec(endian="little"), 95 | bytes_bytes_codecs=(), 96 | batch_size=1, 97 | ), 98 | ), 99 | # Case 3: Delta + Blosc + Zlib - should result in all codecs + BytesCodec 100 | ( 101 | [DELTA_CODEC, BLOSC_CODEC, ZLIB_CODEC], 102 | BatchedCodecPipeline( 103 | array_array_codecs=( 104 | get_codec_class("numcodecs.delta").from_dict(DELTA_CODEC), # type: ignore[arg-type] 105 | ), 106 | array_bytes_codec=BytesCodec(endian="little"), 107 | bytes_bytes_codecs=( 108 | get_codec_class(key="blosc").from_dict(BLOSC_CODEC), # type: ignore[arg-type] 109 | get_codec_class("numcodecs.zlib").from_dict(ZLIB_CODEC), # type: ignore[arg-type] 110 | ), 111 | batch_size=1, 112 | ), 113 | ), 114 | ], 115 | ) 116 | def test_convert_to_codec_pipeline_scenarios(self, input_codecs, expected_pipeline): 117 | """Test different scenarios for convert_to_codec_pipeline function.""" 118 | dtype = np.dtype("}, 31 | groups={}, 32 | metadata=GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), 33 | ) 34 | """ 35 | ) 36 | assert repr(manifest_group) == expected_repr 37 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/tests/test_readers/__init__.py -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/test_fits.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from xarray import Dataset 3 | 4 | from virtualizarr import open_virtual_dataset 5 | from virtualizarr.tests import requires_kerchunk, requires_network 6 | 7 | pytest.importorskip("astropy") 8 | 9 | 10 | @requires_kerchunk 11 | @requires_network 12 | @pytest.mark.xfail( 13 | reason="Big endian not yet supported by zarr-python 3.0" 14 | ) # https://github.com/zarr-developers/zarr-python/issues/2324 15 | def test_open_hubble_data(): 16 | # data from https://registry.opendata.aws/hst/ 17 | vds = open_virtual_dataset( 18 | "s3://stpubdata/hst/public/f05i/f05i0201m/f05i0201m_a1f.fits", 19 | reader_options={"storage_options": {"anon": True}}, 20 | ) 21 | 22 | assert isinstance(vds, Dataset) 23 | assert list(vds.variables) == ["PRIMARY"] 24 | var = vds["PRIMARY"].variable 25 | assert var.sizes == {"y": 17, "x": 589} 26 | assert var.dtype == ">i4" 27 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/test_hdf/test_hdf.py: -------------------------------------------------------------------------------- 1 | import h5py # 
type: ignore 2 | import numpy as np 3 | import pytest 4 | from obstore.store import LocalStore 5 | 6 | from virtualizarr import open_virtual_dataset 7 | from virtualizarr.readers.hdf import HDFVirtualBackend 8 | from virtualizarr.tests import ( 9 | requires_hdf5plugin, 10 | requires_imagecodecs, 11 | ) 12 | 13 | 14 | @requires_hdf5plugin 15 | @requires_imagecodecs 16 | class TestDatasetChunkManifest: 17 | def test_empty_chunks(self, empty_chunks_hdf5_file): 18 | f = h5py.File(empty_chunks_hdf5_file) 19 | ds = f["data"] 20 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 21 | path=empty_chunks_hdf5_file, dataset=ds 22 | ) 23 | assert manifest.shape_chunk_grid == (0,) 24 | 25 | def test_empty_dataset(self, empty_dataset_hdf5_file): 26 | f = h5py.File(empty_dataset_hdf5_file) 27 | ds = f["data"] 28 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 29 | path=empty_dataset_hdf5_file, dataset=ds 30 | ) 31 | assert manifest.shape_chunk_grid == (0,) 32 | 33 | def test_no_chunking(self, no_chunks_hdf5_file): 34 | f = h5py.File(no_chunks_hdf5_file) 35 | ds = f["data"] 36 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 37 | path=no_chunks_hdf5_file, dataset=ds 38 | ) 39 | assert manifest.shape_chunk_grid == (1, 1) 40 | 41 | def test_chunked(self, chunked_hdf5_file): 42 | f = h5py.File(chunked_hdf5_file) 43 | ds = f["data"] 44 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 45 | path=chunked_hdf5_file, dataset=ds 46 | ) 47 | assert manifest.shape_chunk_grid == (2, 2) 48 | 49 | def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): 50 | f = h5py.File(chunked_roundtrip_hdf5_file) 51 | ds = f["var2"] 52 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 53 | path=chunked_roundtrip_hdf5_file, dataset=ds 54 | ) 55 | assert manifest.shape_chunk_grid == (2, 8) 56 | 57 | 58 | @requires_hdf5plugin 59 | @requires_imagecodecs 60 | class TestDatasetDims: 61 | def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): 62 | f = h5py.File(single_dimension_scale_hdf5_file) 63 | ds = f["data"] 64 | dims = HDFVirtualBackend._dataset_dims(ds) 65 | assert dims[0] == "x" 66 | 67 | def test_is_dimension_scale(self, is_scale_hdf5_file): 68 | f = h5py.File(is_scale_hdf5_file) 69 | ds = f["data"] 70 | dims = HDFVirtualBackend._dataset_dims(ds) 71 | assert dims[0] == "data" 72 | 73 | def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): 74 | f = h5py.File(multiple_dimension_scales_hdf5_file) 75 | ds = f["data"] 76 | with pytest.raises(ValueError, match="dimension scales attached"): 77 | HDFVirtualBackend._dataset_dims(ds) 78 | 79 | def test_no_dimension_scales(self, no_chunks_hdf5_file): 80 | f = h5py.File(no_chunks_hdf5_file) 81 | ds = f["data"] 82 | dims = HDFVirtualBackend._dataset_dims(ds) 83 | assert dims == ["phony_dim_0", "phony_dim_1"] 84 | 85 | 86 | @requires_hdf5plugin 87 | @requires_imagecodecs 88 | class TestDatasetToManifestArray: 89 | def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): 90 | f = h5py.File(chunked_dimensions_netcdf4_file) 91 | ds = f["data"] 92 | ma = HDFVirtualBackend._construct_manifest_array( 93 | chunked_dimensions_netcdf4_file, ds, group="" 94 | ) 95 | assert ma.chunks == (50, 50) 96 | 97 | def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): 98 | f = h5py.File(single_dimension_scale_hdf5_file) 99 | ds = f["data"] 100 | ma = HDFVirtualBackend._construct_manifest_array( 101 | single_dimension_scale_hdf5_file, ds, group="" 102 | ) 103 | assert ma.chunks == (2,) 104 | 105 | def 
test_dataset_attributes(self, string_attributes_hdf5_file): 106 | f = h5py.File(string_attributes_hdf5_file) 107 | ds = f["data"] 108 | ma = HDFVirtualBackend._construct_manifest_array( 109 | string_attributes_hdf5_file, ds, group="" 110 | ) 111 | assert ma.metadata.attributes["attribute_name"] == "attribute_name" 112 | 113 | def test_scalar_fill_value(self, scalar_fill_value_hdf5_file): 114 | f = h5py.File(scalar_fill_value_hdf5_file) 115 | ds = f["data"] 116 | ma = HDFVirtualBackend._construct_manifest_array( 117 | scalar_fill_value_hdf5_file, ds, group="" 118 | ) 119 | assert ma.metadata.fill_value == 42 120 | 121 | def test_cf_fill_value(self, cf_fill_value_hdf5_file): 122 | f = h5py.File(cf_fill_value_hdf5_file) 123 | ds = f["data"] 124 | if ds.dtype.kind in "S": 125 | pytest.xfail("Investigate fixed-length binary encoding in Zarr v3") 126 | if ds.dtype.names: 127 | pytest.xfail( 128 | "To fix, structured dtype fill value encoding for Zarr backend" 129 | ) 130 | ma = HDFVirtualBackend._construct_manifest_array( 131 | cf_fill_value_hdf5_file, ds, group="" 132 | ) 133 | assert "_FillValue" in ma.metadata.attributes 134 | 135 | def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file): 136 | f = h5py.File(cf_array_fill_value_hdf5_file) 137 | ds = f["data"] 138 | ma = HDFVirtualBackend._construct_manifest_array( 139 | cf_array_fill_value_hdf5_file, ds, group="" 140 | ) 141 | assert not isinstance(ma.metadata.attributes["_FillValue"], np.ndarray) 142 | 143 | 144 | @requires_hdf5plugin 145 | @requires_imagecodecs 146 | class TestExtractAttributes: 147 | def test_string_attribute(self, string_attributes_hdf5_file): 148 | f = h5py.File(string_attributes_hdf5_file) 149 | ds = f["data"] 150 | attrs = HDFVirtualBackend._extract_attrs(ds) 151 | assert attrs["attribute_name"] == "attribute_name" 152 | 153 | def test_root_attribute(self, root_attributes_hdf5_file): 154 | f = h5py.File(root_attributes_hdf5_file) 155 | attrs = HDFVirtualBackend._extract_attrs(f) 156 | assert attrs["attribute_name"] == "attribute_name" 157 | 158 | def test_multiple_attributes(self, string_attributes_hdf5_file): 159 | f = h5py.File(string_attributes_hdf5_file) 160 | ds = f["data"] 161 | attrs = HDFVirtualBackend._extract_attrs(ds) 162 | assert len(attrs.keys()) == 2 163 | 164 | 165 | @requires_hdf5plugin 166 | @requires_imagecodecs 167 | class TestManifestGroupFromHDF: 168 | def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): 169 | store = LocalStore() 170 | manifest_group = HDFVirtualBackend._construct_manifest_group( 171 | store=store, 172 | filepath=chunked_dimensions_netcdf4_file, 173 | ) 174 | assert len(manifest_group.arrays) == 3 175 | 176 | def test_nested_groups_are_ignored(self, nested_group_hdf5_file): 177 | store = LocalStore() 178 | manifest_group = HDFVirtualBackend._construct_manifest_group( 179 | store=store, 180 | filepath=nested_group_hdf5_file, 181 | group="group", 182 | ) 183 | assert len(manifest_group.arrays) == 1 184 | 185 | def test_drop_variables(self, multiple_datasets_hdf5_file): 186 | store = LocalStore() 187 | manifest_group = HDFVirtualBackend._construct_manifest_group( 188 | store=store, 189 | filepath=multiple_datasets_hdf5_file, 190 | drop_variables=["data2"], 191 | ) 192 | assert "data2" not in manifest_group.arrays.keys() 193 | 194 | def test_dataset_in_group(self, group_hdf5_file): 195 | store = LocalStore() 196 | manifest_group = HDFVirtualBackend._construct_manifest_group( 197 | store=store, 198 | filepath=group_hdf5_file, 199 | group="group", 200 | ) 
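        # Only the dataset that lives inside "group" should be surfaced as a
        # manifest array; variables outside the requested group are excluded.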
201 | assert len(manifest_group.arrays) == 1 202 | 203 | def test_non_group_error(self, group_hdf5_file): 204 | store = LocalStore() 205 | with pytest.raises(ValueError): 206 | HDFVirtualBackend._construct_manifest_group( 207 | store=store, 208 | filepath=group_hdf5_file, 209 | group="group/data", 210 | ) 211 | 212 | 213 | @requires_hdf5plugin 214 | @requires_imagecodecs 215 | class TestOpenVirtualDataset: 216 | def test_coord_names( 217 | self, 218 | root_coordinates_hdf5_file, 219 | ): 220 | vds = HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file) 221 | assert set(vds.coords) == {"lat", "lon"} 222 | 223 | @pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support") 224 | def test_big_endian( 225 | self, 226 | big_endian_dtype_hdf5_file, 227 | ): 228 | vds = HDFVirtualBackend.open_virtual_dataset(big_endian_dtype_hdf5_file) 229 | print(vds) 230 | 231 | 232 | @requires_hdf5plugin 233 | @requires_imagecodecs 234 | @pytest.mark.parametrize("group", [None, "subgroup", "subgroup/"]) 235 | def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group): 236 | # regression test for GH issue #364 237 | vds = open_virtual_dataset( 238 | netcdf4_file_with_data_in_multiple_groups, 239 | group=group, 240 | backend=HDFVirtualBackend, 241 | ) 242 | assert list(vds.dims) == ["dim_0"] 243 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import h5py # type: ignore 4 | import numcodecs 5 | import numpy as np 6 | 7 | try: 8 | import imagecodecs # noqa 9 | except ModuleNotFoundError: 10 | imagecodecs = None # type: ignore 11 | warnings.warn("imagecodecs is required for HDF reader") 12 | 13 | 14 | from virtualizarr.readers.hdf.filters import ( 15 | _filter_to_codec, 16 | cfcodec_from_dataset, 17 | codecs_from_dataset, 18 | ) 19 | from virtualizarr.tests import ( 20 | requires_hdf5plugin, 21 | requires_imagecodecs, 22 | ) 23 | 24 | 25 | @requires_hdf5plugin 26 | @requires_imagecodecs 27 | class TestFilterToCodec: 28 | def test_gzip_uses_zlib_numcodec(self): 29 | codec = _filter_to_codec("gzip", 1) 30 | assert isinstance(codec, numcodecs.zlib.Zlib) 31 | 32 | def test_lzf(self): 33 | codec = _filter_to_codec("lzf") 34 | assert isinstance(codec, imagecodecs.numcodecs.Lzf) 35 | 36 | def test_blosc(self): 37 | import numcodecs 38 | from packaging import version 39 | 40 | codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) 41 | assert isinstance(codec, numcodecs.blosc.Blosc) 42 | expected_config = { 43 | "id": "blosc", 44 | "blocksize": 800, 45 | "clevel": 9, 46 | "shuffle": 2, 47 | "cname": "lz4", 48 | } 49 | if ( 50 | version.parse("0.16.1") 51 | > version.parse(numcodecs.__version__) 52 | > version.parse("0.15.1") 53 | ): 54 | expected_config["typesize"] = None 55 | assert codec.get_config() == expected_config 56 | 57 | def test_zstd(self): 58 | codec = _filter_to_codec("32015", (5,)) 59 | assert isinstance(codec, numcodecs.zstd.Zstd) 60 | config = codec.get_config() 61 | assert config["id"] == "zstd" 62 | assert config["level"] == 5 63 | 64 | def test_shuffle(self): 65 | codec = _filter_to_codec("shuffle", (7,)) 66 | assert isinstance(codec, numcodecs.shuffle.Shuffle) 67 | expected_config = {"id": "shuffle", "elementsize": 7} 68 | assert codec.get_config() == expected_config 69 | 70 | 71 | @requires_hdf5plugin 72 | @requires_imagecodecs 73 | class 
TestCodecsFromDataSet: 74 | def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): 75 | f = h5py.File(filter_encoded_hdf5_file) 76 | ds = f["data"] 77 | chunk_info = ds.id.get_chunk_info(0) 78 | codecs = codecs_from_dataset(ds) 79 | with open(filter_encoded_hdf5_file, "rb") as file: 80 | file.seek(chunk_info.byte_offset) 81 | bytes_read = file.read(chunk_info.size) 82 | decoded = codecs[0].decode(bytes_read) 83 | if isinstance(decoded, np.ndarray): 84 | assert decoded.tobytes() == np_uncompressed.tobytes() 85 | else: 86 | assert decoded == np_uncompressed.tobytes() 87 | 88 | 89 | @requires_hdf5plugin 90 | @requires_imagecodecs 91 | class TestCFCodecFromDataset: 92 | def test_no_cf_convention(self, filter_encoded_hdf5_file): 93 | f = h5py.File(filter_encoded_hdf5_file) 94 | ds = f["data"] 95 | cf_codec = cfcodec_from_dataset(ds) 96 | assert cf_codec is None 97 | 98 | def test_cf_scale_factor(self, netcdf4_file): 99 | f = h5py.File(netcdf4_file) 100 | ds = f["air"] 101 | cf_codec = cfcodec_from_dataset(ds) 102 | assert cf_codec["target_dtype"] == np.dtype(np.float64) 103 | assert cf_codec["codec"].scale == 100.0 104 | assert cf_codec["codec"].offset == 0 105 | assert cf_codec["codec"].dtype == " xr.Dataset: 18 | return xr.Dataset( 19 | {"x": xr.DataArray([10, 20, 30], dims="a", coords={"a": [0, 1, 2]})} 20 | ) 21 | 22 | 23 | def test_fsspec_openfile_from_path(tmp_path: pathlib.Path, dataset: xr.Dataset) -> None: 24 | f = tmp_path / "dataset.nc" 25 | dataset.to_netcdf(f) 26 | 27 | result = _FsspecFSFromFilepath(filepath=f.as_posix()).open_file() 28 | assert isinstance(result, fsspec.implementations.local.LocalFileOpener) 29 | 30 | 31 | @requires_scipy 32 | def test_fsspec_openfile_memory(dataset: xr.Dataset): 33 | fs = fsspec.filesystem("memory") 34 | with contextlib.redirect_stderr(None): 35 | # Suppress "Exception ignored in: " 36 | with fs.open("dataset.nc", mode="wb") as f: 37 | dataset.to_netcdf(f, engine="h5netcdf") 38 | 39 | result = _FsspecFSFromFilepath(filepath="memory://dataset.nc").open_file() 40 | with result: 41 | assert isinstance(result, fsspec.implementations.memory.MemoryFile) 42 | 43 | 44 | def test_copy_and_replace_metadata(array_v3_metadata): 45 | old_metadata = array_v3_metadata( 46 | shape=(10, 10), 47 | data_type=np.dtype("float32"), 48 | chunks=(5, 5), 49 | fill_value=0, 50 | ) 51 | 52 | new_shape = (20, 20) 53 | new_chunks = (10, 10) 54 | 55 | # Test updating both shape and chunk shape 56 | updated_metadata = copy_and_replace_metadata( 57 | old_metadata, new_shape=new_shape, new_chunks=new_chunks 58 | ) 59 | assert updated_metadata.shape == tuple(new_shape) 60 | assert updated_metadata.chunks == tuple(new_chunks) 61 | # Test other values are still the same 62 | assert updated_metadata.data_type == old_metadata.data_type 63 | assert updated_metadata.fill_value == old_metadata.fill_value 64 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/tests/test_writers/__init__.py -------------------------------------------------------------------------------- /virtualizarr/tests/test_writers/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from xarray import Dataset 4 | from xarray.core.variable 
import Variable 5 | 6 | from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC 7 | from virtualizarr.manifests import ChunkManifest, ManifestArray 8 | 9 | 10 | @pytest.fixture 11 | def vds_with_manifest_arrays(array_v3_metadata) -> Dataset: 12 | arr = ManifestArray( 13 | chunkmanifest=ChunkManifest( 14 | entries={"0.0": dict(path="/test.nc", offset=6144, length=48)} 15 | ), 16 | metadata=array_v3_metadata( 17 | shape=(2, 3), 18 | data_type=np.dtype(" None: 31 | import obstore as obs 32 | 33 | parsed = urlparse(path) 34 | 35 | self._reader = obs.open_reader(store, parsed.path) 36 | 37 | def read(self, size: int, /) -> bytes: 38 | return self._reader.read(size).to_bytes() 39 | 40 | def seek(self, offset: int, whence: int = 0, /): 41 | # TODO: Check on default for whence 42 | return self._reader.seek(offset, whence) 43 | 44 | def tell(self) -> int: 45 | return self._reader.tell() 46 | 47 | 48 | @dataclass 49 | class _FsspecFSFromFilepath: 50 | """Class to create fsspec Filesystem from input filepath. 51 | 52 | Parameters 53 | ---------- 54 | filepath : str 55 | Input filepath 56 | reader_options : dict, optional 57 | dict containing kwargs to pass to file opener, by default {} 58 | fs : Option | None 59 | The fsspec filesystem object, created in __post_init__ 60 | 61 | """ 62 | 63 | filepath: str 64 | reader_options: Optional[dict] = field(default_factory=dict) 65 | fs: fsspec.AbstractFileSystem = field(init=False) 66 | upath: upath.core.UPath = field(init=False) 67 | 68 | def open_file(self) -> OpenFileType: 69 | """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input. 70 | 71 | Returns 72 | ------- 73 | OpenFileType 74 | file opened with fsspec 75 | """ 76 | return self.fs.open(self.filepath) 77 | 78 | def read_bytes(self, bytes: int) -> bytes: 79 | with self.open_file() as of: 80 | return of.read(bytes) 81 | 82 | def get_mapper(self): 83 | """Returns a mapper for use with Zarr""" 84 | return self.fs.get_mapper(self.filepath) 85 | 86 | def __post_init__(self) -> None: 87 | """Initialize the fsspec filesystem object""" 88 | import fsspec 89 | from upath import UPath 90 | 91 | if not isinstance(self.filepath, UPath): 92 | upath = UPath(self.filepath) 93 | 94 | self.upath = upath 95 | self.protocol = upath.protocol 96 | 97 | self.reader_options = self.reader_options or {} 98 | storage_options = self.reader_options.get("storage_options", {}) # type: ignore 99 | 100 | self.fs = fsspec.filesystem(self.protocol, **storage_options) 101 | 102 | 103 | def check_for_collisions( 104 | drop_variables: Iterable[str] | None, 105 | loadable_variables: Iterable[str] | None, 106 | ) -> tuple[list[str], list[str]]: 107 | if drop_variables is None: 108 | drop_variables = [] 109 | elif isinstance(drop_variables, str): 110 | drop_variables = [drop_variables] 111 | else: 112 | drop_variables = list(drop_variables) 113 | 114 | if loadable_variables is None: 115 | loadable_variables = [] 116 | elif isinstance(loadable_variables, str): 117 | loadable_variables = [loadable_variables] 118 | else: 119 | loadable_variables = list(loadable_variables) 120 | 121 | common = set(drop_variables).intersection(set(loadable_variables)) 122 | if common: 123 | raise ValueError(f"Cannot both load and drop variables {common}") 124 | 125 | return drop_variables, loadable_variables 126 | 127 | 128 | def soft_import(name: str, reason: str, strict: Optional[bool] = True): 129 | try: 130 | return importlib.import_module(name) 131 | except (ImportError, ModuleNotFoundError): 132 | if strict: 133 | raise ImportError( 
134 | f"for {reason}, the {name} package is required. " 135 | f"Please install it via pip or conda." 136 | ) 137 | else: 138 | return None 139 | 140 | 141 | def ceildiv(a: int, b: int) -> int: 142 | """ 143 | Ceiling division operator for integers. 144 | 145 | See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python 146 | """ 147 | return -(a // -b) 148 | 149 | 150 | def determine_chunk_grid_shape( 151 | shape: tuple[int, ...], chunks: tuple[int, ...] 152 | ) -> tuple[int, ...]: 153 | """Calculate the shape of the chunk grid based on array shape and chunk size.""" 154 | return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) 155 | 156 | 157 | def convert_v3_to_v2_metadata( 158 | v3_metadata: ArrayV3Metadata, fill_value: Any = None 159 | ) -> ArrayV2Metadata: 160 | """ 161 | Convert ArrayV3Metadata to ArrayV2Metadata. 162 | 163 | Parameters 164 | ---------- 165 | v3_metadata : ArrayV3Metadata 166 | The metadata object in v3 format. 167 | fill_value : Any, optional 168 | Override the fill value from v3 metadata. 169 | 170 | Returns 171 | ------- 172 | ArrayV2Metadata 173 | The metadata object in v2 format. 174 | """ 175 | import warnings 176 | 177 | array_filters: tuple[ArrayArrayCodec, ...] 178 | bytes_compressors: tuple[BytesBytesCodec, ...] 179 | array_filters, _, bytes_compressors = extract_codecs(v3_metadata.codecs) 180 | # Handle compressor configuration 181 | compressor_config: dict[str, Any] | None = None 182 | if bytes_compressors: 183 | if len(bytes_compressors) > 1: 184 | warnings.warn( 185 | "Multiple compressors found in v3 metadata. Using the first compressor, " 186 | "others will be ignored. This may affect data compatibility.", 187 | UserWarning, 188 | ) 189 | compressor_config = get_codec_config(bytes_compressors[0]) 190 | 191 | # Handle filter configurations 192 | filter_configs = [get_codec_config(filter_) for filter_ in array_filters] 193 | 194 | v2_metadata = ArrayV2Metadata( 195 | shape=v3_metadata.shape, 196 | dtype=v3_metadata.data_type.to_numpy(), 197 | chunks=v3_metadata.chunks, 198 | fill_value=fill_value or v3_metadata.fill_value, 199 | compressor=compressor_config, 200 | filters=filter_configs, 201 | order="C", 202 | attributes=v3_metadata.attributes, 203 | dimension_separator=".", # Assuming '.' 
as default dimension separator 204 | ) 205 | return v2_metadata 206 | -------------------------------------------------------------------------------- /virtualizarr/vendor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/vendor/__init__.py -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/vendor/zarr/__init__.py -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/vendor/zarr/core/__init__.py -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/core/common.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from itertools import starmap 3 | from typing import ( 4 | Any, 5 | Awaitable, 6 | Callable, 7 | Iterable, 8 | TypeVar, 9 | ) 10 | 11 | # Vendored directly from Zarr-python V3's private API 12 | # https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53 13 | T = TypeVar("T", bound=tuple[Any, ...]) 14 | V = TypeVar("V") 15 | 16 | 17 | async def _concurrent_map( 18 | items: Iterable[T], 19 | func: Callable[..., Awaitable[V]], 20 | limit: int | None = None, 21 | ) -> list[V]: 22 | if limit is None: 23 | return await asyncio.gather(*list(starmap(func, items))) 24 | 25 | else: 26 | sem = asyncio.Semaphore(limit) 27 | 28 | async def run(item: tuple[Any]) -> V: 29 | async with sem: 30 | return await func(*item) 31 | 32 | return await asyncio.gather( 33 | *[asyncio.ensure_future(run(item)) for item in items] 34 | ) 35 | -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/core/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import numpy as np 5 | from zarr.core.buffer import Buffer, BufferPrototype 6 | from zarr.core.metadata.v3 import V3JsonEncoder 7 | 8 | 9 | def _replace_special_floats(obj: object) -> Any: 10 | """Helper function to replace NaN/Inf/-Inf values with special strings 11 | 12 | Note: this cannot be done in the V3JsonEncoder because Python's `json.dumps` optimistically 13 | converts NaN/Inf values to special types outside of the encoding step. 
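
    For example, ``{"fill_value": float("nan")}`` becomes
    ``{"fill_value": "NaN"}``; ``inf`` and ``-inf`` are spelled ``"Infinity"``
    and ``"-Infinity"`` respectively, matching the strings Zarr v3 metadata
    uses for non-finite fill values.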
14 | """ 15 | if isinstance(obj, float): 16 | if np.isnan(obj): 17 | return "NaN" 18 | elif np.isinf(obj): 19 | return "Infinity" if obj > 0 else "-Infinity" 20 | elif isinstance(obj, dict): 21 | # Recursively replace in dictionaries 22 | return {k: _replace_special_floats(v) for k, v in obj.items()} 23 | elif isinstance(obj, list): 24 | # Recursively replace in lists 25 | return [_replace_special_floats(item) for item in obj] 26 | return obj 27 | 28 | 29 | def dict_to_buffer(input: dict, prototype: BufferPrototype) -> Buffer: 30 | # modified from ArrayV3Metadata.to_buffer_dict 31 | d = _replace_special_floats(input) 32 | return prototype.buffer.from_bytes(json.dumps(d, cls=V3JsonEncoder).encode()) 33 | -------------------------------------------------------------------------------- /virtualizarr/writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/writers/__init__.py -------------------------------------------------------------------------------- /virtualizarr/writers/kerchunk.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from typing import cast 4 | 5 | import numpy as np 6 | from xarray import Dataset, Variable 7 | from xarray.coding.times import CFDatetimeCoder 8 | from xarray.conventions import encode_dataset_coordinates 9 | 10 | from virtualizarr.manifests.manifest import join 11 | from virtualizarr.types.kerchunk import KerchunkArrRefs, KerchunkStoreRefs 12 | from virtualizarr.utils import convert_v3_to_v2_metadata 13 | 14 | 15 | class NumpyEncoder(json.JSONEncoder): 16 | """JSON encoder that handles common scientific Python types found in attributes. 17 | 18 | This encoder converts various Python types to JSON-serializable formats: 19 | - NumPy arrays and scalars to Python lists and native types 20 | - NumPy dtypes to strings 21 | - Sets to lists 22 | - Other objects that implement __array__ to lists 23 | - Objects with to_dict method (like pandas objects) 24 | - Objects with __str__ method as fallback 25 | """ 26 | 27 | def default(self, obj): 28 | if isinstance(obj, np.ndarray): 29 | return obj.tolist() # Convert NumPy array to Python list 30 | elif isinstance(obj, np.generic): 31 | return obj.item() # Convert NumPy scalar to Python scalar 32 | elif isinstance(obj, np.dtype): 33 | return str(obj) 34 | elif isinstance(obj, set): 35 | return list(obj) # Convert sets to lists 36 | elif hasattr(obj, "__array__"): 37 | return np.asarray(obj).tolist() # Handle array-like objects 38 | elif hasattr(obj, "to_dict"): 39 | return obj.to_dict() # Handle objects with to_dict method 40 | 41 | try: 42 | return json.JSONEncoder.default(self, obj) 43 | except TypeError: 44 | if hasattr(obj, "__str__"): 45 | return str(obj) 46 | raise 47 | 48 | 49 | def dataset_to_kerchunk_refs(ds: Dataset) -> KerchunkStoreRefs: 50 | """ 51 | Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). 
52 | """ 53 | 54 | import ujson 55 | 56 | # xarray's .to_zarr() does this, so we need to do it for kerchunk too 57 | variables, attrs = encode_dataset_coordinates(ds) 58 | 59 | all_arr_refs = {} 60 | for var_name, var in variables.items(): 61 | arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) 62 | 63 | prepended_with_var_name = { 64 | f"{var_name}/{key}": val for key, val in arr_refs.items() 65 | } 66 | all_arr_refs.update(prepended_with_var_name) 67 | 68 | ds_refs = { 69 | "version": 1, 70 | "refs": { 71 | ".zgroup": '{"zarr_format":2}', 72 | ".zattrs": ujson.dumps(attrs), 73 | **all_arr_refs, 74 | }, 75 | } 76 | 77 | return cast(KerchunkStoreRefs, ds_refs) 78 | 79 | 80 | def remove_file_uri_prefix(path: str): 81 | if path.startswith("file:///"): 82 | return path.removeprefix("file://") 83 | else: 84 | return path 85 | 86 | 87 | def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRefs: 88 | """ 89 | Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). 90 | 91 | Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). 92 | """ 93 | from virtualizarr.manifests import ManifestArray 94 | from virtualizarr.translators.kerchunk import to_kerchunk_json 95 | 96 | if isinstance(var.data, ManifestArray): 97 | marr = var.data 98 | 99 | arr_refs: dict[str, str | list[str | int]] = { 100 | str(chunk_key): [ 101 | remove_file_uri_prefix(entry["path"]), 102 | entry["offset"], 103 | entry["length"], 104 | ] 105 | for chunk_key, entry in marr.manifest.dict().items() 106 | } 107 | array_v2_metadata = convert_v3_to_v2_metadata(marr.metadata) 108 | zattrs = {**var.attrs, **var.encoding} 109 | else: 110 | from xarray.backends.zarr import encode_zarr_variable 111 | from zarr.core.metadata.v2 import ArrayV2Metadata 112 | 113 | var = encode_zarr_variable(var) 114 | try: 115 | np_arr = var.to_numpy() 116 | except AttributeError as e: 117 | raise TypeError( 118 | f"Can only serialize wrapped arrays of type ManifestArray or numpy.ndarray, but got type {type(var.data)}" 119 | ) from e 120 | 121 | if var.encoding: 122 | if "scale_factor" in var.encoding: 123 | raise NotImplementedError( 124 | f"Cannot serialize loaded variable {var_name}, as it is encoded with a scale_factor" 125 | ) 126 | if "offset" in var.encoding: 127 | raise NotImplementedError( 128 | f"Cannot serialize loaded variable {var_name}, as it is encoded with an offset" 129 | ) 130 | if "calendar" in var.encoding: 131 | np_arr = CFDatetimeCoder().encode(var.copy(), name=var_name).values 132 | dtype = var.encoding.get("dtype", None) 133 | if dtype and np_arr.dtype != dtype: 134 | np_arr = np.asarray(np_arr, dtype=dtype) 135 | 136 | # This encoding is what kerchunk does when it "inlines" data, see https://github.com/fsspec/kerchunk/blob/a0c4f3b828d37f6d07995925b324595af68c4a19/kerchunk/hdf.py#L472 137 | byte_data = np_arr.tobytes() 138 | # TODO do I really need to encode then decode like this? 139 | inlined_data = (b"base64:" + base64.b64encode(byte_data)).decode("utf-8") 140 | 141 | # TODO can this be generalized to save individual chunks of a dask array? 142 | # TODO will this fail for a scalar? 
143 | arr_refs = {join(0 for _ in np_arr.shape): inlined_data} 144 | 145 | array_v2_metadata = ArrayV2Metadata( 146 | chunks=np_arr.shape, 147 | shape=np_arr.shape, 148 | dtype=np_arr.dtype, 149 | order="C", 150 | fill_value=None, 151 | ) 152 | zattrs = {**var.attrs} 153 | 154 | zarray_dict = to_kerchunk_json(array_v2_metadata) 155 | arr_refs[".zarray"] = zarray_dict 156 | 157 | zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) 158 | arr_refs[".zattrs"] = json.dumps(zattrs, separators=(",", ":"), cls=NumpyEncoder) 159 | 160 | return cast(KerchunkArrRefs, arr_refs) 161 | -------------------------------------------------------------------------------- /virtualizarr/xarray.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Mapping 2 | from typing import ( 3 | Any, 4 | Hashable, 5 | MutableMapping, 6 | Optional, 7 | ) 8 | 9 | import xarray as xr 10 | import xarray.indexes 11 | 12 | from virtualizarr.manifests import ManifestStore 13 | from virtualizarr.utils import _FsspecFSFromFilepath 14 | 15 | 16 | def construct_fully_virtual_dataset( 17 | virtual_vars: Mapping[str, xr.Variable], 18 | coord_names: Iterable[str] | None = None, 19 | attrs: dict[str, Any] | None = None, 20 | ) -> xr.Dataset: 21 | """Construct a fully virtual Dataset from constituent parts.""" 22 | 23 | data_vars, coords = separate_coords( 24 | vars=virtual_vars, 25 | indexes={}, # we specifically avoid creating any indexes yet to avoid loading any data 26 | coord_names=coord_names, 27 | ) 28 | 29 | vds = xr.Dataset( 30 | data_vars=data_vars, 31 | coords=coords, 32 | attrs=attrs, 33 | ) 34 | 35 | return vds 36 | 37 | 38 | def construct_virtual_dataset( 39 | manifest_store: ManifestStore | None = None, 40 | # TODO remove filepath option once all readers use ManifestStore approach 41 | fully_virtual_ds: xr.Dataset | None = None, 42 | filepath: str | None = None, 43 | group: str | None = None, 44 | loadable_variables: Iterable[Hashable] | None = None, 45 | decode_times: bool | None = None, 46 | indexes: Mapping[str, xr.Index] | None = None, 47 | reader_options: Optional[dict] = None, 48 | ) -> xr.Dataset: 49 | """ 50 | Construct a fully or partly virtual dataset from a ManifestStore (or filepath for backwards compatibility), 51 | containing the contents of one group. 52 | 53 | Accepts EITHER manifest_store OR fully_virtual_ds and filepath. The latter option should be removed once all readers use ManifestStore approach. 
54 | """ 55 | 56 | if indexes is not None: 57 | raise NotImplementedError() 58 | 59 | if manifest_store: 60 | if group: 61 | raise NotImplementedError( 62 | "ManifestStore does not yet support nested groups" 63 | ) 64 | else: 65 | manifestgroup = manifest_store._group 66 | 67 | fully_virtual_ds = manifestgroup.to_virtual_dataset() 68 | 69 | with xr.open_zarr( 70 | manifest_store, 71 | group=group, 72 | consolidated=False, 73 | zarr_format=3, 74 | chunks=None, 75 | decode_times=decode_times, 76 | ) as loadable_ds: 77 | return replace_virtual_with_loadable_vars( 78 | fully_virtual_ds, loadable_ds, loadable_variables 79 | ) 80 | else: 81 | # TODO pre-ManifestStore codepath, remove once all readers use ManifestStore approach 82 | 83 | fpath = _FsspecFSFromFilepath( 84 | filepath=filepath, # type: ignore[arg-type] 85 | reader_options=reader_options, 86 | ).open_file() 87 | 88 | with xr.open_dataset( 89 | fpath, # type: ignore[arg-type] 90 | group=group, 91 | decode_times=decode_times, 92 | ) as loadable_ds: 93 | return replace_virtual_with_loadable_vars( 94 | fully_virtual_ds, # type: ignore[arg-type] 95 | loadable_ds, 96 | loadable_variables, 97 | ) 98 | 99 | 100 | def replace_virtual_with_loadable_vars( 101 | fully_virtual_ds: xr.Dataset, 102 | loadable_ds: xr.Dataset, 103 | loadable_variables: Iterable[Hashable] | None = None, 104 | ) -> xr.Dataset: 105 | """ 106 | Merge a fully virtual and the corresponding fully loadable dataset, keeping only `loadable_variables` from the latter (plus defaults needed for indexes). 107 | """ 108 | 109 | var_names_to_load: list[Hashable] 110 | 111 | if isinstance(loadable_variables, list): 112 | var_names_to_load = list(loadable_variables) 113 | elif loadable_variables is None: 114 | # If `loadable_variables` is None, then we have to explicitly match default 115 | # behaviour of xarray, i.e., load and create indexes only for dimension 116 | # coordinate variables. We already have all the indexes and variables 117 | # we should be keeping - we just need to distinguish them. 118 | var_names_to_load = [ 119 | name for name, var in loadable_ds.variables.items() if var.dims == (name,) 120 | ] 121 | else: 122 | raise ValueError( 123 | "loadable_variables must be an iterable of string variable names," 124 | f" or None, but got type {type(loadable_variables)}" 125 | ) 126 | 127 | # this will automatically keep any IndexVariables needed for loadable 1D coordinates 128 | loadable_var_names_to_drop = set(loadable_ds.variables).difference( 129 | var_names_to_load 130 | ) 131 | ds_loadable_to_keep = loadable_ds.drop_vars( 132 | loadable_var_names_to_drop, errors="ignore" 133 | ) 134 | 135 | ds_virtual_to_keep = fully_virtual_ds.drop_vars(var_names_to_load, errors="ignore") 136 | 137 | # we don't need `compat` or `join` kwargs here because there should be no variables with the same name in both datasets 138 | return xr.merge( 139 | [ 140 | ds_loadable_to_keep, 141 | ds_virtual_to_keep, 142 | ], 143 | ) 144 | 145 | 146 | # TODO this probably doesn't need to actually support indexes != {} 147 | def separate_coords( 148 | vars: Mapping[str, xr.Variable], 149 | indexes: MutableMapping[str, xr.Index], 150 | coord_names: Iterable[str] | None = None, 151 | ) -> tuple[dict[str, xr.Variable], xr.Coordinates]: 152 | """ 153 | Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. 154 | 155 | Currently requires this function as a workaround unless xarray PR #8124 is merged. 
156 | 157 | Will also preserve any loaded variables and indexes it is passed. 158 | """ 159 | 160 | if coord_names is None: 161 | coord_names = [] 162 | 163 | # split data and coordinate variables (promote dimension coordinates) 164 | data_vars = {} 165 | coord_vars: dict[ 166 | str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable 167 | ] = {} 168 | found_coord_names: set[str] = set() 169 | # Search through variable attributes for coordinate names 170 | for var in vars.values(): 171 | if "coordinates" in var.attrs: 172 | found_coord_names.update(var.attrs["coordinates"].split(" ")) 173 | for name, var in vars.items(): 174 | if name in coord_names or var.dims == (name,) or name in found_coord_names: 175 | # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 176 | if len(var.dims) == 1: 177 | dim1d, *_ = var.dims 178 | coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding) 179 | 180 | if isinstance(var, xr.IndexVariable): 181 | # unless variable actually already is a loaded IndexVariable, 182 | # in which case we need to keep it and add the corresponding indexes explicitly 183 | coord_vars[str(name)] = var 184 | # TODO this seems suspect - will it handle datetimes? 185 | indexes[name] = xarray.indexes.PandasIndex(var, dim1d) 186 | else: 187 | coord_vars[name] = var 188 | else: 189 | data_vars[name] = var 190 | 191 | coords = xr.Coordinates(coord_vars, indexes=indexes) 192 | 193 | return data_vars, coords 194 | --------------------------------------------------------------------------------
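A minimal usage sketch, not part of the repository tree above, assuming the virtualizarr package laid out here is importable in your environment: it exercises the chunk-grid helpers from virtualizarr/utils.py and the NumpyEncoder from virtualizarr/writers/kerchunk.py on their own. The attribute names used are made up for illustration.

import json

import numpy as np

from virtualizarr.utils import ceildiv, determine_chunk_grid_shape
from virtualizarr.writers.kerchunk import NumpyEncoder

# ceildiv rounds up: a 1000-element axis split into chunks of 300 needs 4 chunks
assert ceildiv(1000, 300) == 4

# determine_chunk_grid_shape applies ceildiv independently to each axis
assert determine_chunk_grid_shape(shape=(1000, 500), chunks=(300, 500)) == (4, 1)

# NumpyEncoder lets json.dumps handle NumPy scalars and arrays found in attributes
attrs = {"scale": np.float32(0.5), "valid_range": np.array([0, 100])}
print(json.dumps(attrs, cls=NumpyEncoder))  # {"scale": 0.5, "valid_range": [0, 100]}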
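Similarly, a sketch of driving the vendored _concurrent_map helper from virtualizarr/vendor/zarr/core/common.py directly; note it mirrors zarr-python's private API and may change. The `add` coroutine below is a hypothetical stand-in for real async work such as a byte-range fetch.

import asyncio

from virtualizarr.vendor.zarr.core.common import _concurrent_map


async def add(a: int, b: int) -> int:
    # placeholder coroutine standing in for an I/O-bound task
    await asyncio.sleep(0)
    return a + b


# each item is a tuple of positional arguments for `add`; `limit` caps concurrency via a semaphore
results = asyncio.run(_concurrent_map([(1, 2), (3, 4), (5, 6)], add, limit=2))
print(results)  # [3, 7, 11]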