├── .github ├── PULL_REQUEST_TEMPLATE.md ├── codecov.yml └── workflows │ ├── main.yml │ ├── release.yml │ ├── typing.yml │ └── upstream.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── LICENSE ├── README.md ├── citation.cff ├── conftest.py ├── docs ├── Makefile ├── _static │ └── custom.css ├── api.rst ├── conf.py ├── contributing.md ├── core_team_guide.md ├── custom_readers.md ├── data_structures.md ├── examples.md ├── faq.md ├── index.md ├── installation.md ├── make.bat ├── releases.rst └── usage.md ├── examples ├── append │ └── noaa-cdr-sst.ipynb ├── coiled │ └── terraclimate.ipynb ├── mursst-icechunk-with-lithops │ ├── Dockerfile │ ├── README.md │ ├── __init__.py │ ├── cli.py │ ├── config.py │ ├── ec2_for_lithops_runtime │ │ ├── 00-create-security-group.sh │ │ ├── 01-launch-ec2.sh │ │ ├── 02-setup-ec2-role.sh │ │ ├── 03-setup-ec2.sh │ │ └── README.md │ ├── helpers.py │ ├── lithops.yaml │ ├── lithops_functions.py │ ├── main.py │ ├── models.py │ ├── repo.py │ ├── requirements.txt │ ├── url_utils.py │ ├── virtual_datasets.py │ └── zarr_operations.py └── virtualizarr-with-lithops │ ├── Dockerfile_virtualizarr │ ├── README.md │ ├── lithops.yaml │ ├── requirements.txt │ └── virtualizarr-with-lithops.py ├── pyproject.toml └── virtualizarr ├── __init__.py ├── accessor.py ├── backend.py ├── codecs.py ├── manifests ├── __init__.py ├── array.py ├── array_api.py ├── group.py ├── manifest.py ├── store.py └── utils.py ├── parallel.py ├── py.typed ├── readers ├── __init__.py ├── api.py ├── dmrpp.py ├── fits.py ├── hdf │ ├── __init__.py │ ├── filters.py │ └── hdf.py ├── hdf5.py ├── kerchunk.py ├── netcdf3.py ├── tiff.py └── zarr.py ├── tests ├── __init__.py ├── conftest.py ├── test_backend.py ├── test_codecs.py ├── test_integration.py ├── test_manifests │ ├── __init__.py │ ├── test_array.py │ ├── test_group.py │ ├── test_manifest.py │ └── test_store.py ├── test_readers │ ├── __init__.py │ ├── conftest.py │ ├── test_dmrpp.py │ ├── test_fits.py │ ├── test_hdf │ │ ├── test_hdf.py │ │ ├── test_hdf_filters.py │ │ ├── test_hdf_integration.py │ │ └── test_hdf_manifest_store.py │ ├── test_kerchunk.py │ ├── test_netcdf3.py │ └── test_zarr.py ├── test_utils.py ├── test_writers │ ├── __init__.py │ ├── conftest.py │ ├── test_icechunk.py │ └── test_kerchunk.py └── test_xarray.py ├── translators ├── __init__.py └── kerchunk.py ├── types ├── __init__.py ├── general.py └── kerchunk.py ├── utils.py ├── vendor ├── __init__.py └── zarr │ ├── __init__.py │ └── core │ ├── __init__.py │ ├── common.py │ └── metadata.py ├── writers ├── __init__.py ├── icechunk.py └── kerchunk.py └── xarray.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [ ] Closes #xxxx 4 | - [ ] Tests added 5 | - [ ] Tests passing 6 | - [ ] Full type hint coverage 7 | - [ ] Changes are documented in `docs/releases.rst` 8 | - [ ] New functions/methods are listed in `api.rst` 9 | - [ ] New functionality has documentation 10 | -------------------------------------------------------------------------------- /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 75 6 | # See https://json.schemastore.org/codecov.json 7 | threshold: "0.1%" 8 | patch: 9 | default: 10 | target: 75 11 | comment: 12 | layout: "diff, files" 13 | behavior: default 14 | require_changes: true # if true: only post the comment if coverage changes 15 | branches: # branch names 
that can post comment 16 | - "main" 17 | - "develop" 18 | ignore: 19 | - "conftest.py" 20 | - "virtualizarr/tests" # ignore folders and all its contents 21 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" , "develop"] 6 | paths-ignore: 7 | - 'docs/**' 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | paths-ignore: 11 | - 'docs/**' 12 | schedule: 13 | - cron: "0 0 * * *" 14 | 15 | concurrency: 16 | group: ${{ github.workflow }}-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | test: 21 | name: ${{ matrix.environment }}-build 22 | runs-on: ubuntu-latest 23 | defaults: 24 | run: 25 | shell: bash -l {0} 26 | strategy: 27 | matrix: 28 | environment: [test-py311, test-py312, min-deps, minio] 29 | steps: 30 | - uses: actions/checkout@v4 31 | - uses: prefix-dev/setup-pixi@v0.8.3 32 | with: 33 | pixi-version: v0.41.4 34 | environments: ${{ matrix.environment }} 35 | 36 | - name: List installed libraries 37 | run: | 38 | pixi install --environment ${{ matrix.environment }} 39 | pixi list --environment ${{ matrix.environment }} 40 | 41 | - name: Running Tests 42 | run: | 43 | pixi run -e ${{ matrix.environment }} run-tests-xml-cov 44 | 45 | - name: Upload code coverage to Codecov 46 | uses: codecov/codecov-action@v3.1.4 47 | with: 48 | file: ./coverage.xml 49 | flags: unittests 50 | env_vars: OS,PYTHON 51 | name: codecov-umbrella 52 | fail_ci_if_error: false 53 | token: ${{ secrets.CODECOV_TOKEN }} 54 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build distribution 2 | on: 3 | release: 4 | types: 5 | - published 6 | # push: 7 | # branches: [ "main" ] 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | 11 | permissions: 12 | contents: read 13 | 14 | env: 15 | PIP_ROOT_USER_ACTION: ignore 16 | 17 | jobs: 18 | build-artifacts: 19 | runs-on: ubuntu-latest 20 | if: github.repository == 'zarr-developers/VirtualiZarr' 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | - uses: actions/setup-python@v5.0.0 26 | name: Install Python 27 | with: 28 | python-version: "3.12" 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install setuptools setuptools-scm wheel twine check-manifest 33 | git clean -xdf 34 | git restore -SW . 35 | - name: Build tarball and wheels 36 | run: | 37 | python -m build --sdist --wheel . 
38 | - name: Check built artifacts 39 | run: | 40 | python -m twine check --strict dist/* 41 | pwd 42 | if [ -f dist/virtualizarr-unknown.tar.gz ]; then 43 | echo "❌ INVALID VERSION NUMBER" 44 | exit 1 45 | else 46 | echo "✅ Looks good" 47 | fi 48 | - uses: actions/upload-artifact@v4 49 | with: 50 | name: releases 51 | path: dist 52 | 53 | test-built-dist: 54 | needs: build-artifacts 55 | runs-on: ubuntu-latest 56 | environment: 57 | name: test-release 58 | url: https://test.pypi.org/p/virtualizarr 59 | permissions: 60 | id-token: write 61 | steps: 62 | - uses: actions/setup-python@v5.0.0 63 | name: Install Python 64 | with: 65 | python-version: "3.12" 66 | - uses: actions/download-artifact@v4 67 | with: 68 | name: releases 69 | path: dist 70 | - name: List contents of built dist 71 | run: | 72 | ls -ltrh 73 | ls -ltrh dist 74 | - name: Verify the built dist/wheel is valid 75 | run: | 76 | python -m pip install --upgrade pip 77 | python -m pip install dist/virtualizarr*.whl 78 | python -c "import virtualizarr; print(virtualizarr.__version__)" 79 | - name: Publish package to TestPyPI 80 | if: github.event_name == 'release' 81 | uses: pypa/gh-action-pypi-publish@v1.8.14 82 | with: 83 | repository-url: https://test.pypi.org/legacy/ 84 | # verbose: true 85 | 86 | upload-to-pypi: 87 | needs: test-built-dist 88 | if: github.event_name == 'release' 89 | runs-on: ubuntu-latest 90 | environment: 91 | name: release 92 | url: https://pypi.org/p/virtualizarr 93 | permissions: 94 | id-token: write 95 | steps: 96 | - uses: actions/download-artifact@v4 97 | with: 98 | name: releases 99 | path: dist 100 | - name: Publish package to PyPI 101 | uses: pypa/gh-action-pypi-publish@v1.8.14 102 | -------------------------------------------------------------------------------- /.github/workflows/typing.yml: -------------------------------------------------------------------------------- 1 | name: Typing 2 | 3 | on: 4 | push: 5 | branches: [ "main" , "develop"] 6 | paths-ignore: 7 | - 'docs/**' 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | paths-ignore: 11 | - 'docs/**' 12 | schedule: 13 | - cron: "0 0 * * *" 14 | 15 | concurrency: 16 | group: ${{ github.workflow }}-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | env: 20 | PIP_ROOT_USER_ACTION: ignore 21 | 22 | jobs: 23 | mypy: 24 | name: mypy 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v4 28 | - uses: prefix-dev/setup-pixi@v0.8.3 29 | with: 30 | pixi-version: v0.41.4 31 | - name: List installed libraries 32 | run: | 33 | pixi install --environment test 34 | pixi list --environment test 35 | - name: Type check 36 | run: | 37 | pixi run -e test run-mypy 38 | -------------------------------------------------------------------------------- /.github/workflows/upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream 2 | 3 | on: 4 | push: 5 | branches: [ "main" , "develop"] 6 | paths-ignore: 7 | - 'docs/**' 8 | pull_request: 9 | branches: [ "main" , "develop"] 10 | types: [ labeled ] 11 | paths-ignore: 12 | - 'docs/**' 13 | schedule: 14 | - cron: "0 0 * * *" 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | test-upstream: 22 | name: ${{ matrix.environment }}-build 23 | if: | 24 | github.event_name == 'push' || 25 | github.event_name == 'schedule' || 26 | (github.event_name == 'pull_request' && github.event.label.name == 'test-upstream') 27 | runs-on: ubuntu-latest 28 | defaults: 29 | run: 
30 | shell: bash -l {0} 31 | strategy: 32 | matrix: 33 | environment: [upstream] 34 | steps: 35 | - uses: actions/checkout@v4 36 | - uses: prefix-dev/setup-pixi@v0.8.3 37 | with: 38 | pixi-version: v0.41.4 39 | environments: ${{ matrix.environment }} 40 | 41 | - name: List installed libraries 42 | run: | 43 | pixi install --environment ${{ matrix.environment }} 44 | pixi list --environment ${{ matrix.environment }} 45 | 46 | - name: Running Tests 47 | run: | 48 | pixi run -e ${{ matrix.environment }} run-tests-xml-cov 49 | 50 | - name: Upload code coverage to Codecov 51 | uses: codecov/codecov-action@v3.1.4 52 | with: 53 | file: ./coverage.xml 54 | flags: unittests 55 | env_vars: OS,PYTHON 56 | name: codecov-umbrella 57 | fail_ci_if_error: false 58 | token: ${{ secrets.CODECOV_TOKEN }} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | virtualizarr/_version.py 162 | docs/generated/ 163 | docs/jupyter_execute/ 164 | examples/ 165 | 166 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode 167 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode 168 | 169 | ### VisualStudioCode ### 170 | .vscode 171 | 172 | # Local History for Visual Studio Code 173 | .history/ 174 | 175 | # Built Visual Studio Code Extensions 176 | *.vsix 177 | 178 | ### VisualStudioCode Patch ### 179 | # Ignore all local history of files 180 | .history 181 | .ionide 182 | 183 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode 184 | 185 | # Pixi folder 186 | .pixi/ 187 | 188 | # python virtual environment 189 | .venv 190 | venv 191 | 192 | # Pixi lock file (because it changes with every upstream commit) 193 | pixi.lock 194 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # https://pre-commit.com/ 2 | ci: 3 | autoupdate_schedule: monthly 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v5.0.0 7 | hooks: 8 | - id: trailing-whitespace 9 | - id: end-of-file-fixer 10 | - id: check-yaml 11 | 12 | - repo: https://github.com/astral-sh/ruff-pre-commit 13 | # Ruff version. 14 | rev: v0.11.8 15 | hooks: 16 | # Run the linter. 17 | - id: ruff 18 | args: [ --fix ] 19 | # Run the formatter. 
20 | - id: ruff-format 21 | - repo: https://github.com/citation-file-format/cff-converter-python 22 | rev: ebf0b5e44d67f8beaa1cd13a0d0393ea04c6058d 23 | hooks: 24 | - id: validate-cff 25 | - repo: https://github.com/codespell-project/codespell 26 | rev: v2.4.1 27 | hooks: 28 | - id: codespell 29 | args: ["-L", "fo,ihs,kake,te", "-S", "fixture"] 30 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | # this ensures a viable `mamba` is on `$PATH`` 9 | python: mambaforge-latest 10 | commands: 11 | - mamba install -c conda-forge -c nodefaults pixi 12 | - pixi install --environment docs 13 | - pixi run build-docs 14 | - pixi run readthedocs 15 | 16 | # Build documentation in the doc/ directory with Sphinx 17 | sphinx: 18 | configuration: docs/conf.py 19 | fail_on_warning: true 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VirtualiZarr 2 | 3 | [![CI](https://github.com/zarr-developers/VirtualiZarr/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/zarr-developers/VirtualiZarr/actions?query=workflow%3ACI) 4 | [![Code coverage](https://codecov.io/gh/zarr-developers/VirtualiZarr/branch/main/graph/badge.svg?flag=unittests)](https://codecov.io/gh/zarr-developers/VirtualiZarr) 5 | [![Docs](https://readthedocs.org/projects/virtualizarr/badge/?version=latest)](https://virtualizarr.readthedocs.io/en/latest/) 6 | [![Linted and Formatted with Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) 7 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 8 | [![pre-commit Enabled](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://pre-commit.com/) 9 | [![Apache 2.0 License](https://img.shields.io/badge/license-Apache%202-cb2533.svg)](https://www.apache.org/licenses/LICENSE-2.0) 10 | [![Python Versions](https://img.shields.io/python/required-version-toml?tomlFilePath=https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/main/pyproject.toml&logo=Python&logoColor=gold&label=Python)](https://docs.python.org) 11 | [![slack](https://img.shields.io/badge/slack-virtualizarr-purple.svg?logo=slack)](https://join.slack.com/t/earthmover-community/shared_invite/zt-32to7398i-HorUXmzPzyy9U87yLxweIA) 12 | [![Latest Release](https://img.shields.io/github/v/release/zarr-developers/VirtualiZarr)](https://github.com/zarr-developers/VirtualiZarr/releases) 13 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/virtualizarr?label=pypi%7Cdownloads)](https://pypistats.org/packages/virtualizarr) 14 | [![Conda - Downloads](https://img.shields.io/conda/d/conda-forge/virtualizarr 15 | )](https://anaconda.org/conda-forge/virtualizarr) 16 | 17 | 18 | 19 | ## Cloud-Optimize your Scientific Data as a Virtual Zarr Datacube, using Xarray syntax. 20 | 21 | The best way to distribute large scientific datasets is via the Cloud, in [Cloud-Optimized formats](https://guide.cloudnativegeo.org/) [^1]. But often this data is stuck in archival pre-Cloud file formats such as netCDF. 
22 | 23 | **VirtualiZarr[^2] makes it easy to create "Virtual" Zarr datacubes, allowing performant access to archival data as if it were in the Cloud-Optimized [Zarr format](https://zarr.dev/), _without duplicating any data_.** 24 | 25 | Please see the [documentation](https://virtualizarr.readthedocs.io/en/stable/index.html). 26 | 27 | ### Features 28 | 29 | * Create virtual references pointing to bytes inside an archival file with [`open_virtual_dataset`](https://virtualizarr.readthedocs.io/en/latest/usage.html#opening-files-as-virtual-datasets). 30 | * Supports a [range of archival file formats](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare), including netCDF4 and HDF5, and has a pluggable system for supporting new formats. 31 | * [Combine data from multiple files](https://virtualizarr.readthedocs.io/en/latest/usage.html#combining-virtual-datasets) into one larger datacube using [xarray's combining functions](https://docs.xarray.dev/en/stable/user-guide/combining.html), such as [`xarray.concat`](https://docs.xarray.dev/en/stable/generated/xarray.concat.html). 32 | * Commit the virtual references to storage either using the [Kerchunk references](https://fsspec.github.io/kerchunk/spec.html) specification or the [Icechunk](https://icechunk.io/) transactional storage engine. 33 | * Users access the virtual datacube simply as a single zarr-compatible store using [`xarray.open_zarr`](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html). 34 | 35 | ### Inspired by Kerchunk 36 | 37 | VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [Kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk but in a zarr-native way, and with a familiar array-like API. 38 | 39 | You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. 40 | 41 | ### Development Status and Roadmap 42 | 43 | VirtualiZarr version 1 (mostly) achieves [feature parity](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) with kerchunk's logic for combining datasets, providing an easier way to manipulate kerchunk references in memory and generate kerchunk reference files on disk. 44 | 45 | Future VirtualiZarr development will focus on generalizing and upstreaming useful concepts into the Zarr specification, the Zarr-Python library, Xarray, and possibly some new packages. 46 | 47 | We have a lot of ideas, including: 48 | - [Zarr v3 support](https://github.com/zarr-developers/VirtualiZarr/issues/17) 49 | - [Zarr-native on-disk chunk manifest format](https://github.com/zarr-developers/zarr-specs/issues/287) 50 | - ["Virtual concatenation"](https://github.com/zarr-developers/zarr-specs/issues/288) of separate Zarr arrays 51 | - ManifestArrays as an [intermediate layer in-memory](https://github.com/zarr-developers/VirtualiZarr/issues/71) in Zarr-Python 52 | - [Separating CF-related Codecs from xarray](https://github.com/zarr-developers/VirtualiZarr/issues/68#issuecomment-2197682388) 53 | - [Generating references without kerchunk](https://github.com/zarr-developers/VirtualiZarr/issues/78) 54 | 55 | If you see other opportunities then we would love to hear your ideas! 
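### Quick example

To make the workflow described under Features concrete, here is a minimal sketch adapted from the Usage section of the documentation. The `/my/files*.nc` glob is just a placeholder for your own archival files, and the final step assumes the kerchunk optional dependency is installed:

```python
import glob

import xarray as xr

from virtualizarr import open_virtual_dataset

# create a virtual dataset (references + metadata only) for each archival file
virtual_datasets = [
    open_virtual_dataset(filepath)
    for filepath in glob.glob('/my/files*.nc')
]

# combine the virtual datasets into one larger virtual datacube using xarray
virtual_ds = xr.combine_nested(virtual_datasets, concat_dim=['time'])

# persist the virtual references, here as a kerchunk-format JSON reference file
virtual_ds.virtualize.to_kerchunk('combined.json', format='json')

# later, open the virtual datacube as if it were a normal Zarr store
ds = xr.open_dataset('combined.json', engine='kerchunk', chunks={})
```

No data is copied at any point; the reference file is simply an on-disk lookup table that points back into the original netCDF files.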
56 | 57 | ### Talks and Presentations 58 | 59 | - 2024/11/21 - MET Office Architecture Guild - Tom Nicholas - [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-talk-at-met-office) 60 | - 2024/11/13 - Cloud-Native Geospatial conference - Raphael Hagen - [Slides](https://decks.carbonplan.org/cloud-native-geo/11-13-24) 61 | - 2024/07/24 - ESIP Meeting - Sean Harkins - [Event](https://2024julyesipmeeting.sched.com/event/1eVP6) / [Recording](https://youtu.be/T6QAwJIwI3Q?t=3689) 62 | - 2024/05/15 - Pangeo showcase - Tom Nicholas - [Event](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127/2) / [Recording](https://youtu.be/ioxgzhDaYiE) / [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-create-virtual-zarr-stores-using-xarray-syntax) 63 | 64 | ### Credits 65 | 66 | This package was originally developed by [Tom Nicholas](https://github.com/TomNicholas) whilst working at [[C]Worthy](https://cworthy.org), who deserve credit for allowing him to prioritise a generalizable open-source solution to the dataset virtualization problem. VirtualiZarr is now a community-owned multi-stakeholder project. 67 | 68 | ### Licence 69 | 70 | Apache 2.0 71 | 72 | ### References 73 | 74 | [^1]: [_Cloud-Native Repositories for Big Scientific Data_, Abernathey et. al., _Computing in Science & Engineering_.](https://ieeexplore.ieee.org/abstract/document/9354557) 75 | 76 | [^2]: (Pronounced "Virtual-Eye-Zarr" - like "virtualizer" but more piratey 🦜) 77 | -------------------------------------------------------------------------------- /citation.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: "VirtualiZarr" 4 | abstract: "Create virtual Zarr stores for cloud-friendly access to archival data, using familiar xarray syntax." 5 | license: Apache-2.0 6 | repository-code: "https://github.com/zarr-developers/VirtualiZarr" 7 | authors: 8 | - family-names: "Nicholas" 9 | given-names: "Thomas" 10 | orcid: "https://orcid.org/0000-0002-2176-0530" 11 | - family-names: "Hagen" 12 | given-names: "Norland" 13 | orcid: "https://orcid.org/0000-0000-0000-0000" 14 | - family-names: "Harkins" 15 | given-names: "Sean" 16 | orcid: "https://orcid.org/0000-0000-0000-0000" 17 | - family-names: "Barciauskas" 18 | given-names: "Aimee" 19 | orcid: "https://orcid.org/0000-0002-3158-9554" 20 | - family-names: "Jones" 21 | given-names: "Max" 22 | orcid: "https://orcid.org/0000-0003-0180-8928" 23 | - family-names: "Signell" 24 | given-names: "Julia" 25 | orcid: "https://orcid.org/0000-0002-4120-3192" 26 | - family-names: "Nag" 27 | given-names: "Ayush" 28 | orcid: "https://orcid.org/0009-0008-1790-597X" 29 | - family-names: "Hidalgo" 30 | given-names: "Gustavo" 31 | orcid: "https://orcid.org/0000-0000-0000-0000" 32 | - family-names: "Augspurger" 33 | given-names: "Tom" 34 | orcid: "https://orcid.org/0000-0002-8136-7087" 35 | - family-names: "Abernathey" 36 | given-names: "Ryan" 37 | orcid: "https://orcid.org/0000-0001-5999-4917" 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .bd-sidebar-primary { 2 | display: none; !important; 3 | } 4 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | API Reference 3 | ############# 4 | 5 | .. currentmodule:: virtualizarr 6 | 7 | VirtualiZarr has a small API surface, because most of the complexity is handled by xarray functions like ``xarray.concat`` and ``xarray.merge``. 8 | Users can use xarray for every step apart from reading and serializing virtual references. 9 | 10 | User API 11 | ======== 12 | 13 | Reading 14 | ------- 15 | 16 | .. currentmodule:: virtualizarr.backend 17 | .. autosummary:: 18 | :nosignatures: 19 | :toctree: generated/ 20 | 21 | open_virtual_dataset 22 | 23 | Serialization 24 | ------------- 25 | 26 | .. currentmodule:: virtualizarr.accessor 27 | .. autosummary:: 28 | :nosignatures: 29 | :toctree: generated/ 30 | 31 | VirtualiZarrDatasetAccessor.to_kerchunk 32 | VirtualiZarrDatasetAccessor.to_icechunk 33 | VirtualiZarrDataTreeAccessor.to_icechunk 34 | 35 | Information 36 | ----------- 37 | 38 | .. currentmodule:: virtualizarr.accessor 39 | .. autosummary:: 40 | :nosignatures: 41 | :toctree: generated/ 42 | 43 | VirtualiZarrDatasetAccessor.nbytes 44 | 45 | Rewriting 46 | --------- 47 | 48 | .. currentmodule:: virtualizarr.accessor 49 | .. autosummary:: 50 | :nosignatures: 51 | :toctree: generated/ 52 | 53 | VirtualiZarrDatasetAccessor.rename_paths 54 | 55 | Developer API 56 | ============= 57 | 58 | If you want to write a new reader to create virtual references pointing to a custom file format, you will need to use VirtualiZarr's internal classes. 59 | 60 | Manifests 61 | --------- 62 | 63 | VirtualiZarr uses these classes to store virtual references internally. 64 | 65 | .. currentmodule:: virtualizarr.manifests 66 | .. autosummary:: 67 | :nosignatures: 68 | :toctree: generated/ 69 | 70 | ChunkManifest 71 | ManifestArray 72 | 73 | 74 | Array API 75 | --------- 76 | 77 | VirtualiZarr's :py:class:`~virtualizarr.ManifestArray` objects support a limited subset of the Python Array API standard in :py:mod:`virtualizarr.manifests.array_api`. 78 | 79 | .. currentmodule:: virtualizarr.manifests.array_api 80 | .. autosummary:: 81 | :nosignatures: 82 | :toctree: generated/ 83 | 84 | concatenate 85 | stack 86 | expand_dims 87 | broadcast_to 88 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "virtualizarr" 10 | copyright = "2024, Thomas Nicholas" 11 | author = "Thomas Nicholas" 12 | 13 | # -- General configuration --------------------------------------------------- 14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 15 | 16 | 17 | extensions = [ 18 | "myst_nb", 19 | "sphinx.ext.autodoc", 20 | "sphinx.ext.autosummary", 21 | "sphinx.ext.extlinks", 22 | "sphinx.ext.intersphinx", 23 | "sphinx_copybutton", 24 | "sphinx_togglebutton", 25 | "sphinx_design", 26 | "sphinx.ext.napoleon", 27 | ] 28 | 29 | extlinks = { 30 | "issue": ("https://github.com/zarr-developers/virtualizarr/issues/%s", "GH%s"), 31 | "pull": ("https://github.com/zarr-developers/virtualizarr/pull/%s", "PR%s"), 32 | "discussion": ("https://github.com/zarr-developers/virtualizarr/discussions/%s", "D%s"), 33 | } 34 | 35 | # Example configuration for intersphinx: refer to the Python standard library. 36 | # use in refs e.g: 37 | # :ref:`comparison manual ` 38 | intersphinx_mapping = { 39 | "python": ("https://docs.python.org/3/", None), 40 | "numpy": ("https://numpy.org/doc/stable/", None), 41 | "zarr": ("https://zarr.readthedocs.io/en/stable/", None), 42 | "xarray": ("https://docs.xarray.dev/en/stable/", None), 43 | "obstore": ("https://developmentseed.org/obstore/latest/", None), 44 | } 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ["_templates"] 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # The language for content autogenerated by Sphinx. Refer to documentation 53 | # for a list of supported languages. 54 | # 55 | # This is also used if you do content translation via gettext catalogs. 56 | # Usually you set "language" from the command line for these cases. 57 | language = "en" 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This patterns also effect to html_static_path and html_extra_path 62 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 63 | 64 | # The name of the Pygments (syntax highlighting) style to use. 65 | pygments_style = "sphinx" 66 | 67 | # If true, `todo` and `todoList` produce output, else they produce nothing. 
68 | todo_include_todos = False 69 | 70 | # -- Myst Options ------------------------------------------------- 71 | 72 | myst_heading_anchors = 3 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 76 | 77 | html_theme = "pydata_sphinx_theme" 78 | html_theme_options = { 79 | "use_edit_page_button": True, 80 | "icon_links": [ 81 | { 82 | "name": "GitHub", 83 | "url": "https://github.com/zarr-developers/VirtualiZarr", 84 | "icon": "fa-brands fa-github", 85 | "type": "fontawesome", 86 | }, 87 | ] 88 | } 89 | html_title = "VirtualiZarr" 90 | html_context = { 91 | "github_user": "zarr-developers", 92 | "github_repo": "VirtualiZarr", 93 | "github_version": "main", 94 | "doc_path": "docs", 95 | } 96 | 97 | # remove sidebar, see GH issue #82 98 | html_css_files = [ 99 | 'custom.css', 100 | ] 101 | 102 | # html_logo = "_static/_future_logo.png" 103 | 104 | html_static_path = ["_static"] 105 | 106 | 107 | # issues 108 | # dark mode/lm switch 109 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome and encouraged! We ask only that all contributors follow the [Zarr Developers Code of Conduct](https://github.com/zarr-developers/.github/blob/main/CODE_OF_CONDUCT.md). 4 | 5 | ## Contributing code 6 | 7 | Before opening a PR to contribute code you should check that your changes work by running the test suite locally. 8 | 9 | ```{important} 10 | :name: dependencies 11 | We use [pixi](https://pixi.sh/latest/) to manage dependencies, which you'll want to install to get started. 12 | ``` 13 | 14 | Run tests with the `pixi run --environment test run-tests` command. Some tests require downloading files over the network. 15 | Use the `run-tests-no-network` task if you want to run tests faster or have no internet access: 16 | 17 | ```bash 18 | # Run all tests 19 | pixi run --environment test run-tests 20 | # Skip tests that require a network connection 21 | pixi run --environment test run-tests-no-network 22 | ``` 23 | 24 | You can also run tests in other environments: 25 | 26 | ```bash 27 | pixi run --environment min-deps run-tests # Test with the minimal set of dependencies installed 28 | pixi run --environment upstream run-tests # Test with unreleased versions of upstream libraries 29 | # List which versions are installed in the `min-deps` environment 30 | pixi list --environment min-deps 31 | ``` 32 | 33 | Further, the `pytest-cov` plugin is a test dependency, so you can generate a test 34 | coverage report locally, if you wish (CI will automatically do so). 
Here are some 35 | examples: 36 | 37 | ```bash 38 | pixi run --environment test run-tests-cov # Terminal report showing missing coverage 39 | pixi run --environment test run-tests-html-cov # HTML report written to htmlcov/index.html 40 | ``` 41 | 42 | Rather than using pixi tasks (essentially aliases for running commands in a given shell), you can explicitly start 43 | a shell within a given environment and execute `pytest` (or other commands) directly: 44 | 45 | ```bash 46 | # Start a shell within the environment 47 | pixi shell --environment test 48 | # Run the tests 49 | pytest virtualizarr 50 | # Exit the shell 51 | exit 52 | ``` 53 | 54 | If you run into issues with the development environment, here are some recommended steps: 55 | - Update pixi using `pixi self-update` and then retry the development workflow. 56 | - Clean up environments using `pixi clean` and then retry the development workflow. 57 | - Manually find and clean the cache dir listed in `pixi info` and then retry the development workflow. 58 | - Ask for help in the [VirtualiZarr channel of the Earthmover community slack](https://earthmover-community.slack.com/archives/C08EXCE8ZQX). 59 | 60 | ### Code standards 61 | 62 | #### Pre-commit 63 | 64 | All code must conform to the PEP8 standard. `VirtualiZarr` uses a set of `pre-commit` hooks and the `pre-commit` bot to format, type-check, and prettify the codebase. `pre-commit` can be installed locally by running: 65 | 66 | ``` 67 | python -m pip install pre-commit 68 | ``` 69 | The hooks can be installed locally by running: 70 | 71 | ``` 72 | pre-commit install 73 | ``` 74 | 75 | This runs the checks every time a commit is created locally. These checks will also run on every commit pushed to an open PR, resulting in some automatic styling fixes by the `pre-commit` bot. By default the checks only run on the files modified by a commit, but they can be triggered for all files by running: 76 | 77 | ``` 78 | pre-commit run --all-files 79 | ``` 80 | 81 | If you would like to skip the failing checks and push the code for further discussion, use the `--no-verify` option with `git commit`. 82 | 83 | #### Private functions 84 | 85 | `VirtualiZarr` uses the following convention for private functions: 86 | 87 | - Functions are preceded with an `_` (single underscore) if they should only be used within that module and may change at any time. 88 | - Functions without a preceding `_` (single underscore) are treated as relatively stable by the rest of the codebase, but not for public use (i.e. they are stable developer API). 89 | - Public functions are documented in the fully public API and should follow the backwards-compatibility expectations of Effective Effort Versioning. 90 | 91 | ## Contributing documentation 92 | 93 | Whilst the CI will build the updated documentation for each PR, it can also be useful to check that the documentation has rendered as expected by building it locally. 94 | 95 | ### Build the documentation locally 96 | 97 | ```bash 98 | pixi install --environment docs 99 | pixi run build-docs 100 | ``` 101 | Pixi can also be used to serve a continuously updating version of the documentation during development at [http://0.0.0.0:8000/](http://0.0.0.0:8000/).
102 | This can be done by navigating to [http://0.0.0.0:8000/](http://0.0.0.0:8000/) in your browser after running: 103 | 104 | ```bash 105 | pixi run serve-docs 106 | ``` 107 | 108 | ### Access the documentation locally 109 | 110 | Open `docs/_build/html/index.html` in a web browser (on MacOS you can do this from the terminal using `open docs/_build/html/index.html`). 111 | 112 | ## Making a release 113 | 114 | Anyone with commit privileges to the repository can issue a release, and you should feel free to issue a release at any point in time when all the CI tests on `main` are passing. 115 | 116 | 1. Decide on the release version number for the new release, following the [EffVer](https://jacobtomlinson.dev/effver/) versioning scheme (e.g., releasing v0.2.0 as the next release after v0.1.0 denotes that “some small effort may be required to make sure this version works for you”). 117 | 2. Write a high-level summary of the changes in this release, and write it into the release notes in `docs/releases.rst`. Create and merge a PR which adds the summary and also changes the release notes to say today's date and the version number of the new release. Don't add the blank template for future releases yet. 118 | 3. Navigate to the [https://github.com/zarr-developers/virtualizarr/releases](https://github.com/zarr-developers/virtualizarr/releases) releases page. 119 | 4. Select 'Draft a new release'. 120 | 5. Select 'Choose a tag', then 'Create a new tag' 121 | 6. Enter the name for the new tag (i.e. the release version number). 122 | 7. Click 'Generate Release Notes' to draft notes based on merged pull requests, and paste the same release summary you wrote earlier at the top. 123 | 8. Edit the draft release notes for consistency. 124 | 9. Select 'Publish' to publish the release. This should automatically upload the new release to [PyPI](https://pypi.org/project/virtualizarr/) and [conda-forge](https://anaconda.org/conda-forge/virtualizarr). 125 | 10. Check that this has run successfully (PyPI should show the new version number very quickly, but conda-forge might take several hours). 126 | 11. Create and merge a PR to add a new empty section to the `docs/releases.rst` for the next release in the future. See [this commit](https://github.com/zarr-developers/VirtualiZarr/commit/e3912f08e22f2e3230af6eb1a2aacb5728822fa1) for an example (you can assume the next release will be numbered `vX.Y.Z+1`, but the number doesn't actually matter). 127 | 12. (Optional) Advertise the release on social media 📣 128 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | The following examples demonstrate the use of VirtualiZarr to create virtual datasets of various kinds: 4 | 5 | 1. [Appending new daily NOAA SST data to Icechunk](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/append/noaa-cdr-sst.ipynb) 6 | 2. [Parallel reference generation using Coiled Functions](https://github.com/zarr-developers/VirtualiZarr/blob/main/examples/coiled/terraclimate.ipynb) 7 | 3. [Serverless parallel reference generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/virtualizarr-with-lithops) 8 | 4. 
[MUR SST Virtual and Zarr Icechunk Store Generation using Lithops](https://github.com/zarr-developers/VirtualiZarr/tree/main/examples/mursst-icechunk-with-lithops) 9 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # VirtualiZarr 2 | 3 | **Create virtual Zarr stores for cloud-friendly access to archival data, using familiar xarray syntax.** 4 | 5 | The best way to distribute large scientific datasets is via the Cloud, in [Cloud-Optimized formats](https://guide.cloudnativegeo.org/) [^1]. But often this data is stuck in archival pre-Cloud file formats such as netCDF. 6 | 7 | **VirtualiZarr[^2] makes it easy to create "Virtual" Zarr stores, allowing performant access to archival data as if it were in the Cloud-Optimized [Zarr format](https://zarr.dev/), _without duplicating any data_.** 8 | 9 | ## Motivation 10 | 11 | "Virtualized data" solves an incredibly important problem: accessing big archival datasets via a cloud-optimized pattern, but without copying or modifying the original data in any way. This is a win-win-win for users, data engineers, and data providers. Users see fast-opening zarr-compliant stores that work performantly with libraries like xarray and dask, data engineers can provide this speed by adding a lightweight virtualization layer on top of existing data (without having to ask anyone's permission), and data providers don't have to change anything about their archival files for them to be used in a cloud-optimized way. 12 | 13 | VirtualiZarr aims to make the creation of cloud-optimized virtualized zarr data from existing scientific data as easy as possible. 14 | 15 | ## Features 16 | 17 | * Create virtual references pointing to bytes inside an archival file with [`open_virtual_dataset`](https://virtualizarr.readthedocs.io/en/latest/usage.html#opening-files-as-virtual-datasets), 18 | * Supports a [range of archival file formats](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare), including netCDF4 and HDF5, 19 | * [Combine data from multiple files](https://virtualizarr.readthedocs.io/en/latest/usage.html#combining-virtual-datasets) into one larger store using [xarray's combining functions](https://docs.xarray.dev/en/stable/user-guide/combining.html), such as [`xarray.concat`](https://docs.xarray.dev/en/stable/generated/xarray.concat.html), 20 | * Commit the virtual references to storage either using the [Kerchunk references](https://fsspec.github.io/kerchunk/spec.html) specification or the [Icechunk](https://icechunk.io/) transactional storage engine. 21 | * Users access the virtual dataset using [`xarray.open_dataset`](https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html#xarray.open_dataset). 22 | 23 | ## Inspired by Kerchunk 24 | 25 | VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [Kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk but in a zarr-native way, and with a familiar array-like API. 26 | 27 | You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
28 | 29 | ## Usage 30 | 31 | Creating the virtual store looks very similar to how we normally open data with xarray: 32 | 33 | ```python 34 | from virtualizarr import open_virtual_dataset 35 | 36 | virtual_datasets = [ 37 | open_virtual_dataset(filepath) 38 | for filepath in glob.glob('/my/files*.nc') 39 | ] 40 | 41 | # this Dataset wraps a bunch of virtual ManifestArray objects directly 42 | virtual_ds = xr.combine_nested(virtual_datasets, concat_dim=['time']) 43 | 44 | # cache the combined dataset pattern to disk, in this case using the existing kerchunk specification for reference files 45 | virtual_ds.virtualize.to_kerchunk('combined.json', format='json') 46 | ``` 47 | 48 | Now you can open your shiny new Zarr store instantly: 49 | 50 | ```python 51 | ds = xr.open_dataset('combined.json', engine='kerchunk', chunks={}) # normal xarray.Dataset object, wrapping dask/numpy arrays etc. 52 | ``` 53 | 54 | No data has been loaded or copied in this process, we have merely created an on-disk lookup table that points xarray into the specific parts of the original netCDF files when it needs to read each chunk. 55 | 56 | See the [Usage docs page](#usage) for more details. 57 | 58 | ## Talks and Presentations 59 | 60 | - 2024/11/21 - MET Office Architecture Guild - Tom Nicholas - [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-talk-at-met-office) 61 | - 2024/11/13 - Cloud-Native Geospatial conference - Raphael Hagen - [Slides](https://decks.carbonplan.org/cloud-native-geo/11-13-24) 62 | - 2024/07/24 - ESIP Meeting - Sean Harkins - [Event](https://2024julyesipmeeting.sched.com/event/1eVP6) / [Recording](https://youtu.be/T6QAwJIwI3Q?t=3689) 63 | - 2024/05/15 - Pangeo showcase - Tom Nicholas - [Event](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127/2) / [Recording](https://youtu.be/ioxgzhDaYiE) / [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-create-virtual-zarr-stores-using-xarray-syntax) 64 | 65 | ## Credits 66 | 67 | This package was originally developed by [Tom Nicholas](https://github.com/TomNicholas) whilst working at [[C]Worthy](https://cworthy.org), who deserve credit for allowing him to prioritise a generalizable open-source solution to the dataset virtualization problem. VirtualiZarr is now a community-owned multi-stakeholder project. 68 | 69 | ## Licence 70 | 71 | Apache 2.0 72 | 73 | ## Pages 74 | 75 | ```{toctree} 76 | :maxdepth: 2 77 | 78 | self 79 | installation 80 | usage 81 | examples 82 | faq 83 | api 84 | data_structures 85 | custom_readers 86 | releases 87 | contributing 88 | core_team_guide 89 | ``` 90 | 91 | ## References 92 | 93 | [^1]: [_Cloud-Native Repositories for Big Scientific Data_, Abernathey et. al., _Computing in Science & Engineering_.](https://ieeexplore.ieee.org/abstract/document/9354557) 94 | 95 | [^2]: (Pronounced like "virtualizer" but more piratey 🦜) 96 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | VirtualiZarr is available on PyPI via pip: 4 | 5 | ```shell 6 | pip install virtualizarr 7 | ``` 8 | 9 | and on conda-forge: 10 | 11 | ```shell 12 | conda install -c conda-forge virtualizarr 13 | ``` 14 | 15 | ## Optional dependencies 16 | 17 | VirtualiZarr has many optional dependencies, split into those for reading various file formats, and those for writing virtual references out to different formats. 
18 | 19 | Optional dependencies can be installed in groups via pip. For example to read HDF files and write virtual references to icechunk you could install all necessary dependencies via: 20 | 21 | ```shell 22 | pip install "virtualizarr[hdf, icechunk]" 23 | ``` 24 | 25 | The full list of optional dependencies can be seen in the `pyproject.toml` file: 26 | 27 | ```{literalinclude} ../pyproject.toml 28 | :start-at: "[project.optional-dependencies]" 29 | :end-before: "# Dependency sets under dependencies-groups are NOT available via PyPI" 30 | ``` 31 | 32 | The compound groups allow you to install multiple sets of dependencies at once, e.g. install every file reader via 33 | 34 | ```shell 35 | pip install "virtualizarr[all_readers]" 36 | ``` 37 | 38 | The basic `pip install virtualizarr` will only install the minimal required dependencies, and so may not be particularly useful on its own. 39 | 40 | ## Install Test Dependencies 41 | 42 | For local development you will want to install the test dependencies so that you can run all the tests in the test suite: 43 | 44 | ```shell 45 | pip install '-e .[test]' 46 | ``` 47 | 48 | ## Install Docs Dependencies 49 | 50 | To build the documentation locally you will need further dependencies: 51 | 52 | ```shell 53 | pip install '-e .[docs]' 54 | ``` 55 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/coiled/terraclimate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Virtualizarr and Coiled - Building a virtual dataset of Terraclimate\n", 8 | "\n", 9 | "This notebook is an example of using Virtualizarr together with the Python distributed processing framework [Coiled](https://www.coiled.io/) to generate references using [serverless functions](https://docs.coiled.io/user_guide/functions.html). \n", 10 | "- **Note:** running this notebook requires a coiled account.\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## The dataset\n", 18 | "For this example, we are going to create a virtual zarr store from the [Terraclimate](https://www.climatologylab.org/terraclimate.html) dataset. Terraclimate is a monthly dataset spanning 66 years and containing 14 climate and water balance variables. It is made up of 924 individual NetCDF4 files. 
When represented as an Xarray dataset, it is over 1TB in size." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Parallelizing `virtualizarr` reference generation with coiled serverless functions\n", 26 | "Coiled serverless functions allow us to easily spin up hundreds of small compute instances, which are great for individual file reference generation. We were able to process 924 netCDF files into a 1TB virtual xarray dataset in 9 minutes for ~$0.24." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Installation and environment\n", 34 | "\n", 35 | "You should install the Python requirements in a clean virtual environment of your choice. Each coiled serverless function will reuse this environment, so it's best to start with a clean slate.\n", 36 | "\n", 37 | "```bash\n", 38 | "pip install 'virtualizarr['icechunk','hdf']' coiled ipykernel bokeh\n", 39 | "```" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Imports\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "import coiled\n", 56 | "import icechunk\n", 57 | "import numpy as np\n", 58 | "import xarray as xr\n", 59 | "\n", 60 | "from virtualizarr import open_virtual_dataset" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Create the Terraclimate variable and year url combinations \n", 68 | "`14 variables * 66 years = 924 NetCDF files`\n", 69 | "\n", 70 | "\n", 71 | "\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "tvars = [\n", 81 | " \"aet\",\n", 82 | " \"def\",\n", 83 | " \"pet\",\n", 84 | " \"ppt\",\n", 85 | " \"q\",\n", 86 | " \"soil\",\n", 87 | " \"srad\",\n", 88 | " \"swe\",\n", 89 | " \"tmax\",\n", 90 | " \"tmin\",\n", 91 | " \"vap\",\n", 92 | " \"ws\",\n", 93 | " \"vpd\",\n", 94 | " \"PDSI\",\n", 95 | "]\n", 96 | "min_year = 1958\n", 97 | "max_year = 2023\n", 98 | "time_list = np.arange(min_year, max_year + 1, 1)\n", 99 | "\n", 100 | "combinations = [\n", 101 | " f\"https://climate.northwestknowledge.net/TERRACLIMATE-DATA/TerraClimate_{var}_{year}.nc\"\n", 102 | " for year in time_list\n", 103 | " for var in tvars\n", 104 | "]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Define the coiled serverless function\n", 112 | "\n", 113 | "### Serverless function setup notes:\n", 114 | "- This coiled function is tailored to AWS\n", 115 | "- `vm_type=[\"t4g.small\"]` - This is a small instance, you shouldn't need large machines for reference generation\n", 116 | "- `spot_policy=\"spot_with_fallback\"` is cheaper, but might have unintended consequences\n", 117 | "- `arm=True` uses VMs with ARM architecture, which is cheaper\n", 118 | "- `idle_timeout=\"10 minutes\"` workers will shut down after 10 minutes of inactivity \n", 119 | "- `n_workers=[100, 300]` adaptive scaling between 100 & 300 workers\n", 120 | "- `name` [optional] if you want to keep track of your cluster in the coiled dashboard\n", 121 | "\n", 122 | "More details can be found in the [serverless function API](https://docs.coiled.io/user_guide/functions.html#api)." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "@coiled.function(\n", 132 | " region=\"us-west-2\",\n", 133 | " vm_type=[\"t4g.small\"],\n", 134 | " spot_policy=\"spot_with_fallback\",\n", 135 | " arm=True,\n", 136 | " idle_timeout=\"10 minutes\",\n", 137 | " n_workers=[10, 100],\n", 138 | " name=\"parallel_reference_generation\",\n", 139 | ")\n", 140 | "def process(filename):\n", 141 | " vds = open_virtual_dataset(\n", 142 | " filename,\n", 143 | " decode_times=True,\n", 144 | " loadable_variables=[\"time\", \"lat\", \"lon\", \"crs\"],\n", 145 | " )\n", 146 | " return vds\n", 147 | "\n", 148 | "\n", 149 | "# process.map distributes out the input file urls to coiled functions\n", 150 | "# retires=10 allows for individual task retires, which can be useful for inconsistent server behavior\n", 151 | "results = process.map(combinations[0:2], retries=10)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "\n", 159 | "## Combine references into virtual dataset" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# extract generator values into a list\n", 169 | "vds_list = [result for result in results]\n", 170 | "\n", 171 | "# combine individual refs into a virtual Xarray dataset\n", 172 | "mds = xr.combine_by_coords(\n", 173 | " vds_list, coords=\"minimal\", compat=\"override\", combine_attrs=\"drop\"\n", 174 | ")\n", 175 | "\n", 176 | "mds" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "print(str(\"{0:.2f}\".format(mds.nbytes / 1e12)), \" TB\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Save the virtual dataset to Icechunk\n", 193 | "\n", 194 | "Now that we have this virtual dataset, we can write it to Icechunk. \n", 195 | "\n", 196 | "In this example we're creating a local icechunk store, but you could configure it for cloud storage." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "local_storage_conifg = icechunk.local_filesystem_storage(\"./terraclimate\")\n", 206 | "repo = icechunk.Repository.open_or_create(local_storage_conifg)\n", 207 | "session = repo.writable_session(\"main\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "mds.virtualize.to_icechunk(store=session.store)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Open the Icechunk store with Xarray\n", 224 | "\n", 225 | "**Warning:** Calling `to_zarr` on this dataset will try to write out 1TB of data.\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "combined_ds = xr.open_zarr(session.store, consolidated=False, zarr_format=3)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "combined_ds" 244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.12.8" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 2 268 | } 269 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use AWS Lambda base image for Python 3.11 2 | FROM public.ecr.aws/lambda/python:3.11 3 | 4 | ARG FUNCTION_DIR 5 | 6 | # Set working directory 7 | WORKDIR /var/task 8 | 9 | # Update system libraries and install necessary utilities 10 | RUN yum update -y && \ 11 | yum install -y wget unzip tar gzip git && \ 12 | yum clean all 13 | 14 | # Install uv package manager and move it to /usr/local/bin 15 | RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ 16 | mv ~/.local/bin/uv /usr/local/bin/uv && \ 17 | chmod +x /usr/local/bin/uv 18 | 19 | # Verify uv installation 20 | RUN uv --version 21 | 22 | RUN uv pip install --upgrade pip wheel six setuptools --system \ 23 | && uv pip install --upgrade --no-cache-dir --system \ 24 | awslambdaric \ 25 | boto3 \ 26 | redis \ 27 | httplib2 \ 28 | requests \ 29 | numpy \ 30 | scipy \ 31 | pandas \ 32 | pika \ 33 | kafka-python \ 34 | cloudpickle \ 35 | ps-mem \ 36 | tblib \ 37 | psutil 38 | 39 | # Set environment variables for Lambda 40 | ENV PYTHONPATH="/var/lang/lib/python3.11/site-packages:${FUNCTION_DIR}" 41 | 42 | # Copy and install dependencies from requirements.txt using uv 43 | COPY requirements.txt /tmp/requirements.txt 44 | RUN uv pip install --no-cache-dir -r /tmp/requirements.txt --system 45 | 46 | # Copy application code 47 | COPY lithops_lambda.zip ${FUNCTION_DIR} 48 | RUN unzip lithops_lambda.zip \ 49 | && rm lithops_lambda.zip \ 50 | && mkdir handler \ 51 | && touch handler/__init__.py \ 52 | && mv entry_point.py handler/ 53 | 54 | # Set Lambda entry point 55 | CMD [ "handler.entry_point.lambda_handler" ] 56 | 
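# Note: this image is what the example registers as its Lithops Lambda runtime, e.g. via
# `lithops runtime build -b aws_lambda -f Dockerfile vz-runtime` (see README.md in this directory).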
-------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/README.md: -------------------------------------------------------------------------------- 1 | # Lithops Package for MUR SST Data Processing 2 | 3 | This package provides functionality for processing MUR SST (Multi-scale Ultra-high Resolution Sea Surface Temperature) data using [Lithops](https://lithops-cloud.github.io/), a framework for serverless computing. 4 | 5 | ## Environment + Lithops Setup 6 | 7 | 1. Set up a Python environment. The example below uses [`uv`](https://docs.astral.sh/uv/), but other environment managers should work as well: 8 | 9 | ```sh 10 | uv venv virtualizarr-lithops --python 3.11 11 | source virtualizarr-lithops/bin/activate 12 | uv pip install -r requirements.txt 13 | ``` 14 | 15 | 2. Follow the [AWS Lambda Configuration](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html#configuration) instructions, unless you already have an appropriate AWS IAM role to use. 16 | 17 | 3. Follow the [AWS Credential setup](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html#aws-credential-setup) instructions. 18 | 19 | 4. Check, and modify as necessary, the compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html) in `lithops.yaml`. 20 | 21 | 22 | 5. Build the Lithops Lambda runtime if it does not exist in your target AWS environment. 23 | ```bash 24 | export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml 25 | lithops runtime build -b aws_lambda -f Dockerfile vz-runtime 26 | ``` 27 | 28 | For various reasons, you may want to build the Lambda runtime on EC2 (for example, Docker can be a resource hog locally, and pushing to ECR is faster from within AWS). If you wish to use EC2, please see the scripts in `ec2_for_lithops_runtime/` in this directory. 29 | 30 | > [!IMPORTANT] 31 | > If the runtime was created with a different IAM identity, an appropriate `user_id` will need to be included in the lithops configuration under `aws_lambda`. 32 | 33 | > [!TIP] 34 | > You can configure the AWS Lambda architecture via the `architecture` key under `aws_lambda` in the lithops configuration file. 35 | 36 | 37 | 6. 
(Optional) To rebuild the Lithops Lambda runtime image, delete the existing one: 38 | 39 | ```bash 40 | lithops runtime delete -b aws_lambda -d virtualizarr-runtime 41 | ``` 42 | 43 | ## Package Structure 44 | 45 | The package is organized into the following modules: 46 | 47 | - `__init__.py`: Package initialization and exports 48 | - `config.py`: Configuration settings and constants 49 | - `models.py`: Data models and structures 50 | - `url_utils.py`: URL generation and file listing 51 | - `repo.py`: Icechunk repository management 52 | - `virtual_datasets.py`: Virtual dataset operations 53 | - `zarr_operations.py`: Zarr array operations 54 | - `helpers.py`: Data helpers 55 | - `lithops_functions.py`: Lithops execution wrappers 56 | - `cli.py`: Command-line interface 57 | 58 | ## Usage 59 | 60 | ### Command-line Interface 61 | 62 | The package provides a command-line interface for running various functions: 63 | 64 | ```bash 65 | python main.py [options] 66 | ``` 67 | 68 | Available functions: 69 | 70 | - `write_to_icechunk`: Write data to Icechunk 71 | - `check_data_store_access`: Check access to the data store 72 | - `calc_icechunk_store_mean`: Calculate the mean of the Icechunk store 73 | - `calc_original_files_mean`: Calculate the mean of the original files 74 | - `list_installed_packages`: List installed packages 75 | 76 | Options: 77 | 78 | - `--start_date`: Start date for data processing (YYYY-MM-DD) 79 | - `--end_date`: End date for data processing (YYYY-MM-DD) 80 | - `--append_dim`: Append dimension for writing to Icechunk 81 | 82 | ### Examples 83 | 84 | #### Writing Data to Icechunk 85 | 86 | ```bash 87 | python main.py write_to_icechunk --start_date 2022-01-01 --end_date 2022-01-02 88 | ``` 89 | 90 | #### Calculating the Mean of the Icechunk Store 91 | 92 | ```bash 93 | python main.py calc_icechunk_store_mean --start_date 2022-01-01 --end_date 2022-01-31 94 | ``` 95 | 96 | #### Checking Data Store Access 97 | 98 | ```bash 99 | python main.py check_data_store_access 100 | ``` 101 | 102 | ## Programmatic Usage 103 | 104 | You can also use the package programmatically: 105 | 106 | ```python 107 | from lithops_functions import write_to_icechunk 108 | 109 | # Write data to Icechunk 110 | write_to_icechunk(start_date="2022-01-01", end_date="2022-01-31") 111 | ``` 112 | 113 | ## Testing 114 | 115 | To test the package, you can use the provided test functions: 116 | 117 | ```bash 118 | python main.py check_data_store_access 119 | ``` 120 | 121 | This will verify that the package can access the data store. 122 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lithops package for MUR SST data processing import 3 | 4 | This package provides functionality for processing MUR SST data using Lithops, 5 | a framework for serverless computing import 6 | """ 7 | 8 | from . import ( 9 | config, 10 | data_processing, 11 | lithops_functions, 12 | models, 13 | repo, 14 | url_utils, 15 | virtual_datasets, 16 | ) 17 | 18 | __all__ = [ 19 | "config", 20 | "data_processing", 21 | "lithops_functions", 22 | "models", 23 | "repo", 24 | "url_utils", 25 | "virtual_datasets", 26 | ] 27 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface. 
3 | 4 | This module provides a command-line interface for the package. 5 | """ 6 | 7 | import argparse 8 | 9 | from lithops_functions import ( 10 | lithops_calc_icechunk_store_mean, 11 | lithops_calc_original_files_mean, 12 | lithops_check_data_store_access, 13 | lithops_list_installed_packages, 14 | write_to_icechunk, 15 | ) 16 | 17 | 18 | def parse_args(): 19 | """ 20 | Parse command-line arguments. 21 | 22 | Returns: 23 | The parsed arguments 24 | """ 25 | parser = argparse.ArgumentParser(description="Run lithops functions.") 26 | parser.add_argument( 27 | "function", 28 | choices=[ 29 | "write_to_icechunk", 30 | "check_data_store_access", 31 | "calc_icechunk_store_mean", 32 | "calc_original_files_mean", 33 | "list_installed_packages", 34 | ], 35 | help="The function to run.", 36 | ) 37 | parser.add_argument( 38 | "--start_date", 39 | type=str, 40 | help="Start date for data processing (YYYY-MM-DD).", 41 | ) 42 | parser.add_argument( 43 | "--end_date", 44 | type=str, 45 | help="End date for data processing (YYYY-MM-DD).", 46 | ) 47 | parser.add_argument( 48 | "--append_dim", 49 | type=str, 50 | help="Append dimension for writing to icechunk.", 51 | ) 52 | return parser.parse_args() 53 | 54 | 55 | def main(): 56 | """ 57 | Main entry point for the command-line interface. 58 | """ 59 | args = parse_args() 60 | start_date = args.start_date 61 | end_date = args.end_date 62 | append_dim = args.append_dim 63 | 64 | if args.function == "write_to_icechunk": 65 | write_to_icechunk( 66 | start_date=start_date, end_date=end_date, append_dim=append_dim 67 | ) 68 | elif args.function == "check_data_store_access": 69 | lithops_check_data_store_access() 70 | elif args.function == "calc_icechunk_store_mean": 71 | lithops_calc_icechunk_store_mean(start_date=start_date, end_date=end_date) 72 | elif args.function == "calc_original_files_mean": 73 | lithops_calc_original_files_mean(start_date=start_date, end_date=end_date) 74 | elif args.function == "list_installed_packages": 75 | lithops_list_installed_packages() 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration settings for MUR SST data processing. 3 | 4 | This module contains all the configuration settings and constants used 5 | throughout the package. 
6 | """ 7 | 8 | import fsspec 9 | 10 | # S3 filesystem for reading data 11 | fs_read = fsspec.filesystem("s3", anon=False, skip_instance_cache=True) 12 | 13 | # Data source configuration 14 | base_url = "s3://podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1" 15 | data_vars = ["analysed_sst", "analysis_error", "mask", "sea_ice_fraction"] 16 | drop_vars = ["dt_1km_data", "sst_anomaly"] 17 | 18 | # Storage configuration 19 | bucket = "nasa-eodc-scratch" 20 | store_name = "MUR-JPL-L4-GLOB-v4.1-virtual-v1" 21 | directory = "test" 22 | 23 | # Spatial subset configuration 24 | lat_slice = slice(48.5, 48.7) 25 | lon_slice = slice(-124.7, -124.5) 26 | 27 | # Date range processing dictionary 28 | date_process_dict = { 29 | ("2002-06-30", "2003-09-10"): "virtual_dataset", 30 | ("2003-09-11", "2003-09-11"): "zarr", 31 | ("2003-09-12", "2021-02-19"): "virtual_dataset", 32 | ("2021-02-20", "2021-02-21"): "zarr", 33 | ("2021-02-22", "2021-12-23"): "virtual_dataset", 34 | ("2021-12-24", "2022-01-26"): "zarr", 35 | ("2022-01-27", "2022-11-08"): "virtual_dataset", 36 | ("2022-11-09", "2022-11-09"): "zarr", 37 | ("2022-11-10", "2023-02-23"): "virtual_dataset", 38 | ("2023-02-24", "2023-02-28"): "zarr", 39 | ("2023-03-01", "2023-04-21"): "virtual_dataset", 40 | ("2023-04-22", "2023-04-22"): "zarr", 41 | ("2023-04-23", "2023-09-03"): "virtual_dataset", 42 | } 43 | 44 | zarr_concurrency = 4 45 | 46 | mursst_var_chunks = { 47 | "analysed_sst": {"time": 1, "lat": 1023, "lon": 2047}, 48 | "analysis_error": {"time": 1, "lat": 1023, "lon": 2047}, 49 | "mask": {"time": 1, "lat": 1447, "lon": 2895}, 50 | "sea_ice_fraction": {"time": 1, "lat": 1447, "lon": 2895}, 51 | } 52 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/00-create-security-group.sh: -------------------------------------------------------------------------------- 1 | export SECURITY_GROUP_NAME=XXX 2 | export VPC_ID=XXX 3 | aws ec2 create-security-group --group-name $SG_GROUP_NAME --description "security group for ithops runtime builder ec2" --vpc-id $VPC_ID 4 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/01-launch-ec2.sh: -------------------------------------------------------------------------------- 1 | # look up the group id created 2 | export SECURITY_GROUP_ID=XXX 3 | export YOUR_IP=$(curl -s https://checkip.amazonaws.com) 4 | export AMI_ID=ami-027951e78de46a00e 5 | export SSH_KEY_NAME=XXX 6 | aws ec2 authorize-security-group-ingress --group-id $SECURITY_GROUP_ID --ip-permissions '{"IpProtocol":"tcp","FromPort":22,"ToPort":22,"IpRanges":[{"CidrIp":"'$YOUR_IP'/32"}]}' 7 | aws ec2 run-instances --image-id $AMI_ID \ 8 | --instance-type "t3.medium" --key-name $SSH_KEY_NAME \ 9 | --block-device-mappings '{"DeviceName":"/dev/xvda","Ebs":{"Encrypted":false,"DeleteOnTermination":true,"Iops":3000,"SnapshotId":"snap-01783d80c688baa0f","VolumeSize":30,"VolumeType":"gp3","Throughput":125}}' \ 10 | --network-interfaces '{"AssociatePublicIpAddress":true,"DeviceIndex":0,"Groups":["'$SECURITY_GROUP_ID'"]}' \ 11 | --credit-specification '{"CpuCredits":"unlimited"}' \ 12 | --metadata-options '{"HttpEndpoint":"enabled","HttpPutResponseHopLimit":2,"HttpTokens":"required"}' \ 13 | --private-dns-name-options '{"HostnameType":"ip-name","EnableResourceNameDnsARecord":true,"EnableResourceNameDnsAAAARecord":false}' \ 14 | --count "1" 15 | 
-------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/02-setup-ec2-role.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set variables 4 | ROLE_NAME="EC2_Lithops_Lambda_Builder" 5 | INSTANCE_ID=XXX # Replace with your EC2 instance ID 6 | POLICY_NAME="EC2LithopsLambdaPolicy" 7 | REGION=XXX 8 | 9 | # Step 1: Create the IAM role 10 | aws iam create-role --role-name $ROLE_NAME \ 11 | --assume-role-policy-document '{ 12 | "Version": "2012-10-17", 13 | "Statement": [ 14 | { 15 | "Effect": "Allow", 16 | "Principal": { "Service": "ec2.amazonaws.com" }, 17 | "Action": "sts:AssumeRole" 18 | } 19 | ] 20 | }' > /dev/null 21 | 22 | echo "✅ IAM Role '$ROLE_NAME' created." 23 | 24 | # Step 2: Attach necessary policies 25 | aws iam put-role-policy --role-name $ROLE_NAME --policy-name $POLICY_NAME \ 26 | --policy-document '{ 27 | "Version": "2012-10-17", 28 | "Statement": [ 29 | { 30 | "Effect": "Allow", 31 | "Action": [ 32 | "ecr:GetAuthorizationToken", 33 | "ecr:BatchCheckLayerAvailability", 34 | "ecr:CompleteLayerUpload", 35 | "ecr:UploadLayerPart", 36 | "ecr:InitiateLayerUpload", 37 | "ecr:PutImage", 38 | "ecr:BatchGetImage", 39 | "lambda:CreateFunction", 40 | "lambda:UpdateFunctionCode", 41 | "s3:GetObject", 42 | "s3:ListBucket", 43 | "ecr:CreateRepository" 44 | ], 45 | "Resource": "*" 46 | } 47 | ] 48 | }' > /dev/null 49 | 50 | echo "✅ IAM policy attached to role '$ROLE_NAME'." 51 | 52 | # Step 3: Create an Instance Profile and associate with the role 53 | aws iam create-instance-profile --instance-profile-name $ROLE_NAME > /dev/null 54 | aws iam add-role-to-instance-profile --instance-profile-name $ROLE_NAME --role-name $ROLE_NAME 55 | 56 | echo "✅ Instance profile '$ROLE_NAME' created and role attached." 57 | 58 | # Step 4: Attach the IAM role to the running EC2 instance 59 | aws ec2 associate-iam-instance-profile --instance-id $INSTANCE_ID \ 60 | --iam-instance-profile Name=$ROLE_NAME > /dev/null 61 | 62 | echo "✅ IAM role '$ROLE_NAME' attached to instance '$INSTANCE_ID'." 63 | 64 | # Step 5: Confirm the role is attached 65 | echo "🔄 Waiting for role to be active..." 66 | sleep 10 67 | aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[*].Instances[*].IamInstanceProfile" --output json 68 | 69 | echo "✅ Done! The EC2 instance now has the necessary permissions." 70 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/03-setup-ec2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Exit on error 4 | set -e 5 | 6 | echo "Updating system packages..." 7 | sudo yum update -y 8 | 9 | echo "Installing Python 3 and pip..." 10 | sudo yum install -y python3 python3-pip 11 | 12 | echo "Installing Docker..." 13 | sudo yum install -y docker git 14 | 15 | echo "Starting Docker service..." 16 | sudo systemctl start docker 17 | sudo systemctl enable docker 18 | 19 | echo "Adding current user to Docker group..." 20 | sudo usermod -aG docker $USER 21 | 22 | echo "Installing uv package manager..." 23 | curl -LsSf https://astral.sh/uv/install.sh | sh 24 | 25 | echo "Verifying installations..." 26 | python3 --version 27 | pip3 --version 28 | docker --version 29 | uv --version 30 | 31 | echo "Setup complete! Please log out and log back in to apply Docker group changes." 
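# The remaining commands assume you have logged out and back in (or run `newgrp docker`),
# so that the Docker group membership added above is active before building the runtime.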
32 | 33 | # lithops environment setup 34 | git clone https://github.com/zarr-developers/Virtualizarr 35 | cd Virtualizarr/ 36 | cd examples/mursst-icechunk-with-lithops/ 37 | uv venv virtualizarr-lithops 38 | source virtualizarr-lithops/bin/activate 39 | uv pip install -r requirements.txt 40 | export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml 41 | lithops runtime build -b aws_lambda -f Dockerfile virtualizarr-runtime 42 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/ec2_for_lithops_runtime/README.md: -------------------------------------------------------------------------------- 1 | # Launch and use an EC2 instance for building the Lithops Lambda runtime 2 | 3 | The scripts in this directory will help you launch and set up an EC2 instance so that you can build and push a Lithops Lambda runtime. 4 | 5 | You will need AWS console and CLI access. 6 | 7 | Steps: 8 | 9 | 1. Access the AWS console to create an SSH key in AWS that you can associate with the EC2 instance when launching. 10 | 2. Add a `SECURITY_GROUP_NAME` of your choosing and an appropriate `VPC_ID` to `00-create-security-group.sh` and execute that script. 11 | 3. Add the `SECURITY_GROUP_ID` and other required variables to `01-launch-ec2.sh` and execute that script. 12 | 4. Add the `INSTANCE_ID` to `02-setup-ec2-role.sh` and execute that script. 13 | 5. SSH into the instance and execute the commands in `03-setup-ec2.sh`. 14 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers. 3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import xarray as xr 8 | from config import date_process_dict, lat_slice, lon_slice 9 | from repo import open_or_create_repo 10 | from zarr_operations import configure_zarr 11 | 12 | 13 | def xarray_open_icechunk(open_or_create_repo_func: callable = open_or_create_repo): 14 | """ 15 | Open an Icechunk repository as an xarray Dataset. 16 | 17 | Args: 18 | open_or_create_repo_func: Function to open or create a repository 19 | 20 | Returns: 21 | An xarray Dataset 22 | """ 23 | # Configure Zarr for optimal performance 24 | configure_zarr() 25 | repo = open_or_create_repo_func() 26 | session = repo.readonly_session("main") 27 | return xr.open_dataset( 28 | session.store, consolidated=False, zarr_format=3, engine="zarr" 29 | ) 30 | 31 | 32 | def open_and_read_data( 33 | file: str, lat_slice_arg: slice = lat_slice, lon_slice_arg: slice = lon_slice 34 | ): 35 | """ 36 | Open and read data from a file. 37 | 38 | Args: 39 | file: The file to open 40 | lat_slice_arg: The latitude slice 41 | lon_slice_arg: The longitude slice 42 | 43 | Returns: 44 | The data values 45 | """ 46 | from config import fs_read 47 | 48 | ds = xr.open_dataset(fs_read.open(file), chunks={}) 49 | return ds.analysed_sst.sel(lat=lat_slice_arg, lon=lon_slice_arg).values 50 | 51 | 52 | def get_mean(values: np.ndarray): 53 | """ 54 | Calculate the mean of an array. 
55 | 56 | Args: 57 | values: The array to calculate the mean of 58 | 59 | Returns: 60 | The mean value 61 | """ 62 | return np.nanmean(values) 63 | 64 | 65 | # Convert dictionary to a Pandas DataFrame with IntervalIndex 66 | interval_df = pd.DataFrame( 67 | [ 68 | { 69 | "interval": pd.Interval( 70 | pd.Timestamp(start), pd.Timestamp(end), closed="both" 71 | ), 72 | "label": label, 73 | } 74 | for (start, end), label in date_process_dict.items() 75 | ] 76 | ) 77 | 78 | 79 | def find_label_for_range(date_str1, date_str2, df=interval_df): 80 | """ 81 | Find the corresponding label for two dates. 82 | 83 | Args: 84 | date_str1: The first date in YYYY-MM-DD format 85 | date_str2: The second date in YYYY-MM-DD format 86 | df: The DataFrame with intervals and labels 87 | 88 | Returns: 89 | The label for the date range 90 | """ 91 | date1, date2 = pd.Timestamp(date_str1), pd.Timestamp(date_str2) 92 | 93 | # Find intervals where both dates are contained 94 | match = df[ 95 | df["interval"].apply(lambda interval: date1 in interval and date2 in interval) 96 | ] 97 | if match.empty: 98 | raise ValueError( 99 | f"No matching interval found for dates {date_str1} and {date_str2}" 100 | ) 101 | 102 | return match["label"].iloc[0] if not match.empty else None 103 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/lithops.yaml: -------------------------------------------------------------------------------- 1 | lithops: 2 | backend: aws_lambda 3 | storage: aws_s3 4 | data_limit: False # Max (iter)data size (in MB). Set to False for unlimited size. 5 | 6 | aws: 7 | region: us-west-2 8 | 9 | aws_lambda: 10 | execution_role: arn:aws:iam::CHANGE_ME:role/veda-data-reader-dev 11 | runtime: vz-runtime:latest 12 | runtime_memory: 10240 13 | # user_id: kuf3 # if the runtime was created with a different IAM identity, this user id will need to be included 14 | 15 | aws_s3: 16 | bucket: arn:aws:s3:::nasa-eodc-lithops 17 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Main entry point for the lithops package. 4 | 5 | This script provides a simple interface for running the package from the command line. 6 | """ 7 | 8 | from cli import main 9 | 10 | if __name__ == "__main__": 11 | main() 12 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data models for MUR SST data processing. 3 | 4 | This module contains data structures used throughout the package. 5 | """ 6 | 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class Task: 12 | """ 13 | Represents a data processing task. 14 | 15 | Attributes: 16 | var: The variable name to process 17 | dt: The datetime string 18 | time_idx: The time index in the array 19 | """ 20 | 21 | var: str 22 | dt: str 23 | time_idx: int 24 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/repo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Icechunk repository management. 3 | 4 | This module contains functions for creating and managing Icechunk repositories. 
5 | """ 6 | 7 | import boto3 8 | import icechunk 9 | from config import bucket, directory, store_name 10 | 11 | 12 | def open_or_create_repo(): 13 | """ 14 | Open or create an Icechunk repository. 15 | 16 | Returns: 17 | An Icechunk repository object 18 | """ 19 | # Config for repo storage 20 | session = boto3.Session() 21 | 22 | # Get the credentials from the session 23 | credentials = session.get_credentials() 24 | 25 | # Extract the actual key, secret, and token 26 | creds = credentials.get_frozen_credentials() 27 | storage_config = icechunk.s3_storage( 28 | bucket=bucket, 29 | prefix=f"{directory}/{store_name}", 30 | region="us-west-2", 31 | access_key_id=creds.access_key, 32 | secret_access_key=creds.secret_key, 33 | session_token=creds.token, 34 | ) 35 | 36 | # Config for repo 37 | repo_config = icechunk.RepositoryConfig.default() 38 | repo_config.set_virtual_chunk_container( 39 | icechunk.VirtualChunkContainer( 40 | "s3", "s3://", icechunk.s3_store(region="us-west-2") 41 | ) 42 | ) 43 | 44 | # Config for repo virtual chunk credentials 45 | virtual_chunk_creds = icechunk.containers_credentials( 46 | s3=icechunk.s3_credentials(anonymous=False) 47 | ) 48 | 49 | repo = icechunk.Repository.open_or_create( 50 | storage=storage_config, 51 | config=repo_config, 52 | virtual_chunk_credentials=virtual_chunk_creds, 53 | ) 54 | return repo 55 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/requirements.txt: -------------------------------------------------------------------------------- 1 | xarray>=2025.1.0 2 | h5netcdf 3 | h5py 4 | pandas 5 | s3fs 6 | boto3==1.35.99 7 | dask 8 | distributed 9 | lithops 10 | git+https://github.com/zarr-developers/virtualizarr.git@main[icechunk] 11 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/url_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL utilities for MUR SST data processing. 3 | 4 | This module contains functions for generating URLs and listing files. 5 | """ 6 | 7 | import datetime 8 | from typing import List 9 | 10 | import pandas as pd 11 | from config import base_url 12 | 13 | 14 | def make_url(date: datetime) -> str: 15 | """ 16 | Create an S3 URL for a specific datetime. 17 | 18 | Args: 19 | date: The datetime to create a URL for 20 | 21 | Returns: 22 | The S3 URL for the specified datetime 23 | """ 24 | date_string = date.strftime("%Y%m%d") + "090000" 25 | components = [ 26 | base_url, 27 | f"{date_string}-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc", 28 | ] 29 | return "/".join(components) 30 | 31 | 32 | def list_mur_sst_files(start_date: str, end_date: str, dmrpp: bool = True) -> List[str]: 33 | """ 34 | List all files in S3 with a certain date prefix. 
35 | 36 | Args: 37 | start_date: The start date in YYYY-MM-DD format 38 | end_date: The end date in YYYY-MM-DD format 39 | dmrpp: Whether to return DMR++ URLs (default: True) 40 | 41 | Returns: 42 | A list of S3 URLs for the specified date range 43 | """ 44 | dates = pd.date_range(start=start_date, end=end_date, freq="1D") 45 | netcdf_urls = [make_url(date) for date in dates] 46 | if not dmrpp: 47 | return netcdf_urls 48 | return [f + ".dmrpp" for f in netcdf_urls] 49 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/virtual_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Virtual dataset operations. 3 | 4 | This module contains functions for working with virtual datasets. 5 | """ 6 | 7 | import xarray as xr 8 | from config import drop_vars 9 | from repo import open_or_create_repo 10 | 11 | from virtualizarr import open_virtual_dataset 12 | 13 | 14 | def map_open_virtual_dataset(uri, open_args: dict = {}): 15 | """ 16 | Map function to open virtual datasets. 17 | 18 | Args: 19 | uri: The URI of the virtual dataset 20 | 21 | Returns: 22 | A virtual dataset 23 | """ 24 | vds = open_virtual_dataset( 25 | uri, 26 | indexes={}, 27 | **open_args, 28 | ) 29 | return vds.drop_vars(drop_vars, errors="ignore") 30 | 31 | 32 | def concat_virtual_datasets(results): 33 | """ 34 | Reduce to concat virtual datasets. 35 | 36 | Args: 37 | results: A list of virtual datasets 38 | 39 | Returns: 40 | A concatenated virtual dataset 41 | """ 42 | combined_vds = xr.concat( 43 | results, 44 | dim="time", 45 | coords="minimal", 46 | compat="override", 47 | combine_attrs="override", 48 | ) 49 | return combined_vds 50 | 51 | 52 | def write_virtual_results_to_icechunk( 53 | virtual_ds, start_date: str, end_date: str, append_dim: str = None 54 | ): 55 | """ 56 | Write virtual dataset results to IceChunk. 57 | 58 | Args: 59 | virtual_ds: The virtual dataset to write 60 | start_date: The start date in YYYY-MM-DD format 61 | end_date: The end date in YYYY-MM-DD format 62 | append_dim: The dimension to append to (optional) 63 | 64 | Returns: 65 | The commit ID 66 | """ 67 | repo = open_or_create_repo() 68 | session = repo.writable_session("main") 69 | 70 | # Check if store is already populated 71 | with session.allow_pickling(): 72 | if append_dim: 73 | # Only use append_dim if store already has data 74 | virtual_ds.virtualize.to_icechunk(session.store, append_dim=append_dim) 75 | else: 76 | # If we can't check or there's an error, assume store is empty 77 | virtual_ds.virtualize.to_icechunk(session.store) 78 | 79 | return session.commit(f"Commit data {start_date} to {end_date}") 80 | 81 | 82 | def concat_and_write_virtual_datasets( 83 | results, start_date: str, end_date: str, append_dim: str = None 84 | ): 85 | """ 86 | Reduce to concat virtual datasets and write to icechunk. 
87 | 88 | Args: 89 | results: A list of virtual datasets 90 | start_date: The start date in YYYY-MM-DD format 91 | end_date: The end date in YYYY-MM-DD format 92 | append_dim: The dimension to append to (optional) 93 | 94 | Returns: 95 | The commit ID 96 | """ 97 | combined_vds = concat_virtual_datasets(results) 98 | return write_virtual_results_to_icechunk( 99 | combined_vds, start_date, end_date, append_dim 100 | ) 101 | -------------------------------------------------------------------------------- /examples/mursst-icechunk-with-lithops/zarr_operations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Zarr operations. 3 | 4 | This module contains functions for working with Zarr arrays. 5 | """ 6 | 7 | from typing import cast 8 | 9 | import icechunk 10 | import numpy as np 11 | import pandas as pd 12 | import xarray as xr 13 | import zarr 14 | from config import mursst_var_chunks, zarr_concurrency 15 | from models import Task 16 | 17 | 18 | def resize_data_array(var_name: str, session: icechunk.Session, n_timesteps: int): 19 | """ 20 | Resize a data variable array. 21 | 22 | Args: 23 | var_name: The name of the variable to resize 24 | session: The IceChunk session 25 | n_timesteps: The number of timesteps to add 26 | 27 | Returns: 28 | The updated session 29 | """ 30 | group = zarr.group(store=session.store, overwrite=False) 31 | current_shape = group[var_name].shape 32 | group[var_name].resize((current_shape[0] + n_timesteps,) + current_shape[1:]) 33 | return session 34 | 35 | 36 | def handle_time_dimension(session: icechunk.Session, start_date: str, end_date: str): 37 | """ 38 | Handle time dimension and return datetime-index pairs. 39 | 40 | Args: 41 | session: The Icechunk session 42 | start_date: The start date in YYYY-MM-DD format 43 | end_date: The end date in YYYY-MM-DD format 44 | 45 | Returns: 46 | A tuple containing the updated session and a list of datetime-index pairs 47 | """ 48 | group = zarr.group(store=session.store, overwrite=False) 49 | dt_index = pd.date_range(start=start_date, end=end_date, freq="1D") 50 | n_timesteps = len(dt_index) 51 | current_time_length = group["time"].shape[0] 52 | 53 | # Resize time array 54 | group["time"].resize((current_time_length + n_timesteps,)) 55 | 56 | # Update time values 57 | reference_date = pd.Timestamp("1981-01-01 00:00:00") 58 | dt_index_seconds_since_1981 = (dt_index - reference_date).total_seconds() 59 | group["time"][-n_timesteps:] = np.int32(dt_index_seconds_since_1981) 60 | 61 | # Return list of (datetime, index) pairs 62 | return ( 63 | session, 64 | [(dt, current_time_length + idx) for idx, dt in enumerate(dt_index)], 65 | ) 66 | 67 | 68 | def write_data_to_zarr(task: Task, session: icechunk.Session, ds: xr.Dataset): 69 | """ 70 | Write data to Zarr array. 71 | 72 | Args: 73 | task: The task containing variable, datetime, and time index 74 | session: The Icechunk session 75 | ds: The xarray Dataset containing the data 76 | 77 | Returns: 78 | The updated session 79 | """ 80 | group = zarr.group(store=session.store, overwrite=False) 81 | var, dt, time_idx = task.var, task.dt, task.time_idx 82 | data_array = ds[var].sel(time=dt) 83 | current_array = cast(zarr.Array, group[var]) 84 | # where we actually write the data 85 | current_array[time_idx, :, :] = data_array.values 86 | return session 87 | 88 | 89 | def configure_zarr(): 90 | """ 91 | Configure Zarr settings for optimal performance. 
92 | """ 93 | zarr.config.set( 94 | { 95 | "async": {"concurrency": zarr_concurrency, "timeout": None}, 96 | "threading": {"max_workers": None}, 97 | } 98 | ) 99 | 100 | 101 | def map_open_files(file: str): 102 | """ 103 | Map function to open files. 104 | 105 | Args: 106 | file: The file to open 107 | 108 | Returns: 109 | An opened file object 110 | """ 111 | from config import fs_read 112 | 113 | return fs_read.open(file) 114 | 115 | 116 | def xarray_open_mfdataset(files: list[str]): 117 | """ 118 | Open multiple files as an xarray Dataset. 119 | 120 | Args: 121 | files: A list of file objects 122 | 123 | Returns: 124 | An xarray Dataset 125 | """ 126 | from config import drop_vars 127 | 128 | ds = xr.open_mfdataset( 129 | files, mask_and_scale=False, drop_variables=drop_vars, chunks={} 130 | ) 131 | for var, chunks in mursst_var_chunks.items(): 132 | ds[var] = ds[var].chunk(chunks) 133 | return ds 134 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/Dockerfile_virtualizarr: -------------------------------------------------------------------------------- 1 | # Python 3.11 2 | FROM python:3.11-slim-buster 3 | 4 | 5 | RUN apt-get update \ 6 | # Install aws-lambda-cpp build dependencies 7 | && apt-get install -y \ 8 | g++ \ 9 | make \ 10 | cmake \ 11 | unzip \ 12 | # cleanup package lists, they are not used anymore in this image 13 | && rm -rf /var/lib/apt/lists/* \ 14 | && apt-cache search linux-headers-generic 15 | 16 | ARG FUNCTION_DIR="/function" 17 | 18 | # Copy function code 19 | RUN mkdir -p ${FUNCTION_DIR} 20 | 21 | # Update pip 22 | # NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648 23 | # using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py 24 | # due to s3fs dependency 25 | RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ 26 | && pip install --upgrade --no-cache-dir --ignore-installed \ 27 | awslambdaric \ 28 | botocore==1.29.76 \ 29 | boto3==1.26.76 \ 30 | redis \ 31 | httplib2 \ 32 | requests \ 33 | numpy \ 34 | scipy \ 35 | pandas \ 36 | pika \ 37 | kafka-python \ 38 | cloudpickle \ 39 | ps-mem \ 40 | tblib 41 | 42 | # Set working directory to function root directory 43 | WORKDIR ${FUNCTION_DIR} 44 | 45 | # Add Lithops 46 | COPY lithops_lambda.zip ${FUNCTION_DIR} 47 | RUN unzip lithops_lambda.zip \ 48 | && rm lithops_lambda.zip \ 49 | && mkdir handler \ 50 | && touch handler/__init__.py \ 51 | && mv entry_point.py handler/ 52 | 53 | # Put your dependencies here, using RUN pip install... or RUN apt install... 54 | 55 | COPY requirements.txt requirements.txt 56 | RUN pip install --no-cache-dir -r requirements.txt 57 | 58 | ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] 59 | CMD [ "handler.entry_point.lambda_handler" ] 60 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/README.md: -------------------------------------------------------------------------------- 1 | # Generate a virtual zarr dataset using lithops 2 | 3 | This example walks through how to create a virtual dataset from a collection of 4 | netCDF files on s3 using lithops to open each file in parallel then concatenate 5 | them into a single virtual dataset. 6 | 7 | ## Credits 8 | Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook 9 | by norlandrhagen. 10 | 11 | Please, contribute improvements. 12 | 13 | 14 | 15 | 1. 
Set up a Python environment 16 | ```bash 17 | conda create --name virtualizarr-lithops -y python=3.11 18 | conda activate virtualizarr-lithops 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | 2. Configure compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html). 23 | The configuration in `lithops.yaml` uses AWS Lambda for [compute](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html) and AWS S3 for [storage](https://lithops-cloud.github.io/docs/source/storage_config/aws_s3.html). 24 | To use those backends, simply edit `lithops.yaml` with your `bucket` and `execution_role`. 25 | 26 | 3. Build a Lithops runtime image for VirtualiZarr 27 | ```bash 28 | export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml 29 | lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime 30 | ``` 31 | 32 | 4. Run the script 33 | ```bash 34 | python virtualizarr-with-lithops.py 35 | ``` 36 | 37 | ## Cleaning up 38 | To rebuild the Lithops image, delete the existing one by running 39 | ```bash 40 | lithops runtime delete -b aws_lambda -d virtualizarr-runtime 41 | ``` 42 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/lithops.yaml: -------------------------------------------------------------------------------- 1 | lithops: 2 | backend: aws_lambda 3 | storage: aws_s3 4 | 5 | aws: 6 | region: us-west-2 7 | 8 | aws_lambda: 9 | execution_role: arn:aws:iam::807615458658:role/lambdaLithopsExecutionRole 10 | runtime: virtualizarr-runtime 11 | runtime_memory: 2000 12 | 13 | aws_s3: 14 | bucket: arn:aws:s3:::cubed-thodson-temp 15 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/requirements.txt: -------------------------------------------------------------------------------- 1 | boto 2 | cftime 3 | h5py 4 | kerchunk 5 | lithops 6 | s3fs 7 | virtualizarr 8 | xarray 9 | -------------------------------------------------------------------------------- /examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py: -------------------------------------------------------------------------------- 1 | # Use lithops to create a virtual dataset from a collection of netCDF files on s3. 2 | # 3 | # Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook 4 | # by norlandrhagen. 5 | # 6 | # Please, contribute improvements. 7 | 8 | import fsspec 9 | import lithops 10 | import xarray as xr 11 | 12 | from virtualizarr import open_virtual_dataset 13 | 14 | # To demonstrate this workflow, we will use a collection of netCDF files from the WRF-SE-AK-AR5 project. 
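# NOTE: anon=True assumes the bucket allows anonymous (public) access; use credentialed access for private buckets.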
15 | fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) 16 | files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") 17 | file_pattern = sorted(["s3://" + f for f in files_paths]) 18 | 19 | # optionally, truncate file_pattern while debugging 20 | # file_pattern = file_pattern[:4] 21 | 22 | print(f"{len(file_pattern)} file paths were retrieved.") 23 | 24 | 25 | def map_references(fil): 26 | """Map function to open virtual datasets.""" 27 | vds = open_virtual_dataset( 28 | fil, 29 | indexes={}, 30 | loadable_variables=["Time"], 31 | cftime_variables=["Time"], 32 | ) 33 | return vds 34 | 35 | 36 | def reduce_references(results): 37 | """Reduce to concat virtual datasets.""" 38 | combined_vds = xr.combine_nested( 39 | results, 40 | concat_dim=["Time"], 41 | coords="minimal", 42 | compat="override", 43 | ) 44 | return combined_vds 45 | 46 | 47 | fexec = lithops.FunctionExecutor(config_file="lithops.yaml") 48 | 49 | futures = fexec.map_reduce( 50 | map_references, 51 | file_pattern, 52 | reduce_references, 53 | spawn_reducer=100, 54 | ) 55 | 56 | ds = futures.get_result() 57 | 58 | # write out the virtual dataset to a kerchunk json 59 | ds.virtualize.to_kerchunk("combined.json", format="json") 60 | -------------------------------------------------------------------------------- /virtualizarr/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version as _version 2 | 3 | from virtualizarr.accessor import ( 4 | VirtualiZarrDatasetAccessor, 5 | VirtualiZarrDataTreeAccessor, 6 | ) 7 | from virtualizarr.backend import open_virtual_dataset, open_virtual_mfdataset 8 | from virtualizarr.manifests import ChunkManifest, ManifestArray 9 | 10 | try: 11 | __version__ = _version("virtualizarr") 12 | except Exception: 13 | # Local copy or not installed with setuptools. 14 | # Disable minimum version checks on downstream libraries. 15 | __version__ = "9999" 16 | 17 | __all__ = [ 18 | "ChunkManifest", 19 | "ManifestArray", 20 | "VirtualiZarrDatasetAccessor", 21 | "VirtualiZarrDataTreeAccessor", 22 | "open_virtual_dataset", 23 | "open_virtual_mfdataset", 24 | ] 25 | -------------------------------------------------------------------------------- /virtualizarr/codecs.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, Tuple, Union 2 | 3 | import numpy as np 4 | import zarr 5 | from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec 6 | from zarr.abc.codec import Codec as ZarrCodec 7 | from zarr.core.codec_pipeline import BatchedCodecPipeline 8 | from zarr.core.metadata.v3 import ArrayV3Metadata 9 | 10 | if TYPE_CHECKING: 11 | from .manifests.array import ManifestArray 12 | 13 | CodecPipeline = Tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] 14 | 15 | DeconstructedCodecPipeline = tuple[ 16 | tuple[ArrayArrayCodec, ...], # Array-to-array transformations 17 | ArrayBytesCodec | None, # Array-to-bytes conversion 18 | tuple[BytesBytesCodec, ...], # Bytes-to-bytes transformations 19 | ] 20 | 21 | 22 | def numcodec_config_to_configurable(num_codec: dict) -> dict: 23 | """ 24 | Convert a numcodecs codec into a zarr v3 configurable. 25 | """ 26 | if num_codec["id"].startswith("numcodecs."): 27 | return num_codec 28 | 29 | num_codec_copy = num_codec.copy() 30 | name = "numcodecs." 
+ num_codec_copy.pop("id") 31 | return {"name": name, "configuration": num_codec_copy} 32 | 33 | 34 | def extract_codecs( 35 | codecs: CodecPipeline, 36 | ) -> DeconstructedCodecPipeline: 37 | """Extracts various codec types.""" 38 | arrayarray_codecs: tuple[ArrayArrayCodec, ...] = () 39 | arraybytes_codec: ArrayBytesCodec | None = None 40 | bytesbytes_codecs: tuple[BytesBytesCodec, ...] = () 41 | for codec in codecs: 42 | if isinstance(codec, ArrayArrayCodec): 43 | arrayarray_codecs += (codec,) 44 | if isinstance(codec, ArrayBytesCodec): 45 | arraybytes_codec = codec 46 | if isinstance(codec, BytesBytesCodec): 47 | bytesbytes_codecs += (codec,) 48 | return (arrayarray_codecs, arraybytes_codec, bytesbytes_codecs) 49 | 50 | 51 | def convert_to_codec_pipeline( 52 | dtype: np.dtype, 53 | codecs: list[dict] | None = [], 54 | ) -> BatchedCodecPipeline: 55 | """ 56 | Convert list of codecs to valid BatchedCodecPipeline. 57 | 58 | Parameters 59 | ---------- 60 | dtype : np.dtype 61 | codecs: list[dict] | None 62 | 63 | Returns 64 | ------- 65 | BatchedCodecPipeline 66 | """ 67 | from zarr.core.array import _get_default_chunk_encoding_v3 68 | from zarr.registry import get_codec_class 69 | 70 | zarr_codecs: tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] = () 71 | if codecs and len(codecs) > 0: 72 | zarr_codecs = tuple( 73 | get_codec_class(codec["name"]).from_dict(codec) for codec in codecs 74 | ) 75 | 76 | # It would be nice to use zarr.core.codec_pipeline.codecs_from_list here but that function requires 77 | # array array codecs and array bytes codecs to already be present in the list and in the correct order. 78 | arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs) 79 | 80 | if arraybytes_codec is None: 81 | arraybytes_codec = _get_default_chunk_encoding_v3(dtype)[1] 82 | 83 | codec_pipeline = BatchedCodecPipeline( 84 | array_array_codecs=arrayarray_codecs, 85 | array_bytes_codec=arraybytes_codec, 86 | bytes_bytes_codecs=bytesbytes_codecs, 87 | batch_size=1, 88 | ) 89 | 90 | return codec_pipeline 91 | 92 | 93 | def get_codec_config(codec: ZarrCodec) -> dict[str, Any]: 94 | """ 95 | Extract configuration from a codec, handling both zarr-python and numcodecs codecs. 96 | """ 97 | 98 | if hasattr(codec, "codec_config"): 99 | return codec.codec_config 100 | elif hasattr(codec, "get_config"): 101 | return codec.get_config() 102 | elif hasattr(codec, "_zstd_codec"): 103 | # related issue: https://github.com/zarr-developers/VirtualiZarr/issues/514 104 | # very silly workaround. codec.to_dict for zstd gives: 105 | # {'name': 'zstd', 'configuration': {'level': 0, 'checksum': False}} 106 | # which when passed through ArrayV2Metadata -> numcodecs.get_codec gives the error: 107 | # *** numcodecs.errors.UnknownCodecError: codec not available: 'None' 108 | # if codec._zstd_codec.get_config() : {'id': 'zstd', 'level': 0, 'checksum': False} 109 | # is passed to numcodecs.get_codec. It works fine. 110 | return codec._zstd_codec.get_config() 111 | elif hasattr(codec, "to_dict"): 112 | return codec.to_dict() 113 | else: 114 | raise ValueError(f"Unable to parse codec configuration: {codec}") 115 | 116 | 117 | def get_codecs(array: Union["ManifestArray", "zarr.Array"]) -> CodecPipeline: 118 | """ 119 | Get the zarr v3 codec pipeline for either a ManifestArray or a Zarr Array. 120 | 121 | Parameters 122 | ---------- 123 | array : Union[ManifestArray, Array] 124 | The input array, either ManifestArray or Zarr Array. 
125 | 126 | Returns 127 | ------- 128 | CodecPipeline 129 | A tuple of zarr v3 codecs representing the codec pipeline. 130 | 131 | Raises 132 | ------ 133 | ValueError 134 | If the array type is unsupported or the array's metadata is not in zarr v3 format. 135 | """ 136 | if not isinstance(array.metadata, ArrayV3Metadata): 137 | raise ValueError( 138 | "Only zarr v3 format arrays are supported. Please convert your array to v3 format." 139 | ) 140 | 141 | return array.metadata.codecs 142 | -------------------------------------------------------------------------------- /virtualizarr/manifests/__init__.py: -------------------------------------------------------------------------------- 1 | # Note: This directory is named "manifests" rather than "manifest". 2 | # This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring. 3 | 4 | from virtualizarr.manifests.array import ManifestArray # type: ignore # noqa 5 | from virtualizarr.manifests.group import ManifestGroup # type: ignore # noqa 6 | from virtualizarr.manifests.manifest import ChunkEntry, ChunkManifest # type: ignore # noqa 7 | from virtualizarr.manifests.store import ManifestStore, ObjectStoreRegistry # type: ignore # noqa 8 | -------------------------------------------------------------------------------- /virtualizarr/manifests/group.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import textwrap 4 | from typing import Iterator, Mapping 5 | 6 | import xarray as xr 7 | from zarr.core.group import GroupMetadata 8 | 9 | from virtualizarr.manifests import ManifestArray 10 | 11 | 12 | class ManifestGroup( 13 | Mapping[str, "ManifestArray | ManifestGroup"], 14 | ): 15 | """ 16 | Immutable representation of a single virtual zarr group. 17 | """ 18 | 19 | _members: Mapping[str, "ManifestArray | ManifestGroup"] 20 | _metadata: GroupMetadata 21 | 22 | def __init__( 23 | self, 24 | arrays: Mapping[str, ManifestArray] | None = None, 25 | groups: Mapping[str, "ManifestGroup"] | None = None, 26 | attributes: dict | None = None, 27 | ) -> None: 28 | """ 29 | Create a ManifestGroup containing ManifestArrays and/or sub-groups, as well as any group-level metadata. 30 | 31 | Parameters 32 | ---------- 33 | arrays : Mapping[str, ManifestArray], optional 34 | ManifestArray objects to represent virtual zarr arrays. 35 | groups : Mapping[str, ManifestGroup], optional 36 | ManifestGroup objects to represent virtual zarr subgroups. 37 | attributes : dict, optional 38 | Zarr attributes to add as zarr group metadata. 39 | """ 40 | self._metadata = GroupMetadata(attributes=attributes) 41 | 42 | _arrays: Mapping[str, ManifestArray] = {} if arrays is None else arrays 43 | 44 | if groups: 45 | # TODO add support for nested groups 46 | raise NotImplementedError 47 | else: 48 | _groups: Mapping[str, ManifestGroup] = {} if groups is None else groups 49 | 50 | for name, arr in _arrays.items(): 51 | if not isinstance(arr, ManifestArray): 52 | raise TypeError( 53 | f"ManifestGroup can only wrap ManifestArray objects, but array {name} passed is of type {type(arr)}" 54 | ) 55 | 56 | # TODO type check groups passed 57 | 58 | # TODO check that all arrays have the same shapes or dimensions? 
59 | # Technically that's allowed by the zarr model, so we should theoretically only check that upon converting to xarray 60 | 61 | colliding_names = set(_arrays.keys()).intersection(set(_groups.keys())) 62 | if colliding_names: 63 | raise ValueError( 64 | f"Some names collide as they are present in both the array and group keys: {colliding_names}" 65 | ) 66 | 67 | self._members = {**_arrays, **_groups} 68 | 69 | @property 70 | def metadata(self) -> GroupMetadata: 71 | """Zarr group metadata.""" 72 | return self._metadata 73 | 74 | @property 75 | def arrays(self) -> dict[str, ManifestArray]: 76 | """ManifestArrays contained in this group.""" 77 | return {k: v for k, v in self._members.items() if isinstance(v, ManifestArray)} 78 | 79 | @property 80 | def groups(self) -> dict[str, "ManifestGroup"]: 81 | """Subgroups contained in this group.""" 82 | return {k: v for k, v in self._members.items() if isinstance(v, ManifestGroup)} 83 | 84 | def __getitem__(self, path: str) -> "ManifestArray | ManifestGroup": 85 | """Obtain a group member.""" 86 | if "/" in path: 87 | raise ValueError( 88 | f"ManifestGroup.__getitem__ can only be used to get immediate subgroups and subarrays, but received multi-part path {path}" 89 | ) 90 | 91 | return self._members[path] 92 | 93 | def __iter__(self) -> Iterator[str]: 94 | return iter(self._members.keys()) 95 | 96 | def __len__(self) -> int: 97 | return len(self._members) 98 | 99 | def __repr__(self) -> str: 100 | return textwrap.dedent( 101 | f""" 102 | ManifestGroup( 103 | arrays={self.arrays}, 104 | groups={self.groups}, 105 | metadata={self.metadata}, 106 | ) 107 | """ 108 | ) 109 | 110 | def to_virtual_dataset(self) -> xr.Dataset: 111 | """ 112 | Create a "virtual" xarray.Dataset containing the contents of one zarr group. 113 | 114 | All variables in the returned Dataset will be "virtual", i.e. they will wrap ManifestArray objects. 115 | """ 116 | 117 | from virtualizarr.xarray import construct_fully_virtual_dataset 118 | 119 | # The xarray data model stores coordinate names outside of the arbitrary extra metadata it can store on a Dataset, 120 | # so to avoid that information being duplicated we strip it from the zarr group attributes before storing it. 
121 | metadata_dict = self.metadata.to_dict() 122 | attributes = metadata_dict["attributes"] 123 | coord_names = attributes.pop("coordinates", []) 124 | 125 | virtual_vars = { 126 | name: marr.to_virtual_variable() for name, marr in self.arrays.items() 127 | } 128 | 129 | return construct_fully_virtual_dataset( 130 | virtual_vars=virtual_vars, 131 | coord_names=coord_names, 132 | attrs=attributes, 133 | ) 134 | -------------------------------------------------------------------------------- /virtualizarr/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/py.typed -------------------------------------------------------------------------------- /virtualizarr/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from virtualizarr.readers.dmrpp import DMRPPVirtualBackend 2 | from virtualizarr.readers.fits import FITSVirtualBackend 3 | from virtualizarr.readers.hdf import HDFVirtualBackend 4 | from virtualizarr.readers.hdf5 import HDF5VirtualBackend 5 | from virtualizarr.readers.kerchunk import KerchunkVirtualBackend 6 | from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend 7 | from virtualizarr.readers.tiff import TIFFVirtualBackend 8 | from virtualizarr.readers.zarr import ( 9 | ZarrVirtualBackend, 10 | ) 11 | 12 | __all__ = [ 13 | "DMRPPVirtualBackend", 14 | "FITSVirtualBackend", 15 | "HDFVirtualBackend", 16 | "HDF5VirtualBackend", 17 | "KerchunkVirtualBackend", 18 | "NetCDF3VirtualBackend", 19 | "TIFFVirtualBackend", 20 | "ZarrVirtualBackend", 21 | ] 22 | -------------------------------------------------------------------------------- /virtualizarr/readers/api.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from collections.abc import Iterable, Mapping 3 | from typing import Optional 4 | 5 | import xarray as xr 6 | 7 | 8 | class VirtualBackend(ABC): 9 | @staticmethod 10 | def open_virtual_dataset( 11 | filepath: str, 12 | group: str | None = None, 13 | drop_variables: Iterable[str] | None = None, 14 | loadable_variables: Iterable[str] | None = None, 15 | decode_times: bool | None = None, 16 | indexes: Mapping[str, xr.Index] | None = None, 17 | virtual_backend_kwargs: Optional[dict] = None, 18 | reader_options: Optional[dict] = None, 19 | ) -> xr.Dataset: 20 | raise NotImplementedError() 21 | 22 | @staticmethod 23 | def open_virtual_datatree( 24 | path: str, 25 | group: str | None = None, 26 | drop_variables: Iterable[str] | None = None, 27 | loadable_variables: Iterable[str] | None = None, 28 | decode_times: bool | None = None, 29 | indexes: Mapping[str, xr.Index] | None = None, 30 | virtual_backend_kwargs: Optional[dict] = None, 31 | reader_options: Optional[dict] = None, 32 | ) -> xr.DataTree: 33 | raise NotImplementedError() 34 | -------------------------------------------------------------------------------- /virtualizarr/readers/fits.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | from xarray import Dataset, Index 5 | 6 | from virtualizarr.readers.api import ( 7 | VirtualBackend, 8 | ) 9 | from virtualizarr.translators.kerchunk import ( 10 | extract_group, 11 | virtual_vars_and_metadata_from_kerchunk_refs, 12 | ) 13 | from virtualizarr.types.kerchunk import KerchunkStoreRefs 
14 | from virtualizarr.xarray import construct_fully_virtual_dataset 15 | 16 | 17 | class FITSVirtualBackend(VirtualBackend): 18 | @staticmethod 19 | def open_virtual_dataset( 20 | filepath: str, 21 | group: str | None = None, 22 | drop_variables: Iterable[str] | None = None, 23 | loadable_variables: Iterable[str] | None = None, 24 | decode_times: bool | None = None, 25 | indexes: Mapping[str, Index] | None = None, 26 | virtual_backend_kwargs: Optional[dict] = None, 27 | reader_options: Optional[dict] = None, 28 | ) -> Dataset: 29 | from kerchunk.fits import process_file 30 | 31 | if virtual_backend_kwargs: 32 | raise NotImplementedError( 33 | "FITS reader does not understand any virtual_backend_kwargs" 34 | ) 35 | 36 | _drop_vars: list[Hashable] = ( 37 | [] if drop_variables is None else list(drop_variables) 38 | ) 39 | 40 | # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 41 | refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) 42 | 43 | # both group=None and group='' mean to read root group 44 | if group: 45 | refs = extract_group(refs, group) 46 | 47 | # TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly 48 | if loadable_variables or indexes: 49 | raise NotImplementedError( 50 | "Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS" 51 | ) 52 | 53 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 54 | refs, 55 | fs_root=Path.cwd().as_uri(), 56 | ) 57 | 58 | vds = construct_fully_virtual_dataset( 59 | virtual_vars=virtual_vars, 60 | coord_names=coord_names, 61 | attrs=attrs, 62 | ) 63 | 64 | return vds.drop_vars(_drop_vars) 65 | -------------------------------------------------------------------------------- /virtualizarr/readers/hdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .hdf import ( 2 | HDFVirtualBackend, 3 | ) 4 | 5 | __all__ = [ 6 | "HDFVirtualBackend", 7 | ] 8 | -------------------------------------------------------------------------------- /virtualizarr/readers/hdf/filters.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | from typing import TYPE_CHECKING, List, Tuple, TypedDict, Union 5 | 6 | import numcodecs.registry as registry 7 | import numpy as np 8 | from numcodecs.abc import Codec 9 | from numcodecs.fixedscaleoffset import FixedScaleOffset 10 | from xarray.coding.variables import _choose_float_dtype 11 | 12 | from virtualizarr.utils import soft_import 13 | 14 | h5py = soft_import("h5py", "For reading hdf files", strict=False) 15 | 16 | 17 | if TYPE_CHECKING: 18 | from h5py import Dataset 19 | 20 | 21 | hdf5plugin = soft_import( 22 | "hdf5plugin", "For reading hdf files with filters", strict=False 23 | ) 24 | imagecodecs = soft_import( 25 | "imagecodecs", "For reading hdf files with filters", strict=False 26 | ) 27 | 28 | _non_standard_filters = { 29 | "gzip": "zlib", 30 | "lzf": "imagecodecs_lzf", 31 | } 32 | 33 | _hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} 34 | 35 | 36 | @dataclasses.dataclass 37 | class BloscProperties: 38 | blocksize: int 39 | clevel: int 40 | shuffle: int 41 | cname: str 42 | 43 | def __post_init__(self): 44 | blosc_compressor_codes = { 45 | value: key 46 | for key, value in 
hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() 47 | } 48 | self.cname = blosc_compressor_codes[self.cname] 49 | 50 | 51 | @dataclasses.dataclass 52 | class ZstdProperties: 53 | level: int 54 | 55 | 56 | @dataclasses.dataclass 57 | class ShuffleProperties: 58 | elementsize: int 59 | 60 | 61 | @dataclasses.dataclass 62 | class ZlibProperties: 63 | level: int 64 | 65 | 66 | class CFCodec(TypedDict): 67 | target_dtype: np.dtype 68 | codec: Codec 69 | 70 | 71 | def _filter_to_codec( 72 | filter_id: str, filter_properties: Union[int, None, Tuple] = None 73 | ) -> Codec: 74 | """ 75 | Convert an h5py filter to an equivalent numcodec 76 | 77 | Parameters 78 | ---------- 79 | filter_id: str 80 | An h5py filter id code. 81 | filter_properties : int or None or Tuple 82 | A single or Tuple of h5py filter configuration codes. 83 | 84 | Returns 85 | ------- 86 | A numcodec codec 87 | """ 88 | id_int = None 89 | id_str = None 90 | try: 91 | id_int = int(filter_id) 92 | except ValueError: 93 | id_str = filter_id 94 | conf = {} 95 | if id_str: 96 | if id_str in _non_standard_filters.keys(): 97 | id = _non_standard_filters[id_str] 98 | else: 99 | id = id_str 100 | if id == "zlib": 101 | zlib_props = ZlibProperties(level=filter_properties) # type: ignore 102 | conf = dataclasses.asdict(zlib_props) 103 | if id == "shuffle" and isinstance(filter_properties, tuple): 104 | shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) 105 | conf = dataclasses.asdict(shuffle_props) 106 | conf["id"] = id # type: ignore[assignment] 107 | if id_int: 108 | filter = hdf5plugin.get_filters(id_int)[0] 109 | id = filter.filter_name 110 | if id in _hdf5plugin_imagecodecs.keys(): 111 | id = _hdf5plugin_imagecodecs[id] 112 | if id == "blosc" and isinstance(filter_properties, tuple): 113 | blosc_fields = [field.name for field in dataclasses.fields(BloscProperties)] 114 | blosc_props = BloscProperties( 115 | **{k: v for k, v in zip(blosc_fields, filter_properties[-4:])} 116 | ) 117 | conf = dataclasses.asdict(blosc_props) 118 | if id == "zstd" and isinstance(filter_properties, tuple): 119 | zstd_props = ZstdProperties(level=filter_properties[0]) 120 | conf = dataclasses.asdict(zstd_props) 121 | conf["id"] = id 122 | codec = registry.get_codec(conf) 123 | return codec 124 | 125 | 126 | def cfcodec_from_dataset(dataset: Dataset) -> Codec | None: 127 | """ 128 | Converts select h5py dataset CF convention attrs to CFCodec 129 | 130 | Parameters 131 | ---------- 132 | dataset: h5py.Dataset 133 | An h5py dataset. 134 | 135 | Returns 136 | ------- 137 | CFCodec 138 | A CFCodec. 
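
    Notes
    -----
    Only the CF ``scale_factor`` and ``add_offset`` attributes are inspected.
    When neither is present (i.e. scale 1 and offset 0) there is nothing to
    decode and ``None`` is returned; otherwise a ``numcodecs.FixedScaleOffset``
    codec is constructed whose floating-point target dtype is chosen with
    xarray's ``_choose_float_dtype``.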
139 | """ 140 | attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} 141 | mapping = {} 142 | if "scale_factor" in attributes: 143 | try: 144 | scale_factor = attributes["scale_factor"][0] 145 | except IndexError: 146 | scale_factor = attributes["scale_factor"] 147 | mapping["scale_factor"] = float(1 / scale_factor) 148 | else: 149 | mapping["scale_factor"] = 1 150 | if "add_offset" in attributes: 151 | try: 152 | offset = attributes["add_offset"][0] 153 | except IndexError: 154 | offset = attributes["add_offset"] 155 | mapping["add_offset"] = float(offset) 156 | else: 157 | mapping["add_offset"] = 0 158 | if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: 159 | float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) 160 | target_dtype = np.dtype(float_dtype) 161 | codec = FixedScaleOffset( 162 | offset=mapping["add_offset"], 163 | scale=mapping["scale_factor"], 164 | dtype=target_dtype, 165 | astype=dataset.dtype, 166 | ) 167 | cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) 168 | return cfcodec 169 | else: 170 | return None 171 | 172 | 173 | def codecs_from_dataset(dataset: Dataset) -> List[Codec]: 174 | """ 175 | Extracts a list of numcodecs from an h5py dataset 176 | 177 | Parameters 178 | ---------- 179 | dataset: h5py.Dataset 180 | An h5py dataset. 181 | 182 | Returns 183 | ------- 184 | list 185 | A list of numcodecs codecs. 186 | """ 187 | codecs = [] 188 | for filter_id, filter_properties in dataset._filters.items(): 189 | codec = _filter_to_codec(filter_id, filter_properties) 190 | codecs.append(codec) 191 | return codecs 192 | -------------------------------------------------------------------------------- /virtualizarr/readers/hdf5.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | from xarray import Dataset, Index 5 | 6 | from virtualizarr.readers.api import VirtualBackend 7 | from virtualizarr.translators.kerchunk import ( 8 | extract_group, 9 | virtual_vars_and_metadata_from_kerchunk_refs, 10 | ) 11 | from virtualizarr.xarray import ( 12 | construct_fully_virtual_dataset, 13 | construct_virtual_dataset, 14 | ) 15 | 16 | 17 | class HDF5VirtualBackend(VirtualBackend): 18 | @staticmethod 19 | def open_virtual_dataset( 20 | filepath: str, 21 | group: str | None = None, 22 | drop_variables: Iterable[str] | None = None, 23 | loadable_variables: Iterable[str] | None = None, 24 | decode_times: bool | None = None, 25 | indexes: Mapping[str, Index] | None = None, 26 | virtual_backend_kwargs: Optional[dict] = None, 27 | reader_options: Optional[dict] = None, 28 | ) -> Dataset: 29 | from kerchunk.hdf import SingleHdf5ToZarr 30 | 31 | if virtual_backend_kwargs: 32 | raise NotImplementedError( 33 | "HDF5 reader does not understand any virtual_backend_kwargs" 34 | ) 35 | 36 | _drop_vars: list[Hashable] = ( 37 | [] if drop_variables is None else list(drop_variables) 38 | ) 39 | 40 | refs = SingleHdf5ToZarr( 41 | filepath, inline_threshold=0, **reader_options 42 | ).translate() 43 | 44 | # both group=None and group='' mean to read root group 45 | if group: 46 | refs = extract_group(refs, group) 47 | 48 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 49 | refs, 50 | fs_root=Path.cwd().as_uri(), 51 | ) 52 | 53 | fully_virtual_dataset = construct_fully_virtual_dataset( 54 | virtual_vars=virtual_vars, 55 | coord_names=coord_names, 56 | attrs=attrs, 57 | ) 58 | 59 | vds = 
construct_virtual_dataset( 60 | fully_virtual_ds=fully_virtual_dataset, 61 | filepath=filepath, 62 | group=group, 63 | loadable_variables=loadable_variables, 64 | reader_options=reader_options, 65 | indexes=indexes, 66 | decode_times=decode_times, 67 | ) 68 | 69 | return vds.drop_vars(_drop_vars) 70 | -------------------------------------------------------------------------------- /virtualizarr/readers/kerchunk.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | import ujson 5 | from xarray import Dataset, Index 6 | 7 | from virtualizarr.readers.api import VirtualBackend 8 | from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs 9 | from virtualizarr.types.kerchunk import ( 10 | KerchunkStoreRefs, 11 | ) 12 | from virtualizarr.utils import _FsspecFSFromFilepath 13 | 14 | 15 | class KerchunkVirtualBackend(VirtualBackend): 16 | @staticmethod 17 | def open_virtual_dataset( 18 | filepath: str, 19 | group: str | None = None, 20 | drop_variables: Iterable[str] | None = None, 21 | loadable_variables: Iterable[str] | None = None, 22 | decode_times: bool | None = None, 23 | indexes: Mapping[str, Index] | None = None, 24 | virtual_backend_kwargs: Optional[dict] = None, 25 | reader_options: Optional[dict] = None, 26 | ) -> Dataset: 27 | """Reads existing kerchunk references (in JSON or parquet) format.""" 28 | 29 | if virtual_backend_kwargs is None: 30 | virtual_backend_kwargs = {} 31 | 32 | _drop_vars: list[Hashable] = ( 33 | [] if drop_variables is None else list(drop_variables) 34 | ) 35 | 36 | fs_root = virtual_backend_kwargs.pop("fs_root", None) 37 | 38 | if virtual_backend_kwargs: 39 | raise NotImplementedError( 40 | f"Kerchunk reader does not understand any of the virtual_backend_kwargs {virtual_backend_kwargs}" 41 | ) 42 | 43 | if group: 44 | raise NotImplementedError() 45 | 46 | if loadable_variables or indexes or decode_times: 47 | raise NotImplementedError() 48 | 49 | # TODO: whilst this keeps backwards-compatible behaviour for the `loadable_variables`` kwarg, 50 | # it probably has to change, see https://github.com/zarr-developers/VirtualiZarr/pull/477/#issuecomment-2744448626 51 | if loadable_variables is None or indexes is None: 52 | warnings.warn( 53 | "The default value of the `loadable_variables` kwarg may attempt to load data from the referenced virtual chunks." 54 | "As this is unlikely to be the desired behaviour when opening a Kerchunk file, `loadable_variables` has been overridden, and set to `loadable_variables=[]`." 55 | "To silence this warning pass `loadable_variables` explicitly.", 56 | UserWarning, 57 | ) 58 | loadable_variables = [] 59 | indexes = {} 60 | 61 | fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) 62 | 63 | # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. 64 | if fs.filepath.endswith(".parquet") and fs.fs.isfile( 65 | f"{fs.filepath}/.zmetadata" 66 | ): 67 | from fsspec.implementations.reference import LazyReferenceMapper 68 | 69 | lrm = LazyReferenceMapper(filepath, fs.fs) 70 | 71 | # build reference dict from KV pairs in LazyReferenceMapper 72 | # is there a better / more performant way to extract this? 
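                # The mapper behaves like a flat {key: reference} mapping in the
                # same shape as the "refs" section of a kerchunk JSON file, e.g.
                # (illustrative keys only)
                # {".zgroup": "...", "air/.zarray": "...", "air/0.0": [url, offset, length]},
                # so materialising it eagerly gives a plain in-memory refs dict.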
73 | array_refs = {k: lrm[k] for k in lrm.keys()} 74 | 75 | full_reference = {"refs": array_refs} 76 | 77 | vds = dataset_from_kerchunk_refs( 78 | KerchunkStoreRefs(full_reference), fs_root=fs_root 79 | ) 80 | 81 | # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': 82 | # https://fsspec.github.io/kerchunk/spec.html 83 | elif fs.read_bytes(9).startswith(b'{"version'): 84 | with fs.open_file() as of: 85 | refs = ujson.load(of) 86 | 87 | vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root) 88 | 89 | else: 90 | raise ValueError( 91 | "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" 92 | ) 93 | 94 | # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict 95 | return vds.drop_vars(_drop_vars) 96 | -------------------------------------------------------------------------------- /virtualizarr/readers/netcdf3.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Hashable, Iterable, Mapping, Optional 3 | 4 | from xarray import Dataset, Index 5 | 6 | from virtualizarr.readers.api import VirtualBackend 7 | from virtualizarr.translators.kerchunk import ( 8 | virtual_vars_and_metadata_from_kerchunk_refs, 9 | ) 10 | from virtualizarr.xarray import ( 11 | construct_fully_virtual_dataset, 12 | construct_virtual_dataset, 13 | ) 14 | 15 | 16 | class NetCDF3VirtualBackend(VirtualBackend): 17 | @staticmethod 18 | def open_virtual_dataset( 19 | filepath: str, 20 | group: str | None = None, 21 | drop_variables: Iterable[str] | None = None, 22 | loadable_variables: Iterable[str] | None = None, 23 | decode_times: bool | None = None, 24 | indexes: Mapping[str, Index] | None = None, 25 | virtual_backend_kwargs: Optional[dict] = None, 26 | reader_options: Optional[dict] = None, 27 | ) -> Dataset: 28 | from kerchunk.netCDF3 import NetCDF3ToZarr 29 | 30 | if virtual_backend_kwargs: 31 | raise NotImplementedError( 32 | "netcdf3 reader does not understand any virtual_backend_kwargs" 33 | ) 34 | 35 | _drop_vars: list[Hashable] = ( 36 | [] if drop_variables is None else list(drop_variables) 37 | ) 38 | 39 | refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() 40 | 41 | # both group=None and group='' mean to read root group 42 | if group: 43 | raise ValueError( 44 | "group kwarg passed, but netCDF3 files can't have multiple groups!" 
45 | ) 46 | 47 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 48 | refs, 49 | fs_root=Path.cwd().as_uri(), 50 | ) 51 | 52 | fully_virtual_dataset = construct_fully_virtual_dataset( 53 | virtual_vars=virtual_vars, 54 | coord_names=coord_names, 55 | attrs=attrs, 56 | ) 57 | 58 | vds = construct_virtual_dataset( 59 | fully_virtual_ds=fully_virtual_dataset, 60 | filepath=filepath, 61 | group=group, 62 | loadable_variables=loadable_variables, 63 | reader_options=reader_options, 64 | indexes=indexes, 65 | decode_times=decode_times, 66 | ) 67 | 68 | return vds.drop_vars(_drop_vars) 69 | -------------------------------------------------------------------------------- /virtualizarr/readers/tiff.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | from typing import Hashable, Iterable, Mapping, Optional 4 | 5 | from xarray import Dataset, Index 6 | 7 | from virtualizarr.readers.api import VirtualBackend 8 | from virtualizarr.translators.kerchunk import ( 9 | extract_group, 10 | virtual_vars_and_metadata_from_kerchunk_refs, 11 | ) 12 | from virtualizarr.types.kerchunk import KerchunkStoreRefs 13 | from virtualizarr.xarray import ( 14 | construct_fully_virtual_dataset, 15 | construct_virtual_dataset, 16 | ) 17 | 18 | 19 | class TIFFVirtualBackend(VirtualBackend): 20 | @staticmethod 21 | def open_virtual_dataset( 22 | filepath: str, 23 | group: str | None = None, 24 | drop_variables: Iterable[str] | None = None, 25 | loadable_variables: Iterable[str] | None = None, 26 | decode_times: bool | None = None, 27 | indexes: Mapping[str, Index] | None = None, 28 | virtual_backend_kwargs: Optional[dict] = None, 29 | reader_options: Optional[dict] = None, 30 | ) -> Dataset: 31 | if virtual_backend_kwargs: 32 | raise NotImplementedError( 33 | "TIFF reader does not understand any virtual_backend_kwargs" 34 | ) 35 | 36 | from kerchunk.tiff import tiff_to_zarr 37 | 38 | if reader_options is None: 39 | reader_options = {} 40 | 41 | reader_options.pop("storage_options", {}) 42 | warnings.warn( 43 | "storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr", 44 | UserWarning, 45 | ) 46 | 47 | _drop_vars: list[Hashable] = ( 48 | [] if drop_variables is None else list(drop_variables) 49 | ) 50 | 51 | # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 52 | refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)}) 53 | 54 | # both group=None and group='' mean to read root group 55 | if group: 56 | refs = extract_group(refs, group) 57 | 58 | virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( 59 | refs, 60 | fs_root=Path.cwd().as_uri(), 61 | ) 62 | 63 | fully_virtual_dataset = construct_fully_virtual_dataset( 64 | virtual_vars=virtual_vars, 65 | coord_names=coord_names, 66 | attrs=attrs, 67 | ) 68 | 69 | vds = construct_virtual_dataset( 70 | fully_virtual_ds=fully_virtual_dataset, 71 | filepath=filepath, 72 | group=group, 73 | loadable_variables=loadable_variables, 74 | reader_options=reader_options, 75 | indexes=indexes, 76 | decode_times=decode_times, 77 | ) 78 | 79 | return vds.drop_vars(_drop_vars) 80 | -------------------------------------------------------------------------------- /virtualizarr/readers/zarr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import 
asyncio 4 | from pathlib import Path # noqa 5 | from typing import ( 6 | Any, 7 | Hashable, 8 | Iterable, 9 | Mapping, 10 | Optional, 11 | ) 12 | 13 | import numpy as np 14 | from xarray import Dataset, Index 15 | from zarr.api.asynchronous import open_group as open_group_async 16 | from zarr.core.metadata import ArrayV3Metadata 17 | 18 | from virtualizarr.manifests import ( 19 | ChunkManifest, 20 | ManifestArray, 21 | ManifestGroup, 22 | ManifestStore, 23 | ) 24 | from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri # noqa 25 | from virtualizarr.readers.api import VirtualBackend 26 | from virtualizarr.vendor.zarr.core.common import _concurrent_map 27 | 28 | FillValueT = bool | str | float | int | list | None 29 | 30 | ZARR_DEFAULT_FILL_VALUE: dict[str, FillValueT] = { 31 | # numpy dtypes's hierarchy lets us avoid checking for all the widths 32 | # https://numpy.org/doc/stable/reference/arrays.scalars.html 33 | np.dtype("bool").kind: False, 34 | np.dtype("int").kind: 0, 35 | np.dtype("float").kind: 0.0, 36 | np.dtype("complex").kind: [0.0, 0.0], 37 | np.dtype("datetime64").kind: 0, 38 | } 39 | 40 | 41 | import zarr 42 | 43 | 44 | async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, filepath: str) -> dict: 45 | """Create a dictionary to pass into ChunkManifest __init__""" 46 | 47 | # TODO: For when we want to support reading V2 we should parse the /c/ and "/" between chunks 48 | if zarr_array.shape == (): 49 | # If we have a scalar array `c` 50 | # https://zarr-specs.readthedocs.io/en/latest/v3/chunk-key-encodings/default/index.html#description 51 | 52 | prefix = zarr_array.name.lstrip("/") + "/c" 53 | prefix_keys = [(prefix,)] 54 | _lengths = [await zarr_array.store.getsize("c")] 55 | _dict_keys = ["c"] 56 | _paths = [filepath + "/" + _dict_keys[0]] 57 | 58 | else: 59 | prefix = zarr_array.name.lstrip("/") + "/c/" 60 | prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] 61 | _lengths = await _concurrent_map(prefix_keys, zarr_array.store.getsize) 62 | chunk_keys = [x[0].split(prefix)[1] for x in prefix_keys] 63 | _dict_keys = [key.replace("/", ".") for key in chunk_keys] 64 | _paths = [filepath + "/" + prefix + key for key in chunk_keys] 65 | 66 | _offsets = [0] * len(_lengths) 67 | return { 68 | key: {"path": path, "offset": offset, "length": length} 69 | for key, path, offset, length in zip( 70 | _dict_keys, 71 | _paths, 72 | _offsets, 73 | _lengths, 74 | ) 75 | } 76 | 77 | 78 | async def build_chunk_manifest( 79 | zarr_array: zarr.AsyncArray, filepath: str 80 | ) -> ChunkManifest: 81 | """Build a ChunkManifest from a dictionary""" 82 | chunk_map = await get_chunk_mapping_prefix(zarr_array=zarr_array, filepath=filepath) 83 | return ChunkManifest(chunk_map) 84 | 85 | 86 | def get_metadata(zarr_array: zarr.AsyncArray[Any]) -> ArrayV3Metadata: 87 | fill_value = zarr_array.metadata.fill_value 88 | if fill_value is not None: 89 | fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] 90 | 91 | zarr_format = zarr_array.metadata.zarr_format 92 | 93 | if zarr_format == 2: 94 | # TODO: Once we want to support V2, we will have to deconstruct the 95 | # zarr_array codecs etc. 
and reconstruct them with create_v3_array_metadata 96 | raise NotImplementedError("Reading Zarr V2 currently not supported.") 97 | 98 | elif zarr_format == 3: 99 | return zarr_array.metadata 100 | 101 | else: 102 | raise NotImplementedError("Zarr format is not recognized as v2 or v3.") 103 | 104 | 105 | async def _construct_manifest_array(zarr_array: zarr.AsyncArray[Any], filepath: str): 106 | array_metadata = get_metadata(zarr_array=zarr_array) 107 | 108 | chunk_manifest = await build_chunk_manifest(zarr_array, filepath=filepath) 109 | return ManifestArray(metadata=array_metadata, chunkmanifest=chunk_manifest) 110 | 111 | 112 | async def _construct_manifest_group( 113 | filepath: str, 114 | *, 115 | reader_options: Optional[dict] = None, 116 | drop_variables: str | Iterable[str] | None = None, 117 | group: str | None = None, 118 | ): 119 | reader_options = reader_options or {} 120 | zarr_group = await open_group_async( 121 | filepath, 122 | storage_options=reader_options.get("storage_options"), 123 | path=group, 124 | mode="r", 125 | ) 126 | 127 | zarr_array_keys = [key async for key in zarr_group.array_keys()] 128 | 129 | _drop_vars: list[Hashable] = [] if drop_variables is None else list(drop_variables) 130 | 131 | zarr_arrays = await asyncio.gather( 132 | *[zarr_group.getitem(var) for var in zarr_array_keys if var not in _drop_vars] 133 | ) 134 | 135 | manifest_arrays = await asyncio.gather( 136 | *[ 137 | _construct_manifest_array(zarr_array=array, filepath=filepath) # type: ignore[arg-type] 138 | for array in zarr_arrays 139 | ] 140 | ) 141 | 142 | manifest_dict = { 143 | array.basename: result for array, result in zip(zarr_arrays, manifest_arrays) 144 | } 145 | return ManifestGroup(manifest_dict, attributes=zarr_group.attrs) 146 | 147 | 148 | def _construct_manifest_store( 149 | filepath: str, 150 | *, 151 | reader_options: Optional[dict] = None, 152 | drop_variables: str | Iterable[str] | None = None, 153 | group: str | None = None, 154 | ) -> ManifestStore: 155 | import asyncio 156 | 157 | manifest_group = asyncio.run( 158 | _construct_manifest_group( 159 | filepath=filepath, 160 | group=group, 161 | drop_variables=drop_variables, 162 | reader_options=reader_options, 163 | ) 164 | ) 165 | return ManifestStore(manifest_group) 166 | 167 | 168 | class ZarrVirtualBackend(VirtualBackend): 169 | @staticmethod 170 | def open_virtual_dataset( 171 | filepath: str, 172 | group: str | None = None, 173 | drop_variables: str | Iterable[str] | None = None, 174 | loadable_variables: Iterable[str] | None = None, 175 | decode_times: bool | None = None, 176 | indexes: Mapping[str, Index] | None = None, 177 | virtual_backend_kwargs: Optional[dict] = None, 178 | reader_options: Optional[dict] = None, 179 | ) -> Dataset: 180 | filepath = validate_and_normalize_path_to_uri( 181 | filepath, fs_root=Path.cwd().as_uri() 182 | ) 183 | 184 | manifest_store = _construct_manifest_store( 185 | filepath=filepath, 186 | group=group, 187 | drop_variables=drop_variables, 188 | reader_options=reader_options, 189 | ) 190 | 191 | ds = manifest_store.to_virtual_dataset( 192 | loadable_variables=loadable_variables, 193 | decode_times=decode_times, 194 | indexes=indexes, 195 | ) 196 | return ds 197 | -------------------------------------------------------------------------------- /virtualizarr/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import pytest 4 | from packaging.version import Version 5 | 6 | from virtualizarr.readers import 
HDF5VirtualBackend 7 | from virtualizarr.readers.hdf import HDFVirtualBackend 8 | 9 | requires_network = pytest.mark.network 10 | requires_minio = pytest.mark.minio 11 | 12 | 13 | def _importorskip( 14 | modname: str, minversion: str | None = None 15 | ) -> tuple[bool, pytest.MarkDecorator]: 16 | try: 17 | mod = importlib.import_module(modname) 18 | has = True 19 | if minversion is not None: 20 | v = getattr(mod, "__version__", "999") 21 | if Version(v) < Version(minversion): 22 | raise ImportError("Minimum version not satisfied") 23 | except ImportError: 24 | has = False 25 | 26 | reason = f"requires {modname}" 27 | if minversion is not None: 28 | reason += f">={minversion}" 29 | func = pytest.mark.skipif(not has, reason=reason) 30 | return has, func 31 | 32 | 33 | has_astropy, requires_astropy = _importorskip("astropy") 34 | has_icechunk, requires_icechunk = _importorskip("icechunk") 35 | has_kerchunk, requires_kerchunk = _importorskip("kerchunk") 36 | has_fastparquet, requires_fastparquet = _importorskip("fastparquet") 37 | has_s3fs, requires_s3fs = _importorskip("s3fs") 38 | has_lithops, requires_lithops = _importorskip("lithops") 39 | has_scipy, requires_scipy = _importorskip("scipy") 40 | has_tifffile, requires_tifffile = _importorskip("tifffile") 41 | has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") 42 | has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") 43 | has_zarr_python, requires_zarr_python = _importorskip("zarr") 44 | has_dask, requires_dask = _importorskip("dask") 45 | has_obstore, requires_obstore = _importorskip("obstore") 46 | 47 | parametrize_over_hdf_backends = pytest.mark.parametrize( 48 | "hdf_backend", 49 | [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend], 50 | ) 51 | -------------------------------------------------------------------------------- /virtualizarr/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def container(): 9 | import docker 10 | 11 | client = docker.from_env() 12 | port = 9000 13 | minio_container = client.containers.run( 14 | "quay.io/minio/minio", 15 | "server /data", 16 | detach=True, 17 | ports={f"{port}/tcp": port}, 18 | environment={ 19 | "MINIO_ACCESS_KEY": "minioadmin", 20 | "MINIO_SECRET_KEY": "minioadmin", 21 | }, 22 | ) 23 | time.sleep(3) # give it time to boot 24 | # enter 25 | yield { 26 | "port": port, 27 | "endpoint": f"http://localhost:{port}", 28 | "username": "minioadmin", 29 | "password": "minioadmin", 30 | } 31 | # exit 32 | minio_container.stop() 33 | minio_container.remove() 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | def minio_bucket(container): 38 | # Setup with guidance from https://medium.com/@sant1/using-minio-with-docker-and-python-cbbad397cb5d 39 | from minio import Minio 40 | 41 | bucket = "my-bucket" 42 | filename = "test.nc" 43 | # Initialize MinIO client 44 | client = Minio( 45 | "localhost:9000", 46 | access_key=container["username"], 47 | secret_key=container["password"], 48 | secure=False, 49 | ) 50 | client.make_bucket(bucket) 51 | policy = { 52 | "Version": "2012-10-17", 53 | "Statement": [ 54 | { 55 | "Effect": "Allow", 56 | "Principal": {"AWS": "*"}, 57 | "Action": ["s3:GetBucketLocation", "s3:ListBucket"], 58 | "Resource": "arn:aws:s3:::my-bucket", 59 | }, 60 | { 61 | "Effect": "Allow", 62 | "Principal": {"AWS": "*"}, 63 | "Action": [ 64 | "s3:GetObject", 65 | 
"s3:GetObjectRetention", 66 | "s3:GetObjectLegalHold", 67 | ], 68 | "Resource": "arn:aws:s3:::my-bucket/*", 69 | }, 70 | ], 71 | } 72 | client.set_bucket_policy(bucket, json.dumps(policy)) 73 | yield { 74 | "port": container["port"], 75 | "endpoint": container["endpoint"], 76 | "username": container["username"], 77 | "password": container["password"], 78 | "bucket": bucket, 79 | "file": filename, 80 | "client": client, 81 | } 82 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_codecs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from zarr.codecs import BytesCodec 4 | from zarr.core.codec_pipeline import BatchedCodecPipeline 5 | from zarr.registry import get_codec_class 6 | 7 | from conftest import ( 8 | ARRAYBYTES_CODEC, 9 | BLOSC_CODEC, 10 | DELTA_CODEC, 11 | ZLIB_CODEC, 12 | ) 13 | from virtualizarr.codecs import ( 14 | convert_to_codec_pipeline, 15 | extract_codecs, 16 | get_codec_config, 17 | get_codecs, 18 | ) 19 | 20 | 21 | class TestGetCodecs: 22 | """Test the get_codecs function.""" 23 | 24 | def test_manifest_array_zarr_v3_default(self, manifest_array): 25 | """Test get_codecs with ManifestArray using default v3 codec.""" 26 | test_manifest_array = manifest_array(codecs=None) 27 | actual_codecs = get_codecs(test_manifest_array) 28 | expected_codecs = tuple([BytesCodec(endian="little")]) 29 | assert actual_codecs == expected_codecs 30 | 31 | def test_manifest_array_with_codecs(self, manifest_array): 32 | """Test get_codecs with ManifestArray using multiple v3 codecs.""" 33 | test_codecs = [DELTA_CODEC, ARRAYBYTES_CODEC, BLOSC_CODEC] 34 | manifest_array = manifest_array(codecs=test_codecs) 35 | actual_codecs = get_codecs(manifest_array) 36 | assert actual_codecs == tuple( 37 | [ 38 | get_codec_class(codec["name"])(**codec["configuration"]) 39 | for codec in test_codecs 40 | ] 41 | ) 42 | 43 | def test_zarr_v3_default_codecs(self, zarr_array): 44 | """Test get_codecs with Zarr array using default v3 codec.""" 45 | zarr_array = zarr_array() 46 | actual_codecs = get_codecs(zarr_array) 47 | assert isinstance(actual_codecs[0], BytesCodec) 48 | 49 | def test_zarr_v3_with_codecs(self, zarr_array): 50 | """Test get_codecs with Zarr array using multiple v3 codecs.""" 51 | test_codecs = [DELTA_CODEC, ARRAYBYTES_CODEC, BLOSC_CODEC] 52 | zarr_array = zarr_array(codecs=test_codecs) 53 | actual_codecs = get_codecs(zarr_array) 54 | assert actual_codecs == tuple( 55 | [ 56 | get_codec_class(codec["name"])(**codec["configuration"]) 57 | for codec in test_codecs 58 | ] 59 | ) 60 | 61 | def test_zarr_v2_error(self, zarr_array): 62 | """Test that using v2 format raises an error.""" 63 | zarr_array = zarr_array(zarr_format=2) 64 | with pytest.raises( 65 | ValueError, 66 | match="Only zarr v3 format arrays are supported. 
Please convert your array to v3 format.", 67 | ): 68 | get_codecs(zarr_array) 69 | 70 | 71 | class TestConvertToCodecPipeline: 72 | """Test the convert_to_codec_pipeline function.""" 73 | 74 | @pytest.mark.parametrize( 75 | "input_codecs,expected_pipeline", 76 | [ 77 | # Case 1: No codecs - should result in just BytesCodec 78 | ( 79 | None, 80 | BatchedCodecPipeline( 81 | array_array_codecs=(), 82 | array_bytes_codec=BytesCodec(endian="little"), 83 | bytes_bytes_codecs=(), 84 | batch_size=1, 85 | ), 86 | ), 87 | # Case 2: Delta codec - should result in DeltaCodec + BytesCodec 88 | ( 89 | [DELTA_CODEC], 90 | BatchedCodecPipeline( 91 | array_array_codecs=( 92 | get_codec_class("numcodecs.delta").from_dict(DELTA_CODEC), # type: ignore[arg-type] 93 | ), 94 | array_bytes_codec=BytesCodec(endian="little"), 95 | bytes_bytes_codecs=(), 96 | batch_size=1, 97 | ), 98 | ), 99 | # Case 3: Delta + Blosc + Zlib - should result in all codecs + BytesCodec 100 | ( 101 | [DELTA_CODEC, BLOSC_CODEC, ZLIB_CODEC], 102 | BatchedCodecPipeline( 103 | array_array_codecs=( 104 | get_codec_class("numcodecs.delta").from_dict(DELTA_CODEC), # type: ignore[arg-type] 105 | ), 106 | array_bytes_codec=BytesCodec(endian="little"), 107 | bytes_bytes_codecs=( 108 | get_codec_class(key="blosc").from_dict(BLOSC_CODEC), # type: ignore[arg-type] 109 | get_codec_class("numcodecs.zlib").from_dict(ZLIB_CODEC), # type: ignore[arg-type] 110 | ), 111 | batch_size=1, 112 | ), 113 | ), 114 | ], 115 | ) 116 | def test_convert_to_codec_pipeline_scenarios(self, input_codecs, expected_pipeline): 117 | """Test different scenarios for convert_to_codec_pipeline function.""" 118 | dtype = np.dtype("}, 31 | groups={}, 32 | metadata=GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'), 33 | ) 34 | """ 35 | ) 36 | assert repr(manifest_group) == expected_repr 37 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/tests/test_readers/__init__.py -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/test_fits.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from xarray import Dataset 3 | 4 | from virtualizarr import open_virtual_dataset 5 | from virtualizarr.tests import requires_kerchunk, requires_network 6 | 7 | pytest.importorskip("astropy") 8 | 9 | 10 | @requires_kerchunk 11 | @requires_network 12 | @pytest.mark.xfail( 13 | reason="Big endian not yet supported by zarr-python 3.0" 14 | ) # https://github.com/zarr-developers/zarr-python/issues/2324 15 | def test_open_hubble_data(): 16 | # data from https://registry.opendata.aws/hst/ 17 | vds = open_virtual_dataset( 18 | "s3://stpubdata/hst/public/f05i/f05i0201m/f05i0201m_a1f.fits", 19 | reader_options={"storage_options": {"anon": True}}, 20 | ) 21 | 22 | assert isinstance(vds, Dataset) 23 | assert list(vds.variables) == ["PRIMARY"] 24 | var = vds["PRIMARY"].variable 25 | assert var.sizes == {"y": 17, "x": 589} 26 | assert var.dtype == ">i4" 27 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/test_hdf/test_hdf.py: -------------------------------------------------------------------------------- 1 | import h5py # 
type: ignore 2 | import numpy as np 3 | import pytest 4 | from obstore.store import LocalStore 5 | 6 | from virtualizarr import open_virtual_dataset 7 | from virtualizarr.readers.hdf import HDFVirtualBackend 8 | from virtualizarr.tests import ( 9 | requires_hdf5plugin, 10 | requires_imagecodecs, 11 | ) 12 | 13 | 14 | @requires_hdf5plugin 15 | @requires_imagecodecs 16 | class TestDatasetChunkManifest: 17 | def test_empty_chunks(self, empty_chunks_hdf5_file): 18 | f = h5py.File(empty_chunks_hdf5_file) 19 | ds = f["data"] 20 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 21 | path=empty_chunks_hdf5_file, dataset=ds 22 | ) 23 | assert manifest.shape_chunk_grid == (0,) 24 | 25 | def test_empty_dataset(self, empty_dataset_hdf5_file): 26 | f = h5py.File(empty_dataset_hdf5_file) 27 | ds = f["data"] 28 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 29 | path=empty_dataset_hdf5_file, dataset=ds 30 | ) 31 | assert manifest.shape_chunk_grid == (0,) 32 | 33 | def test_no_chunking(self, no_chunks_hdf5_file): 34 | f = h5py.File(no_chunks_hdf5_file) 35 | ds = f["data"] 36 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 37 | path=no_chunks_hdf5_file, dataset=ds 38 | ) 39 | assert manifest.shape_chunk_grid == (1, 1) 40 | 41 | def test_chunked(self, chunked_hdf5_file): 42 | f = h5py.File(chunked_hdf5_file) 43 | ds = f["data"] 44 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 45 | path=chunked_hdf5_file, dataset=ds 46 | ) 47 | assert manifest.shape_chunk_grid == (2, 2) 48 | 49 | def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): 50 | f = h5py.File(chunked_roundtrip_hdf5_file) 51 | ds = f["var2"] 52 | manifest = HDFVirtualBackend._dataset_chunk_manifest( 53 | path=chunked_roundtrip_hdf5_file, dataset=ds 54 | ) 55 | assert manifest.shape_chunk_grid == (2, 8) 56 | 57 | 58 | @requires_hdf5plugin 59 | @requires_imagecodecs 60 | class TestDatasetDims: 61 | def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): 62 | f = h5py.File(single_dimension_scale_hdf5_file) 63 | ds = f["data"] 64 | dims = HDFVirtualBackend._dataset_dims(ds) 65 | assert dims[0] == "x" 66 | 67 | def test_is_dimension_scale(self, is_scale_hdf5_file): 68 | f = h5py.File(is_scale_hdf5_file) 69 | ds = f["data"] 70 | dims = HDFVirtualBackend._dataset_dims(ds) 71 | assert dims[0] == "data" 72 | 73 | def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): 74 | f = h5py.File(multiple_dimension_scales_hdf5_file) 75 | ds = f["data"] 76 | with pytest.raises(ValueError, match="dimension scales attached"): 77 | HDFVirtualBackend._dataset_dims(ds) 78 | 79 | def test_no_dimension_scales(self, no_chunks_hdf5_file): 80 | f = h5py.File(no_chunks_hdf5_file) 81 | ds = f["data"] 82 | dims = HDFVirtualBackend._dataset_dims(ds) 83 | assert dims == ["phony_dim_0", "phony_dim_1"] 84 | 85 | 86 | @requires_hdf5plugin 87 | @requires_imagecodecs 88 | class TestDatasetToManifestArray: 89 | def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): 90 | f = h5py.File(chunked_dimensions_netcdf4_file) 91 | ds = f["data"] 92 | ma = HDFVirtualBackend._construct_manifest_array( 93 | chunked_dimensions_netcdf4_file, ds, group="" 94 | ) 95 | assert ma.chunks == (50, 50) 96 | 97 | def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): 98 | f = h5py.File(single_dimension_scale_hdf5_file) 99 | ds = f["data"] 100 | ma = HDFVirtualBackend._construct_manifest_array( 101 | single_dimension_scale_hdf5_file, ds, group="" 102 | ) 103 | assert ma.chunks == (2,) 104 | 105 | def 
test_dataset_attributes(self, string_attributes_hdf5_file): 106 | f = h5py.File(string_attributes_hdf5_file) 107 | ds = f["data"] 108 | ma = HDFVirtualBackend._construct_manifest_array( 109 | string_attributes_hdf5_file, ds, group="" 110 | ) 111 | assert ma.metadata.attributes["attribute_name"] == "attribute_name" 112 | 113 | def test_scalar_fill_value(self, scalar_fill_value_hdf5_file): 114 | f = h5py.File(scalar_fill_value_hdf5_file) 115 | ds = f["data"] 116 | ma = HDFVirtualBackend._construct_manifest_array( 117 | scalar_fill_value_hdf5_file, ds, group="" 118 | ) 119 | assert ma.metadata.fill_value == 42 120 | 121 | def test_cf_fill_value(self, cf_fill_value_hdf5_file): 122 | f = h5py.File(cf_fill_value_hdf5_file) 123 | ds = f["data"] 124 | if ds.dtype.kind in "S": 125 | pytest.xfail("Investigate fixed-length binary encoding in Zarr v3") 126 | if ds.dtype.names: 127 | pytest.xfail( 128 | "To fix, structured dtype fill value encoding for Zarr backend" 129 | ) 130 | ma = HDFVirtualBackend._construct_manifest_array( 131 | cf_fill_value_hdf5_file, ds, group="" 132 | ) 133 | assert "_FillValue" in ma.metadata.attributes 134 | 135 | def test_cf_array_fill_value(self, cf_array_fill_value_hdf5_file): 136 | f = h5py.File(cf_array_fill_value_hdf5_file) 137 | ds = f["data"] 138 | ma = HDFVirtualBackend._construct_manifest_array( 139 | cf_array_fill_value_hdf5_file, ds, group="" 140 | ) 141 | assert not isinstance(ma.metadata.attributes["_FillValue"], np.ndarray) 142 | 143 | 144 | @requires_hdf5plugin 145 | @requires_imagecodecs 146 | class TestExtractAttributes: 147 | def test_string_attribute(self, string_attributes_hdf5_file): 148 | f = h5py.File(string_attributes_hdf5_file) 149 | ds = f["data"] 150 | attrs = HDFVirtualBackend._extract_attrs(ds) 151 | assert attrs["attribute_name"] == "attribute_name" 152 | 153 | def test_root_attribute(self, root_attributes_hdf5_file): 154 | f = h5py.File(root_attributes_hdf5_file) 155 | attrs = HDFVirtualBackend._extract_attrs(f) 156 | assert attrs["attribute_name"] == "attribute_name" 157 | 158 | def test_multiple_attributes(self, string_attributes_hdf5_file): 159 | f = h5py.File(string_attributes_hdf5_file) 160 | ds = f["data"] 161 | attrs = HDFVirtualBackend._extract_attrs(ds) 162 | assert len(attrs.keys()) == 2 163 | 164 | 165 | @requires_hdf5plugin 166 | @requires_imagecodecs 167 | class TestManifestGroupFromHDF: 168 | def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): 169 | store = LocalStore() 170 | manifest_group = HDFVirtualBackend._construct_manifest_group( 171 | store=store, 172 | filepath=chunked_dimensions_netcdf4_file, 173 | ) 174 | assert len(manifest_group.arrays) == 3 175 | 176 | def test_nested_groups_are_ignored(self, nested_group_hdf5_file): 177 | store = LocalStore() 178 | manifest_group = HDFVirtualBackend._construct_manifest_group( 179 | store=store, 180 | filepath=nested_group_hdf5_file, 181 | group="group", 182 | ) 183 | assert len(manifest_group.arrays) == 1 184 | 185 | def test_drop_variables(self, multiple_datasets_hdf5_file): 186 | store = LocalStore() 187 | manifest_group = HDFVirtualBackend._construct_manifest_group( 188 | store=store, 189 | filepath=multiple_datasets_hdf5_file, 190 | drop_variables=["data2"], 191 | ) 192 | assert "data2" not in manifest_group.arrays.keys() 193 | 194 | def test_dataset_in_group(self, group_hdf5_file): 195 | store = LocalStore() 196 | manifest_group = HDFVirtualBackend._construct_manifest_group( 197 | store=store, 198 | filepath=group_hdf5_file, 199 | group="group", 200 | ) 
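        # Only the dataset that lives inside "group" should be surfaced as a
        # manifest array; variables outside the requested group are excluded.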
201 | assert len(manifest_group.arrays) == 1 202 | 203 | def test_non_group_error(self, group_hdf5_file): 204 | store = LocalStore() 205 | with pytest.raises(ValueError): 206 | HDFVirtualBackend._construct_manifest_group( 207 | store=store, 208 | filepath=group_hdf5_file, 209 | group="group/data", 210 | ) 211 | 212 | 213 | @requires_hdf5plugin 214 | @requires_imagecodecs 215 | class TestOpenVirtualDataset: 216 | def test_coord_names( 217 | self, 218 | root_coordinates_hdf5_file, 219 | ): 220 | vds = HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file) 221 | assert set(vds.coords) == {"lat", "lon"} 222 | 223 | @pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support") 224 | def test_big_endian( 225 | self, 226 | big_endian_dtype_hdf5_file, 227 | ): 228 | vds = HDFVirtualBackend.open_virtual_dataset(big_endian_dtype_hdf5_file) 229 | print(vds) 230 | 231 | 232 | @requires_hdf5plugin 233 | @requires_imagecodecs 234 | @pytest.mark.parametrize("group", [None, "subgroup", "subgroup/"]) 235 | def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group): 236 | # regression test for GH issue #364 237 | vds = open_virtual_dataset( 238 | netcdf4_file_with_data_in_multiple_groups, 239 | group=group, 240 | backend=HDFVirtualBackend, 241 | ) 242 | assert list(vds.dims) == ["dim_0"] 243 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import h5py # type: ignore 4 | import numcodecs 5 | import numpy as np 6 | 7 | try: 8 | import imagecodecs # noqa 9 | except ModuleNotFoundError: 10 | imagecodecs = None # type: ignore 11 | warnings.warn("imagecodecs is required for HDF reader") 12 | 13 | 14 | from virtualizarr.readers.hdf.filters import ( 15 | _filter_to_codec, 16 | cfcodec_from_dataset, 17 | codecs_from_dataset, 18 | ) 19 | from virtualizarr.tests import ( 20 | requires_hdf5plugin, 21 | requires_imagecodecs, 22 | ) 23 | 24 | 25 | @requires_hdf5plugin 26 | @requires_imagecodecs 27 | class TestFilterToCodec: 28 | def test_gzip_uses_zlib_numcodec(self): 29 | codec = _filter_to_codec("gzip", 1) 30 | assert isinstance(codec, numcodecs.zlib.Zlib) 31 | 32 | def test_lzf(self): 33 | codec = _filter_to_codec("lzf") 34 | assert isinstance(codec, imagecodecs.numcodecs.Lzf) 35 | 36 | def test_blosc(self): 37 | import numcodecs 38 | from packaging import version 39 | 40 | codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) 41 | assert isinstance(codec, numcodecs.blosc.Blosc) 42 | expected_config = { 43 | "id": "blosc", 44 | "blocksize": 800, 45 | "clevel": 9, 46 | "shuffle": 2, 47 | "cname": "lz4", 48 | } 49 | if ( 50 | version.parse("0.16.1") 51 | > version.parse(numcodecs.__version__) 52 | > version.parse("0.15.1") 53 | ): 54 | expected_config["typesize"] = None 55 | assert codec.get_config() == expected_config 56 | 57 | def test_zstd(self): 58 | codec = _filter_to_codec("32015", (5,)) 59 | assert isinstance(codec, numcodecs.zstd.Zstd) 60 | config = codec.get_config() 61 | assert config["id"] == "zstd" 62 | assert config["level"] == 5 63 | 64 | def test_shuffle(self): 65 | codec = _filter_to_codec("shuffle", (7,)) 66 | assert isinstance(codec, numcodecs.shuffle.Shuffle) 67 | expected_config = {"id": "shuffle", "elementsize": 7} 68 | assert codec.get_config() == expected_config 69 | 70 | 71 | @requires_hdf5plugin 72 | @requires_imagecodecs 73 | class 
TestCodecsFromDataSet: 74 | def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): 75 | f = h5py.File(filter_encoded_hdf5_file) 76 | ds = f["data"] 77 | chunk_info = ds.id.get_chunk_info(0) 78 | codecs = codecs_from_dataset(ds) 79 | with open(filter_encoded_hdf5_file, "rb") as file: 80 | file.seek(chunk_info.byte_offset) 81 | bytes_read = file.read(chunk_info.size) 82 | decoded = codecs[0].decode(bytes_read) 83 | if isinstance(decoded, np.ndarray): 84 | assert decoded.tobytes() == np_uncompressed.tobytes() 85 | else: 86 | assert decoded == np_uncompressed.tobytes() 87 | 88 | 89 | @requires_hdf5plugin 90 | @requires_imagecodecs 91 | class TestCFCodecFromDataset: 92 | def test_no_cf_convention(self, filter_encoded_hdf5_file): 93 | f = h5py.File(filter_encoded_hdf5_file) 94 | ds = f["data"] 95 | cf_codec = cfcodec_from_dataset(ds) 96 | assert cf_codec is None 97 | 98 | def test_cf_scale_factor(self, netcdf4_file): 99 | f = h5py.File(netcdf4_file) 100 | ds = f["air"] 101 | cf_codec = cfcodec_from_dataset(ds) 102 | assert cf_codec["target_dtype"] == np.dtype(np.float64) 103 | assert cf_codec["codec"].scale == 100.0 104 | assert cf_codec["codec"].offset == 0 105 | assert cf_codec["codec"].dtype == " xr.Dataset: 18 | return xr.Dataset( 19 | {"x": xr.DataArray([10, 20, 30], dims="a", coords={"a": [0, 1, 2]})} 20 | ) 21 | 22 | 23 | def test_fsspec_openfile_from_path(tmp_path: pathlib.Path, dataset: xr.Dataset) -> None: 24 | f = tmp_path / "dataset.nc" 25 | dataset.to_netcdf(f) 26 | 27 | result = _FsspecFSFromFilepath(filepath=f.as_posix()).open_file() 28 | assert isinstance(result, fsspec.implementations.local.LocalFileOpener) 29 | 30 | 31 | @requires_scipy 32 | def test_fsspec_openfile_memory(dataset: xr.Dataset): 33 | fs = fsspec.filesystem("memory") 34 | with contextlib.redirect_stderr(None): 35 | # Suppress "Exception ignored in: " 36 | with fs.open("dataset.nc", mode="wb") as f: 37 | dataset.to_netcdf(f, engine="h5netcdf") 38 | 39 | result = _FsspecFSFromFilepath(filepath="memory://dataset.nc").open_file() 40 | with result: 41 | assert isinstance(result, fsspec.implementations.memory.MemoryFile) 42 | 43 | 44 | def test_copy_and_replace_metadata(array_v3_metadata): 45 | old_metadata = array_v3_metadata( 46 | shape=(10, 10), 47 | data_type=np.dtype("float32"), 48 | chunks=(5, 5), 49 | fill_value=0, 50 | ) 51 | 52 | new_shape = (20, 20) 53 | new_chunks = (10, 10) 54 | 55 | # Test updating both shape and chunk shape 56 | updated_metadata = copy_and_replace_metadata( 57 | old_metadata, new_shape=new_shape, new_chunks=new_chunks 58 | ) 59 | assert updated_metadata.shape == tuple(new_shape) 60 | assert updated_metadata.chunks == tuple(new_chunks) 61 | # Test other values are still the same 62 | assert updated_metadata.data_type == old_metadata.data_type 63 | assert updated_metadata.fill_value == old_metadata.fill_value 64 | -------------------------------------------------------------------------------- /virtualizarr/tests/test_writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/tests/test_writers/__init__.py -------------------------------------------------------------------------------- /virtualizarr/tests/test_writers/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from xarray import Dataset 4 | from xarray.core.variable 
import Variable 5 | 6 | from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC 7 | from virtualizarr.manifests import ChunkManifest, ManifestArray 8 | 9 | 10 | @pytest.fixture 11 | def vds_with_manifest_arrays(array_v3_metadata) -> Dataset: 12 | arr = ManifestArray( 13 | chunkmanifest=ChunkManifest( 14 | entries={"0.0": dict(path="/test.nc", offset=6144, length=48)} 15 | ), 16 | metadata=array_v3_metadata( 17 | shape=(2, 3), 18 | data_type=np.dtype(" None: 31 | import obstore as obs 32 | 33 | parsed = urlparse(path) 34 | 35 | self._reader = obs.open_reader(store, parsed.path) 36 | 37 | def read(self, size: int, /) -> bytes: 38 | return self._reader.read(size).to_bytes() 39 | 40 | def seek(self, offset: int, whence: int = 0, /): 41 | # TODO: Check on default for whence 42 | return self._reader.seek(offset, whence) 43 | 44 | def tell(self) -> int: 45 | return self._reader.tell() 46 | 47 | 48 | @dataclass 49 | class _FsspecFSFromFilepath: 50 | """Class to create fsspec Filesystem from input filepath. 51 | 52 | Parameters 53 | ---------- 54 | filepath : str 55 | Input filepath 56 | reader_options : dict, optional 57 | dict containing kwargs to pass to file opener, by default {} 58 | fs : Option | None 59 | The fsspec filesystem object, created in __post_init__ 60 | 61 | """ 62 | 63 | filepath: str 64 | reader_options: Optional[dict] = field(default_factory=dict) 65 | fs: fsspec.AbstractFileSystem = field(init=False) 66 | upath: upath.core.UPath = field(init=False) 67 | 68 | def open_file(self) -> OpenFileType: 69 | """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input. 70 | 71 | Returns 72 | ------- 73 | OpenFileType 74 | file opened with fsspec 75 | """ 76 | return self.fs.open(self.filepath) 77 | 78 | def read_bytes(self, bytes: int) -> bytes: 79 | with self.open_file() as of: 80 | return of.read(bytes) 81 | 82 | def get_mapper(self): 83 | """Returns a mapper for use with Zarr""" 84 | return self.fs.get_mapper(self.filepath) 85 | 86 | def __post_init__(self) -> None: 87 | """Initialize the fsspec filesystem object""" 88 | import fsspec 89 | from upath import UPath 90 | 91 | if not isinstance(self.filepath, UPath): 92 | upath = UPath(self.filepath) 93 | 94 | self.upath = upath 95 | self.protocol = upath.protocol 96 | 97 | self.reader_options = self.reader_options or {} 98 | storage_options = self.reader_options.get("storage_options", {}) # type: ignore 99 | 100 | self.fs = fsspec.filesystem(self.protocol, **storage_options) 101 | 102 | 103 | def check_for_collisions( 104 | drop_variables: Iterable[str] | None, 105 | loadable_variables: Iterable[str] | None, 106 | ) -> tuple[list[str], list[str]]: 107 | if drop_variables is None: 108 | drop_variables = [] 109 | elif isinstance(drop_variables, str): 110 | drop_variables = [drop_variables] 111 | else: 112 | drop_variables = list(drop_variables) 113 | 114 | if loadable_variables is None: 115 | loadable_variables = [] 116 | elif isinstance(loadable_variables, str): 117 | loadable_variables = [loadable_variables] 118 | else: 119 | loadable_variables = list(loadable_variables) 120 | 121 | common = set(drop_variables).intersection(set(loadable_variables)) 122 | if common: 123 | raise ValueError(f"Cannot both load and drop variables {common}") 124 | 125 | return drop_variables, loadable_variables 126 | 127 | 128 | def soft_import(name: str, reason: str, strict: Optional[bool] = True): 129 | try: 130 | return importlib.import_module(name) 131 | except (ImportError, ModuleNotFoundError): 132 | if strict: 133 | raise ImportError( 
134 | f"for {reason}, the {name} package is required. " 135 | f"Please install it via pip or conda." 136 | ) 137 | else: 138 | return None 139 | 140 | 141 | def ceildiv(a: int, b: int) -> int: 142 | """ 143 | Ceiling division operator for integers. 144 | 145 | See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python 146 | """ 147 | return -(a // -b) 148 | 149 | 150 | def determine_chunk_grid_shape( 151 | shape: tuple[int, ...], chunks: tuple[int, ...] 152 | ) -> tuple[int, ...]: 153 | """Calculate the shape of the chunk grid based on array shape and chunk size.""" 154 | return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) 155 | 156 | 157 | def convert_v3_to_v2_metadata( 158 | v3_metadata: ArrayV3Metadata, fill_value: Any = None 159 | ) -> ArrayV2Metadata: 160 | """ 161 | Convert ArrayV3Metadata to ArrayV2Metadata. 162 | 163 | Parameters 164 | ---------- 165 | v3_metadata : ArrayV3Metadata 166 | The metadata object in v3 format. 167 | fill_value : Any, optional 168 | Override the fill value from v3 metadata. 169 | 170 | Returns 171 | ------- 172 | ArrayV2Metadata 173 | The metadata object in v2 format. 174 | """ 175 | import warnings 176 | 177 | array_filters: tuple[ArrayArrayCodec, ...] 178 | bytes_compressors: tuple[BytesBytesCodec, ...] 179 | array_filters, _, bytes_compressors = extract_codecs(v3_metadata.codecs) 180 | # Handle compressor configuration 181 | compressor_config: dict[str, Any] | None = None 182 | if bytes_compressors: 183 | if len(bytes_compressors) > 1: 184 | warnings.warn( 185 | "Multiple compressors found in v3 metadata. Using the first compressor, " 186 | "others will be ignored. This may affect data compatibility.", 187 | UserWarning, 188 | ) 189 | compressor_config = get_codec_config(bytes_compressors[0]) 190 | 191 | # Handle filter configurations 192 | filter_configs = [get_codec_config(filter_) for filter_ in array_filters] 193 | 194 | v2_metadata = ArrayV2Metadata( 195 | shape=v3_metadata.shape, 196 | dtype=v3_metadata.data_type.to_numpy(), 197 | chunks=v3_metadata.chunks, 198 | fill_value=fill_value or v3_metadata.fill_value, 199 | compressor=compressor_config, 200 | filters=filter_configs, 201 | order="C", 202 | attributes=v3_metadata.attributes, 203 | dimension_separator=".", # Assuming '.' 
as default dimension separator 204 | ) 205 | return v2_metadata 206 | -------------------------------------------------------------------------------- /virtualizarr/vendor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/vendor/__init__.py -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/vendor/zarr/__init__.py -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/vendor/zarr/core/__init__.py -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/core/common.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from itertools import starmap 3 | from typing import ( 4 | Any, 5 | Awaitable, 6 | Callable, 7 | Iterable, 8 | TypeVar, 9 | ) 10 | 11 | # Vendored directly from Zarr-python V3's private API 12 | # https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53 13 | T = TypeVar("T", bound=tuple[Any, ...]) 14 | V = TypeVar("V") 15 | 16 | 17 | async def _concurrent_map( 18 | items: Iterable[T], 19 | func: Callable[..., Awaitable[V]], 20 | limit: int | None = None, 21 | ) -> list[V]: 22 | if limit is None: 23 | return await asyncio.gather(*list(starmap(func, items))) 24 | 25 | else: 26 | sem = asyncio.Semaphore(limit) 27 | 28 | async def run(item: tuple[Any]) -> V: 29 | async with sem: 30 | return await func(*item) 31 | 32 | return await asyncio.gather( 33 | *[asyncio.ensure_future(run(item)) for item in items] 34 | ) 35 | -------------------------------------------------------------------------------- /virtualizarr/vendor/zarr/core/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import numpy as np 5 | from zarr.core.buffer import Buffer, BufferPrototype 6 | from zarr.core.metadata.v3 import V3JsonEncoder 7 | 8 | 9 | def _replace_special_floats(obj: object) -> Any: 10 | """Helper function to replace NaN/Inf/-Inf values with special strings 11 | 12 | Note: this cannot be done in the V3JsonEncoder because Python's `json.dumps` optimistically 13 | converts NaN/Inf values to special types outside of the encoding step. 
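
    For example, ``{"fill_value": float("nan")}`` becomes
    ``{"fill_value": "NaN"}``; ``inf`` and ``-inf`` are spelled ``"Infinity"``
    and ``"-Infinity"`` respectively, matching the strings Zarr v3 metadata
    uses for non-finite fill values.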
14 | """ 15 | if isinstance(obj, float): 16 | if np.isnan(obj): 17 | return "NaN" 18 | elif np.isinf(obj): 19 | return "Infinity" if obj > 0 else "-Infinity" 20 | elif isinstance(obj, dict): 21 | # Recursively replace in dictionaries 22 | return {k: _replace_special_floats(v) for k, v in obj.items()} 23 | elif isinstance(obj, list): 24 | # Recursively replace in lists 25 | return [_replace_special_floats(item) for item in obj] 26 | return obj 27 | 28 | 29 | def dict_to_buffer(input: dict, prototype: BufferPrototype) -> Buffer: 30 | # modified from ArrayV3Metadata.to_buffer_dict 31 | d = _replace_special_floats(input) 32 | return prototype.buffer.from_bytes(json.dumps(d, cls=V3JsonEncoder).encode()) 33 | -------------------------------------------------------------------------------- /virtualizarr/writers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zarr-developers/VirtualiZarr/6fd8634b4a5f32ae17cab87ea886d3df3bed8a15/virtualizarr/writers/__init__.py -------------------------------------------------------------------------------- /virtualizarr/writers/kerchunk.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from typing import cast 4 | 5 | import numpy as np 6 | from xarray import Dataset, Variable 7 | from xarray.coding.times import CFDatetimeCoder 8 | from xarray.conventions import encode_dataset_coordinates 9 | 10 | from virtualizarr.manifests.manifest import join 11 | from virtualizarr.types.kerchunk import KerchunkArrRefs, KerchunkStoreRefs 12 | from virtualizarr.utils import convert_v3_to_v2_metadata 13 | 14 | 15 | class NumpyEncoder(json.JSONEncoder): 16 | """JSON encoder that handles common scientific Python types found in attributes. 17 | 18 | This encoder converts various Python types to JSON-serializable formats: 19 | - NumPy arrays and scalars to Python lists and native types 20 | - NumPy dtypes to strings 21 | - Sets to lists 22 | - Other objects that implement __array__ to lists 23 | - Objects with to_dict method (like pandas objects) 24 | - Objects with __str__ method as fallback 25 | """ 26 | 27 | def default(self, obj): 28 | if isinstance(obj, np.ndarray): 29 | return obj.tolist() # Convert NumPy array to Python list 30 | elif isinstance(obj, np.generic): 31 | return obj.item() # Convert NumPy scalar to Python scalar 32 | elif isinstance(obj, np.dtype): 33 | return str(obj) 34 | elif isinstance(obj, set): 35 | return list(obj) # Convert sets to lists 36 | elif hasattr(obj, "__array__"): 37 | return np.asarray(obj).tolist() # Handle array-like objects 38 | elif hasattr(obj, "to_dict"): 39 | return obj.to_dict() # Handle objects with to_dict method 40 | 41 | try: 42 | return json.JSONEncoder.default(self, obj) 43 | except TypeError: 44 | if hasattr(obj, "__str__"): 45 | return str(obj) 46 | raise 47 | 48 | 49 | def dataset_to_kerchunk_refs(ds: Dataset) -> KerchunkStoreRefs: 50 | """ 51 | Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). 
52 | """ 53 | 54 | import ujson 55 | 56 | # xarray's .to_zarr() does this, so we need to do it for kerchunk too 57 | variables, attrs = encode_dataset_coordinates(ds) 58 | 59 | all_arr_refs = {} 60 | for var_name, var in variables.items(): 61 | arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) 62 | 63 | prepended_with_var_name = { 64 | f"{var_name}/{key}": val for key, val in arr_refs.items() 65 | } 66 | all_arr_refs.update(prepended_with_var_name) 67 | 68 | ds_refs = { 69 | "version": 1, 70 | "refs": { 71 | ".zgroup": '{"zarr_format":2}', 72 | ".zattrs": ujson.dumps(attrs), 73 | **all_arr_refs, 74 | }, 75 | } 76 | 77 | return cast(KerchunkStoreRefs, ds_refs) 78 | 79 | 80 | def remove_file_uri_prefix(path: str): 81 | if path.startswith("file:///"): 82 | return path.removeprefix("file://") 83 | else: 84 | return path 85 | 86 | 87 | def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRefs: 88 | """ 89 | Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). 90 | 91 | Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). 92 | """ 93 | from virtualizarr.manifests import ManifestArray 94 | from virtualizarr.translators.kerchunk import to_kerchunk_json 95 | 96 | if isinstance(var.data, ManifestArray): 97 | marr = var.data 98 | 99 | arr_refs: dict[str, str | list[str | int]] = { 100 | str(chunk_key): [ 101 | remove_file_uri_prefix(entry["path"]), 102 | entry["offset"], 103 | entry["length"], 104 | ] 105 | for chunk_key, entry in marr.manifest.dict().items() 106 | } 107 | array_v2_metadata = convert_v3_to_v2_metadata(marr.metadata) 108 | zattrs = {**var.attrs, **var.encoding} 109 | else: 110 | from xarray.backends.zarr import encode_zarr_variable 111 | from zarr.core.metadata.v2 import ArrayV2Metadata 112 | 113 | var = encode_zarr_variable(var) 114 | try: 115 | np_arr = var.to_numpy() 116 | except AttributeError as e: 117 | raise TypeError( 118 | f"Can only serialize wrapped arrays of type ManifestArray or numpy.ndarray, but got type {type(var.data)}" 119 | ) from e 120 | 121 | if var.encoding: 122 | if "scale_factor" in var.encoding: 123 | raise NotImplementedError( 124 | f"Cannot serialize loaded variable {var_name}, as it is encoded with a scale_factor" 125 | ) 126 | if "offset" in var.encoding: 127 | raise NotImplementedError( 128 | f"Cannot serialize loaded variable {var_name}, as it is encoded with an offset" 129 | ) 130 | if "calendar" in var.encoding: 131 | np_arr = CFDatetimeCoder().encode(var.copy(), name=var_name).values 132 | dtype = var.encoding.get("dtype", None) 133 | if dtype and np_arr.dtype != dtype: 134 | np_arr = np.asarray(np_arr, dtype=dtype) 135 | 136 | # This encoding is what kerchunk does when it "inlines" data, see https://github.com/fsspec/kerchunk/blob/a0c4f3b828d37f6d07995925b324595af68c4a19/kerchunk/hdf.py#L472 137 | byte_data = np_arr.tobytes() 138 | # TODO do I really need to encode then decode like this? 139 | inlined_data = (b"base64:" + base64.b64encode(byte_data)).decode("utf-8") 140 | 141 | # TODO can this be generalized to save individual chunks of a dask array? 142 | # TODO will this fail for a scalar? 
143 | arr_refs = {join(0 for _ in np_arr.shape): inlined_data} 144 | 145 | array_v2_metadata = ArrayV2Metadata( 146 | chunks=np_arr.shape, 147 | shape=np_arr.shape, 148 | dtype=np_arr.dtype, 149 | order="C", 150 | fill_value=None, 151 | ) 152 | zattrs = {**var.attrs} 153 | 154 | zarray_dict = to_kerchunk_json(array_v2_metadata) 155 | arr_refs[".zarray"] = zarray_dict 156 | 157 | zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) 158 | arr_refs[".zattrs"] = json.dumps(zattrs, separators=(",", ":"), cls=NumpyEncoder) 159 | 160 | return cast(KerchunkArrRefs, arr_refs) 161 | -------------------------------------------------------------------------------- /virtualizarr/xarray.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable, Mapping 2 | from typing import ( 3 | Any, 4 | Hashable, 5 | MutableMapping, 6 | Optional, 7 | ) 8 | 9 | import xarray as xr 10 | import xarray.indexes 11 | 12 | from virtualizarr.manifests import ManifestStore 13 | from virtualizarr.utils import _FsspecFSFromFilepath 14 | 15 | 16 | def construct_fully_virtual_dataset( 17 | virtual_vars: Mapping[str, xr.Variable], 18 | coord_names: Iterable[str] | None = None, 19 | attrs: dict[str, Any] | None = None, 20 | ) -> xr.Dataset: 21 | """Construct a fully virtual Dataset from constituent parts.""" 22 | 23 | data_vars, coords = separate_coords( 24 | vars=virtual_vars, 25 | indexes={}, # we specifically avoid creating any indexes yet to avoid loading any data 26 | coord_names=coord_names, 27 | ) 28 | 29 | vds = xr.Dataset( 30 | data_vars=data_vars, 31 | coords=coords, 32 | attrs=attrs, 33 | ) 34 | 35 | return vds 36 | 37 | 38 | def construct_virtual_dataset( 39 | manifest_store: ManifestStore | None = None, 40 | # TODO remove filepath option once all readers use ManifestStore approach 41 | fully_virtual_ds: xr.Dataset | None = None, 42 | filepath: str | None = None, 43 | group: str | None = None, 44 | loadable_variables: Iterable[Hashable] | None = None, 45 | decode_times: bool | None = None, 46 | indexes: Mapping[str, xr.Index] | None = None, 47 | reader_options: Optional[dict] = None, 48 | ) -> xr.Dataset: 49 | """ 50 | Construct a fully or partly virtual dataset from a ManifestStore (or filepath for backwards compatibility), 51 | containing the contents of one group. 52 | 53 | Accepts EITHER manifest_store OR fully_virtual_ds and filepath. The latter option should be removed once all readers use ManifestStore approach. 
54 | """ 55 | 56 | if indexes is not None: 57 | raise NotImplementedError() 58 | 59 | if manifest_store: 60 | if group: 61 | raise NotImplementedError( 62 | "ManifestStore does not yet support nested groups" 63 | ) 64 | else: 65 | manifestgroup = manifest_store._group 66 | 67 | fully_virtual_ds = manifestgroup.to_virtual_dataset() 68 | 69 | with xr.open_zarr( 70 | manifest_store, 71 | group=group, 72 | consolidated=False, 73 | zarr_format=3, 74 | chunks=None, 75 | decode_times=decode_times, 76 | ) as loadable_ds: 77 | return replace_virtual_with_loadable_vars( 78 | fully_virtual_ds, loadable_ds, loadable_variables 79 | ) 80 | else: 81 | # TODO pre-ManifestStore codepath, remove once all readers use ManifestStore approach 82 | 83 | fpath = _FsspecFSFromFilepath( 84 | filepath=filepath, # type: ignore[arg-type] 85 | reader_options=reader_options, 86 | ).open_file() 87 | 88 | with xr.open_dataset( 89 | fpath, # type: ignore[arg-type] 90 | group=group, 91 | decode_times=decode_times, 92 | ) as loadable_ds: 93 | return replace_virtual_with_loadable_vars( 94 | fully_virtual_ds, # type: ignore[arg-type] 95 | loadable_ds, 96 | loadable_variables, 97 | ) 98 | 99 | 100 | def replace_virtual_with_loadable_vars( 101 | fully_virtual_ds: xr.Dataset, 102 | loadable_ds: xr.Dataset, 103 | loadable_variables: Iterable[Hashable] | None = None, 104 | ) -> xr.Dataset: 105 | """ 106 | Merge a fully virtual and the corresponding fully loadable dataset, keeping only `loadable_variables` from the latter (plus defaults needed for indexes). 107 | """ 108 | 109 | var_names_to_load: list[Hashable] 110 | 111 | if isinstance(loadable_variables, list): 112 | var_names_to_load = list(loadable_variables) 113 | elif loadable_variables is None: 114 | # If `loadable_variables` is None, then we have to explicitly match default 115 | # behaviour of xarray, i.e., load and create indexes only for dimension 116 | # coordinate variables. We already have all the indexes and variables 117 | # we should be keeping - we just need to distinguish them. 118 | var_names_to_load = [ 119 | name for name, var in loadable_ds.variables.items() if var.dims == (name,) 120 | ] 121 | else: 122 | raise ValueError( 123 | "loadable_variables must be an iterable of string variable names," 124 | f" or None, but got type {type(loadable_variables)}" 125 | ) 126 | 127 | # this will automatically keep any IndexVariables needed for loadable 1D coordinates 128 | loadable_var_names_to_drop = set(loadable_ds.variables).difference( 129 | var_names_to_load 130 | ) 131 | ds_loadable_to_keep = loadable_ds.drop_vars( 132 | loadable_var_names_to_drop, errors="ignore" 133 | ) 134 | 135 | ds_virtual_to_keep = fully_virtual_ds.drop_vars(var_names_to_load, errors="ignore") 136 | 137 | # we don't need `compat` or `join` kwargs here because there should be no variables with the same name in both datasets 138 | return xr.merge( 139 | [ 140 | ds_loadable_to_keep, 141 | ds_virtual_to_keep, 142 | ], 143 | ) 144 | 145 | 146 | # TODO this probably doesn't need to actually support indexes != {} 147 | def separate_coords( 148 | vars: Mapping[str, xr.Variable], 149 | indexes: MutableMapping[str, xr.Index], 150 | coord_names: Iterable[str] | None = None, 151 | ) -> tuple[dict[str, xr.Variable], xr.Coordinates]: 152 | """ 153 | Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. 154 | 155 | Currently requires this function as a workaround unless xarray PR #8124 is merged. 
156 | 157 | Will also preserve any loaded variables and indexes it is passed. 158 | """ 159 | 160 | if coord_names is None: 161 | coord_names = [] 162 | 163 | # split data and coordinate variables (promote dimension coordinates) 164 | data_vars = {} 165 | coord_vars: dict[ 166 | str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable 167 | ] = {} 168 | found_coord_names: set[str] = set() 169 | # Search through variable attributes for coordinate names 170 | for var in vars.values(): 171 | if "coordinates" in var.attrs: 172 | found_coord_names.update(var.attrs["coordinates"].split(" ")) 173 | for name, var in vars.items(): 174 | if name in coord_names or var.dims == (name,) or name in found_coord_names: 175 | # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 176 | if len(var.dims) == 1: 177 | dim1d, *_ = var.dims 178 | coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding) 179 | 180 | if isinstance(var, xr.IndexVariable): 181 | # unless variable actually already is a loaded IndexVariable, 182 | # in which case we need to keep it and add the corresponding indexes explicitly 183 | coord_vars[str(name)] = var 184 | # TODO this seems suspect - will it handle datetimes? 185 | indexes[name] = xarray.indexes.PandasIndex(var, dim1d) 186 | else: 187 | coord_vars[name] = var 188 | else: 189 | data_vars[name] = var 190 | 191 | coords = xr.Coordinates(coord_vars, indexes=indexes) 192 | 193 | return data_vars, coords 194 | --------------------------------------------------------------------------------
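A minimal usage sketch, not part of the repository tree above, assuming the virtualizarr package laid out here is importable in your environment: it exercises the chunk-grid helpers from virtualizarr/utils.py and the NumpyEncoder from virtualizarr/writers/kerchunk.py on their own. The attribute names used are made up for illustration.

import json

import numpy as np

from virtualizarr.utils import ceildiv, determine_chunk_grid_shape
from virtualizarr.writers.kerchunk import NumpyEncoder

# ceildiv rounds up: a 1000-element axis split into chunks of 300 needs 4 chunks
assert ceildiv(1000, 300) == 4

# determine_chunk_grid_shape applies ceildiv independently to each axis
assert determine_chunk_grid_shape(shape=(1000, 500), chunks=(300, 500)) == (4, 1)

# NumpyEncoder lets json.dumps handle NumPy scalars and arrays found in attributes
attrs = {"scale": np.float32(0.5), "valid_range": np.array([0, 100])}
print(json.dumps(attrs, cls=NumpyEncoder))  # {"scale": 0.5, "valid_range": [0, 100]}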
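Similarly, a sketch of driving the vendored _concurrent_map helper from virtualizarr/vendor/zarr/core/common.py directly; note it mirrors zarr-python's private API and may change. The `add` coroutine below is a hypothetical stand-in for real async work such as a byte-range fetch.

import asyncio

from virtualizarr.vendor.zarr.core.common import _concurrent_map


async def add(a: int, b: int) -> int:
    # placeholder coroutine standing in for an I/O-bound task
    await asyncio.sleep(0)
    return a + b


# each item is a tuple of positional arguments for `add`; `limit` caps concurrency via a semaphore
results = asyncio.run(_concurrent_map([(1, 2), (3, 4), (5, 6)], add, limit=2))
print(results)  # [3, 7, 11]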