├── docs ├── _static │ └── .gitkeep ├── collections.rst ├── index.rst ├── Makefile ├── api.rst └── conf.py ├── .coveragerc ├── MANIFEST.in ├── eosdis_store ├── version.py ├── __init__.py ├── dmrpp.py └── stores.py ├── presentation ├── requirements.txt ├── images │ ├── summary.png │ ├── output_12_2.png │ ├── output_15_1.png │ ├── output_18_2.png │ ├── output_3_1.png │ ├── output_5_1.png │ ├── output_7_1.png │ ├── output_9_2.png │ └── request-overhead.png ├── tutorial.ipynb └── background.md ├── .flake8 ├── requirements.txt ├── requirements-dev.txt ├── .github ├── release-drafter.yml └── workflows │ ├── draft-release.yml │ ├── tests.yml │ └── publish-release.yml ├── CHANGELOG.md ├── scripts └── mkdmrpp ├── .gitignore ├── Makefile ├── setup.py ├── tests ├── test_dmrpp.py ├── test_stores.py └── fixtures │ ├── 20200911000001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.zarr.json │ ├── 20200911000001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.nc.dmrpp │ └── 3B-HHR.MS.MRG.3IMERG.20051022-S000000-E002959.0000.V06B.zarr.json ├── README.rst └── LICENSE /docs/_static/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = . -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /eosdis_store/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.6" 2 | -------------------------------------------------------------------------------- /presentation/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=3.8.0 2 | h5py>=3.9.0 3 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 99 3 | ignore = F401, W503 4 | -------------------------------------------------------------------------------- /presentation/images/summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/summary.png -------------------------------------------------------------------------------- /presentation/images/output_12_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_12_2.png -------------------------------------------------------------------------------- /presentation/images/output_15_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_15_1.png -------------------------------------------------------------------------------- /presentation/images/output_18_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_18_2.png -------------------------------------------------------------------------------- 
/presentation/images/output_3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_3_1.png -------------------------------------------------------------------------------- /presentation/images/output_5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_5_1.png -------------------------------------------------------------------------------- /presentation/images/output_7_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_7_1.png -------------------------------------------------------------------------------- /presentation/images/output_9_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/output_9_2.png -------------------------------------------------------------------------------- /presentation/images/request-overhead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/zarr-eosdis-store/HEAD/presentation/images/request-overhead.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | CacheControl>=0.12.6 2 | requests>=2.23.0 3 | requests-futures>=1.0.0 4 | zarr>=2.7.1 5 | ipypb~=0.5 6 | xarray>=0.16 7 | numcodecs>=0.8.1 -------------------------------------------------------------------------------- /eosdis_store/__init__.py: -------------------------------------------------------------------------------- 1 | from .stores import EosdisStore, ConsolidatedChunkStore 2 | from .version import __version__ 3 | 4 | __all__ = ['EosdisStore', '__version__', 'version'] 5 | version = __version__ -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | setuptools >= 21.0.0 2 | pytest~=5.4 3 | flake8~=3.8 4 | safety >= 1.8.5 5 | coverage >= 4.5.4 6 | pygments ~= 2.9 7 | sphinx >= 3.2.1 8 | sphinx-rtd-theme >= 0.5.0 9 | recommonmark >= 0.7.1 10 | -------------------------------------------------------------------------------- /docs/collections.rst: -------------------------------------------------------------------------------- 1 | Compatible Collections 2 | ====================== 3 | 4 | Production collections coming soon 5 | 6 | Test Collections 7 | ---------------- 8 | 9 | The following collections are available for user testing and feedback. 10 | 11 | * Coming soon -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | .. 
toctree:: 4 | :maxdepth: 2 5 | :caption: Contents: 6 | 7 | api 8 | collections 9 | 10 | Indices and tables 11 | ================== 12 | 13 | * :ref:`genindex` 14 | * :ref:`modindex` 15 | * :ref:`search` 16 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | version-resolver: 2 | major: 3 | labels: 4 | - 'major' 5 | minor: 6 | labels: 7 | - 'minor' 8 | patch: 9 | labels: 10 | - 'patch' 11 | default: patch 12 | name-template: 'v$RESOLVED_VERSION' 13 | tag-template: 'v$RESOLVED_VERSION' 14 | template: | 15 | $CHANGES 16 | -------------------------------------------------------------------------------- /.github/workflows/draft-release.yml: -------------------------------------------------------------------------------- 1 | name: Draft Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Release Drafter 13 | uses: release-drafter/release-drafter@v5.12.1 14 | env: 15 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 16 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 5 | and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [v0.1.0] - 2020-xx-xx 10 | 11 | Initial Release 12 | 13 | [Unreleased]: https://github.com///compare/master...develop 14 | [v0.1.0]: https://github.com///tree/0.1.0 15 | -------------------------------------------------------------------------------- /scripts/mkdmrpp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | resolve_dir() { 6 | (builtin cd `dirname "${1/#~/$HOME}"`'/'`basename "${1/#~/$HOME}"` 2>/dev/null; if [ $? -eq 0 ]; then pwd; fi) 7 | } 8 | 9 | for var in "$@"; do 10 | filename=$(basename $var) 11 | dir=$(resolve_dir `dirname $var`) 12 | ext=h5 13 | echo $filename 14 | echo $dir 15 | 16 | cp $var $var.$ext 17 | docker run --rm -it -v $dir:/tmp --entrypoint get_dmrpp opendap/hyrax:snapshot -V -o /tmp/$filename.dmrpp /$filename.$ext 18 | rm -rf $dir/conf_* $dir/dmr_* $dir/hyrax_ux 19 | rm $var.$ext 20 | done 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================= 3 | 4 | .. automodule:: eosdis_store 5 | :members: 6 | :special-members: 7 | :show-inheritance: 8 | 9 | 10 | Submodules 11 | ---------- 12 | 13 | eosdis_store.common module 14 | -------------------------- 15 | 16 | .. automodule:: eosdis_store.common 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | eosdis_store.dmrpp module 22 | ------------------------- 23 | 24 | .. automodule:: eosdis_store.dmrpp 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | eosdis_store.stores module 30 | -------------------------- 31 | 32 | .. automodule:: eosdis_store.stores 33 | :members: 34 | :special-members: 35 | :show-inheritance: 36 | 37 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | # More conservative about duplicate tests due to tests accessing real files 4 | on: [pull_request] 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.8] 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - uses: extractions/netrc@v1 22 | with: 23 | machine: uat.urs.earthdata.nasa.gov 24 | username: ${{ secrets.EDL_USER }} 25 | password: ${{ secrets.EDL_PASSWORD }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | make install 30 | 31 | - name: Tests 32 | run: | 33 | make ci 34 | 35 | - name: Archive code coverage results 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: code-coverage-report 39 | path: htmlcov/* 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | tmp/ 3 | .DS_Store 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | docs/_build 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Jupyter Notebook 60 | .ipynb_checkpoints 61 | 62 | # IPython 63 | profile_default/ 64 | ipython_config.py 65 | 66 | # pyenv 67 | .python-version 68 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: venv-setup pyenv-setup install install-examples clean examples lint test test-watch ci docs 2 | .SILENT: virtualenv 3 | 4 | VERSION ?= $(shell git describe --tags | sed 's/-/\+/' | sed 's/-/\./g') 5 | REPO ?= https://upload.pypi.org/legacy/ 6 | REPO_USER ?= __token__ 7 | REPO_PASS ?= unset 8 | 9 | venv-setup: 10 | python -m venv .venv 11 | 12 | pyenv-setup: 13 | if ! type pyenv > /dev/null; \ 14 | then \ 15 | echo "\nUnable to create virtualenv: pyenv not found. Please install pyenv & pyenv-virtualenv."; \ 16 | echo " See:"; \ 17 | echo " https://github.com/pyenv/pyenv"; \ 18 | echo " https://github.com/pyenv/pyenv-virtualenv"; \ 19 | exit; \ 20 | else \ 21 | pyenv install 3.9.1; \ 22 | pyenv virtualenv 3.9.1 zarr-eosdis-store; \ 23 | pyenv activate zarr-eosdis-store; \ 24 | fi 25 | 26 | clean: 27 | coverage erase 28 | rm -rf htmlcov 29 | rm -rf build dist *.egg-info || true 30 | 31 | clean-docs: 32 | cd docs && $(MAKE) clean 33 | 34 | install: 35 | python -m pip install --upgrade pip 36 | pip install -r requirements.txt -r requirements-dev.txt 37 | 38 | lint: 39 | flake8 eosdis_store --show-source --statistics 40 | 41 | test: 42 | coverage run -m pytest 43 | 44 | ci: test 45 | coverage html 46 | 47 | build: clean 48 | sed -i.bak "s/__version__ .*/__version__ = \"$(VERSION)\"/" eosdis_store/version.py && rm eosdis_store/version.py.bak 49 | python -m pip install --upgrade --quiet setuptools wheel twine 50 | python setup.py --quiet sdist bdist_wheel 51 | 52 | publish: build 53 | python -m twine check dist/* 54 | python -m twine upload --username "$(REPO_USER)" --password "$(REPO_PASS)" --repository-url "$(REPO)" dist/* 55 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: -------------------------------------------------------------------------------- 1 | name: Publish Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | with: 13 | fetch-depth: '0' 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.8' 17 | - shell: bash 18 | env: 19 | VERSION_TAG: ${{ github.event.release.tag_name }} 20 | BRANCH: ${{ github.event.release.target_commitish }} 21 | run: | 22 | make install 23 | VERSION=$(echo "${VERSION_TAG}" | cut -c2-) make build 24 | 25 | # Setup git 26 | # https://api.github.com/users/github-actions%5Bbot%5D 27 | git config --global user.name "github-actions[bot]" 28 | git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" 29 | 30 | # Commit and push updated release files 31 | git checkout -b "${BRANCH}" 32 | git add . 
33 | git commit -m "Update release version to ${VERSION_TAG}" 34 | git push origin "${BRANCH}" 35 | 36 | git tag --force "${VERSION_TAG}" 37 | git push --force origin "${VERSION_TAG}" 38 | - name: upload dists 39 | uses: actions/upload-artifact@v4 40 | with: 41 | name: release-dists 42 | path: dist/ 43 | 44 | pypi-publish: 45 | runs-on: ubuntu-latest 46 | needs: 47 | - build 48 | permissions: 49 | id-token: write 50 | 51 | steps: 52 | - name: Retrieve release distributions 53 | uses: actions/download-artifact@v4 54 | with: 55 | name: release-dists 56 | path: dist/ 57 | 58 | - name: Publish release distributions to PyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, find_packages 3 | from imp import load_source 4 | from os import path 5 | import io 6 | 7 | # use README for setup desciption 8 | with open("README.rst", "r") as f: 9 | long_description = f.read() 10 | 11 | # get version of package 12 | __version__ = load_source('eosdis_store.version', 'eosdis_store/version.py').__version__ 13 | 14 | # get the dependencies and installs 15 | with io.open(path.join(path.abspath(path.dirname(__file__)), 'requirements.txt'), encoding='utf-8') as f: 16 | all_reqs = f.read().split('\n') 17 | 18 | # remove direct installs from github 19 | install_requires = [x.strip() for x in all_reqs if 'git+' not in x] 20 | dependency_links = [x.strip().replace('git+', '') for x in all_reqs if 'git+' in x] 21 | 22 | # get dev dependencies 23 | with io.open(path.join(path.abspath(path.dirname(__file__)), 'requirements-dev.txt'), encoding='utf-8') as f: 24 | dev_reqs = f.read().split('\n') 25 | 26 | 27 | setup( 28 | name="zarr-eosdis-store", 29 | version=__version__, 30 | author="Patrick Quinn, Matthew Hanson", 31 | author_email="patrick@patrickquinn.net", 32 | description="Zarr Store class for working with EOSDIS cloud data", 33 | long_description=long_description, 34 | long_description_content_type="text/markdown", 35 | url="https://github.com/nasa/zarr-eosdis-store", 36 | packages=find_packages(exclude=['docs', 'tests*']), 37 | classifiers=[ 38 | 'Development Status :: 7 - Inactive', 39 | 'Intended Audience :: Developers', 40 | 'Intended Audience :: Science/Research', 41 | 'License :: OSI Approved :: Apache Software License', 42 | 'Programming Language :: Python :: 3', 43 | 'Programming Language :: Python :: 3.8', 44 | 'Programming Language :: Python :: 3 :: Only', 45 | ], 46 | python_requires='>=3.8', 47 | install_requires=install_requires, 48 | dependency_links=dependency_links, 49 | extras_require={ 50 | 'dev': dev_reqs # Run `pip install -e .[dev]` to install dev dependencies 51 | }, 52 | ) 53 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../')) 16 | 17 | import sphinx_rtd_theme 18 | 19 | from importlib.machinery import SourceFileLoader 20 | 21 | module_path = os.path.abspath('../eosdis_store/version.py') 22 | module = SourceFileLoader('eosdis_store.version', module_path).load_module(None) 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = 'zarr-eosdis-store' 27 | copyright = '2020 United States Government as represented by the Administrator of the National Aeronautics and Space Administration. All Rights Reserved.' 28 | author = 'Patrick Quinn, Matthew Hanson' 29 | 30 | # The full version, including alpha/beta/rc tags 31 | release = module.__version__ 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | extensions = [ 39 | 'sphinx.ext.autodoc', 40 | 'sphinx_rtd_theme', 41 | 'recommonmark' 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 51 | 52 | source_suffix = { 53 | '.rst': 'restructuredtext', 54 | '.txt': 'markdown', 55 | '.md': 'markdown', 56 | } 57 | 58 | # -- Options for HTML output ------------------------------------------------- 59 | 60 | # The theme to use for HTML and HTML Help pages. See the documentation for 61 | # a list of builtin themes. 62 | # 63 | html_theme = 'sphinx_rtd_theme' 64 | 65 | # Add any paths that contain custom static files (such as style sheets) here, 66 | # relative to this directory. They are copied after the builtin static files, 67 | # so a file named "default.css" will overwrite the builtin "default.css". 
68 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /tests/test_dmrpp.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import unittest 4 | from xml.etree import ElementTree 5 | 6 | import requests 7 | 8 | import eosdis_store.dmrpp as dmr 9 | 10 | 11 | testpath = os.path.dirname(__file__) 12 | 13 | s3_folder = 'https://harmony.uat.earthdata.nasa.gov/service-results/harmony-uat-eedtest-data/zarr-store' 14 | 15 | class Test(unittest.TestCase): 16 | 17 | test_files = [ 18 | f'{s3_folder}/f16_ssmis_20051022v7.nc.dmrpp', 19 | f'{s3_folder}/3B-HHR.MS.MRG.3IMERG.20051022-S000000-E002959.0000.V06B.HDF5.dmrpp', 20 | # MODIS data 21 | 'https://archive.podaac.uat.earthdata.nasa.gov/podaac-uat-cumulus-protected/MODIS_A-JPL-L2P-v2019.0/20200911000001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.nc.dmrpp', 22 | ] 23 | 24 | @classmethod 25 | def get_test_xml(cls, idx=0): 26 | dmrpp = requests.get(cls.test_files[idx]).text 27 | tree = ElementTree.fromstring(dmrpp) 28 | return tree 29 | 30 | def test_find_child(self): 31 | tree = self.get_test_xml() 32 | node = dmr.find_child(tree, 'HDF5_GLOBAL') 33 | assert(node.attrib['name'] == 'HDF5_GLOBAL') 34 | 35 | def test_get_attribute_values(self): 36 | xml = 'T' 37 | node = ElementTree.fromstring(xml) 38 | vals = dmr.get_attribute_values(node) 39 | assert(vals == 'T') 40 | # TODO - test lists 41 | 42 | def test_get_attributes(self): 43 | tree = self.get_test_xml() 44 | node = dmr.find_child(tree, 'HDF5_GLOBAL') 45 | arr = dmr.get_attributes(node) 46 | assert(arr['chunksize'] == '90') 47 | assert(arr['Conventions'] == 'CF-1.6') 48 | assert(arr['numberofpasses'] == '2') 49 | 50 | def test_get_dimensions(self): 51 | tree = self.get_test_xml() 52 | dims = dmr.get_dimensions(tree) 53 | assert(len(dims) == 3) 54 | assert('/time' in dims.keys()) 55 | assert('/latitude' in dims.keys()) 56 | assert('/longitude' in dims.keys()) 57 | assert(dims['/latitude']['size'] == 720) 58 | 59 | def test_chunks_to_zarr_single(self): 60 | tree = self.get_test_xml() 61 | node = tree.find(".//d:Float32[@name='latitude']/dpp:chunks", dmr.NS) 62 | chunks = dmr.chunks_to_zarr(node) 63 | assert('zarray' in chunks) 64 | assert('zchunkstore' in chunks) 65 | assert(chunks['zchunkstore']['0']['offset'] == 106784) 66 | assert(chunks['zchunkstore']['0']['size'] == 2880) 67 | 68 | def test_chunks_to_zarr_multi(self): 69 | tree = self.get_test_xml() 70 | node = tree.find(".//d:Int16[@name='sst_dtime']/dpp:chunks", dmr.NS) 71 | chunks = dmr.chunks_to_zarr(node) 72 | assert('zarray' in chunks) 73 | assert('zchunkstore' in chunks) 74 | assert(len(chunks['zchunkstore']) == 128) 75 | assert(chunks['zchunkstore']['0.7.15']['size'] == 4324) 76 | 77 | def test_array_to_zarr(self): 78 | tree = self.get_test_xml() 79 | dims = dmr.get_dimensions(tree) 80 | assert(dims['/longitude']['size'] == 1440) 81 | # test on wind_speed array 82 | node = tree.find(".//d:Int16[@name='wind_speed']", dmr.NS) 83 | arr = dmr.array_to_zarr(node, dims) 84 | assert('wind_speed/.zarray' in arr) 85 | assert('wind_speed/.zattrs' in arr) 86 | assert('wind_speed/.zchunkstore' in arr) 87 | assert(arr['wind_speed/.zattrs']['_ARRAY_DIMENSIONS'] == ['time', 'latitude', 'longitude']) 88 | assert(arr['wind_speed/.zchunkstore']['0.6.11']['size'] == 888) 89 | # test on latitude array 90 | node = tree.find(".//d:Float32[@name='latitude']", dmr.NS) 91 | arr = dmr.array_to_zarr(node, dims) 92 | 
assert('latitude/.zarray' in arr) 93 | assert('latitude/.zattrs' in arr) 94 | assert('latitude/.zchunkstore' in arr) 95 | assert(arr['latitude/.zattrs']['_ARRAY_DIMENSIONS'] == ['latitude']) 96 | assert(arr['latitude/.zchunkstore']['0']['size'] == 2880) 97 | 98 | def test_to_zarr(self): 99 | tree = self.get_test_xml() 100 | zarr = dmr.to_zarr(tree) 101 | with open(os.path.join(testpath, 'fixtures', 'f16_ssmis_20051022v7.zarr.json')) as f: 102 | fixture = json.loads(f.read()) 103 | json1 = json.dumps(fixture, sort_keys=True) 104 | json2 = json.dumps(zarr, sort_keys=True) 105 | assert(json1 == json2) 106 | 107 | def test_to_zarr_more_examples(self): 108 | for i in range(2, len(self.test_files)): 109 | tree = self.get_test_xml(i) 110 | zarr = dmr.to_zarr(tree) 111 | 112 | bname = os.path.splitext(os.path.basename(self.test_files[i].replace('.dmrpp', '')))[0] 113 | with open(os.path.join(testpath, 'fixtures', f"{bname}.zarr.json")) as f: 114 | fixture = json.loads(f.read()) 115 | json1 = json.dumps(fixture, sort_keys=True) 116 | json2 = json.dumps(zarr, sort_keys=True) 117 | assert(json1 == json2) 118 | 119 | def test_deflate_shuffle(self): 120 | filename = '20210715090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc.dmrpp' 121 | with open(os.path.join(testpath, 'fixtures', filename)) as f: 122 | dmrpp = f.read() 123 | tree = ElementTree.fromstring(dmrpp) 124 | zarr = dmr.to_zarr(tree) 125 | attributes = zarr['analysed_sst/.zarray'] 126 | expected = { 127 | 'zarr_format': 2, 128 | 'filters': [{'id': 'shuffle', 'elementsize': 2}], 129 | 'order': 'C', 130 | 'dtype': '`_ instead. 5 | 6 | ---- 7 | 8 | .. image:: https://img.shields.io/badge/maintenance-no--longer--maintained-red 9 | :alt: No Longer Maintained 10 | 11 | zarr-eosdis-store 12 | ================= 13 | 14 | The zarr-eosdis-store library allows NASA EOSDIS Collections to be accessed efficiently 15 | by the `Zarr Python library <https://zarr.readthedocs.io>`_, provided they 16 | have a sidecar DMR++ metadata file generated. 17 | 18 | Installation 19 | ============ 20 | 21 | This module requires Python 3.8 or greater:: 22 | 23 | $ python --version 24 | Python 3.8.2 25 | 26 | Install from PyPI:: 27 | 28 | $ pip install zarr-eosdis-store 29 | 30 | To install the latest development version:: 31 | 32 | $ pip install git+https://github.com/nasa/zarr-eosdis-store.git@main#egg=zarr-eosdis-store 33 | 34 | Earthdata Login 35 | =============== 36 | 37 | To access EOSDIS data, you need to sign in with a free NASA Earthdata Login account, which you can obtain at 38 | `<https://urs.earthdata.nasa.gov>`_. 39 | 40 | Once you have an account, you will need to add your credentials to your ``~/.netrc`` file:: 41 | 42 | machine urs.earthdata.nasa.gov login YOUR_USERNAME password YOUR_PASSWORD 43 | 44 | If you are accessing test data, you will need to use an account from the Earthdata Login test system at 45 | `<https://uat.urs.earthdata.nasa.gov>`_ instead, adding a corresponding line to your ``~/.netrc`` file:: 46 | 47 | machine uat.urs.earthdata.nasa.gov login YOUR_USERNAME password YOUR_PASSWORD 48 | 49 | 50 | Usage 51 | ===== 52 | 53 | To use the library, simply instantiate ``eosdis_store.EosdisStore`` with the URL to the data file you would 54 | like to access, pass it to the Zarr library as you would with any other store, and use the Zarr API as with any 55 | other read-only Zarr file. Note: the URL to the data file will typically end with an HDF5 or NetCDF4 extension, 56 | not .zarr. 57 | 58 | ..
code-block:: python 59 | 60 | from eosdis_store import EosdisStore 61 | import zarr 62 | 63 | # Assumes you have set up .netrc with your Earthdata Login information 64 | f = zarr.open(EosdisStore('https://example.com/your/data/file.nc4')) 65 | 66 | # Read metadata and data from f using the Zarr API 67 | print(f['parameter_name'][0:10]) 68 | 69 | If the data has _FillValue (to flag nodata), scale_factor, or add_offset set (defined in metadata using CF conventions), 70 | they can be retrieved from the parameter attributes. 71 | 72 | .. code-block:: python 73 | 74 | import numpy as np 75 | 76 | scale_factor = f['parameter_name'].attrs['scale_factor'] 77 | add_offset = f['parameter_name'].attrs['add_offset'] 78 | nodata = f['parameter_name'].attrs['_FillValue'] 79 | 80 | arr = f['parameter_name'][:] * scale_factor + add_offset 81 | 82 | nodata_locs = np.where(arr == nodata) 83 | 84 | 85 | A better way to handle these is to use XArray. Rather than reading the data immediately when a slice is requested, XArray 86 | defers the read until the data is actually accessed. With the Zarr backend to XArray, the scale and offset can be recorded so that 87 | they are applied when the data is accessed. This is more efficient if the data is going to be used in other operations. 88 | 89 | The scale_factor and add_offset will be used if specified in the NetCDF/HDF5 file. 90 | 91 | .. code-block:: python 92 | 93 | import xarray 94 | 95 | store = EosdisStore('https://example.com/your/data/file.nc4') 96 | 97 | f = xarray.open_zarr(store) 98 | 99 | # the data is not read yet 100 | xa = f['parameter_name'] 101 | 102 | # convert to numpy array, data is read 103 | arr = xa.values 104 | 105 | The resulting array will have had scale and offset applied, and any element that is equal to the _FillValue attribute will be 106 | set to numpy `nan`. To use XArray without applying the scale and offset or setting the nodata to `nan`, set the `mask_and_scale` 107 | keyword of xarray.open_zarr to False: 108 | 109 | .. code-block:: python 110 | 111 | store = EosdisStore('https://example.com/your/data/file.nc4') 112 | 113 | f = xarray.open_zarr(store, mask_and_scale=False) 114 | 115 | 116 | Technical Summary 117 | ================= 118 | 119 | We make use of a technique to read NetCDF4 and some HDF5 files that was prototyped by The HDF Group and USGS, described 120 | `here <https://medium.com/pangeo/cloud-performant-reading-of-netcdf4-hdf5-data-using-the-zarr-library-1a95c5c92314>`_. 121 | 122 | To allow the technique to work with EOSDIS data, we have extended it and optimized access in the following key ways: 123 | 124 | * The ``EosdisStore`` reads a DMR++ file generated by OPeNDAP to present its metadata and determine byte offsets to the 125 | Zarr library. By reusing these, we avoid needing to generate new metadata sidecar files to support new data. 126 | 127 | * The store uses HTTPS and authenticates with a ``.netrc`` entry, rather than the S3 API, making it compatible with 128 | EOSDIS access patterns and requirements. 129 | 130 | * The store caches redirect URLs for a period of time set by the Cache-Control header. Doing this avoids the overhead 131 | of repeated redirects when accessing parts of files. 132 | 133 | * The store uses a parallel API that allows it to make more efficient access optimizations (sketched below): 134 | 135 | * When the Zarr library accesses data that requires reading multiple near-sequential bytes in the file, the store combines 136 | these smaller requests into a single larger request. 137 | 138 | * After an initial request to cache any authentication and redirect information, the store runs subsequent requests in 139 | parallel.
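The range-combining behavior described in the bullets above can be pictured with a small, self-contained sketch. This is illustrative only, not the library's implementation; the function name ``coalesce_ranges`` and the 10,000-byte gap threshold are assumptions made for the example.

.. code-block:: python

    def coalesce_ranges(chunks, max_gap=10_000):
        """Group (key, offset, size) chunk requests into larger byte ranges.

        Chunks that begin within ``max_gap`` bytes of the end of the previous
        chunk are served by a single HTTP Range request.
        """
        groups = []
        current = None  # (start, end, keys)
        for key, offset, size in sorted(chunks, key=lambda c: c[1]):
            if current and offset - current[1] <= max_gap:
                # Close enough to the previous chunk: extend the current range
                current = (current[0], max(current[1], offset + size), current[2] + [key])
            else:
                # Too far away: finish the current range and start a new one
                if current:
                    groups.append(current)
                current = (offset, offset + size, [key])
        if current:
            groups.append(current)
        return groups

    # Two nearly adjacent chunks collapse into one request; a distant chunk gets its own.
    ranges = [('a/0.0', 0, 100), ('a/0.1', 120, 100), ('a/0.2', 50_000, 100)]
    print(coalesce_ranges(ranges))
    # [(0, 220, ['a/0.0', 'a/0.1']), (50000, 50100, ['a/0.2'])]

The store applies this kind of grouping to the (key, offset, size) tuples it derives from the DMR++ chunk index before issuing its parallel range requests.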
140 | 141 | Development 142 | =========== 143 | 144 | Clone the repository, then ``pip install`` its dependencies:: 145 | 146 | pip install -r requirements.txt 147 | pip install -r requirements-dev.txt 148 | 149 | To check code coverage and run tests:: 150 | 151 | coverage run -m pytest 152 | 153 | To check coding style:: 154 | 155 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 156 | 157 | To build documentation, generated at ``docs/_build/html/index.html``:: 158 | 159 | cd docs && make html 160 | -------------------------------------------------------------------------------- /tests/test_stores.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import unittest 4 | from xml.etree import ElementTree 5 | 6 | import numpy as np 7 | import requests 8 | import xarray 9 | import zarr 10 | 11 | from eosdis_store import EosdisStore, ConsolidatedChunkStore 12 | 13 | testpath = os.path.dirname(__file__) 14 | 15 | s3_folder = 'https://harmony.uat.earthdata.nasa.gov/service-results/harmony-uat-eedtest-data/zarr-store' 16 | 17 | fixtures = [ 18 | { 19 | "url": f"{s3_folder}/f16_ssmis_20051022v7.nc", 20 | "aoi": (0, slice(400, 549, None), slice(1040, 1261, None)) 21 | }, 22 | { 23 | "url": f"{s3_folder}/3B-HHR.MS.MRG.3IMERG.20051022-S000000-E002959.0000.V06B.HDF5", 24 | "aoi": (0, slice(800, 1351, None), slice(1000, 1371, None)) 25 | }, 26 | { 27 | "url": "https://archive.podaac.uat.earthdata.nasa.gov/podaac-uat-cumulus-protected/MODIS_A-JPL-L2P-v2019.0/20200911000001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.nc", 28 | "aoi": (0, slice(1800, 2000, None), slice(100, 400, None)) 29 | }, 30 | { 31 | "url": "https://archive.podaac.uat.earthdata.nasa.gov/podaac-uat-cumulus-protected/MODIS_A-JPL-L2P-v2019.0/20200911004001-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc", 32 | "aoi": (0, slice(1800, 2000, None), slice(100, 400, None)) 33 | } 34 | ] 35 | 36 | 37 | def open_eosdis_store(idx=0): 38 | return EosdisStore(fixtures[idx]["url"], fixtures[idx]["url"] + ".dmrpp") 39 | 40 | 41 | class TestZarr(unittest.TestCase): 42 | 43 | @classmethod 44 | def get_test_xml(cls, idx=0): 45 | url = fixtures[idx]["url"] 46 | dmrpp = requests.get(url + '.dmrpp').text 47 | tree = ElementTree.fromstring(dmrpp) 48 | return tree 49 | 50 | def test_eosdis_store_metadata(self): 51 | store = open_eosdis_store() 52 | keys = list(store.keys()) 53 | assert(len(keys) == 26) 54 | # test default dmr_url 55 | store = EosdisStore(fixtures[0]["url"]) 56 | keys = list(store.keys()) 57 | assert(len(keys) == 26) 58 | 59 | def test_eosdis_store_open(self): 60 | store = zarr.open(open_eosdis_store()) 61 | arrays = list(store.arrays()) 62 | assert(len(arrays) == 8) 63 | assert(arrays[0][0] == 'atmosphere_cloud_liquid_water_content') 64 | arr = arrays[0][1] 65 | assert(type(arr) == zarr.core.Array) 66 | assert(arr.name == '/atmosphere_cloud_liquid_water_content') 67 | assert(arr.shape == (2, 720, 1440)) 68 | 69 | def test_eosdis_store_read(self): 70 | store = zarr.open(open_eosdis_store()) 71 | arr = store['wind_speed'][fixtures[0]["aoi"]] 72 | assert(arr.shape == (149, 221)) 73 | assert(arr[0][0] == 19) 74 | assert(arr.mean() == 169.29050381123022) 75 | 76 | def test_eosdis_store_getranges_combined(self): 77 | store = open_eosdis_store() 78 | ranges = [ 79 | ('wind_speed/0.4.11', 768280, 6830), 80 | ('wind_speed/0.4.12', 775112, 5759) 81 | ] 82 | result = store._getranges(ranges) 83 | assert(len(result) == 2) 84 | assert(len(store.responses) == 1) 85 | 
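# Added commentary, not part of the original test file: in the combined-ranges test
# above, the second chunk ('wind_speed/0.4.12') starts at byte 775112, only 2 bytes
# past the end of the first chunk (768280 + 6830 = 775110), so _getranges is expected
# to coalesce both chunks into a single HTTP range request (one entry in
# store.responses). In the split test below, the second offset (785112) leaves a gap
# of roughly 10,000 bytes, which exceeds the store's maximum-gap threshold, so two
# separate requests are expected.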
86 | def test_eosdis_store_getranges_split(self): 87 | store = open_eosdis_store() 88 | ranges = [ 89 | ('wind_speed/0.4.11', 768280, 6830), 90 | ('wind_speed/0.4.12', 785112, 5759) 91 | ] 92 | result = store._getranges(ranges) 93 | assert(len(result) == 2) 94 | assert(len(store.responses) == 2) 95 | 96 | def test_eosdis_store_parallel_reads(self): 97 | store = zarr.open(open_eosdis_store()) 98 | arr = store['wind_speed'][fixtures[0]["aoi"]] 99 | responses = store.store.responses 100 | end_time = responses[0].start + responses[1].elapsed 101 | for r in responses[1:]: 102 | assert(r.start < end_time) 103 | 104 | 105 | class TestXArray(unittest.TestCase): 106 | 107 | @classmethod 108 | def _setUpClass(cls): 109 | store = open_eosdis_store(2) 110 | # use patched zarr metadata file as workaround to incorrect DMR++ files showing Int8 datasets as Int16 111 | fix = fixtures[2] 112 | bname = f"{os.path.splitext(os.path.basename(fix['url']))[0]}.zarr.json" 113 | with open(os.path.join(testpath, 'fixtures', bname)) as f: 114 | meta = json.loads(f.read()) 115 | 116 | store = ConsolidatedChunkStore(meta, fix["url"]) 117 | 118 | cls.xa_noscale = xarray.open_zarr(store, mask_and_scale=False) 119 | cls.xa = xarray.open_zarr(store, mask_and_scale=True) 120 | 121 | def test_scale_offset(self): 122 | store = open_eosdis_store(0) 123 | var = 'wind_speed' 124 | 125 | xa_noscale = xarray.open_zarr(store, mask_and_scale=False) 126 | xa = xarray.open_zarr(store, mask_and_scale=True) 127 | 128 | # get values without scale and offset 129 | wv = xa_noscale[var] 130 | assert(hasattr(wv, "scale_factor")) 131 | assert(hasattr(wv, "add_offset")) 132 | arr = wv[fixtures[0]["aoi"]] 133 | mean = arr.mean().item() 134 | scale_factor = wv.scale_factor 135 | add_offset = wv.add_offset 136 | 137 | # test with scale and offset 138 | wv = xa[var] 139 | assert(not hasattr(wv, "scale_factor")) 140 | assert(not hasattr(wv, "add_offset")) 141 | arr = wv[fixtures[0]["aoi"]] 142 | 143 | self.assertAlmostEqual(arr.mean().item(), mean * scale_factor + add_offset, places=5) 144 | 145 | def test_fillvalue(self): 146 | # use patched zarr metadata file as workaround to incorrect DMR++ files showing Int8 datasets as Int16 147 | fix = fixtures[2] 148 | bname = f"{os.path.splitext(os.path.basename(fix['url']))[0]}.zarr.json" 149 | with open(os.path.join(testpath, 'fixtures', bname)) as f: 150 | meta = json.loads(f.read()) 151 | 152 | store = ConsolidatedChunkStore(meta, fix["url"]) 153 | 154 | xa = xarray.open_zarr(store, mask_and_scale=True) 155 | 156 | var = 'sea_surface_temperature' 157 | 158 | # do not apply mask - check that fill value exists and calculate mean excluding them 159 | xa_nofill = xarray.open_zarr(store, mask_and_scale=False) 160 | arr = xa_nofill[var][fix["aoi"]].values 161 | locs = np.where(arr == xa_nofill['sea_surface_temperature']._FillValue) 162 | assert(len(locs[0]) > 0) 163 | mean = arr[arr != xa_nofill['sea_surface_temperature']._FillValue].mean() 164 | mean = mean * xa_nofill['sea_surface_temperature'].scale_factor + xa_nofill['sea_surface_temperature'].add_offset 165 | 166 | # apply mask and use numpy nanmean function to calculate mean 167 | arr2 = xa[var][fix["aoi"]].values 168 | mean2 = np.nanmean(arr2) 169 | 170 | self.assertAlmostEqual(mean, mean2, places=4) 171 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2019-2021 United States Government as represented by the 
Administrator of the National Aeronautics and Space Administration. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 4 | You may obtain a copy of the License at 5 | 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | 8 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 10 | 11 | --- 12 | 13 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 14 | 15 | 1. Definitions. 16 | 17 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 18 | 19 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 20 | 21 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 26 | 27 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 28 | 29 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 30 | 31 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 32 | 33 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 34 | 35 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 36 | 37 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 38 | 39 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 40 | 41 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 42 | 43 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 44 | You must cause any modified files to carry prominent notices stating that You changed the files; and 45 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 46 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 47 | 48 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 49 | 50 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 51 | 52 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 53 | 54 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 55 | 56 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 57 | 58 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
59 | 60 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /eosdis_store/dmrpp.py: -------------------------------------------------------------------------------- 1 | __all__ = ['to_zarr'] 2 | 3 | import logging 4 | import os.path as op 5 | import requests 6 | import xml.etree.ElementTree as ElementTree 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | # Environment variables 11 | 12 | """ Namespaces used in DMRPP XML Files """ 13 | NS = { 14 | 'dpp': 'http://xml.opendap.org/dap/dmrpp/1.0.0#', 15 | 'd': 'http://xml.opendap.org/ns/DAP/4.0#' 16 | } 17 | 18 | """ Default compression level """ 19 | UNKNOWN_COMPRESSION_LEVEL = 4 20 | 21 | """ Data type mappings """ 22 | TYPE_INFO = { 23 | 'Int8': (int, '|i1'), 24 | 'Int16': (int, ' max_gap + 1: 239 | logger.debug("Starting new range due to gap of %d bytes" % (offset - prev_offset,)) 240 | result.append((group_offset, prev_offset - group_offset, group)) 241 | group_offset = offset 242 | group = [] 243 | group.append((key, offset - group_offset, size)) 244 | prev_offset = offset + size 245 | result.append((group_offset, prev_offset - group_offset, group)) 246 | return result 247 | 248 | 249 | class EosdisStore(ConsolidatedChunkStore): 250 | """Store representing a HDF5/NetCDF file accessed over HTTP with zarr metadata derived from a DMR++ file 251 | 252 | Args: 253 | ConsolidatedChunkStore (ConsolidatedChunkStore): Parent class is a store for doing byte range reads 254 | """ 255 | def __init__(self, data_url, dmr_url=None): 256 | """Construct the store 257 | 258 | Args: 259 | data_url (String): The URL of the remote data file which should be accessed through Zarr 260 | dmr_url (String): Optional URL to a DMR++ file describing metadata and byte offsets of the 261 | given file. If not provided, the URL is assumed to be the original file with a .dmrpp suffix 262 | """ 263 | if dmr_url is None: 264 | dmr_url = data_url + '.dmrpp' 265 | dmrpp = requests.get(dmr_url).text 266 | tree = ElementTree.fromstring(dmrpp) 267 | meta_store = to_zarr(tree) 268 | super(EosdisStore, self).__init__(meta_store, data_url) 269 | -------------------------------------------------------------------------------- /presentation/tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EOSDIS Zarr Store: Spatial & Variable Subsetting without Services\n", 8 | "\n", 9 | "## Goal\n", 10 | "\n", 11 | "Produce a library that would let the Zarr Python library read EOSDIS cloud holdings efficiently, without requiring any modifications to our archive. This has the potential to expand use to new communities and tools, allow more efficient access both in place and outside of the cloud, and therefore save money for the archive as well as time for users.\n", 12 | "\n", 13 | "## Background\n", 14 | "\n", 15 | "This is a demo of a data store I've been working on, building off of the work of a few others. Adapting the Zarr library (which is meant to read cloud-optimized data stores) read NetCDF4 / HDF 5 files was discussed as a possibility at Summer ESIP last year. Rich Signell from USGS worked with HDF Group to get [a prototype](https://medium.com/pangeo/cloud-performant-reading-of-netcdf4-hdf5-data-using-the-zarr-library-1a95c5c92314). The resulting code showed no performance degradation over an equivalent native Zarr store. 
This adaptation requires an up-front generation of metadata containing data attributes and byte offsets to allow efficient reads.\n", 16 | "\n", 17 | "## What I did\n", 18 | "\n", 19 | "I recognized that the DMR++ files OPeNDAP / GHRC have started generating on ingest in PI 20.1 contain nearly equivalent information to that required by the Zarr library. Hearing that small chunk sizes (chunks are a region of data that can / must be read all at once) caused issues for some NetCDF files and required re-chunking (i.e. altering the original data file), I further looked at mitigating that issue to avoid having to re-host data. In picking through the Zarr code, I came across a for loop that, if changed, would allow a set of optimizations that would greatly improve performance. I advocated for this in the Zarr tracker and what we need is now being planned.\n", 20 | "![png](summary.png)\n", 21 | "\n", 22 | "In terms of actual code, I produced a Python library, eosdis-zarr-store, that:\n", 23 | "\n", 24 | "1. Implements the Zarr storage API in a natural and familiar way to Zarr developers\n", 25 | "2. Sets up HTTP access to allow EDL credential handshaking and, importantly, caching of redirect URLs\n", 26 | "3. Adapts our underlying data files and DMR++ files generated on ingest to a Zarr-compatible API\n", 27 | "4. Implements optimizations using the API worked out with the Zarr community to make fewer total data reads and do them in parallel where possible\n", 28 | "\n", 29 | "The remainder of this notebook contains results and conclusions.\n", 30 | "\n", 31 | "## How to use it\n", 32 | "\n", 33 | "In the eosdis-zarr-store directory run `pip install -e .`. Obtain or stage an HDF5 (NetCDF4) file along with a DMR++ file with identical URL + \".dmrpp\"; you can run \"mkdmrpp\" in this folder to produce DMR++ files. Then:\n", 34 | "\n", 35 | "```python\n", 36 | "from eosdis_zarr_store import Store\n", 37 | "import zarr\n", 38 | "\n", 39 | "f = zarr.open(Store(data_file_url))\n", 40 | "# Manipulate f as any Zarr store (see examples below)\n", 41 | "```\n", 42 | "\n", 43 | "The URLs in this notebook will not be available for general use, since one example produces 500 MB of egress for benchmarking.\n", 44 | "\n", 45 | "## Helpers and Constants (You can skip this)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Helpers to draw stuff, generate URLs, and translate bounding boxes to array indices, \n", 55 | "# wholly ignoring all the helpful attributes present in the HDF and Zarr metadata\n", 56 | "# Please don't judge me on this mess. 
It's not called \"Clean code fest\"\n", 57 | "\n", 58 | "from matplotlib import pyplot as plt\n", 59 | "from matplotlib import colors\n", 60 | "from ipypb import track\n", 61 | "import numpy as np\n", 62 | "\n", 63 | "def show(data, transpose=True):\n", 64 | " plt.rcParams[\"figure.figsize\"] = [16, 8]\n", 65 | " if transpose:\n", 66 | " data = np.transpose(data)\n", 67 | " plt.imshow(data[::-1,:], norm=colors.Normalize(0, 150), cmap='Blues')\n", 68 | "\n", 69 | "def get_aoi(bbox, scale, x0=180, y0=90):\n", 70 | " aoi = (0, \n", 71 | " slice(scale * int(bbox[1] + x0), scale * int(bbox[3] + x0) + 1), \n", 72 | " slice(scale * int(bbox[0] + y0), scale * int(bbox[2] + y0) + 1))\n", 73 | " shape = [d.stop - d.start for d in aoi[1:]]\n", 74 | " return aoi, shape\n", 75 | "\n", 76 | "url_root = 'https://harmony.uat.earthdata.nasa.gov/service-results/harmony-uat-staging/public/demo/zarr-store/'\n", 77 | "# GPM HHR URLs\n", 78 | "filename_template = '3B-HHR.MS.MRG.3IMERG.20051022-S%02d%02d00-E%02d%02d59.%04d.V06B.HDF5'\n", 79 | "data_urls = [ url_root + filename_template % (h, m, h, m + 29, h * 60 + m) for h in range(0, 24) for m in range(0, 60, 30) ]\n", 80 | "\n", 81 | "bbox = [10, -100, 47.5, -45]\n", 82 | "\n", 83 | "# Basic file info (also readable from metadata)\n", 84 | "GPM_NODATA = -9999.9\n", 85 | "gpm_aoi, gpm_shape = get_aoi(bbox, 10)\n", 86 | "RSS_NODATA = 251\n", 87 | "RSS_SCALE_FACTOR = 0.5 # In-file scale factor is 0.1. This increases it solely for the purpose of making it show up in pics\n", 88 | "rss_aoi, rss_shape = get_aoi(bbox, 4, 360)\n", 89 | "rss_aoi = (0, rss_aoi[2], rss_aoi[1])\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "gpm_aoi" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Problem 1: Atmospheric water vapor off the East Coast on Patrick's wedding day\n", 106 | "\n", 107 | "It rained a little that day in DC and hurricanes were threatening our honeymoon in the Carribbean.\n", 108 | "\n", 109 | "We have a bounding box defined above. Use data distributed by GHRC derived from the SSMIS sensor of the F16 DMSP satellite to build a picture.\n", 110 | "\n", 111 | "### Without Partial Access\n", 112 | "\n", 113 | "Download 2.6 MB file and subset it" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "scrolled": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "%%time\n", 125 | "from h5py import File as H5File\n", 126 | "import requests\n", 127 | "from io import BytesIO\n", 128 | "\n", 129 | "response = requests.get(url_root + 'f16_ssmis_20051022v7.nc')\n", 130 | "with H5File(BytesIO(response.content), 'r') as f:\n", 131 | " aoi_data = f['atmosphere_water_vapor_content'][rss_aoi]\n", 132 | " show(np.where(aoi_data < RSS_NODATA, aoi_data * RSS_SCALE_FACTOR, 0), transpose=False)\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "### Partial Access Step 1 - Make our data readable in Zarr for partial access\n", 140 | "\n", 141 | "Mimick a Zarr store by reading OPeNDAP's DMR++ files and returning their metadata in a Zarr interface. 
DMR++ files are planned to be generated on ingest\n", 142 | "\n", 143 | "Downloads 200 KB of data from the 2.6 MB file with conventional Zarr storage implementation.\n", 144 | "\n", 145 | "Result: 12 data requests, each of which goes through internet services, pre-signs a URL, and redirects to the data range. All sequentially. Slow." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "%%time\n", 155 | "from eosdis_store import EosdisStore\n", 156 | "import zarr\n", 157 | "\n", 158 | "f = zarr.open(EosdisStore(url_root + 'f16_ssmis_20051022v7.nc'))\n", 159 | "aoi_data = f['atmosphere_water_vapor_content'][rss_aoi]\n", 160 | "print(aoi_data)\n", 161 | "show(np.where(aoi_data < RSS_NODATA, aoi_data * RSS_SCALE_FACTOR, 0), transpose=False)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### Partial Access Step 2 - Make Zarr reads fast\n", 169 | "\n", 170 | "Downloads 200 KB of data from the 2.6 MB file with Zarr optimizations: \n", 171 | "1. (Working with Zarr community) Implement \"getitems\" concept, allowing storage to know all of the chunks that will be accessed up front\n", 172 | "2. Combine nearby range requests into single HTTP requests before sending them, allowing fewer requests.\n", 173 | "3. Cache presigned URLs returned by the archive for a short time, as directed by caching headers (TEA has a ticket to add these), allowing reuse and avoiding many round-trips and redirects\n", 174 | "4. Run the first data range request serially to get the presigned URL. Run subsequent requests in parallel.\n", 175 | "\n", 176 | "Result: 3 data requests, one of which goes through internet services, pre-signs a URL, and redirects to the data range. The following two reuse the signed URL and fetch in parallel. Faster!\n", 177 | "\n", 178 | "When more than a couple of chunks are involved, this is expected to be faster than the native Zarr S3 format is capable of, and the more chunks involved in a read the more it improves." 
179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "scrolled": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "%%time\n", 190 | "from eosdis_store import EosdisStore\n", 191 | "import zarr\n", 192 | "\n", 193 | "f = zarr.open(EosdisStore(url_root + 'f16_ssmis_20051022v7.nc'))\n", 194 | "aoi_data = f['atmosphere_water_vapor_content'][rss_aoi]\n", 195 | "print(rss_aoi)\n", 196 | "show(np.where(aoi_data < RSS_NODATA, aoi_data * RSS_SCALE_FACTOR, 0), transpose=False)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "scrolled": false 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "f['atmosphere_water_vapor_content'].shape\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Problem 2: Rain along the East Coast on Patrick's wedding day\n", 215 | "\n", 216 | "With the same bounding box above, get half-hourly high-quality precipitation values from GPM and sum them for the entire day (48 L3 global data files)\n", 217 | "\n", 218 | "### Without Partial Access\n", 219 | "\n", 220 | "Download approximately 500 MB of data in whole files and process them" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "%%time\n", 230 | "from h5py import File as H5File\n", 231 | "import requests\n", 232 | "from io import BytesIO\n", 233 | "\n", 234 | "result = np.zeros(gpm_shape)\n", 235 | "for url in track(data_urls):\n", 236 | " response = requests.get(url)\n", 237 | " with H5File(BytesIO(response.content), 'r') as f:\n", 238 | " aoi_data = f['Grid/HQprecipitation'][gpm_aoi]\n", 239 | " result = result + np.where(aoi_data != GPM_NODATA, aoi_data / 2, 0)\n", 240 | "show(result)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### With the EOSDIS Zarr Store" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "Downloads approximately 5 MB of data by doing partial reads in fewer lines of code" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "%%time\n", 264 | "from eosdis_store import EosdisStore\n", 265 | "import zarr\n", 266 | "\n", 267 | "result = np.zeros(gpm_shape)\n", 268 | "for url in track(data_urls):\n", 269 | " f = zarr.open(EosdisStore(url, quiet=True))\n", 270 | " aoi_data = f['Grid/HQprecipitation'][gpm_aoi]\n", 271 | " result = result + np.where(aoi_data != GPM_NODATA, aoi_data / 2, 0)\n", 272 | "show(result)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "### Comparison to L3 Daily Average Product" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Download a 30 MB file from the daily average collection to produce a similar result, validating the result at 6x egress cost of partial access for all of the half-hourly source files."
287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "%%time\n", 296 | "from h5py import File as H5File\n", 297 | "import requests\n", 298 | "from io import BytesIO\n", 299 | "\n", 300 | "response = requests.get(url_root + '3B-DAY.MS.MRG.3IMERG.20051022-S000000-E235959.V06.nc4')\n", 301 | "with H5File(BytesIO(response.content), 'r') as f:\n", 302 | " show(f['HQprecipitation'][gpm_aoi])" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## I can see my house from here!\n", 310 | "\n", 311 | "Download GEDI L2B data. Use small geolocation arrays to find the area of interest, then download only the data within those chunks. \n", 312 | "\n", 313 | "A full file download is 1.3 GB. The code below downloads approximately 15 MB of data and metadata. This reduces a 15 minute download to about 8s. (Aside: the download is 2/3 metadata, which could be dramatically reduced by using Zarr's default format rather than DMR++)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "from eosdis_zarr_store import Store\n", 323 | "import zarr\n", 324 | "import numpy as np\n", 325 | "\n", 326 | "url = 'http://localhost:4000/data/GEDI02_B_2019182140038_O03117_T05635_02_001_01.h5'\n", 327 | "f = zarr.open(Store(url))\n", 328 | "print(f.tree())" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "n, w, s, e = [40.2, -75.25, 40.15, -75.2]\n", 338 | "\n", 339 | "geoloc = f['BEAM0000/geolocation']\n", 340 | "all_lats = geoloc['latitude_bin0'][:]\n", 341 | "all_lons = geoloc['longitude_bin0'][:]\n", 342 | "valid_lat_i = np.where(np.logical_and(all_lats >= s, all_lats <= n))\n", 343 | "valid_lon_i = np.where(np.logical_and(all_lons >= w, all_lons <= e))\n", 344 | "indices = np.intersect1d(valid_lat_i, valid_lon_i)\n", 345 | "\n", 346 | "lats = all_lats[indices]\n", 347 | "lons = all_lons[indices]\n", 348 | "data = f['BEAM0000/cover'][:][indices]\n", 349 | "data_i = np.where(data != -9999)\n", 350 | "data = data[data_i]\n", 351 | "lats = lats[data_i]\n", 352 | "lons = lons[data_i]\n", 353 | "\n", 354 | "ambler = plt.imread('ambler.png')\n", 355 | "fig, ax = plt.subplots(figsize=(10,10))\n", 356 | "ax.scatter(lons, lats, s=50, c=data, cmap='Greens')\n", 357 | "ax.set_xlim(w, e)\n", 358 | "ax.set_ylim(s, n)\n", 359 | "ax.imshow(ambler, zorder=0, extent = [w, e, s, n], aspect='equal')" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## Why is it sometimes slower?\n", 367 | "\n", 368 | "![png](request-overhead.png)\n", 369 | "\n", 370 | "We pay a penalty for every new file we access, needing to go over the Internet, through the Internet services stack, the request signing process, and ultimately get redirected to S3. The Zarr store has to pay this penalty twice to read the metadata and then the file, while a full-file download only pays the penalty once. With current performance, the break-even point in file size is about 10 MB. That is to say, if a user wants to access even a tiny amount of data in each granule from a collection whose granules are under 10 MB in size, he or she is better off downloading the granules. 
While there is some uncontrollable overhead, there is significant room for improvement in areas that are under our control to promote inexpensive access patterns while improving time to science.\n", 371 | "\n", 372 | "## Conclusions\n", 373 | "\n", 374 | "* If providers generate DMR++ on ingest, we can expose our data efficiently using a Python API that is gaining increasing traction, particularly in the Pangeo community, with minimal storage overhead\n", 375 | "* Works out of the cloud, but works even better / faster in the cloud for analysis near data\n", 376 | "* For partial access cases, an overall egress reduction of 90% or more could be possible, as demonstrated\n", 377 | "* Chunking matters. This work makes smaller chunks more desirable, which has not historically been the case with Zarr\n", 378 | "* Overhead in our stack, from EDL to Internet services to redirects, is eating up the potential user savings. At a 90% egress reduction, we struggle to compete with \"Just download everything.\" How do we balance preventing undesirable behavior with encouraging desirable behavior?\n", 379 | "* There are lingering questions about whether DMR++ is the correct format to capture this metadata in. Zarr's native format is in many cases more complete and easier to parse while having mechanisms for more easily working with the 100,000-ish chunks in GEDI granules and for unifying multiple granules into a coherent view.\n", 380 | "\n", 381 | "## Limitations / Needs\n", 382 | "\n", 383 | "* The DMR++ file must be generated on ingest into the cloud, which is currently optional\n", 384 | "* Only works on HDF5 and NetCDF4 files. In principle, it could work on HDF4 / NetCDF Classic files but nothing yet generates the necessary metadata\n", 385 | "* DMR++ does not quite specify everything we could need for some datasets. We assume little endian byte order and column-major ordering.\n", 386 | "\n", 387 | "## Future Work\n", 388 | "\n", 389 | "* Packaging, unit tests, and docs sufficient for publication\n", 390 | "* Open source (relies on a naming decision)\n", 391 | "* Cache repeated calls for the same byte ranges to avoid requerying data we have\n", 392 | "* Implement unknown / undocumented areas of the DMR++ spec, including compression types and data filters\n", 393 | "* Tests with Dask and XArray\n", 394 | "* Implement CF conventions to populate fill values, offsets, scales, etc\n", 395 | "* Extensions to present L3 global collections as a coherent data cube\n", 396 | "\n", 397 | "I strongly believe in this access pattern as a win for our users and ourselves. To the extent it is not fully realized, it suffers from being an early adopter of our cloud access stack. My sincere hope is that we can learn from it to improve partial file access not only here but in other tools and libraries."
398 | ] 399 | } 400 | ], 401 | "metadata": { 402 | "kernelspec": { 403 | "display_name": "Python 3", 404 | "language": "python", 405 | "name": "python3" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 3 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython3", 417 | "version": "3.8.3" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 4 422 | } 423 | -------------------------------------------------------------------------------- /tests/fixtures/20200911000001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.zarr.json: -------------------------------------------------------------------------------- 1 | { 2 | ".zgroup": { 3 | "zarr_format": 2 4 | }, 5 | "wind_speed/.zarray": { 6 | "zarr_format": 2, 7 | "filters": null, 8 | "order": "C", 9 | "dtype": " 127, please subtract 256 from this reported value", 94 | "scale_factor": 0.07874015719, 95 | "add_offset": 10.0, 96 | "coverage_content_type": "auxiliaryInformation", 97 | "coordinates": "time lat lon", 98 | "_ARRAY_DIMENSIONS": [ 99 | "time", 100 | "nj", 101 | "ni" 102 | ] 103 | }, 104 | "sses_standard_deviation/.zchunkstore": { 105 | "0.0.0": { 106 | "offset": 22317539, 107 | "size": 23320 108 | }, 109 | "0.0.1": { 110 | "offset": 22340859, 111 | "size": 28041 112 | }, 113 | "0.1.0": { 114 | "offset": 22368900, 115 | "size": 13222 116 | }, 117 | "0.1.1": { 118 | "offset": 22382122, 119 | "size": 4287 120 | } 121 | }, 122 | "sst_dtime/.zarray": { 123 | "zarr_format": 2, 124 | "filters": null, 125 | "order": "C", 126 | "dtype": " 127, please subtract 256 from this reported value", 321 | "flag_values": [ 322 | 0, 323 | 1, 324 | 2, 325 | 3, 326 | 4, 327 | 5 328 | ], 329 | "flag_meanings": "no_data bad_data worst_quality low_quality acceptable_quality best_quality", 330 | "coverage_content_type": "qualityInformation", 331 | "coordinates": "time lat lon", 332 | "_ARRAY_DIMENSIONS": [ 333 | "time", 334 | "nj", 335 | "ni" 336 | ] 337 | }, 338 | "quality_level/.zchunkstore": { 339 | "0.0.0": { 340 | "offset": 22162994, 341 | "size": 24444 342 | }, 343 | "0.0.1": { 344 | "offset": 22187438, 345 | "size": 30682 346 | }, 347 | "0.1.0": { 348 | "offset": 22218120, 349 | "size": 13679 350 | }, 351 | "0.1.1": { 352 | "offset": 22231799, 353 | "size": 4661 354 | } 355 | }, 356 | "dt_analysis/.zarray": { 357 | "zarr_format": 2, 358 | "filters": null, 359 | "order": "C", 360 | "dtype": " 127, please subtract 256 from this reported value", 441 | "scale_factor": 0.07874015719, 442 | "add_offset": 10.0, 443 | "coverage_content_type": "auxiliaryInformation", 444 | "coordinates": "time lat lon", 445 | "_ARRAY_DIMENSIONS": [ 446 | "time", 447 | "nj", 448 | "ni" 449 | ] 450 | }, 451 | "sses_standard_deviation_4um/.zchunkstore": { 452 | "0.0.0": { 453 | "offset": 27251747, 454 | "size": 30619 455 | }, 456 | "0.0.1": { 457 | "offset": 27282366, 458 | "size": 39107 459 | }, 460 | "0.1.0": { 461 | "offset": 27321473, 462 | "size": 18171 463 | }, 464 | "0.1.1": { 465 | "offset": 27339644, 466 | "size": 6275 467 | } 468 | }, 469 | "sses_bias_4um/.zarray": { 470 | "zarr_format": 2, 471 | "filters": null, 472 | "order": "C", 473 | "dtype": " 127, please subtract 256 from this reported value", 497 | "scale_factor": 0.1574803144, 498 | "add_offset": 0.0, 499 | "coverage_content_type": "auxiliaryInformation", 500 | "coordinates": "time lat lon", 501 | "_ARRAY_DIMENSIONS": [ 502 | "time", 503 | "nj", 
504 | "ni" 505 | ] 506 | }, 507 | "sses_bias_4um/.zchunkstore": { 508 | "0.0.0": { 509 | "offset": 27160872, 510 | "size": 27847 511 | }, 512 | "0.0.1": { 513 | "offset": 27188719, 514 | "size": 35589 515 | }, 516 | "0.1.0": { 517 | "offset": 27224308, 518 | "size": 14687 519 | }, 520 | "0.1.1": { 521 | "offset": 27238995, 522 | "size": 5623 523 | } 524 | }, 525 | "sses_bias/.zarray": { 526 | "zarr_format": 2, 527 | "filters": null, 528 | "order": "C", 529 | "dtype": " 127, please subtract 256 from this reported value", 553 | "scale_factor": 0.1574803144, 554 | "add_offset": 0.0, 555 | "coverage_content_type": "auxiliaryInformation", 556 | "coordinates": "time lat lon", 557 | "_ARRAY_DIMENSIONS": [ 558 | "time", 559 | "nj", 560 | "ni" 561 | ] 562 | }, 563 | "sses_bias/.zchunkstore": { 564 | "0.0.0": { 565 | "offset": 22244733, 566 | "size": 22729 567 | }, 568 | "0.0.1": { 569 | "offset": 22267462, 570 | "size": 26981 571 | }, 572 | "0.1.0": { 573 | "offset": 22294443, 574 | "size": 12149 575 | }, 576 | "0.1.1": { 577 | "offset": 22306592, 578 | "size": 4272 579 | } 580 | }, 581 | "sea_surface_temperature_4um/.zarray": { 582 | "zarr_format": 2, 583 | "filters": null, 584 | "order": "C", 585 | "dtype": " 127, please subtract 256 from this reported value", 664 | "flag_values": [ 665 | 0, 666 | 1, 667 | 2, 668 | 3, 669 | 4, 670 | 5 671 | ], 672 | "flag_meanings": "no_data bad_data worst_quality low_quality acceptable_quality best_quality", 673 | "coverage_content_type": "qualityInformation", 674 | "coordinates": "time lat lon", 675 | "_ARRAY_DIMENSIONS": [ 676 | "time", 677 | "nj", 678 | "ni" 679 | ] 680 | }, 681 | "quality_level_4um/.zchunkstore": { 682 | "0.0.0": { 683 | "offset": 27076250, 684 | "size": 25483 685 | }, 686 | "0.0.1": { 687 | "offset": 27101733, 688 | "size": 33457 689 | }, 690 | "0.1.0": { 691 | "offset": 27135190, 692 | "size": 13796 693 | }, 694 | "0.1.1": { 695 | "offset": 27148986, 696 | "size": 5147 697 | } 698 | }, 699 | "time/.zarray": { 700 | "zarr_format": 2, 701 | "filters": null, 702 | "order": "C", 703 | "dtype": " 127, please subtract 256 from this reported value; Quicklook", 842 | "license": "GHRSST and PO.DAAC protocol allow data use as free and open.", 843 | "id": "MODIS_A-JPL-L2P-v2019.0", 844 | "naming_authority": "org.ghrsst", 845 | "product_version": "2019.0", 846 | "uuid": "f6e1f61d-c4a4-4c17-8354-0c15e12d688b", 847 | "gds_version_id": "2.0", 848 | "netcdf_version_id": "4.1", 849 | "date_created": "20200911T024514Z", 850 | "file_quality_level": 3, 851 | "spatial_resolution": "1km", 852 | "start_time": "20200911T000001Z", 853 | "time_coverage_start": "20200911T000001Z", 854 | "stop_time": "20200911T000458Z", 855 | "time_coverage_end": "20200911T000458Z", 856 | "northernmost_latitude": -36.22299957, 857 | "southernmost_latitude": -57.91799927, 858 | "easternmost_longitude": 31.05480003, 859 | "westernmost_longitude": -7.165909767, 860 | "source": "MODIS sea surface temperature observations for the OBPG", 861 | "platform": "Aqua", 862 | "sensor": "MODIS", 863 | "metadata_link": "http://podaac.jpl.nasa.gov/ws/metadata/dataset/?format=iso&shortName=MODIS_A-JPL-L2P-v2019.0", 864 | "keywords": "Oceans > Ocean Temperature > Sea Surface Temperature", 865 | "keywords_vocabulary": "NASA Global Change Master Directory (GCMD) Science Keywords", 866 | "standard_name_vocabulary": "NetCDF Climate and Forecast (CF) Metadata Convention", 867 | "geospatial_lat_units": "degrees_north", 868 | "geospatial_lat_resolution": 0.009999999776, 869 | "geospatial_lon_units": 
"degrees_east", 870 | "geospatial_lon_resolution": 0.009999999776, 871 | "acknowledgment": "The MODIS L2P sea surface temperature data are sponsored by NASA", 872 | "creator_name": "Ed Armstrong, JPL PO.DAAC", 873 | "creator_email": "edward.m.armstrong@jpl.nasa.gov", 874 | "creator_url": "http://podaac.jpl.nasa.gov", 875 | "project": "Group for High Resolution Sea Surface Temperature", 876 | "publisher_name": "The GHRSST Project Office", 877 | "publisher_url": "http://www.ghrsst.org", 878 | "publisher_email": "ghrsst-po@nceo.ac.uk", 879 | "processing_level": "L2P", 880 | "cdm_data_type": "swath", 881 | "startDirection": "Descending", 882 | "endDirection": "Descending", 883 | "day_night_flag": "Night" 884 | } 885 | } -------------------------------------------------------------------------------- /presentation/background.md: -------------------------------------------------------------------------------- 1 | # EOSDIS Zarr Store: Spatial & Variable Subsetting without Services 2 | 3 | ## Goal 4 | 5 | Produce a library that would let the Zarr Python library read EOSDIS cloud holdings efficiently, without requiring any modifications to our archive. This has the potential to expand use to new communities and tools, allow more efficient access both in place and outside of the cloud, and therefore save money for the archive as well as time for users. 6 | 7 | ## Background 8 | 9 | This is a demo of a data store I've been working on, building off of the work of a few others. Adapting the Zarr library (which is meant to read cloud-optimized data stores) read NetCDF4 / HDF 5 files was discussed as a possibility at Summer ESIP last year. Rich Signell from USGS worked with HDF Group to get [a prototype](https://medium.com/pangeo/cloud-performant-reading-of-netcdf4-hdf5-data-using-the-zarr-library-1a95c5c92314). The resulting code showed no performance degradation over an equivalent native Zarr store. This adaptation requies an up-front generation of metadata containing data attributes and byte offsets to allow efficient reads. 10 | 11 | ## What I did 12 | 13 | I recognized that the DMR++ files OPeNDAP / GHRC have started generating on ingest in PI 20.1 contain nearly equivalent information to that required by the Zarr library. Hearing that small chunk sizes (chunks are a region of data that can / must be read all at once) caused issues for some NetCDF files and required re-chunking (i.e. altering the original data file), I further looked at mitigating that issue to avoid having to re-host data. In picking through the Zarr code, I came across a for loop that, if changed, would allow a set of optimizations that would greatly improve performance. I advocated for this in the Zarr tracker and what we need is now being planned. 14 | ![png](images/summary.png) 15 | 16 | In terms of actual code, I produced a Python library, eosdis-zarr-store that: 17 | 18 | 1. Implements the Zarr storage API in a natural and familiar way to Zarr developers 19 | 2. Sets up HTTP access to allow EDL credential handshaking and, importantly, caching of redirect URLs 20 | 3. Adapts our underlying data files and DMR++ files generated on ingest to a Zarr-compatible API 21 | 4. Implements optimizations using the API worked out with the Zarr community to make fewer total data reads and do them in parallel where possible 22 | 23 | The remainder of this notebook contains results and conclusions. 24 | 25 | ## How to use it 26 | 27 | In the eosdis-zarr-store directory run `pip install -e .`. 
Obtain or stage an HDF5 (NetCDF4) file along with a DMR++ file at the identical URL + ".dmrpp"; you can run "mkdmrpp" ([scripts/mkdmrpp](scripts/mkdmrpp)) in this folder to produce DMR++ files. Then: 28 | 29 | ```python 30 | from eosdis_zarr_store import Store 31 | import zarr 32 | 33 | f = zarr.open(Store(data_file_url)) 34 | # Manipulate f as any Zarr store (see examples below) 35 | ``` 36 | 37 | The URLs in this notebook have been redacted, since some produce substantial egress for benchmarking and illustration. If you need example data, please reach out. 38 | 39 | ## Helpers and Constants (You can skip this) 40 | 41 | 42 | ```python 43 | # Helpers to draw stuff, generate URLs, and translate bounding boxes to array indices, 44 | # wholly ignoring all the helpful attributes present in the HDF and Zarr metadata 45 | # Please don't judge me on this mess. It's not called "Clean code fest" 46 | 47 | from matplotlib import pyplot as plt 48 | from matplotlib import colors 49 | from ipypb import track 50 | import numpy as np 51 | 52 | def show(data, transpose=True): 53 | plt.rcParams["figure.figsize"] = [16, 8] 54 | if transpose: 55 | data = np.transpose(data) 56 | plt.imshow(data[::-1,:], norm=colors.Normalize(0, 150), cmap='Blues') 57 | 58 | def get_aoi(bbox, scale, x0=180, y0=90): 59 | aoi = (0, 60 | slice(scale * int(bbox[1] + x0), scale * int(bbox[3] + x0) + 1), 61 | slice(scale * int(bbox[0] + y0), scale * int(bbox[2] + y0) + 1)) 62 | shape = [d.stop - d.start for d in aoi[1:]] 63 | return aoi, shape 64 | 65 | url_root = 'https://example.earthdata.nasa.gov/example-staging-url/' 66 | # GPM HHR URLs 67 | filename_template = '3B-HHR.MS.MRG.3IMERG.20051022-S%02d%02d00-E%02d%02d59.%04d.V06B.HDF5' 68 | data_urls = [ url_root + filename_template % (h, m, h, m + 29, h * 60 + m) for h in range(0, 24) for m in range(0, 60, 30) ] 69 | 70 | bbox = [10, -100, 47.5, -45] 71 | 72 | # Basic file info (also readable from metadata) 73 | GPM_NODATA = -9999.9 74 | gpm_aoi, gpm_shape = get_aoi(bbox, 10) 75 | RSS_NODATA = 251 76 | RSS_SCALE_FACTOR = 0.5 # In-file scale factor is 0.1. This increases it solely for the purpose of making it show up in pics 77 | rss_aoi, rss_shape = get_aoi(bbox, 4, 360) 78 | rss_aoi = (0, rss_aoi[2], rss_aoi[1]) 79 | 80 | ``` 81 | 82 | ## Problem 1: Atmospheric water vapor off the East Coast on Patrick's wedding day 83 | 84 | It rained a little that day in DC and hurricanes were threatening our honeymoon in the Caribbean. 85 | 86 | We have a bounding box defined above. Use data distributed by GHRC derived from the SSMIS sensor of the F16 DMSP satellite to build a picture. 87 | 88 | ### Without Partial Access 89 | 90 | Download a 2.6 MB file and subset it 91 | 92 | 93 | ```python 94 | %%time 95 | from h5py import File as H5File 96 | import requests 97 | from io import BytesIO 98 | 99 | response = requests.get(url_root + 'f16_ssmis_20051022v7.nc') 100 | with H5File(BytesIO(response.content), 'r') as f: 101 | aoi_data = f['atmosphere_water_vapor_content'][rss_aoi] 102 | show(np.where(aoi_data < RSS_NODATA, aoi_data * RSS_SCALE_FACTOR, 0), transpose=False) 103 | 104 | ``` 105 | 106 | CPU times: user 206 ms, sys: 43.1 ms, total: 249 ms 107 | Wall time: 8.32 s 108 | 109 | 110 | 111 | ![png](images/output_3_1.png) 112 | 113 | 114 | ### Partial Access Step 1 - Make our data readable in Zarr for partial access 115 | 116 | Mimic a Zarr store by reading OPeNDAP's DMR++ files and returning their metadata in a Zarr interface.
DMR++ files are planned to be generated on ingest 117 | 118 | Downloads 200 KB of data from the 2.6 MB file with conventional Zarr storage implementation. 119 | 120 | Result: 12 data requests, each of which goes through internet services, pre-signs a URL, and redirects to the data range. All sequentially. Slow. 121 | 122 | 123 | ```python 124 | %%time 125 | from unoptimized_zarr_store import Store 126 | import zarr 127 | 128 | f = zarr.open(Store(url_root + 'f16_ssmis_20051022v7.nc')) 129 | aoi_data = f['atmosphere_water_vapor_content'][rss_aoi] 130 | show(np.where(aoi_data < RSS_NODATA, aoi_data * RSS_SCALE_FACTOR, 0), transpose=False) 131 | ``` 132 | 133 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1455912:1465233] (9321 bytes) 134 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1465240:1472085] (6845 bytes) 135 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1472088:1479515] (7427 bytes) 136 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1479520:1488808] (9288 bytes) 137 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1562808:1564995] (2187 bytes) 138 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1565000:1572245] (7245 bytes) 139 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1572248:1582809] (10561 bytes) 140 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1582816:1593902] (11086 bytes) 141 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1635624:1636392] (768 bytes) 142 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1636392:1639400] (3008 bytes) 143 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1639400:1645720] (6320 bytes) 144 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1645720:1654949] (9229 bytes) 145 | CPU times: user 246 ms, sys: 45.5 ms, total: 292 ms 146 | Wall time: 39.3 s 147 | 148 | 149 | 150 | ![png](images/output_5_1.png) 151 | 152 | 153 | ### Partial Access Step 2 - Make Zarr reads fast 154 | 155 | Downloads 200 KB of data from the 2.6 MB file with Zarr optimizations: 156 | 1. (Working with Zarr community) Implement "getitems" concept, allowing storage to know all of the chunks that will be accessed up front 157 | 2. Combine nearby range requests into single HTTP requests before sending them, allowing fewer requests. 158 | 3. Cache presigned URLs returned by the archive for a short time, as directed by caching headers (TEA has a ticket to add these), allowing reuse and avoiding many round-trips and redirects 159 | 4. Run the first data range request serially to get the presigned URL. Run subsequent requests in parallel. 160 | 161 | Result: 3 data requests, one of which goes through internet services, pre-signs a URL, and redirects to the data range. The following two reuse the signed URL and fetch in parallel. Faster! 162 | 163 | When more than a couple of chunks are involved, this is expected to be faster than the native Zarr S3 format is capable of, and the more chunks involved in a read the more it improves. 
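
To make optimization 2 above concrete, here is a minimal, self-contained sketch of coalescing nearby byte ranges into fewer HTTP range requests. It is illustrative only: the `merge_ranges` name and the 10 KB gap threshold are assumptions rather than the actual eosdis-zarr-store API, and the sample ranges are simply the twelve chunk reads printed in Step 1 above.

```python
# Hypothetical sketch of optimization 2: coalescing nearby byte ranges so that
# several chunk reads can be served by a single HTTP range request. The function
# name and the 10 KB gap threshold are illustrative assumptions.

def merge_ranges(ranges, max_gap=10_000):
    """Merge (start, end) byte ranges whose gaps are at most max_gap bytes."""
    merged = []
    for start, end in sorted(ranges):
        if merged and start - merged[-1][1] <= max_gap:
            # Close enough to the previous range: extend it rather than
            # issuing a separate request.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# The twelve individual chunk reads from Step 1 collapse into three requests,
# matching the "Merged 12 requests into 3" output shown below.
step1_reads = [
    (1455912, 1465233), (1465240, 1472085), (1472088, 1479515), (1479520, 1488808),
    (1562808, 1564995), (1565000, 1572245), (1572248, 1582809), (1582816, 1593902),
    (1635624, 1636392), (1636392, 1639400), (1639400, 1645720), (1645720, 1654949),
]
print(merge_ranges(step1_reads))
# [(1455912, 1488808), (1562808, 1593902), (1635624, 1654949)]
```

Each merged range can then be fetched with a single `Range` request, and all but the first can reuse the cached presigned URL and run in parallel (optimizations 3 and 4).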
164 | 165 | 166 | ```python 167 | %%time 168 | from eosdis_zarr_store import Store 169 | import zarr 170 | 171 | f = zarr.open(Store(url_root + 'f16_ssmis_20051022v7.nc')) 172 | aoi_data = f['atmosphere_water_vapor_content'][rss_aoi] 173 | show(np.where(aoi_data < RSS_NODATA, aoi_data * RSS_SCALE_FACTOR, 0), transpose=False) 174 | ``` 175 | 176 | Starting new range due to gap of 74000 bytes 177 | Starting new range due to gap of 41722 bytes 178 | Merged 12 requests into 3 179 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1455912:1488808] (32896 bytes) 180 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1562808:1593902] (31094 bytes) 181 | Reading https://example.earthdata.nasa.gov/example-staging-url/f16_ssmis_20051022v7.nc [1635624:1654949] (19325 bytes) 182 | CPU times: user 137 ms, sys: 19.9 ms, total: 156 ms 183 | Wall time: 17.3 s 184 | 185 | 186 | 187 | ![png](images/output_7_1.png) 188 | 189 | 190 | ## Problem 2: Rain along the East Coast on Patrick's wedding day 191 | 192 | With the same bounding box above, get half-hourly high-quality precipitation values from GPM and sum them for the entire day (48 L3 global data files) 193 | 194 | ### Without Partial Access 195 | 196 | Download approximately 500 MB of data in whole files and process them 197 | 198 | 199 | ```python 200 | %%time 201 | from h5py import File as H5File 202 | import requests 203 | from io import BytesIO 204 | 205 | result = np.zeros(gpm_shape) 206 | for url in track(data_urls): 207 | response = requests.get(url) 208 | with H5File(BytesIO(response.content), 'r') as f: 209 | aoi_data = f['Grid/HQprecipitation'][gpm_aoi] 210 | result = result + np.where(aoi_data != GPM_NODATA, aoi_data / 2, 0) 211 | show(result) 212 | ``` 213 | 214 | 215 |
216 | 217 | 100% 218 | 48/48 219 | [05:55<00:06, 7.39s/it]
220 | 221 | 222 | CPU times: user 8.51 s, sys: 3 s, total: 11.5 s 223 | Wall time: 5min 54s 224 | 225 | 226 | 227 | ![png](images/output_9_2.png) 228 | 229 | 230 | ### With the EOSDIS Zarr Store 231 | 232 | Downloads approximately 5 MB of data by doing partial reads in fewer lines of code 233 | 234 | 235 | ```python 236 | %%time 237 | from eosdis_zarr_store import Store 238 | import zarr 239 | 240 | result = np.zeros(gpm_shape) 241 | for url in track(data_urls): 242 | f = zarr.open(Store(url, quiet=True)) 243 | aoi_data = f['Grid/HQprecipitation'][gpm_aoi] 244 | result = result + np.where(aoi_data != GPM_NODATA, aoi_data / 2, 0) 245 | show(result) 246 | ``` 247 | 248 | 249 |
250 | 251 | 100% 252 | 48/48 253 | [03:58<00:05, 4.96s/it]
254 | 255 | 256 | CPU times: user 1.72 s, sys: 185 ms, total: 1.91 s 257 | Wall time: 3min 57s 258 | 259 | 260 | 261 | ![png](images/output_12_2.png) 262 | 263 | 264 | ### Comparison to L3 Daily Average Product 265 | 266 | Download a 30 MB file from the daily average collection to produce a similar result, validating the result at 6x egress cost of partial access for all of the half-hourly source files. 267 | 268 | 269 | ```python 270 | %%time 271 | from h5py import File as H5File 272 | import requests 273 | from io import BytesIO 274 | 275 | response = requests.get(url_root + '3B-DAY.MS.MRG.3IMERG.20051022-S000000-E235959.V06.nc4') 276 | with H5File(BytesIO(response.content), 'r') as f: 277 | show(f['HQprecipitation'][gpm_aoi]) 278 | ``` 279 | 280 | CPU times: user 415 ms, sys: 187 ms, total: 602 ms 281 | Wall time: 11.2 s 282 | 283 | 284 | 285 | ![png](images/output_15_1.png) 286 | 287 | 288 | ## I can see my house from here! 289 | 290 | Download GEDI L2B data. Use small geolocation arrays to find the area of interest, then download only the data within those chunks. 291 | 292 | A full file download is 1.3 GB. The code below downloads approximately 15 MB of data and metadata. This reduces a 15 minute download to about 8s. (Aside: the download is 2/3 metadata, which could be dramatically reduced by using Zarr's default format rather than DMR++) 293 | 294 | 295 | ```python 296 | from eosdis_zarr_store import Store 297 | import zarr 298 | import numpy as np 299 | 300 | url = 'https://example.earthdata.nasa.gov/example-staging-url/GEDI02_B_2019182140038_O03117_T05635_02_001_01.h5' 301 | f = zarr.open(Store(url)) 302 | print(f.tree()) 303 | ``` 304 | 305 | / 306 | ├── BEAM0000 307 | │ ├── algorithmrun_flag (238914,) uint8 308 | │ ├── ancillary 309 | │ │ ├── dz (1,) float64 310 | │ │ ├── maxheight_cuttoff (1,) float64 311 | │ │ ├── rg_eg_constraint_center_buffer (1,) int32 312 | │ │ ├── rg_eg_mpfit_max_func_evals (1,) uint16 313 | │ │ ├── rg_eg_mpfit_maxiters (1,) uint16 314 | │ │ ├── rg_eg_mpfit_tolerance (1,) float64 315 | │ │ ├── signal_search_buff (1,) float64 316 | │ │ └── tx_noise_stddev_multiplier (1,) float64 317 | │ ├── beam (238914,) uint16 318 | │ ├── channel (238914,) uint8 319 | │ ├── cover (238914,) float32 320 | │ ├── cover_z (238914, 30) float32 321 | │ ├── fhd_normal (238914,) float32 322 | │ ├── geolocation 323 | │ │ ├── degrade_flag (238914,) int16 324 | │ │ ├── delta_time (238914,) float64 325 | │ │ ├── digital_elevation_model (238914,) float32 326 | │ │ ├── elev_highestreturn (238914,) float32 327 | │ │ ├── elev_lowestmode (238914,) float32 328 | │ │ ├── elevation_bin0 (238914,) float64 329 | │ │ ├── elevation_bin0_error (238914,) float32 330 | │ │ ├── elevation_lastbin (238914,) float64 331 | │ │ ├── elevation_lastbin_error (238914,) float32 332 | │ │ ├── height_bin0 (238914,) float32 333 | │ │ ├── height_lastbin (238914,) float32 334 | │ │ ├── lat_highestreturn (238914,) float64 335 | │ │ ├── lat_lowestmode (238914,) float64 336 | │ │ ├── latitude_bin0 (238914,) float64 337 | │ │ ├── latitude_bin0_error (238914,) float32 338 | │ │ ├── latitude_lastbin (238914,) float64 339 | │ │ ├── latitude_lastbin_error (238914,) float32 340 | │ │ ├── local_beam_azimuth (238914,) float32 341 | │ │ ├── local_beam_elevation (238914,) float32 342 | │ │ ├── lon_highestreturn (238914,) float64 343 | │ │ ├── lon_lowestmode (238914,) float64 344 | │ │ ├── longitude_bin0 (238914,) float64 345 | │ │ ├── longitude_bin0_error (238914,) float32 346 | │ │ ├── longitude_lastbin (238914,) float64 347 
| │ │ ├── longitude_lastbin_error (238914,) float32 348 | │ │ ├── solar_azimuth (238914,) float32 349 | │ │ └── solar_elevation (238914,) float32 350 | │ ├── l2a_quality_flag (238914,) uint8 351 | │ ├── l2b_quality_flag (238914,) uint8 352 | │ ├── land_cover_data 353 | │ │ ├── landsat_treecover (238914,) float64 354 | │ │ ├── modis_nonvegetated (238914,) float64 355 | │ │ ├── modis_nonvegetated_sd (238914,) float64 356 | │ │ ├── modis_treecover (238914,) float64 357 | │ │ └── modis_treecover_sd (238914,) float64 358 | │ ├── master_frac (238914,) float64 359 | │ ├── master_int (238914,) uint32 360 | │ ├── num_detectedmodes (238914,) uint8 361 | │ ├── omega (238914,) float32 362 | │ ├── pai (238914,) float32 363 | │ ├── pai_z (238914, 30) float32 364 | │ ├── pavd_z (238914, 30) float32 365 | │ ├── pgap_theta (238914,) float32 366 | │ ├── pgap_theta_error (238914,) float32 367 | │ ├── pgap_theta_z (7926559,) float32 368 | │ ├── rg (238914,) float32 369 | │ ├── rh100 (238914,) int16 370 | │ ├── rhog (238914,) float32 371 | │ ├── rhog_error (238914,) float32 372 | │ ├── rhov (238914,) float32 373 | │ ├── rhov_error (238914,) float32 374 | │ ├── rossg (238914,) float32 375 | │ ├── rv (238914,) float32 376 | │ ├── rx_processing 377 | │ │ ├── algorithmrun_flag_a1 (238914,) uint8 378 | │ │ ├── algorithmrun_flag_a2 (238914,) uint8 379 | │ │ ├── algorithmrun_flag_a3 (238914,) uint8 380 | │ │ ├── algorithmrun_flag_a4 (238914,) uint8 381 | │ │ ├── algorithmrun_flag_a5 (238914,) uint8 382 | │ │ ├── algorithmrun_flag_a6 (238914,) uint8 383 | │ │ ├── pgap_theta_a1 (238914,) float32 384 | │ │ ├── pgap_theta_a2 (238914,) float32 385 | │ │ ├── pgap_theta_a3 (238914,) float32 386 | │ │ ├── pgap_theta_a4 (238914,) float32 387 | │ │ ├── pgap_theta_a5 (238914,) float32 388 | │ │ ├── pgap_theta_a6 (238914,) float32 389 | │ │ ├── pgap_theta_error_a1 (238914,) float32 390 | │ │ ├── pgap_theta_error_a2 (238914,) float32 391 | │ │ ├── pgap_theta_error_a3 (238914,) float32 392 | │ │ ├── pgap_theta_error_a4 (238914,) float32 393 | │ │ ├── pgap_theta_error_a5 (238914,) float32 394 | │ │ ├── pgap_theta_error_a6 (238914,) float32 395 | │ │ ├── rg_a1 (238914,) float32 396 | │ │ ├── rg_a2 (238914,) float32 397 | │ │ ├── rg_a3 (238914,) float32 398 | │ │ ├── rg_a4 (238914,) float32 399 | │ │ ├── rg_a5 (238914,) float32 400 | │ │ ├── rg_a6 (238914,) float32 401 | │ │ ├── rg_eg_amplitude_a1 (238914,) float32 402 | │ │ ├── rg_eg_amplitude_a2 (238914,) float32 403 | │ │ ├── rg_eg_amplitude_a3 (238914,) float32 404 | │ │ ├── rg_eg_amplitude_a4 (238914,) float32 405 | │ │ ├── rg_eg_amplitude_a5 (238914,) float32 406 | │ │ ├── rg_eg_amplitude_a6 (238914,) float32 407 | │ │ ├── rg_eg_amplitude_error_a1 (238914,) float32 408 | │ │ ├── rg_eg_amplitude_error_a2 (238914,) float32 409 | │ │ ├── rg_eg_amplitude_error_a3 (238914,) float32 410 | │ │ ├── rg_eg_amplitude_error_a4 (238914,) float32 411 | │ │ ├── rg_eg_amplitude_error_a5 (238914,) float32 412 | │ │ ├── rg_eg_amplitude_error_a6 (238914,) float32 413 | │ │ ├── rg_eg_center_a1 (238914,) float32 414 | │ │ ├── rg_eg_center_a2 (238914,) float32 415 | │ │ ├── rg_eg_center_a3 (238914,) float32 416 | │ │ ├── rg_eg_center_a4 (238914,) float32 417 | │ │ ├── rg_eg_center_a5 (238914,) float32 418 | │ │ ├── rg_eg_center_a6 (238914,) float32 419 | │ │ ├── rg_eg_center_error_a1 (238914,) float32 420 | │ │ ├── rg_eg_center_error_a2 (238914,) float32 421 | │ │ ├── rg_eg_center_error_a3 (238914,) float32 422 | │ │ ├── rg_eg_center_error_a4 (238914,) float32 423 | │ │ ├── rg_eg_center_error_a5 
(238914,) float32 424 | │ │ ├── rg_eg_center_error_a6 (238914,) float32 425 | │ │ ├── rg_eg_chisq_a1 (238914,) float32 426 | │ │ ├── rg_eg_chisq_a2 (238914,) float32 427 | │ │ ├── rg_eg_chisq_a3 (238914,) float32 428 | │ │ ├── rg_eg_chisq_a4 (238914,) float32 429 | │ │ ├── rg_eg_chisq_a5 (238914,) float32 430 | │ │ ├── rg_eg_chisq_a6 (238914,) float32 431 | │ │ ├── rg_eg_flag_a1 (238914,) int16 432 | │ │ ├── rg_eg_flag_a2 (238914,) int16 433 | │ │ ├── rg_eg_flag_a3 (238914,) int16 434 | │ │ ├── rg_eg_flag_a4 (238914,) int16 435 | │ │ ├── rg_eg_flag_a5 (238914,) int16 436 | │ │ ├── rg_eg_flag_a6 (238914,) int16 437 | │ │ ├── rg_eg_gamma_a1 (238914,) float32 438 | │ │ ├── rg_eg_gamma_a2 (238914,) float32 439 | │ │ ├── rg_eg_gamma_a3 (238914,) float32 440 | │ │ ├── rg_eg_gamma_a4 (238914,) float32 441 | │ │ ├── rg_eg_gamma_a5 (238914,) float32 442 | │ │ ├── rg_eg_gamma_a6 (238914,) float32 443 | │ │ ├── rg_eg_gamma_error_a1 (238914,) float32 444 | │ │ ├── rg_eg_gamma_error_a2 (238914,) float32 445 | │ │ ├── rg_eg_gamma_error_a3 (238914,) float32 446 | │ │ ├── rg_eg_gamma_error_a4 (238914,) float32 447 | │ │ ├── rg_eg_gamma_error_a5 (238914,) float32 448 | │ │ ├── rg_eg_gamma_error_a6 (238914,) float32 449 | │ │ ├── rg_eg_niter_a1 (238914,) uint8 450 | │ │ ├── rg_eg_niter_a2 (238914,) uint8 451 | │ │ ├── rg_eg_niter_a3 (238914,) uint8 452 | │ │ ├── rg_eg_niter_a4 (238914,) uint8 453 | │ │ ├── rg_eg_niter_a5 (238914,) uint8 454 | │ │ ├── rg_eg_niter_a6 (238914,) uint8 455 | │ │ ├── rg_eg_sigma_a1 (238914,) float32 456 | │ │ ├── rg_eg_sigma_a2 (238914,) float32 457 | │ │ ├── rg_eg_sigma_a3 (238914,) float32 458 | │ │ ├── rg_eg_sigma_a4 (238914,) float32 459 | │ │ ├── rg_eg_sigma_a5 (238914,) float32 460 | │ │ ├── rg_eg_sigma_a6 (238914,) float32 461 | │ │ ├── rg_eg_sigma_error_a1 (238914,) float32 462 | │ │ ├── rg_eg_sigma_error_a2 (238914,) float32 463 | │ │ ├── rg_eg_sigma_error_a3 (238914,) float32 464 | │ │ ├── rg_eg_sigma_error_a4 (238914,) float32 465 | │ │ ├── rg_eg_sigma_error_a5 (238914,) float32 466 | │ │ ├── rg_eg_sigma_error_a6 (238914,) float32 467 | │ │ ├── rg_error_a1 (238914,) float32 468 | │ │ ├── rg_error_a2 (238914,) float32 469 | │ │ ├── rg_error_a3 (238914,) float32 470 | │ │ ├── rg_error_a4 (238914,) float32 471 | │ │ ├── rg_error_a5 (238914,) float32 472 | │ │ ├── rg_error_a6 (238914,) float32 473 | │ │ ├── rv_a1 (238914,) float32 474 | │ │ ├── rv_a2 (238914,) float32 475 | │ │ ├── rv_a3 (238914,) float32 476 | │ │ ├── rv_a4 (238914,) float32 477 | │ │ ├── rv_a5 (238914,) float32 478 | │ │ ├── rv_a6 (238914,) float32 479 | │ │ ├── rx_energy_a1 (238914,) float32 480 | │ │ ├── rx_energy_a2 (238914,) float32 481 | │ │ ├── rx_energy_a3 (238914,) float32 482 | │ │ ├── rx_energy_a4 (238914,) float32 483 | │ │ ├── rx_energy_a5 (238914,) float32 484 | │ │ └── rx_energy_a6 (238914,) float32 485 | │ ├── rx_range_highestreturn (238914,) float64 486 | │ ├── selected_l2a_algorithm (238914,) uint8 487 | │ ├── selected_rg_algorithm (238914,) uint8 488 | │ ├── sensitivity (238914,) float32 489 | │ ├── stale_return_flag (238914,) uint8 490 | │ └── surface_flag (238914,) uint8 491 | ==== 1000 lines removed for brevity, see commit history ==== 492 | └── METADATA 493 | 494 | 495 | 496 | ```python 497 | n, w, s, e = [40.2, -75.25, 40.15, -75.2] 498 | 499 | geoloc = f['BEAM0000/geolocation'] 500 | all_lats = geoloc['latitude_bin0'][:] 501 | all_lons = geoloc['longitude_bin0'][:] 502 | valid_lat_i = np.where(np.logical_and(all_lats >= s, all_lats <= n)) 503 | valid_lon_i = 
np.where(np.logical_and(all_lons >= w, all_lons <= e)) 504 | indices = np.intersect1d(valid_lat_i, valid_lon_i) 505 | 506 | lats = all_lats[indices] 507 | lons = all_lons[indices] 508 | data = f['BEAM0000/cover'][:][indices] 509 | data_i = np.where(data != -9999) 510 | data = data[data_i] 511 | lats = lats[data_i] 512 | lons = lons[data_i] 513 | 514 | ambler = plt.imread('ambler.png') 515 | fig, ax = plt.subplots(figsize=(10,10)) 516 | ax.scatter(lons, lats, s=50, c=data, cmap='Greens') 517 | ax.set_xlim(w, e) 518 | ax.set_ylim(s, n) 519 | ax.imshow(ambler, zorder=0, extent = [w, e, s, n], aspect='equal') 520 | ``` 521 | 522 | Merged 17 requests into 1 523 | Reading https://example.earthdata.nasa.gov/example-staging-url/GEDI02_B_2019182140038_O03117_T05635_02_001_01.h5 [35434732:36859543] (1424811 bytes) 524 | Merged 17 requests into 1 525 | Reading https://example.earthdata.nasa.gov/example-staging-url/GEDI02_B_2019182140038_O03117_T05635_02_001_01.h5 [43931476:45340903] (1409427 bytes) 526 | Merged 17 requests into 1 527 | Reading https://example.earthdata.nasa.gov/example-staging-url/GEDI02_B_2019182140038_O03117_T05635_02_001_01.h5 [12035:442316] (430281 bytes) 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | ![png](images/output_18_2.png) 539 | 540 | 541 | ## Why is it sometimes slower? 542 | 543 | ![png](images/request-overhead.png) 544 | 545 | We pay a penalty for every new file we access, needing to go over the Internet, through the Internet services stack, the request signing process, and ultimately get redirected to S3. The Zarr store has to pay this penalty twice to read the metadata and then the file, while a full-file download only pays the penalty once. With current performance, the break-even point in file size is about 10 MB. That is to say, if a user wants to access even a tiny amount of data in each granule from a collection whose granules are under 10 MB in size, he or she is better off downloading the granules. While there is some uncontrollable overhead, there is significant room for improvement in areas that are under our control to promote inexpensive access patterns while improving time to science. 546 | 547 | ## Conclusions 548 | 549 | * If providers generate DMR++ on ingest, we can expose our data efficiently using a Python API that is gaining increasing traction, particularly in the Pangeo community, with minimal storage overhead 550 | * Works out of the cloud, but works even better / faster in the cloud for analysis near data 551 | * For partial access cases, an overall egress reduction of 90% or more could be possible, as demonstrated 552 | * Chunking matters. This work makes smaller chunks more desirable, which has not historically been the case with Zarr 553 | * Overhead in our stack, from EDL to Internet services to redirects, is eating up the potential user savings. At a 90% egress reduction, we struggle to compete with "Just download everything." How do we balance preventing undesirable behavior with encouraging desirable behavior? 554 | * There are lingering questions about whether DMR++ is the correct format to capture this metadata in. Zarr's native format is in many cases more complete and easier to parse while having mechanisms for more easily working with the 100,000-ish chunks in GEDI granules and for unifying multiple granules into a coherent view. 555 | 556 | ## Limitations / Needs 557 | 558 | * The DMR++ file must be generated on ingest into the cloud, which is currently optional 559 | * Only works on HDF5 and NetCDF4 files.
In principle, it could work on HDF4 / NetCDF Classic files but nothing yet generates the necessary metadata 560 | * DMR++ does not quite specify everything we could need for some datasets. We assume little endian byte order and column-major ordering. 561 | 562 | ## Future Work 563 | 564 | * Packaging, unit tests, and docs sufficient for publication 565 | * Open source (relies on a naming decision) 566 | * Cache repeated calls for the same byte ranges to avoid requerying data we have 567 | * Implement unknown / undocumented areas of the DMR++ spec, including compression types and data filters 568 | * Tests with Dask and XArray 569 | * Implement CF conventions to populate fill values, offsets, scales, etc 570 | * Extensions to present L3 global collections as a coherent data cube 571 | 572 | I strongly believe in this access pattern as a win for our users and ourselves. To the extent it is not fully realized, it suffers from being an early adopter of our cloud access stack. My sincere hope is that we can learn from it to improve partial file access not only here but in other tools and libraries. 573 | -------------------------------------------------------------------------------- /tests/fixtures/20200911000001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.nc.dmrpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 10m wind speed 12 | 13 | 14 | wind_speed 15 | 16 | 17 | m s-1 18 | 19 | 20 | -128 21 | 22 | 23 | -127 24 | 25 | 26 | 127 27 | 28 | 29 | Wind at 10 meters above the sea surface 30 | 31 | 32 | 0.2000000030 33 | 34 | 35 | 25.00000000 36 | 37 | 38 | TBD. Placeholder. Currently empty 39 | 40 | 41 | TBD 42 | 43 | 44 | 2.000000000 45 | 46 | 47 | 10 m 48 | 49 | 50 | auxiliaryInformation 51 | 52 | 53 | wind_speed 54 | 55 | 56 | /wind_speed 57 | 58 | 59 | time lat lon 60 | 61 | 62 | 1 1015 677 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | SSES standard deviation error based on proximity confidence flags 75 | 76 | 77 | kelvin 78 | 79 | 80 | -128 81 | 82 | 83 | -127 84 | 85 | 86 | 127 87 | 88 | 89 | thermal IR SST standard deviation error; signed byte array: WARNING Some applications are unable to properly handle signed byte values. 
If values are encountered > 127, please subtract 256 from this reported value 90 | 91 | 92 | 0.07874015719 93 | 94 | 95 | 10.00000000 96 | 97 | 98 | auxiliaryInformation 99 | 100 | 101 | sses_standard_deviation 102 | 103 | 104 | /sses_standard_deviation 105 | 106 | 107 | time lat lon 108 | 109 | 110 | 1 1015 677 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | time difference from reference time 123 | 124 | 125 | seconds 126 | 127 | 128 | -32768 129 | 130 | 131 | -32767 132 | 133 | 134 | 32767 135 | 136 | 137 | time plus sst_dtime gives seconds after 00:00:00 UTC January 1, 1981 138 | 139 | 140 | referenceInformation 141 | 142 | 143 | sst_dtime 144 | 145 | 146 | /sst_dtime 147 | 148 | 149 | time lat lon 150 | 151 | 152 | 1 1015 677 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | L2P flags 165 | 166 | 167 | 0 168 | 169 | 170 | 16 171 | 172 | 173 | These flags can be used to further filter data variables 174 | 175 | 176 | microwave land ice lake river 177 | 178 | 179 | 1 180 | 2 181 | 4 182 | 8 183 | 16 184 | 185 | 186 | qualityInformation 187 | 188 | 189 | l2p_flags 190 | 191 | 192 | /l2p_flags 193 | 194 | 195 | time lat lon 196 | 197 | 198 | 1 1015 677 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | sea surface temperature 211 | 212 | 213 | sea_surface_skin_temperature 214 | 215 | 216 | kelvin 217 | 218 | 219 | -32767 220 | 221 | 222 | -1000 223 | 224 | 225 | 10000 226 | 227 | 228 | sea surface temperature from thermal IR (11 um) channels 229 | 230 | 231 | 0.004999999888 232 | 233 | 234 | 273.1499939 235 | 236 | 237 | NASA and University of Miami 238 | 239 | 240 | physicalMeasurement 241 | 242 | 243 | sea_surface_temperature 244 | 245 | 246 | /sea_surface_temperature 247 | 248 | 249 | time lat lon 250 | 251 | 252 | 1 1015 677 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | quality level of SST pixel 265 | 266 | 267 | -128 268 | 269 | 270 | 0 271 | 272 | 273 | 5 274 | 275 | 276 | thermal IR SST proximity confidence value; signed byte array: WARNING Some applications are unable to properly handle signed byte values. If values are encountered > 127, please subtract 256 from this reported value 277 | 278 | 279 | 0 280 | 1 281 | 2 282 | 3 283 | 4 284 | 5 285 | 286 | 287 | no_data bad_data worst_quality low_quality acceptable_quality best_quality 288 | 289 | 290 | qualityInformation 291 | 292 | 293 | quality_level 294 | 295 | 296 | /quality_level 297 | 298 | 299 | time lat lon 300 | 301 | 302 | 1 1015 677 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | deviation from SST reference climatology 315 | 316 | 317 | kelvin 318 | 319 | 320 | -128 321 | 322 | 323 | -127 324 | 325 | 326 | 127 327 | 328 | 329 | TBD 330 | 331 | 332 | 0.1000000015 333 | 334 | 335 | 0.000000000 336 | 337 | 338 | TBD. Placeholder. Currently empty 339 | 340 | 341 | auxiliaryInformation 342 | 343 | 344 | dt_analysis 345 | 346 | 347 | /dt_analysis 348 | 349 | 350 | time lat lon 351 | 352 | 353 | 1 1015 677 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | SSES standard deviation error based on proximity confidence flags 366 | 367 | 368 | kelvin 369 | 370 | 371 | -128 372 | 373 | 374 | -127 375 | 376 | 377 | 127 378 | 379 | 380 | mid-IR SST standard deviation error; non L2P core field; signed byte array: WARNING Some applications are unable to properly handle signed byte values. 
If values are encountered > 127, please subtract 256 from this reported value 381 | 382 | 383 | 0.07874015719 384 | 385 | 386 | 10.00000000 387 | 388 | 389 | auxiliaryInformation 390 | 391 | 392 | sses_standard_deviation_4um 393 | 394 | 395 | /sses_standard_deviation_4um 396 | 397 | 398 | time lat lon 399 | 400 | 401 | 1 1015 677 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | SSES bias error based on proximity confidence flags 414 | 415 | 416 | kelvin 417 | 418 | 419 | -128 420 | 421 | 422 | -127 423 | 424 | 425 | 127 426 | 427 | 428 | mid-IR SST bias error; non L2P core field; signed byte array: WARNING Some applications are unable to properly handle signed byte values. If values are encountered > 127, please subtract 256 from this reported value 429 | 430 | 431 | 0.1574803144 432 | 433 | 434 | 0.000000000 435 | 436 | 437 | auxiliaryInformation 438 | 439 | 440 | sses_bias_4um 441 | 442 | 443 | /sses_bias_4um 444 | 445 | 446 | time lat lon 447 | 448 | 449 | 1 1015 677 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | SSES bias error based on proximity confidence flags 462 | 463 | 464 | kelvin 465 | 466 | 467 | -128 468 | 469 | 470 | -127 471 | 472 | 473 | 127 474 | 475 | 476 | thermal IR SST bias error; signed byte array: WARNING Some applications are unable to properly handle signed byte values. If values are encountered > 127, please subtract 256 from this reported value 477 | 478 | 479 | 0.1574803144 480 | 481 | 482 | 0.000000000 483 | 484 | 485 | auxiliaryInformation 486 | 487 | 488 | sses_bias 489 | 490 | 491 | /sses_bias 492 | 493 | 494 | time lat lon 495 | 496 | 497 | 1 1015 677 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | sea surface temperature 510 | 511 | 512 | kelvin 513 | 514 | 515 | -32767 516 | 517 | 518 | -1000 519 | 520 | 521 | 10000 522 | 523 | 524 | sea surface temperature from mid-IR (4 um) channels; non L2P core field 525 | 526 | 527 | 0.004999999888 528 | 529 | 530 | 273.1499939 531 | 532 | 533 | physicalMeasurement 534 | 535 | 536 | sea_surface_temperature_4um 537 | 538 | 539 | /sea_surface_temperature_4um 540 | 541 | 542 | time lat lon 543 | 544 | 545 | 1 1015 677 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | quality level of SST pixel 558 | 559 | 560 | -128 561 | 562 | 563 | 0 564 | 565 | 566 | 5 567 | 568 | 569 | mid-IR SST proximity confidence value; non L2P core field; signed byte array: WARNING Some applications are unable to properly handle signed byte values. 
If values are encountered > 127, please subtract 256 from this reported value 570 | 571 | 572 | 0 573 | 1 574 | 2 575 | 3 576 | 4 577 | 5 578 | 579 | 580 | no_data bad_data worst_quality low_quality acceptable_quality best_quality 581 | 582 | 583 | qualityInformation 584 | 585 | 586 | quality_level_4um 587 | 588 | 589 | /quality_level_4um 590 | 591 | 592 | time lat lon 593 | 594 | 595 | 1 1015 677 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | reference time of sst file 606 | 607 | 608 | time 609 | 610 | 611 | seconds since 1981-01-01 00:00:00 612 | 613 | 614 | time of first sensor observation 615 | 616 | 617 | coordinate 618 | 619 | 620 | time 621 | 622 | 623 | /time 624 | 625 | 626 | 1 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | latitude 635 | 636 | 637 | latitude 638 | 639 | 640 | degrees_north 641 | 642 | 643 | -999.0000000 644 | 645 | 646 | -90.00000000 647 | 648 | 649 | 90.00000000 650 | 651 | 652 | geographical coordinates, WGS84 projection 653 | 654 | 655 | coordinate 656 | 657 | 658 | lat 659 | 660 | 661 | /lat 662 | 663 | 664 | 1015 677 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | longitude 676 | 677 | 678 | longitude 679 | 680 | 681 | degrees_east 682 | 683 | 684 | -999.0000000 685 | 686 | 687 | -180.0000000 688 | 689 | 690 | 180.0000000 691 | 692 | 693 | geographical coordinates, WGS84 projection 694 | 695 | 696 | coordinate 697 | 698 | 699 | lon 700 | 701 | 702 | /lon 703 | 704 | 705 | 1015 677 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | CF-1.7, ACDD-1.3 715 | 716 | 717 | MODIS Aqua L2P SST 718 | 719 | 720 | Sea surface temperature retrievals produced at the NASA OBPG for the MODIS Aqua sensor. These have been reformatted to GHRSST GDS specifications by the JPL PO.DAAC 721 | 722 | 723 | GHRSST Data Processing Specification v2r5 724 | 725 | 726 | NASA/JPL/OBPG/RSMAS 727 | 728 | 729 | MODIS L2P created at JPL PO.DAAC 730 | 731 | 732 | L2P Core without DT analysis or other ancillary fields; Night, Start Node:Descending, End Node:Descending; WARNING Some applications are unable to properly handle signed byte values. If values are encountered > 127, please subtract 256 from this reported value; Quicklook 733 | 734 | 735 | GHRSST and PO.DAAC protocol allow data use as free and open. 
736 | 737 | 738 | MODIS_A-JPL-L2P-v2019.0 739 | 740 | 741 | org.ghrsst 742 | 743 | 744 | 2019.0 745 | 746 | 747 | f6e1f61d-c4a4-4c17-8354-0c15e12d688b 748 | 749 | 750 | 2.0 751 | 752 | 753 | 4.1 754 | 755 | 756 | 20200911T024514Z 757 | 758 | 759 | 3 760 | 761 | 762 | 1km 763 | 764 | 765 | 20200911T000001Z 766 | 767 | 768 | 20200911T000001Z 769 | 770 | 771 | 20200911T000458Z 772 | 773 | 774 | 20200911T000458Z 775 | 776 | 777 | -36.22299957 778 | 779 | 780 | -57.91799927 781 | 782 | 783 | 31.05480003 784 | 785 | 786 | -7.165909767 787 | 788 | 789 | MODIS sea surface temperature observations for the OBPG 790 | 791 | 792 | Aqua 793 | 794 | 795 | MODIS 796 | 797 | 798 | http://podaac.jpl.nasa.gov/ws/metadata/dataset/?format=iso&shortName=MODIS_A-JPL-L2P-v2019.0 799 | 800 | 801 | Oceans > Ocean Temperature > Sea Surface Temperature 802 | 803 | 804 | NASA Global Change Master Directory (GCMD) Science Keywords 805 | 806 | 807 | NetCDF Climate and Forecast (CF) Metadata Convention 808 | 809 | 810 | degrees_north 811 | 812 | 813 | 0.009999999776 814 | 815 | 816 | degrees_east 817 | 818 | 819 | 0.009999999776 820 | 821 | 822 | The MODIS L2P sea surface temperature data are sponsored by NASA 823 | 824 | 825 | Ed Armstrong, JPL PO.DAAC 826 | 827 | 828 | edward.m.armstrong@jpl.nasa.gov 829 | 830 | 831 | http://podaac.jpl.nasa.gov 832 | 833 | 834 | Group for High Resolution Sea Surface Temperature 835 | 836 | 837 | The GHRSST Project Office 838 | 839 | 840 | http://www.ghrsst.org 841 | 842 | 843 | ghrsst-po@nceo.ac.uk 844 | 845 | 846 | L2P 847 | 848 | 849 | swath 850 | 851 | 852 | Descending 853 | 854 | 855 | Descending 856 | 857 | 858 | Night 859 | 860 | 861 | 862 | -------------------------------------------------------------------------------- /tests/fixtures/3B-HHR.MS.MRG.3IMERG.20051022-S000000-E002959.0000.V06B.zarr.json: -------------------------------------------------------------------------------- 1 | { 2 | ".zgroup": { 3 | "zarr_format": 2 4 | }, 5 | "Grid/precipitationQualityIndex/.zarray": { 6 | "zarr_format": 2, 7 | "filters": null, 8 | "order": "C", 9 | "dtype": "