├── .gitattributes ├── examples ├── core │ └── output │ │ └── .gitignore ├── netcdf_engine │ ├── output │ │ └── .gitignore │ └── netcdf-to-tiledb-set-max-fragment-size.ipynb ├── xarray_engine │ ├── output │ │ └── .gitignore │ └── tiledb-xarray-partially-filled-arrays.ipynb └── README.md ├── setup.py ├── tiledb └── cf │ ├── __main__.py │ ├── xarray_engine │ ├── _common.py │ ├── __init__.py │ ├── engine.py │ ├── _encoding.py │ └── _array_wrapper.py │ ├── netcdf_engine │ ├── __init__.py │ ├── api.py │ └── _utils.py │ ├── testing.py │ ├── core │ ├── __init__.py │ ├── registry.py │ ├── source.py │ ├── _shared_dim.py │ ├── api.py │ ├── _dim_creator.py │ ├── _attr_creator.py │ └── _metadata.py │ ├── __init__.py │ ├── _utils.py │ └── cli.py ├── requirements_dev.txt ├── quarto-materials ├── tiledb-logo.png ├── Background-tdb-header.jpg ├── tiledb.css ├── tiledb-logo.svg └── tiledb.scss ├── .editorconfig ├── documentation ├── core.md ├── code-of-conduct.md ├── index.md ├── netcdf-engine.md ├── xarray-engine.md ├── contributing.md └── tiledb-cf-spec.md ├── .gitignore ├── pyproject.toml ├── tests ├── core │ ├── test_shared_dimension.py │ ├── test_dim_metadata.py │ ├── test_attr_metadata.py │ ├── test_array_metadata.py │ ├── test_group.py │ ├── test_fragment_writer.py │ └── test_write_array.py ├── netcdf_engine │ ├── test_netcdf_coord_to_dim_converter.py │ ├── test_convert_timestamp.py │ ├── test_open_netcdf_group.py │ ├── test_netcdf4_to_dim_converter.py │ ├── test_cli_netcdf_convert.py │ ├── test_utils.py │ ├── conftest.py │ ├── test_netcdf4_converter_array.py │ └── test_convert_multifragments.py └── xarray_engine │ ├── test_plugin_distributed.py │ ├── test_plugin_timestamp.py │ └── conftest.py ├── tools ├── lint.sh └── hooks │ └── pre-commit.sh ├── .github └── workflows │ ├── release.yml │ ├── quarto-render.yml │ └── ci.yml ├── LICENSE ├── README.md ├── setup.cfg └── _quarto.yml /.gitattributes: -------------------------------------------------------------------------------- 
1 | *.qmd linguist-language=RMarkdown 2 | -------------------------------------------------------------------------------- /examples/core/output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | setuptools.setup() 4 | -------------------------------------------------------------------------------- /examples/netcdf_engine/output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /examples/xarray_engine/output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tiledb/cf/__main__.py: -------------------------------------------------------------------------------- 1 | from tiledb.cf import cli 2 | 3 | cli() 4 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | isort 3 | flake8 4 | mypy 5 | flake8-bugbear 6 | pytest 7 | pytidylib 8 | -------------------------------------------------------------------------------- /quarto-materials/tiledb-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TileDB-Inc/TileDB-CF-Py/HEAD/quarto-materials/tiledb-logo.png -------------------------------------------------------------------------------- /quarto-materials/Background-tdb-header.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TileDB-Inc/TileDB-CF-Py/HEAD/quarto-materials/Background-tdb-header.jpg -------------------------------------------------------------------------------- /tiledb/cf/xarray_engine/_common.py: -------------------------------------------------------------------------------- 1 | # Group level metadata 2 | _ARRAY_FIXED_DIMS_PREFIX = "__tiledb_array_fixed_dimensions." 3 | _ATTR_PREFIX = "__tiledb_attr." 4 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | # Unix-style newlines with a newline ending every file 4 | [*] 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | indent_style = space 9 | indent_size = 4 10 | -------------------------------------------------------------------------------- /documentation/core.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: TileDB-CF Core 3 | --- 4 | 5 | :::{.callout-warning} 6 | The TileDB-CF library is still under initial development and changes may not be backward compatible. 7 | ::: 8 | 9 | Working with large multi-array datasets with complex metadata can be unwieldy. The core TileDB-CF library provides additional support for common operations. 
10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 4 | # Distribution / packaging 5 | build/ 6 | dist/ 7 | *.egg-info/ 8 | 9 | # Unit test / coverage reports 10 | .tox/ 11 | .pytest_cache/ 12 | .mypy_cache/ 13 | .coverage* 14 | 15 | # Quarto documentation 16 | /.quarto/ 17 | docs/.quarto 18 | /docs 19 | documentation/api 20 | objects.json 21 | 22 | # pyenv 23 | .python-version 24 | -------------------------------------------------------------------------------- /tiledb/cf/xarray_engine/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import xarray 3 | 4 | has_xarray = True 5 | 6 | except ImportError: 7 | has_xarray = False 8 | 9 | 10 | from .api import ( 11 | copy_data_from_xarray, 12 | copy_metadata_from_xarray, 13 | create_group_from_xarray, 14 | from_xarray, 15 | ) 16 | 17 | __all__ = [ 18 | "has_xarray", 19 | "copy_data_from_xarray", 20 | "copy_metadata_from_xarray", 21 | "create_group_from_xarray", 22 | "from_xarray", 23 | ] # type: ignore 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "build"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.black] 6 | line-length = 88 7 | target-version = ['py38'] 8 | exclude = ''' 9 | /( 10 | | \.git 11 | | \.mypy_cache 12 | | \.pytest_cache 13 | | docs 14 | | dist 15 | )/ 16 | ''' 17 | 18 | [tool.isort] 19 | profile = "black" 20 | multi_line_output = 3 21 | 22 | [tool.pylint.messages_control] 23 | disable = "C0330, C0326" 24 | 25 | [tool.pylint.format] 26 | max-line-length = "88" 27 | 28 | [tool.pytest.ini_options] 29 | markers = [ 30 | "flaky: flaky tests", 31 | "network: 
"""NetCDF-to-TileDB conversion engine.

Exposes ``from_netcdf`` unconditionally and, when the optional ``netCDF4``
dependency is installed, the full set of NetCDF4 converter classes.
"""

try:
    import netCDF4

    has_netCDF4 = True

except ImportError:
    # netCDF4 is an optional dependency; the engine degrades gracefully.
    has_netCDF4 = False

from .api import from_netcdf

__all__ = ["has_netCDF4", "from_netcdf"]  # type: ignore

if has_netCDF4:
    from ._array_converters import NetCDF4ArrayConverter, NetCDF4DomainConverter
    from ._attr_converters import NetCDF4VarToAttrConverter
    from ._dim_converters import (
        NetCDF4CoordToDimConverter,
        NetCDF4DimToDimConverter,
        NetCDF4ScalarToDimConverter,
        NetCDF4ToDimConverter,
    )
    from ._utils import open_netcdf_group
    from .converter import NetCDF4ConverterEngine

    # Export every conditionally imported public name, not just the converter
    # engine, so `import *` and introspection match the module's contents.
    __all__.extend(
        [
            "NetCDF4ArrayConverter",
            "NetCDF4DomainConverter",
            "NetCDF4VarToAttrConverter",
            "NetCDF4CoordToDimConverter",
            "NetCDF4DimToDimConverter",
            "NetCDF4ScalarToDimConverter",
            "NetCDF4ToDimConverter",
            "open_netcdf_group",
            "NetCDF4ConverterEngine",
        ]
    )
"""Core TileDB-CF functionality."""

from ._array_creator import ArrayCreator, DomainCreator
from ._attr_creator import AttrCreator
from ._dataspace_creator import DataspaceCreator
from ._dim_creator import DimCreator
from ._metadata import (
    ATTR_METADATA_FLAG,
    DIM_METADATA_FLAG,
    ArrayMetadata,
    AttrMetadata,
    DimMetadata,
)
from ._shared_dim import SharedDim
from .api import create_group, open_group_array
from .source import NumpyData

# `__all__` must contain *strings* naming the public objects. Listing the
# objects themselves (as before) makes `from tiledb.cf.core import *` raise
# "TypeError: Item in __all__ must be str", and confuses doc/introspection
# tools. The sibling netcdf_engine/xarray_engine packages already use strings.
__all__ = [
    "ATTR_METADATA_FLAG",
    "DIM_METADATA_FLAG",
    "ArrayCreator",
    "AttrCreator",
    "ArrayMetadata",
    "AttrMetadata",
    "DataspaceCreator",
    "DimCreator",
    "DimMetadata",
    "DomainCreator",
    "NumpyData",
    "SharedDim",
    "create_group",
    "open_group_array",
]
"""``tiledb.cf`` is the core module for the TileDB-CF-Py library.

This module contains core classes and functions for supporting the NetCDF data model in
the `TileDB storage engine <https://github.com/TileDB-Inc/TileDB>`_. To use this module
simply import using:

.. code-block:: python

    import tiledb.cf
"""

from .cli import cli
from .core import (
    ATTR_METADATA_FLAG,
    DIM_METADATA_FLAG,
    ArrayMetadata,
    AttrMetadata,
    DataspaceCreator,
    DimMetadata,
    create_group,
    open_group_array,
)
from .netcdf_engine import from_netcdf, has_netCDF4
from .xarray_engine import (
    copy_data_from_xarray,
    copy_metadata_from_xarray,
    create_group_from_xarray,
    from_xarray,
    has_xarray,
)

# NetCDF4ConverterEngine is only importable when the optional netCDF4
# dependency is installed (see tiledb/cf/netcdf_engine/__init__.py).
if has_netCDF4:
    from .netcdf_engine import NetCDF4ConverterEngine
## NetCDF Engine

The [netcdf_engine](./netcdf_engine) subdirectory includes examples for converting NetCDF to TileDB.

* [NetCDF to TileDB conversion basics](./netcdf_engine/netcdf-to-tiledb-basics.ipynb)
* [Setting tile size for TileDB arrays](./netcdf_engine/netcdf-to-tiledb-set-tiles.ipynb)
* [Setting the max fragment size or max chunk size for copying data from NetCDF to TileDB](./netcdf_engine/netcdf-to-tiledb-set-max-fragment-size.ipynb)

## Xarray Backend

The [xarray_engine](./xarray_engine) subdirectory includes examples for using the TileDB backend for xarray.
import numpy as np
import pytest

from tiledb.cf.netcdf_engine import NetCDF4CoordToDimConverter

# Skip this whole module when the optional netCDF4 dependency is missing.
netCDF4 = pytest.importorskip("netCDF4")


def test_coord_converter_simple():
    """A 1-D NetCDF coordinate variable converts with name and dtype preserved."""
    # diskless=True keeps the dataset entirely in memory; nothing is written.
    with netCDF4.Dataset("example.nc", mode="w", diskless=True) as dataset:
        dataset.createDimension("x", 4)
        x = dataset.createVariable("x", datatype=np.float64, dimensions=("x",))
        converter = NetCDF4CoordToDimConverter.from_netcdf(x)
        assert converter.name == "x"
        assert converter.dtype == np.dtype("float64")
        # No domain is inferred for the coordinate at conversion time.
        assert converter.domain is None


def test_bad_size_error():
    """A variable with more than one dimension cannot be used as a coordinate dim."""
    with netCDF4.Dataset("example.nc", mode="w", diskless=True) as group:
        group.createDimension("x", 16)
        group.createDimension("y", 16)
        x = group.createVariable(
            "x", datatype=np.dtype("float64"), dimensions=("x", "y")
        )
        # A coordinate must be 1-D with matching name; 2-D input is rejected.
        with pytest.raises(ValueError):
            NetCDF4CoordToDimConverter.from_netcdf(x)
pypa/gh-action-pypi-publish@release/v1 24 | with: 25 | user: __token__ 26 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 27 | repository_url: https://test.pypi.org/legacy/ 28 | - name: Publish package to PyPI 29 | if: github.event_name == 'release' && github.event.action == 'published' && github.repository == 'TileDB-Inc/TileDB-CF-Py' 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 TileDB, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/netcdf_engine/test_convert_timestamp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import tiledb 5 | from tiledb.cf import NetCDF4ConverterEngine 6 | 7 | netCDF4 = pytest.importorskip("netCDF4") 8 | 9 | 10 | class TestCopyAtTimestamp: 11 | """Test copying a simple NetCDF file at a specified timestamp. 12 | 13 | NetCDF File: 14 | 15 | dimensions: 16 | x (8) 17 | 18 | variables: 19 | f (x) = np.linspace(0, 1, 8) 20 | """ 21 | 22 | attr_data = np.linspace(0, 1, 8) 23 | 24 | def test_copy_to_timestamp(self, tmpdir): 25 | uri = str(tmpdir.mkdir("output").join("timestamp_array")) 26 | timestamp = 1 27 | with netCDF4.Dataset("tmp", mode="w", diskless=True) as dataset: 28 | dataset.setncatts({"title": "test timestamp"}) 29 | dataset.createDimension("x", 8) 30 | var = dataset.createVariable("f", np.float64, ("x",)) 31 | var[:] = self.attr_data 32 | converter = NetCDF4ConverterEngine.from_group(dataset) 33 | converter.convert_to_array( 34 | uri, input_netcdf_group=dataset, timestamp=timestamp 35 | ) 36 | with tiledb.open(uri, timestamp=(1, 1)) as array: 37 | assert array.meta["title"] == "test timestamp" 38 | result_data = array[:]["f"] 39 | np.testing.assert_equal(self.attr_data, result_data) 40 | -------------------------------------------------------------------------------- /tests/netcdf_engine/test_open_netcdf_group.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tiledb.cf.netcdf_engine import open_netcdf_group 4 | 5 | netCDF4 = pytest.importorskip("netCDF4") 6 | 7 | 8 | def test_open_netcdf_group_with_group(tmpdir): 9 | with netCDF4.Dataset("example.nc", mode="w", diskless=True) as dataset: 10 | with open_netcdf_group(dataset) as group: 11 | assert isinstance(group, netCDF4.Dataset) 12 | assert group == dataset 13 | 14 | 15 
def test_open_netcdf_group_with_file(tmpdir):
    """Opening by file path + group path yields the nested netCDF4 Group."""
    filepath = str(tmpdir.mkdir("open_group").join("simple_dataset.nc"))
    with netCDF4.Dataset(filepath, mode="w") as dataset:
        group1 = dataset.createGroup("group1")
        group1.createGroup("group2")
    # File is closed above before being reopened by path -- TODO confirm
    # this matches the original indentation (reopen-after-close pattern).
    with open_netcdf_group(input_file=filepath, group_path="/group1/group2") as group:
        assert isinstance(group, netCDF4.Group)
        assert group.path == "/group1/group2"


def test_open_netcdf_group_bad_type_error():
    """A positional argument that is not a Dataset/Group raises TypeError."""
    with pytest.raises(TypeError):
        with open_netcdf_group("input_file"):
            pass


def test_open_netcdf_group_no_file_error():
    """Calling with neither a group nor an input file raises ValueError."""
    with pytest.raises(ValueError):
        with open_netcdf_group():
            pass


def test_open_netcdf_group_no_group_error():
    """An input file without a group path raises ValueError.

    NOTE(review): presumably group_path is required alongside input_file;
    confirm against open_netcdf_group's implementation.
    """
    with pytest.raises(ValueError):
        with open_netcdf_group(input_file="test.nc"):
            pass
NetCDF4ToDimConverter(self.shared_dim) 30 | dim_converter.max_fragment_length = 1 31 | assert dim_converter.max_fragment_length == 1 32 | 33 | def test_bad_max_fragment_length_error(self): 34 | """Tests error when setting an invalid max_fragment_length.""" 35 | dim_converter = NetCDF4ToDimConverter(self.shared_dim) 36 | with pytest.raises(ValueError): 37 | dim_converter.max_fragment_length = 0 38 | -------------------------------------------------------------------------------- /.github/workflows/quarto-render.yml: -------------------------------------------------------------------------------- 1 | # Cloned from https://github.com/TileDB-Inc/tiledb-quarto-template 2 | 3 | name: Render and deploy Quarto files 4 | on: 5 | push: 6 | pull_request: 7 | 8 | jobs: 9 | quarto-render-and-deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: "Install Quarto" 15 | uses: quarto-dev/quarto-actions/setup@v2 16 | with: 17 | version: 0.9.141 18 | 19 | - name: "Setup Python" 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: "Upgrade pip" 25 | run: python -m pip install --upgrade pip 26 | 27 | - name: "Install Python Dependencies" 28 | run: python -m pip install ".[docs]" 29 | 30 | - name: "Quarto render" 31 | shell: bash 32 | run: | 33 | quartodoc build 34 | quarto render --fail-if-warnings 35 | # https://github.com/quarto-dev/quarto-cli/issues/493 36 | 37 | - name: "Deploy to gh-pages" 38 | uses: peaceiris/actions-gh-pages@v3 39 | # Change to the name of your repo's primary branch name: 40 | if: github.ref == 'refs/heads/dev' 41 | with: 42 | # This is GitHub Actions magic; no secrets for us to manage; and this works first-time 43 | # without any extra configs other than visiting Settings -> Pages in your GitHub repo. 
44 | github_token: ${{ secrets.GITHUB_TOKEN }} 45 | publish_dir: docs 46 | destination_dir: docs 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TileDB logo 2 | 3 | # TileDB-CF-Py 4 | 5 | The TileDB-CF-Py library is a Python library for supporting the NetCDF data model in the [TileDB storage engine](https://github.com/TileDB-Inc/TileDB). TileDB-CF-Py provides readers and writers for viewing and manipulating TileDB arrays and groups using TileDB CF Dataspaces - a special TileDB group that follows the requirements in [docs/source/tiledb-cf-spec.md](documentation/tiledb-cf-spec.md). 6 | 7 | :warning: This library is still under initial development and changes may not be backward compatible. 8 | 9 | ## TileDB Quick Links 10 | 11 | * [Homepage](https://tiledb.com) 12 | * [Documentation](https://docs.tiledb.com/main/) 13 | * [Forum](https://forum.tiledb.io/) 14 | * [Organization](https://github.com/TileDB-Inc/) 15 | 16 | ## Getting Started 17 | 18 | ### Quick Installation 19 | 20 | This project is available from [PyPI](https://pypi.org/project/tiledb-cf/) and may be installed with ``pip``: 21 | 22 | ```bash 23 | pip install tiledb-cf 24 | ``` 25 | 26 | ### Documentation 27 | 28 | Documentation is available at: [https://tiledb-inc.github.io/TileDB-CF-Py](https://tiledb-inc.github.io/TileDB-CF-Py/) 29 | 30 | ### Example Notebooks 31 | 32 | Example Jupyter notebooks are available in the [examples](./examples) folder. 33 | 34 | 35 | ## Development 36 | 37 | For information on contributing to this project see the [contributing](documentation/contributing.md) document and the [code of conduct](documentation/code-of-conduct.md). 
38 | -------------------------------------------------------------------------------- /quarto-materials/tiledb.css: -------------------------------------------------------------------------------- 1 | /* 2 | Cloned from https://github.com/TileDB-Inc/tiledb-quarto-template 3 | 4 | tiledb light blue #4d9fff 5 | tiledb dark blue #0a2580 6 | */ 7 | 8 | .navbar-nav:hover .nav-link:hover { 9 | color: #4d9fff; 10 | } 11 | 12 | .nav-page:hover .nav-page-previous:hover { 13 | color: #4d9fff; 14 | } 15 | 16 | .nav-page:hover .nav-page-next:hover { 17 | color: #4d9fff; 18 | } 19 | 20 | .nav-page:hover .nav-page-text:hover { 21 | color: #4d9fff; 22 | } 23 | 24 | .toc-actions a:hover { 25 | color: #4d9fff; 26 | } 27 | 28 | .page-navigation:hover { 29 | color: #4d9fff; 30 | } 31 | 32 | a.pagination-link:hover { 33 | color: #4d9fff; 34 | } 35 | 36 | .sidebar-navigation .text-start { 37 | font-weight: bold; 38 | } 39 | 40 | .sidebar.sidebar-navigation .active { 41 | /* 42 | color: #800000; 43 | background-color: #e0e0e0; 44 | */ 45 | } 46 | 47 | .sidebar.sidebar-navigation .active, 48 | .sidebar.sidebar-navigation .show > .nav-link { 49 | /*color: #0a2580;*/ 50 | color: #2c4396; 51 | background-color: #e0e0e0; 52 | padding-left: 4px; 53 | padding-right: 4px; 54 | } 55 | 56 | a { 57 | color: #2c4396; 58 | } 59 | a:before, 60 | a:focus, 61 | a:hover, 62 | a:link, 63 | a:visited { 64 | color: #4629c9; 65 | } 66 | 67 | code, 68 | p code:not(.sourceCode), 69 | li code:not(.sourceCode), 70 | kbd, 71 | pre { 72 | color: #000000; 73 | background-color: #f0f0f0; 74 | font-size: 12px; 75 | direction: ltr; 76 | border-radius: 3px; 77 | } 78 | 79 | pre { 80 | font-size: 12px; 81 | padding: 10px; 82 | text-decoration: none; 83 | 84 | white-space: pre-wrap; /* css-3 */ 85 | white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ 86 | white-space: -pre-wrap; /* Opera 4-6 */ 87 | white-space: -o-pre-wrap; /* Opera 7 */ 88 | } 89 | 
-------------------------------------------------------------------------------- /tiledb/cf/core/registry.py: -------------------------------------------------------------------------------- 1 | """Create a name registry for use in modifying grouped objects.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Optional, TypeVar 6 | 7 | from typing_extensions import Protocol, Self 8 | 9 | T = TypeVar("T") 10 | 11 | 12 | class Registry(Protocol[T]): 13 | def __delitem__(self, name: str): 14 | """Delete the element with the provided name.""" 15 | 16 | def __getitem__(self, name: str) -> T: 17 | """Get the element with the provided name.""" 18 | 19 | def __setitem__(self, name: str, value: T): 20 | """Set the elemetn with the provided name to the provided value.""" 21 | 22 | def rename(self, old_name: str, new_name: str): 23 | """Rename an element of the registry. 24 | 25 | If the rename fails, the registry should be left unchanged. 26 | """ 27 | 28 | 29 | class RegisteredByNameMixin: 30 | def __init__(self, name: str, registry: Optional[Registry[Self]]): 31 | self._name = name 32 | self._registry: Optional[Registry[Self]] = None 33 | self.set_registry(registry) 34 | 35 | @property 36 | def is_registered(self) -> bool: 37 | return self._registry is not None 38 | 39 | @property 40 | def name(self) -> str: 41 | return self._name 42 | 43 | @name.setter 44 | def name(self, name: str): 45 | if self._registry is not None: 46 | self._registry.rename(self.name, name) 47 | self._name = name 48 | 49 | def set_registry(self, registry: Optional[Registry[Self]]): 50 | if self._registry is not None: 51 | raise ValueError("Registry is already set.") 52 | if registry is not None: 53 | registry[self.name] = self 54 | self._registry = registry 55 | -------------------------------------------------------------------------------- /tests/netcdf_engine/test_cli_netcdf_convert.py: -------------------------------------------------------------------------------- 1 | import 
import numpy as np
from click.testing import CliRunner

import tiledb
import tiledb.cf


def _convert_and_check(tmpdir, input_filepath, mode_flag, array_name):
    """Run the ``netcdf-convert`` CLI and verify the resulting array.

    Invokes the command on ``input_filepath`` with the given mode flag, then
    checks that the named output array has a single ``x1`` attribute over a
    ``row`` dimension holding ``np.linspace(1.0, 4.0, 8)``.
    """
    uri = str(tmpdir.mkdir("output").join("simple1"))
    runner = CliRunner()
    result = runner.invoke(
        tiledb.cf.cli,
        ["netcdf-convert", "-i", input_filepath, "-o", uri, mode_flag],
    )
    assert result.exit_code == 0
    array_uri = f"{uri}/{array_name}"
    array_schema = tiledb.ArraySchema.load(array_uri)
    assert [attr.name for attr in array_schema] == ["x1"]
    assert [dim.name for dim in array_schema.domain] == ["row"]
    with tiledb.open(array_uri, attr="x1") as array:
        np.testing.assert_equal(array[:], np.linspace(1.0, 4.0, 8))


def test_netcdf_convert_collect(tmpdir, simple1_netcdf_file):
    """--collect-attrs stores all attributes in a single 'array0'."""
    _convert_and_check(
        tmpdir, simple1_netcdf_file.filepath, "--collect-attrs", "array0"
    )


def test_netcdf_convert_separate(tmpdir, simple1_netcdf_file):
    """--array-per-attr stores each attribute in its own array ('x1')."""
    _convert_and_check(
        tmpdir, simple1_netcdf_file.filepath, "--array-per-attr", "x1"
    )
python-version: '3.11' 13 | - name: Display Python version 14 | run: python -c "import sys; print(sys.version)" 15 | - name: Upgrade pip 16 | run: python -m pip install --upgrade pip 17 | - name: Checkout 18 | uses: actions/checkout@v2 19 | - name: Install dependencies 20 | run: python -m pip install -r requirements_dev.txt 21 | - name: Run black 22 | run: black --check . 23 | - name: Run isort 24 | run: isort --check . 25 | - name: Run flake8 26 | run: flake8 --statistics . 27 | 28 | tests: 29 | runs-on: ${{ matrix.os }} 30 | strategy: 31 | fail-fast: false 32 | matrix: 33 | os: [ubuntu-latest, macos-latest, windows-latest] 34 | python-version: ['3.8', '3.9', '3.10', '3.11'] 35 | steps: 36 | - name: Setup Python 37 | uses: actions/setup-python@v2 38 | with: 39 | python-version: ${{ matrix.python-version }} 40 | - name: Display Python version 41 | run: python -c "import sys; print(sys.version)" 42 | - name: Upgrade pip 43 | run: python -m pip install --upgrade pip 44 | - name: Install Pytest 45 | run: python -m pip install pytest pytest-cov 46 | - name: Checkout 47 | uses: actions/checkout@v2 48 | - name: Install TileDB-CF-Py 49 | run: python -m pip install ".[netCDF4,xarray,parallel]" 50 | - name: Test with coverage 51 | run: | 52 | python -m pytest --cov-report term-missing --cov-report=html:coverage --cov-report=xml:coverage/coverage.xml --cov="tiledb/cf" 53 | - name: Archive code coverage results 54 | uses: actions/upload-artifact@v2 55 | with: 56 | name: code-coverage-report 57 | path: coverage 58 | -------------------------------------------------------------------------------- /tests/xarray_engine/test_plugin_distributed.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | xr = pytest.importorskip("xarray") # isort:skip 6 | dask = pytest.importorskip("dask") # isort:skip 7 | distributed = pytest.importorskip("distributed") # isort:skip 8 | 9 | from dask.distributed import Client 10 
from distributed.utils_test import cleanup, cluster, loop, loop_in_thread  # noqa: F401
from xarray.tests import assert_allclose

# Skip the whole module on interpreters too old for current xarray.
pytestmark = pytest.mark.skipif(
    sys.version_info < (3, 9), reason="xarray requires python3.9 or higher"
)

da = pytest.importorskip("dask.array")
loop = loop  # loop is an imported fixture, which flake8 has issues ack-ing


def test_dask_distributed_tiledb_integration_test(loop, create_tiledb_example):
    """Open a TileDB-backed dataset with dask chunks on a distributed cluster
    and check the computed result matches the expected dataset."""
    array_uri, expected = create_tiledb_example
    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop):
            ds = xr.open_dataset(array_uri, chunks={"time": 1}, engine="tiledb")
            # A chunked open must produce lazy dask arrays, not eager numpy.
            assert isinstance(ds["pressure"].data, da.Array)
            actual = ds.compute()
            assert_allclose(actual, expected)


# Fix: the original implicitly-concatenated reason string was missing a space
# and rendered as "...backend engineafter xarray...".
@pytest.mark.skip(
    reason="failing test of deprecated engine. Will implement for new backend engine "
    "after xarray implements better non-nanosecond datetime support."
)
def test_dask_distributed_tiledb_datetime_integration_test(
    loop,
    create_tiledb_datetime_example,
):
    """Same distributed round-trip as above, but for the deprecated engine's
    datetime handling (currently skipped, see the skip reason)."""
    array_uri, expected = create_tiledb_datetime_example
    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop):
            # Opening via the deprecated engine is expected to warn.
            with pytest.deprecated_call():
                ds = xr.open_dataset(
                    array_uri,
                    chunks={"date": 1},
                    use_deprecated_engine=True,
                    engine="tiledb",
                )
            assert isinstance(ds["temperature"].data, da.Array)
            actual = ds.compute()
            assert_allclose(actual, expected)
data.""" 11 | 12 | @property 13 | def metadata(self) -> Mapping[str, Any]: 14 | """A mapping of metadata string-to-value pairs.""" 15 | 16 | @property 17 | def shape(self) -> Optional[Tuple[int, ...]]: 18 | """Shape of the data, or `None` if no shape.""" 19 | 20 | @shape.setter 21 | def shape(self, new_shape: Tuple[int, ...]): 22 | """Set the shape to `new_shape`.""" 23 | 24 | @property 25 | def size(self) -> int: 26 | """Size of the data.""" 27 | 28 | @property 29 | def values(self) -> np.array: 30 | """Data values.""" 31 | 32 | 33 | class NumpyData: 34 | def __init__( 35 | self, input: np.array, *, metadata: Optional[Mapping[str, Any]] = None 36 | ): 37 | self._source_data = input 38 | self._metadata = dict() if metadata is None else dict(metadata) 39 | 40 | @property 41 | def dtype(self): 42 | return self._source_data.dtype 43 | 44 | @property 45 | def metadata(self): 46 | return self._metadata 47 | 48 | @property 49 | def shape(self): 50 | return self._source_data.shape 51 | 52 | @shape.setter 53 | def shape(self, new_shape): 54 | self._source_data = np.reshape(self._source_data, new_shape) 55 | 56 | @property 57 | def size(self): 58 | return self._source_data.size 59 | 60 | @property 61 | def values(self): 62 | return self._source_data 63 | 64 | 65 | def create_field_data( 66 | source: Union[np.ndarray, int, FieldData], dtype: np.dtype 67 | ) -> FieldData: 68 | if isinstance(source, np.ndarray): 69 | field_data = NumpyData(source.astype(dtype)) 70 | elif isinstance(source, int): 71 | field_data = NumpyData(np.ndarray(source, dtype=dtype)) 72 | else: 73 | field_data = source 74 | return field_data 75 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = tiledb-cf 3 | version = 0.9.1 4 | description = TileDB Python library for supporting Climate and Forecast datasets. 5 | author = TileDB, Inc. 
6 | author_email = help@tiledb.io 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | license = MIT 10 | keywords = tiledb, climate, forecast, netcdf 11 | url = https://github.com/TileDB-Inc/TileDB-CF-Py 12 | project_urls = 13 | Documentation = https://docs.tiledb.com 14 | classifiers = 15 | Development Status :: 3 - Alpha 16 | Intended Audience :: Developers 17 | Intended Audience :: Information Technology 18 | Intended Audience :: Science/Research 19 | License :: OSI Approved :: MIT License 20 | Operating System :: OS Independent 21 | Programming Language :: Python :: 3 22 | Programming Language :: Python :: 3.7 23 | Programming Language :: Python :: 3.8 24 | Programming Language :: Python :: 3.9 25 | Programming Language :: Python :: Implementation :: PyPy 26 | Topic :: Software Development 27 | 28 | [options] 29 | zip_safe = False 30 | packages = 31 | tiledb.cf 32 | tiledb.cf.core 33 | tiledb.cf.netcdf_engine 34 | tiledb.cf.xarray_engine 35 | python_requires = >=3.7 36 | install_requires = 37 | numpy >= 1.16.5 38 | setuptools >= 40.4 39 | tiledb >= 0.21.2 40 | click >= 0.7.0 41 | typing-extensions >= 4.0.0 42 | 43 | [options.extras_require] 44 | netCDF4 = netCDF4 45 | xarray = xarray >= 0.18.0 46 | parallel = dask[complete] 47 | complete = 48 | %(netCDF4)s 49 | %(xarray)s 50 | %(parallel)s 51 | docs = 52 | quartodoc 53 | matplotlib 54 | jupyter 55 | %(complete)s 56 | 57 | [options.entry_points] 58 | console_scripts = 59 | tiledb-cf = tiledb.cf:cli 60 | xarray.backends = 61 | tiledb = tiledb.cf.xarray_engine.engine:TileDBXarrayBackendEntrypoint 62 | 63 | 64 | [flake8] 65 | ignore = E41,E203,E226,E302,E402,W503,B024 66 | max-line-length = 88 67 | exclude = docs/* ./.* 68 | max-complexity = 10 69 | per-file-ignores = __init__.py:F401 70 | 71 | [mypy] 72 | ignore_missing_imports = True 73 | exclude = conftest.py 74 | -------------------------------------------------------------------------------- /tiledb/cf/_utils.py: 
def check_valid_group(group_uri, ctx):
    """Raise a ValueError if the provided URI is not for a TileDB group.

    Parameters
    ----------
    group_uri
        URI of the TileDB object to check.
    ctx
        TileDB context used to query the object type.

    Raises
    ------
    ValueError
        If the object at ``group_uri`` is not a TileDB group.
    """
    object_type = tiledb.object_type(group_uri, ctx=ctx)
    if object_type != "group":
        # Fix: original message read "is no a valid TileDB group".
        raise ValueError(
            f"Cannot open group at URI '{group_uri}'. TileDB object with "
            f"type '{object_type}' is not a valid TileDB group."
        )


def get_array_key(
    key: Optional[Union[Dict[str, str], str]], array_name
) -> Optional[str]:
    """Returns a key for the array with name ``array_name``.

    Parameters
    ----------
    key
        If not ``None``, encryption key, or dictionary of encryption keys, to
        decrypt arrays. If a dictionary, the key is looked up by array name;
        arrays with no entry get ``None``.
    array_name
        Name of the array to decrypt.

    Returns
    -------
    Optional[str]
        Key for the array with name ``array_name``.
    """
    return key.get(array_name) if isinstance(key, dict) else key


def get_array_uri(group_uri: str, array_name: str) -> str:
    """Returns a URI for an array with name ``array_name`` inside a group at URI
    ``group_uri``.

    This method is only needed for creating relative arrays before adding them
    to a group.

    Parameters
    ----------
    group_uri
        URI of the group containing the array.
    array_name
        Name of the array.

    Returns
    -------
    str
        Array URI of an array with name ``array_name`` inside a group at URI
        ``group_uri``.
    """
    return os.path.join(group_uri, array_name)


def safe_set_metadata(meta, key, value):
    """Set a metadata item, converting numpy values to types that TileDB
    metadata supports.

    Numpy arrays are stored as tuples of their elements and numpy scalars as
    one-element tuples; all other values are stored unchanged. (The original
    docstring claimed errors were caught as warnings; no error handling is
    performed here.)
    """
    if isinstance(value, np.ndarray):
        value = tuple(value.tolist())
    elif isinstance(value, np.generic):
        value = (value.tolist(),)
    meta[key] = value
64 | """ 65 | return os.path.join(group_uri, array_name) 66 | 67 | 68 | def safe_set_metadata(meta, key, value): 69 | """Copy a metadata item to a TileDB array catching any errors as warnings.""" 70 | if isinstance(value, np.ndarray): 71 | value = tuple(value.tolist()) 72 | elif isinstance(value, np.generic): 73 | value = (value.tolist(),) 74 | meta[key] = value 75 | -------------------------------------------------------------------------------- /tiledb/cf/core/_shared_dim.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional, Tuple 4 | 5 | import numpy as np 6 | from tiledb.datatypes import DataType 7 | from typing_extensions import Self 8 | 9 | from .._utils import DType 10 | from .registry import RegisteredByNameMixin, Registry 11 | 12 | 13 | class SharedDim(RegisteredByNameMixin): 14 | """Definition for the name, domain and data type of a collection of dimensions. 15 | 16 | Parameters 17 | ---------- 18 | name 19 | The name of the shared dimension. 20 | domain 21 | The domain for the shared dimension. 22 | dtype 23 | The datatype of the shared dimension. 24 | registry 25 | If provided, a registry for the shared dimension. 
26 | """ 27 | 28 | def __init__( 29 | self, 30 | name: str, 31 | domain: Optional[Tuple[Optional[DType], Optional[DType]]], 32 | dtype: np.dtype, 33 | *, 34 | registry: Optional[Registry[Self]] = None, 35 | ): 36 | self._name = name 37 | self.domain = domain 38 | self.dtype = DataType.from_numpy(dtype).np_dtype 39 | super().__init__(name, registry) 40 | 41 | def __eq__(self, other): 42 | if not isinstance(other, self.__class__) or not isinstance( 43 | self, other.__class__ 44 | ): 45 | return False 46 | return ( 47 | self.name == other.name 48 | and self.domain == other.domain 49 | and self.dtype == other.dtype 50 | ) 51 | 52 | def __repr__(self) -> str: 53 | return ( 54 | f"SharedDim(name={self.name}, domain={self.domain}, dtype='{self.dtype!s}')" 55 | ) 56 | 57 | def html_input_summary(self) -> str: 58 | """Returns a HTML string summarizing the input for the dimension.""" 59 | return "" 60 | 61 | def html_output_summary(self) -> str: 62 | """Returns a string HTML summary of the :class:`SharedDim`.""" 63 | return f"name={self.name}, domain={self.domain}, dtype='{self.dtype!s}'" 64 | 65 | @property 66 | def is_index_dim(self) -> bool: 67 | """Returns ``True`` if this is an `index dimension` and ``False`` otherwise. 68 | 69 | An index dimension is a dimension with an integer data type and whose domain 70 | starts at 0. 
71 | """ 72 | if self.domain: 73 | return np.issubdtype(self.dtype, np.integer) and self.domain[0] == 0 74 | return False 75 | -------------------------------------------------------------------------------- /quarto-materials/tiledb-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /documentation/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: TileDB-CF Code of Conduct 3 | --- 4 | 5 | ## Introduction 6 | 7 | All participants in TileDB spaces are expected to adhere to high standards of 8 | professionalism in all interactions. These standards include, but are not 9 | limited to, the specific behaviors outlined below. Upholding these standards 10 | is fundamental to our commitment to create a welcoming, positive, and 11 | inclusive environment for everyone. We as contributors and maintainers 12 | pledge to making participation in our project and our community a 13 | harassment-free experience for everyone, regardless of age, body 14 | size, disability, ethnicity, gender identity and expression, level 15 | of experience, nationality, personal appearance, race, religion, or 16 | sexual identity and orientation. 17 | 18 | ### Our Standards 19 | 20 | Examples of behavior that contributes to creating a positive environment 21 | include: 22 | 23 | * Using welcoming and inclusive language 24 | * Being respectful of differing viewpoints and experiences 25 | * Gracefully accepting constructive criticism 26 | * Focusing on what is best for the community 27 | * Showing empathy towards other community members 28 | 29 | All of these serve to help the make the project better, but also serve to make 30 | the experience of participating in the project better as well. 
Please contact [conduct@tiledb.com](mailto:conduct@tiledb.com). All code of conduct
reports will be kept in confidence.
@pytest.mark.parametrize(
    "input_dtype,scale_factor,add_offset,output_dtype",
    (
        (np.int16, None, None, np.int16),
        (np.int16, np.float32(1), None, np.float32),
        (np.int16, None, np.float32(1), np.float32),
        (np.int16, np.float64(1), np.float32(1), np.float64),
    ),
)
def test_unpacked_dtype(input_dtype, scale_factor, add_offset, output_dtype):
    """Tests computing the unpacked data type for a NetCDF variable.

    The unpacked dtype is the result type after applying ``scale_factor``
    and/or ``add_offset`` (when set) to the raw variable values.
    """
    # diskless=True keeps the scratch NetCDF dataset in memory only.
    with netCDF4.Dataset("tmp.nc", diskless=True, mode="w") as dataset:
        dataset.createDimension("t", None)
        variable = dataset.createVariable("x", dimensions=("t",), datatype=input_dtype)
        if scale_factor is not None:
            variable.setncattr("scale_factor", scale_factor)
        if add_offset is not None:
            variable.setncattr("add_offset", add_offset)
        dtype = get_unpacked_dtype(variable)
        assert dtype == output_dtype


def test_unpacked_dtype_unsupported_dtype_error():
    """Tests attempting to unpack a NetCDF variable with a data type that does not
    support packing/unpacking."""
    with netCDF4.Dataset("tmp.nc", diskless=True, mode="w") as dataset:
        # "S1" (char) variables cannot be scaled/offset.
        variable = dataset.createVariable("x", dimensions=tuple(), datatype="S1")
        with pytest.raises(ValueError):
            get_unpacked_dtype(variable)


@pytest.mark.parametrize(
    "value, expected_result",
    (
        (np.float64(1), np.float64(1)),
        (np.array((1), dtype=np.float64), np.float64(1)),
        (np.array([1], dtype=np.int32), np.int32(1)),
    ),
)
def test_get_netcdf_metadata_number(value, expected_result):
    """Tests reading a numeric NetCDF attribute back as a scalar number."""
    key = "name"
    with netCDF4.Dataset("tmp.nc", diskless=True, mode="w") as dataset:
        dataset.setncattr(key, value)
        result = get_netcdf_metadata(dataset, key, is_number=True)
        assert result == expected_result


@pytest.mark.parametrize("value", (("",), (1, 2)))
def test_get_netcdf_metadata_number_with_warning(value):
    """Tests that requesting a non-scalar or non-numeric NetCDF attribute as a
    number issues a warning and returns ``None``."""
    key = "name"
    with netCDF4.Dataset("tmp.nc", diskless=True, mode="w") as dataset:
        dataset.setncattr(key, value)
        with pytest.warns(Warning):
            result = get_netcdf_metadata(dataset, key, is_number=True)
        assert result is None
from typing import Optional

import click
import numpy as np

from .netcdf_engine import from_netcdf


# Top-level command group for the `tiledb-cf` console script (wired up via the
# entry point in setup.cfg). NOTE: click uses a command's docstring as its CLI
# help text, so intent is documented in comments here to keep output unchanged.
@click.group()
def cli():
    pass


@cli.command("netcdf-convert")
@click.option(
    "-i",
    "--input-file",
    required=True,
    type=str,
    help="The path or URI to the NetCDF file that will be converted.",
)
@click.option(
    "-o",
    "--output-uri",
    required=True,
    type=str,
    help="The URI for the output TileDB group.",
)
@click.option(
    "--input-group-path",
    type=str,
    default="/",
    show_default=True,
    help="The path in the input NetCDF for the root group that will be converted.",
)
@click.option(
    "--recursive/--no-recursive",
    default=True,
    show_default=True,
    help="Recursively convert all groups contained in the input group path.",
)
@click.option(
    "--collect-attrs/--array-per-attr",
    default=True,
    show_default=True,
    help="Collect variables with the same dimensions into a single array.",
)
@click.option(
    "-k",
    "--output-key",
    type=str,
    default=None,
    show_default=True,
    help="Key for the generated TileDB arrays.",
)
@click.option(
    "--unlimited-dim-size",
    type=int,
    default=10000,
    show_default=True,
    help="Size to convert unlimited dimensions to.",
)
@click.option(
    "--dim-dtype",
    type=click.Choice(
        [
            "int8",
            "int16",
            "int32",
            "int64",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
        ]
    ),
    default="uint64",
    show_default=True,
    help="The data type for TileDB dimensions created from converted NetCDF.",
)
def netcdf_convert(
    input_file: str,
    output_uri: str,
    input_group_path: str,
    recursive: bool,
    output_key: Optional[str],
    unlimited_dim_size: int,
    dim_dtype: str,
    collect_attrs: bool,
):
    """Converts a NetCDF input file to nested TileDB groups."""
    # Options not exposed on the command line (context, per-variable/dimension
    # tiles, coordinate-to-dimension conversion) are pinned to fixed values
    # here; dim_dtype is converted from its CLI string form to a numpy dtype.
    from_netcdf(
        input_file=input_file,
        output_uri=output_uri,
        input_group_path=input_group_path,
        recursive=recursive,
        output_key=output_key,
        output_ctx=None,
        unlimited_dim_size=unlimited_dim_size,
        dim_dtype=np.dtype(dim_dtype),
        tiles_by_var=None,
        tiles_by_dims=None,
        coords_to_dims=False,
        collect_attrs=collect_attrs,
    )
class TestOpenDatasetTimestep:
    """Tests opening a TileDB array in xarray at specific timestamps."""

    @pytest.fixture(scope="class")
    def tiledb_uri(self, tmpdir_factory):
        """Creates a TileDB array and returns the URI.

        Writes four fragments at timestamps 1-4, each updating the attribute
        data plus an array-level and a variable-level metadata item, so tests
        can check which fragments a timestamped open includes.
        """
        uri = str(tmpdir_factory.mktemp("output").join("empty_array"))
        tiledb.Array.create(
            uri,
            tiledb.ArraySchema(
                domain=tiledb.Domain(
                    tiledb.Dim("x", domain=(0, 3), dtype=np.uint64),
                ),
                attrs=[tiledb.Attr("z", dtype=np.float64)],
            ),
        )
        # t=1: all zeros; t=2: all ones; t=3 and t=4: single-cell updates.
        with tiledb.open(uri, mode="w", timestamp=1) as array:
            array[:] = np.zeros((4))
            array.meta["global"] = 0
            array.meta["__tiledb_attr.z.variable"] = 0
        with tiledb.open(uri, mode="w", timestamp=2) as array:
            array[:] = np.ones((4))
            array.meta["global"] = 1
            array.meta["__tiledb_attr.z.variable"] = 1
        with tiledb.open(uri, mode="w", timestamp=3) as array:
            array[1] = 2
            array.meta["global"] = 2
            array.meta["__tiledb_attr.z.variable"] = 2
        with tiledb.open(uri, mode="w", timestamp=4) as array:
            array[2] = 3
            array.meta["global"] = 3
            array.meta["__tiledb_attr.z.variable"] = 3
        return uri

    def test_variable_data_timestamp_int(self, tiledb_uri):
        # An integer timestamp reads everything written up to that time.
        result = xr.open_dataset(tiledb_uri, timestamp=2, engine="tiledb")
        expected = xr.Dataset({"z": xr.DataArray(np.ones((4)), dims=("x",))})
        xr.testing.assert_equal(result, expected)

    def test_variable_metadata_timestamp_int(self, tiledb_uri):
        result = xr.open_dataset(tiledb_uri, timestamp=2, engine="tiledb")
        assert result["z"].attrs["variable"] == 1

    def test_global_metadata_timestamp_int(self, tiledb_uri):
        result = xr.open_dataset(tiledb_uri, timestamp=2, engine="tiledb")
        assert result.attrs["global"] == 1

    def test_variable_data_timestamp_tuple(self, tiledb_uri):
        # A (start, end) tuple reads only the fragments in that range.
        result = xr.open_dataset(tiledb_uri, timestamp=(2, 3), engine="tiledb")
        expected = xr.Dataset({"z": xr.DataArray(np.array((1, 2, 1, 1)), dims=("x",))})
        xr.testing.assert_equal(result, expected)

    def test_variable_metadata_timestamp_tuple(self, tiledb_uri):
        result = xr.open_dataset(tiledb_uri, timestamp=(2, 3), engine="tiledb")
        assert result["z"].attrs["variable"] == 2

    def test_global_metadata_timestamp_tuple(self, tiledb_uri):
        result = xr.open_dataset(tiledb_uri, timestamp=(2, 3), engine="tiledb")
        assert result.attrs["global"] == 2
class TestDimMetadata:
    """Tests for reading and writing dimension-scoped metadata via DimMetadata."""

    @pytest.fixture(scope="class")
    def array_uri(self, tmpdir_factory):
        """Create a one-dimension, one-attribute dense array and return its URI."""
        uri = str(tmpdir_factory.mktemp("test_array"))
        dim = tiledb.Dim(name="dim", domain=(0, 0), tile=1, dtype=np.int32)
        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(dim),
            attrs=[tiledb.Attr(name="attr", dtype=np.int32)],
        )
        tiledb.Array.create(uri, schema)
        with tiledb.DenseArray(uri, mode="w") as arr:
            # Array-level item that must stay invisible to the dim-scoped view.
            arr.meta["array_key"] = "array_value"
        return uri

    def test_modify_metadata(self, array_uri):
        """Set, delete, and read back dimension metadata across timestamps."""
        with tiledb.DenseArray(array_uri, mode="r") as arr:
            assert len(DimMetadata(arr.meta, "dim")) == 0
        with tiledb.DenseArray(array_uri, mode="w", timestamp=1) as arr:
            dim_meta = DimMetadata(arr.meta, "dim")
            dim_meta["key0"] = "dim_value"
            dim_meta["key1"] = 10
            dim_meta["key2"] = 0.1
        with tiledb.DenseArray(array_uri, mode="w", timestamp=2) as arr:
            del DimMetadata(arr.meta, "dim")["key2"]
        with tiledb.DenseArray(array_uri, mode="r") as arr:
            dim_meta = DimMetadata(arr.meta, "dim")
            assert sorted(dim_meta.keys()) == ["key0", "key1"]
            assert "key0" in dim_meta
            assert dim_meta["key0"] == "dim_value"

    def test_open_from_index(self, array_uri):
        """A dimension may also be referenced by its integer index."""
        with tiledb.DenseArray(array_uri, mode="r") as arr:
            DimMetadata(arr.meta, 0)

    def test_attr_not_in_array_exception(self, array_uri):
        """An unknown dimension name raises KeyError."""
        with pytest.raises(KeyError):
            with tiledb.DenseArray(array_uri, mode="w") as arr:
                _ = DimMetadata(arr.meta, "x")

    def test_contains_not_string_exception(self, array_uri):
        """Membership tests require a string key."""
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="r") as arr:
                _ = 1 in DimMetadata(arr.meta, "dim")

    def test_delitem_not_string_exception(self, array_uri):
        """Deletion requires a string key."""
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="w") as arr:
                del DimMetadata(arr.meta, "dim")[1]

    def test_getitem_not_string_exception(self, array_uri):
        """Item access requires a string key."""
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="r") as arr:
                _ = DimMetadata(arr.meta, "dim")[1]

    def test_setitem_not_string_exception(self, array_uri):
        """Item assignment requires a string key."""
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="w") as arr:
                DimMetadata(arr.meta, "dim")[1] = "value"
class TestAttrMetadata:
    """Tests for reading and writing attribute-scoped metadata via AttrMetadata."""

    @pytest.fixture(scope="class")
    def array_uri(self, tmpdir_factory):
        # One-dimensional dense array with a single attribute. An array-level
        # metadata item is written so tests can confirm the attribute-scoped
        # view does not expose it.
        array_uri = str(tmpdir_factory.mktemp("test_array"))
        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(
                tiledb.Dim(name="dim", domain=(0, 0), tile=1, dtype=np.int32)
            ),
            attrs=[
                tiledb.Attr(name="attr", dtype=np.int32),
            ],
        )
        tiledb.Array.create(array_uri, schema)
        with tiledb.DenseArray(array_uri, mode="w") as array:
            array.meta["array_key"] = "array_value"
        return array_uri

    def test_modify_metadata(self, array_uri):
        # The array-level item written by the fixture is not visible here.
        with tiledb.DenseArray(array_uri, mode="r") as array:
            meta = AttrMetadata(array.meta, "attr")
            assert len(meta) == 0
        # Write three items at timestamp 1, delete one at timestamp 2, then
        # verify the surviving keys and values on a fresh read.
        with tiledb.DenseArray(array_uri, mode="w", timestamp=1) as array:
            meta = AttrMetadata(array.meta, "attr")
            meta["key0"] = "attribute_value"
            meta["key1"] = 10
            meta["key2"] = 0.1
        with tiledb.DenseArray(array_uri, mode="w", timestamp=2) as array:
            meta = AttrMetadata(array.meta, "attr")
            del meta["key2"]
        with tiledb.DenseArray(array_uri, mode="r") as array:
            meta = AttrMetadata(array.meta, "attr")
            assert set(meta.keys()) == set(["key0", "key1"])
            assert "key0" in meta
            assert meta["key0"] == "attribute_value"

    def test_open_from_index(self, array_uri):
        # An attribute may also be referenced by its integer index.
        with tiledb.DenseArray(array_uri, mode="r") as array:
            AttrMetadata(array.meta, 0)

    def test_attr_not_in_array_exception(self, array_uri):
        # An unknown attribute name raises KeyError.
        with pytest.raises(KeyError):
            with tiledb.DenseArray(array_uri, mode="w") as array:
                _ = AttrMetadata(array.meta, "x")

    def test_contains_not_string_exception(self, array_uri):
        # Metadata keys must be strings; membership test with an int raises.
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="r") as array:
                meta = AttrMetadata(array.meta, "attr")
                _ = 1 in meta

    def test_delitem_not_string_exception(self, array_uri):
        # Deleting with a non-string key raises.
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="w") as array:
                meta = AttrMetadata(array.meta, "attr")
                del meta[1]

    def test_getitem_not_string_exception(self, array_uri):
        # Reading with a non-string key raises.
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="r") as array:
                meta = AttrMetadata(array.meta, "attr")
                _ = meta[1]

    def test_setitem_not_string_exception(self, array_uri):
        # Assigning with a non-string key raises.
        with pytest.raises(TypeError):
            with tiledb.DenseArray(array_uri, mode="w") as array:
                meta = AttrMetadata(array.meta, "attr")
                meta[1] = "value"
#!/bin/sh

# Git pre-commit hook: run isort, black, flake8, and mypy over the staged
# Python files, then check the staged diff for whitespace errors.

# run_test <command> <name> <fix_msg>
# Run <command>; on failure, prompt whether to continue the commit. If the
# user declines, print <fix_msg> and abort with the command's exit status.
run_test() {
    name=$2
    fix_msg=$3
    echo "* Running ${name}.. "
    echo "..................."
    $1
    status=$?
    echo "..................."
    if [ $status -ne 0 ]; then
        read -r -p "..failed. Would you like continue with commit? [y/N] " response
        case "$response" in
            [yY][eE][sS]|[yY])
                echo "Continuing with tests .."
                ;;
            *)
                # Quoted to avoid word splitting of multi-word messages.
                echo "$fix_msg"
                exit $status
        esac
    else
        echo "..passed"
    fi
}

# Staged (added/modified) Python files only; deleted files are excluded.
python_files=$(git diff --cached --name-only --diff-filter=AM | grep '\.py$')

if [ ! -z "${python_files}" ]; then
    # run isort
    run_test "isort --check --diff ${python_files}" \
        "isort" \
        "Try running 'isort .' and add changes to git."
    # run black
    run_test "black --check ${python_files}" \
        "black" \
        "Try running 'black .' and add changes to git."
    # run flake8
    run_test "flake8 ${python_files}" "flake8" ""
    # run mypy
    run_test "mypy ${python_files}" "mypy" ""
fi

# Check for whitespace errors against HEAD, or against the empty tree for the
# initial commit.
if git rev-parse --verify HEAD >/dev/null 2>&1
then
    against=HEAD
else
    # Initial commit: diff against an empty tree object
    against=$(git hash-object -t tree /dev/null)
fi

# exec replaces the shell, so nothing after this line ever runs. A previous
# version of this hook accidentally contained a complete duplicate copy of
# itself after this point; that unreachable dead code has been removed.
exec git diff-index --check --cached $against --
To install tiledb-cf with additional dependencies use:

```bash
pip install tiledb-cf[<extras>]
```
58 | 59 | * **Array**: A set of attributes and dimensions that can be queried together: 60 | 61 | * **Dimensions**: The dimensions along with their domains orient a multi-dimensional space of cells. A dimension is defined by its name, domain, and data type along with additional data that specifies data storage and compression. The dimension values is called the cell coordinates. There can be any number of dimensions in an array. 62 | 63 | * **Attributes**: In each cell in the logical layout, TileDB stores a tuple comprised of any number of attributes, each of any data type (fixed- or variable-sized). 64 | 65 | * **Metadata**: This is (typically small) key-value data associated with an array or a group. 66 | 67 | * **Dimension labels** (experimental): Dimension labels store either increasing of decreasing data in a one-dimensional TileDB array that can be used to indirectly query other dimensions. 68 | -------------------------------------------------------------------------------- /tests/core/test_array_metadata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import tiledb 5 | from tiledb.cf import ArrayMetadata 6 | 7 | 8 | class TestArrayMetadata: 9 | @pytest.fixture(scope="class") 10 | def array_uri(self, tmpdir_factory): 11 | array_uri = str(tmpdir_factory.mktemp("test_array")) 12 | schema = tiledb.ArraySchema( 13 | domain=tiledb.Domain( 14 | tiledb.Dim(name="dim", domain=(0, 0), tile=1, dtype=np.int32) 15 | ), 16 | attrs=[ 17 | tiledb.Attr(name="attr", dtype=np.int32), 18 | ], 19 | ) 20 | tiledb.Array.create(array_uri, schema) 21 | with tiledb.DenseArray(array_uri, mode="w") as array: 22 | array.meta["__tiledb_attr.attr"] = "attribute value" 23 | array.meta["__tiledb_dim.dim"] = "dimension value" 24 | return array_uri 25 | 26 | def test_modify_metadata(self, array_uri): 27 | with tiledb.DenseArray(array_uri, mode="r") as array: 28 | meta = ArrayMetadata(array.meta) 29 | assert 
len(meta) == 0 30 | assert "__tiledb_attr.attr" not in meta 31 | assert "__tiledb_dim.dim" not in meta 32 | with tiledb.DenseArray(array_uri, mode="w", timestamp=1) as array: 33 | meta = ArrayMetadata(array.meta) 34 | meta["key0"] = "array value" 35 | meta["key1"] = 10 36 | meta["key2"] = 0.1 37 | with tiledb.DenseArray(array_uri, mode="w", timestamp=2) as array: 38 | meta = ArrayMetadata(array.meta) 39 | del meta["key2"] 40 | with tiledb.DenseArray(array_uri, mode="r") as array: 41 | meta = ArrayMetadata(array.meta) 42 | assert set(meta.keys()) == set(["key0", "key1"]) 43 | assert "key0" in meta 44 | assert meta["key0"] == "array value" 45 | 46 | def test_delitem_attr_key_exception(self, array_uri): 47 | with pytest.raises(KeyError): 48 | with tiledb.DenseArray(array_uri, mode="w") as array: 49 | meta = ArrayMetadata(array.meta) 50 | del meta["__tiledb_attr.attr"] 51 | 52 | def test_delitem_dim_key_exeception(self, array_uri): 53 | with pytest.raises(KeyError): 54 | with tiledb.DenseArray(array_uri, mode="w") as array: 55 | meta = ArrayMetadata(array.meta) 56 | del meta["__tiledb_dim.dim"] 57 | 58 | def test_getitem_attr_key_exception(self, array_uri): 59 | with pytest.raises(KeyError): 60 | with tiledb.DenseArray(array_uri, mode="r") as array: 61 | meta = ArrayMetadata(array.meta) 62 | _ = meta["__tiledb_attr.attr"] 63 | 64 | def test_getitem_dim_key_exception(self, array_uri): 65 | with pytest.raises(KeyError): 66 | with tiledb.DenseArray(array_uri, mode="r") as array: 67 | meta = ArrayMetadata(array.meta) 68 | _ = meta["__tiledb_dim.dim"] 69 | 70 | def test_setitem_attr_key_exception(self, array_uri): 71 | with pytest.raises(KeyError): 72 | with tiledb.DenseArray(array_uri, mode="w") as array: 73 | meta = ArrayMetadata(array.meta) 74 | meta["__tiledb_attr.a"] = "value" 75 | 76 | def test_setitem_dim_key_exception(self, array_uri): 77 | with pytest.raises(KeyError): 78 | with tiledb.DenseArray(array_uri, mode="w") as array: 79 | meta = ArrayMetadata(array.meta) 
80 | meta["__tiledb_dim.a"] = "value" 81 | -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | output-dir: docs 4 | render: 5 | - "documentation/index.md" 6 | - "documentation/" 7 | - "examples/" 8 | 9 | format: 10 | html: 11 | toc: true 12 | theme: 13 | light: [flatly, "quarto-materials/tiledb.scss"] 14 | mainfont: Helvetica 15 | fontsize: 1rem 16 | linkcolor: "#4d9fff" 17 | code-copy: true 18 | code-overflow: scroll 19 | css: "quarto-materials/tiledb.css" 20 | 21 | quartodoc: 22 | title: "API Reference" 23 | package: tiledb 24 | dir: "documentation/api" 25 | sections: 26 | - title: "Core" 27 | desc: "" 28 | contents: 29 | - cf.create_group 30 | - cf.open_group_array 31 | - cf.ArrayMetadata 32 | - cf.AttrMetadata 33 | - cf.DimMetadata 34 | - cf.DataspaceCreator 35 | 36 | - title: "NetCDF Support" 37 | desc: "" 38 | contents: 39 | - cf.from_netcdf 40 | - cf.NetCDF4ConverterEngine 41 | - cf.netcdf_engine.NetCDF4CoordToDimConverter 42 | - cf.netcdf_engine.NetCDF4DimToDimConverter 43 | - cf.netcdf_engine.NetCDF4ScalarToDimConverter 44 | - cf.netcdf_engine.NetCDF4ArrayConverter 45 | - cf.netcdf_engine.NetCDF4DomainConverter 46 | - cf.netcdf_engine.NetCDF4ToDimConverter 47 | - cf.netcdf_engine.NetCDF4VarToAttrConverter 48 | 49 | - title: "Xarray Support" 50 | desc: "" 51 | contents: 52 | - cf.from_xarray 53 | - cf.create_group_from_xarray 54 | - cf.copy_data_from_xarray 55 | - cf.copy_metadata_from_xarray 56 | 57 | website: 58 | favicon: "images/favicon.ico" 59 | site-url: https://tiledb-inc.github.io/tiledb-quarto-template/ 60 | repo-url: https://github.com/TileDB-Inc/tiledb-quarto-template 61 | 62 | repo-actions: [issue] 63 | page-navigation: true 64 | navbar: 65 | background: light 66 | logo: "quarto-materials/tiledb-logo.png" 67 | collapse-below: lg 68 | left: 69 | - text: "Home page" 70 | href: 
"https://tiledb.com" 71 | - text: "Login" 72 | href: "https://cloud.tiledb.com/auth/login" 73 | - text: "Contact us" 74 | href: "https://tiledb.com/contact" 75 | - text: "Repo" 76 | href: "https://github.com/TileDB-Inc/tiledb-cf-py" 77 | 78 | sidebar: 79 | - style: "floating" 80 | collapse-level: 2 81 | align: left 82 | contents: 83 | 84 | - section: "Overview" 85 | contents: 86 | - href: "documentation/index.md" 87 | 88 | - section: "TileDB-CF Core" 89 | contents: 90 | - href: "documentation/core.md" 91 | - href: "documentation/tiledb-cf-spec.md" 92 | - section: "Examples" 93 | contents: 94 | - href: "examples/core/group-basics.ipynb" 95 | 96 | - section: "NetCDF Engine" 97 | contents: 98 | - href: "documentation/netcdf-engine.md" 99 | - section: "Examples" 100 | contents: 101 | - href: "examples/netcdf_engine/netcdf-to-tiledb-basics.ipynb" 102 | - href: "examples/netcdf_engine/netcdf-to-tiledb-set-max-fragment-size.ipynb" 103 | - href: "examples/netcdf_engine/netcdf-to-tiledb-set-tiles.ipynb" 104 | 105 | - section: "Xarray Engine" 106 | contents: 107 | - href: "documentation/xarray-engine.md" 108 | - section: "Examples" 109 | contents: 110 | - href: "examples/xarray_engine/tiledb-xarray-basics.ipynb" 111 | - href: "examples/xarray_engine/tiledb-xarray-partially-filled-arrays.ipynb" 112 | 113 | - section: "API Reference" 114 | contents: 115 | - href: "documentation/api/index.qmd" 116 | 117 | - section: "Contributing" 118 | contents: 119 | - href: "documentation/contributing.md" 120 | - href: "documentation/code-of-conduct.md" 121 | -------------------------------------------------------------------------------- /tiledb/cf/core/api.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Mapping 4 | from typing import Dict, Optional, Union 5 | 6 | import tiledb 7 | 8 | from .._utils import check_valid_group, get_array_key, get_array_uri 9 | 10 | 11 | def 
def create_group(
    uri: str,
    group_schema: Mapping[str, tiledb.ArraySchema],
    *,
    key: Optional[Union[Dict[str, str], str]] = None,
    ctx: Optional[tiledb.Ctx] = None,
    config: Optional[tiledb.Config] = None,
    append: bool = False,
):
    """Creates a TileDB group with arrays at relative locations inside the group.

    All arrays in the group will be added at a relative URI that matches the
    array name.

    Parameters
    ----------
    uri
        Uniform resource identifier for TileDB group or array.
    group_schema
        A mapping from array names to array schemas to add to the group.
    key
        An encryption key or dict from array names to encryption keys.
    ctx
        If not ``None``, TileDB context wrapper for a TileDB storage manager.
    config
        Unused by this function; accepted for backward compatibility.
    append
        If ``True``, add arrays from the provided group schema to an already
        existing group. The names for the arrays in the group schema cannot
        already exist in the group being appended to.

    Raises
    ------
    ValueError
        If ``append=True`` and an array name in ``group_schema`` already
        exists in the group.
    """
    if append:
        # Appending requires an existing, valid group and no name collisions.
        check_valid_group(uri, ctx=ctx)
        with tiledb.Group(uri, ctx=ctx) as group:
            for array_name in group_schema:
                if array_name in group:
                    raise ValueError(
                        f"Cannot append to group. Array `{array_name}` already exists."
                    )
    else:
        tiledb.group_create(uri, ctx)
    with tiledb.Group(uri, mode="w", ctx=ctx) as group:
        for array_name, array_schema in group_schema.items():
            tiledb.Array.create(
                uri=get_array_uri(uri, array_name),
                schema=array_schema,
                key=get_array_key(key, array_name),
                ctx=ctx,
            )
            # Register the newly created array as a named, relative member.
            group.add(uri=array_name, name=array_name, relative=True)


def open_group_array(
    group: tiledb.Group,
    *,
    array: Optional[str] = None,
    attr: Optional[str] = None,
    **kwargs,
) -> tiledb.Array:
    """Opens an array in a group either by specifying the name of the array or
    the name of an attribute in the array.

    If only providing the attribute, there must be exactly one array in the
    group with an attribute with the requested name.

    Parameters
    ----------
    group
        The tiledb group to open the array in.
    array
        If not ``None``, the name of the array to open. Overrides attr if both
        are provided.
    attr
        If not ``None``, open the array that contains this attr. Attr must be
        in only one of the group arrays.
    **kwargs: dict, optional
        Keyword arguments to pass to the ``tiledb.open`` method.

    Returns
    -------
    tiledb.Array:
        An array opened in the specified mode

    Raises
    ------
    KeyError
        If ``attr`` is given and no array in the group contains it.
    ValueError
        If ``attr`` is ambiguous (found in multiple arrays) or if neither
        ``array`` nor ``attr`` is provided.
    """
    # Get the item in the group that either has the requested array name or
    # requested attribute.
    if array is not None:
        item = group[array]
    elif attr is not None:
        # Collect every array member whose schema contains the attribute.
        arrays = tuple(
            item
            for item in group
            if item.type == tiledb.libtiledb.Array
            and tiledb.ArraySchema.load(item.uri).has_attr(attr)
        )
        if not arrays:
            raise KeyError(f"No attribute with name '{attr}' found.")
        if len(arrays) > 1:
            # BUG FIX: the original listed every member of the group here
            # (`for item in group`); list only the arrays that actually
            # contain the requested attribute.
            raise ValueError(
                f"The array must be specified when opening an attribute that "
                f"exists in multiple arrays in a group. Arrays with attribute "
                f"'{attr}' include: {list(item.name for item in arrays)}."
            )
        item = arrays[0]
    else:
        raise ValueError(
            "Cannot open array. Either an array or attribute must be specified."
        )
    return tiledb.open(item.uri, attr=attr, **kwargs)
42 | """ 43 | 44 | def __init__( 45 | self, 46 | base: SharedDim, 47 | *, 48 | tile: Optional[Union[int, float]] = None, 49 | filters: Optional[tiledb.FilterList] = None, 50 | registry: Optional[DimRegistry] = None, 51 | ): 52 | self._base = base 53 | self.tile = tile 54 | self.filters = filters 55 | self._registry = registry 56 | 57 | def __repr__(self): 58 | filters_str = "" 59 | if self.filters: 60 | filters_str = ", filters=FilterList([" 61 | for dim_filter in self.filters: 62 | filters_str += repr(dim_filter) + ", " 63 | filters_str += "])" 64 | return f"DimCreator({repr(self._base)}, tile={self.tile}{filters_str})" 65 | 66 | @property 67 | def base(self) -> SharedDim: 68 | """Shared definition for the dimensions name, domain, and dtype.""" 69 | return self._base 70 | 71 | @property 72 | def dtype(self) -> np.dtype: 73 | """The numpy dtype of the values and domain of the dimension.""" 74 | return self._base.dtype 75 | 76 | @property 77 | def domain(self) -> Optional[Tuple[Optional[DType], Optional[DType]]]: 78 | """The (inclusive) interval on which the dimension is valid.""" 79 | return self._base.domain 80 | 81 | def html_summary(self) -> str: 82 | """Returns a string HTML summary of the :class:`DimCreator`.""" 83 | filters_str = "" 84 | if self.filters: 85 | filters_str = ", filters=FilterList([" 86 | for dim_filter in self.filters: 87 | filters_str += repr(dim_filter) + ", " 88 | filters_str += "])" 89 | return ( 90 | f"{self._base.html_input_summary()} → tiledb.Dim(" 91 | f"{self._base.html_output_summary()}, tile={self.tile}{filters_str})" 92 | ) 93 | 94 | @property 95 | def name(self) -> str: 96 | """Name of the dimension.""" 97 | return self._base.name 98 | 99 | def set_writer_data( 100 | self, 101 | dim_data: Union[np.ndarray, FieldData], 102 | *, 103 | writer_index: Optional[int] = None, 104 | ): 105 | """Set dimension data on a fragment writer 106 | 107 | Parameters 108 | ---------- 109 | dim_data 110 | The dimension data to set. 
111 | writer_index 112 | The index of the fragment writer to set the data on. 113 | """ 114 | if self._registry is None: 115 | raise ValueError("Dimension creator is not registered to an array.") 116 | data = create_field_data(dim_data, self.dtype) 117 | self._registry.set_writer_data(writer_index, self.name, data) 118 | 119 | def to_tiledb(self, ctx: Optional[tiledb.Ctx] = None) -> tiledb.Dim: 120 | """Returns a `tiledb.Dim` using the creator properties. 121 | 122 | Parameters 123 | ---------- 124 | ctx 125 | If not ``None``, TileDB context wrapper for a TileDB storage manager. 126 | 127 | Returns 128 | ------- 129 | tiledb.Dim 130 | A tiledb dimension with the set properties. 131 | """ 132 | return tiledb.Dim( 133 | name=self.name, 134 | domain=self.domain, 135 | tile=self.tile, 136 | filters=self.filters, 137 | dtype=self.dtype, 138 | ctx=ctx, 139 | ) 140 | -------------------------------------------------------------------------------- /documentation/netcdf-engine.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: TileDB-CF NetCDF Engine 3 | --- 4 | 5 | ## NetCDF Data Model 6 | The NetCDF data model is a common choice for multi-dimensional data, especially in the climate and weather space. NetCDF and TileDB use over lapping terminology to refer to concepts in their respective data model. 7 | 8 | A complete description of the NetCDF data model can be found at the [UCAR website](https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_model.html). 9 | 10 | A NetCDF file consists of **groups**, **dimensions**, **variables**, and **attributes**. Each NetCDF file has at least one root group that contains all other objects. Additional subgroups can be added to heirarchically organize the data. 11 | 12 | * **Dimensions**: A dimension is a name-size pair that describes an axis of a multi-dimension array. The size of the dimension may be "unlimited" (allowed to grow). 
The NetCDF dimension is roughly ananlogous to a TileDB dimension in a dense TileDB array. 13 | 14 | * **Variables**: A variable is a multi-dimensional array with a NetCDF dimension associated to each axis of the array. The size of the dimensions must match the shape of the multi-dimensional array. A NetCDF variable is roughly equivalent to either a TileDB attribute in a sparse or dense TileDB array or a TileDB dimension in a sparse TileDB array. 15 | 16 | * **Attribute**: An attribute is a key-value pair that is associated with either a group or variable. Attributes are used to store (typically small) metadata. NetCDF attributes are roughly equivalent to TileDB metadata. 17 | 18 | * **Group**: A NetCDF group is a collection of dimensions, variables, and attributes. A simple NetCDF group might map to a TileDB array. A more complex group would need to be mapped to a TileDB group. 19 | 20 | 21 | ## NetCDF-to-TileDB Compatibility 22 | 23 | The TileDB-CF package provides an interface for generating TileDB groups from NetCDF datasets using the TileDB-CF Dataspace convention. The CF Dataspace model supports the classic NetCDF-4 data model by mapping: 24 | 25 | * NetCDF groups to TileDB groups; 26 | * NetCDF dimensions to TileDB dimensions; 27 | * NetCDF variables to TileDB attributes or TileDB dimensions; 28 | * NetCDF attributes to TileDB group or array metadata. 29 | 30 | Some features and use cases do not directly transfer or may need to be modified before use in TileDB. 31 | 32 | * **Coordinates**: In NetCDF, it is a common convention to name a one-dimensional variable with the same name as its dimension to signify it as a "coordinate" or independent variable other variables are defined on. In TileDB, a variable and dimension in the same array cannot have the same name. This can be handled by renaming either the dimension or the variable when copying to TileDB. 
33 | 34 | * **Unlimited Dimensions**: TileDB can support unlimited dimensions by creating the domain on a dimension larger than the initial data. The domain must be set at creation time, and cannot be modified after array creation. 35 | 36 | * **Compound data types**: As of TileDB version 2.16, compound data types are not directly supported in TileDB. Compound data types can be broken into their constituent parts; however, this breaks storage locality. Variable, opaque, and string data types are supported. 37 | 38 | 39 | ## Programmatic Interface 40 | 41 | The `NetCDFConverterEngine` is a configurable tool for ingesting data from NetCDF into TileDB. The class can be manually constructed, or it can be auto-generated from a NetCDF file or group. 42 | 43 | ## Command-Line Interface 44 | 45 | TileDB-CF provides a command line interface to the NetCDF converter engine. It contains the following options: 46 | 47 | ```bash 48 | Usage: tiledb-cf netcdf-convert [OPTIONS] 49 | 50 | Converts a NetCDF input file to nested TileDB groups. 51 | 52 | Options: 53 | -i, --input-file TEXT The path or URI to the NetCDF file that will be converted. [required] 54 | 55 | -o, --output-uri TEXT The URI for the output TileDB group. [required] 56 | 57 | --input-group-path TEXT The path in the input NetCDF for the root group that will be converted. [default: /] 58 | 59 | --recursive / --no-recursive Recursively convert all groups contained in the input group path. [default: True] 60 | 61 | -k, --output-key TEXT Key for the generated TileDB arrays. 62 | 63 | --unlimited-dim-size INTEGER Size to convert unlimited dimensions to. [default: 10000] 64 | 65 | --dim-dtype [int8|int16|int32|int64|uint8|uint16|uint32|uint64] 66 | The data type for TileDB dimensions created from converted NetCDF. [default: uint64] 67 | 68 | --help Show this message and exit. 
"""Functions for converting NetCDF files to TileDB."""

from pathlib import Path
from typing import Dict, Optional, Sequence, Union

import numpy as np

import tiledb

# Default dtype used for TileDB dimensions generated from NetCDF dimensions.
_DEFAULT_INDEX_DTYPE = np.dtype("uint64")


def from_netcdf(
    input_file: Union[str, Path],
    output_uri: str,
    input_group_path: str = "/",
    recursive: bool = True,
    output_key: Optional[str] = None,
    output_ctx: Optional[tiledb.Ctx] = None,
    unlimited_dim_size: int = 10000,
    dim_dtype: np.dtype = _DEFAULT_INDEX_DTYPE,
    tiles_by_var: Optional[Dict[str, Dict[str, Optional[Sequence[int]]]]] = None,
    tiles_by_dims: Optional[
        Dict[str, Dict[Sequence[str], Optional[Sequence[int]]]]
    ] = None,
    coords_to_dims: bool = False,
    collect_attrs: bool = True,
    unpack_vars: bool = False,
    offsets_filters: Optional[tiledb.FilterList] = None,
    attrs_filters: Optional[tiledb.FilterList] = None,
    copy_metadata: bool = True,
):
    """Converts a NetCDF input file to nested TileDB CF dataspaces.

    See ``tiledb.cf.NetCDF4ConverterEngine`` for more information on the backend
    converter engine used for the conversion.

    Parameters
    ----------
    input_file
        The input NetCDF file to generate the converter engine from.
    output_uri
        The uniform resource identifier for the TileDB group to be created.
    input_group_path:
        The path to the NetCDF group to copy data from. Use ``'/'`` for the root group.
    recursive
        If ``True``, recursively convert groups in a NetCDF file. Otherwise, only
        convert group provided.
    output_key
        If not ``None``, encryption key to decrypt arrays.
    output_ctx
        If not ``None``, TileDB context wrapper for a TileDB storage manager.
    unlimited_dim_size:
        The size of the domain for TileDB dimensions created from unlimited NetCDF
        dimensions.
    dim_dtype
        The numpy dtype for TileDB dimensions.
    tiles_by_var
        A map from the name of a NetCDF variable to the tiles of the dimensions of the
        variable in the generated TileDB array.
    tiles_by_dims
        A map from the name of NetCDF dimensions defining a variable to the tiles of
        those dimensions in the generated TileDB array.
    coords_to_dims
        If ``True``, convert the NetCDF coordinate variable into a TileDB dimension for
        sparse arrays. Otherwise, convert the coordinate dimension into a TileDB
        dimension and the coordinate variable into a TileDB attribute.
    collect_attrs
        If ``True``, store all attributes with the same dimensions in the same array.
        Otherwise, store each attribute in a scalar array.
    unpack_vars
        Unpack NetCDF variables with NetCDF attributes ``scale_factor`` or
        ``add_offset`` using the transformation ``scale_factor * value + add_offset``.
    offsets_filters
        Default filters for all offsets for variable attributes and dimensions.
    attrs_filters
        Default filters for all attributes.
    copy_metadata
        If ``True`` copy NetCDF group and variable attributes to TileDB metadata. If
        ``False`` do not copy metadata.
    """
    # Imported locally to avoid requiring netCDF4 unless this entry point is used.
    from .converter import NetCDF4ConverterEngine, open_netcdf_group

    # Normalize the output URI so appending a NetCDF group path (which starts
    # with '/') does not produce a double slash.
    output_uri = output_uri if not output_uri.endswith("/") else output_uri[:-1]

    if tiles_by_var is None:
        tiles_by_var = {}
    if tiles_by_dims is None:
        tiles_by_dims = {}

    def recursive_convert(netcdf_group):
        # Convert one NetCDF group to a TileDB group, then recurse into
        # subgroups when requested.
        converter = NetCDF4ConverterEngine.from_group(
            netcdf_group,
            unlimited_dim_size,
            dim_dtype,
            tiles_by_var.get(netcdf_group.path),
            tiles_by_dims.get(netcdf_group.path),
            coords_to_dims=coords_to_dims,
            collect_attrs=collect_attrs,
            unpack_vars=unpack_vars,
            offsets_filters=offsets_filters,
            attrs_filters=attrs_filters,
        )
        group_uri = output_uri + netcdf_group.path
        converter.convert_to_group(
            group_uri,
            output_key,
            output_ctx,
            input_netcdf_group=netcdf_group,
            copy_metadata=copy_metadata,
        )
        if recursive:
            for subgroup in netcdf_group.groups.values():
                recursive_convert(subgroup)

    with open_netcdf_group(
        input_file=input_file,
        group_path=input_group_path,
    ) as dataset:
        recursive_convert(dataset)
line-height: 1.74 !important; 38 | border-radius: 0 !important; 39 | } 40 | 41 | li { 42 | line-height: 2.06 !important; 43 | } 44 | 45 | .navbar-dark { 46 | background-image: url('./quarto-materials/Background-tdb-header.jpg'); 47 | background-position: center; 48 | background-size: cover; 49 | } 50 | 51 | .navbar-nav .nav-link { 52 | border-radius: 8px !important; 53 | } 54 | 55 | .nav-link { 56 | padding-left: 1rem !important; 57 | padding-right: 1rem !important; 58 | } 59 | 60 | .navbar-dark .navbar-nav .nav-link { 61 | color: white; 62 | font-weight: 500; 63 | &:hover { 64 | color: rgba(255, 255, 255, 1) !important; 65 | background-color: rgba(255, 255, 255, 0.1); 66 | } 67 | transition: all 0.3s ease; 68 | } 69 | 70 | .navbar-dark .navbar-nav .nav-link:focus { 71 | color: white !important; 72 | } 73 | 74 | .navbar #quarto-search.type-overlay .aa-Autocomplete svg.aa-SubmitIcon { 75 | color: white; 76 | } 77 | 78 | .sidebar.sidebar-navigation .active, 79 | .sidebar.sidebar-navigation .show > .nav-link { 80 | color: #2c4396 !important; 81 | } 82 | 83 | .sidebar-item { 84 | margin-bottom: 0.75em; 85 | } 86 | 87 | .sidebar-section { 88 | margin-top: 0.75em; 89 | padding-bottom: 0.75em; 90 | padding-left: 1.25em; 91 | border-left: 1px solid rgba(0, 0, 0, 0.1); 92 | } 93 | 94 | .sidebar-item-container a { 95 | color: rgba(0, 0, 0, 0.6) !important; 96 | &:hover { 97 | color: rgba(0, 0, 0, 0.8) !important; 98 | } 99 | &:visited { 100 | color: rgba(0, 0, 0, 0.4) !important; 101 | } 102 | transition: 0.3s ease color; 103 | } 104 | 105 | .sidebar.sidebar-navigation .active { 106 | font-weight: 600; 107 | color: rgba(0, 0, 0, 0.7) !important; 108 | background-color: transparent !important; 109 | padding-left: 0px !important; 110 | } 111 | 112 | .figure-img { 113 | border-radius: 6px; 114 | box-shadow: 0 6px 18px 0 rgba(0, 0, 0, 0.06); 115 | border: 1px solid rgba(0, 0, 0, 0.1); 116 | } 117 | 118 | .nav-page span { 119 | color: rgba(0, 0, 0, 0.8) !important; 120 | } 121 | 
122 | .nav-page i { 123 | transition: color 0.3s ease; 124 | } 125 | 126 | code, 127 | p code:not(.sourceCode), 128 | li code:not(.sourceCode), 129 | kbd, 130 | pre { 131 | font-size: 14px !important; 132 | padding: 0.3em 0.8em !important; 133 | border-radius: 6px !important; 134 | } 135 | 136 | .sidebar nav[role="doc-toc"] > ul li a { 137 | padding: 0.6rem 0.8rem; 138 | font-weight: 500; 139 | color: rgba(0, 0, 0, 0.6); 140 | &:hover { 141 | color: rgba(0, 0, 0, 0.8); 142 | } 143 | transition: 0.3s ease color; 144 | } 145 | 146 | .sidebar nav[role="doc-toc"] ul > li > ul > li > a.active { 147 | color: rgba(0, 0, 0, 0.8) !important; 148 | } 149 | 150 | .sidebar nav[role="doc-toc"] ul > li > a.active { 151 | color: rgba(0, 0, 0, 0.8) !important; 152 | } 153 | 154 | .sidebar.sidebar-navigation > * { 155 | padding-top: 1.6em !important; 156 | } 157 | 158 | .nav-tabs .nav-link:hover, 159 | .nav-tabs .nav-link:focus { 160 | color: rgba(0, 0, 0, 0.6); 161 | } 162 | 163 | .navbar-dark .navbar-toggler { 164 | color: white !important; 165 | border-color: white !important; 166 | background-color: rgba(255, 255, 255, 0.94); 167 | } 168 | 169 | // Search form 170 | 171 | .aa-DetachedContainer { 172 | background-color: transparent !important; 173 | } 174 | 175 | .aa-DetachedContainer--modal { 176 | border-radius: 12px !important; 177 | } 178 | 179 | .aa-DetachedFormContainer { 180 | border-bottom: none !important; 181 | background-color: rgba(255, 255, 255, 0.7); 182 | backdrop-filter: blur(5px); 183 | padding: 12px !important; 184 | } 185 | 186 | .aa-Autocomplete .aa-Form .aa-InputWrapper .aa-Input, 187 | .aa-DetachedFormContainer .aa-Form .aa-InputWrapper .aa-Input { 188 | height: calc(2.5em + (0.1rem + 2px)) !important; 189 | } 190 | 191 | .aa-Autocomplete .aa-Form, 192 | .aa-DetachedFormContainer .aa-Form { 193 | background-color: rgba(255, 255, 255, 0.9) !important; 194 | box-shadow: 0 2px 6px 0 rgba(0, 0, 0, 0.4); 195 | border: none !important; 196 | border-radius: 8px 
"""Tests for ``tiledb.cf.create_group`` and ``tiledb.cf.open_group_array``."""

import numpy as np
import pytest

import tiledb
from tiledb.cf import create_group, open_group_array

# Shared dimensions used by every example schema below: 4-element unsigned
# integer dimensions compressed with Zstd.
_row = tiledb.Dim(
    name="rows",
    domain=(1, 4),
    tile=4,
    dtype=np.uint64,
    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
)
_col = tiledb.Dim(
    name="cols",
    domain=(1, 4),
    tile=4,
    dtype=np.uint64,
    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
)


# Example attributes and schemas. Attribute "c" appears in both the sparse
# schema (_array_schema_2) and a dense schema (_array_schema_3) so tests can
# exercise the ambiguous-attribute lookup path.
_attr_a = tiledb.Attr(name="a", dtype=np.uint64)
_attr_b = tiledb.Attr(name="b", dtype=np.float64)
_attr_c = tiledb.Attr(name="c", dtype=np.dtype("U"))
_array_schema_1 = tiledb.ArraySchema(
    domain=tiledb.Domain(_row, _col),
    attrs=[_attr_a],
)
_array_schema_2 = tiledb.ArraySchema(
    domain=tiledb.Domain(_row),
    sparse=True,
    attrs=[_attr_b, _attr_c],
)
_array_schema_3 = tiledb.ArraySchema(
    domain=tiledb.Domain(_row, _col),
    attrs=[_attr_c],
)


class TestCreateGroup:
    """Tests for creating a TileDB group with ``create_group``."""

    _array_schemas = {"A1": _array_schema_1, "A2": _array_schema_2}
    _key = None  # no encryption key

    @pytest.fixture(scope="class")
    def group_uri(self, tmpdir_factory):
        """Creates a TileDB Group from a mapping of array schemas and returns its URI."""
        uri = str(tmpdir_factory.mktemp("group1"))
        ctx = None
        create_group(uri, self._array_schemas, key=self._key, ctx=ctx)
        return uri

    def test_array_schemas(self, group_uri):
        """The created object is a group and each member has the requested schema."""
        uri = group_uri
        assert tiledb.object_type(uri) == "group"
        for name, schema in self._array_schemas.items():
            with tiledb.Group(uri) as group:
                assert tiledb.ArraySchema.load(group[name].uri) == schema


class TestGroupWithArrays:
    """Tests for ``open_group_array`` on a manually constructed group."""

    # Dense data written to array "A1" in the fixture below.
    _A1_data = np.array(
        ([1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]), dtype=np.uint64
    )

    @pytest.fixture(scope="class")
    def group_uri(self, tmpdir_factory):
        """Creates a group containing arrays A1 (with data written), A2, and A3."""
        uri = str(tmpdir_factory.mktemp("simple_group"))
        tiledb.group_create(uri)
        tiledb.Array.create(uri + "/A1", _array_schema_1)
        with tiledb.DenseArray(uri + "/A1", mode="w") as array:
            array[:] = self._A1_data
        tiledb.Array.create(uri + "/A2", _array_schema_2)
        tiledb.Array.create(uri + "/A3", _array_schema_3)
        # Register the arrays as named group members using relative URIs.
        with tiledb.Group(uri, mode="w") as group:
            group.add(uri="A1", name="A1", relative=True)
            group.add(uri="A2", name="A2", relative=True)
            group.add(uri="A3", name="A3", relative=True)
        # An unrelated directory inside the group directory; not a member.
        filesystem = tiledb.VFS()
        filesystem.create_dir(uri + "/empty_dir")
        return uri

    def test_open_array_from_group(self, group_uri):
        """Opens a group member by array name for reading."""
        with tiledb.Group(group_uri) as group:
            with open_group_array(group, array="A1") as array:
                assert isinstance(array, tiledb.Array)
                assert array.mode == "r"
                np.testing.assert_equal(array[:, :]["a"], self._A1_data)

    def test_open_attr(self, group_uri):
        """Opens the unique array containing attribute "a"."""
        with tiledb.Group(group_uri) as group:
            with open_group_array(group, attr="a") as array:
                assert isinstance(array, tiledb.Array)
                assert array.mode == "r"
                np.testing.assert_equal(array[:, :], self._A1_data)

    def test_no_array_with_attr_exception(self, group_uri):
        """Raises ``KeyError`` when no array contains the requested attribute."""
        with tiledb.Group(group_uri) as group:
            with pytest.raises(KeyError):
                open_group_array(group, attr="bad_name")

    def test_ambiguous_array_exception(self, group_uri):
        """Raises ``ValueError`` when multiple arrays contain the attribute."""
        with tiledb.Group(group_uri) as group:
            with pytest.raises(ValueError):
                open_group_array(group, attr="c")

    def test_no_values_error(self, group_uri):
        """Raises ``ValueError`` when neither an array nor an attribute is given."""
        with tiledb.Group(group_uri) as group:
            with pytest.raises(ValueError):
                open_group_array(group)


def test_append_group(tmpdir):
    """Appending to an existing group adds the new arrays and keeps the old."""
    uri = str(tmpdir.mkdir("append_group_test"))
    create_group(uri, {"A1": _array_schema_1})
    create_group(uri, {"A2": _array_schema_2}, append=True)
    with tiledb.Group(uri) as group:
        assert group["A1"].type == tiledb.libtiledb.Array
        assert group["A2"].type == tiledb.libtiledb.Array
        a1_schema = tiledb.ArraySchema.load(group["A1"].uri)
        a2_schema = tiledb.ArraySchema.load(group["A2"].uri)
        assert a1_schema == _array_schema_1
        assert a2_schema == _array_schema_2


def test_append_group_array_exists_error(tmpdir):
    """Raises ``ValueError`` when appending an array name that already exists."""
    uri = str(tmpdir.mkdir("append_group_test"))
    create_group(uri, {"A1": _array_schema_1})
    with pytest.raises(ValueError):
        create_group(uri, {"A1": _array_schema_1}, append=True)
following python packages are installed: netCDF4, numpy, tiledb, and tiledb-cf" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import netCDF4\n", 36 | "import numpy as np\n", 37 | "import tiledb\n", 38 | "import tiledb.cf" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Set names for the output generated by the example.\n", 48 | "output_dir = \"output/netcdf-to-tiledb-set-max-fragment-size\"\n", 49 | "netcdf_file = f\"{output_dir}/simple1.nc\"\n", 50 | "array_uri = f\"{output_dir}/simple_copy_chunks\"" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Reset output folder\n", 60 | "import os\n", 61 | "import shutil\n", 62 | "\n", 63 | "shutil.rmtree(output_dir, ignore_errors=True)\n", 64 | "os.mkdir(output_dir)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "with netCDF4.Dataset(netcdf_file, mode=\"w\") as dataset:\n", 74 | " dataset.setncatts({\"title\": \"Simple dataset for examples\"})\n", 75 | " dataset.createDimension(\"x\", 8)\n", 76 | " dataset.createDimension(\"y\", 8)\n", 77 | " dataset.createDimension(\"z\", 8)\n", 78 | " f = dataset.createVariable(\"f\", np.int64, (\"x\", \"y\", \"z\"))\n", 79 | " f[:, :, :] = np.reshape(np.arange(512), (8, 8, 8))\n", 80 | "print(f\"Created example NetCDF file `{netcdf_file}`.\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# Create NetCDF4 converter and print output\n", 90 | "converter = tiledb.cf.NetCDF4ConverterEngine.from_file(netcdf_file)\n", 91 | "converter" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | 
"metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# Set max_fragment_shape for array\n", 101 | "converter.get_array_creator(\"array0\").domain_creator.max_fragment_shape = (4, 8, 2)\n", 102 | "converter" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Run conversion (using `convert_to_array` since there is only 1 array in the group)\n", 112 | "# Consolidate fragment metadata (recommended for copying multiple fragments)\n", 113 | "converter.convert_to_array(array_uri)\n", 114 | "tiledb.consolidate(\n", 115 | " array_uri, config=tiledb.Config({\"sm.consolidation.mode\": \"fragment_meta\"})\n", 116 | ")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# View fragments information to confirm multiple separate chunks were copied\n", 126 | "fragment_info = tiledb.FragmentInfoList(array_uri)\n", 127 | "print(f\"Number of fragments: {len(fragment_info)}\")\n", 128 | "for frag in fragment_info:\n", 129 | " print(\n", 130 | " f\"Fragment {frag.num}: nonempty_domain={frag.nonempty_domain}, has_consolidated_metadata={frag.has_consolidated_metadata}\"\n", 131 | " )" 132 | ] 133 | } 134 | ], 135 | "metadata": { 136 | "interpreter": { 137 | "hash": "022b808d35d9188bc114e3dbdd31978ae285e77fefec36d9e39c13a87da8d5e5" 138 | }, 139 | "kernelspec": { 140 | "display_name": "Python 3.9.0 64-bit ('cf-3.9.0': pyenv)", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.11.3" 154 | }, 155 | "orig_nbformat": 4 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | 
import os
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Sequence, Tuple

import numpy as np
import pytest


@dataclass(frozen=True)
class NetCDFSingleGroupExample:
    """Dataclass that holds values required to generate NetCDF test cases.

    Constructing an instance writes the NetCDF file to disk as a side effect
    (see ``__post_init__``).

    name: name of the test case; also used as the NetCDF file stem
    directory_path: directory the NetCDF file is written into
    dimension_args: sequence of arguments required to create NetCDF4 dimensions
    variable_kwargs: sequence of keyword arguments required to create NetCDF4
        variables
    variable_data: dict of variable data by variable name
    variable_metadata: dict of variable metadata key-value pairs by variable name
    group_metadata: group metadata key-value pairs
    """

    name: str
    directory_path: str
    dimension_args: Sequence[Tuple[str, Optional[int]]]
    variable_kwargs: Sequence[Dict[str, Any]]
    variable_data: Dict[str, np.ndarray]
    variable_metadata: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    group_metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # Writing the example file requires netCDF4; skip the dependent test
        # when the package is not installed.
        netCDF4 = pytest.importorskip("netCDF4")
        with netCDF4.Dataset(self.filepath, mode="w") as dataset:
            if self.group_metadata:
                dataset.setncatts(self.group_metadata)
            for dim_args in self.dimension_args:
                dataset.createDimension(*dim_args)
            for var_kwargs in self.variable_kwargs:
                variable = dataset.createVariable(**var_kwargs)
                variable[...] = self.variable_data[variable.name]
                if variable.name in self.variable_metadata:
                    variable.setncatts(self.variable_metadata[variable.name])

    @property
    def filepath(self) -> str:
        """Full path of the NetCDF file for this example.

        Joins with ``os.path.join`` on ``str(directory_path)`` so the property
        works whether ``directory_path`` is a plain string (as annotated) or
        the py.path.local object supplied by ``tmpdir_factory``. The original
        ``self.directory_path.join(...)`` only worked for the latter.
        """
        return os.path.join(str(self.directory_path), f"{self.name}.nc")


@pytest.fixture(scope="session")
def simple1_netcdf_file(tmpdir_factory):
    """NetCDF file with dimension ``row(8)`` and variable ``x1(row)``."""
    directory_path = tmpdir_factory.mktemp("sample_netcdf")
    example = NetCDFSingleGroupExample(
        "simple1",
        directory_path,
        dimension_args=[
            ("row", 8),
        ],
        variable_kwargs=[
            {"varname": "x1", "datatype": np.float64, "dimensions": ("row",)},
        ],
        variable_data={"x1": np.linspace(1.0, 4.0, 8)},
    )
    return example


@pytest.fixture(scope="session")
def simple2_netcdf_file(tmpdir_factory):
    """NetCDF file with two variables ``x1`` and ``x2 = x1**2`` on ``row(8)``."""
    directory_path = tmpdir_factory.mktemp("sample_netcdf")
    xdata = np.linspace(0.0, 1.0, 8)
    example = NetCDFSingleGroupExample(
        "simple2",
        directory_path,
        dimension_args=[("row", 8)],
        variable_kwargs=[
            {"varname": "x1", "datatype": np.float64, "dimensions": ("row",)},
            {"varname": "x2", "datatype": np.float64, "dimensions": ("row",)},
        ],
        variable_data={"x1": xdata, "x2": xdata**2},
        group_metadata={"name": "simple2"},
    )
    return example


@pytest.fixture(scope="session")
def group1_netcdf_file(tmpdir_factory):
    """Sample NetCDF file with groups

    root:
        dimensions: row(8)
        variables: x1(row) = np.linspace(-1.0, 1.0, 8)
        group1:
            variables: x2(row) = 2 * np.linspace(-1.0, 1.0, 8)
            group2:
                dimensions: col(4)
                variables: y1(col) = np.linspace(-1.0, 1.0, 4)
        group3:
            dimensions: row(4), col(4)
            variables:
                A1[:, :] = np.outer(y1, y1)
                A2[:, :] = np.zeros((4,4), dtype=np.float64)
                A3[:, :] = np.identity(4)
    """
    netCDF4 = pytest.importorskip("netCDF4")
    filepath = str(tmpdir_factory.mktemp("sample_netcdf").join("simple1.nc"))
    x = np.linspace(-1.0, 1.0, 8)
    y = np.linspace(-1.0, 1.0, 4)
    with netCDF4.Dataset(filepath, mode="w") as dataset:
        dataset.createDimension("row", 8)
        x1 = dataset.createVariable("x1", np.float64, ("row",))
        x1[:] = x
        group1 = dataset.createGroup("group1")
        x2 = group1.createVariable("x2", np.float64, ("row",))
        x2[:] = 2.0 * x
        group2 = group1.createGroup("group2")
        group2.createDimension("col", 4)
        y1 = group2.createVariable("y1", np.float64, ("col",))
        y1[:] = y
        group3 = dataset.createGroup("group3")
        group3.createDimension("row", 4)
        group3.createDimension("col", 4)
        A1 = group3.createVariable("A1", np.float64, ("row", "col"))
        A2 = group3.createVariable("A2", np.float64, ("row", "col"))
        A3 = group3.createVariable("A3", np.int32, ("row", "col"))
        A1[:, :] = np.outer(y, y)
        A2[:, :] = np.zeros((4, 4), dtype=np.float64)
        A3[:, :] = np.identity(4)
    return filepath


@pytest.fixture
def netcdf_test_case(tmpdir_factory, request):
    """Creates a NetCDF file from ``request.param`` and returns the example
    object describing it (the original docstring incorrectly described the
    return value as a tuple of filepath stem, filepath, and attributes)."""
    return NetCDFSingleGroupExample(
        **request.param,
        directory_path=tmpdir_factory.mktemp("sample_netcdf"),
    )
from __future__ import annotations

from typing import Optional, Sequence, Union

import numpy as np
from tiledb.datatypes import DataType
from typing_extensions import Protocol

import tiledb

from .._utils import DType
from ._fragment_writer import FragmentWriter
from .registry import RegisteredByNameMixin
from .source import FieldData, create_field_data


class AttrRegistry(Protocol):
    """Protocol for a registry that stores attribute creators by name."""

    def __delitem__(self, name: str):
        """Delete the element with the provided name."""

    def __getitem__(self, name: str) -> AttrCreator:
        """Get the element with the provided name."""

    def __setitem__(self, name: str, value: AttrCreator):
        """Set the element with the provided name to the provided value."""

    def set_writer_data(
        self, writer_index: Optional[int], attr_name: str, data: FieldData
    ):
        """Set the data on the requested fragment writer."""

    def rename(self, old_name: str, new_name: str):
        """Rename an element of the registry.

        If the rename fails, the registry should be left unchanged.
        """


class AttrCreator(RegisteredByNameMixin):
    """Creator for a TileDB attribute.

    Parameters
    ----------
    name
        Name of the attribute that will be created.
    dtype
        The datatype of the attribute that will be created.
    fill
        Optional fill value for the attribute that will be created.
    var
        Specifies if the attribute that will be created will be variable length
        (automatic for byte/strings).
    nullable
        Specifies if the attribute that will be created will be nullable using
        validity tiles.
    filters
        Filter pipeline to apply to the attribute.
    registry
        Registry for this attribute creator.
    fragment_writers
        Fragment writers for this attribute creator.

    Attributes
    ----------
    dtype: np.dtype
        Numpy dtype of the attribute.
    fill: int or float or str, optional
        Fill value for unset cells.
    var: bool
        Specifies if the attribute is variable length (automatic for
        byte/strings).
    nullable: bool
        Specifies if the attribute is nullable using validity tiles.
    filters: tiledb.FilterList, optional
        Specifies compression filters for the attribute.
    """

    def __init__(
        self,
        name: str,
        dtype: np.dtype,
        *,
        fill: Optional[DType] = None,
        var: bool = False,
        nullable: bool = False,
        filters: Optional[tiledb.FilterList] = None,
        registry: Optional[AttrRegistry] = None,
        fragment_writers: Optional[Sequence[FragmentWriter]] = None,
    ):
        # Normalize the input to a canonical numpy dtype via TileDB's type map.
        self.dtype = DataType.from_numpy(dtype).np_dtype
        self.fill = fill
        self.var = var
        self.nullable = nullable
        self.filters = filters
        self._fragment_writers = fragment_writers
        # Name registration is handled by RegisteredByNameMixin.
        super().__init__(name, registry)

    def __repr__(self):
        filters_str = f", filters=FilterList({self.filters})" if self.filters else ""
        return (
            f"AttrCreator(name={self.name}, dtype='{self.dtype!s}', var={self.var}, "
            f"nullable={self.nullable}{filters_str})"
        )

    def html_summary(self) -> str:
        """Returns a string HTML summary of the ``AttrCreator``."""
        filters_str = f", filters=FilterList({self.filters})" if self.filters else ""
        return (
            f" → tiledb.Attr(name={self.name}, dtype='{self.dtype!s}', "
            f"var={self.var}, nullable={self.nullable}{filters_str})"
        )

    def set_writer_data(
        self,
        attr_data: Union[np.ndarray, FieldData],
        *,
        writer_index: Optional[int] = None,
    ):
        """Set attribute data to the specified fragment writer.

        Parameters
        ----------
        attr_data
            Attribute data to add to the writer.
        writer_index
            The index of the fragment writer to add to.

        Raises
        ------
        ValueError
            If this attribute creator is not registered to an array.
        """
        if self._registry is None:
            raise ValueError("Attribute creator is not registered to an array.")
        # Coerce raw numpy input into FieldData using this attribute's dtype.
        data = create_field_data(attr_data, self.dtype)
        self._registry.set_writer_data(writer_index, self.name, data)

    def to_tiledb(self, ctx: Optional[tiledb.Ctx] = None) -> tiledb.Attr:
        """Returns a :class:`tiledb.Attr` using the current properties.

        Parameters
        ----------
        ctx
            If not ``None``, TileDB context wrapper for a TileDB storage manager.

        Returns
        -------
        tiledb.Attr
            An attribute with the properties defined in this attribute creator.
        """
        return tiledb.Attr(
            name=self.name,
            dtype=self.dtype,
            fill=self.fill,
            var=self.var,
            nullable=self.nullable,
            filters=self.filters,
            ctx=ctx,
        )


# --- file boundary: tests/xarray_engine/conftest.py ---

import numpy as np
import pytest

import tiledb


@pytest.fixture
def create_tiledb_group_example(tmpdir):
    """Builds a TileDB group of two arrays plus the matching expected Dataset."""
    xr = pytest.importorskip("xarray")

    # Define data
    data = {
        "pressure": np.linspace(
            -1.0, 1.0, num=32, endpoint=True, dtype=np.float64
        ).reshape(8, 4),
        "count": np.arange(0, 32, dtype=np.int32).reshape(8, 4),
    }

    # Create expected dataset
    expected = xr.Dataset(
        data_vars={
            "pressure": xr.DataArray(
                data=data["pressure"],
                dims=["time", "x"],
                attrs={"long_name": "example float data"},
            ),
"count": xr.DataArray( 28 | data=data["count"], 29 | dims=["time", "x"], 30 | attrs={"long_name": "example int data"}, 31 | ), 32 | }, 33 | attrs={"global_1": "value1", "global_2": "value2"}, 34 | ) 35 | 36 | # Create the TileDB group 37 | group_uri = str(tmpdir.join("tiledb_group_example_1")) 38 | count_uri = str(tmpdir.join("count_array")) 39 | pressure_uri = str(tmpdir.join("pressure_array")) 40 | count_schema = tiledb.ArraySchema( 41 | domain=tiledb.Domain( 42 | tiledb.Dim(name="time", domain=(0, 7), tile=4, dtype=np.int32), 43 | tiledb.Dim(name="x", domain=(0, 3), tile=4, dtype=np.int32), 44 | ), 45 | sparse=False, 46 | attrs=[tiledb.Attr(name="count", dtype=np.int32)], 47 | ) 48 | 49 | pressure_schema = tiledb.ArraySchema( 50 | domain=tiledb.Domain( 51 | tiledb.Dim(name="time", domain=(0, 7), tile=4, dtype=np.int32), 52 | tiledb.Dim(name="x", domain=(0, 3), tile=4, dtype=np.int32), 53 | ), 54 | sparse=False, 55 | attrs=[tiledb.Attr(name="pressure", dtype=np.float64)], 56 | ) 57 | 58 | # Create and write to arrays. 59 | tiledb.Array.create(count_uri, count_schema) 60 | with tiledb.open(count_uri, mode="w") as array: 61 | array[:, :] = data["count"] 62 | array.meta["__tiledb_attr.count.long_name"] = "example int data" 63 | tiledb.Array.create(pressure_uri, pressure_schema) 64 | with tiledb.open(pressure_uri, mode="w") as array: 65 | array[:, :] = data["pressure"] 66 | array.meta["__tiledb_attr.pressure.long_name"] = "example float data" 67 | 68 | # Create group and add arrays and metadata. 
69 | tiledb.Group.create(group_uri) 70 | with tiledb.Group(group_uri, mode="w") as group: 71 | group.add(pressure_uri) 72 | group.add(count_uri) 73 | group.meta["global_1"] = "value1" 74 | group.meta["global_2"] = "value2" 75 | return group_uri, expected 76 | 77 | 78 | @pytest.fixture 79 | def create_tiledb_example(tmpdir): 80 | xr = pytest.importorskip("xarray") 81 | # Define data 82 | float_data = np.linspace( 83 | -1.0, 1.0, num=32, endpoint=True, dtype=np.float64 84 | ).reshape(8, 4) 85 | int_data = np.arange(0, 32, dtype=np.int32).reshape(8, 4) 86 | # Create expected dataset 87 | expected = xr.Dataset( 88 | data_vars={ 89 | "pressure": xr.DataArray( 90 | data=float_data, 91 | dims=["time", "x"], 92 | attrs={"long_name": "example float data"}, 93 | ), 94 | "count": xr.DataArray( 95 | data=int_data, 96 | dims=["time", "x"], 97 | attrs={"long_name": "example int data"}, 98 | ), 99 | }, 100 | attrs={"global_1": "value1", "global_2": "value2"}, 101 | ) 102 | array_uri = str(tmpdir.join("tiledb_example_1")) 103 | schema = tiledb.ArraySchema( 104 | domain=tiledb.Domain( 105 | tiledb.Dim(name="time", domain=(0, 7), tile=4, dtype=np.int32), 106 | tiledb.Dim(name="x", domain=(0, 3), tile=4, dtype=np.int32), 107 | ), 108 | sparse=False, 109 | attrs=[ 110 | tiledb.Attr(name="count", dtype=np.int32), 111 | tiledb.Attr(name="pressure", dtype=np.float64), 112 | ], 113 | ) 114 | tiledb.Array.create(array_uri, schema) 115 | with tiledb.open(array_uri, mode="w") as array: 116 | array[:, :] = { 117 | "pressure": float_data, 118 | "count": int_data, 119 | } 120 | array.meta["global_1"] = "value1" 121 | array.meta["global_2"] = "value2" 122 | array.meta["__tiledb_attr.pressure.long_name"] = "example float data" 123 | array.meta["__tiledb_attr.count.long_name"] = "example int data" 124 | return array_uri, expected 125 | 126 | 127 | @pytest.fixture 128 | def create_tiledb_datetime_example(tmpdir): 129 | xr = pytest.importorskip("xarray") 130 | data = np.linspace(-1.0, 20.0, num=16, 
endpoint=True, dtype=np.float64) 131 | date = np.arange(np.datetime64("2000-01-01"), np.datetime64("2000-01-17")) 132 | # Create expected dataset 133 | expected = xr.Dataset( 134 | data_vars={"temperature": xr.DataArray(data=data, dims="date")}, 135 | coords={"date": date}, 136 | ) 137 | # Create TileDB array 138 | array_uri = str(tmpdir.join("tiledb_example_2")) 139 | schema = tiledb.ArraySchema( 140 | domain=tiledb.Domain( 141 | tiledb.Dim( 142 | name="date", 143 | domain=(np.datetime64("2000-01-01"), np.datetime64("2000-01-16")), 144 | tile=np.timedelta64(4, "D"), 145 | dtype=np.datetime64("", "D"), 146 | ), 147 | ), 148 | attrs=[tiledb.Attr(name="temperature", dtype=np.float64)], 149 | ) 150 | tiledb.DenseArray.create(array_uri, schema) 151 | with tiledb.DenseArray(array_uri, mode="w") as array: 152 | array[:] = {"temperature": data} 153 | return array_uri, expected 154 | -------------------------------------------------------------------------------- /tests/netcdf_engine/test_netcdf4_converter_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import tiledb 5 | from tiledb.cf.core._shared_dim import SharedDim 6 | 7 | netCDF4 = pytest.importorskip("netCDF4") 8 | netcdf_engine = pytest.importorskip("tiledb.cf.netcdf_engine") 9 | 10 | 11 | class TestAttrsFilters: 12 | """Collection of tests for setting default attribute filters.""" 13 | 14 | def test_default_filter(self): 15 | """Tests new attribute filter is set to the attrs_filters value if the 16 | ``filters`` parameter is not specified.""" 17 | attrs_filters = tiledb.FilterList([tiledb.ZstdFilter()]) 18 | with netCDF4.Dataset("example.nc", mode="w", diskless=True) as dataset: 19 | dim = dataset.createDimension("row", 64) 20 | var = dataset.createVariable("x", np.float64, ("row",)) 21 | shared_dims = [ 22 | netcdf_engine.NetCDF4DimToDimConverter.from_netcdf(dim, None, np.uint64) 23 | ] 24 | converter = 
netcdf_engine.NetCDF4ArrayConverter( 25 | dim_order=("row",), shared_dims=shared_dims, attrs_filters=attrs_filters 26 | ) 27 | converter.add_var_to_attr_converter(var) 28 | assert converter.attr_creator("x").filters == attrs_filters 29 | 30 | def test_overwrite_default_filters(self): 31 | """Tests new attribute filter is set to the provided ``filters`` parameter when 32 | ``filters is not ``None``.""" 33 | attrs_filters = tiledb.FilterList([tiledb.ZstdFilter()]) 34 | new_filters = tiledb.FilterList([tiledb.GzipFilter(level=5)]) 35 | with netCDF4.Dataset("example.nc", mode="w", diskless=True) as dataset: 36 | dim = dataset.createDimension("row", 64) 37 | var = dataset.createVariable("x", np.float64, ("row",)) 38 | shared_dims = [ 39 | netcdf_engine.NetCDF4DimToDimConverter.from_netcdf(dim, None, np.uint64) 40 | ] 41 | converter = netcdf_engine.NetCDF4ArrayConverter( 42 | dim_order=("row",), shared_dims=shared_dims, attrs_filters=attrs_filters 43 | ) 44 | converter.add_var_to_attr_converter(var, filters=new_filters) 45 | assert converter.attr_creator("x").filters == new_filters 46 | 47 | 48 | def test_remove_dim_creator_front(): 49 | """Tests removing a dimension in the front of the domain.""" 50 | shared_dims = [ 51 | SharedDim("x0", (0, 7), np.uint32), 52 | SharedDim("x1", (0, 7), np.uint32), 53 | SharedDim("x2", (0, 4), np.uint32), 54 | ] 55 | creator = netcdf_engine.NetCDF4ArrayConverter( 56 | dim_order=("x0", "x1", "x2"), shared_dims=shared_dims 57 | ) 58 | creator.domain_creator.remove_dim_creator("x0") 59 | dim_names = tuple(dim_creator.name for dim_creator in creator.domain_creator) 60 | assert dim_names == ("x1", "x2") 61 | 62 | 63 | def test_remove_dim_creator_back(): 64 | """Tests removing a dimension in the back of the domain.""" 65 | shared_dims = [ 66 | SharedDim("x1", (0, 7), np.uint32), 67 | SharedDim("x2", (0, 7), np.uint32), 68 | SharedDim("x3", (0, 4), np.uint32), 69 | ] 70 | creator = netcdf_engine.NetCDF4ArrayConverter( 71 | dim_order=("x1", 
"x2", "x3"), shared_dims=shared_dims 72 | ) 73 | creator.domain_creator.remove_dim_creator("x3") 74 | dim_names = tuple(dim_creator.name for dim_creator in creator.domain_creator) 75 | assert dim_names == ("x1", "x2") 76 | 77 | 78 | def test_remove_dim_creator_middle(): 79 | """Tests removing a dimension in the middle of the domain.""" 80 | shared_dims = [ 81 | SharedDim("x0", (0, 7), np.uint32), 82 | SharedDim("x1", (0, 7), np.uint32), 83 | SharedDim("x2", (0, 4), np.uint32), 84 | ] 85 | creator = netcdf_engine.NetCDF4ArrayConverter( 86 | dim_order=("x0", "x1", "x2"), shared_dims=shared_dims 87 | ) 88 | creator.domain_creator.remove_dim_creator("x1") 89 | dim_names = tuple(dim_creator.name for dim_creator in creator.domain_creator) 90 | assert dim_names == ("x0", "x2") 91 | 92 | 93 | def test_remove_dim_creator_key_error(): 94 | """Tests key error when removing a dimension by name.""" 95 | shared_dims = [ 96 | SharedDim("x0", (0, 7), np.uint32), 97 | SharedDim("x1", (0, 7), np.uint32), 98 | SharedDim("x2", (0, 4), np.uint32), 99 | ] 100 | creator = netcdf_engine.NetCDF4ArrayConverter( 101 | dim_order=("x0", "x1", "x2"), shared_dims=shared_dims 102 | ) 103 | with pytest.raises(KeyError): 104 | creator.domain_creator.remove_dim_creator("x4") 105 | 106 | 107 | def test_set_max_fragment_shape_error(): 108 | """Tests raising an error when attempting to set max_fragment_shape with a value 109 | that is a bad length.""" 110 | shared_dims = [SharedDim("x", (0, 7), np.uint32)] 111 | creator = netcdf_engine.NetCDF4ArrayConverter( 112 | dim_order=("x"), shared_dims=shared_dims 113 | ) 114 | creator.add_attr_creator("y0", dtype=np.dtype("int32")) 115 | with pytest.raises(ValueError): 116 | creator.domain_creator.max_fragment_shape = (None, None) 117 | 118 | 119 | def test_array_converter_indexer_error(): 120 | """Tests value error when copying with an indexer of bad length.""" 121 | shared_dims = [SharedDim("x", (0, 7), np.uint32)] 122 | creator = 
netcdf_engine.NetCDF4ArrayConverter( 123 | dim_order=("x"), shared_dims=shared_dims 124 | ) 125 | creator.add_attr_creator("y0", dtype=np.dtype("int32")) 126 | with netCDF4.Dataset("example.nc", mode="w", diskless=True) as dataset: 127 | with pytest.raises(ValueError): 128 | creator.domain_creator.get_query_coordinates( 129 | netcdf_group=dataset, 130 | sparse=False, 131 | indexer=[slice(None), slice(None)], 132 | assigned_dim_values={"x": 0}, 133 | ) 134 | -------------------------------------------------------------------------------- /tests/core/test_fragment_writer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | import tiledb 7 | from tiledb.cf.core._fragment_writer import FragmentWriter 8 | from tiledb.cf.core._shared_dim import SharedDim 9 | from tiledb.cf.core.source import NumpyData 10 | from tiledb.cf.testing import assert_dict_arrays_equal 11 | 12 | 13 | def test_fragment_writer_create_dense(): 14 | dims = ( 15 | SharedDim("dim1", (0, 100), np.uint32), 16 | SharedDim("dim2", (0, 100), np.uint32), 17 | ) 18 | attr_names = ["attr1", "attr2", "attr3", "attr4"] 19 | writer = FragmentWriter.create_dense(dims, attr_names, ((0, 10), (0, 100))) 20 | assert writer.is_dense_region 21 | assert writer.ndim == 2 22 | assert writer.nattr == 4 23 | 24 | 25 | def test_fragment_writer_create_sparse_coo(): 26 | dims = ( 27 | SharedDim("dim1", (0, 100), np.uint32), 28 | SharedDim("dim2", (0, 100), np.uint32), 29 | ) 30 | attr_names = ["attr1"] 31 | writer = FragmentWriter.create_sparse_coo(dims, attr_names, 8) 32 | assert not writer.is_dense_region 33 | assert writer.ndim == 2 34 | assert writer.nattr == 1 35 | 36 | 37 | def test_fragment_writer_create_sparse_row_major(): 38 | dims = ( 39 | SharedDim("dim1", (0, 100), np.uint32), 40 | SharedDim("dim2", (0, 100), np.uint32), 41 | ) 42 | attr_names = ["attr1", "attr2", "attr3"] 43 | writer = 
FragmentWriter.create_sparse_row_major(
        dims, attr_names, ((0, 10), (0, 100))
    )
    assert not writer.is_dense_region
    assert writer.ndim == 2
    assert writer.nattr == 3


def test_fragment_writer_remove_attr():
    """Removing an attribute by name decrements ``nattr``."""
    dims = (
        SharedDim("dim1", (0, 100), np.uint32),
        SharedDim("dim2", (0, 100), np.uint32),
    )
    attr_names = ["attr1", "attr2", "attr3", "attr4"]
    writer = FragmentWriter.create_dense(dims, attr_names, ((0, 10), (0, 100)))
    assert writer.is_dense_region
    assert writer.nattr == 4
    writer.remove_attr("attr3")
    assert writer.nattr == 3


def test_fragment_writer_dense_1D_full(tmpdir):
    """Writes a dense 1D fragment and checks data plus attribute metadata."""
    # Define data.
    attr_data = np.arange(-3, 5)

    # Create fragment writer. The region argument is None — presumably meaning
    # the full domain (TODO confirm against FragmentWriter.create_dense).
    writer = FragmentWriter.create_dense(
        (SharedDim("dim1", (0, 7), np.uint32),), [], None
    )

    # Check fragment writer.
    assert writer.ndim == 1
    assert writer.nattr == 0

    # Add attribute and check update.
    writer.add_attr("attr1")
    assert writer.nattr == 1

    # Add attribute data.
    writer.set_attr_data("attr1", NumpyData(attr_data, metadata={"key": "value"}))

    # Create base array.
    # NOTE(review): the directory name contains a stray comma; harmless for a
    # tmpdir name, but likely a typo (the sparse test below reuses it).
    uri = str(tmpdir.join("test_fragment_writer_dense,_1D_full"))
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(tiledb.Dim("dim1", domain=(0, 7), dtype=np.uint32)),
        attrs=[tiledb.Attr("attr1", dtype=np.int64)],
    )
    tiledb.Array.create(uri, schema)

    with tiledb.open(uri, "w") as array:
        writer.write(array)

    with tiledb.open(uri) as array:
        result = array[...]
        meta = dict(array.meta.items())

    # Attribute metadata is stored under the __tiledb_attr.<name>.<key> prefix.
    assert_dict_arrays_equal(result, {"attr1": attr_data})
    assert len(meta) == 1
    assert meta["__tiledb_attr.attr1.key"] == "value"


def test_fragment_writer_sparse_row_major_1D_full(tmpdir):
    """Writes a sparse row-major 1D fragment with dimension data and checks
    the data plus both attribute and dimension metadata."""
    # Define data.
    attr_data = np.arange(-3, 5, dtype=np.int64)
    dim_data = np.arange(8, dtype=np.uint32)

    # Create fragment writer.
    writer = FragmentWriter.create_sparse_row_major(
        (SharedDim("dim1", (0, 7), np.uint32),), [], (8,)
    )

    # Check fragment writer.
    assert writer.ndim == 1
    assert writer.nattr == 0

    # Add attribute and check update.
    writer.add_attr("attr1")
    assert writer.nattr == 1

    # Add attribute data and dimension data.
    writer.set_attr_data("attr1", NumpyData(attr_data, metadata={"key1": "attr_value"}))
    writer.set_dim_data("dim1", NumpyData(dim_data, metadata={"key2": "dim_value"}))

    # Create base array.
    uri = str(tmpdir.join("test_fragment_writer_dense,_1D_full"))
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(tiledb.Dim("dim1", domain=(0, 7), dtype=np.uint32)),
        attrs=[tiledb.Attr("attr1", dtype=np.int64)],
        sparse=True,
    )
    tiledb.Array.create(uri, schema)

    with tiledb.open(uri, "w") as array:
        writer.write(array)

    with tiledb.open(uri) as array:
        result = array[...]
        meta = dict(array.meta.items())

    # Third argument presumably relaxes the comparison (ordering?) — see
    # tiledb.cf.testing.assert_dict_arrays_equal (TODO confirm).
    assert_dict_arrays_equal(
        result, OrderedDict([("attr1", attr_data), ("dim1", dim_data)]), False
    )
    assert len(meta) == 2
    assert meta["__tiledb_attr.attr1.key1"] == "attr_value"
    assert meta["__tiledb_dim.dim1.key2"] == "dim_value"


def test_fragment_writer_set_attr_data_key_error():
    """Setting data for an attribute the writer does not have raises ``KeyError``."""
    dims = (
        SharedDim("dim1", (0, 100), np.uint32),
        SharedDim("dim2", (0, 100), np.uint32),
    )
    attr_names = ["attr1", "attr2"]
    writer = FragmentWriter.create_dense(dims, attr_names, ((0, 10), (0, 0)))
    with pytest.raises(KeyError):
        writer.set_attr_data("attr3", NumpyData(np.arange(11)))


def test_fragment_writer_set_attr_data_size_value_error():
    """Setting attribute data whose size does not match the target region
    raises ``ValueError``."""
    dims = (
        SharedDim("dim1", (0, 100), np.uint32),
        SharedDim("dim2", (0, 100), np.uint32),
    )
    attr_names = ["attr1", "attr2", "attr3"]
    writer = FragmentWriter.create_dense(dims, attr_names, ((0, 10), (0, 10)))
    with pytest.raises(ValueError):
        writer.set_attr_data("attr3", NumpyData(np.arange(11)))
The size of the full domain of the dimension.\n", 19 | "\n", 20 | "The default behavior of TileDB is to take the maximum value of all non-empty domains. If you have dimensions with mis-matched domain,\n", 21 | "the dimension will never return a size larger than the smallest domain.\n", 22 | "\n", 23 | "### Set-up Requirements\n", 24 | "This example requires `tiledb-cf` to be installed and uses the `tiledb`, `xarray`, and `numpy` libraries. " 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import tiledb\n", 34 | "import xarray as xr\n", 35 | "import numpy as np" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Set names for the output generated by the example.\n", 45 | "output_dir = \"output/tiledb-xarray-partially-filled\"\n", 46 | "array_uri = f\"{output_dir}/example1\"\n", 47 | "group1_uri = f\"{output_dir}/group1\"\n", 48 | "group2_uri = f\"{output_dir}/group2\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# Reset output folder\n", 58 | "import os\n", 59 | "import shutil\n", 60 | "\n", 61 | "shutil.rmtree(output_dir, ignore_errors=True)\n", 62 | "os.mkdir(output_dir)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Example 1: Simple partially-filled 2D array" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# Create array and write data.\n", 79 | "tiledb.Array.create(\n", 80 | " array_uri,\n", 81 | " tiledb.ArraySchema(\n", 82 | " domain=tiledb.Domain(\n", 83 | " tiledb.Dim(\"x\", domain=(0, 7), dtype=np.uint64),\n", 84 | " tiledb.Dim(\"y\", domain=(0, 7), dtype=np.uint64),\n", 85 | " ),\n", 86 | " attrs=[tiledb.Attr(\"z\", 
np.float64)],\n", 87 | " ),\n", 88 | ")\n", 89 | "with tiledb.open(array_uri, mode=\"w\") as array:\n", 90 | " array[0:4, 0:4] = np.reshape(np.arange(16), (4, 4))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Print non-empty domain and data.\n", 100 | "with tiledb.open(array_uri) as array:\n", 101 | " print(f\"Non-empty domain: {array.nonempty_domain()}\")\n", 102 | " print(f\"Data in non-empty domain:\\n {array.multi_index[:, :]['z']}\")\n", 103 | " print(f\"All data: \\n {array[:, :]['z']}\")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# By default, xarray will only open the non-empty domain\n", 113 | "xr.open_dataset(array_uri, engine=\"tiledb\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Example 2: Fixed dimensions\n", 121 | "We can create a group that always reads some or all of the dimensions as full dimensions." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# Set `x` to be a fixed-size dimension.\n", 131 | "tiledb.Group.create(group1_uri)\n", 132 | "with tiledb.Group(group1_uri, mode=\"w\") as group:\n", 133 | " group.add(uri=array_uri, name=\"z\")\n", 134 | " group.meta[\"__tiledb_array_fixed_dimensions.z\"] = \"x\"" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "xr.open_dataset(group1_uri, engine=\"tiledb\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Set `y` to be a fixed-size dimension.\n", 153 | "tiledb.Group.create(group2_uri)\n", 154 | "with tiledb.Group(group2_uri, mode=\"w\") as group:\n", 155 | " group.add(uri=array_uri, name=\"z\")\n", 156 | " group.meta[\"__tiledb_array_fixed_dimensions.z\"] = \"x;y\"" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "xr.open_dataset(group2_uri, engine=\"tiledb\")" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "interpreter": { 171 | "hash": "022b808d35d9188bc114e3dbdd31978ae285e77fefec36d9e39c13a87da8d5e5" 172 | }, 173 | "kernelspec": { 174 | "display_name": "Python 3.9.0 64-bit ('cf-3.9.0': pyenv)", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.11.3" 189 | }, 190 | "orig_nbformat": 4 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 2 194 | } 195 | 
-------------------------------------------------------------------------------- /tiledb/cf/core/_metadata.py: -------------------------------------------------------------------------------- 1 | """Classes for additional group and metadata support useful for the TileDB-CF data 2 | model.""" 3 | 4 | from __future__ import annotations 5 | 6 | from collections.abc import MutableMapping 7 | from typing import Any, Iterator, Optional, TypeVar, Union 8 | 9 | import tiledb 10 | 11 | DType = TypeVar("DType", covariant=True) 12 | ATTR_METADATA_FLAG = "__tiledb_attr." 13 | DIM_METADATA_FLAG = "__tiledb_dim." 14 | 15 | 16 | class Metadata(MutableMapping): 17 | """Class for accessing Metadata using the standard MutableMapping API. 18 | 19 | Parameters 20 | ---------- 21 | metadata 22 | TileDB array metadata object. 23 | """ 24 | 25 | def __init__(self, metadata: tiledb.Metadata): 26 | self._metadata = metadata 27 | 28 | def __iter__(self) -> Iterator[str]: 29 | """Iterates over all metadata keys.""" 30 | for tiledb_key in self._metadata.keys(): 31 | key = self._from_tiledb_key(tiledb_key) 32 | if key is not None: 33 | yield key 34 | 35 | def __len__(self) -> int: 36 | """Returns the number of metadata items.""" 37 | return sum(1 for _ in self) 38 | 39 | def __getitem__(self, key: str) -> Any: 40 | """Implementation of [key] -> val (dict item retrieval). 41 | 42 | Parameters 43 | ---------- 44 | key 45 | Key to find value from. 46 | 47 | Returns 48 | ------- 49 | Any 50 | Value stored with provided key. 51 | """ 52 | return self._metadata[self._to_tiledb_key(key)] 53 | 54 | def __setitem__(self, key: str, value: Any): 55 | """Implementation of [key] <- val (dict item assignment). 56 | 57 | Paremeters 58 | ---------- 59 | key 60 | Key to set 61 | value 62 | Corresponding value 63 | """ 64 | self._metadata[self._to_tiledb_key(key)] = value 65 | 66 | def __delitem__(self, key): 67 | """Implementation of del [key] (dict item deletion). 
68 | 69 | Parameters 70 | ---------- 71 | key 72 | Key to remove. 73 | """ 74 | del self._metadata[self._to_tiledb_key(key)] 75 | 76 | def _to_tiledb_key(self, key: str) -> str: 77 | """Map an external user metadata key to an internal tiledb key.""" 78 | return key # pragma: no cover 79 | 80 | def _from_tiledb_key(self, tiledb_key: str) -> Optional[str]: 81 | """Map an internal tiledb key to an external user metadata key. 82 | 83 | Parameters 84 | ---------- 85 | tiledb_key 86 | Internal key to use for metadata. 87 | 88 | Returns 89 | ------- 90 | Optional[str] 91 | The external user metadata key corresponding to `tiledb_key`, 92 | or None if there is no such corresponding key. 93 | """ 94 | return tiledb_key # pragma: no cover 95 | 96 | 97 | class ArrayMetadata(Metadata): 98 | """Class for accessing array-related metadata from a TileDB metadata object. 99 | 100 | This class provides a way for accessing the TileDB array metadata that excludes 101 | attribute and dimension specific metadata. 102 | """ 103 | 104 | def _to_tiledb_key(self, key: str) -> str: 105 | if key.startswith(ATTR_METADATA_FLAG): 106 | raise KeyError("Key is reserved for attribute metadata.") 107 | if key.startswith(DIM_METADATA_FLAG): 108 | raise KeyError("Key is reserved for dimension metadata.") 109 | return key 110 | 111 | def _from_tiledb_key(self, tiledb_key: str) -> Optional[str]: 112 | if not ( 113 | tiledb_key.startswith(ATTR_METADATA_FLAG) 114 | or tiledb_key.startswith(DIM_METADATA_FLAG) 115 | ): 116 | return tiledb_key 117 | return None 118 | 119 | 120 | class AttrMetadata(Metadata): 121 | """Metadata wrapper for accessing attribute metadata. 122 | 123 | This class allows access to the metadata for an attribute stored in the metadata 124 | for a TileDB array. 125 | 126 | Parameters 127 | ---------- 128 | metadata 129 | TileDB array metadata for the array containing the desired attribute. 130 | attr 131 | Name or index of the arrary attribute being requested. 
132 |     """
133 | 
134 |     def __init__(self, metadata: tiledb.Metadata, attr: Union[str, int]):
135 |         super().__init__(metadata)
136 |         try:
137 |             attr_name = metadata.array.attr(attr).name
138 |         except tiledb.TileDBError as err:
139 |             raise KeyError(f"Attribute `{attr}` not found in array.") from err
140 |         self._key_prefix = ATTR_METADATA_FLAG + attr_name + "."
141 | 
142 |     def _to_tiledb_key(self, key: str) -> str:
143 |         return self._key_prefix + key
144 | 
145 |     def _from_tiledb_key(self, tiledb_key: str) -> Optional[str]:
146 |         if tiledb_key.startswith(self._key_prefix):
147 |             return tiledb_key[len(self._key_prefix) :]
148 |         return None
149 | 
150 | 
151 | class DimMetadata(Metadata):
152 |     """Metadata wrapper for accessing dimension metadata.
153 | 
154 |     This class allows access to the metadata for a dimension stored in the metadata
155 |     for a TileDB array.
156 | 
157 |     Parameters
158 |     ----------
159 |     metadata
160 |         TileDB array metadata for the array containing the desired dimension.
161 |     dim
162 |         Name or index of the array dimension being requested.
163 |     """
164 | 
165 |     def __init__(self, metadata: tiledb.Metadata, dim: Union[str, int]):
166 |         super().__init__(metadata)
167 |         try:
168 |             dim_name = metadata.array.dim(dim).name
169 |         except tiledb.TileDBError as err:
170 |             raise KeyError(f"Dimension `{dim}` not found in array.") from err
171 |         self._key_prefix = DIM_METADATA_FLAG + dim_name + "."
172 | 
173 |     def _to_tiledb_key(self, key: str) -> str:
174 |         return self._key_prefix + key
175 | 
176 |     def _from_tiledb_key(self, tiledb_key: str) -> Optional[str]:
177 |         if tiledb_key.startswith(self._key_prefix):
178 |             return tiledb_key[len(self._key_prefix) :]
179 |         return None
180 | 
-------------------------------------------------------------------------------- /documentation/xarray-engine.md: --------------------------------------------------------------------------------
1 | ---
2 | title: TileDB-CF Xarray Engine
3 | ---
4 | 
5 | 
6 | ## Reading from TileDB with Xarray
7 | 
8 | Xarray uses a plugin infrastructure that allows third-party developers to create their own backend engines for reading data into xarray. TileDB-CF contains one such backend. To use the backend, make sure `tiledb-cf` is installed in your current Python environment, and use the `tiledb` engine:
9 | 
10 | 
11 | ```python
12 | import xarray as xr
13 | 
14 | xr.open_dataset(tiledb_uri, engine="tiledb")
15 | ```
16 | 
17 | The TileDB engine can be used to open either a TileDB array or a TileDB group. See the requirements on the arrays below.
18 | 
19 | The backend engine will open the group or array as a dataset with TileDB dimensions mapping to dataset dimensions, TileDB attributes mapping to dataset variables/DataArrays, and TileDB metadata mapping to dataset attributes.
20 | 
21 | 
22 | For a TileDB array to be readable by xarray, the following must be satisfied:
23 | 
24 | * The array must be dense.
25 | * All dimensions on the array must be either signed or unsigned integers.
26 | * All dimensions must have a domain that starts at `0`.
27 | 
28 | For a TileDB group to be readable by xarray, the following must be satisfied:
29 | 
30 | * All arrays in the group satisfy the above requirements for the array to be readable.
31 | * Each attribute has a unique "variable name".
32 | 
33 | The TileDB backend engine can be used with the standard xarray keyword arguments.
It supports the additional TileDB-specific arguments: 34 | 35 | * `config`: An optional TileDB configuration object to use in arrays and groups. 36 | * `ctx`: An optional TileDB context object to use for all TileDB operations. 37 | * `timestamp`: An optional timestamp to open the TileDB array at (not supported on groups). 38 | 39 | 40 | ## Writing from Xarray to TileDB 41 | 42 | The xarray writer is stricter than the xarray backend engine (reader). While the reader will attempt to open arrays with multiple attributes, the xarray writer only creates arrays with one attribute per name. 43 | 44 | There are two sets of functions for writing to xarray: 45 | 46 | 1. Single dataset ingestion. 47 | 48 | * Functions used: `from_xarray` 49 | * Useful when copying an entire xarray dataset to a TileDB group in a single function call. 50 | * Creates the group and copies all data and metadata to the new group in a single function call. 51 | 52 | 2. Multi-dataset ingestion. 53 | 54 | * Main functions: `create_group_from_xarray` and `copy_data_from_xarray`. 55 | * Additional helper function: `copy_metadata_from_xarray`. 56 | * Useful when copying multiple xarray datasets to a single TileDB group. 57 | * Creates the group and copies data to the group in separate API calls. 58 | 59 | The xarray to TileDB writer will copy the dataset in the following way: 60 | 61 | * One group is created for the dataset. 62 | * Dataset "attributes" are copied to group level metadata. 63 | * Each xarray variable is copied to its own dense TileDB array with a single TileDB attribute. 64 | 65 | The array schema for an xarray variable is generated as follows: 66 | 67 | * TileDB array properties: 68 | 69 | - The TileDB array is dense. 70 | 71 | * TileDB Domain: 72 | 73 | - All dimensions have the same datatype determined by the `dim_dtype` encoding. 74 | 75 | - The dimension names in the TileDB array match the dimension names in the xarray variable. 
76 | 77 | - The dimension tiles are determined by the `tiles` encoding. 78 | 79 | - The domain of each dimension is set to `[0, max_size - 1]` where `max_size` is computed as follows: 80 | 81 | 1. Use the corresponding element of the `max_shape` encoding if provided. 82 | 83 | 2. If the `max_shape` encoding is not provided and the xarray dimension is "unlimited", use the largest possible size for this integer type. 84 | 85 | 3. If the `max_shape` encoding is not provided and the xarray dimension is not "unlimited", use the size of the xarray dimension. 86 | 87 | * TileDB Attribute: 88 | 89 | - The attribute datatype is the same as the variable datatype (after applying xarray encodings). 90 | 91 | - The attribute name is set using the following: 92 | 93 | 1. Use the name provided by `attr_name` encoding. 94 | 95 | 2. If the `attr_name` encoding is not provided and there is no dimension on this variable with the same name as the variable, use the name of the variable. 96 | 97 | 3. If the `attr_name` encoding is provided and there is a dimension on this variable with the same name as the variable, use the variable name appended with `_`. 98 | 99 | - The attribute filters are determined by the `filters` encoding. 100 | 101 | 102 | 103 | ### TileDB Encoding 104 | 105 | The writer takes a dictionary from dataset variable names to a dictionary of encodings for setting TileDB properties. The possible encoding keywords are provided in the table below. 106 | 107 | +------------------+-----------------------------------------------+--------------------+ 108 | | Encoding Keyword | Details | Type | 109 | +==================+===============================================+====================+ 110 | | `attr_name` | Name to use for the TileDB attribute. | str | 111 | +------------------+-----------------------------------------------+--------------------+ 112 | | `filters` | Filter list to apply to the TileDB attribute. 
| tiledb.FilterList  |
113 | +------------------+-----------------------------------------------+--------------------+
114 | | `tiles`          | Tile sizes to apply to the TileDB dimensions. | tuple of ints      |
115 | +------------------+-----------------------------------------------+--------------------+
116 | | `max_shape`      | Maximum possible size of the TileDB array.    | tuple of ints      |
117 | +------------------+-----------------------------------------------+--------------------+
118 | | `dim_dtype`      | Datatype to use for the TileDB dimensions.    | str or numpy.dtype |
119 | +------------------+-----------------------------------------------+--------------------+
120 | 
121 | 
122 | ### Region to Write
123 | 
124 | If creating TileDB arrays with either unlimited dimensions or with an encoded `max_shape` larger than the current size of the xarray variable, then the region to write the data to needs to be provided. This is input as a dictionary from dimension names to slices. The slice uses xarray/numpy conventions and will write to a region that does **not** include the upper bound of the slice.
125 | 
126 | 
127 | ### Creating Multiple Fragments
128 | 
129 | When copying data with either the `from_xarray` or `copy_data_from_xarray` functions, the copy routine will use Xarray chunks for separate writes - creating multiple fragments.
130 | 
-------------------------------------------------------------------------------- /tests/netcdf_engine/test_convert_multifragments.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | 
4 | import tiledb
5 | from tiledb.cf import NetCDF4ConverterEngine, open_group_array
6 | 
7 | netCDF4 = pytest.importorskip("netCDF4")
8 | 
9 | 
10 | class TestSimplyCopyChunks:
11 |     """Test converting a simple NetCDF in chunks.
12 | 13 | NetCDF File: 14 | 15 | dimensions: 16 | x (8) 17 | y (8) 18 | z (8) 19 | 20 | variables: 21 | f (x, y, z) = reshape([0, ..., 511], (8, 8, 8)) 22 | """ 23 | 24 | attr_data = np.reshape(np.arange(512), (8, 8, 8)) 25 | 26 | @pytest.fixture(scope="class") 27 | def netcdf_file(self, tmpdir_factory): 28 | """Returns the NetCDF file that will be used to test the conversion.""" 29 | filepath = tmpdir_factory.mktemp("input_file").join("simple_copy_chunks.nc") 30 | with netCDF4.Dataset(filepath, mode="w") as dataset: 31 | dataset.createDimension("x", 8) 32 | dataset.createDimension("y", 8) 33 | dataset.createDimension("z", 8) 34 | var = dataset.createVariable( 35 | varname="f", datatype=np.int64, dimensions=("x", "y", "z") 36 | ) 37 | var[:, :, :] = self.attr_data 38 | return filepath 39 | 40 | @pytest.mark.parametrize( 41 | "sparse,expected_result", ((False, attr_data), (True, np.arange(512))) 42 | ) 43 | def test_convert_chunks(self, netcdf_file, tmpdir, sparse, expected_result): 44 | """Test copying NetCDF file in chunks for a simple NetCDF file.""" 45 | uri = str(tmpdir.mkdir("output").join("simple_copy_chunks")) 46 | converter = NetCDF4ConverterEngine.from_file(netcdf_file) 47 | array_creator = converter.get_array_creator_by_attr("f") 48 | array_creator.sparse = sparse 49 | assert array_creator.domain_creator.max_fragment_shape == (None, None, None) 50 | array_creator.domain_creator.max_fragment_shape = (4, 8, 2) 51 | assert array_creator.domain_creator.max_fragment_shape == (4, 8, 2) 52 | converter.convert_to_group(uri) 53 | with tiledb.Group(uri) as group: 54 | with open_group_array(group, attr="f") as array: 55 | array_uri = array.uri 56 | result = array[...] 
57 | result = result["f"] if isinstance(result, dict) else result 58 | np.testing.assert_equal(result, expected_result) 59 | fragment_info = tiledb.FragmentInfoList(array_uri) 60 | assert len(fragment_info) == 8 61 | 62 | @pytest.mark.parametrize( 63 | "sparse,expected_result", 64 | ((False, np.reshape(np.arange(512), (8, 8, 8))), (True, np.arange(512))), 65 | ) 66 | def test_convert_chunks_with_injected( 67 | self, netcdf_file, tmpdir, sparse, expected_result 68 | ): 69 | """Test copying NetCDF file in chunks for a simple NetCDF file with externally 70 | provided dimension and attribute values.""" 71 | uri = str(tmpdir.mkdir("output").join("simple_copy_chunks")) 72 | converter = NetCDF4ConverterEngine.from_file(netcdf_file) 73 | converter.add_shared_dim("t", domain=(0, 3), dtype=np.uint64) 74 | array_creator = converter.get_array_creator_by_attr("f") 75 | array_creator.sparse = sparse 76 | array_creator.add_attr_creator(name="g", dtype=np.float64) 77 | array_creator.domain_creator.inject_dim_creator("t", 0) 78 | array_creator.domain_creator.max_fragment_shape = (1, 4, 8, 2) 79 | # Define data for extra variable 80 | g_data = np.reshape(np.random.random_sample((512)), (1, 8, 8, 8)) 81 | converter.convert_to_group( 82 | uri, 83 | assigned_dim_values={"t": 0}, 84 | assigned_attr_values={"g": g_data}, 85 | ) 86 | with tiledb.Group(uri) as group: 87 | with open_group_array(group, array="array0") as array: 88 | array_uri = array.uri 89 | result = array[0, :, :, :] 90 | f_result = result["f"] 91 | np.testing.assert_equal(f_result, expected_result) 92 | g_result = np.reshape(result["g"], (1, 8, 8, 8)) 93 | np.testing.assert_equal(g_data, g_result) 94 | fragment_info = tiledb.FragmentInfoList(array_uri) 95 | assert len(fragment_info) == 8 96 | 97 | 98 | class TestCoordinateCopyChunks: 99 | """Test converting a simple NetCDF in chunks. 
100 | 
101 |     NetCDF File:
102 | 
103 |     dimensions:
104 |         x (8)
105 |         y (8)
106 | 
107 |     variables:
108 |         x (x) = [-4, -3, ..., 3]
109 |         y (y) = [10, 20, ..., 80]
110 |         f (x, y) = [[0, 1, ...],...,[...,62,63]]
111 |     """
112 | 
113 |     x_data = np.arange(-4, 4)
114 |     y_data = np.arange(10, 81, 10)
115 |     attr_data = np.reshape(np.arange(64), (8, 8))
116 | 
117 |     @pytest.fixture(scope="class")
118 |     def netcdf_file(self, tmpdir_factory):
119 |         """Returns the NetCDF file that will be used to test the conversion."""
120 |         filepath = tmpdir_factory.mktemp("input_file").join("simple_copy_chunks.nc")
121 |         with netCDF4.Dataset(filepath, mode="w") as dataset:
122 |             dataset.createDimension("x", 8)
123 |             dataset.createDimension("y", 8)
124 |             var = dataset.createVariable(
125 |                 varname="f", datatype=np.int64, dimensions=("x", "y")
126 |             )
127 |             var[:, :] = self.attr_data
128 |             var = dataset.createVariable(
129 |                 varname="x", datatype=np.int64, dimensions=("x")
130 |             )
131 |             var[:] = self.x_data
132 |             var = dataset.createVariable(
133 |                 varname="y", datatype=np.int64, dimensions=("y")
134 |             )
135 |             var[:] = self.y_data
136 |         return filepath
137 | 
138 |     def test_convert_chunks(self, netcdf_file, tmpdir):
139 |         """Test copying NetCDF file in chunks for a NetCDF to TileDB conversion that
140 |         maps NetCDF coordinates to dimensions."""
141 |         uri = str(tmpdir.mkdir("output").join("simple_copy_chunks"))
142 |         converter = NetCDF4ConverterEngine.from_file(netcdf_file, coords_to_dims=True)
143 |         converter.get_shared_dim("x").domain = (-4, 3)
144 |         converter.get_shared_dim("y").domain = (10, 80)
145 |         array_creator = converter.get_array_creator_by_attr("f")
146 |         array_creator.domain_creator.max_fragment_shape = (4, 4)
147 |         converter.convert_to_group(uri)
148 |         with tiledb.Group(uri) as group:
149 |             with open_group_array(group, attr="f") as array:
150 |                 array_uri = array.uri
151 |                 result = array[...]
152 |         for x_value, y_value, f_value in zip(result["x"], result["y"], result["f"]):
153 |             ix = np.argwhere(self.x_data == x_value)
154 |             assert len(ix) == 1
155 |             assert 0 <= ix[0] <= 7
156 |             iy = np.argwhere(self.y_data == y_value)
157 |             assert len(iy) == 1
158 |             assert 0 <= iy[0] <= 7
159 |             f_expected = self.attr_data[ix[0], iy[0]]
160 |             assert f_value == f_expected
161 |         fragment_info = tiledb.FragmentInfoList(array_uri)
162 |         assert len(fragment_info) == 4
163 | 
-------------------------------------------------------------------------------- /documentation/contributing.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Contributing to TileDB-CF-Py
3 | ---
4 | 
5 | Thank you for your interest in contributing to TileDB-CF-Py. The following notes are intended to help you file issues, bug reports, or contribute code to this open source project.
6 | 
7 | ## Contributing Checklist
8 | 
9 | * Reporting a bug? Please read [how to file a bug report](#reporting-a-bug) section to make sure sufficient information is included.
10 | 
11 | * Contributing code? You rock! Be sure to [review the contributor section](#contributing-code) for helpful tips on the tools we use to build this project, format code, and issue pull requests (PR)'s.
12 | 
13 | Note: All participants in TileDB spaces are expected to adhere to a high standard of professionalism in all interactions. See the [code of conduct](code-of-conduct.md) for more information.
14 | 
15 | ## Reporting a Bug
16 | 
17 | A useful bug report filed as a GitHub issue provides information about how to reproduce the error.
18 | 
19 | 1. Before opening a new [GitHub issue](https://github.com/TileDB-Inc/TileDB-CF-Py/issues) try searching the existing issues to see if someone else has already noticed the same problem.
20 | 
21 | 2. When filing a bug report, provide where possible:
22 | 
23 |     * The version of TileDB-CF-Py or if a `dev` version, the specific commit that triggers the error.
24 | * The full error message, including the backtrace (if possible). 25 | * A minimal working example, i.e. the smallest chunk of code that triggers the error. Ideally, this should be code that can be a small reduced python file. If the code to reproduce is somewhat long, consider putting it in a [gist](https://gist.github.com). 26 | 27 | 3. When pasting code blocks or output, put triple backquotes (\`\`\`) around the text so GitHub will format it nicely. Code statements should be surrounded by single backquotes (\`). See [GitHub's guide on Markdown](https://guides.github.com/features/mastering-markdown) for more formatting tricks. 28 | 29 | ## Contributing Code 30 | 31 | *By contributing code to TileDB-CF-Py, you are agreeing to release it under the [MIT License](https://github.com/TileDB-Inc/TileDB/tree/dev/LICENSE).* 32 | 33 | ### Quickstart Workflow 34 | 35 | [From a fork of TileDB-CF-Py](https://help.github.com/articles/fork-a-repo/) 36 | 37 | ```bash 38 | git clone https://github.com/username/TileDB-CF-Py 39 | pip install -e '.[parallel]' 40 | git checkout -b / 41 | # ... code changes ... 42 | ./tools/lint.sh # run linters 43 | git commit -a -m "descriptive commit message" 44 | git push --set-upstream origin / 45 | ``` 46 | 47 | [Issue a PR from your updated TileDB-CF-Py fork](https://help.github.com/articles/creating-a-pull-request-from-a-fork/) 48 | 49 | Branch conventions: 50 | 51 | * `dev` is the development branch of TileDB-CF-Py, all PR's are merged into `dev`. 52 | * `release-x.y.z` are major / bugfix release branches. 53 | 54 | ### Building Locally for Development 55 | 56 | This project uses setuptools for its build system, and can be built locally using pip. It is recommended you set-up a Python virtual environment with your preferred method before installing. Once the virtual environment is activated, install `tiledb.cf` as 'editable' using pip: 57 | 58 | ```bash 59 | pip install -e . 
60 | ``` 61 | 62 | The following tools are used for testing, linting, and formatting. You may want to install them either in the local virtual environment or as command line tools for you system: 63 | 64 | * black 65 | * flake8 66 | * mypy 67 | * pytest (with pytest-cov) 68 | 69 | 70 | ### Formatting, Style, and Linting 71 | 72 | * 4 spaces per indentation level not tabs 73 | * class names use `CamelCase` 74 | * member functions, variables use `snake_case` 75 | * private module or class member use a leading underscore `_local_variable` 76 | * comments are good, the project uses Google-style docstrings with type hints 77 | * format code using [black](https://pypi.org/project/black/) and [isort](https://pypi.org/project/isort/) 78 | * lint code using [flake8](https://pypi.org/project/flake8/) and [mypy](https://pypi.org/project/mypy/) 79 | 80 | It is highly recommended to run formatting and linting tools before every commit. This can be automated by activating the pre-commit hook `tools/hooks/pre-commit.sh`. To do this symlink or copy `tools/hooks/pre-commit.sh` to `.git/hooks/pre-commit` in the local directory. Note that the pre-commit hook may fail due to unstaged changes. You may wish to stash these changes before committing. This can be done as follows: 81 | 82 | ```bash 83 | git add 84 | git stash --keep-index 85 | git commit 86 | git stash pop 87 | ``` 88 | 89 | ### Testing 90 | 91 | The testing for this project uses pytest and GitHub workflows for testing. The test suite will be run on GitHub when you submit your pull request. 92 | 93 | ### API Documentation 94 | 95 | To build the API documentation do the following from this projects root directory: 96 | 97 | 1. Install required packages: 98 | ```bash 99 | python3 -m pip install tiledb-cf[docs] 100 | ``` 101 | 2. Make the HTML document: 102 | ```bash 103 | make -C docs/ html 104 | ``` 105 | 3. Open [docs/_build/html/index.html](./docs/_build/html/index.html) in a web browser of your choice. 
106 | 107 | 108 | ### Pull Requests 109 | 110 | * `dev` is the development branch, all PR’s should be rebased on top of the latest `dev` commit. 111 | 112 | * Commit changes to a local branch. The convention is to use your initials to identify branches. Branch names should be identifiable and reflect the feature or bug that they want to address / fix. This helps in deleting old branches later. 113 | 114 | * When ready to submit a PR, `git rebase` the branch on top of the latest `dev` commit. Be sure to squash / cleanup the commit history so that the PR preferably one, or a couple commits at most. All commits will be squashed into a single commit upon merging. 115 | 116 | * Run the formatting (`isort`, `black`) and linting tools (`flake8`, `mypy`) before submitting a final PR. Make sure that your contribution generally follows the format and naming conventions used by surrounding code. 117 | 118 | * Update the HISTORY with any changes/adds/removes to user-facing API or system behavior. Make sure to note any non-backward compatible changes as a breaking change. 119 | 120 | * Submit a PR, writing a descriptive message. If a PR closes an open issue, reference the issue in the PR message (e.g. If an issue closes issue number 10, you would write `closes #10`) 121 | 122 | * Make sure CI (continuous integration) is passing for your PR. 
123 | 124 | ### Resources 125 | 126 | * TileDB-CF-Py 127 | * [Issues](https://github.com/TileDB-Inc/TileDB-CF-Py/issues) 128 | * [Documentation](https://docs.tiledb.com/geospatial) 129 | 130 | * TileDB 131 | * [Homepage](https://tiledb.com) 132 | * [Documentation](https://docs.tiledb.com/main/) 133 | * [Forum](https://forum.tiledb.io/) 134 | * [Organization](https://github.com/TileDB-Inc/) 135 | 136 | * Github / Git 137 | * [Git cheatsheet](https://services.github.com/on-demand/downloads/github-git-cheat-sheet/) 138 | * [Github Documentation](https://help.github.com/) 139 | * [Forking a Repo](https://help.github.com/articles/fork-a-repo/) 140 | * [More Learning Resources](https://help.github.com/articles/git-and-github-learning-resources/) 141 | -------------------------------------------------------------------------------- /tests/core/test_write_array.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | 5 | import tiledb 6 | from tiledb.cf.core._array_creator import ArrayCreator 7 | from tiledb.cf.core._shared_dim import SharedDim 8 | from tiledb.cf.testing import assert_dict_arrays_equal 9 | 10 | 11 | def test_write_array_dense_1D_full(tmpdir): 12 | uri = str(tmpdir.mkdir("output").join("dense_1D_full")) 13 | attr_data = np.arange(-3, 5) 14 | 15 | creator = ArrayCreator( 16 | dim_order=("dim1",), 17 | shared_dims=[SharedDim("dim1", (0, 7), np.uint32)], 18 | ) 19 | creator.add_dense_fragment_writer() 20 | creator.add_attr_creator("attr1", dtype=np.int64) 21 | creator["attr1"].set_writer_data(attr_data) 22 | 23 | creator.write(uri) 24 | 25 | with tiledb.open(uri) as array: 26 | result = array[...] 
def test_write_array_sparse_1D_dense_region_full(tmpdir):
    """Write a full dense region into a 1D sparse array and read it back."""
    uri = str(tmpdir.mkdir("output").join("sparse_1D_dense_full"))
    expected_values = np.arange(-3, 5)

    array_creator = ArrayCreator(
        dim_order=("dim1",),
        shared_dims=[SharedDim("dim1", (0, 7), np.uint32)],
        sparse=True,
    )
    array_creator.add_attr_creator("attr1", dtype=np.int64)
    array_creator.add_dense_fragment_writer()
    array_creator["attr1"].set_writer_data(expected_values)

    array_creator.write(uri)

    with tiledb.open(uri) as tiledb_array:
        actual = tiledb_array[...]

    # A dense fragment written into a sparse array materializes every
    # coordinate of the region, so the full dimension range is expected.
    expected = OrderedDict(
        [
            ("dim1", np.arange(8, dtype=np.uint32)),
            ("attr1", expected_values),
        ]
    )
    assert_dict_arrays_equal(actual, expected)
dim_order=("dim1",), 88 | shared_dims=[SharedDim("dim1", (0, 7), np.uint32)], 89 | sparse=True, 90 | ) 91 | creator.add_attr_creator("attr1", dtype=np.int64) 92 | creator.add_sparse_fragment_writer(shape=(4,), form="row-major") 93 | creator["attr1"].set_writer_data(attr_data) 94 | creator.domain_creator["dim1"].set_writer_data(dim_data) 95 | 96 | creator.write(uri) 97 | 98 | with tiledb.open(uri) as array: 99 | result = array.multi_index[:] 100 | 101 | expected = OrderedDict() 102 | expected["dim1"] = dim_data 103 | expected["attr1"] = attr_data 104 | assert_dict_arrays_equal(result, expected, False) 105 | 106 | 107 | def test_write_array_dense_2D_full(tmpdir): 108 | uri = str(tmpdir.mkdir("output").join("dense_2D_full")) 109 | attr_data = np.resize(np.arange(-3, 28), (8, 4)) 110 | 111 | creator = ArrayCreator( 112 | dim_order=("dim1", "dim2"), 113 | shared_dims=[ 114 | SharedDim("dim1", (0, 7), np.uint32), 115 | SharedDim("dim2", (0, 3), np.uint32), 116 | ], 117 | ) 118 | creator.add_attr_creator("attr1", dtype=np.int64) 119 | creator.add_dense_fragment_writer() 120 | creator["attr1"].set_writer_data(attr_data) 121 | 122 | creator.write(uri) 123 | 124 | with tiledb.open(uri) as array: 125 | result = array[...] 126 | 127 | assert_dict_arrays_equal(result, {"attr1": attr_data}) 128 | 129 | 130 | def test_write_array_sparse_2D_dense_region_full(tmpdir): 131 | uri = str(tmpdir.mkdir("output").join("sparse_2D_dense_full")) 132 | attr_data = np.resize(np.arange(-3, 28), (8, 4)) 133 | 134 | creator = ArrayCreator( 135 | dim_order=("dim1", "dim2"), 136 | shared_dims=[ 137 | SharedDim("dim1", (0, 7), np.uint32), 138 | SharedDim("dim2", (0, 3), np.uint32), 139 | ], 140 | sparse=True, 141 | ) 142 | creator.add_attr_creator("attr1", dtype=np.int64) 143 | creator.add_dense_fragment_writer() 144 | creator["attr1"].set_writer_data(attr_data) 145 | 146 | creator.write(uri) 147 | 148 | with tiledb.open(uri) as array: 149 | result = array[...] 
def test_write_array_sparse_2D_sparse_coo_region(tmpdir):
    """Write COO-form sparse data into a 2D sparse array and read it back."""
    # Fixed: output name was copy-pasted from the 1D test
    # ("sparse_1D_sparse_region"), which made the array name misleading.
    uri = str(tmpdir.mkdir("output").join("sparse_2D_sparse_coo_region"))
    dim1_data = np.array([7, 1, 5, 3], dtype=np.uint32)
    dim2_data = np.array([0, 1, 1, 0], dtype=np.uint32)
    attr_data = np.array([-3, 0, 100, -100], dtype=np.int64)

    creator = ArrayCreator(
        dim_order=("dim1", "dim2"),
        shared_dims=[
            SharedDim("dim1", (0, 7), np.uint32),
            SharedDim("dim2", (0, 3), np.uint32),
        ],
        sparse=True,
    )
    creator.add_attr_creator("attr1", dtype=np.int64)
    creator.add_sparse_fragment_writer(size=4)
    creator["attr1"].set_writer_data(attr_data)
    creator.domain_creator["dim1"].set_writer_data(dim1_data)
    creator.domain_creator["dim2"].set_writer_data(dim2_data)

    creator.write(uri)

    with tiledb.open(uri) as array:
        result = array.multi_index[:]

    expected = OrderedDict()
    expected["dim1"] = dim1_data
    expected["dim2"] = dim2_data
    expected["attr1"] = attr_data
    assert_dict_arrays_equal(result, expected, False)


def test_write_array_sparse_2D_sparse_row_major_region(tmpdir):
    """Write row-major-form sparse data into a 2D sparse array and read it
    back.
    """
    # Fixed: output name was copy-pasted from the 1D test.
    uri = str(tmpdir.mkdir("output").join("sparse_2D_sparse_row_major_region"))
    dim1_data = np.array([7, 1, 5], dtype=np.uint32)
    dim2_data = np.array([0, 3, 1, 2], dtype=np.uint32)
    attr_data = np.arange(-6, 6, dtype=np.int64)

    creator = ArrayCreator(
        dim_order=("dim1", "dim2"),
        shared_dims=[
            SharedDim("dim1", (0, 7), np.uint32),
            SharedDim("dim2", (0, 3), np.uint32),
        ],
        sparse=True,
    )
    creator.add_attr_creator("attr1", dtype=np.int64)
    creator.add_sparse_fragment_writer(shape=(3, 4), form="row-major")
    creator["attr1"].set_writer_data(attr_data)
    creator.domain_creator["dim1"].set_writer_data(dim1_data)
    creator.domain_creator["dim2"].set_writer_data(dim2_data)

    creator.write(uri)

    with tiledb.open(uri) as array:
        result = array.multi_index[:]

    # Row-major form broadcasts the per-row and per-column coordinates over
    # the (3, 4) fragment shape. (Removed stray debug ``print``.)
    expected = OrderedDict()
    expected["dim1"] = np.repeat(dim1_data, 4)
    expected["dim2"] = np.tile(dim2_data, 3)
    expected["attr1"] = attr_data
    assert_dict_arrays_equal(result, expected, False)
| str 43 | ] = "TileDB backend for xarray for opening TileDB arrays and groups" 44 | url: ClassVar[str] = "https://github.com/TileDB-Inc/TileDB-CF-Py" 45 | 46 | def open_dataset( 47 | self, 48 | filename_or_obj, 49 | *, 50 | config=None, 51 | ctx=None, 52 | timestamp=None, 53 | use_deprecated_engine=None, 54 | key=None, 55 | encode_fill=None, 56 | coord_dims=None, 57 | open_full_domain=None, 58 | mask_and_scale=True, 59 | decode_times=True, 60 | concat_characters=True, 61 | decode_coords=True, 62 | drop_variables: str | Iterable[str] | None = None, 63 | use_cftime=None, 64 | decode_timedelta=None, 65 | ) -> Dataset: 66 | """ 67 | Open a TileDB group or array as an xarray dataset. 68 | 69 | Parameters 70 | ---------- 71 | filename_or_obj 72 | TileDB URI for the group or array to open in xarray. 73 | config 74 | TileDB config object to pass to TileDB objects. 75 | ctx 76 | TileDB context to use for TileDB operations. 77 | timestamp 78 | Timestamp to open the TileDB array at. Not valid for groups. 79 | key 80 | [Deprecated] Encryption key to use for the backend array. 81 | encode_fill 82 | [Deprecated] Encode the TileDB fill value. 83 | coord_dims 84 | [Deprecated] List of dimensions to convert to coordinates. 85 | open_full_domain 86 | [Deprecated] Open the full TileDB domain instead of the non-empty domain. 87 | mask_and_scale 88 | xarray decoder that masks fill value and applies float-scale filter using 89 | variable metadata. 90 | decode_times 91 | xarray decoder that converts variables with NetCDF CF-Convention time 92 | metadata to a numpy.datetime64 datatype. 93 | concat_characters 94 | xarray decoder not supported by TileDB. 95 | decode_coords 96 | xarray decoder that controls which variables are set as coordinate 97 | variables. 98 | drop_variables 99 | A variable or list of variables to exclude from being opened from the 100 | dataset. 101 | use_cftime 102 | xarray decoder option. Uses cftime for datetime decoding. 
103 | decode_timedelta 104 | xarray decoder that converts variables with time units to a 105 | numpy.timedelta64 datatype. 106 | """ 107 | 108 | deprecated_kwargs = { 109 | "key": key, 110 | "encode_fill": encode_fill, 111 | "open_full_domain": open_full_domain, 112 | } 113 | 114 | # If deprecated keyword aguments were set, then switch to the deprecated engine. 115 | if use_deprecated_engine is None: 116 | 117 | def check_use_deprecated(key_name, key_value): 118 | if key_value is not None: 119 | warnings.warn( 120 | f"Deprecated keyword '{key_name}' provided; deprecated engine " 121 | f"is enabled.", 122 | DeprecationWarning, 123 | stacklevel=1, 124 | ) 125 | return True 126 | 127 | use_deprecated_engine = any( 128 | check_use_deprecated(key, val) 129 | for (key, val) in deprecated_kwargs.items() 130 | ) 131 | 132 | # Use the deprecated xarray engine for opening the array. 133 | if use_deprecated_engine: 134 | warnings.warn( 135 | "Using deprecated TileDB-Xarray plugin", 136 | DeprecationWarning, 137 | stacklevel=1, 138 | ) 139 | 140 | # Create the deprecated store. 141 | encode_fill = False if encode_fill is None else encode_fill 142 | open_full_domain = False if open_full_domain is None else open_full_domain 143 | datastore = TileDBDataStore( 144 | uri=filename_or_obj, 145 | key=key, 146 | timestamp=timestamp, 147 | ctx=ctx, 148 | encode_fill=encode_fill, 149 | open_full_domain=open_full_domain, 150 | coord_dims=coord_dims, 151 | ) 152 | 153 | # Use xarray indirection to open dataset defined in a plugin. 
154 | store_entrypoint = StoreBackendEntrypoint() 155 | with close_on_error(datastore): 156 | dataset = store_entrypoint.open_dataset( 157 | datastore, 158 | mask_and_scale=mask_and_scale, 159 | decode_times=decode_times, 160 | concat_characters=concat_characters, 161 | decode_coords=decode_coords, 162 | drop_variables=drop_variables, 163 | use_cftime=use_cftime, 164 | decode_timedelta=decode_timedelta, 165 | ) 166 | return dataset 167 | 168 | # Using new engine: warn if any deprecated keyword arguments were set. 169 | for arg_name, arg_value in deprecated_kwargs.items(): 170 | if arg_value is not None: 171 | warnings.warn( 172 | f"Skipping deprecated keyword '{arg_name}' used when " 173 | f"`use_deprecated_engine=False`.", 174 | DeprecationWarning, 175 | stacklevel=1, 176 | ) 177 | 178 | # Create the TileDB backend store. 179 | datastore = TileDBXarrayStore( 180 | filename_or_obj, config=config, ctx=ctx, timestamp=timestamp 181 | ) 182 | 183 | # Use xarray indirection to open dataset defined in a plugin. 
184 | store_entrypoint = StoreBackendEntrypoint() 185 | with close_on_error(datastore): 186 | dataset = store_entrypoint.open_dataset( 187 | datastore, 188 | mask_and_scale=mask_and_scale, 189 | decode_times=decode_times, 190 | concat_characters=concat_characters, 191 | decode_coords=decode_coords, 192 | drop_variables=drop_variables, 193 | use_cftime=use_cftime, 194 | decode_timedelta=decode_timedelta, 195 | ) 196 | return dataset 197 | 198 | def guess_can_open(self, filename_or_obj) -> bool: 199 | """Check for datasets that can be opened with this backend.""" 200 | if isinstance(filename_or_obj, (str, os.PathLike)): 201 | _, ext = os.path.splitext(filename_or_obj) 202 | if ext in {".tiledb", ".tdb"}: 203 | return True 204 | try: 205 | return tiledb.object_type(filename_or_obj) in {"array", "group"} 206 | except tiledb.TileDBError: 207 | return False 208 | 209 | 210 | BACKEND_ENTRYPOINTS["tiledb"] = ("tiledb", TileDBXarrayBackendEntrypoint) 211 | -------------------------------------------------------------------------------- /tiledb/cf/netcdf_engine/_utils.py: -------------------------------------------------------------------------------- 1 | """Class for helper functions for NetCDF to TileDB conversion.""" 2 | 3 | import time 4 | import warnings 5 | from contextlib import contextmanager 6 | from pathlib import Path 7 | from typing import Any, Optional, Sequence, Tuple, Union 8 | 9 | import netCDF4 10 | import numpy as np 11 | 12 | import tiledb 13 | 14 | from .._utils import safe_set_metadata 15 | 16 | _DEFAULT_INDEX_DTYPE = np.dtype("uint64") 17 | COORDINATE_SUFFIX = ".data" 18 | 19 | 20 | def copy_group_metadata(netcdf_group: netCDF4.Group, meta: tiledb.libtiledb.Metadata): 21 | """Copy all NetCDF group attributs to a the metadata in a TileDB array.""" 22 | for key in netcdf_group.ncattrs(): 23 | value = netcdf_group.getncattr(key) 24 | if key == "history": 25 | value = f"{value} - TileDB array created on {time.ctime(time.time())}" 26 | 
def get_netcdf_metadata(
    netcdf_item, key: str, default: Any = None, is_number: bool = False
) -> Any:
    """Returns a NetCDF attribute value from a key if it exists and the default
    value otherwise.

    If ``is_number=True``, the result is only returned if it is numeric. If the
    key exists but is not numeric, a warning is raised and the default is
    returned. If the key exists and is an array of length 1, the scalar value
    is returned.

    Parameters
    ----------
    netcdf_item
        NetCDF item (dataset, group, or variable) to read the attribute from.
    key
        NetCDF attribute name to return.
    default
        Default value to return if the attribute is not found.
    is_number
        If ``True``, the result is only returned if it is a number.

    Returns
    -------
    Any
        The NetCDF attribute value, if found. Otherwise, the default value.
    """
    if key not in netcdf_item.ncattrs():
        return default
    value = netcdf_item.getncattr(key)
    if is_number:
        # Use ``np.asarray`` so plain Python scalars (which have no ``dtype``
        # attribute) and strings are handled uniformly; previously a plain
        # Python int/float attribute raised AttributeError on ``value.dtype``.
        array_value = np.asarray(value)
        if not np.issubdtype(array_value.dtype, np.number) or array_value.size != 1:
            # No need for a ``catch_warnings`` context just to emit a warning.
            warnings.warn(
                f"Attribute '{key}' has value='{value}' that is not a number. "
                f"Using default {key}={default} instead.",
                stacklevel=3,
            )
            return default
        if not np.isscalar(value):
            value = value.item()
    return value
87 | """ 88 | input_dtype = np.dtype(variable.dtype) 89 | if not np.issubdtype(input_dtype, np.number): 90 | raise ValueError( 91 | f"Unpacking only support NetCDF variables with integer or floating-point " 92 | f"data. Input variable has datatype {input_dtype}." 93 | ) 94 | test = np.array(0, dtype=input_dtype) 95 | scale_factor = get_netcdf_metadata(variable, "scale_factor", is_number=True) 96 | add_offset = get_netcdf_metadata(variable, "add_offset", is_number=True) 97 | if scale_factor is not None: 98 | test = scale_factor * test 99 | if add_offset is not None: 100 | test = test + add_offset 101 | return test.dtype 102 | 103 | 104 | def get_variable_values( 105 | variable: netCDF4.Variable, 106 | indexer: Union[slice, Sequence[slice]], 107 | fill: Optional[Union[int, float, str]], 108 | unpack: bool, 109 | ) -> np.ndarray: 110 | """Returns the values for a NetCDF variable at the requested indices. 111 | 112 | Parameters 113 | ---------- 114 | variable 115 | NetCDF variable to get values from. 116 | indexer 117 | Sequence of slices used to index the NetCDF variable. 118 | fill 119 | If not ``None``, the fill value to use for the output data. 120 | unpack 121 | If ``True``, unpack the variable if it contains a ``scale_factor`` or 122 | ``add_offset``. 123 | 124 | Returns 125 | ------- 126 | np.ndarray 127 | The data from the NetCDF variable. 
def get_variable_chunks(
    variable: "netCDF4.Variable", unlimited_dim_size
) -> Optional[Tuple[int, ...]]:
    """
    Returns the chunks from a NetCDF variable if chunked and ``None`` otherwise.

    For unlimited dimensions the chunk size is capped at
    ``unlimited_dim_size`` (or at the dimension's current size when
    ``unlimited_dim_size`` is ``None``).

    Parameters
    ----------
    variable
        The variable to get chunks from.
    unlimited_dim_size
        The size to use for unlimited dimensions.

    Returns
    -------
    Tuple[int, ...], optional
        Chunks from the NetCDF variable if it is chunked and ``None`` otherwise.
    """
    raw_chunks = variable.chunking()
    if raw_chunks is None or raw_chunks == "contiguous":
        return None
    capped_chunks = []
    for chunk, dim in zip(raw_chunks, variable.get_dims()):
        if dim.isunlimited():
            cap = dim.size if unlimited_dim_size is None else unlimited_dim_size
            capped_chunks.append(min(chunk, cap))
        else:
            capped_chunks.append(chunk)
    return tuple(capped_chunks)
182 | 183 | If both an input file and group are provided, this function will prioritize 184 | opening from the group. 185 | 186 | Parameters 187 | ---------- 188 | group 189 | A NetCDF group to read from. 190 | input_file 191 | A NetCDF file to read from. 192 | group_path 193 | The path to the NetCDF group to read from in a NetCDF file. Use ``'/'`` to 194 | specify the root group. 195 | """ 196 | if group is not None: 197 | if not isinstance(group, (netCDF4.Dataset, netCDF4.Group)): 198 | raise TypeError( 199 | f"Invalid input: group={group} of type {type(group)} is not a netCDF " 200 | f"Group or or Dataset." 201 | ) 202 | yield group 203 | else: 204 | if input_file is None: 205 | raise ValueError( 206 | "An input file must be provided; no default input file was set." 207 | ) 208 | if group_path is None: 209 | raise ValueError( 210 | "A group path must be provided; no default group path was set. Use " 211 | "``'/'`` for the root group." 212 | ) 213 | root_group = netCDF4.Dataset(input_file) 214 | root_group.set_auto_maskandscale(False) 215 | try: 216 | netcdf_group = root_group 217 | if group_path != "/": 218 | for child_group_name in group_path.strip("/").split("/"): 219 | netcdf_group = netcdf_group.groups[child_group_name] 220 | yield netcdf_group 221 | finally: 222 | root_group.close() 223 | -------------------------------------------------------------------------------- /documentation/tiledb-cf-spec.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: TileDB-CF Dataspace Specification 3 | --- 4 | 5 | ::: {.callout-warning} 6 | The current dataspace specification is not stable. Backwards compatibility is not guaranteed for specification less than 1.0.0. 7 | ::: 8 | 9 | ## Current TileDB-CF Dataspace Specification 10 | 11 | * The current TileDB-CF format version number is **0.3.0**. 
12 | 13 | ### TileDB-CF Dataspace 0.3.0 14 | 15 | A TileDB CF dataspace is a TileDB group with arrays, attributes, and dimensions that satisfy the following rules. 16 | 17 | #### Terminology 18 | 19 | * **Collection of dimensions**: A set of TileDB dimensions with the same name, data type, and domain. 20 | 21 | #### CF Dataspace 22 | 23 | **Requirements for Dimensions** 24 | 25 | 1. All dimensions that share a name must belong to the same collection (they must have the same domain and data type). 26 | 27 | **Requirements for Metadata** 28 | 29 | 1. Attribute metadata is stored in the same array as the attribute. The metadata key must use the prefix `__tiledb_attr.{attr_name}.` where `{attr_name}` is the full name of the attribute. 30 | 2. Dimension metadata is stored in the same array as the dimension. The metadata key must use the prefix `__tiledb_dim.{dim_name}.` where `{dim_name}` is the full name of the dimension. 31 | 32 | ### Simple CF Dataspace 33 | 34 | A simple CF dataspace is a direct implementation of the NetCDF data model in TileDB. It follows the same rules as a CF dataspace along with the following requirements: 35 | 36 | **Additional Requirements for Dimensions** 37 | 38 | 1. All dimensions use integer indices and have a domain with lower bound of 0. 39 | 40 | **Additional Requirements for Arrays** 41 | 42 | 1. All arrays in the group are named and have a single attribute. 43 | 44 | **Additional Requirements for Metadata** 45 | 46 | 1. There is only group and attribute level metadata. 47 | 48 | ## Specification Q&A 49 | 50 | 1. Why have a special specification for the TileDB-CF library? 51 | 52 | The TileDB data model is very general and can be used to support a wide-range of applications. However, there is always a push-and-pull between how general your data model is and enabling specific behavior or interpretations for the data. 
The purpose of the TileDB-CF specification is to handle the case where we have multiple TileDB arrays defined on the same underlying dimensions. By creating a specification we make our assumptions explicit and let users know exactly what they must do to use this tool. 53 | 54 | 55 | 2. Is the specification backwards compatible? 56 | 57 | Not yet. This library and data model are still under initial development. When the data model has stabilized we will release a 1.0.0 version. 58 | 59 | 3. Why is there both a library version and a specification version? 60 | 61 | The TileDB-CF Python package will update much more frequently than the specification. The specification is more-or-less just a summary of the conventions the TileDB-CF library is using. As such, a change to the specification version will always coincide with a change to the library version, but the library version can update without affecting the specification. 62 | 63 | 4. What version is my current data? 64 | 65 | The TileDB-CF dataspace specification is fairly minimal. Your data may satisfy multiple versions. Currently, we do not provide support for checking that your data satisfies the TileDB-CF dataspace convention, but some such tooling will be implemented before the 1.0.0 release of this specification. 66 | 67 | 68 | ## Changelog 69 | 70 | ### Version 0.3.0 71 | 72 | * Terminology 73 | 74 | - Remove notion of a dataspace name. 75 | 76 | * CF Dataspace 77 | 78 | - Remove requirement that all attributes and dimensions are named (allow anonymous attributes and dimensions). 79 | - Remove group metadata array. Group metadata is now directly supported in the TileDB core engine. 80 | - Remove the notion of a dataspace name and the associated requirements. 81 | - Remove requirement that attributes have unique dataspace names for general CF Dataspace. 82 | - Remove requirement of `_FillValue`. 83 | - Add dimension-level metadata. 
84 | 85 | * Simple CF Dataspace 86 | 87 | - Remove requiment all collections of dimension have a unique dataspace name. 88 | - Add requirement all arrays are uniquely named and have a single attribute. 89 | - Add restriction that metadata only exists for attributes and groups. 90 | 91 | ### Version 0.2.0 92 | 93 | - Major revision. See appendix for full specification. 94 | 95 | ### Version 0.1.0 96 | 97 | - Initial release. See appendix for full specification. 98 | 99 | 100 | ## Appendix 101 | 102 | ### TileDB-CF Dataspace 0.1.0 103 | 104 | #### Terminology 105 | 106 | * **Index dimension**: A TileDB dimension with an integer data type and domain with `0` as its lower bound. 107 | * **Data dimension**: Any TileDB dimension that is not an index dimension. 108 | * **Dataspace name**: The name of an attribute or dimension stripped of an optional suffix of `.index` or `.data`. 109 | 110 | #### CF Dataspace 111 | 112 | **Requirements for Attributes and Dimensions** 113 | 114 | 1. All attributes and dimension must be named (there must not be any anonymous attributes or dimensions). 115 | 2. All dimensions that share a name must have the same domain and data type. 116 | 3. All attributes must have a unique dataspace name. 117 | 4. If an attribute and data dimension share the same dataspace name, they must share the same full name and data type. 118 | 119 | **Requirements for Metadata** 120 | 121 | 1. Group metadata is stored in a special metadata array named `__tiledb_group` inside the TileDB group. 122 | 2. Attribute metadata is stored in the array the attribute is in using the prefix `__tiledb_attr.{attr_name}.` for the attribute key where `{attr_name}` is the full name of the attribute. 123 | 3. If the metadata key `_FillValue` exists for an attribute; it must have the same value as the fill value for the attribute. 
124 | 125 | #### Indexable CF Dataspace 126 | 127 | A CF Dataspace is said to be indexable if it satisfies all requirements of a CF Dataspace along with the following condition: 128 | 129 | * All data dimensions must have an axis label that maps an index dimension with the same dataspace name as the data dimension to an attribute with the same full name and data type as the data dimension. 130 | 131 | 132 | ### TileDB-CF Dataspace 0.2.0 133 | 134 | A TileDB CF dataspace is a TileDB group with arrays, attributes, and dimensions that satisfy the following rules. 135 | 136 | #### Terminology 137 | 138 | * **Dataspace name**: The name of an attribute or dimension stripped of an optional suffix of `.index` or `.data`. 139 | * **Collection of dimensions**: A set of TileDB dimensions with the same name, data type, and domain. 140 | 141 | #### CF Dataspace 142 | 143 | A CF Dataspace is a TileDB group that follows certain requirements in order to provide additional relational context to dimensions and attributes using naming conventions. In a CF Dataspace, TileDB attributes within the entire group are unique and TileDB dimensions that share the same name are considered the same object. 144 | 145 | **Requirements for Attributes and Dimensions** 146 | 147 | 1. All attributes and dimension must be named (there must not be any anonymous attributes or dimensions). 148 | 2. All dimensions that share a name must belong to the same collection (they must have the same domain and data type). 149 | 3. All attributes must have a unique dataspace name. 150 | 151 | **Requirements for Metadata** 152 | 153 | 1. Group metadata is stored in a special metadata array named `__tiledb_group` inside the TileDB group. 154 | 2. Attribute metadata is stored in the same array the attribute is stored in. The metadata key must use the prefix `__tiledb_attr.{attr_name}.` where `{attr_name}` is the full name of the attribute. 155 | 3. 
If the metadata key `_FillValue` exists for an attribute; it must have the same value as the fill value for the attribute. 156 | 157 | ### Simple CF Dataspace 158 | 159 | A simple CF dataspace is a direct implementation of the NetCDF data model in TileDB. It follows the same rules as a CF dataspace along with the following requirements: 160 | 161 | **Additional Requirements for Dimensions** 162 | 163 | 1. All dimensions use integer indices and have a domain with lower bound of 0. 164 | 2. All collections of dimensions must have a unique dataspace name. 165 | -------------------------------------------------------------------------------- /tiledb/cf/xarray_engine/_encoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import tiledb 4 | 5 | # Generic xarray encoding 6 | _UNLIMITED_DIMS_ENCODING = "unlimited_dims" 7 | _FILL_VALUE_ENCODING = "_FillValue" 8 | 9 | # TileDB specific variable encoding 10 | _ATTR_NAME_ENCODING = "attr_name" 11 | _ATTR_FILTERS_ENCODING = "filters" 12 | _TILE_SIZES_ENCODING = "tiles" 13 | _MAX_SHAPE_ENCODING = "max_shape" 14 | _DIM_DTYPE_ENCODING = "dim_dtype" 15 | 16 | 17 | class TileDBVariableEncoder: 18 | """Class for encoding array variables. 19 | 20 | Parameters 21 | ---------- 22 | name 23 | Name of the variable. 24 | variable 25 | Xarray variable to encode. 26 | encoding 27 | Dictionary of TileDB encoding keywords. 28 | unlimited_dims 29 | Unlimited dimensions. Only used if max_shape is not provided in the encoding. 30 | ctx 31 | Context object for TileDB operations. 32 | """ 33 | 34 | valid_encoding_keys = { 35 | _ATTR_FILTERS_ENCODING, 36 | _ATTR_NAME_ENCODING, 37 | _DIM_DTYPE_ENCODING, 38 | _MAX_SHAPE_ENCODING, 39 | _TILE_SIZES_ENCODING, 40 | } 41 | 42 | def __init__(self, name, variable, encoding, unlimited_dims, ctx): 43 | # Set initial class properties. 
44 | self._ctx = ctx 45 | self._name = name 46 | self._variable = variable 47 | self._encoding = dict() 48 | 49 | # Check the input encoding data is valid. 50 | for key in encoding: 51 | if key not in self.valid_encoding_keys: 52 | raise KeyError( 53 | "Encoding error on variable '{self._name}'. Invalid encoding key " 54 | f"``{key}``." 55 | ) 56 | 57 | # Initialize encoding values. 58 | try: 59 | # Set attribute encodings: attr_name and attr_filters. 60 | self.attr_name = encoding.get( 61 | _ATTR_NAME_ENCODING, 62 | f"{self._name}_" if self._name in variable.dims else self._name, 63 | ) 64 | self.filters = encoding.get( 65 | _ATTR_FILTERS_ENCODING, 66 | tiledb.FilterList( 67 | (tiledb.ZstdFilter(level=5, ctx=self._ctx),), ctx=self._ctx 68 | ), 69 | ) 70 | 71 | # Set domain encodings: dim_dtype, tiles, and max_shape. 72 | self.dim_dtype = encoding.get(_DIM_DTYPE_ENCODING, np.dtype(np.uint32)) 73 | if _MAX_SHAPE_ENCODING in encoding: 74 | self.max_shape = encoding.get(_MAX_SHAPE_ENCODING) 75 | else: 76 | # Set unlimited dimensions to max possible size for datatype and 77 | # remaining dimensions to the size of the variable dimension. 78 | unlimited_dims = { 79 | dim_name for dim_name in variable.dims if dim_name in unlimited_dims 80 | } 81 | unlim = np.iinfo(self.dim_dtype).max 82 | self.max_shape = tuple( 83 | unlim if dim_name in unlimited_dims else dim_size 84 | for dim_name, dim_size in zip(variable.dims, variable.shape) 85 | ) 86 | self.tiles = encoding.get(_TILE_SIZES_ENCODING, None) 87 | except ValueError as err: 88 | raise ValueError(f"Encoding error for variable '{self._name}'.") from err 89 | 90 | @property 91 | def attr_name(self): 92 | return self._encoding.get(_ATTR_NAME_ENCODING, self._name) 93 | 94 | @attr_name.setter 95 | def attr_name(self, name): 96 | if name in self._variable.dims: 97 | raise ValueError( 98 | f"Attribute name '{name}' is already a dimension name. Attribute names " 99 | f"must be unique." 
100 | ) 101 | self._encoding[_ATTR_NAME_ENCODING] = name 102 | 103 | @property 104 | def dim_dtype(self): 105 | return self._encoding[_DIM_DTYPE_ENCODING] 106 | 107 | @dim_dtype.setter 108 | def dim_dtype(self, dim_dtype): 109 | if dim_dtype.kind not in ("i", "u"): 110 | raise ValueError( 111 | f"Dimension dtype ``{dim_dtype}`` is not a valid signed or unsigned " 112 | f"integer dtype." 113 | ) 114 | self._encoding[_DIM_DTYPE_ENCODING] = dim_dtype 115 | 116 | @property 117 | def dtype(self): 118 | return self._variable.dtype 119 | 120 | @property 121 | def fill(self): 122 | fill = self._variable.encoding.get(_FILL_VALUE_ENCODING, None) 123 | if fill is np.nan: 124 | return None 125 | return fill 126 | 127 | @property 128 | def filters(self): 129 | return self._encoding[_ATTR_FILTERS_ENCODING] 130 | 131 | @filters.setter 132 | def filters(self, filters): 133 | self._encoding[_ATTR_FILTERS_ENCODING] = filters 134 | 135 | def create_array_schema(self): 136 | """Returns a TileDB attribute from the provided variable and encodings.""" 137 | attr = tiledb.Attr( 138 | name=self.attr_name, 139 | dtype=self.dtype, 140 | fill=self.fill, 141 | filters=self.filters, 142 | ctx=self._ctx, 143 | ) 144 | tiles = self.tiles 145 | max_shape = self.max_shape 146 | dims = tuple( 147 | tiledb.Dim( 148 | name=dim_name, 149 | dtype=self.dim_dtype, 150 | domain=(0, max_shape[index] - 1), 151 | tile=None if tiles is None else tiles[index], 152 | ctx=self._ctx, 153 | ) 154 | for index, dim_name in enumerate(self._variable.dims) 155 | ) 156 | return tiledb.ArraySchema( 157 | domain=tiledb.Domain(*dims, ctx=self._ctx), 158 | attrs=(attr,), 159 | ctx=self._ctx, 160 | ) 161 | 162 | def get_encoding_metadata(self): 163 | meta = dict() 164 | return meta 165 | 166 | @property 167 | def max_shape(self): 168 | return self._encoding[_MAX_SHAPE_ENCODING] 169 | 170 | @max_shape.setter 171 | def max_shape(self, max_shape): 172 | if len(max_shape) != self._variable.ndim: 173 | raise ValueError( 174 | 
f"Incompatible shape {max_shape} for variable with " 175 | f"{self._variable.ndim} dimensions." 176 | ) 177 | if any( 178 | dim_size < var_size 179 | for dim_size, var_size in zip(max_shape, self._variable.shape) 180 | ): 181 | raise ValueError( 182 | f"Incompatible max shape {max_shape} for variable with shape " 183 | f"{self._variable.shape}. Max shape must be greater than or equal " 184 | f"to the variable shape for all dimensions." 185 | ) 186 | if ( 187 | _TILE_SIZES_ENCODING in self._encoding 188 | and self.tiles is not None 189 | and any( 190 | dim_size < tile_size 191 | for tile_size, dim_size in zip(self.tiles, max_shape) 192 | ) 193 | ): 194 | raise ValueError( 195 | f"Incompatible max shape {max_shape} provied for a variable with tiles " 196 | f"{self.tiles}. Each tile must be less than or equal to the " 197 | f"max size of the dimension it is on." 198 | ) 199 | 200 | self._encoding[_MAX_SHAPE_ENCODING] = max_shape 201 | 202 | @property 203 | def encoding(self): 204 | return self._encoding 205 | 206 | @property 207 | def tiles(self): 208 | return self._encoding[_TILE_SIZES_ENCODING] 209 | 210 | @tiles.setter 211 | def tiles(self, tiles): 212 | if tiles is not None: 213 | if len(tiles) != self._variable.ndim: 214 | raise ValueError( 215 | f"Incompatible number of tiles given. {len(tiles)} tiles provided " 216 | f"for a variable with {self._variable.ndim} dimensions. There must " 217 | f"be exactly one tile per dimension." 218 | ) 219 | if _MAX_SHAPE_ENCODING in self._encoding and any( 220 | dim_size < tile_size 221 | for tile_size, dim_size in zip(tiles, self.max_shape) 222 | ): 223 | raise ValueError( 224 | f"Incompatible tiles {tiles} provied for a variable with max shape " 225 | f"{self.max_shape}. Each tile must be less than or equal to the " 226 | f"max size of the dimension it is on." 
227 | ) 228 | self._encoding[_TILE_SIZES_ENCODING] = tiles 229 | 230 | @property 231 | def variable_name(self): 232 | return self._name 233 | -------------------------------------------------------------------------------- /tiledb/cf/xarray_engine/_array_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from xarray.backends.common import BackendArray 3 | 4 | import tiledb 5 | 6 | from .._utils import safe_set_metadata 7 | from ._common import _ATTR_PREFIX 8 | 9 | 10 | def _to_zero_based_tiledb_index(dim_name, dim_size, index): 11 | """Converts an xarray integer, array, or slice to an index object usable by the 12 | TileDB multi_index function. Only for dimensions with integer domains that start 13 | at zero. 14 | 15 | The following is assumed about xarray indices: 16 | * An index may be an integer, a slice, or a Numpy array of integer indices. 17 | * An integer index or component of an array is such that -size <= value < size. 18 | * Non-negative values are a standard zero-based index. 19 | * Negative values count backwards from the end of the array with the last value 20 | of the array starting at -1. 21 | 22 | Parameters 23 | ---------- 24 | dim_name 25 | Name of the dimension. Used for errors. 26 | dim_size 27 | Size of the dimension as interpreted by xarray. May be smaller than the 28 | full domain of the TileDB dimension. 29 | index 30 | An integer index, array of integer indices, or a slice for indexing an 31 | xarray dimension. 32 | 33 | Returns 34 | ------- 35 | Union[int, List[int], slice] 36 | An integer, a list of integer values, or a slice for indexing a 37 | TileDB dimension using mulit_index. 38 | """ 39 | if np.isscalar(index): 40 | # Convert xarray index to TileDB dimension coordinate 41 | if not -dim_size <= index < dim_size: 42 | raise IndexError( 43 | f"Index {index} out of bounds for dimension '{dim_name}' with size " 44 | f"{dim_size}." 
45 | ) 46 | return index if index >= 0 else index + dim_size - 1 47 | 48 | if isinstance(index, slice): 49 | # Using range handles negative numbers and `None` values. 50 | index = range(dim_size)[index] 51 | if index.step in (1, None): 52 | # Convert from index slice to coordinate slice (note that xarray 53 | # includes the starting point and excludes the ending point vs. TileDB 54 | # multi_index which includes both the staring point and ending point). 55 | return slice(index.start, index.stop - 1) 56 | # This can be replaced with a proper slice when TileDB supports steps. 57 | return list(np.arange(index.start, index.stop, index.step)) 58 | 59 | if isinstance(index, np.ndarray): 60 | # Check numpy array has valid data. 61 | if index.ndim != 1: 62 | raise TypeError( 63 | f"Invalid indexer array for dimension '{dim_name}'. Input array index " 64 | f"must have exactly 1 dimension." 65 | ) 66 | if not ((-dim_size <= index).all() and (index < dim_size).all()): 67 | raise IndexError( 68 | f"Index {index} out of bounds for dimension '{dim_name}' with size " 69 | f"{dim_size}." 70 | ) 71 | # Convert negative indices to positive indices and return as a list of 72 | # values. 73 | return list(index + np.where(index >= 0, 0, dim_size - 1)) 74 | raise TypeError( 75 | f"Unexpected indexer type {type(index)} for dimension '{dim_name}'." 76 | ) 77 | 78 | 79 | class TileDBArrayWrapper(BackendArray): 80 | """Wrapper that allows xarray to access a TileDB array.""" 81 | 82 | __slots__ = ( 83 | "dtype", 84 | "shape", 85 | "variable_name", 86 | "_array_kwargs", 87 | "_attr_name", 88 | "_dim_names", 89 | "_fill", 90 | "_index_converters", 91 | ) 92 | 93 | def __init__( 94 | self, 95 | *, 96 | variable_name, 97 | uri, 98 | schema, 99 | attr_key, 100 | config, 101 | ctx, 102 | timestamp, 103 | fixed_dimensions, 104 | dimension_sizes, 105 | ): 106 | # Set basic properties. 
107 | self.variable_name = variable_name 108 | self._array_kwargs = { 109 | "uri": uri, 110 | "config": config, 111 | "ctx": ctx, 112 | "timestamp": timestamp, 113 | } 114 | self._dim_names = tuple(dim.name for dim in schema.domain) 115 | 116 | # Check the array. 117 | if schema.sparse: 118 | raise ValueError( 119 | f"Error for variable '{self.variable_name}'; sparse arrays are not " 120 | f"supported." 121 | ) 122 | 123 | # Check dimensions and get the array shape. 124 | for dim in schema.domain: 125 | if dim.domain[0] != 0: 126 | raise ValueError( 127 | f"Error for variable '{self.variable_name}'; dimension " 128 | f"'{dim.name}' does not have a domain with lower bound of 0." 129 | ) 130 | if dim.dtype.kind not in ("i", "u"): 131 | raise ValueError( 132 | f"Error for variable '{self.variable_name}'. Dimension " 133 | f"'{dim.name}' has unsupported dtype={dim.dtype}." 134 | ) 135 | 136 | # Set TileDB attribute properties. 137 | _attr = schema.attr(attr_key) 138 | self._attr_name = _attr.name 139 | self.dtype = _attr.dtype 140 | self._fill = _attr.fill 141 | 142 | # Get the shape. 143 | if dimension_sizes is None: 144 | self.shape = schema.shape 145 | else: 146 | self.shape = tuple( 147 | int(dim.domain[1]) + 1 148 | if dim.name in fixed_dimensions 149 | else dimension_sizes.get(dim.name, int(dim.domain[1]) + 1) 150 | for dim in schema.domain 151 | ) 152 | 153 | def __getitem__(self, key): 154 | # Check the length of the input. 155 | indices = key.tuple 156 | if len(indices) != len(self.shape): 157 | ndim = len(self.shape) 158 | raise ValueError( 159 | f"key of length {len(indices)} cannot be used for a TileDB array" 160 | f" with {ndim} {'dimension' if ndim == 1 else 'dimensions'}" 161 | ) 162 | 163 | # Compute the shape of the output, collapsing any dimensions with scalar input. 164 | # If a dimension is of length zero, return an appropriately shaped enpty array. 
165 | shape = tuple( 166 | len(range(dim_size)[index] if isinstance(index, slice) else index) 167 | for dim_size, index in zip(self.shape, indices) 168 | if not np.isscalar(index) 169 | ) 170 | if 0 in shape: 171 | return np.zeros(shape) 172 | 173 | # Get data from the TileDB array. 174 | tiledb_indices = tuple( 175 | _to_zero_based_tiledb_index(self._dim_names[idim], dim_size, index) 176 | for idim, (dim_size, index) in enumerate(zip(self.shape, indices)) 177 | ) 178 | with tiledb.open(**self._array_kwargs, attr=self._attr_name) as array: 179 | result = array.multi_index[tiledb_indices][self._attr_name] 180 | 181 | # TileDB multi_index returns the same number of dimensions as the initial array. 182 | # To match the expected xarray output, we need to reshape the result to remove 183 | # any dimensions corresponding to scalar-valued input. 184 | return result.reshape(shape) 185 | 186 | def __setitem__(self, key, value): 187 | with tiledb.open(**self._array_kwargs, mode="w") as array: 188 | array[key] = value.astype(dtype=self.dtype) 189 | 190 | @property 191 | def dim_names(self): 192 | """A tuple of the dimension names.""" 193 | return self._dim_names 194 | 195 | def get_metadata(self): 196 | """Returns a dictionary of the variable metadata including xarray specific 197 | encodings. 198 | """ 199 | full_key_prefix = f"{_ATTR_PREFIX}{self._attr_name}." 
200 | with tiledb.open(**self._array_kwargs) as array: 201 | variable_metadata = {"_FillValue": self._fill} 202 | for key in array.meta: 203 | if key.startswith(full_key_prefix) and not len(key) == len( 204 | full_key_prefix 205 | ): 206 | variable_metadata[key[len(full_key_prefix) :]] = array.meta[key] 207 | return variable_metadata 208 | 209 | def set_metadata(self, input_meta): 210 | key_prefix = f"{_ATTR_PREFIX}{self._attr_name}" 211 | with tiledb.open(**self._array_kwargs, mode="w") as array: 212 | for key, value in input_meta.items(): 213 | safe_set_metadata(array.meta, f"{key_prefix}.{key}", value) 214 | --------------------------------------------------------------------------------