├── .gitignore ├── conftest.py ├── CONTRIBUTING.md ├── setup.py ├── README.md ├── .github └── workflows │ └── tests.yml ├── LICENSE ├── xarray_tensorstore_test.py └── xarray_tensorstore.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .DS_Store 3 | build 4 | dist 5 | docs/.ipynb_checkpoints 6 | docs/_build 7 | docs/_autosummary 8 | docs/*.zarr 9 | __pycache__ 10 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Configure FLAGS with default values for absltest.""" 15 | from absl import app 16 | 17 | try: 18 | app.run(lambda argv: None) 19 | except SystemExit: 20 | pass 21 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We'd love to accept your patches and contributions to this project. 4 | 5 | ## Before you begin 6 | 7 | ### Sign our Contributor License Agreement 8 | 9 | Contributions to this project must be accompanied by a 10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 11 | You (or your employer) retain the copyright to your contribution; this simply 12 | gives us permission to use and redistribute your contributions as part of the 13 | project. 14 | 15 | If you or your current employer have already signed the Google CLA (even if it 16 | was for a different project), you probably don't need to do it again. 17 | 18 | Visit https://cla.developers.google.com/ to see your current agreements or to 19 | sign a new one. 20 | 21 | ### Review our community guidelines 22 | 23 | This project follows 24 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 25 | 26 | ## Contribution process 27 | 28 | ### Code reviews 29 | 30 | All submissions, including submissions by project members, require review. We 31 | use GitHub pull requests for this purpose. Consult 32 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 33 | information on using pull requests. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Setup Xarray-Tensorstore.""" 16 | import setuptools 17 | 18 | 19 | setuptools.setup( 20 | name='xarray-tensorstore', 21 | version='0.3.0', # keep in sync with xarray_tensorstore.py 22 | license='Apache-2.0', 23 | author='Google LLC', 24 | author_email='noreply@google.com', 25 | install_requires=['numpy', 'xarray', 'zarr', 'tensorstore',], 26 | extras_require={ 27 | 'tests': ['absl-py', 'pandas', 'pytest', 'dask'], 28 | }, 29 | urls={'source': 'https://github.com/google/xarray-tensorstore'}, 30 | py_modules=['xarray_tensorstore'], 31 | python_requires='>=3.10', 32 | ) 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Xarray-TensorStore 2 | 3 | Xarray-TensorStore is a small library that allows opening Zarr arrays into 4 | Xarray via TensorStore, instead of the standard Zarr-Python library. In some 5 | cases, we've found it to be considerably faster. 6 | 7 | **Warning**: Xarray-TensorStore relies upon internal Xarray APIs that will 8 | likely change in 9 | [future versions of Xarray](https://github.com/pydata/xarray/issues/3981), 10 | precisely to accommodate these sorts of use-cases. Expect that the current 11 | version of Xarray-TensorStore will break at some point in the future and require 12 | updates for a new Xarray release. 13 | 14 | ## Installation 15 | 16 | Xarray-TensorStore is available on PyPI: 17 | ``` 18 | pip install xarray-tensorstore 19 | ``` 20 | 21 | ## Usage 22 | 23 | Open a Zarr file into an `xarray.Dataset` using `open_zarr()`, and then use 24 | `read()` to start reading data in the background: 25 | 26 | ```python 27 | import xarray_tensorstore 28 | 29 | ds = xarray_tensorstore.open_zarr(path) 30 | 31 | # As with xarray.open_zarr(), indexing & transposing is lazy 32 | example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) 33 | 34 | # Optional: start reading data in all arrays asynchronously 35 | read_example = xarray_tensorstore.read(example) 36 | 37 | # Blocking conversion of the data into NumPy arrays. This happens sequentially, 38 | # one array at a time, unless you call read() first. 39 | numpy_example = read_example.compute() 40 | ``` 41 | 42 | Open a list of Zarr files and concatenate them along a single dimension using 43 | `open_concatenated_zarrs()`. The returned `xarray.Dataset` behaves exactly as above. 44 | This function requires the Dask package to be installed. 45 | 46 | ```python 47 | import xarray_tensorstore 48 | 49 | ds = xarray_tensorstore.open_concatenated_zarrs( 50 | paths=[path1, path2], 51 | concat_dim="time", 52 | ) 53 | ``` 54 | 55 | ## Limitations 56 | 57 | - Xarray-TensorStore still uses Zarr-Python under the covers to open Zarr 58 | groups and read coordinate data (TensorStore does not yet support Zarr 59 | groups). 60 | - Unlike `xarray.open_zarr`, decoding of data arrays according to CF Conventions 61 | (e.g., `scale_factor` and `add_offset` attributes) is not supported (see the workaround sketch below).
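If a store's data variables do use CF-style encoding, `open_zarr` raises an error that suggests two workarounds. A minimal sketch of both, assuming `path` points at such a store:

```python
import xarray
import xarray_tensorstore

# Option 1: skip masking/scaling and work with the raw stored values.
raw = xarray_tensorstore.open_zarr(path, mask_and_scale=False)

# Option 2: fall back to xarray's own Zarr backend, which applies CF decoding.
decoded = xarray.open_zarr(path)
```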
62 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | # Triggers the workflow on push or pull request events but only for the main branch 5 | push: 6 | branches: [ main ] 7 | pull_request: 8 | branches: [ main ] 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | tests: 14 | name: "python=${{ matrix.python-version }} zarr=${{ matrix.zarr-version }} tests" 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.11", "3.12", "3.13"] 20 | zarr-version: [">=2,<3", ">=3"] 21 | steps: 22 | - name: Cancel previous 23 | uses: styfle/cancel-workflow-action@0.7.0 24 | with: 25 | access_token: ${{ github.token }} 26 | if: ${{github.ref != 'refs/heads/main'}} 27 | - uses: actions/checkout@v4 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | - name: Get pip cache dir 33 | id: pip-cache 34 | run: | 35 | python -m pip install --upgrade pip wheel 36 | echo "::set-output name=dir::$(pip cache dir)" 37 | - name: pip cache 38 | uses: actions/cache@v4 39 | with: 40 | path: ${{ steps.pip-cache.outputs.dir }} 41 | key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} 42 | - name: Install Xarray-Tensorstore 43 | run: | 44 | pip install -e .[tests] "zarr${{ matrix.zarr-version }}" 45 | - name: Run unit tests 46 | run: | 47 | pytest . 48 | 49 | # Auto-publish when version is increased 50 | publish: 51 | # Only try to publish if: 52 | # * Repo is self (prevents running from forks) 53 | # * Branch is `main` 54 | if: | 55 | github.repository == 'google/xarray-tensorstore' 56 | && github.ref == 'refs/heads/main' 57 | needs: tests # Only publish after tests are successful 58 | runs-on: ubuntu-latest 59 | permissions: 60 | contents: write 61 | timeout-minutes: 30 62 | 63 | steps: 64 | # Publish the package (if local `__version__` > pip version) 65 | - uses: etils-actions/pypi-auto-publish@v1 66 | with: 67 | pypi-token: ${{ secrets.PYPI_API_TOKEN }} 68 | gh-token: ${{ secrets.GITHUB_TOKEN }} 69 | parse-changelog: false 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /xarray_tensorstore_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from absl.testing import absltest 15 | from absl.testing import parameterized 16 | import numpy as np 17 | import packaging 18 | import pandas as pd 19 | import pytest 20 | import tensorstore 21 | import xarray 22 | import xarray_tensorstore 23 | import zarr 24 | 25 | 26 | _USING_ZARR_PYTHON_3 = packaging.version.parse(zarr.__version__).major >= 3 27 | 28 | test_cases = [ 29 | { 30 | 'testcase_name': 'base', 31 | 'transform': lambda ds: ds, 32 | }, 33 | { 34 | 'testcase_name': 'transposed', 35 | 'transform': lambda ds: ds.transpose('z', 'x', 'y'), 36 | }, 37 | { 38 | 'testcase_name': 'basic_int', 39 | 'transform': lambda ds: ds.isel(y=1), 40 | }, 41 | { 42 | 'testcase_name': 'negative_int', 43 | 'transform': lambda ds: ds.isel(y=-1), 44 | }, 45 | { 46 | 'testcase_name': 'basic_slice', 47 | 'transform': lambda ds: ds.isel(z=slice(2)), 48 | }, 49 | { 50 | 'testcase_name': 'full_slice', 51 | 'transform': lambda ds: ds.isel(z=slice(0, 4)), 52 | }, 53 | { 54 | 'testcase_name': 'out_of_bounds_slice', 55 | 'transform': lambda ds: ds.isel(z=slice(0, 10)), 56 | }, 57 | { 58 | 'testcase_name': 'strided_slice', 59 | 'transform': lambda ds: ds.isel(z=slice(0, None, 2)), 60 | }, 61 | { 62 | 'testcase_name': 'negative_stride_slice', 63 | 'transform': lambda ds: ds.isel(z=slice(None, None, -1)), 64 | }, 65 | { 66 | 'testcase_name': 'repeated_indexing', 67 | 'transform': lambda ds: ds.isel(z=slice(1, None)).isel(z=0), 68 | }, 69 | { 70 | 'testcase_name': 'oindex', 71 | # includes repeated, negative and out of order indices 72 | 'transform': lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), 73 | }, 74 | { 75 | 'testcase_name': 'vindex', 76 | 'transform': lambda ds: ds.isel(x=('w', [0, 1]), y=('w', [1, 2])), 77 | }, 78 | { 79 | 'testcase_name': 'mixed_indexing_types', 80 | 'transform': lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), 81 | }, 82 | { 83 | 'testcase_name': 'select_a_variable', 84 | 'transform': lambda ds: ds['foo'], 85 | }, 86 | ] 87 | 88 | 89 | class XarrayTensorstoreTest(parameterized.TestCase): 90 | 91 | # TODO(shoyer): consider using hypothesis to convert these into 92 | # property-based tests 93 | @parameterized.named_parameters(test_cases) 94 | def test_open_zarr(self, transform): 95 | source = xarray.Dataset( 96 | { 97 | 'foo': (('x',), np.arange(2), {'local': 'local metadata'}), 98 | 'bar': (('x', 'y'), np.arange(6).reshape(2, 3)), 99 | 'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4)), 100 | }, 101 | coords={ 102 | 'x': [1, 2], 103 | 'y': pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03']), 104 | 'z': ['a', 'b', 'c', 'd'], 105 | }, 106 | attrs={'global': 'global metadata'}, 107 | ) 108 | path = self.create_tempdir().full_path 109 | source.chunk().to_zarr(path) 
110 | 111 | expected = transform(source) 112 | actual = transform(xarray_tensorstore.open_zarr(path)).compute() 113 | xarray.testing.assert_identical(actual, expected) 114 | 115 | @parameterized.named_parameters(test_cases) 116 | def test_open_concatenated_zarrs(self, transform): 117 | sources = [ 118 | xarray.Dataset( 119 | { 120 | 'foo': (('x',), x, {'local': 'local metadata'}), 121 | 'bar': (('x', 'y'), np.arange(6).reshape(2, 3)), 122 | 'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4)), 123 | }, 124 | coords={ 125 | 'x': [1, 2], 126 | 'y': pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03']), 127 | 'z': ['a', 'b', 'c', 'd'], 128 | }, 129 | attrs={'global': 'global metadata'}, 130 | ) 131 | for x in [range(0, 2), range(3, 5)] 132 | ] 133 | 134 | zarr_dir = self.create_tempdir().full_path 135 | paths = [f'{zarr_dir}/{i}' for i in range(len(sources))] 136 | for source, path in zip(sources, paths, strict=True): 137 | source.chunk().to_zarr(path) 138 | 139 | expected = transform(xarray.concat(sources, dim='x')) 140 | actual = transform( 141 | xarray_tensorstore.open_concatenated_zarrs(paths, concat_dim='x') 142 | ).compute() 143 | xarray.testing.assert_identical(actual, expected) 144 | 145 | @parameterized.parameters( 146 | {'deep': True}, 147 | {'deep': False}, 148 | ) 149 | def test_copy(self, deep): 150 | source = xarray.Dataset({'foo': (('x',), np.arange(10))}) 151 | path = self.create_tempdir().full_path 152 | source.to_zarr(path) 153 | opened = xarray_tensorstore.open_zarr(path) 154 | copied = opened.copy(deep=deep) 155 | xarray.testing.assert_identical(copied, source) 156 | 157 | def test_sortby(self): 158 | # regression test for https://github.com/google/xarray-tensorstore/issues/1 159 | x = np.arange(10) 160 | source = xarray.Dataset({'foo': (('x',), x)}, {'x': x[::-1]}) 161 | path = self.create_tempdir().full_path 162 | source.to_zarr(path) 163 | opened = xarray_tensorstore.open_zarr(path) 164 | opened.sortby('x') # should not crash 165 | 166 | def test_compute(self): 167 | # verify that get_duck_array() is working properly 168 | source = xarray.Dataset({'foo': (('x',), np.arange(10))}) 169 | path = self.create_tempdir().full_path 170 | source.to_zarr(path) 171 | opened = xarray_tensorstore.open_zarr(path) 172 | computed = opened.compute() 173 | computed_data = computed['foo'].variable._data 174 | self.assertNotIsInstance(computed_data, tensorstore.TensorStore) 175 | 176 | def test_open_zarr_from_uri(self): 177 | source = xarray.Dataset( 178 | {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))} 179 | ) 180 | path = self.create_tempdir().full_path 181 | source.chunk().to_zarr(path) 182 | 183 | opened = xarray_tensorstore.open_zarr('file://' + path) 184 | xarray.testing.assert_identical(source, opened) 185 | 186 | @parameterized.parameters( 187 | {'zarr_format': 2, 'consolidated': True}, 188 | {'zarr_format': 3, 'consolidated': True}, 189 | {'zarr_format': 2, 'consolidated': False}, 190 | {'zarr_format': 3, 'consolidated': False}, 191 | ) 192 | def test_read_dataset(self, zarr_format: int, consolidated: bool): 193 | if not _USING_ZARR_PYTHON_3 and zarr_format == 3: 194 | self.skipTest('zarr format 3 is not supported in zarr < 3.0.0') 195 | source = xarray.Dataset( 196 | {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}, 197 | coords={'x': np.arange(2)}, 198 | ) 199 | path = self.create_tempdir().full_path 200 | source.chunk().to_zarr( 201 | path, zarr_format=zarr_format, consolidated=consolidated 202 | ) 203 | 204 | opened = 
xarray_tensorstore.open_zarr(path) 205 | read = xarray_tensorstore.read(opened) 206 | 207 | self.assertIsNone(opened.variables['baz']._data.future) 208 | self.assertIsNotNone(read.variables['baz']._data.future) 209 | xarray.testing.assert_identical(read, source) 210 | 211 | @parameterized.parameters( 212 | {'zarr_format': 2}, 213 | {'zarr_format': 3}, 214 | ) 215 | def test_read_dataarray(self, zarr_format: int): 216 | if not _USING_ZARR_PYTHON_3 and zarr_format == 3: 217 | self.skipTest('zarr format 3 is not supported in zarr < 3.0.0') 218 | source = xarray.DataArray( 219 | np.arange(24).reshape(2, 3, 4), 220 | dims=('x', 'y', 'z'), 221 | name='baz', 222 | coords={'x': np.arange(2)}, 223 | ) 224 | path = self.create_tempdir().full_path 225 | source.to_dataset().chunk().to_zarr(path, zarr_format=zarr_format) 226 | 227 | opened = xarray_tensorstore.open_zarr(path)['baz'] 228 | read = xarray_tensorstore.read(opened) 229 | 230 | self.assertIsNone(opened.variable._data.future) 231 | self.assertIsNotNone(read.variable._data.future) 232 | xarray.testing.assert_identical(read, source) 233 | 234 | def test_mask_and_scale(self): 235 | source = xarray.DataArray( 236 | np.arange(24).reshape(2, 3, 4), 237 | dims=('x', 'y', 'z'), 238 | name='baz', 239 | coords={'x': np.arange(2)}, 240 | ) 241 | 242 | # invalid fill-value 243 | source.encoding = {'_FillValue': -1} 244 | path = self.create_tempdir().full_path 245 | source.to_dataset().chunk().to_zarr(path) 246 | expected_msg = ( 247 | 'variable baz has non-NaN fill value, which is not supported by' 248 | ' xarray-tensorstore: -1. Consider re-opening with' 249 | ' xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling' 250 | ' back to use xarray.open_zarr().' 251 | ) 252 | with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): 253 | xarray_tensorstore.open_zarr(path) 254 | 255 | actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] 256 | xarray.testing.assert_equal(actual, source) # no values are masked 257 | 258 | # invalid scaling 259 | source.encoding = {'scale_factor': 10.0} 260 | path = self.create_tempdir().full_path 261 | source.to_dataset().chunk().to_zarr(path) 262 | expected_msg = 'variable baz uses scale/offset encoding' 263 | with self.assertRaisesRegex(ValueError, expected_msg): 264 | xarray_tensorstore.open_zarr(path) 265 | 266 | actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] 267 | self.assertFalse(actual.equals(source)) # not scaled properly 268 | 269 | # valid offset (coordinate only) 270 | source.encoding = {} 271 | source.coords['x'].encoding = {'add_offset': -1} 272 | path = self.create_tempdir().full_path 273 | source.to_dataset().chunk().to_zarr(path) 274 | actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)['baz'] 275 | xarray.testing.assert_identical(actual, source) 276 | self.assertEqual(actual.coords['x'].encoding['add_offset'], -1) 277 | 278 | @parameterized.named_parameters( 279 | { 280 | 'testcase_name': 'basic_indexing', 281 | 'key': (slice(1, None), slice(None), slice(None)), 282 | 'value': np.full((1, 2, 3), -1), 283 | }, 284 | { 285 | 'testcase_name': 'outer_indexing', 286 | 'key': (np.array([0]), np.array([1]), slice(None)), 287 | 'value': np.full((1, 1, 3), -2), 288 | }, 289 | { 290 | 'testcase_name': 'vectorized_indexing', 291 | 'key': (np.array([0]), np.array([0, 1]), slice(None)), 292 | 'value': np.full((2, 3), -3), 293 | }, 294 | ) 295 | def test_setitem(self, key, value): 296 | source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], 
[10, 11, 12]]]) 297 | source = xarray.DataArray( 298 | source_data, 299 | dims=('x', 'y', 'z'), 300 | name='baz', 301 | ) 302 | path = self.create_tempdir().full_path 303 | source.to_dataset().chunk().to_zarr(path) 304 | 305 | opened = xarray_tensorstore.open_zarr(path, write=True)['baz'] 306 | 307 | opened[key] = value 308 | read = xarray_tensorstore.read(opened) 309 | 310 | expected_data = source_data.copy() 311 | expected_data[key] = value 312 | expected = xarray.DataArray( 313 | expected_data, 314 | dims=('x', 'y', 'z'), 315 | name='baz', 316 | ) 317 | 318 | xarray.testing.assert_equal(read, expected) 319 | 320 | def test_setitem_readonly(self): 321 | source = xarray.DataArray( 322 | np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), 323 | dims=('x', 'y', 'z'), 324 | name='baz', 325 | ) 326 | path = self.create_tempdir().full_path 327 | source.to_dataset().chunk().to_zarr(path) 328 | 329 | opened = xarray_tensorstore.open_zarr(path)['baz'] 330 | with pytest.raises(ValueError): 331 | opened[1:, ...] = np.full((1, 2, 3), -1) 332 | 333 | 334 | if __name__ == '__main__': 335 | absltest.main() 336 | -------------------------------------------------------------------------------- /xarray_tensorstore.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for loading TensorStore data into Xarray.""" 15 | from __future__ import annotations 16 | 17 | import dataclasses 18 | import math 19 | import os.path 20 | import re 21 | from typing import Optional, TypeVar 22 | 23 | import numpy as np 24 | import packaging 25 | import tensorstore 26 | import xarray 27 | from xarray.core import indexing 28 | import zarr 29 | 30 | 31 | __version__ = '0.3.0' # keep in sync with setup.py 32 | 33 | 34 | Index = TypeVar('Index', int, slice, np.ndarray, None) 35 | XarrayData = TypeVar('XarrayData', xarray.Dataset, xarray.DataArray) 36 | 37 | 38 | def _numpy_to_tensorstore_index(index: Index, size: int) -> Index: 39 | """Switch from NumPy to TensorStore indexing conventions.""" 40 | # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing 41 | if index is None: 42 | return None 43 | elif isinstance(index, int): 44 | # Negative integers do not count from the end in TensorStore 45 | return index + size if index < 0 else index 46 | elif isinstance(index, slice): 47 | start = _numpy_to_tensorstore_index(index.start, size) 48 | stop = _numpy_to_tensorstore_index(index.stop, size) 49 | if stop is not None: 50 | # TensorStore does not allow out of bounds slicing 51 | stop = min(stop, size) 52 | return slice(start, stop, index.step) 53 | else: 54 | assert isinstance(index, np.ndarray) 55 | return np.where(index < 0, index + size, index) 56 | 57 | 58 | @dataclasses.dataclass(frozen=True) 59 | class _TensorStoreAdapter(indexing.ExplicitlyIndexed): 60 | """TensorStore array that can be wrapped by xarray.Variable. 61 | 62 | We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not 63 | attempt to load our array into memory as a NumPy array. In the future, this 64 | should be supported by public Xarray APIs, as part of the refactor discussed 65 | in: https://github.com/pydata/xarray/issues/3981 66 | """ 67 | 68 | array: tensorstore.TensorStore 69 | future: Optional[tensorstore.Future] = None 70 | 71 | @property 72 | def shape(self) -> tuple[int, ...]: 73 | return self.array.shape 74 | 75 | @property 76 | def dtype(self) -> np.dtype: 77 | return self.array.dtype.numpy_dtype 78 | 79 | @property 80 | def ndim(self) -> int: 81 | return len(self.shape) 82 | 83 | @property 84 | def size(self) -> int: 85 | return math.prod(self.shape) 86 | 87 | def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: 88 | index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) 89 | if isinstance(key, indexing.OuterIndexer): 90 | # TODO(shoyer): fix this for newer versions of Xarray. 
91 | # We get the error message: 92 | # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' 93 | indexed = self.array.oindex[index_tuple] 94 | elif isinstance(key, indexing.VectorizedIndexer): 95 | indexed = self.array.vindex[index_tuple] 96 | else: 97 | assert isinstance(key, indexing.BasicIndexer) 98 | indexed = self.array[index_tuple] 99 | # Translate to the origin so repeated indexing is relative to the new bounds 100 | # like NumPy, not absolute like TensorStore 101 | translated = indexed[tensorstore.d[:].translate_to[0]] 102 | return type(self)(translated) 103 | 104 | def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: 105 | index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) 106 | if isinstance(key, indexing.OuterIndexer): 107 | self.array.oindex[index_tuple] = value 108 | elif isinstance(key, indexing.VectorizedIndexer): 109 | self.array.vindex[index_tuple] = value 110 | else: 111 | assert isinstance(key, indexing.BasicIndexer) 112 | self.array[index_tuple] = value 113 | # Invalidate the future so that the next read will pick up the new value 114 | object.__setattr__(self, 'future', None) 115 | 116 | # xarray>2024.02.0 uses oindex and vindex properties, which are expected to 117 | # return objects whose __getitem__ method supports the appropriate form of 118 | # indexing. 119 | @property 120 | def oindex(self) -> _TensorStoreAdapter: 121 | return self 122 | 123 | @property 124 | def vindex(self) -> _TensorStoreAdapter: 125 | return self 126 | 127 | def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: 128 | transposed = self.array[tensorstore.d[order].transpose[:]] 129 | return type(self)(transposed) 130 | 131 | def read(self) -> _TensorStoreAdapter: 132 | future = self.array.read() 133 | return type(self)(self.array, future) 134 | 135 | def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore 136 | future = self.array.read() if self.future is None else self.future 137 | return np.asarray(future.result(), dtype=dtype) 138 | 139 | def get_duck_array(self): 140 | # special method for xarray to return an in-memory (computed) representation 141 | return np.asarray(self) 142 | 143 | # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, 144 | # which are needed for Xarray: 145 | # https://github.com/google/tensorstore/issues/109 146 | # TensorStore objects are immutable, so there's no need to actually copy them. 
147 | 148 | def __copy__(self) -> _TensorStoreAdapter: 149 | return type(self)(self.array, self.future) 150 | 151 | def __deepcopy__(self, memo) -> _TensorStoreAdapter: 152 | return self.__copy__() 153 | 154 | 155 | def _read_tensorstore( 156 | array: indexing.ExplicitlyIndexed, 157 | ) -> indexing.ExplicitlyIndexed: 158 | """Starts async reading on a TensorStore array.""" 159 | return array.read() if isinstance(array, _TensorStoreAdapter) else array 160 | 161 | 162 | def read(xarraydata: XarrayData, /) -> XarrayData: 163 | """Starts async reads on all TensorStore arrays.""" 164 | # pylint: disable=protected-access 165 | if isinstance(xarraydata, xarray.Dataset): 166 | data = { 167 | name: _read_tensorstore(var.variable._data) 168 | for name, var in xarraydata.data_vars.items() 169 | } 170 | elif isinstance(xarraydata, xarray.DataArray): 171 | data = _read_tensorstore(xarraydata.variable._data) 172 | else: 173 | raise TypeError(f'argument is not a DataArray or Dataset: {xarraydata}') 174 | # pylint: enable=protected-access 175 | return xarraydata.copy(data=data) 176 | 177 | 178 | _DEFAULT_STORAGE_DRIVER = 'file' 179 | 180 | 181 | def _zarr_spec_from_path(path: str, zarr_format: int) -> ...: 182 | if re.match(r'\w+\://', path): # path is a URI 183 | kv_store = path 184 | else: 185 | kv_store = {'driver': _DEFAULT_STORAGE_DRIVER, 'path': path} 186 | return {'driver': f'zarr{zarr_format}', 'kvstore': kv_store} 187 | 188 | 189 | def _raise_if_mask_and_scale_used_for_data_vars(ds: xarray.Dataset): 190 | """Check a dataset for data variables that would need masking or scaling.""" 191 | advice = ( 192 | 'Consider re-opening with xarray_tensorstore.open_zarr(..., ' 193 | 'mask_and_scale=False), or falling back to use xarray.open_zarr().' 194 | ) 195 | for k in ds: 196 | encoding = ds[k].encoding 197 | for attr in ['_FillValue', 'missing_value']: 198 | fill_value = encoding.get(attr, np.nan) 199 | if fill_value == fill_value: # pylint: disable=comparison-with-itself 200 | raise ValueError( 201 | f'variable {k} has non-NaN fill value, which is not supported by' 202 | f' xarray-tensorstore: {fill_value}. {advice}' 203 | ) 204 | for attr in ['scale_factor', 'add_offset']: 205 | if attr in encoding: 206 | raise ValueError( 207 | f'variable {k} uses scale/offset encoding, which is not supported' 208 | f' by xarray-tensorstore: {encoding}. 
{advice}' 209 | ) 210 | 211 | 212 | def _get_zarr_format(path: str) -> int: 213 | """Returns the Zarr format of the given path.""" 214 | if packaging.version.parse(zarr.__version__).major >= 3: 215 | return zarr.open_group(path, mode='r').metadata.zarr_format 216 | else: 217 | return 2 218 | 219 | 220 | def _open_tensorstore_arrays( 221 | path: str, 222 | names: list[str], 223 | group: zarr.Group | None, 224 | zarr_format: int, 225 | write: bool, 226 | context: tensorstore.Context | None = None, 227 | ) -> dict[str, tensorstore.Future]: 228 | """Open all arrays in a Zarr group using TensorStore.""" 229 | specs = { 230 | k: _zarr_spec_from_path(os.path.join(path, k), zarr_format) for k in names 231 | } 232 | 233 | assume_metadata = False 234 | if packaging.version.parse(zarr.__version__).major >= 3 and group is not None: 235 | consolidated_metadata = group.metadata.consolidated_metadata 236 | if consolidated_metadata is not None: 237 | assume_metadata = True 238 | for name in names: 239 | metadata = consolidated_metadata.metadata[name].to_dict() 240 | metadata.pop('attributes', None) # not supported by TensorStore 241 | specs[name]['metadata'] = metadata 242 | 243 | array_futures = {} 244 | for k, spec in specs.items(): 245 | array_futures[k] = tensorstore.open( 246 | spec, 247 | read=True, 248 | write=write, 249 | open=True, 250 | context=context, 251 | assume_metadata=assume_metadata, 252 | ) 253 | return array_futures 254 | 255 | 256 | def open_zarr( 257 | path: str, 258 | *, 259 | context: tensorstore.Context | None = None, 260 | mask_and_scale: bool = True, 261 | write: bool = False, 262 | consolidated: bool | None = None, 263 | ) -> xarray.Dataset: 264 | """Open an xarray.Dataset from Zarr using TensorStore. 265 | 266 | For best performance, explicitly call `read()` to asynchronously load data 267 | in parallel. Otherwise, xarray's `.compute()` method will load each variable's 268 | data in sequence. 269 | 270 | Example usage: 271 | 272 | import xarray_tensorstore 273 | 274 | ds = xarray_tensorstore.open_zarr(path) 275 | 276 | # indexing & transposing is lazy 277 | example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) 278 | 279 | # start reading data asynchronously 280 | read_example = xarray_tensorstore.read(example) 281 | 282 | # blocking conversion of the data into NumPy arrays 283 | numpy_example = read_example.compute() 284 | 285 | Args: 286 | path: path or URI to Zarr group to open. 287 | context: TensorStore configuration options to use when opening arrays. 288 | mask_and_scale: if True (default), attempt to apply masking and scaling like 289 | xarray.open_zarr(). This is only supported for coordinate variables and 290 | otherwise will raise an error. 291 | write: Allow write access. Defaults to False. 292 | consolidated: If True, read consolidated metadata. By default, an attempt to 293 | use consolidated metadata is made with a fallback to non-consolidated 294 | metadata, like in Xarray. 295 | 296 | Returns: 297 | Dataset with all data variables opened via TensorStore. 298 | """ 299 | # We use xarray.open_zarr (which uses Zarr Python internally) to open the 300 | # initial version of the dataset for a few reasons: 301 | # 1. TensorStore does not support Zarr groups or array attributes, which we 302 | # need to open in the xarray.Dataset. We use Zarr Python instead of 303 | # parsing the raw Zarr metadata files ourselves. 304 | # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. 305 | # 3. 
Xarray's open_zarr machinery does some pre-processing (e.g., from numeric 306 | # to datetime64 dtypes) that we would otherwise need to invoke explicitly 307 | # via xarray.decode_cf(). 308 | # 309 | # Fortunately (2) and (3) are most commonly encountered on small coordinate 310 | # arrays, for which the performance advantages of TensorStore are irrelevant. 311 | 312 | if context is None: 313 | context = tensorstore.Context() 314 | 315 | # Open Xarray's backends.ZarrStore directly so we can get access to the 316 | # underlying Zarr group's consolidated metadata. 317 | store = xarray.backends.ZarrStore.open_group( 318 | path, consolidated=consolidated 319 | ) 320 | group = store.zarr_group 321 | ds = xarray.open_dataset( 322 | filename_or_obj='', # ignored in favor of store= 323 | chunks=None, # avoid using dask 324 | mask_and_scale=mask_and_scale, 325 | store=store, 326 | engine='zarr', 327 | ) 328 | 329 | if mask_and_scale: 330 | # Data variables get replaced below with _TensorStoreAdapter arrays, which 331 | # don't get masked or scaled. Raising an error avoids surprising users with 332 | # incorrect data values. 333 | _raise_if_mask_and_scale_used_for_data_vars(ds) 334 | 335 | zarr_format = _get_zarr_format(path) 336 | array_futures = _open_tensorstore_arrays( 337 | path, list(ds), group, zarr_format, write=write, context=context 338 | ) 339 | arrays = {k: v.result() for k, v in array_futures.items()} 340 | new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} 341 | 342 | return ds.copy(data=new_data) 343 | 344 | 345 | def _tensorstore_open_concatenated_zarrs( 346 | paths: list[str], 347 | data_vars: list[str], 348 | concat_axes: list[int], 349 | context: tensorstore.Context, 350 | ) -> dict[str, tensorstore.TensorStore]: 351 | """Open multiple zarrs with TensorStore. 352 | 353 | Args: 354 | paths: List of paths to zarr stores. 355 | data_vars: List of data variable names to open. 356 | concat_axes: List of axes along which to concatenate the data variables. 357 | context: TensorStore context. 358 | 359 | Returns: 360 | Dictionary of data variable names to concatenated TensorStore arrays. 361 | """ 362 | # Open all arrays in all datasets using tensorstore 363 | arrays_list = [] 364 | for path in paths: 365 | zarr_format = _get_zarr_format(path) 366 | # TODO(shoyer): Figure out how to support opening concatenated Zarrs with 367 | # consolidated metadata. xarray.open_mfdataset() doesn't support opening 368 | # from an existing store, so we'd have to replicate that functionality for 369 | # figuring out the structure of the concatenated dataset. 370 | group = None 371 | array_futures = _open_tensorstore_arrays( 372 | path, data_vars, group, zarr_format, write=False, context=context 373 | ) 374 | arrays_list.append(array_futures) 375 | 376 | # Concatenate the tensorstore arrays 377 | arrays = {} 378 | for k, axis in zip(data_vars, concat_axes, strict=True): 379 | datasets = [array_futures[k].result() for array_futures in arrays_list] 380 | arrays[k] = tensorstore.concat(datasets, axis=axis) 381 | 382 | return arrays 383 | 384 | 385 | def open_concatenated_zarrs( 386 | paths: list[str], 387 | concat_dim: str, 388 | *, 389 | context: tensorstore.Context | None = None, 390 | mask_and_scale: bool = True, 391 | ) -> xarray.Dataset: 392 | """Open an xarray.Dataset whilst concatenating multiple Zarr using TensorStore. 393 | 394 | Notes: 395 | This function depends on the Dask package. 396 | 397 | Args: 398 | paths: List of paths to zarr stores. 
399 | concat_dim: Dimension along which to concatenate the data variables. 400 | context: TensorStore context. 401 | mask_and_scale: Whether to mask and scale the data. 402 | 403 | Returns: 404 | Concatenated Dataset with all data variables opened via TensorStore. 405 | """ 406 | if context is None: 407 | context = tensorstore.Context() 408 | 409 | ds = xarray.open_mfdataset( 410 | paths, 411 | concat_dim=concat_dim, 412 | combine='nested', 413 | mask_and_scale=mask_and_scale, 414 | engine='zarr', 415 | ) 416 | 417 | if mask_and_scale: 418 | # Data variables get replaced below with _TensorStoreAdapter arrays, which 419 | # don't get masked or scaled. Raising an error avoids surprising users with 420 | # incorrect data values. 421 | _raise_if_mask_and_scale_used_for_data_vars(ds) 422 | 423 | data_vars = list(ds.data_vars) 424 | concat_axes = [ds[v].dims.index(concat_dim) for v in data_vars] 425 | arrays = _tensorstore_open_concatenated_zarrs( 426 | paths, data_vars, concat_axes, context 427 | ) 428 | new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} 429 | 430 | return ds.copy(data=new_data) 431 | --------------------------------------------------------------------------------
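For reference, a minimal usage sketch of the write support exercised by `test_setitem` above, assuming a Zarr store at `./example.zarr` (hypothetical path) containing a data variable named `'baz'`:

```python
import numpy as np
import xarray_tensorstore

# Open with write access enabled (stores are opened read-only by default).
opened = xarray_tensorstore.open_zarr('./example.zarr', write=True)['baz']

# Item assignment writes through to the underlying Zarr store.
opened[1:, ...] = np.full(opened[1:, ...].shape, -1)

# Reads issued afterwards observe the newly written values.
print(xarray_tensorstore.read(opened).compute())
```

Without `write=True`, the same assignment raises a `ValueError`, as covered by `test_setitem_readonly`.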