├── docs
    ├── img
    │   └── shadows_header.jpg
    ├── references.md
    ├── api.md
    ├── references.bib
    ├── examples
    │   ├── index.md
    │   ├── shadows-zarr.ipynb
    │   └── shadows-features.ipynb
    ├── Makefile
    ├── make.bat
    ├── index.md
    └── conf.py
├── tests
    ├── conftest.py
    ├── test_shadows_zarr.py
    └── test_shadows_hdf5.py
├── src
    └── shadows
    │   ├── __init__.py
    │   ├── compat.py
    │   ├── anndatashadow.py
    │   ├── mudatashadow.py
    │   ├── elemshadow.py
    │   └── datashadow.py
├── .github
    └── workflows
    │   ├── pythonpackage.yml
    │   └── docs.yml
├── LICENSE
├── README.md
├── .gitignore
└── pyproject.toml


/docs/img/shadows_header.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scverse/shadows/HEAD/docs/img/shadows_header.jpg


--------------------------------------------------------------------------------
/docs/references.md:
--------------------------------------------------------------------------------
 1 | # References
 2 | 
 3 | ```{bibliography}
 4 | :cited:
 5 | ```
 6 | 
 7 | 
 8 | ```{autosummary}
 9 | :toctree: generated
10 | :recursive:
11 | 
12 | *
13 | ```
14 | 


--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
 1 | # API
 2 | 
 3 | 
 4 | ```{eval-rst}
 5 | .. automodule:: shadows
 6 |     :show-inheritance:
 7 |     :members: 
 8 | ```
 9 | 
10 | ```{autosummary}
11 | :toctree: generated
12 | :recursive:
13 | 
14 | *
15 | ```
16 | 


--------------------------------------------------------------------------------
/docs/references.bib:
--------------------------------------------------------------------------------
 1 | @article{bredikhin2022muon,
 2 |   title={Muon: multimodal omics analysis framework},
 3 |   author={Bredikhin, Danila and Kats, Ilia and Stegle, Oliver},
 4 |   journal={Genome Biology},
 5 |   volume={23},
 6 |   number={1},
 7 |   pages={1--12},
 8 |   year={2022},
 9 |   publisher={Springer}
10 | }
11 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | @pytest.fixture(scope="module")
 5 | def filepath_h5ad(tmpdir_factory):
 6 |     yield str(tmpdir_factory.mktemp("tmp_test_dir_shadows").join("test.h5ad"))
 7 | 
 8 | 
 9 | @pytest.fixture(scope="module")
10 | def filepath_h5mu(tmpdir_factory):
11 |     yield str(tmpdir_factory.mktemp("tmp_test_dir_shadows").join("test.h5mu"))
12 | 
13 | 
14 | @pytest.fixture(scope="module")
15 | def filepath_mudata_zarr(tmpdir_factory):
16 |     yield str(tmpdir_factory.mktemp("tmp_test_dir_shadows").join("test_mudata.zarr"))
17 | 


--------------------------------------------------------------------------------
/src/shadows/__init__.py:
--------------------------------------------------------------------------------
 1 | from .anndatashadow import AnnDataShadow
 2 | from .datashadow import DataShadow
 3 | from .mudatashadow import MuDataShadow
 4 | 
 5 | try:  # See https://github.com/maresb/hatch-vcs-footgun-example
 6 |     from setuptools_scm import get_version
 7 | 
 8 |     __version__ = get_version(root="../..", relative_to=__file__)
 9 | except (ImportError, LookupError):
10 |     try:
11 |         from ._version import __version__
12 |     except ModuleNotFoundError:
13 |         raise RuntimeError("pqdata is not correctly installed. Please install it, e.g. with pip.")
14 | 
15 | __all__ = ["DataShadow", "AnnDataShadow", "MuDataShadow", "__version__"]
16 | 


--------------------------------------------------------------------------------
/docs/examples/index.md:
--------------------------------------------------------------------------------
 1 | # Examples
 2 | 
 3 | ```{toctree}
 4 | :maxdepth: 2
 5 | 
 6 | shadow-objects.ipynb
 7 | shadows-features.ipynb
 8 | shadows-zarr.ipynb
 9 | ```
10 | 
11 | ```{contents}
12 | :local:
13 | :depth: 3
14 | ```
15 | 
16 | Shadows offer an interface for AnnData and MuData files on disk that enables loading the necessary parts of the datasets into memory (and caching them) only when needed.
17 | 
18 | More features and details of (low!) memory consumption are outlined on the following pages:
19 | 
20 | - [key features and memory consumption tracking](shadow-objects.ipynb),
21 | 
22 | - [more features](shadows-features.ipynb),
23 | 
24 | - [zarr interface](shadows-zarr.ipynb).
25 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/src/shadows/compat.py:
--------------------------------------------------------------------------------
 1 | from anndata._io.specs import read_elem as ad_read_elem
 2 | 
 3 | try:
 4 |     from pqdata.core import Array as PqArray
 5 |     from pqdata.core import Group as PqGroup
 6 | except ImportError:
 7 | 
 8 |     class PqArray:
 9 |         @staticmethod
10 |         def __repr__():
11 |             return "mock pqdata.core.Array"
12 | 
13 |     class PqGroup:
14 |         @staticmethod
15 |         def __repr__():
16 |             return "mock pqdata.core.Group"
17 | 
18 | 
19 | def read_elem(*args, **kwargs):  # dispatch to the pqdata or the anndata element reader
20 |     if "_format" in kwargs:
21 |         format = kwargs.pop("_format")  # consumed here, never forwarded to the underlying reader
22 |         if format == "parquet":
23 |             from pqdata.core import read_elem as pq_read_elem  # imported only when actually needed
24 | 
25 |             return pq_read_elem(*args, **kwargs)
26 |         else:
27 |             return ad_read_elem(*args, **kwargs)  # any other explicit format uses anndata's reader
28 |     else:
29 |         return ad_read_elem(*args, **kwargs)  # no format given: anndata's reader is the default
30 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
 1 | name: Python package
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       matrix:
11 |         python-version: ["3.10", "3.11", "3.12"]
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v3
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v4
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 |     - name: Install dependencies
20 |       run: |
21 |         python -m pip install --upgrade pip
22 |         python -m pip install uv
23 |         uv venv
24 |         source .venv/bin/activate
25 |         uv pip install ruff pytest
26 |         uv pip install '.[dev,test,all]'
27 |     - name: Ruff check
28 |       run: |
29 |         source .venv/bin/activate
30 |         ruff check src/shadows
31 |     - name: Test with pytest
32 |       run: |
33 |         source .venv/bin/activate
34 |         pytest
35 | 


--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: docs
 2 | on: [push]
 3 | 
 4 | jobs:
 5 |   docs:
 6 | 
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     permissions:
10 |       contents: write
11 |     steps:
12 |     - uses: actions/checkout@v4
13 |     - name: Set up Python 3.11
14 |       uses: actions/setup-python@v4
15 |       with:
16 |         python-version: "3.11"
17 |     - name: Install
18 |       run: |
19 |         python -m pip install --upgrade pip
20 |         python -m pip install '.[doc]'
21 |     - name: Install pandoc
22 |       run: sudo apt-get install -y pandoc
23 |     - name: Build HTML
24 |       working-directory: docs
25 |       run: |
26 |         make html -e
27 |     - name: Upload artifacts
28 |       uses: actions/upload-artifact@v4
29 |       with:
30 |         name: html-docs
31 |         path: docs/_build/html/
32 |     - name: Deploy
33 |       uses: peaceiris/actions-gh-pages@v3
34 |       if: github.ref == 'refs/heads/main'
35 |       with:
36 |         github_token: ${{ secrets.GITHUB_TOKEN }}
37 |         publish_dir: docs/_build/html
38 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Shadows
 2 | 
 3 | ```{toctree}
 4 | :hidden: true
 5 | :maxdepth: 2
 6 | 
 7 | examples/index.md
 8 | api.md
 9 | references.md
10 | ```
11 | 
12 | `shadows` is a Python library with low-memory interfaces for [scverse](https://scverse.org) data structures such as [AnnData](https://github.com/scverse/anndata) and [MuData](https://github.com/scverse/mudata).
13 | 
14 | [//]: # (numfocus-fiscal-sponsor-attribution)
15 | 
16 | shadows is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
17 | If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
18 | 
19 | <div align="center">
20 | <a href="https://numfocus.org/project/scverse">
21 |   <img
22 |     src="https://raw.githubusercontent.com/numfocus/templates/master/images/numfocus-logo.png"
23 |     width="200"
24 |   >
25 | </a>
26 | </div>
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2020, Danila Bredikhin
 4 | Copyright (c) 2025, scverse®
 5 | All rights reserved.
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are met:
 9 | 
10 | 1. Redistributions of source code must retain the above copyright notice, this
11 |    list of conditions and the following disclaimer.
12 | 
13 | 2. Redistributions in binary form must reproduce the above copyright notice,
14 |    this list of conditions and the following disclaimer in the documentation
15 |    and/or other materials provided with the distribution.
16 | 
17 | 3. Neither the name of the copyright holder nor the names of its
18 |    contributors may be used to endorse or promote products derived from
19 |    this software without specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <img src="./docs/img/shadows_header.jpg" data-canonical-src="./docs/img/shadows_header.svg" width="100%"/>
 2 | 
 3 | # Shadows
 4 | 
 5 | Shadows are on-disk interfaces for scverse data standards such as [AnnData](https://github.com/scverse/anndata) and [MuData](https://github.com/scverse/mudata).
 6 | 
 7 | It is an experimental project.
 8 | 
 9 | [![PyPi version](https://img.shields.io/pypi/v/shadows)](https://pypi.org/project/shadows)
10 | 
11 | ## Installation
12 | 
13 | ```
14 | pip install shadows
15 | # or
16 | pip install git+https://github.com/scverse/shadows
17 | ```
18 | 
19 | ## Features
20 | 
21 | The shadows library aims to implement the following features:
22 | 
23 | - [x] **Shadow objects**: Read-only AnnDataShadow and MuDataShadow for HDF5 files.
24 | 
25 | - [x] AnnDataShadow and MuDataShadow for Zarr files.
26 | 
27 | - [x] AnnDataShadow and MuDataShadow for Parquet-based serialization ([pqdata](https://github.com/gtca/pqdata)).
28 | 
29 | - [x] Data shadows for `.pqdata` and `.zarr` files on S3 storage.
30 | 
31 | 
32 | ### Shadow objects
33 | 
34 | Briefly, shadow objects simply work like this:
35 | 
36 | ```py
37 | from shadows import *
38 | ash = AnnDataShadow("pbmc3k.h5ad")
39 | msh = MuDataShadow("pbmc5k_citeseq.h5mu")
40 | ```
41 | 
42 | All the various features are showcased in the following tutorials:
43 | 
44 | - [Getting started with shadow objects](/docs/examples/shadow-objects.ipynb)
45 | 
46 | - [Advanced features of shadow objects](/docs/examples/shadows-features.ipynb)
47 | 
48 | [//]: # (numfocus-fiscal-sponsor-attribution)
49 | 
50 | shadows is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
51 | If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
52 | 
53 | <div align="center">
54 | <a href="https://numfocus.org/project/scverse">
55 |   <img
56 |     src="https://raw.githubusercontent.com/numfocus/templates/master/images/numfocus-logo.png"
57 |     width="200"
58 |   >
59 | </a>
60 | </div>
61 | 
62 | 


--------------------------------------------------------------------------------
/tests/test_shadows_zarr.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import pytest
 3 | from typing import Optional
 4 | 
 5 | from shadows import AnnDataShadow, MuDataShadow
 6 | 
 7 | import numpy as np
 8 | from scipy.sparse import coo_matrix
 9 | from anndata import AnnData
10 | from mudata import MuData
11 | 
12 | N, D = 50, 20
13 | 
14 | 
15 | def matrix(sparse_x: bool = False, n: Optional[int] = None, d: Optional[int] = None):
16 |     np.random.seed(100)
17 | 
18 |     if n is None:
19 |         n = N
20 |     if d is None:
21 |         d = D
22 | 
23 |     if sparse_x:
24 |         sparsity = 0.2
25 |         row = np.random.choice(n, 1000 * sparsity)
26 |         col = np.random.choice(d, 1000 * sparsity)
27 |         data = np.random.normal(size=1000 * sparsity)
28 | 
29 |         x = coo_matrix((data, (row, col)), shape=(n, d)).tocsr()
30 |     else:
31 |         x = np.random.normal(size=(n, d))
32 |     return x
33 | 
34 | 
35 | @pytest.fixture()
36 | def adata(sparse_x: bool = False, obsm: bool = False):
37 |     x = matrix(sparse_x)
38 |     ad = AnnData(X=x)
39 | 
40 |     return ad
41 | 
42 | 
43 | @pytest.fixture()
44 | def mdata(sparse_x: bool = False, sparse_y: bool = False):
45 |     np.random.seed(42)
46 | 
47 |     xn, xd = np.random.choice(100, 2)
48 |     yn, yd = np.random.choice(100, 2)
49 | 
50 |     x = matrix(sparse_x, n=xn, d=xd)
51 |     y = matrix(sparse_y, n=yn, d=yd)
52 | 
53 |     ax = AnnData(X=x)
54 |     ay = AnnData(X=y)
55 | 
56 |     ax.var_names = [f"x{i}" for i in range(xd)]
57 |     ay.var_names = [f"y{i}" for i in range(yd)]
58 | 
59 |     mdata = MuData({"x": ax, "y": ay})
60 | 
61 |     return mdata
62 | 
63 | 
64 | @pytest.mark.usefixtures("filepath_mudata_zarr")
65 | class TestMuData:
66 |     def test_mudata_simple(self, mdata, filepath_mudata_zarr):
67 |         filename = filepath_mudata_zarr
68 |         mdata.write_zarr(filename)
69 | 
70 |         msh = MuDataShadow(filename)
71 | 
72 |         assert mdata.shape == msh.shape
73 | 
74 |         msh.close()
75 | 
76 |     def test_anndata_inside_mudata(self, mdata, filepath_mudata_zarr):
77 |         filename = filepath_mudata_zarr
78 |         mdata.write_zarr(filename)
79 | 
80 |         mod_x = Path(filename) / "mod" / "x"
81 |         mod_y = Path(filename) / "mod" / "y"
82 | 
83 |         ash_x = AnnDataShadow(mod_x)
84 |         ash_y = AnnDataShadow(mod_y)
85 | 
86 |         assert ash_x.shape == mdata["x"].shape
87 |         assert ash_y.shape == mdata["y"].shape
88 | 
89 |         ash_x.close()
90 |         ash_y.close()
91 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # macOS
132 | .DS_Store
133 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | build-backend = "hatchling.build"
  3 | requires = ["hatchling", "hatch-vcs"]
  4 | 
  5 | [project]
  6 | name = "shadows"
  7 | description = "Low-memory data interfaces for scverse"
  8 | requires-python = ">= 3.10"
  9 | license = { file = "LICENSE" }
 10 | authors = [
 11 |     { name = "Danila Bredikhin" },
 12 | ]
 13 | maintainers = [
 14 |     { name = "Danila Bredikhin", email = "danila@stanford.edu" },
 15 | ]
 16 | readme = "README.md"
 17 | classifiers = [
 18 |     "Programming Language :: Python :: 3",
 19 |     "License :: OSI Approved :: BSD License",
 20 |     "Operating System :: OS Independent",
 21 |     "Development Status :: 3 - Alpha",
 22 |     "Topic :: Scientific/Engineering :: Bio-Informatics",
 23 |     "Intended Audience :: Science/Research"
 24 | ]
 25 | dependencies = [
 26 | ]
 27 | dynamic = ["version"]
 28 | 
 29 | [project.urls]
 30 | Documentation = "https://scverse.github.io/shadows"
 31 | Source = "https://github.com/scverse/shadows"
 32 | Home-page = "https://github.com/scverse/shadows"
 33 | 
 34 | [project.optional-dependencies]
 35 | dev = [
 36 |     "setuptools_scm",
 37 | ]
 38 | doc = [
 39 |     "sphinx",
 40 |     "myst-parser",
 41 |     "sphinx-book-theme",
 42 |     "readthedocs-sphinx-search",
 43 |     "nbsphinx",
 44 |     "sphinx-automodapi",
 45 |     "sphinxcontrib-bibtex",
 46 |     "sphinx-autodoc-typehints",
 47 |     "furo",
 48 |     "ipython",  # fix nbsphinx syntax highlighting
 49 |     "pygments",
 50 | ]
 51 | test = [
 52 |     "scipy",
 53 |     "pytest",
 54 |     "pytest-cov",
 55 |     "zarr",
 56 |     "pqdata",
 57 |     "mudata",
 58 |     "anndata",
 59 | ]
 60 | all = [
 61 |     "scipy",
 62 |     "anndata >= 0.8",
 63 |     "mudata >= 0.2",
 64 | ]
 65 | 
 66 | [tool.pytest.ini_options]
 67 | python_files = "test_*.py"
 68 | testpaths = [
 69 |     "./tests", # unit tests
 70 | ]
 71 | 
 72 | [tool.black]
 73 | line-length = 100
 74 | target-version = ['py39']
 75 | 
 76 | [tool.hatch.version]
 77 | source = "vcs"
 78 | 
 79 | [tool.hatch.build.hooks.vcs]
 80 | version-file = "src/shadows/_version.py"
 81 | 
 82 | [tool.hatch.build.targets.wheel]
 83 | packages = ["src/shadows"]
 84 | 
 85 | [tool.hatch.build.targets.sdist]
 86 | exclude = [
 87 |   "/.github",
 88 |   "/docs",
 89 | ]
 90 | 
 91 | [tool.ruff]
 92 | src = ["src"]
 93 | exclude = ["src/shadows/_version.py"]
 94 | 
 95 | [tool.ruff.format]
 96 | docstring-code-format = true
 97 | 
 98 | [tool.ruff.lint]
 99 | select = [
100 |     "E",   # Error detected by Pycodestyle
101 |     "F",   # Errors detected by Pyflakes
102 |     "W",   # Warning detected by Pycodestyle
103 |     "UP",  # pyupgrade
104 |     "I",   # isort
105 |     "TCH", # manage type checking blocks
106 |     "ICN", # Follow import conventions
107 |     "PTH", # Pathlib instead of os.path
108 |     "PT",  # Pytest conventions
109 | ]
110 | ignore = [
111 |     # line too long -> we accept long comment lines; formatter gets rid of long code lines
112 |     "E501",
113 |     # Do not assign a lambda expression, use a def -> AnnData allows lambda expression assignments,
114 |     "E731",
115 |     # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation
116 |     "E741",
117 | ]
118 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | import sys
10 | from pathlib import Path
11 | from datetime import datetime
12 | from importlib.metadata import metadata
13 | 
14 | HERE = Path(__file__).parent
15 | sys.path.insert(0, str(HERE / "extensions"))
16 | 
17 | # -- Project information -----------------------------------------------------
18 | 
19 | project = "shadows"
20 | author = "Danila Bredikhin"
21 | copyright = f"{datetime.now():%Y}, {author}"
22 | info = metadata("shadows")
23 | version = info["Version"]
24 | 
25 | bibtex_bibfiles = ["references.bib"]
26 | bibtex_reference_style = "author_year"
27 | templates_path = ["_templates"]
28 | nitpicky = True  # Warn about broken links
29 | needs_sphinx = "4.0"
30 | 
31 | html_context = {
32 |     "display_github": True,  # Integrate GitHub
33 |     "github_user": "gtca",  # Username
34 |     "github_repo": project,  # Repo name
35 |     "github_version": "main",  # Version
36 |     "conf_py_path": "/docs/",  # Path in the checkout to the docs root
37 | }
38 | 
39 | # -- General configuration ---------------------------------------------------
40 | 
41 | # Add any Sphinx extension module names here, as strings. They can be
42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
43 | # ones.
44 | extensions = [
45 |     "myst_parser",
46 |     "sphinx.ext.autodoc",
47 |     "sphinx.ext.intersphinx",
48 |     "sphinx.ext.autosummary",
49 |     "sphinx.ext.napoleon",
50 |     "sphinxcontrib.bibtex",
51 |     "sphinx_autodoc_typehints",
52 |     "nbsphinx",
53 |     "sphinx.ext.mathjax",
54 |     *[p.stem for p in (HERE / "extensions").glob("*.py")],
55 | ]
56 | 
57 | 
58 | autosummary_generate = True
59 | autodoc_member_order = "groupwise"
60 | default_role = "literal"
61 | napoleon_google_docstring = False
62 | napoleon_numpy_docstring = True
63 | napoleon_include_init_with_doc = False
64 | napoleon_use_rtype = True  # having a separate entry generally helps readability
65 | napoleon_use_param = True
66 | 
67 | intersphinx_mapping = {
68 |     "anndata": ("https://anndata.readthedocs.io/en/stable/", None),
69 |     "numpy": ("https://numpy.org/doc/stable/", None),
70 | }
71 | 
72 | nbsphinx_execute = "never"
73 | 
74 | # Add any paths that contain templates here, relative to this directory.
75 | templates_path = ["_templates"]
76 | 
77 | # List of patterns, relative to source directory, that match files and
78 | # directories to ignore when looking for source files.
79 | # This pattern also affects html_static_path and html_extra_path.
80 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"]
81 | 
82 | 
83 | # -- Options for HTML output -------------------------------------------------
84 | 
85 | # The theme to use for HTML and HTML Help pages.  See the documentation for
86 | # a list of builtin themes.
87 | #
88 | html_theme = "furo"
89 | 
90 | # Add any paths that contain custom static files (such as style sheets) here,
91 | # relative to this directory. They are copied after the builtin static files,
92 | # so a file named "default.css" will overwrite the builtin "default.css".
93 | html_static_path = ["_static"]
94 | html_theme_options = {
95 |     "logo_only": True,
96 | }
97 | 


--------------------------------------------------------------------------------
/src/shadows/anndatashadow.py:
--------------------------------------------------------------------------------
  1 | from functools import cached_property
  2 | from pathlib import Path
  3 | 
  4 | import numpy as np
  5 | from anndata import AnnData
  6 | 
  7 | # For simplicity, use AnnData read_elem/write_elem
  8 | from anndata._core.index import _normalize_indices
  9 | 
 10 | from .datashadow import DataShadow
 11 | from .elemshadow import ElemShadow, RawElemShadow, _get_backend_reader
 12 | 
 13 | RUNECACHED = "\u1401"
 14 | RUNECACHEDALT = "\u25bc"
 15 | RUNENEW = "\u25b2"
 16 | 
 17 | 
 18 | class AnnDataShadow(DataShadow):
 19 |     def __init__(self, filepath, *args, **kwargs):
 20 |         super().__init__(filepath, *args, **kwargs)
 21 | 
 22 |     @classmethod
 23 |     def _init_as_view(cls, shadow, oidx, vidx):
 24 |         if shadow._format == "zarr":
 25 |             filename = shadow.file.store.path
 26 |             mode = "r+" if not shadow.file.read_only else "r"
 27 |         elif shadow._format == "parquet":
 28 |             filename = shadow.file.path
 29 |             mode = "r+"  # FIXME
 30 |             # raise NotImplementedError("Parquet format is not supported for views.")
 31 |         else:
 32 |             filename = shadow.file.filename
 33 |             mode = shadow.file.mode
 34 | 
 35 |         if shadow.root != "/":
 36 |             filename = (
 37 |                 str(Path(filename) / shadow.root[1:])
 38 |                 if shadow.root.startswith("/")
 39 |                 else str(Path(filename) / shadow.root)
 40 |             )
 41 |         view = AnnDataShadow(
 42 |             filename,
 43 |             array_backend=shadow._array_backend,
 44 |             table_backend=shadow._table_backend,
 45 |             mode=mode,
 46 |             format=shadow._format,
 47 |         )
 48 | 
 49 |         # NOTE: Cache is not preserved in a new object
 50 | 
 51 |         view._is_view = True
 52 |         view._ref = shadow
 53 |         view._oidx = oidx
 54 |         view._vidx = vidx
 55 | 
 56 |         if shadow.is_view:
 57 |             view._ref = shadow._ref
 58 |             for attr, idx in (("_oidx", oidx), ("_vidx", vidx)):
 59 |                 shadow_idx = getattr(shadow, attr)
 60 |                 if shadow_idx is not None:
 61 |                     n_attr = shadow._ref.n_obs if attr == "_oidx" else shadow._ref.n_vars
 62 |                     if isinstance(shadow_idx, slice) and isinstance(idx, int | np.integer | slice):
 63 |                         r = range(*shadow_idx.indices(n_attr)).__getitem__(idx)
 64 |                         if isinstance(r, int | np.integer):
 65 |                             setattr(view, attr, np.array([r]))
 66 |                         setattr(view, attr, slice(r.start, r.stop, r.step))
 67 |                     elif isinstance(shadow_idx, slice):
 68 |                         setattr(view, attr, np.arange(*shadow_idx.indices(shadow._ref.n_obs))[idx])
 69 |                     elif hasattr(shadow_idx.dtype, "type") and issubclass(
 70 |                         shadow_idx.dtype.type, np.bool_
 71 |                     ):
 72 |                         if hasattr(idx.dtype, "type") and issubclass(idx.dtype.type, np.bool_):
 73 |                             view_idx = shadow_idx.copy()
 74 |                             view_idx[view_idx] = idx
 75 |                             setattr(view, attr, view_idx)
 76 |                         else:
 77 |                             setattr(view, attr, shadow_idx[np.where(idx)[0]])
 78 |                     else:
 79 |                         setattr(view, attr, shadow_idx[idx])
 80 |         return view
 81 | 
 82 |     @cached_property
 83 |     def _X(self):
 84 |         reader = _get_backend_reader(self._array_backend, self._lazy)
 85 |         if self.is_view:
 86 |             if (
 87 |                 isinstance(self._vidx, slice)
 88 |                 and self._vidx.start is None
 89 |                 and self._vidx.stop is None
 90 |             ):
 91 |                 x = reader(self.file[self.root]["X"][self._oidx])
 92 |             elif (
 93 |                 isinstance(self._oidx, slice)
 94 |                 and self._oidx.start is None
 95 |                 and self._oidx.stop is None
 96 |             ):
 97 |                 x = reader(self.file[self.root]["X"][:, self._vidx])
 98 |             else:
 99 |                 # Only one indexing array at a time is possible
100 |                 x = reader(self.file[self.root]["X"][self._oidx][:, self._vidx])
101 |         else:
102 |             x = reader(self.file[self.root]["X"])
103 |         self._ids["X"] = id(x)
104 |         return x
105 | 
106 |     @property
107 |     def X(self):
108 |         return self._X
109 | 
110 |     @cached_property
111 |     def _layers(self):
112 |         group_storage = (
113 |             self.file[self.root]["layers"] if "layers" in self.file[self.root] else dict()
114 |         )
115 |         return ElemShadow(
116 |             group_storage,
117 |             key=str(Path(self.root) / "layers"),
118 |             cache=self.__dict__,
119 |             n_obs=self.n_obs,
120 |             n_vars=self.n_vars,
121 |             array_backend=self._array_backend,
122 |             table_backend=self._table_backend,
123 |             is_view=self.is_view,
124 |             idx=(self._oidx, self._vidx),
125 |         )
126 | 
127 |     @property
128 |     def layers(self):
129 |         return self._layers
130 | 
131 |     @cached_property
132 |     def _raw(self):
133 |         """
134 |         Legacy support. New objects should not use .raw.
135 |         """
136 |         if "raw" in self.file[self.root]:
137 |             group_storage = self.file[self.root]["raw"]
138 |         else:
139 |             group_storage = dict()
140 | 
141 |         return RawElemShadow(
142 |             group_storage,
143 |             key=str(Path(self.root) / "raw"),
144 |             cache=self.__dict__,
145 |             n_obs=self.n_obs,
146 |             n_vars=None,
147 |             array_backend=self._array_backend,
148 |             table_backend=self._table_backend,
149 |             file=self.file,
150 |             is_view=self.is_view,
151 |             idx=(self._oidx, None),
152 |         )
153 | 
154 |     @property
155 |     def raw(self):
156 |         return self._raw
157 | 
158 |     def __repr__(self):
159 |         if self.is_view:
160 |             if self._ref is not None:
161 |                 s = f"View of AnnData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars} (original {self._ref.n_obs} × {self._ref.n_vars})\n"
162 |             else:
163 |                 s = f"View of AnnData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n"
164 |         else:
165 |             s = f"AnnData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n"
166 | 
167 |         # X
168 |         key_cached = "X" in self.__dict__ or "_X" in self.__dict__
169 |         key_cached_str = RUNECACHED if key_cached else ""
170 |         if key_cached:
171 |             if "X" in self._ids and self._ids["X"] != id(self.X):
172 |                 key_cached_str = RUNECACHEDALT
173 |             elif "_X" in self._ids and self._ids["_X"] != id(self.X):
174 |                 key_cached_str = RUNECACHEDALT
175 | 
176 |         s += f"  X {key_cached_str} \n"
177 | 
178 |         # raw
179 |         if self.raw and len(self.raw.keys()) > 0:
180 |             s += "  " + self.raw.__repr__()
181 | 
182 |         # layers
183 |         if len(self.layers) > 0:
184 |             s += "  " + self.layers.__repr__()
185 | 
186 |         s += "\n".join(["  " + line for line in super().__repr__().strip().split("\n")]) + "\n"
187 | 
188 |         return s
189 | 
190 |     def obs_vector(self, key: str, layer: str | None = None):
191 |         if key not in self.obs.columns and key not in self.var_names:
192 |             key = str.encode(key)
193 |         if key in self.var_names:
194 |             # Assume unique var_names
195 |             key_i = np.where(self.var_names == key)[0][0]
196 |             if layer is not None:
197 |                 return self.layers[layer][:, key_i]
198 |             else:
199 |                 return self.X[:, key_i]
200 | 
201 |         return self.obs[key].values
202 | 
203 |     def var_vector(self, key: str, layer: str | None = None):
204 |         if key not in self.var.columns and key not in self.obs_names:
205 |             key = str.encode(key)
206 |         if key in self.obs_names:
207 |             # Assume unique obs_names
208 |             key_i = np.where(self.obs_names == key)[0][0]
209 |             if layer is not None:
210 |                 return self.layers[layer][key_i, :]
211 |             else:
212 |                 return self.X[key_i, :]
213 | 
214 |         return self.var[key].values
215 | 
216 |     # Views
217 | 
218 |     def __getitem__(self, index):
219 |         oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
220 |         return AnnDataShadow._init_as_view(self, oidx, vidx)
221 | 
222 |     #
223 |     # It is either this or duck typing.
224 |     #
225 |     # Frequently used tools like scanpy
226 |     # check if the object is an AnnData instance
227 |     # inside quite a few functions.
228 |     #
229 |     # Until those instances are replaced with duck typing,
230 |     # the remedy is to mock the class name.
231 |     #
232 | 
233 |     @property
234 |     def __class__(self):
235 |         return AnnData
236 | 


--------------------------------------------------------------------------------
/tests/test_shadows_hdf5.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | import pytest
  3 | from typing import Optional
  4 | 
  5 | from shadows import AnnDataShadow, MuDataShadow
  6 | 
  7 | import numpy as np
  8 | from scipy.sparse import coo_matrix
  9 | from anndata import AnnData
 10 | import mudata
 11 | from mudata import MuData
 12 | 
# Default number of observations (N) and variables (D);
# matrix() falls back to these when n / d are not given
N, D = 50, 20

# NOTE(review): presumably opts in to the newer MuData update behaviour
# for the whole test module — confirm against the mudata documentation
mudata.set_options(pull_on_update=False)
 16 | 
 17 | 
 18 | def matrix(sparse_x: bool = False, n: Optional[int] = None, d: Optional[int] = None):
 19 |     np.random.seed(100)
 20 | 
 21 |     if n is None:
 22 |         n = N
 23 |     if d is None:
 24 |         d = D
 25 | 
 26 |     if sparse_x:
 27 |         sparsity = 0.2
 28 |         row = np.random.choice(n, 1000 * sparsity)
 29 |         col = np.random.choice(d, 1000 * sparsity)
 30 |         data = np.random.normal(size=1000 * sparsity)
 31 | 
 32 |         x = coo_matrix((data, (row, col)), shape=(n, d)).tocsr()
 33 |     else:
 34 |         x = np.random.normal(size=(n, d))
 35 |     return x
 36 | 
 37 | 
 38 | @pytest.fixture()
 39 | def adata(sparse_x: bool = False, obsm: bool = False):
 40 |     x = matrix(sparse_x)
 41 |     ad = AnnData(X=x)
 42 | 
 43 |     return ad
 44 | 
 45 | 
 46 | @pytest.fixture()
 47 | def mdata(sparse_x: bool = False, sparse_y: bool = False):
 48 |     np.random.seed(42)
 49 | 
 50 |     xn, xd = np.random.choice(100, 2)
 51 |     yn, yd = np.random.choice(100, 2)
 52 | 
 53 |     x = matrix(sparse_x, n=xn, d=xd)
 54 |     y = matrix(sparse_y, n=yn, d=yd)
 55 | 
 56 |     ax = AnnData(X=x)
 57 |     ay = AnnData(X=y)
 58 | 
 59 |     ax.var_names = [f"x{i}" for i in range(xd)]
 60 |     ay.var_names = [f"y{i}" for i in range(yd)]
 61 | 
 62 |     mdata = MuData({"x": ax, "y": ay})
 63 | 
 64 |     return mdata
 65 | 
 66 | 
 67 | @pytest.mark.usefixtures("filepath_h5ad")
 68 | class TestAnnData:
 69 |     @pytest.mark.parametrize("sparse_x", [True, False])
 70 |     def test_anndata_simple(self, adata, filepath_h5ad, sparse_x):
 71 |         filename = filepath_h5ad
 72 |         adata.write(filename)
 73 | 
 74 |         ash = AnnDataShadow(filename)
 75 | 
 76 |         assert adata.shape == ash.shape
 77 | 
 78 |         ash.close()
 79 | 
 80 |     def test_anndata_obs(self, adata, filepath_h5ad):
 81 |         filename = filepath_h5ad.replace(".h5ad", "_obs.h5ad")
 82 | 
 83 |         adata.obs["logical"] = np.random.choice([True, False], size=N)
 84 |         adata.obs["integers"] = np.arange(N)
 85 |         adata.obs["floats"] = np.random.normal(size=N)
 86 |         adata.obs["strings"] = np.random.choice(["abc", "def"], size=N)
 87 |         adata.obs["categories"] = adata.obs["strings"].astype("category")
 88 | 
 89 |         adata.write(filename)
 90 | 
 91 |         ash = AnnDataShadow(filename)
 92 | 
 93 |         for key in ["logical", "integers", "floats", "strings", "categories"]:
 94 |             assert key in ash.obs.columns
 95 |             assert ash.obs[key].equals(adata.obs[key])
 96 | 
 97 |         assert adata.obs.shape == ash.obs.shape
 98 | 
 99 |     def test_anndata_obsm(self, adata, filepath_h5ad):
100 |         filename = filepath_h5ad.replace(".h5ad", "_obsm.h5ad")
101 | 
102 |         for i in range(2, 10):
103 |             adata.obsm["X_test"] = np.random.normal(size=(N, 2))
104 |             adata.write(filename)
105 | 
106 |             ash = AnnDataShadow(filename)
107 | 
108 |             assert "X_test" in ash.obsm
109 |             assert adata.obsm["X_test"].shape == ash.obsm["X_test"].shape
110 | 
111 |             ash.close()
112 | 
113 |     def test_anndata_var(self, adata, filepath_h5ad):
114 |         filename = filepath_h5ad.replace(".h5ad", "_var.h5ad")
115 | 
116 |         adata.var["logical"] = np.random.choice([True, False], size=D)
117 |         adata.var["integers"] = np.arange(D)
118 |         adata.var["floats"] = np.random.normal(size=D)
119 |         adata.var["strings"] = np.random.choice(["abc", "def"], size=D)
120 |         adata.var["categories"] = adata.var["strings"].astype("category")
121 | 
122 |         adata.write(filename)
123 | 
124 |         ash = AnnDataShadow(filename)
125 |         assert adata.var.shape == ash.var.shape
126 | 
127 |         assert ash.var.strings.equals(adata.var.strings)
128 |         assert ash.var.categories.equals(adata.var.categories)
129 | 
130 |         ash.close()
131 | 
132 |     def test_anndata_varm(self, adata, filepath_h5ad):
133 |         filename = filepath_h5ad.replace(".h5ad", "_varm.h5ad")
134 | 
135 |         for i in range(2, 10):
136 |             adata.varm["loadings"] = np.random.normal(size=(D, 2))
137 |             adata.write(filename)
138 | 
139 |             ash = AnnDataShadow(filename)
140 | 
141 |             assert "loadings" in ash.varm
142 |             assert adata.varm["loadings"].shape == ash.varm["loadings"].shape
143 | 
144 |             ash.close()
145 | 
146 |     def test_anndata_uns(self, adata, filepath_h5ad):
147 |         filename = filepath_h5ad.replace(".h5ad", "_uns.h5ad")
148 | 
149 |         adata.uns["logical"] = np.random.choice([True, False])
150 |         adata.uns["integer"] = 1
151 |         adata.uns["float"] = 0.1
152 |         adata.uns["string"] = "abc"
153 |         adata.uns["dict"] = {"a": 1, "b": 2}
154 | 
155 |         adata.write(filename)
156 | 
157 |         ash = AnnDataShadow(filename)
158 | 
159 |         assert adata.uns["string"] == ash.uns["string"]
160 |         assert adata.uns["dict"] == ash.uns["dict"]
161 | 
162 |         ash.close()
163 | 
164 | 
@pytest.mark.usefixtures("filepath_h5ad")
class TestViewsAnnData:
    """Views taken on an AnnDataShadow must match the same views on the AnnData."""

    def test_single_view_range(self, adata, filepath_h5ad):
        """Leading-range slices on both axes."""
        adata.write(filepath_h5ad)

        np.random.seed(42)
        n_rows = np.random.choice(N, 1)[0]
        n_cols = np.random.choice(D, 1)[0]

        shadow = AnnDataShadow(filepath_h5ad)

        expected = adata[:n_rows, :n_cols]
        shadow_view = shadow[:n_rows, :n_cols]

        assert shadow_view.shape == expected.shape
        assert shadow_view.shape == (n_rows, n_cols)
        assert shadow_view.X.shape == (n_rows, n_cols)

        shadow.close()

    def test_bool_slicing(self, adata, filepath_h5ad):
        """Boolean mask taken from an .obs column."""
        np.random.seed(42)
        chosen = np.random.choice(adata.obs_names, size=20, replace=False)
        adata.obs["sel"] = adata.obs_names.isin(chosen)

        adata.write(filepath_h5ad)

        shadow = AnnDataShadow(filepath_h5ad)
        expected = adata[adata.obs.sel, :]
        shadow_view = shadow[shadow.obs.sel, :]

        assert shadow_view.shape == expected.shape
        assert shadow_view.shape == (len(chosen), adata.n_vars)
        assert shadow_view.X.shape == (len(chosen), adata.n_vars)

        shadow.close()

    def test_nested_views(self, adata, filepath_h5ad):
        """A view of a view, with leading-range slices at both levels."""
        adata.write(filepath_h5ad)

        np.random.seed(42)
        outer_rows = np.random.choice(N, 1)[0]
        outer_cols = np.random.choice(D, 1)[0]
        inner_rows = np.random.choice(outer_rows, 1)[0]
        inner_cols = np.random.choice(outer_cols, 1)[0]

        shadow = AnnDataShadow(filepath_h5ad)

        expected = adata[:outer_rows, :outer_cols][:inner_rows, :inner_cols]
        shadow_view = shadow[:outer_rows, :outer_cols]
        shadow_view = shadow_view[:inner_rows, :inner_cols]

        assert shadow_view.shape == expected.shape
        assert shadow_view.shape == (inner_rows, inner_cols)
        assert shadow_view.X.shape == (inner_rows, inner_cols)

        assert shadow_view.obs_names.equals(expected.obs_names)
        assert shadow_view.var_names.equals(expected.var_names)

        shadow.close()
230 | 
231 | 
@pytest.mark.usefixtures("filepath_h5mu")
class TestMuData:
    """Shape and slicing checks for MuDataShadow on an HDF5 MuData file."""

    def test_mudata_simple(self, mdata, filepath_h5mu):
        """The shadow reports the same shape as the written object."""
        mdata.write(filepath_h5mu)

        shadow = MuDataShadow(filepath_h5mu)

        assert mdata.shape == shadow.shape

        shadow.close()

    def test_anndata_inside_mudata(self, mdata, filepath_h5mu):
        """Each modality can be opened on its own via <file>/mod/<name>."""
        mdata.write(filepath_h5mu)

        # Both modalities are opened before anything is checked or closed
        shadows = {
            name: AnnDataShadow(Path(filepath_h5mu) / "mod" / name) for name in ("x", "y")
        }

        for name, shadow in shadows.items():
            assert shadow.shape == mdata[name].shape

        for shadow in shadows.values():
            shadow.close()

    def test_slicing_mudata_int(self, mdata, filepath_h5mu):
        """Range slices on one or both axes give views of the expected shape."""
        n, d = mdata.shape
        mdata.write(filepath_h5mu)

        shadow = MuDataShadow(filepath_h5mu)

        cases = (
            (np.s_[:10, :5], (10, 5)),
            (np.s_[:11, :], (11, d)),
            (np.s_[:, :7], (n, 7)),
        )
        for index, expected_shape in cases:
            shadow_view = shadow[index]
            assert shadow_view.shape == expected_shape

        shadow.close()

    def test_slicing_mudata_str(self, mdata, filepath_h5mu):
        """Selecting variables by name keeps the requested order."""
        n, d = mdata.shape
        mdata.write(filepath_h5mu)

        shadow = MuDataShadow(filepath_h5mu)

        wanted = ["x3", "y5", "x7", "y9"]
        shadow_view = shadow[:, wanted]
        assert shadow_view.shape == (n, 4)
        assert shadow_view.var_names.to_list() == ["x3", "y5", "x7", "y9"]

        shadow.close()
290 | 


--------------------------------------------------------------------------------
/src/shadows/mudatashadow.py:
--------------------------------------------------------------------------------
  1 | from functools import cached_property
  2 | from pathlib import Path
  3 | 
  4 | import numpy as np
  5 | 
  6 | # For simplicity, use AnnData read_elem/write_elem
  7 | from anndata._core.index import _normalize_indices
  8 | 
  9 | from .anndatashadow import AnnDataShadow
 10 | from .datashadow import DataShadow
 11 | from .elemshadow import ElemShadow
 12 | 
 13 | 
 14 | class MuDataShadow(DataShadow):
 15 |     def __init__(self, filepath, *args, **kwargs):
 16 |         super().__init__(filepath, *args, **kwargs)
 17 |         mods = list(self.file["mod"].keys())
 18 | 
 19 |         modorder = mods
 20 |         if "mod-oder" in self.file["mod"].attrs:
 21 |             modorder_raw = self.file["mod"].attrs["mod-order"]
 22 |             if all(m in mods for m in modorder_raw):
 23 |                 modorder = [m for m in modorder_raw if m in mods]
 24 | 
 25 |         kwargs["parent_format"] = self._format
 26 |         try:
 27 |             self.mod = {
 28 |                 k: AnnDataShadow(Path(filepath) / "mod" / k, *args, **kwargs) for k in modorder
 29 |             }
 30 |         except (FileNotFoundError, TypeError) as e:
 31 |             # fsspec.mapping.FSMap
 32 |             try:
 33 |                 from fsspec.mapping import FSMap
 34 | 
 35 |                 if not isinstance(filepath, FSMap):
 36 |                     raise NotImplementedError(
 37 |                         "remote storage support has only been implemented for FSMap interface"
 38 |                     )
 39 |                 if filepath.fs.__class__.__name__ != "S3FileSystem":
 40 |                     raise NotImplementedError(
 41 |                         "fsspec.mapping.FSMap has only been implemented for S3FileSystem"
 42 |                     )
 43 | 
 44 |                 mapper = filepath.fs.get_mapper
 45 |                 self.mod = {
 46 |                     k: AnnDataShadow(
 47 |                         mapper(str(Path(filepath.root) / "mod" / k)),
 48 |                         format=self._format,
 49 |                         *args,
 50 |                         **kwargs,
 51 |                     )
 52 |                     for k in modorder
 53 |                 }
 54 |             except Exception:
 55 |                 raise e
 56 | 
 57 |         self.n_mod = len(self.mod)
 58 |         self.mask = None
 59 | 
 60 |         self._axis = 0
 61 |         if self.file:
 62 |             if "axis" in self.file[self.root].attrs:
 63 |                 self._axis = self.file[self.root].attrs["axis"]
 64 | 
 65 |         # To handle scanpy plotting calls and other tools
 66 |         self.raw = None
 67 | 
 68 |     @classmethod
 69 |     def _init_as_view(cls, shadow, oidx, vidx):
 70 |         if shadow._format == "zarr":
 71 |             filename = shadow.file.store.path
 72 |             mode = "r+" if not shadow.file.read_only else "r"
 73 |         elif shadow._format == "parquet":
 74 |             filename = shadow.file.path
 75 |             mode = "r+"  # FIXME
 76 |         else:
 77 |             filename = shadow.file.filename
 78 |             mode = shadow.file.mode
 79 | 
 80 |         if shadow.root != "/":
 81 |             filename = str(Path(filename) / shadow.root)
 82 |         view = MuDataShadow(
 83 |             filename,
 84 |             array_backend=shadow._array_backend,
 85 |             table_backend=shadow._table_backend,
 86 |             mode=mode,
 87 |             format=shadow._format,
 88 |         )
 89 | 
 90 |         # NOTE: Cache is not preserved in a new object
 91 | 
 92 |         view._is_view = True
 93 |         view._ref = shadow
 94 |         view._oidx = oidx
 95 |         view._vidx = vidx
 96 | 
 97 |         if shadow.is_view:
 98 |             view._ref = shadow._ref
 99 |             for attr, idx in (("_oidx", oidx), ("_vidx", vidx)):
100 |                 shadow_idx = getattr(shadow, attr)
101 |                 if shadow_idx is not None:
102 |                     n_attr = shadow._ref.n_obs if attr == "_oidx" else shadow._ref.n_vars
103 |                     if isinstance(shadow_idx, slice) and isinstance(idx, int | np.integer | slice):
104 |                         r = range(*shadow_idx.indices(n_attr)).__getitem__(idx)
105 |                         if isinstance(r, int | np.integer):
106 |                             setattr(view, attr, np.array([r]))
107 |                         setattr(view, attr, slice(r.start, r.stop, r.step))
108 |                     elif isinstance(shadow_idx, slice):
109 |                         setattr(view, attr, np.arange(*shadow_idx.indices(shadow._ref.n_obs))[idx])
110 |                     elif hasattr(shadow_idx.dtype, "type") and issubclass(
111 |                         shadow_idx.dtype.type, np.bool_
112 |                     ):
113 |                         if hasattr(idx.dtype, "type") and issubclass(idx.dtype.type, np.bool_):
114 |                             view_idx = shadow_idx.copy()
115 |                             view_idx[view_idx] = idx
116 |                             setattr(view, attr, view_idx)
117 |                         else:
118 |                             setattr(view, attr, shadow_idx[np.where(idx)[0]])
119 |                     else:
120 |                         setattr(view, attr, shadow_idx[idx])
121 | 
122 |         for mod, modality in view.mod.items():
123 |             # Subsetting doesn't depend on axis:
124 |             # axis implicitly influences .obsmap / .varmap
125 |             if isinstance(oidx, slice) and oidx.start is None and oidx.stop is None:
126 |                 mod_obs = oidx
127 |             else:
128 |                 mod_obs = shadow.obsmap[mod][oidx]
129 |                 if hasattr(mod_obs, "columns") and mod in mod_obs.columns:
130 |                     mod_obs = mod_obs[mod].values
131 |                 mod_obs = mod_obs[mod_obs != 0] - 1
132 | 
133 |             if isinstance(vidx, slice) and vidx.start is None and vidx.stop is None:
134 |                 mod_vars = vidx
135 |             else:
136 |                 mod_vars = shadow.varmap[mod][vidx]
137 |                 if hasattr(mod_obs, "columns") and mod in mod_obs.columns:
138 |                     mod_obs = mod_obs[mod].values
139 |                 mod_vars = mod_vars[mod_vars != 0] - 1
140 | 
141 |             view.mod[mod] = modality[mod_obs, mod_vars]
142 |             view.mod[mod]._ref = shadow[mod]
143 |             if hasattr(modality.file, "close") and callable(modality.file.close):
144 |                 modality.file.close()
145 | 
146 |             # TODO: avoid creating a non-view AnnData connection
147 |             # in the MuDataShadow() constructor above
148 | 
149 |         return view
150 | 
151 |     @cached_property
152 |     def _obsmap(self):
153 |         group_storage = (
154 |             self.file[self.root]["obsmap"] if "obsmap" in self.file[self.root] else dict()
155 |         )
156 |         return ElemShadow(
157 |             group_storage,
158 |             key=str(Path(self.root) / "obsmap"),
159 |             cache=self.__dict__,
160 |             n_obs=self.n_obs,
161 |             n_vars=self.n_vars,
162 |             array_backend=self._array_backend,
163 |             table_backend=self._table_backend,
164 |             is_view=self.is_view,
165 |             idx=(self._oidx, None),
166 |         )
167 | 
168 |     @property
169 |     def obsmap(self):
170 |         return self._obsmap
171 | 
172 |     @cached_property
173 |     def _varmap(self):
174 |         group_storage = (
175 |             self.file[self.root]["varmap"] if "varmap" in self.file[self.root] else dict()
176 |         )
177 |         return ElemShadow(
178 |             group_storage,
179 |             key=str(Path(self.root) / "varmap"),
180 |             cache=self.__dict__,
181 |             n_obs=self.n_obs,
182 |             n_vars=self.n_vars,
183 |             array_backend=self._array_backend,
184 |             table_backend=self._table_backend,
185 |             is_view=self.is_view,
186 |             idx=(None, self._vidx),
187 |         )
188 | 
189 |     @property
190 |     def varmap(self):
191 |         return self._varmap
192 | 
193 |     def clear_cache(self):
194 |         super().clear_cache()
195 |         for modality in self.mod.values():
196 |             modality.clear_cache()
197 | 
198 |     def close(self, close_modalities: bool = True):
199 |         if close_modalities:
200 |             for modality in self.mod.values():
201 |                 modality.close()
202 |         super().close()
203 | 
204 |     def reopen(self, mode: str):
205 |         if not self.file or mode != self.file.mode:
206 |             file = self.file.filename
207 |             super().reopen(mode=mode)
208 |             for modality in self.mod.values():
209 |                 modality.reopen(mode=mode, file=file)
210 |         else:
211 |             return self
212 | 
213 |         # Update ._group in all elements
214 |         for key in ["mod"]:
215 |             elem = getattr(self, key)
216 |             if isinstance(elem, ElemShadow):
217 |                 elem._update_group(self.file[str(Path(self.root) / key)])
218 | 
219 |         return self
220 | 
221 |     def __repr__(self):
222 |         if self.is_view:
223 |             if self._ref is not None:
224 |                 s = f"View of MuData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars} (original {self._ref.n_obs} × {self._ref.n_vars})\n"
225 |             else:
226 |                 s = f"View of MuData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n"
227 |         else:
228 |             s = f"MuData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n"
229 | 
230 |         s += "\n".join(["  " + line for line in super().__repr__().strip().split("\n")]) + "\n"
231 | 
232 |         # obsmap and varmap
233 |         for k in ["obsmap", "varmap"]:
234 |             item = getattr(self, k)
235 |             if len(item) > 0:
236 |                 s += "  " + item.__repr__()
237 | 
238 |         s += f"  mod:\t{self.n_mod} modalit{'ies' if self.n_mod > 1 else 'y'}\n"
239 |         for m, modality in self.mod.items():
240 |             m_repr = modality.__repr__().strip().split("\n")[1:]
241 |             s += f"    {m}: {modality.n_obs} x {modality.n_vars}\n"
242 |             s += "\n".join(["      " + line for line in m_repr]) + "\n"
243 |         return s
244 | 
245 |     # Writing
246 | 
247 |     def _push_changes(self, clear_cache: bool = False):
248 |         super()._push_changes(clear_cache=clear_cache)
249 |         for modality in self.mod.values():
250 |             modality._push_changes(
251 |                 clear_cache=clear_cache,
252 |             )
253 | 
254 |     # Views
255 | 
256 |     def __getitem__(self, index):
257 |         if isinstance(index, str):
258 |             return self.mod[index]
259 |         oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
260 |         return MuDataShadow._init_as_view(self, oidx, vidx)
261 | 
262 |     #
263 |     # Same as for AnnData above:
264 |     # in the absence of duck typing in most tools,
265 |     # the solution is to mock the class.
266 |     #
267 | 
268 |     @property
269 |     def __class__(self):
270 |         try:
271 |             from mudata import MuData
272 | 
273 |             return MuData
274 |         except ModuleNotFoundError:
275 |             return MuDataShadow
276 | 


--------------------------------------------------------------------------------
/src/shadows/elemshadow.py:
--------------------------------------------------------------------------------
  1 | from collections.abc import MutableMapping
  2 | from functools import cached_property, partial
  3 | from pathlib import Path
  4 | from typing import get_args
  5 | from warnings import warn
  6 | 
  7 | # For simplicity, use AnnData read_elem/write_elem
  8 | from anndata._io.specs import write_elem
  9 | from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup
 10 | 
 11 | from .compat import PqArray, PqGroup, read_elem
 12 | 
# Union aliases over the array / group classes of the storage backends
# (Zarr and HDF5 from anndata.compat, Pq* from .compat)
ArrayStorageType = ZarrArray | H5Array | PqArray
GroupStorageType = ZarrGroup | H5Group | PqGroup
StorageType = ArrayStorageType | GroupStorageType

# Glyphs used as status markers in object summaries:
# RUNECACHED - value is cached; RUNECACHEDALT - cached value differs from the one read
# NOTE(review): RUNENEW presumably marks elements that are not in the file yet — confirm
RUNECACHED = "\u1401"
RUNECACHEDALT = "\u25bc"
RUNENEW = "\u25b2"
 20 | 
 21 | 
 22 | class LazyReader:
 23 |     def __init__(self, reader, data):
 24 |         self.reader = reader
 25 |         self.data = data
 26 |         self.f = lambda data, slice: reader(data[slice])
 27 |         self.partial = partial(self.f, self.data)
 28 | 
 29 |     def __call__(self, value):
 30 |         return self.partial(value)
 31 | 
 32 |     def __getitem__(self, value):
 33 |         return self.partial(value)
 34 | 
 35 | 
 36 | def _get_backend_reader(backend, lazy: bool = False):
 37 |     if callable(backend):
 38 |         reader = backend
 39 |     else:
 40 |         if backend == "numpy":
 41 |             import numpy as np
 42 | 
 43 |             # TODO: Handle sparsity
 44 |             reader = np.array
 45 | 
 46 |         elif backend == "jax":
 47 |             import jax.numpy as jnp
 48 | 
 49 |             reader = jnp.array
 50 | 
 51 |         elif backend == "torch" or backend == "pytorch":
 52 |             import torch
 53 | 
 54 |             reader = torch.Tensor
 55 | 
 56 |         elif backend == "pandas":
 57 |             import pandas as pd
 58 | 
 59 |             reader = pd.DataFrame
 60 | 
 61 |         elif backend == "polars":
 62 |             import polars as pl
 63 | 
 64 |             reader = pl.from_dict
 65 | 
 66 |         elif backend == "arrow" or backend == "pyarrow":
 67 |             import pyarrow as pa
 68 | 
 69 |             reader = pa.Table.from_pydict
 70 | 
 71 |         else:
 72 |             return NotImplementedError
 73 | 
 74 |     if lazy:
 75 |         base_reader = reader
 76 | 
 77 |         def reader(data):
 78 |             return LazyReader(base_reader, data)
 79 | 
 80 |     return reader
 81 | 
 82 | 
 83 | class EmptySlot:
 84 |     def __init__(self):
 85 |         pass
 86 | 
 87 |     def __len__(self):
 88 |         return 0
 89 | 
 90 |     def __repr__(self):
 91 |         return ""
 92 | 
 93 | 
 94 | class ElemShadow(MutableMapping):
 95 |     def __init__(
 96 |         self,
 97 |         group_storage,
 98 |         key: str,
 99 |         cache: dict | None = None,
100 |         n_obs: int | None = None,
101 |         n_vars: int | None = None,
102 |         array_backend: str = "numpy",
103 |         table_backend: str = "pandas",
104 |         is_view: bool | None = False,
105 |         idx=None,
106 |     ):
107 |         self._group = group_storage
108 |         self._key = key
109 |         self._cache = cache
110 |         self._n_obs = n_obs
111 |         self._n_vars = n_vars
112 | 
113 |         try:
114 |             self._elems = list(self._group.keys())
115 |         except AttributeError as e:
116 |             # This block below is only to handle legacy files
117 |             # where this can be a structured array.
118 |             # Legacy file support will get deprecated in later versions.
119 |             import numpy as np
120 | 
121 |             in_memory = np.array(self._group)
122 |             fields = in_memory.dtype.fields
123 |             if fields is not None:
124 |                 self._elems = list(fields.keys())
125 |                 if self._key not in cache:
126 |                     self._cache[self._key] = dict()
127 |                     for value in self._elems:
128 |                         value_path = str(Path(self._key) / value)
129 |                         value_out = in_memory[value]
130 | 
131 |                         key_name = Path(self._key).name
132 |                         if is_view:
133 |                             oidx, vidx = idx
134 |                             if self._key.endswith("layers"):
135 |                                 if oidx is not None and vidx is not None:
136 |                                     value_out = value_out[oidx, vidx]
137 |                                 elif oidx is not None:
138 |                                     value_out = value_out.__getitem__(oidx)
139 |                                 elif vidx is not None:
140 |                                     value_out = value_out[:, vidx]
141 |                             elif key_name.startswith("obs"):
142 |                                 if oidx is not None:
143 |                                     value_out = value_out.__getitem__(oidx)
144 |                                     if key_name == "obsp":
145 |                                         value_out = value_out[:, oidx]
146 |                             elif key_name.startswith("var"):
147 |                                 if vidx is not None:
148 |                                     value_out = value_out.__getitem__(vidx)
149 |                                     if key_name == "varp":
150 |                                         value_out = value_out[:, vidx]
151 | 
152 |                         self._cache[value_path] = value_out
153 |             else:
154 |                 raise AttributeError("Cannot handle this legacy file: " + str(e)) from e
155 | 
156 |         self._newelems = dict()
157 |         self._nested = dict()
158 | 
159 |         self._array_backend = array_backend
160 |         self._table_backend = table_backend
161 | 
162 |         self.is_view = is_view
163 |         self._idx = idx
164 | 
165 |     def __getitem__(self, value):
166 |         value_path = str(Path(self._key) / value)
167 |         if value_path in self._cache:
168 |             return self._cache[value_path]
169 |         elif value in self._newelems:
170 |             return self._newelems[value]
171 |         else:
172 |             value_elem = self._group[value]
173 |             # is_group = type(value_elem).__name__ == 'Group'  # h5py.Group, zarr.hierarchy.Group
174 |             is_group = isinstance(value_elem, get_args(GroupStorageType))
175 | 
176 |             # Return the nested ElemShadow
177 |             if value_path in self._nested:
178 |                 return self._nested[value_path]
179 | 
180 |             # Directly read it if it is a scalar dataset
181 |             # NOTE: Sparse matrices and data frames are groups
182 |             elif not is_group and value_elem.shape == ():
183 |                 value_out = self._group[value][()]
184 |                 if isinstance(value_out, bytes):
185 |                     try:
186 |                         # bytes -> string
187 |                         value_out = value_out.decode()
188 |                     except AttributeError:
189 |                         pass
190 | 
191 |             elif self._array_backend == "numpy" and self._table_backend == "pandas":
192 |                 # HOTFIX
193 |                 if self._group[value].__class__.__module__ == "pqdata.core":
194 |                     value_out = read_elem(self._group[value], _format="parquet")
195 |                 else:
196 |                     value_out = read_elem(self._group[value])
197 | 
198 |             else:
199 |                 if (
200 |                     "encoding-type" in value_elem.attrs
201 |                     and value_elem.attrs["encoding-type"] == "array"
202 |                 ):
203 |                     reader = _get_backend_reader(self._array_backend)
204 |                 elif (
205 |                     "encoding-type" in value_elem.attrs
206 |                     and value_elem.attrs["encoding-type"] == "dataframe"
207 |                 ):
208 |                     reader = _get_backend_reader(self._table_backend)
209 |                 else:
210 |                     reader = _get_backend_reader(self._array_backend)
211 |                 # TODO: avoid reading the whole dataset
212 |                 if isinstance(self._group, PqGroup):
213 |                     value_out = read_elem(self._group[value], _format="parquet")
214 |                     try:
215 |                         value_out = reader(value_out)
216 |                     except ValueError as e:
217 |                         if hasattr(value_out, "todense") and callable(value_out.todense):
218 |                             value_out = reader(value_out.todense())
219 |                         else:
220 |                             raise e
221 |                 else:
222 |                     try:
223 |                         value_out = reader(self._group[value][:])
224 |                     except TypeError:
225 |                         # e.g. sparse matrices
226 |                         value_out = read_elem(self._group[value])
227 |                         try:
228 |                             value_out = reader(value_out)
229 |                         except ValueError as e:
230 |                             if hasattr(value_out, "todense") and callable(value_out.todense):
231 |                                 value_out = reader(value_out.todense())
232 |                             else:
233 |                                 raise e
234 | 
235 |             # slicing behaviour depends on the attribute
236 |             key_name = Path(self._key).name
237 |             if self.is_view:
238 |                 oidx, vidx = self._idx
239 |                 if self._key.endswith("layers"):
240 |                     if oidx is not None and vidx is not None:
241 |                         value_out = value_out[oidx, vidx]
242 |                     elif oidx is not None:
243 |                         value_out = value_out.__getitem__(oidx)
244 |                     elif vidx is not None:
245 |                         value_out = value_out[:, vidx]
246 |                 elif key_name.startswith("obs"):
247 |                     if oidx is not None:
248 |                         value_out = value_out.__getitem__(oidx)
249 |                         if key_name == "obsp":
250 |                             value_out = value_out[:, oidx]
251 |                 elif key_name.startswith("var"):
252 |                     if vidx is not None:
253 |                         value_out = value_out.__getitem__(vidx)
254 |                         if key_name == "varp":
255 |                             value_out = value_out[:, vidx]
256 | 
257 |             self._cache[value_path] = value_out
258 |             return value_out
259 | 
260 |     def __setitem__(self, key, value):
261 |         value_path = str(Path(self._key) / key)
262 | 
263 |         if self._key.endswith("obsm") or self._key.endswith("obsp") or self._key.endswith("layers"):
264 |             if self._n_obs is None:
265 |                 if key in self._elems:
266 |                     self._n_obs = self._group[key].shape[0]
267 | 
268 |             if self._n_obs is not None:
269 |                 assert value.shape[0] == self._n_obs, "Shape mismatch"
270 |                 if self._key.endswith("obsp"):
271 |                     assert value.shape[1] == self._n_obs, "Shape mismatch"
272 | 
273 |         if self._key.endswith("varm") or self._key.endswith("varp") or self._key.endswith("layers"):
274 |             if self._n_vars is None:
275 |                 if key in self._elems:
276 |                     self._n_vars = self._group[key].shape[0]
277 | 
278 |             if self._n_vars is not None:
279 |                 if self._key.endswith("layers"):
280 |                     assert value.shape[1] == self._n_vars, "Shape mismatch"
281 |                 else:  # varm, varp
282 |                     assert value.shape[0] == self._n_vars, "Shape mismatch"
283 |                     if self._key.endswith("varp"):
284 |                         assert value.shape[1] == self._n_vars, "Shape mismatch"
285 | 
286 |         if key in self._elems:
287 |             if isinstance(self._group[key], get_args(GroupStorageType)):
288 |                 self._nested[value_path] = value
289 |             else:
290 |                 self._cache[value_path] = value
291 |         else:
292 |             self._newelems[key] = value
293 | 
294 |     def __delitem__(self, key):
295 |         if key in self._newelems:
296 |             del self._newelems[key]
297 |         else:
298 |             raise NotImplementedError("Cannot delete data " "that already exists in the file")
299 | 
300 |     def __contains__(self, value):
301 |         if value in self._elems or value in self._newelems:
302 |             return True
303 |         return False
304 | 
305 |     def __iter__(self):
306 |         all_keys = self._elems + list(self._newelems.keys())
307 |         for i, key in enumerate(all_keys):
308 |             yield key, self[key]
309 | 
310 |     def keys(self):
311 |         return self._elems + list(self._newelems.keys())
312 | 
313 |     def values(self):
314 |         all_keys = self._elems + list(self._newelems.keys())
315 |         for i, key in enumerate(all_keys):
316 |             yield key, self[key]
317 | 
318 |     def items(self):
319 |         for key in self._elems:
320 |             yield key, self[key]
321 | 
322 |         for key, value in self._newelems.items():
323 |             yield key, value
324 | 
325 |     def __len__(self):
326 |         return len(self._elems) + len(self._newelems)
327 | 
328 |     def __repr__(self):
329 |         s = ""
330 |         key_elems_str, new_elems_str = [], []
331 | 
332 |         if len(self._elems) > 0:
333 |             key_elems_cached = [str(Path(self._key) / e) in self._cache for e in self._elems]
334 |             key_elems_cached_str = [RUNECACHED if e_cached else "" for e_cached in key_elems_cached]
335 |             # TODO: RUNECACHEDALT
336 |             key_elems_str = list(
337 |                 map(lambda xs: "".join(xs), zip(self._elems, key_elems_cached_str))
338 |             )
339 | 
340 |         if len(self._newelems) > 0:
341 |             new_elems_str = [f"{e}{RUNENEW}" for e in self._newelems.keys()]
342 | 
343 |         all_elems_str = key_elems_str + new_elems_str
344 |         if len(all_elems_str) > 0:
345 |             s += f"{Path(self._key).name}:\t{', '.join(all_elems_str)}\n"
346 | 
347 |         return s
348 | 
349 |     # Writing
350 | 
351 |     def _push_changes(self, clear_cache: bool = False):
352 |         if len(self._newelems) > 0:
353 |             keys = list(self._newelems.keys())
354 |             for key in keys:
355 |                 write_elem(self._group, key, self._newelems[key])
356 |                 if not clear_cache:
357 |                     self._cache[str(Path(self._key) / key)] = self._newelems[key]
358 |                 del self._newelems[key]
359 |             self._elems = list(self._group.keys())
360 | 
361 |     def _update_group(self, group):
362 |         self._group = group
363 |         for elem in self._nested.values():
364 |             elem._update_group(group)
365 | 
366 | 
class RawElemShadow(ElemShadow):
    """Shadow of a ``raw`` group exposing ``X``, ``var``, ``var_names`` and ``varm``.

    Takes the same arguments as ``ElemShadow`` plus ``file``, which is stored
    on the instance. ``_push_changes`` never writes anything: it only warns.
    """

    def __init__(
        self,
        group_storage,
        key: str,
        file: str,
        cache: dict | None = None,
        n_obs: int | None = None,
        n_vars: int | None = None,
        array_backend: str = "numpy",
        table_backend: str = "pandas",
        is_view: bool = False,
        idx=None,
    ):
        super().__init__(
            group_storage=group_storage,
            key=key,
            cache=cache,
            n_obs=n_obs,
            n_vars=n_vars,
            array_backend=array_backend,
            table_backend=table_backend,
            is_view=is_view,
            idx=idx,
        )
        self.file = file
        self._ids = {"self": id(self)}

    @cached_property
    def _X(self):
        return self.__getitem__("X")

    @property
    def X(self):
        """The ``X`` element, read through ``__getitem__`` and cached."""
        return self._X

    @cached_property
    def _var(self):
        return self.__getitem__("var")

    @property
    def var(self):
        """The ``var`` element, read through ``__getitem__`` and cached."""
        return self._var

    @cached_property
    def _var_names(self):
        var = self._group["var"]
        # The name of the index column may be overridden by an attribute.
        index = var.attrs["_index"] if "_index" in var.attrs else "_index"
        if self.is_view and len(self._idx) > 1 and self._idx[1] is not None:
            return var[index][self._idx[1]]
        return var[index][:]

    @property
    def var_names(self):
        """Values of the ``var`` index column, subset by the var selector for views."""
        return self._var_names

    @cached_property
    def __n_obs(self):
        x = self._group["X"]
        if isinstance(x, get_args(ArrayStorageType)):
            n_obs = x.shape[0]
        else:
            # X stored as a group: take the shape from its attributes
            n_obs = x.attrs["shape"][0]

        if self.is_view and self._idx[0] is not None:
            oidx = self._idx[0]
            if isinstance(oidx, slice):
                n_obs = len(range(n_obs).__getitem__(oidx))
            else:
                n_obs = len(oidx)

        return n_obs

    @property
    def n_obs(self):
        """Number of observations: the value given at construction, else derived from ``X``."""
        if self._n_obs is None:
            return self.__n_obs
        return self._n_obs

    @cached_property
    def __n_vars(self):
        # NOTE(review): unlike __n_obs, the view selector is not applied here — confirm intent.
        if "var" in self._group:
            var = self._group["var"]
            if isinstance(var, get_args(ArrayStorageType)):
                n_vars = var.shape[0]

            else:
                index = "_index"
                if "_index" in var.attrs:
                    index = var.attrs["_index"]

                n_vars = var[index].shape[0]
        else:
            x = self._group["X"]
            if isinstance(x, get_args(ArrayStorageType)):
                n_vars = x.shape[1]
            else:
                n_vars = x.attrs["shape"][1]

        self._n_vars = n_vars
        return n_vars

    @property
    def n_vars(self):
        """Number of variables: the stored value, else derived from ``var`` or ``X``."""
        if self._n_vars is None:
            return self.__n_vars
        return self._n_vars

    @property
    def shape(self):
        """``(n_obs, n_vars)``."""
        return self.n_obs, self.n_vars

    @cached_property
    def _varm(self):
        storage_group = self._group["varm"] if "varm" in self._elems else dict()
        return ElemShadow(
            storage_group,
            key=str(Path(self._group.name) / "varm"),
            cache=self.__dict__,
            n_obs=self.n_obs,
            n_vars=self.n_vars,
            array_backend=self._array_backend,
            table_backend=self._table_backend,
            is_view=self.is_view,
            # The selectors live in ``_idx``; neither this class nor
            # ElemShadow defines an ``idx`` attribute.
            idx=self._idx,
        )

    @property
    def varm(self):
        """``ElemShadow`` over the ``varm`` group (over an empty dict when it is absent)."""
        return self._varm

    # No writing: .raw is always read-only

    def _push_changes(self, *args, **kwargs):
        """Ignore all arguments and warn that nothing will be written."""
        warn("Raw object is always read-only. No changes will be written.")
504 | 


--------------------------------------------------------------------------------
/src/shadows/datashadow.py:
--------------------------------------------------------------------------------
  1 | import ctypes
  2 | import logging
  3 | from functools import cached_property
  4 | from os import PathLike
  5 | from pathlib import Path
  6 | from typing import Literal, get_args
  7 | from warnings import warn
  8 | 
  9 | # FIXME: import only when needed
 10 | import h5py
 11 | from anndata._core.index import _normalize_indices
 12 | 
 13 | # For simplicity, use AnnData read_elem/write_elem
 14 | from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup
 15 | 
 16 | from .compat import PqArray, PqGroup, read_elem
 17 | from .elemshadow import ElemShadow, _get_backend_reader
 18 | 
# FIXME: in anndata._types now
# Union aliases over the supported storage classes (Zarr*, H5*, Pq*).
# They are unpacked with typing.get_args() for isinstance() checks in this module.
ArrayStorageType = ZarrArray | H5Array | PqArray
GroupStorageType = ZarrGroup | H5Group | PqGroup
StorageType = ArrayStorageType | GroupStorageType
 23 | 
 24 | 
 25 | RUNECACHED = "\u1401"
 26 | FORMAT_MAP = {
 27 |     "h5": "hdf5",
 28 |     "hdf5": "hdf5",
 29 |     "zarr": "zarr",
 30 |     "pq": "parquet",
 31 |     "pqdata": "parquet",
 32 | }
 33 | 
 34 | 
 35 | class DataShadow:
 36 |     def __init__(
 37 |         self,
 38 |         filepath: PathLike,
 39 |         array_backend: str = "numpy",
 40 |         table_backend: str = "pandas",
 41 |         mode: str = "r",
 42 |         format: Literal["hdf5", "zarr", "parquet", "h5", "pq", "pqdata"] | None = None,
 43 |         lazy: bool = False,
 44 |         parent_format: str | None = None,
 45 |     ):
 46 |         # unify types
 47 |         fpstr = str(filepath)
 48 |         if filepath.__class__.__name__ == "OpenFile":
 49 |             # OpenFile<'file_path'>
 50 |             fpstr = str(filepath.path)
 51 |         elif filepath.__class__.__name__ == "FSMap":
 52 |             # <fsspec.mapping.FSMap at 0x...>
 53 |             fpstr = str(filepath.root)
 54 |         fpath = Path(fpstr)
 55 | 
 56 |         if format is None:
 57 |             logging.info("No format provided, trying to infer from the file extension")
 58 |             if fpath.suffix == ".zarr":
 59 |                 format = "zarr"
 60 |             elif fpath.suffix == ".pqdata":
 61 |                 format = "parquet"
 62 |             else:
 63 |                 # NOTE: prioritizing the file extension over the parent format
 64 |                 # allows to mix formats, e.g. store modalities in .zarr or .hdf5 files
 65 |                 if parent_format is not None:
 66 |                     format = parent_format
 67 |                 else:
 68 |                     format = "hdf5"
 69 | 
 70 |         # map the shorthands to the full names
 71 |         format = FORMAT_MAP.get(format, format)
 72 | 
 73 |         # Auto-detect the format for nested modalities
 74 |         # (e.g. m.zarr/mod/x, m.pqdata/mod/y)
 75 |         if "zarr" in fpstr or "pqdata" in fpstr and fpath.suffix not in (".zarr", ".pqdata"):
 76 |             i = 1
 77 |             while i <= fpstr.count("/"):
 78 |                 path_elements = list(map(lambda x: x[::-1], fpstr[::-1].split("/", i)))
 79 |                 filename, root = path_elements[-1], str(
 80 |                     Path(path_elements[-2]).joinpath(*path_elements[:-2][::-1])
 81 |                 )
 82 |                 if Path(filename).suffix == ".zarr":
 83 |                     format = "zarr"
 84 |                     break
 85 |                 elif Path(filename).suffix == ".pqdata":
 86 |                     format = "parquet"
 87 |                     break
 88 |                 i += 1
 89 | 
 90 |         if format == "hdf5":
 91 |             import h5py
 92 |         elif format == "zarr":
 93 |             import zarr
 94 |         elif format == "parquet":
 95 |             import pqdata
 96 | 
 97 |         if fpath.exists():
 98 |             if format == "zarr":
 99 |                 self.file = zarr.open(fpath, mode=mode)
100 |             elif format == "parquet":
101 |                 self.file = pqdata.open(fpath, mode=mode)
102 |             else:
103 |                 # fallback to hdf5 by default
104 |                 if format != "hdf5":
105 |                     warn(
106 |                         f"Falling back to hdf5, provided format is '{format}' and not 'hdf5' or 'zarr'"
107 |                     )
108 |                 self.file = h5py.File(fpath, mode=mode)
109 |             self.root = "/"
110 |         else:
111 |             root = "/"
112 |             file_exists = False
113 |             i = 1
114 |             while not file_exists and i <= fpstr.count("/"):
115 |                 path_elements = list(map(lambda x: x[::-1], fpstr[::-1].split("/", i)))
116 |                 filename, root = path_elements[-1], str(
117 |                     Path(path_elements[-2]).joinpath(*path_elements[:-2][::-1])
118 |                 )
119 |                 file_exists = Path(filename).exists()
120 |                 i += 1
121 |             if file_exists:
122 |                 format = FORMAT_MAP.get(Path(filename).suffix[1:], format)
123 |                 if format == "zarr":
124 |                     self.file = zarr.open(filename, mode=mode)
125 |                 elif format == "parquet":
126 |                     self.file = pqdata.open(filename, mode=mode)
127 |                 else:
128 |                     # fallback to hdf5 by default
129 |                     if format != "hdf5":
130 |                         warn(
131 |                             f"Falling back to hdf5, provided format is '{format}' and not 'hdf5 or 'zarr'"
132 |                         )
133 |                     self.file = h5py.File(filename, mode=mode)
134 |                 self.root = root
135 |                 # Maybe prepend /mod to the modality name
136 |                 if root not in self.file and f"/mod/{root}" in self.file:
137 |                     self.root = f"/mod/{root}"
138 |             elif (
139 |                 filepath.__class__.__name__ == "BufferedReader"
140 |                 or filepath.__class__.__name__ == "OpenFile"
141 |                 or filepath.__class__.__name__ == "FSMap"
142 |             ):
143 |                 # fsspec support
144 |                 fname = filepath
145 |                 try:
146 |                     from fsspec.core import OpenFile
147 | 
148 |                     if isinstance(filepath, OpenFile):
149 |                         fname = filepath.__enter__()
150 |                         self._callback = fname.__exit__()
151 |                 except ImportError as e:
152 |                     raise ImportError(
153 |                         "To read from remote storage or cache, install fsspec: pip install fsspec"
154 |                     ) from e
155 | 
156 |                 if format == "zarr":
157 |                     self.file = zarr.open(fname, mode=mode)
158 |                 elif format == "parquet":
159 |                     self.file = pqdata.open(fname, mode=mode)
160 |                 else:
161 |                     raise NotImplementedError(
162 |                         "Only zarr and parquet formats are supported for remote files. "
163 |                         "HDF5 files have to be downloaded first."
164 |                     )
165 |                 self.root = "/"
166 |             else:
167 |                 raise FileNotFoundError(f"File {fpstr} does not seem to exist")
168 |         self._array_backend = array_backend
169 |         self._table_backend = table_backend
170 |         self._ids = {"self": id(self)}
171 |         self._format = format
172 | 
173 |         # View-related attributes
174 |         self._is_view = False
175 |         self._oidx = None
176 |         self._vidx = None
177 | 
178 |         # Laziness behaviour
179 |         self._lazy = lazy
180 | 
181 |     @classmethod
182 |     def _init_as_view(cls, shadow, oidx, vidx):
183 |         if shadow._format == "zarr":
184 |             filename = shadow.file.store.path
185 |             mode = "r+" if not shadow.file.read_only else "r"
186 |         elif shadow._format == "parquet":
187 |             raise NotImplementedError("Parquet format is not supported for views.")
188 |         else:
189 |             filename = shadow.file.filename
190 |             mode = shadow.file.mode
191 | 
192 |         if shadow.root != "/":
193 |             filename = str(Path(filename) / shadow.root)
194 |         view = DataShadow(
195 |             filename,
196 |             array_backend=shadow._array_backend,
197 |             table_backend=shadow._table_backend,
198 |             mode=mode,
199 |             format=shadow._format,
200 |         )
201 | 
202 |         # NOTE: Cache is not preserved in a new object
203 | 
204 |         view._is_view = True
205 |         view._ref = shadow
206 |         view._oidx = oidx
207 |         view._vidx = vidx
208 | 
209 |         if shadow.is_view:
210 |             view._ref = shadow._ref
211 |             if shadow._oidx is not None:
212 |                 if isinstance(shadow._oidx, slice):
213 |                     r = range(*shadow._oidx.indices(shadow._ref.n_obs)).__getitem__(oidx)
214 |                     view._oidx = slice(r.start, r.stop, r.step)
215 |                 else:
216 |                     view._oidx = shadow._oidx[oidx]
217 |             if shadow._vidx is not None:
218 |                 if isinstance(shadow._vidx, slice):
219 |                     r = range(*shadow._vidx.indices(shadow._ref.n_vars)).__getitem__(vidx)
220 |                     view._vidx = slice(r.start, r.stop, r.step)
221 |                 else:
222 |                     view._vidx = shadow._vidx[vidx]
223 | 
224 |         return view
225 | 
226 |     def _annot(self, axis: Literal["obs", "var", 0, 1]):
227 |         if axis not in ("obs", "var", 0, 1):
228 |             raise ValueError(f"axis must be 'obs' or 'var', not {axis}")
229 | 
230 |         if isinstance(axis, int):
231 |             axis = "obs" if axis == 0 else "var"
232 | 
233 |         idx = self._oidx if axis == "obs" else self._vidx
234 | 
235 |         # Use anndata v0.8 spec reader
236 |         reader = _get_backend_reader(self._table_backend, self._lazy)
237 |         annot = self.file[self.root][axis]
238 |         columns = {}
239 | 
240 |         if isinstance(annot, get_args(ArrayStorageType)):
241 |             # Deal with legacy or parquet files
242 | 
243 |             # For legacy files,
244 |             # correct the categories for different backends.
245 |             categories = {}
246 |             if "uns" in self.file:
247 |                 uns_keys = list(self.file["uns"])
248 |                 cat_keys = [key for key in uns_keys if key.endswith("_categories")]
249 |                 categories = {
250 |                     key.removesuffix("_categories"): [e.decode() for e in self.file["uns"][key]]
251 |                     for key in cat_keys
252 |                 }
253 | 
254 |             if self._table_backend == "pandas":
255 |                 from pandas import Categorical, DataFrame
256 | 
257 |                 table = DataFrame(read_elem(annot, _format=self._format))
258 |                 if "_index" in annot.attrs:
259 |                     table = table.set_index(annot.attrs["_index"])
260 |                 elif self._format == "hdf5" and "index" in (e[0] for e in annot.dtype.descr):
261 |                     table = table.set_index("index")
262 | 
263 |                 for column in table.columns:
264 |                     if column in categories:
265 |                         table[column] = Categorical.from_codes(
266 |                             table[column], categories=categories[column]
267 |                         )
268 | 
269 |                 if self.is_view:
270 |                     return table.iloc[idx]
271 | 
272 |                 return table
273 |             elif self._table_backend == "polars":
274 |                 import polars as pl
275 | 
276 |                 cat_map = lambda col: lambda x: pl.Series(categories[col])[x]
277 | 
278 |                 table = read_elem(annot, _format=self._format)
279 |                 table = pl.DataFrame(table)
280 | 
281 |                 for column in table.columns:
282 |                     if column in categories:
283 |                         table = table.with_columns(
284 |                             [pl.col(column).map(cat_map(column)).cast(pl.Categorical).alias(column)]
285 |                         )
286 | 
287 |                 if self.is_view:
288 |                     import numpy as np
289 | 
290 |                     if not isinstance(idx, slice) and (
291 |                         isinstance(idx.dtype, pl.Boolean)
292 |                         or hasattr(idx.dtype, "type")
293 |                         and issubclass(idx.dtype.type, np.bool_)
294 |                     ):
295 |                         return table.filter(idx)
296 |                     return table.__getitem__(idx)
297 | 
298 |                 return table
299 |             elif self._table_backend == "pyarrow":
300 |                 import pandas as pd
301 |                 import pyarrow as pa
302 | 
303 |                 table = read_elem(annot, _format=self._format)
304 |                 table = pd.DataFrame(table)
305 | 
306 |                 for column in table.columns:
307 |                     if column in categories:
308 |                         table[column] = pd.Categorical.from_codes(
309 |                             table[column], categories=categories[column]
310 |                         )
311 | 
312 |                 table = pa.Table.from_pandas(table)
313 | 
314 |                 if self.is_view:
315 |                     import numpy as np
316 | 
317 |                     if (
318 |                         not isinstance(idx, slice)
319 |                         and hasattr(idx.dtype, "type")
320 |                         and issubclass(idx.dtype.type, np.bool_)
321 |                     ):
322 |                         return table.filter(idx)
323 |                     return table.__getitem__(idx)
324 | 
325 |                 return table
326 |             else:
327 |                 raise NotImplementedError(
328 |                     "Alternative backends are not available "
329 |                     "for the legacy AnnData/MuData specification."
330 |                 )
331 | 
332 |         if self._table_backend == "pandas":
333 |             table = read_elem(annot, _format=self._format)
334 | 
335 |             if self.is_view:
336 |                 return table.iloc[idx]
337 | 
338 |             return table
339 | 
340 |         # else (only for AnnData >=0.8)
341 |         for key, value in annot.items():
342 |             if key == "__categories":
343 |                 continue
344 |             col = read_elem(value, _format=self._format)
345 |             if self._table_backend == "polars":
346 |                 if "encoding-type" in value.attrs and value.attrs["encoding-type"] == "categorical":
347 |                     import polars as pl
348 | 
349 |                     col = pl.Series(col.astype(str)).cast(pl.Categorical)
350 |             else:
351 |                 raise NotImplementedError("Alternative backends are not fully supported just yet.")
352 |             columns[key] = col
353 | 
354 |         table = reader(columns)
355 | 
356 |         if self.is_view:
357 |             if self._table_backend == "pandas":
358 |                 return table.iloc[idx]
359 |             return table.__getitem__(idx)
360 | 
361 |         return table
362 | 
363 |     @cached_property
364 |     def _obs(self):
365 |         return self._annot("obs")
366 | 
367 |     @property
368 |     def obs(self):
369 |         return self._obs
370 | 
371 |     @cached_property
372 |     def _var(self):
373 |         return self._annot("var")
374 | 
375 |     @property
376 |     def var(self):
377 |         return self._var
378 | 
379 |     def __names(self, axis: str):
380 |         """
381 |         Internal method to get the names of the obs or var axis
382 |         """
383 |         assert axis in ["obs", "var"], "axis must be 'obs' or 'var'"
384 | 
385 |         from pandas import Index
386 | 
387 |         attr = self.file[self.root][axis]
388 | 
389 |         # Handle legacy
390 |         if isinstance(attr, get_args(ArrayStorageType)):
391 |             attr_df = getattr(self, axis)
392 |             if hasattr(attr_df, "index"):
393 |                 names = attr_df.index
394 |             elif hasattr(attr_df, "column_names"):  # pyarrow
395 |                 if "index" in attr_df.column_names:
396 |                     names = Index(attr_df["index"])
397 |                 elif "__index_level_0__" in attr_df.column_names:
398 |                     names = Index(attr_df["__index_level_0__"])
399 |                 elif hasattr(attr_df, "schema"):
400 |                     if hasattr(attr_df.schema, "metadata") and b"pandas" in attr_df.schema.metadata:
401 |                         import json
402 | 
403 |                         pd_meta = json.loads(attr_df.schema.metadata[b"pandas"])
404 |                         names = Index(attr_df[pd_meta["index_columns"][0]].to_numpy())
405 |                     else:
406 |                         raise ValueError(f"Empty {axis}_names")
407 |             elif hasattr(attr_df, "columns"):
408 |                 if "index" in attr_df.columns:
409 |                     names = Index(attr_df["index"])
410 |                 elif "__index_level_0__" in attr_df.columns:
411 |                     names = Index(attr_df["__index_level_0__"])
412 |                 else:
413 |                     from pyarrow import parquet as pq
414 | 
415 |                     # TODO: Refactor e.g. by implementing read_elem_schema
416 |                     filename = self.file[self.root][axis].path
417 |                     schema = pq.read_schema(filename)
418 | 
419 |                     import json
420 | 
421 |                     try:
422 |                         pd_meta = json.loads(schema.metadata[b"pandas"])
423 |                     except KeyError as e:
424 |                         raise KeyError(f"Metadata from pandas not found in the schema: {e}")
425 | 
426 |                     names = Index(attr_df[pd_meta["index_columns"][0]])
427 |             else:
428 |                 raise ValueError(f"Empty {axis}_names")
429 | 
430 |         else:
431 |             index = "_index"
432 |             if "_index" in attr.attrs:
433 |                 index = attr.attrs["_index"]
434 | 
435 |             try:
436 |                 if self.is_view:
437 |                     indices = self._oidx if axis == "obs" else self._vidx
438 |                     names = Index(self.file[self.root][axis][index][:][indices])
439 |                 else:
440 |                     names = Index(self.file[self.root][axis][index][:])
441 |             except KeyError:
442 |                 index = "__index_level_0__"
443 |                 if self.is_view:
444 |                     indices = self._oidx if axis == "obs" else self._vidx
445 |                     names = Index(self.file[self.root][axis][index][:][indices])
446 |                 else:
447 |                     names = Index(self.file[self.root][axis][index][:])
448 | 
449 |         # only string index
450 |         if all(isinstance(e, bytes) for e in names):
451 |             try:
452 |                 names = names.str.decode("utf-8")
453 |             except AttributeError:
454 |                 pass
455 | 
456 |         return names
457 | 
458 |     @cached_property
459 |     def _obs_names(self):
460 |         """
461 |         Note: currently, anndata relies on pd.Index here
462 |         """
463 |         return self.__names("obs")
464 | 
465 |     @property
466 |     def obs_names(self):
467 |         return self._obs_names
468 | 
469 |     @cached_property
470 |     def _var_names(self):
471 |         """
472 |         Note: currently, anndata relies on pd.Index here
473 |         """
474 |         return self.__names("var")
475 | 
476 |     @property
477 |     def var_names(self):
478 |         return self._var_names
479 | 
480 |     @cached_property
481 |     def _n_obs(self):
482 |         obs = self.file[self.root]["obs"]
483 |         if isinstance(obs, get_args(ArrayStorageType)):
484 |             n_obs = obs.shape[0]
485 |         else:
486 |             index = "_index"
487 |             if "_index" in obs.attrs:
488 |                 index = obs.attrs["_index"]
489 | 
490 |             n_obs = obs[index].shape[0]
491 | 
492 |         if self.is_view and self._oidx is not None:
493 |             if isinstance(self._oidx, slice):
494 |                 return len(range(n_obs).__getitem__(self._oidx))
495 |             else:
496 |                 import numpy as np
497 | 
498 |                 if issubclass(self._oidx.dtype.type, np.bool_):
499 |                     return self._oidx.sum()
500 |                 else:
501 |                     return len(self._oidx)
502 |         return n_obs
503 | 
504 |     @property
505 |     def n_obs(self):
506 |         return self._n_obs
507 | 
508 |     @cached_property
509 |     def _n_vars(self):
510 |         var = self.file[self.root]["var"]
511 |         if isinstance(var, get_args(ArrayStorageType)):
512 |             n_vars = var.shape[0]
513 | 
514 |         else:
515 |             index = "_index"
516 |             if "_index" in var.attrs:
517 |                 index = var.attrs["_index"]
518 | 
519 |             n_vars = var[index].shape[0]
520 | 
521 |         if self.is_view and self._vidx is not None:
522 |             if isinstance(self._vidx, slice):
523 |                 return len(range(n_vars).__getitem__(self._vidx))
524 |             else:
525 |                 import numpy as np
526 | 
527 |                 if issubclass(self._vidx.dtype.type, np.bool_):
528 |                     return self._vidx.sum()
529 |                 else:
530 |                     return len(self._vidx)
531 | 
532 |         return n_vars
533 | 
534 |     @property
535 |     def n_vars(self):
536 |         return self._n_vars
537 | 
538 |     @property
539 |     def shape(self):
540 |         return self.n_obs, self.n_vars
541 | 
542 |     @cached_property
543 |     def _obsm(self):
544 |         group_storage = self.file[self.root]["obsm"] if "obsm" in self.file[self.root] else dict()
545 |         return ElemShadow(
546 |             group_storage,
547 |             key=str(Path(self.root) / "obsm"),
548 |             cache=self.__dict__,
549 |             n_obs=self.n_obs,
550 |             n_vars=self.n_vars,
551 |             array_backend=self._array_backend,
552 |             table_backend=self._table_backend,
553 |             is_view=self.is_view,
554 |             idx=(self._oidx, None),
555 |         )
556 | 
557 |     @property
558 |     def obsm(self):
559 |         return self._obsm
560 | 
561 |     def obsm_keys(self) -> list[str]:
562 |         return list(self._obsm.keys())
563 | 
564 |     @cached_property
565 |     def _varm(self):
566 |         group_storage = self.file[self.root]["varm"] if "varm" in self.file[self.root] else dict()
567 |         return ElemShadow(
568 |             group_storage,
569 |             key=str(Path(self.root) / "varm"),
570 |             cache=self.__dict__,
571 |             n_obs=self.n_obs,
572 |             n_vars=self.n_vars,
573 |             array_backend=self._array_backend,
574 |             table_backend=self._table_backend,
575 |             is_view=self.is_view,
576 |             idx=(None, self._vidx),
577 |         )
578 | 
579 |     @property
580 |     def varm(self):
581 |         return self._varm
582 | 
583 |     def varm_keys(self) -> list[str]:
584 |         return list(self._varm.keys())
585 | 
586 |     @cached_property
587 |     def _obsp(self):
588 |         group_storage = self.file[self.root]["obsp"] if "obsp" in self.file[self.root] else dict()
589 |         return ElemShadow(
590 |             group_storage,
591 |             key=str(Path(self.root) / "obsp"),
592 |             cache=self.__dict__,
593 |             n_obs=self.n_obs,
594 |             n_vars=self.n_vars,
595 |             array_backend=self._array_backend,
596 |             table_backend=self._table_backend,
597 |             is_view=self.is_view,
598 |             idx=(self._oidx, self._oidx),
599 |         )
600 | 
601 |     @property
602 |     def obsp(self):
603 |         return self._obsp
604 | 
605 |     @cached_property
606 |     def _varp(self):
607 |         # if "varp" not in self.file[self.root]:
608 |         #    return EmptySlot()
609 |         group_storage = self.file[self.root]["varp"] if "varp" in self.file[self.root] else dict()
610 |         return ElemShadow(
611 |             group_storage,
612 |             key=str(Path(self.root) / "varp"),
613 |             cache=self.__dict__,
614 |             n_obs=self.n_obs,
615 |             n_vars=self.n_vars,
616 |             array_backend=self._array_backend,
617 |             table_backend=self._table_backend,
618 |             is_view=self.is_view,
619 |             idx=(self._vidx, self._vidx),
620 |         )
621 | 
622 |     @property
623 |     def varp(self):
624 |         return self._varp
625 | 
626 |     @cached_property
627 |     def _uns(self):
628 |         if "uns" not in self.file[self.root]:
629 |             return dict()
630 | 
631 |         def map_get_keys(root):
632 |             s = ElemShadow(
633 |                 root,
634 |                 key=root.name,
635 |                 cache=self.__dict__,
636 |                 n_obs=None,
637 |                 n_vars=None,
638 |                 array_backend=self._array_backend,
639 |                 table_backend=self._table_backend,
640 |             )
641 |             for key in root.keys():
642 |                 # if hasattr(root[key], "keys"):
643 |                 if isinstance(root[key], get_args(GroupStorageType)) and hasattr(root[key], "keys"):
644 |                     s[key] = map_get_keys(root[key])
645 |             return s
646 | 
647 |         uns_root = self.file[self.root]["uns"]
648 |         return map_get_keys(uns_root)
649 | 
650 |     @property
651 |     def uns(self):
652 |         return self._uns
653 | 
654 |     def clear_cache(self):
655 |         keys = list(self.__dict__.keys())
656 |         slots = [
657 |             "X",
658 |             "obs",
659 |             "obsm",
660 |             "var",
661 |             "varm",
662 |             "obsp",
663 |             "varp",
664 |             "layers",
665 |             "raw",
666 |             "uns",
667 |         ]
668 |         _slots = [f"_{slot}" for slot in slots]
669 |         for key in keys:
670 |             if key.startswith("/") or key.startswith("mod/") or key in _slots or key in slots:
671 |                 obj_id = id(self.__dict__[key])
672 |                 obj = ctypes.cast(obj_id, ctypes.py_object).value
673 | 
674 |                 del self.__dict__[key]
675 | 
676 |                 # Make sure the object is deleted to free the memory
677 |                 del obj
678 | 
679 |     def close(self):
680 |         if self._format == "zarr":
681 |             self.file.store.close()
682 |             return
683 | 
684 |         self.file.close()
685 | 
686 |         if hasattr(self, "_callback") and self._callback and callable(self._callback):
687 |             self._callback()
688 | 
689 |     def reopen(self, mode: str, file: str | None = None) -> None:
690 |         if self._format == "zarr":
691 |             import zarr
692 | 
693 |         if not self.file:
694 |             if file is None:
695 |                 raise ValueError("The connection is closed but no new file name is provided.")
696 |             self.close()
697 |             if self._format == "zarr":
698 |                 self.file = zarr.open(file, mode=mode)
699 |             else:
700 |                 self.file = h5py.File(file, mode=mode)
701 |         elif self._format == "zarr":
702 |             if self.file.read_only and mode != "r" or mode == "r" and not self.file.read_only:
703 |                 file = file or self.file.store.path
704 |                 self.close()
705 |                 self.file = zarr.open(file, mode=mode)
706 |         elif mode != self.file.mode:
707 |             file = file or self.file.filename
708 |             self.close()
709 |             self.file = h5py.File(file, mode=mode)
710 |         else:
711 |             return
712 | 
713 |         # FIXME: parquet support
714 | 
715 |         # Update ._group in all elements
716 |         for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "uns", "layers"]:
717 |             if key in ["obs", "var"]:
718 |                 # In the current implementation attributes are not ElemShadows
719 |                 pass
720 |             elif hasattr(self, key):
721 |                 elem = getattr(self, key)
722 |                 if isinstance(elem, ElemShadow):
723 |                     elem._update_group(self.file[str(Path(self.root) / key)])
724 | 
725 |         return
726 | 
727 |     def __repr__(self):
728 |         s = ""
729 |         for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "uns"]:
730 |             key_cached = key in self.__dict__
731 |             key_cached_str = RUNECACHED if key_cached else ""
732 | 
733 |             if key in ["obs", "var"]:
734 |                 if key in self.__dict__:
735 |                     s += f"{key}{key_cached_str}:\t"
736 |                     s += f"{', '.join(map(str, getattr(self, key).columns))}\n"
737 |                 else:
738 |                     try:
739 |                         key_elems = self.file[self.root][key].keys()
740 |                     except AttributeError:
741 |                         # Do not extract column names from the pre-0.8 AnnData
742 |                         key_elems = ["..."]
743 |                         # For parquet files, keys can be read from the schema
744 |                         if self._format == "parquet" or self._format == "pyarrow":
745 |                             try:
746 |                                 from pyarrow import parquet as pq
747 | 
748 |                                 filename = self.file[self.root][key].path
749 |                                 schema = pq.read_schema(filename)
750 |                                 key_elems = schema.names
751 |                             except Exception as e:
752 |                                 raise e
753 |                     if len(key_elems) > 0:
754 |                         s += f"{key}:\t{', '.join(key_elems)}\n"
755 |             else:  # complex keys
756 |                 if not (key == "uns" and len(self.uns) == 0):
757 |                     # do not show empty dict
758 |                     s += getattr(self, key).__repr__()
759 | 
760 |         return s
761 | 
762 |     # Views
763 | 
764 |     def __getitem__(self, index):
765 |         oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
766 |         return DataShadow._init_as_view(self, oidx, vidx)
767 | 
768 |     @property
769 |     def is_view(self):
770 |         return self._is_view
771 | 
772 |     # Legacy methods for scanpy compatibility
773 | 
774 |     def _sanitize(self):
775 |         pass
776 | 
777 |     def obs_vector(self, key: str, layer: str | None = None):
778 |         return self.obs[key].values
779 | 
780 |     def var_vector(self, key: str, layer: str | None = None):
781 |         return self.var[key].values
782 | 
783 |     # Writing
784 | 
785 |     def _push_changes(self, clear_cache: bool = False):
786 |         for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "uns", "layers"]:
787 |             if hasattr(self, key):
788 |                 elem = getattr(self, key)
789 |                 if isinstance(elem, ElemShadow):
790 |                     elem._push_changes(
791 |                         clear_cache=clear_cache,
792 |                     )
793 | 
794 |     def write(self, *args, **kwargs) -> None:
795 |         if self.is_view:
796 |             raise ValueError("Views cannot write data to the file.")
797 |         if (
798 |             self._format == "zarr"
799 |             and self.file.read_only
800 |             or self._format == "hdf5"
801 |             and self.file.mode == "r"
802 |         ):
803 |             raise OSError(
804 |                 "File is open in read-only mode. Changes can't be pushed. "
805 |                 "Reopen it with .reopen('r+') to enable writing."
806 |             )
807 |         else:
808 |             self._push_changes(*args, **kwargs)
809 |         return
810 | 
811 |     def reopen_and_write(self, mode: str = "r+", *args, **kwargs) -> None:
812 |         original_mode = self.file.mode
813 |         self.reopen(mode)
814 |         try:
815 |             self.write(*args, **kwargs)
816 |         except Exception as e:
817 |             warn(f"An error occurred while writing the changes:\n{e}")
818 |         finally:
819 |             self.reopen(original_mode)
820 | 
821 |     # Laziness
822 | 
823 |     def lazy(self):
824 |         self._lazy = True
825 | 
826 |     def eager(self):
827 |         self._lazy = False
828 | 
829 |     @property
830 |     def is_lazy(self):
831 |         return self._lazy
832 | 


--------------------------------------------------------------------------------
/docs/examples/shadows-zarr.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "b6eae7bd-1091-480f-8c95-551eefe5c53c",
   6 |    "metadata": {
   7 |     "slideshow": {
   8 |      "slide_type": "slide"
   9 |     },
  10 |     "tags": []
  11 |    },
  12 |    "source": [
  13 |     "# Shadows for zarr"
  14 |    ]
  15 |   },
  16 |   {
  17 |    "cell_type": "code",
  18 |    "execution_count": 1,
  19 |    "id": "b17e6265-4c91-4d30-a232-20e6a627c07d",
  20 |    "metadata": {},
  21 |    "outputs": [],
  22 |    "source": [
  23 |     "%load_ext autoreload\n",
  24 |     "%autoreload 2"
  25 |    ]
  26 |   },
  27 |   {
  28 |    "cell_type": "code",
  29 |    "execution_count": 2,
  30 |    "id": "4aa723fb-6a8d-4d43-913c-a31f2316b02f",
  31 |    "metadata": {},
  32 |    "outputs": [],
  33 |    "source": [
  34 |     "import os\n",
  35 |     "os.chdir(\"../../\")"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "code",
  40 |    "execution_count": 3,
  41 |    "id": "f1c3418a-3a90-41b0-baa6-c6ad340dc75f",
  42 |    "metadata": {},
  43 |    "outputs": [],
  44 |    "source": [
  45 |     "from pathlib import Path\n",
  46 |     "data = Path(\"data/\")"
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "markdown",
  51 |    "id": "b9e3bb66-3928-45f4-ba98-fded629de018",
  52 |    "metadata": {},
  53 |    "source": [
  54 |     " "
  55 |    ]
  56 |   },
  57 |   {
  58 |    "cell_type": "markdown",
  59 |    "id": "934b8d69-b812-422f-b718-080bb8508348",
  60 |    "metadata": {
  61 |     "slideshow": {
  62 |      "slide_type": "slide"
  63 |     },
  64 |     "tags": []
  65 |    },
  66 |    "source": [
  67 |     "## Shadows for zarr storage\n",
  68 |     "\n",
  69 |     "Beyond H5AD and H5MU files, shadow objects also work with [Zarr](https://zarr.dev/) files."
  70 |    ]
  71 |   },
  72 |   {
  73 |    "cell_type": "markdown",
  74 |    "id": "65462d07-01b0-4395-8891-eda01e472f38",
  75 |    "metadata": {},
  76 |    "source": [
  77 |     " "
  78 |    ]
  79 |   },
  80 |   {
  81 |    "cell_type": "markdown",
  82 |    "id": "4a38075c-8da2-4193-af1a-c52e18176f92",
  83 |    "metadata": {
  84 |     "slideshow": {
  85 |      "slide_type": "fragment"
  86 |     },
  87 |     "tags": []
  88 |    },
  89 |    "source": [
  90 |     "Import classes for these shadow objects:"
  91 |    ]
  92 |   },
  93 |   {
  94 |    "cell_type": "code",
  95 |    "execution_count": 4,
  96 |    "id": "079454ed-10dc-47ef-9de2-ef70f95dbed6",
  97 |    "metadata": {
  98 |     "slideshow": {
  99 |      "slide_type": "fragment"
 100 |     },
 101 |     "tags": []
 102 |    },
 103 |    "outputs": [],
 104 |    "source": [
 105 |     "from shadows import AnnDataShadow, MuDataShadow"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "markdown",
 110 |    "id": "564f7b2b-063d-4f0e-8333-c178565ee2d2",
 111 |    "metadata": {},
 112 |    "source": [
 113 |     " "
 114 |    ]
 115 |   },
 116 |   {
 117 |    "cell_type": "markdown",
 118 |    "id": "6b819452-470f-47b7-8fa0-0c8304fd557c",
 119 |    "metadata": {
 120 |     "slideshow": {
 121 |      "slide_type": "fragment"
 122 |     },
 123 |     "tags": []
 124 |    },
 125 |    "source": [
  126 |     "Initialise an AnnData shadow object from a Zarr store:"
 127 |    ]
 128 |   },
 129 |   {
 130 |    "cell_type": "code",
 131 |    "execution_count": 5,
 132 |    "id": "3ff358c0-2c77-460a-97a9-398f615a0e17",
 133 |    "metadata": {
 134 |     "slideshow": {
 135 |      "slide_type": "fragment"
 136 |     },
 137 |     "tags": []
 138 |    },
 139 |    "outputs": [],
 140 |    "source": [
 141 |     "file = data / \"pbmc5k_citeseq/minipbcite_prot.zarr\"\n",
 142 |     "adata = AnnDataShadow(file, format=\"zarr\")"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "markdown",
 147 |    "id": "1747c671-ffc2-4d4d-8a04-7dc44432b2fb",
 148 |    "metadata": {},
 149 |    "source": [
 150 |     " "
 151 |    ]
 152 |   },
 153 |   {
 154 |    "cell_type": "markdown",
 155 |    "id": "b8ae6d73-9a74-48ed-9d41-7e92bfee8f71",
 156 |    "metadata": {
 157 |     "slideshow": {
 158 |      "slide_type": "slide"
 159 |     },
 160 |     "tags": []
 161 |    },
 162 |    "source": [
 163 |     "### File\n",
 164 |     "\n",
  165 |     "As with HDF5 files, the file connection that the shadow is using can be accessed via the `.file` attribute:"
 166 |    ]
 167 |   },
 168 |   {
 169 |    "cell_type": "code",
 170 |    "execution_count": 7,
 171 |    "id": "33c47ede-e566-43ac-8596-470263d21b3a",
 172 |    "metadata": {},
 173 |    "outputs": [
 174 |     {
 175 |      "data": {
 176 |       "text/plain": [
 177 |        "<zarr.hierarchy.Group '/' read-only>"
 178 |       ]
 179 |      },
 180 |      "execution_count": 7,
 181 |      "metadata": {},
 182 |      "output_type": "execute_result"
 183 |     }
 184 |    ],
 185 |    "source": [
 186 |     "adata.file"
 187 |    ]
 188 |   },
 189 |   {
 190 |    "cell_type": "markdown",
 191 |    "id": "a43127df-c330-4104-bbf6-399c7392c373",
 192 |    "metadata": {},
 193 |    "source": [
 194 |     "The path to the file can then be accessed via `adata.file.store.path`:"
 195 |    ]
 196 |   },
 197 |   {
 198 |    "cell_type": "code",
 199 |    "execution_count": 8,
 200 |    "id": "2f7dec24-9cc3-4cf2-a044-a6e487c17315",
 201 |    "metadata": {},
 202 |    "outputs": [
 203 |     {
 204 |      "data": {
 205 |       "text/plain": [
 206 |        "'minipbcite_prot.zarr'"
 207 |       ]
 208 |      },
 209 |      "execution_count": 8,
 210 |      "metadata": {},
 211 |      "output_type": "execute_result"
 212 |     }
 213 |    ],
 214 |    "source": [
 215 |     "os.path.basename(adata.file.store.path)"
 216 |    ]
 217 |   },
 218 |   {
 219 |    "cell_type": "markdown",
 220 |    "id": "0574136f-7aa4-4a1e-9312-eee5fc9c6744",
 221 |    "metadata": {
 222 |     "slideshow": {
 223 |      "slide_type": "subslide"
 224 |     },
 225 |     "tags": []
 226 |    },
 227 |    "source": [
  228 |     "The Zarr store stays open until the `adata.close()` method is called:"
 229 |    ]
 230 |   },
 231 |   {
 232 |    "cell_type": "code",
 233 |    "execution_count": 9,
 234 |    "id": "0b14eda7-0343-4f8d-82d9-ffc7257a1a11",
 235 |    "metadata": {},
 236 |    "outputs": [],
 237 |    "source": [
 238 |     "adata.close()"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "markdown",
 243 |    "id": "a87e0e96-86c2-4623-b239-892e92b04a5a",
 244 |    "metadata": {
 245 |     "slideshow": {
 246 |      "slide_type": "fragment"
 247 |     },
 248 |     "tags": []
 249 |    },
 250 |    "source": [
 251 |     "... or until the file has to be re-opened for modification (see below)."
 252 |    ]
 253 |   },
 254 |   {
 255 |    "cell_type": "markdown",
 256 |    "id": "5a064df4-b533-4124-a85a-f7b20fcc1091",
 257 |    "metadata": {},
 258 |    "source": [
 259 |     " "
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "markdown",
 264 |    "id": "9beb85a9-e226-4b9a-949b-2351432558f7",
 265 |    "metadata": {
 266 |     "slideshow": {
 267 |      "slide_type": "slide"
 268 |     },
 269 |     "tags": []
 270 |    },
 271 |    "source": [
 272 |     "### Permissions\n",
 273 |     "\n",
 274 |     "We can open Zarr files in different modes including purely read-only (`'r'`) and read/write (`'r+'`). The mode can be provided to the constructor:"
 275 |    ]
 276 |   },
 277 |   {
 278 |    "cell_type": "code",
 279 |    "execution_count": 10,
 280 |    "id": "9f297beb-97b5-46ad-97b9-2dedc5c40b53",
 281 |    "metadata": {},
 282 |    "outputs": [
 283 |     {
 284 |      "data": {
 285 |       "text/plain": [
 286 |        "True"
 287 |       ]
 288 |      },
 289 |      "execution_count": 10,
 290 |      "metadata": {},
 291 |      "output_type": "execute_result"
 292 |     }
 293 |    ],
 294 |    "source": [
 295 |     "adata = AnnDataShadow(file, format=\"zarr\", mode=\"r\")\n",
 296 |     "adata.file.read_only"
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "markdown",
 301 |    "id": "fc9da2a5-402f-4fe8-83a2-0a5f06a84d7c",
 302 |    "metadata": {},
 303 |    "source": [
 304 |     "Let's add some data to the in-memory shadow object:"
 305 |    ]
 306 |   },
 307 |   {
 308 |    "cell_type": "code",
 309 |    "execution_count": 11,
 310 |    "id": "21f291bd-7c5d-4ef3-a034-c0030dabdb60",
 311 |    "metadata": {
 312 |     "slideshow": {
 313 |      "slide_type": "fragment"
 314 |     },
 315 |     "tags": []
 316 |    },
 317 |    "outputs": [],
 318 |    "source": [
 319 |     "adata.obsm[\"X_pca_copy\"] = adata.obsm[\"X_pca\"].copy()"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "markdown",
 324 |    "id": "b03108f5-0e8a-4646-af12-ef5fc934885b",
 325 |    "metadata": {
 326 |     "slideshow": {
 327 |      "slide_type": "subslide"
 328 |     },
 329 |     "tags": []
 330 |    },
 331 |    "source": [
 332 |     "We can also conveniently close and reopen the connection for a given in-memory shadow object:"
 333 |    ]
 334 |   },
 335 |   {
 336 |    "cell_type": "code",
 337 |    "execution_count": 12,
 338 |    "id": "e8ddb228-74b4-4f8e-8cdc-c84479f38d2d",
 339 |    "metadata": {},
 340 |    "outputs": [
 341 |     {
 342 |      "data": {
 343 |       "text/plain": [
 344 |        "False"
 345 |       ]
 346 |      },
 347 |      "execution_count": 12,
 348 |      "metadata": {},
 349 |      "output_type": "execute_result"
 350 |     }
 351 |    ],
 352 |    "source": [
 353 |     "adata.reopen(mode=\"r+\")\n",
 354 |     "adata.file.read_only"
 355 |    ]
 356 |   },
 357 |   {
 358 |    "cell_type": "markdown",
 359 |    "id": "48157734-adc0-4e7d-8157-64e1201b6fba",
 360 |    "metadata": {},
 361 |    "source": [
 362 |     "This way all the newly added elements are still available in memory:"
 363 |    ]
 364 |   },
 365 |   {
 366 |    "cell_type": "code",
 367 |    "execution_count": 13,
 368 |    "id": "043428b5-dc58-4d0c-b653-e1d8451b39f9",
 369 |    "metadata": {},
 370 |    "outputs": [
 371 |     {
 372 |      "data": {
 373 |       "text/plain": [
 374 |        "obsm:\tX_pcaᐁ, X_umap, X_pca_copy▲"
 375 |       ]
 376 |      },
 377 |      "execution_count": 13,
 378 |      "metadata": {},
 379 |      "output_type": "execute_result"
 380 |     }
 381 |    ],
 382 |    "source": [
 383 |     "adata.obsm"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": 14,
 389 |    "id": "50aba055-06e2-490d-a1a6-3307ef7ac6d0",
 390 |    "metadata": {
 391 |     "slideshow": {
 392 |      "slide_type": "fragment"
 393 |     },
 394 |     "tags": []
 395 |    },
 396 |    "outputs": [],
 397 |    "source": [
 398 |     "# Clean up\n",
 399 |     "adata.close()\n",
 400 |     "del adata"
 401 |    ]
 402 |   },
 403 |   {
 404 |    "cell_type": "markdown",
 405 |    "id": "991ccc6a-f182-4689-802d-a9ae70a490e4",
 406 |    "metadata": {},
 407 |    "source": [
 408 |     " "
 409 |    ]
 410 |   },
 411 |   {
 412 |    "cell_type": "markdown",
 413 |    "id": "2dbc52ad-6010-416f-810b-c60e5546ba7b",
 414 |    "metadata": {
 415 |     "slideshow": {
 416 |      "slide_type": "slide"
 417 |     },
 418 |     "tags": []
 419 |    },
 420 |    "source": [
 421 |     "### Individual modality access\n",
 422 |     "\n",
  423 |     "An individual modality stored as a Zarr store can be opened directly as an `AnnDataShadow` object:"
 424 |    ]
 425 |   },
 426 |   {
 427 |    "cell_type": "code",
 428 |    "execution_count": 15,
 429 |    "id": "d5ea1511-6f1b-4c51-9ec7-14365dc8d391",
 430 |    "metadata": {
 431 |     "slideshow": {
 432 |      "slide_type": "fragment"
 433 |     },
 434 |     "tags": []
 435 |    },
 436 |    "outputs": [
 437 |     {
 438 |      "data": {
 439 |       "text/plain": [
 440 |        "AnnData Shadow object with n_obs × n_vars = 411 × 29\n",
 441 |        "  X  \n",
 442 |        "  layers:\tcounts\n",
 443 |        "  obs:\t_index\n",
 444 |        "  var:\t_index, feature_types, gene_ids, highly_variable\n",
 445 |        "  obsm:\tX_pca, X_umap\n",
 446 |        "  varm:\tPCs\n",
 447 |        "  obsp:\tconnectivities, distances\n",
 448 |        "  uns:\tneighbors, pca, umap"
 449 |       ]
 450 |      },
 451 |      "execution_count": 15,
 452 |      "metadata": {},
 453 |      "output_type": "execute_result"
 454 |     }
 455 |    ],
 456 |    "source": [
 457 |     "adata = AnnDataShadow(file, format=\"zarr\")\n",
 458 |     "adata"
 459 |    ]
 460 |   },
 461 |   {
 462 |    "cell_type": "code",
 463 |    "execution_count": 16,
 464 |    "id": "946d03a9-d0d1-4ebc-ae29-92d795f08073",
 465 |    "metadata": {
 466 |     "slideshow": {
 467 |      "slide_type": "fragment"
 468 |     },
 469 |     "tags": []
 470 |    },
 471 |    "outputs": [],
 472 |    "source": [
 473 |     "# Clean up\n",
 474 |     "adata.close()\n",
 475 |     "del adata"
 476 |    ]
 477 |   },
 478 |   {
 479 |    "cell_type": "markdown",
 480 |    "id": "14b8ad11-adad-4ea8-9146-3dd7cd9bd415",
 481 |    "metadata": {},
 482 |    "source": [
 483 |     " "
 484 |    ]
 485 |   },
 486 |   {
 487 |    "cell_type": "markdown",
 488 |    "id": "d3ae2a84-34fc-48b9-926e-a5d5f57e4e73",
 489 |    "metadata": {
 490 |     "slideshow": {
 491 |      "slide_type": "slide"
 492 |     },
 493 |     "tags": []
 494 |    },
 495 |    "source": [
 496 |     "### Class identity\n",
 497 |     "\n",
 498 |     "Many tools in the ecosystem including scanpy frequently check if the input object is an AnnData. For instance, [in `sc.pp.highly_variable_genes`](https://github.com/scverse/scanpy/blob/master/scanpy/preprocessing/_highly_variable_genes.py) it reads:\n",
 499 |     "\n",
 500 |     "```py\n",
 501 |     "if not isinstance(adata, AnnData):\n",
 502 |     "    raise ValueError(\n",
 503 |     "        '`pp.highly_variable_genes` expects an `AnnData` argument, '\n",
 504 |     "        'pass `inplace=False` if you want to return a `pd.DataFrame`.'\n",
 505 |     "    )\n",
 506 |     "```\n",
 507 |     "\n",
 508 |     "In order for shadow objects to be accepted by such functions, they mock their class identity:"
 509 |    ]
 510 |   },
 511 |   {
 512 |    "cell_type": "code",
 513 |    "execution_count": 17,
 514 |    "id": "f10b98ff-920f-4d46-924f-1cf3074236db",
 515 |    "metadata": {
 516 |     "slideshow": {
 517 |      "slide_type": "subslide"
 518 |     },
 519 |     "tags": []
 520 |    },
 521 |    "outputs": [],
 522 |    "source": [
 523 |     "adata = AnnDataShadow(file, format=\"zarr\")\n",
 524 |     "\n",
 525 |     "from anndata import AnnData\n",
 526 |     "assert isinstance(adata, AnnData), \"adata is not a valid AnnData object\""
 527 |    ]
 528 |   },
 529 |   {
 530 |    "cell_type": "markdown",
 531 |    "id": "f8e2d4a9-eba2-45c0-88f6-35f69e7d0249",
 532 |    "metadata": {
 533 |     "slideshow": {
 534 |      "slide_type": "subslide"
 535 |     },
 536 |     "tags": []
 537 |    },
 538 |    "source": [
 539 |     "Checking for shadow identity still works:"
 540 |    ]
 541 |   },
 542 |   {
 543 |    "cell_type": "code",
 544 |    "execution_count": 18,
 545 |    "id": "efadd4ba-219c-4c84-a1eb-36baf135c82d",
 546 |    "metadata": {},
 547 |    "outputs": [
 548 |     {
 549 |      "data": {
 550 |       "text/plain": [
 551 |        "True"
 552 |       ]
 553 |      },
 554 |      "execution_count": 18,
 555 |      "metadata": {},
 556 |      "output_type": "execute_result"
 557 |     }
 558 |    ],
 559 |    "source": [
 560 |     "isinstance(adata, AnnDataShadow)"
 561 |    ]
 562 |   },
 563 |   {
 564 |    "cell_type": "code",
 565 |    "execution_count": 19,
 566 |    "id": "a32515de-7866-4229-a639-0818a0dbea3b",
 567 |    "metadata": {
 568 |     "slideshow": {
 569 |      "slide_type": "fragment"
 570 |     },
 571 |     "tags": []
 572 |    },
 573 |    "outputs": [],
 574 |    "source": [
 575 |     "adata.close()"
 576 |    ]
 577 |   },
 578 |   {
 579 |    "cell_type": "markdown",
 580 |    "id": "8d4e683f-0a0b-426c-8cf7-5f5529a844d2",
 581 |    "metadata": {},
 582 |    "source": [
 583 |     " "
 584 |    ]
 585 |   },
 586 |   {
 587 |    "cell_type": "markdown",
 588 |    "id": "c29f18b0-717b-4821-b0f8-e81ca94426de",
 589 |    "metadata": {},
 590 |    "source": [
 591 |     "### Backends\n",
 592 |     "\n",
 593 |     "AnnData/MuData are based on a NumPy/Pandas stack. This is the default for the shadow objects in order to provide compatibility with AnnData/MuData objects.\n",
 594 |     "\n",
 595 |     "However, the nature of shadow files also simplifies loading individual matrices or tables with alternative backends, e.g. [JAX](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.array.html#jax.numpy.array) (`Array`), [PyTorch](https://pytorch.org/docs/stable/tensors.html) (`Tensor`) or [polars](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html) (`DataFrame`)."
 596 |    ]
 597 |   },
 598 |   {
 599 |    "cell_type": "code",
 600 |    "execution_count": 20,
 601 |    "id": "734d4e9e-3936-4911-96fe-1bed3de167eb",
 602 |    "metadata": {},
 603 |    "outputs": [],
 604 |    "source": [
 605 |     "adata = AnnDataShadow(file, format=\"zarr\", array_backend=\"jax\", table_backend=\"polars\")"
 606 |    ]
 607 |   },
 608 |   {
 609 |    "cell_type": "code",
 610 |    "execution_count": 21,
 611 |    "id": "3d909ef6-92b7-40f4-b50e-641993469791",
 612 |    "metadata": {},
 613 |    "outputs": [
 614 |     {
 615 |      "name": "stdout",
 616 |      "output_type": "stream",
 617 |      "text": [
 618 |       "<class 'polars.internals.dataframe.frame.DataFrame'>\n"
 619 |      ]
 620 |     },
 621 |     {
 622 |      "data": {
 623 |       "text/html": [
 624 |        "<div>\n",
 625 |        "<style scoped>\n",
 626 |        "    .dataframe tbody tr th:only-of-type {\n",
 627 |        "        vertical-align: middle;\n",
 628 |        "    }\n",
 629 |        "\n",
 630 |        "    .dataframe tbody tr th {\n",
 631 |        "        vertical-align: top;\n",
 632 |        "    }\n",
 633 |        "\n",
 634 |        "    .dataframe thead th {\n",
 635 |        "        text-align: right;\n",
 636 |        "    }\n",
 637 |        "\n",
 638 |        "    .dataframe td {\n",
 639 |        "        white-space: pre;\n",
 640 |        "    }\n",
 641 |        "\n",
 642 |        "    .dataframe td {\n",
 643 |        "        padding-top: 0;\n",
 644 |        "    }\n",
 645 |        "\n",
 646 |        "    .dataframe td {\n",
 647 |        "        padding-bottom: 0;\n",
 648 |        "    }\n",
 649 |        "\n",
 650 |        "    .dataframe td {\n",
 651 |        "        line-height: 95%;\n",
 652 |        "    }\n",
 653 |        "</style>\n",
 654 |        "<table border=\"1\" class=\"dataframe\">\n",
 655 |        "<small>shape: (5, 1)</small>\n",
 656 |        "<thead>\n",
 657 |        "<tr>\n",
 658 |        "<th>\n",
 659 |        "_index\n",
 660 |        "</th>\n",
 661 |        "</tr>\n",
 662 |        "<tr>\n",
 663 |        "<td>\n",
 664 |        "object\n",
 665 |        "</td>\n",
 666 |        "</tr>\n",
 667 |        "</thead>\n",
 668 |        "<tbody>\n",
 669 |        "<tr>\n",
 670 |        "<td>\n",
 671 |        "CAGCCAGGTCTCGACG-1\n",
 672 |        "</td>\n",
 673 |        "</tr>\n",
 674 |        "<tr>\n",
 675 |        "<td>\n",
 676 |        "TTCTTCCTCTCGGTAA-1\n",
 677 |        "</td>\n",
 678 |        "</tr>\n",
 679 |        "<tr>\n",
 680 |        "<td>\n",
 681 |        "CGGGTCAAGAGAGGTA-1\n",
 682 |        "</td>\n",
 683 |        "</tr>\n",
 684 |        "<tr>\n",
 685 |        "<td>\n",
 686 |        "TACCCGTCATAATCCG-1\n",
 687 |        "</td>\n",
 688 |        "</tr>\n",
 689 |        "<tr>\n",
 690 |        "<td>\n",
 691 |        "TGGGTTAGTGAATTAG-1\n",
 692 |        "</td>\n",
 693 |        "</tr>\n",
 694 |        "</tbody>\n",
 695 |        "</table>\n",
 696 |        "</div>"
 697 |       ],
 698 |       "text/plain": [
 699 |        "shape: (5, 1)\n",
 700 |        "┌────────────────────┐\n",
 701 |        "│ _index             │\n",
 702 |        "│ ---                │\n",
 703 |        "│ object             │\n",
 704 |        "╞════════════════════╡\n",
 705 |        "│ CAGCCAGGTCTCGACG-1 │\n",
 706 |        "│ TTCTTCCTCTCGGTAA-1 │\n",
 707 |        "│ CGGGTCAAGAGAGGTA-1 │\n",
 708 |        "│ TACCCGTCATAATCCG-1 │\n",
 709 |        "│ TGGGTTAGTGAATTAG-1 │\n",
 710 |        "└────────────────────┘"
 711 |       ]
 712 |      },
 713 |      "execution_count": 21,
 714 |      "metadata": {},
 715 |      "output_type": "execute_result"
 716 |     }
 717 |    ],
 718 |    "source": [
 719 |     "obs = adata.obs\n",
 720 |     "print(type(obs))\n",
 721 |     "obs.head()"
 722 |    ]
 723 |   },
 724 |   {
 725 |    "cell_type": "code",
 726 |    "execution_count": 22,
 727 |    "id": "32286100-13e4-49af-8194-f53693c9b7f0",
 728 |    "metadata": {},
 729 |    "outputs": [
 730 |     {
 731 |      "name": "stdout",
 732 |      "output_type": "stream",
 733 |      "text": [
 734 |       "<class 'jaxlib.xla_extension.ArrayImpl'>\n"
 735 |      ]
 736 |     },
 737 |     {
 738 |      "data": {
 739 |       "text/plain": [
 740 |        "Array([[ 17.051027  ,   1.2865539 ,  -1.2715828 , ...,  -0.05060111,\n",
 741 |        "         -1.8431426 ,  -1.0410113 ],\n",
 742 |        "       [ 15.563506  ,  -2.1941857 ,  -1.351732  , ...,  -1.0639406 ,\n",
 743 |        "         -0.1610156 ,   2.1454387 ],\n",
 744 |        "       [ 20.369316  ,  -8.03503   ,   0.3842825 , ...,   0.52950376,\n",
 745 |        "         -0.38589898,  -0.7488529 ],\n",
 746 |        "       ...,\n",
 747 |        "       [-11.894565  ,   9.380491  ,  -0.87732434, ...,  -0.40848297,\n",
 748 |        "          0.4135897 ,  -0.710097  ],\n",
 749 |        "       [-13.12094   ,   9.734974  ,  -3.345742  , ...,   1.049644  ,\n",
 750 |        "          0.28707528,  -1.8128693 ],\n",
 751 |        "       [-12.875325  ,  11.512296  ,  -4.9828258 , ...,  -0.82176274,\n",
 752 |        "         -2.06324   ,  -0.14073044]], dtype=float32)"
 753 |       ]
 754 |      },
 755 |      "execution_count": 22,
 756 |      "metadata": {},
 757 |      "output_type": "execute_result"
 758 |     }
 759 |    ],
 760 |    "source": [
 761 |     "rna_pca = adata.obsm[\"X_pca\"]\n",
 762 |     "print(type(rna_pca))\n",
 763 |     "rna_pca"
 764 |    ]
 765 |   },
 766 |   {
 767 |    "cell_type": "markdown",
 768 |    "id": "6cdad910-a34c-49d2-bc03-87bfde9417c9",
 769 |    "metadata": {},
 770 |    "source": [
 771 |     "When alternative backends are being used, not all of the AnnData/MuData features can be supported, and many external tools might not work as expected as they anticipate NumPy/Pandas objects instead."
 772 |    ]
 773 |   },
 774 |   {
 775 |    "cell_type": "code",
 776 |    "execution_count": 23,
 777 |    "id": "b06a9071-0443-41e6-ac81-e3f0ce2653e9",
 778 |    "metadata": {},
 779 |    "outputs": [],
 780 |    "source": [
 781 |     "# Clean up\n",
 782 |     "adata.clear_cache()\n",
 783 |     "adata.close()\n",
 784 |     "del adata, rna_pca, obs"
 785 |    ]
 786 |   },
 787 |   {
 788 |    "cell_type": "markdown",
 789 |    "id": "6c474c9e-dfea-406c-ace6-461e8d5438a4",
 790 |    "metadata": {},
 791 |    "source": [
 792 |     " "
 793 |    ]
 794 |   },
 795 |   {
 796 |    "cell_type": "markdown",
 797 |    "id": "16f9b372-a089-4aed-b91e-b368a2ddc13e",
 798 |    "metadata": {
 799 |     "slideshow": {
 800 |      "slide_type": "slide"
 801 |     },
 802 |     "tags": []
 803 |    },
 804 |    "source": [
 805 |     "### Partial writing\n",
 806 |     "\n",
 807 |     "> [!NOTE]\n",
 808 |     "> This feature is experimental.\n",
 809 |     "\n",
 810 |     "While the main use of the shadows is to provide a low-memory read-only solution to scverse datasets, the ability to add new embeddings or other items to the file can greatly extend their usage patterns."
 811 |    ]
 812 |   },
 813 |   {
 814 |    "cell_type": "code",
 815 |    "execution_count": 24,
 816 |    "id": "02245bc0-cc92-4fe7-b665-a4e2f424b353",
 817 |    "metadata": {
 818 |     "slideshow": {
 819 |      "slide_type": "fragment"
 820 |     },
 821 |     "tags": []
 822 |    },
 823 |    "outputs": [],
 824 |    "source": [
 825 |     "adata = AnnDataShadow(file, format=\"zarr\")"
 826 |    ]
 827 |   },
 828 |   {
 829 |    "cell_type": "markdown",
 830 |    "id": "c7324f1c-c4a4-4561-9680-0ac5caacc79f",
 831 |    "metadata": {},
 832 |    "source": [
 833 |     "Add a new embedding to the in-memory object:"
 834 |    ]
 835 |   },
 836 |   {
 837 |    "cell_type": "code",
 838 |    "execution_count": 25,
 839 |    "id": "eb6f076f-0b26-428b-a824-a82b3d648c00",
 840 |    "metadata": {
 841 |     "slideshow": {
 842 |      "slide_type": "fragment"
 843 |     },
 844 |     "tags": []
 845 |    },
 846 |    "outputs": [
 847 |     {
 848 |      "data": {
 849 |       "text/plain": [
 850 |        "obsm:\tX_pcaᐁ, X_umap, X_pca_copy▲"
 851 |       ]
 852 |      },
 853 |      "execution_count": 25,
 854 |      "metadata": {},
 855 |      "output_type": "execute_result"
 856 |     }
 857 |    ],
 858 |    "source": [
 859 |     "adata.obsm[\"X_pca_copy\"] = adata.obsm[\"X_pca\"].copy()\n",
 860 |     "adata.obsm"
 861 |    ]
 862 |   },
 863 |   {
 864 |    "cell_type": "markdown",
 865 |    "id": "0a7a6374-cb13-4f3a-8f5b-e0c4b4f89363",
 866 |    "metadata": {
 867 |     "slideshow": {
 868 |      "slide_type": "subslide"
 869 |     },
 870 |     "tags": []
 871 |    },
 872 |    "source": [
 873 |     "To save such in-memory changes to the file, a family of methods is useful, including `.reopen()` and `.write()`. The `.write()` method will only work if the connection is not read-only (e.g. opened with `'r+'`); however, it is possible to reopen the file in another mode.\n",
 874 |     "\n",
 875 |     "Internally, `.write()` pushes (`._push_changes()`) the in-memory changes (marked with ▲ in the object representation above) to the file and provides meaningful error messages when the file is not open for writing.\n",
 876 |     "\n",
 877 |     "This separation of concerns makes it transparent when the data is modified, and this workflow can be recommended when only a small amount of data is added to the file. As the methods return the shadow itself, it is possible to chain them:"
 878 |    ]
 879 |   },
 880 |   {
 881 |    "cell_type": "code",
 882 |    "execution_count": 26,
 883 |    "id": "bcfa2982-4bf6-42eb-a604-d17d6496598b",
 884 |    "metadata": {
 885 |     "slideshow": {
 886 |      "slide_type": "fragment"
 887 |     },
 888 |     "tags": []
 889 |    },
 890 |    "outputs": [
 891 |     {
 892 |      "data": {
 893 |       "text/plain": [
 894 |        "obsm:\tX_pcaᐁ, X_pca_copy, X_umap"
 895 |       ]
 896 |      },
 897 |      "execution_count": 26,
 898 |      "metadata": {},
 899 |      "output_type": "execute_result"
 900 |     }
 901 |    ],
 902 |    "source": [
 903 |     "adata.reopen(mode='r+').write(clear_cache=True).reopen(mode='r');  # clear pushed elements from cache\n",
 904 |     "adata.obsm"
 905 |    ]
 906 |   },
 907 |   {
 908 |    "cell_type": "code",
 909 |    "execution_count": 27,
 910 |    "id": "1b794d6e-3cf2-4451-9a96-972aec79fc82",
 911 |    "metadata": {},
 912 |    "outputs": [],
 913 |    "source": [
 914 |     "adata.clear_cache()"
 915 |    ]
 916 |   },
 917 |   {
 918 |    "cell_type": "markdown",
 919 |    "id": "af3d311e-0199-4dcf-b5a5-15b8e446fd08",
 920 |    "metadata": {},
 921 |    "source": [
 922 |     " "
 923 |    ]
 924 |   },
 925 |   {
 926 |    "cell_type": "markdown",
 927 |    "id": "1b128596-dbb5-4469-a346-bd14cda79eb3",
 928 |    "metadata": {},
 929 |    "source": [
 930 |     "Default mode is read-only, and it protects the files from being modified while also allowing for multiple connections to the file:"
 931 |    ]
 932 |   },
 933 |   {
 934 |    "cell_type": "code",
 935 |    "execution_count": 28,
 936 |    "id": "8e817c96-ae69-49d7-a574-58481170f011",
 937 |    "metadata": {},
 938 |    "outputs": [
 939 |     {
 940 |      "name": "stdout",
 941 |      "output_type": "stream",
 942 |      "text": [
 943 |       "Not available for .write(): File is open in read-only mode. Changes can't be pushed. Reopen it with .reopen('r+') to enable writing.\n"
 944 |      ]
 945 |     }
 946 |    ],
 947 |    "source": [
 948 |     "try:\n",
 949 |     "    adata.write()\n",
 950 |     "except OSError as e:\n",
 951 |     "    print(\"Not available for .write():\", e)"
 952 |    ]
 953 |   },
 954 |   {
 955 |    "cell_type": "markdown",
 956 |    "id": "2e68cef8-871f-49be-8829-f59ff9d93f99",
 957 |    "metadata": {},
 958 |    "source": [
 959 |     " "
 960 |    ]
 961 |   },
 962 |   {
 963 |    "cell_type": "markdown",
 964 |    "id": "8b5c17b8-98d1-42b6-a008-b3c3b6fbfb79",
 965 |    "metadata": {},
 966 |    "source": [
 967 |     "> [!NOTE]\n",
 968 |     "> Partial writing is currently intended to add new elements to the dataset on disk; it does not allow deleting or modifying existing elements."
 969 |    ]
 970 |   },
 971 |   {
 972 |    "cell_type": "markdown",
 973 |    "id": "e841d95f-3f46-4902-b18f-eb4c7080e58d",
 974 |    "metadata": {},
 975 |    "source": [
 976 |     " "
 977 |    ]
 978 |   },
 979 |   {
 980 |    "cell_type": "markdown",
 981 |    "id": "e0c11265-8429-4a34-a552-759b1f07a0bc",
 982 |    "metadata": {
 983 |     "tags": []
 984 |    },
 985 |    "source": [
 986 |     "### Views\n",
 987 |     "\n",
 988 |     "Views for shadow objects are conceptually similar to [views in AnnData/MuData](https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.is_view.html): they provide a view into an existing object without creating its copy.\n",
 989 |     "\n",
 990 |     "As shadow objects inherently operate on the file they are connected to, their views behave slightly differently. Creating a view creates a new connection to the file and returns a new shadow object, which is aware of the part of the data (e.g. which cells) it is supposed to provide a view for."
 991 |    ]
 992 |   },
 993 |   {
 994 |    "cell_type": "code",
 995 |    "execution_count": 29,
 996 |    "id": "c3ea6e33-128a-48fd-a421-0c9f5801e47d",
 997 |    "metadata": {},
 998 |    "outputs": [
 999 |     {
1000 |      "data": {
1001 |       "text/plain": [
1002 |        "View of AnnData Shadow object with n_obs × n_vars = 100 × 29 (original 411 × 29)\n",
1003 |        "  X  \n",
1004 |        "  layers:\tcounts\n",
1005 |        "  obs:\t_index\n",
1006 |        "  var:\t_index, feature_types, gene_ids, highly_variable\n",
1007 |        "  obsm:\tX_pca, X_pca_copy, X_umap\n",
1008 |        "  varm:\tPCs\n",
1009 |        "  obsp:\tconnectivities, distances\n",
1010 |        "  uns:\tneighbors, pca, umap"
1011 |       ]
1012 |      },
1013 |      "execution_count": 29,
1014 |      "metadata": {},
1015 |      "output_type": "execute_result"
1016 |     }
1017 |    ],
1018 |    "source": [
1019 |     "head = 100\n",
1020 |     "head_view = adata[0:head]\n",
1021 |     "head_view"
1022 |    ]
1023 |   },
1024 |   {
1025 |    "cell_type": "markdown",
1026 |    "id": "2f115798-96d2-4660-889d-b3e9a2d154c3",
1027 |    "metadata": {},
1028 |    "source": [
1029 |     "Individual elements of an AnnData Shadow view are sliced accordingly:"
1030 |    ]
1031 |   },
1032 |   {
1033 |    "cell_type": "code",
1034 |    "execution_count": 30,
1035 |    "id": "13f4b379-e26d-4677-9de3-42b3754af15d",
1036 |    "metadata": {},
1037 |    "outputs": [
1038 |     {
1039 |      "data": {
1040 |       "text/plain": [
1041 |        "(100, 31)"
1042 |       ]
1043 |      },
1044 |      "execution_count": 30,
1045 |      "metadata": {},
1046 |      "output_type": "execute_result"
1047 |     }
1048 |    ],
1049 |    "source": [
1050 |     "head_view.obsm[\"X_pca\"].shape"
1051 |    ]
1052 |   },
1053 |   {
1054 |    "cell_type": "code",
1055 |    "execution_count": 31,
1056 |    "id": "585fcbc6-9d5f-406f-99e1-6b91117e2bac",
1057 |    "metadata": {},
1058 |    "outputs": [
1059 |     {
1060 |      "data": {
1061 |       "text/plain": [
1062 |        "obsm:\tX_pcaᐁ, X_pca_copy, X_umap"
1063 |       ]
1064 |      },
1065 |      "execution_count": 31,
1066 |      "metadata": {},
1067 |      "output_type": "execute_result"
1068 |     }
1069 |    ],
1070 |    "source": [
1071 |     "head_view.obsm"
1072 |    ]
1073 |   },
1074 |   {
1075 |    "cell_type": "code",
1076 |    "execution_count": 32,
1077 |    "id": "bfa15c8a-f4a8-4907-939f-5cb80ef50abc",
1078 |    "metadata": {},
1079 |    "outputs": [
1080 |     {
1081 |      "data": {
1082 |       "text/plain": [
1083 |        "View of AnnData Shadow object with n_obs × n_vars = 2 × 3 (original 411 × 29)\n",
1084 |        "  X  \n",
1085 |        "  layers:\tcounts\n",
1086 |        "  obs:\t_index\n",
1087 |        "  var:\t_index, feature_types, gene_ids, highly_variable\n",
1088 |        "  obsm:\tX_pca, X_pca_copy, X_umap\n",
1089 |        "  varm:\tPCs\n",
1090 |        "  obsp:\tconnectivities, distances\n",
1091 |        "  uns:\tneighbors, pca, umap"
1092 |       ]
1093 |      },
1094 |      "execution_count": 32,
1095 |      "metadata": {},
1096 |      "output_type": "execute_result"
1097 |     }
1098 |    ],
1099 |    "source": [
1100 |     "nested_view = head_view[:2,-3:]\n",
1101 |     "nested_view"
1102 |    ]
1103 |   },
1104 |   {
1105 |    "cell_type": "markdown",
1106 |    "id": "6e3ce502-40e6-4b40-b78e-cf86e527bf18",
1107 |    "metadata": {},
1108 |    "source": [
1109 |     "Getting attributes from views is no different than for shadow objects:"
1110 |    ]
1111 |   },
1112 |   {
1113 |    "cell_type": "code",
1114 |    "execution_count": 33,
1115 |    "id": "216d5cd3-5457-4145-952b-61bed2be9f7d",
1116 |    "metadata": {},
1117 |    "outputs": [
1118 |     {
1119 |      "data": {
1120 |       "text/html": [
1121 |        "<div>\n",
1122 |        "<style scoped>\n",
1123 |        "    .dataframe tbody tr th:only-of-type {\n",
1124 |        "        vertical-align: middle;\n",
1125 |        "    }\n",
1126 |        "\n",
1127 |        "    .dataframe tbody tr th {\n",
1128 |        "        vertical-align: top;\n",
1129 |        "    }\n",
1130 |        "\n",
1131 |        "    .dataframe thead th {\n",
1132 |        "        text-align: right;\n",
1133 |        "    }\n",
1134 |        "</style>\n",
1135 |        "<table border=\"1\" class=\"dataframe\">\n",
1136 |        "  <thead>\n",
1137 |        "    <tr style=\"text-align: right;\">\n",
1138 |        "      <th></th>\n",
1139 |        "    </tr>\n",
1140 |        "  </thead>\n",
1141 |        "  <tbody>\n",
1142 |        "    <tr>\n",
1143 |        "      <th>CAGCCAGGTCTCGACG-1</th>\n",
1144 |        "    </tr>\n",
1145 |        "    <tr>\n",
1146 |        "      <th>TTCTTCCTCTCGGTAA-1</th>\n",
1147 |        "    </tr>\n",
1148 |        "  </tbody>\n",
1149 |        "</table>\n",
1150 |        "</div>"
1151 |       ],
1152 |       "text/plain": [
1153 |        "Empty DataFrame\n",
1154 |        "Columns: []\n",
1155 |        "Index: [CAGCCAGGTCTCGACG-1, TTCTTCCTCTCGGTAA-1]"
1156 |       ]
1157 |      },
1158 |      "execution_count": 33,
1159 |      "metadata": {},
1160 |      "output_type": "execute_result"
1161 |     }
1162 |    ],
1163 |    "source": [
1164 |     "nested_view.obs"
1165 |    ]
1166 |   },
1167 |   {
1168 |    "cell_type": "markdown",
1169 |    "id": "9dbacf34-247e-4ac9-995b-f39656491973",
1170 |    "metadata": {},
1171 |    "source": [
1172 |     "... as they are shadow objects themselves:"
1173 |    ]
1174 |   },
1175 |   {
1176 |    "cell_type": "code",
1177 |    "execution_count": 34,
1178 |    "id": "c0921236-cc65-43fc-a9a1-557d4ab0a1c6",
1179 |    "metadata": {},
1180 |    "outputs": [
1181 |     {
1182 |      "data": {
1183 |       "text/plain": [
1184 |        "shadows.anndatashadow.AnnDataShadow"
1185 |       ]
1186 |      },
1187 |      "execution_count": 34,
1188 |      "metadata": {},
1189 |      "output_type": "execute_result"
1190 |     }
1191 |    ],
1192 |    "source": [
1193 |     "type(nested_view)"
1194 |    ]
1195 |   },
1196 |   {
1197 |    "cell_type": "code",
1198 |    "execution_count": 35,
1199 |    "id": "e70179b3-da72-4155-bbf9-b6f9d1fa8d47",
1200 |    "metadata": {},
1201 |    "outputs": [],
1202 |    "source": [
1203 |     "# Clean up\n",
1204 |     "nested_view.close()\n",
1205 |     "del nested_view\n",
1206 |     "\n",
1207 |     "head_view.close()\n",
1208 |     "del head_view"
1209 |    ]
1210 |   },
1211 |   {
1212 |    "cell_type": "markdown",
1213 |    "id": "ed55ed1b-1d8e-4250-9352-75f59cd5551a",
1214 |    "metadata": {},
1215 |    "source": [
1216 |     " "
1217 |    ]
1218 |   },
1219 |   {
1220 |    "cell_type": "markdown",
1221 |    "id": "ab4a745e-df8c-46f5-9c3d-d2d3678fff5f",
1222 |    "metadata": {
1223 |     "slideshow": {
1224 |      "slide_type": "slide"
1225 |     },
1226 |     "tags": []
1227 |    },
1228 |    "source": [
1229 |     "### Per-feature access to datasets on disk\n",
1230 |     "\n",
1231 |     "This is currently not possible as caching works at the level of individual HDF5 datasets.\n",
1232 |     "\n",
1233 |     "Views may read only the necessary parts of the arrays into memory; however, this behaviour is currently not universal.\n",
1234 |     "\n",
1235 |     "E.g.:"
1236 |    ]
1237 |   },
1238 |   {
1239 |    "cell_type": "code",
1240 |    "execution_count": 36,
1241 |    "id": "ff5c4052-0929-43c3-947f-6de72b78d69e",
1242 |    "metadata": {},
1243 |    "outputs": [
1244 |     {
1245 |      "data": {
1246 |       "text/plain": [
1247 |        "(10, 29)"
1248 |       ]
1249 |      },
1250 |      "execution_count": 36,
1251 |      "metadata": {},
1252 |      "output_type": "execute_result"
1253 |     }
1254 |    ],
1255 |    "source": [
1256 |     "adata_subset = adata[:10,:100]\n",
1257 |     "adata_subset.X.shape"
1258 |    ]
1259 |   },
1260 |   {
1261 |    "cell_type": "code",
1262 |    "execution_count": 37,
1263 |    "id": "e410e6e1-34c8-48f5-88b5-a45a0545e342",
1264 |    "metadata": {},
1265 |    "outputs": [
1266 |     {
1267 |      "data": {
1268 |       "text/plain": [
1269 |        "View of AnnData Shadow object with n_obs × n_vars = 10 × 29 (original 411 × 29)\n",
1270 |        "  X ᐁ \n",
1271 |        "  layers:\tcounts\n",
1272 |        "  obs:\t_index\n",
1273 |        "  var:\t_index, feature_types, gene_ids, highly_variable\n",
1274 |        "  obsm:\tX_pca, X_pca_copy, X_umap\n",
1275 |        "  varm:\tPCs\n",
1276 |        "  obsp:\tconnectivities, distances\n",
1277 |        "  uns:\tneighbors, pca, umap"
1278 |       ]
1279 |      },
1280 |      "execution_count": 37,
1281 |      "metadata": {},
1282 |      "output_type": "execute_result"
1283 |     }
1284 |    ],
1285 |    "source": [
1286 |     "adata_subset"
1287 |    ]
1288 |   },
1289 |   {
1290 |    "cell_type": "code",
1291 |    "execution_count": 38,
1292 |    "id": "bf2a317a-ca82-4a73-b0ef-07d0cfac2128",
1293 |    "metadata": {},
1294 |    "outputs": [],
1295 |    "source": [
1296 |     "# Clean up\n",
1297 |     "adata.close()\n",
1298 |     "adata_subset.close()\n",
1299 |     "del adata, adata_subset"
1300 |    ]
1301 |   },
1302 |   {
1303 |    "cell_type": "markdown",
1304 |    "id": "bb50af6a-4ee2-4a8f-b022-9b0daa63e81e",
1305 |    "metadata": {},
1306 |    "source": [
1307 |     " "
1308 |    ]
1309 |   },
1310 |   {
1311 |    "cell_type": "markdown",
1312 |    "id": "fec4c262-5bbf-4393-b082-f208f7997a7a",
1313 |    "metadata": {
1314 |     "slideshow": {
1315 |      "slide_type": "slide"
1316 |     },
1317 |     "tags": []
1318 |    },
1319 |    "source": [
1320 |     "---\n",
1321 |     "\n",
1322 |     "In order to return the data to its original state, let's manually remove the items we wrote to the file:"
1323 |    ]
1324 |   },
1325 |   {
1326 |    "cell_type": "code",
1327 |    "execution_count": 39,
1328 |    "id": "46550ff4-39e1-40e6-80d0-4fd45d99af84",
1329 |    "metadata": {
1330 |     "slideshow": {
1331 |      "slide_type": "fragment"
1332 |     },
1333 |     "tags": []
1334 |    },
1335 |    "outputs": [],
1336 |    "source": [
1337 |     "import zarr\n",
1338 |     "\n",
1339 |     "f = zarr.open(file, \"a\")\n",
1340 |     "#                    ^\n",
1341 |     "#        ____________|\n",
1342 |     "# if this works,     \n",
1343 |     "# no dangling read-only connections!\n",
1344 |     "# \n",
1345 |     "\n",
1346 |     "del f[\"obsm/X_pca_copy\"]\n",
1347 |     "f.store.close()"
1348 |    ]
1349 |   },
1350 |   {
1351 |    "cell_type": "markdown",
1352 |    "id": "6bc6a57c-39d0-45ad-be01-8cadde33da83",
1353 |    "metadata": {},
1354 |    "source": [
1355 |     " "
1356 |    ]
1357 |   },
1358 |   {
1359 |    "cell_type": "markdown",
1360 |    "id": "752bd981-1cbd-43ec-b707-9308afb7e55f",
1361 |    "metadata": {},
1362 |    "source": [
1363 |     " "
1364 |    ]
1365 |   }
1366 |  ],
1367 |  "metadata": {
1368 |   "kernelspec": {
1369 |    "display_name": "Python 3 (ipykernel)",
1370 |    "language": "python",
1371 |    "name": "python3"
1372 |   },
1373 |   "language_info": {
1374 |    "codemirror_mode": {
1375 |     "name": "ipython",
1376 |     "version": 3
1377 |    },
1378 |    "file_extension": ".py",
1379 |    "mimetype": "text/x-python",
1380 |    "name": "python",
1381 |    "nbconvert_exporter": "python",
1382 |    "pygments_lexer": "ipython3",
1383 |    "version": "3.10.11"
1384 |   }
1385 |  },
1386 |  "nbformat": 4,
1387 |  "nbformat_minor": 5
1388 | }
1389 | 


--------------------------------------------------------------------------------
/docs/examples/shadows-features.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "b6eae7bd-1091-480f-8c95-551eefe5c53c",
   6 |    "metadata": {
   7 |     "slideshow": {
   8 |      "slide_type": "slide"
   9 |     },
  10 |     "tags": []
  11 |    },
  12 |    "source": [
  13 |     "# Shadows features"
  14 |    ]
  15 |   },
  16 |   {
  17 |    "cell_type": "code",
  18 |    "execution_count": 1,
  19 |    "id": "b17e6265-4c91-4d30-a232-20e6a627c07d",
  20 |    "metadata": {},
  21 |    "outputs": [],
  22 |    "source": [
  23 |     "%load_ext autoreload\n",
  24 |     "%autoreload 2"
  25 |    ]
  26 |   },
  27 |   {
  28 |    "cell_type": "code",
  29 |    "execution_count": 2,
  30 |    "id": "4aa723fb-6a8d-4d43-913c-a31f2316b02f",
  31 |    "metadata": {},
  32 |    "outputs": [],
  33 |    "source": [
  34 |     "import os\n",
  35 |     "os.chdir(\"../../\")"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "code",
  40 |    "execution_count": 3,
  41 |    "id": "f1c3418a-3a90-41b0-baa6-c6ad340dc75f",
  42 |    "metadata": {},
  43 |    "outputs": [],
  44 |    "source": [
  45 |     "from pathlib import Path\n",
  46 |     "data = Path(\"data/\")"
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "markdown",
  51 |    "id": "b9e3bb66-3928-45f4-ba98-fded629de018",
  52 |    "metadata": {},
  53 |    "source": [
  54 |     " "
  55 |    ]
  56 |   },
  57 |   {
  58 |    "cell_type": "markdown",
  59 |    "id": "934b8d69-b812-422f-b718-080bb8508348",
  60 |    "metadata": {
  61 |     "slideshow": {
  62 |      "slide_type": "slide"
  63 |     },
  64 |     "tags": []
  65 |    },
  66 |    "source": [
  67 |     "## Shadow objects and their features\n",
  68 |     "\n",
  69 |     "While shadow objects provide a convenient read-only drop-in replacement for AnnData/MuData objects when needed, they also have additional features that can help users make the most of *shadows*."
  70 |    ]
  71 |   },
  72 |   {
  73 |    "cell_type": "markdown",
  74 |    "id": "65462d07-01b0-4395-8891-eda01e472f38",
  75 |    "metadata": {},
  76 |    "source": [
  77 |     " "
  78 |    ]
  79 |   },
  80 |   {
  81 |    "cell_type": "markdown",
  82 |    "id": "4a38075c-8da2-4193-af1a-c52e18176f92",
  83 |    "metadata": {
  84 |     "slideshow": {
  85 |      "slide_type": "fragment"
  86 |     },
  87 |     "tags": []
  88 |    },
  89 |    "source": [
  90 |     "Import classes for these shadow objects:"
  91 |    ]
  92 |   },
  93 |   {
  94 |    "cell_type": "code",
  95 |    "execution_count": 4,
  96 |    "id": "079454ed-10dc-47ef-9de2-ef70f95dbed6",
  97 |    "metadata": {
  98 |     "slideshow": {
  99 |      "slide_type": "fragment"
 100 |     },
 101 |     "tags": []
 102 |    },
 103 |    "outputs": [],
 104 |    "source": [
 105 |     "from shadows import AnnDataShadow, MuDataShadow"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "markdown",
 110 |    "id": "564f7b2b-063d-4f0e-8333-c178565ee2d2",
 111 |    "metadata": {},
 112 |    "source": [
 113 |     " "
 114 |    ]
 115 |   },
 116 |   {
 117 |    "cell_type": "markdown",
 118 |    "id": "6b819452-470f-47b7-8fa0-0c8304fd557c",
 119 |    "metadata": {
 120 |     "slideshow": {
 121 |      "slide_type": "fragment"
 122 |     },
 123 |     "tags": []
 124 |    },
 125 |    "source": [
 126 |     "Initialise a multimodal shadow object:"
 127 |    ]
 128 |   },
 129 |   {
 130 |    "cell_type": "code",
 131 |    "execution_count": 5,
 132 |    "id": "3ff358c0-2c77-460a-97a9-398f615a0e17",
 133 |    "metadata": {
 134 |     "slideshow": {
 135 |      "slide_type": "fragment"
 136 |     },
 137 |     "tags": []
 138 |    },
 139 |    "outputs": [],
 140 |    "source": [
 141 |     "file = data / \"pbmc5k_citeseq/pbmc5k_citeseq_processed.h5mu\"\n",
 142 |     "mdata = MuDataShadow(file)"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "markdown",
 147 |    "id": "1747c671-ffc2-4d4d-8a04-7dc44432b2fb",
 148 |    "metadata": {},
 149 |    "source": [
 150 |     " "
 151 |    ]
 152 |   },
 153 |   {
 154 |    "cell_type": "markdown",
 155 |    "id": "b8ae6d73-9a74-48ed-9d41-7e92bfee8f71",
 156 |    "metadata": {
 157 |     "slideshow": {
 158 |      "slide_type": "slide"
 159 |     },
 160 |     "tags": []
 161 |    },
 162 |    "source": [
 163 |     "### File\n",
 164 |     "\n",
 165 |     "The file connection that the shadow is using can be accessed via the `.file` attribute:"
 166 |    ]
 167 |   },
 168 |   {
 169 |    "cell_type": "code",
 170 |    "execution_count": 6,
 171 |    "id": "33c47ede-e566-43ac-8596-470263d21b3a",
 172 |    "metadata": {},
 173 |    "outputs": [
 174 |     {
 175 |      "data": {
 176 |       "text/plain": [
 177 |        "<HDF5 file \"pbmc5k_citeseq_processed.h5mu\" (mode r)>"
 178 |       ]
 179 |      },
 180 |      "execution_count": 6,
 181 |      "metadata": {},
 182 |      "output_type": "execute_result"
 183 |     }
 184 |    ],
 185 |    "source": [
 186 |     "mdata.file"
 187 |    ]
 188 |   },
 189 |   {
 190 |    "cell_type": "markdown",
 191 |    "id": "a43127df-c330-4104-bbf6-399c7392c373",
 192 |    "metadata": {},
 193 |    "source": [
 194 |     "The name of the file can then be accessed via its `.filename` attribute:"
 195 |    ]
 196 |   },
 197 |   {
 198 |    "cell_type": "code",
 199 |    "execution_count": 7,
 200 |    "id": "a7d549f2-ec47-4744-a744-e2f7884638d7",
 201 |    "metadata": {},
 202 |    "outputs": [
 203 |     {
 204 |      "data": {
 205 |       "text/plain": [
 206 |        "'data/pbmc5k_citeseq/pbmc5k_citeseq_processed.h5mu'"
 207 |       ]
 208 |      },
 209 |      "execution_count": 7,
 210 |      "metadata": {},
 211 |      "output_type": "execute_result"
 212 |     }
 213 |    ],
 214 |    "source": [
 215 |     "mdata.file.filename"
 216 |    ]
 217 |   },
 218 |   {
 219 |    "cell_type": "markdown",
 220 |    "id": "0574136f-7aa4-4a1e-9312-eee5fc9c6744",
 221 |    "metadata": {
 222 |     "slideshow": {
 223 |      "slide_type": "subslide"
 224 |     },
 225 |     "tags": []
 226 |    },
 227 |    "source": [
 228 |     "The connection stays open until `mdata.close()` is called"
 229 |    ]
 230 |   },
 231 |   {
 232 |    "cell_type": "code",
 233 |    "execution_count": 8,
 234 |    "id": "1c1f47db-f933-4999-8fae-cb088b56dab5",
 235 |    "metadata": {},
 236 |    "outputs": [],
 237 |    "source": [
 238 |     "mdata.close()"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "markdown",
 243 |    "id": "a87e0e96-86c2-4623-b239-892e92b04a5a",
 244 |    "metadata": {
 245 |     "slideshow": {
 246 |      "slide_type": "fragment"
 247 |     },
 248 |     "tags": []
 249 |    },
 250 |    "source": [
 251 |     "... or until the file has to be re-opened for modification (see below)."
 252 |    ]
 253 |   },
 254 |   {
 255 |    "cell_type": "markdown",
 256 |    "id": "5a064df4-b533-4124-a85a-f7b20fcc1091",
 257 |    "metadata": {},
 258 |    "source": [
 259 |     " "
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "markdown",
 264 |    "id": "9beb85a9-e226-4b9a-949b-2351432558f7",
 265 |    "metadata": {
 266 |     "slideshow": {
 267 |      "slide_type": "slide"
 268 |     },
 269 |     "tags": []
 270 |    },
 271 |    "source": [
 272 |     "### Permissions\n",
 273 |     "\n",
 274 |     "We can open HDF5 files in different modes including purely read-only (`'r'`) and read/write (`'r+'`). The mode can be provided to the constructor:"
 275 |    ]
 276 |   },
 277 |   {
 278 |    "cell_type": "code",
 279 |    "execution_count": 9,
 280 |    "id": "9f297beb-97b5-46ad-97b9-2dedc5c40b53",
 281 |    "metadata": {},
 282 |    "outputs": [
 283 |     {
 284 |      "data": {
 285 |       "text/plain": [
 286 |        "'r'"
 287 |       ]
 288 |      },
 289 |      "execution_count": 9,
 290 |      "metadata": {},
 291 |      "output_type": "execute_result"
 292 |     }
 293 |    ],
 294 |    "source": [
 295 |     "mdata = MuDataShadow(file, mode=\"r\")\n",
 296 |     "mdata.file.mode"
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "markdown",
 301 |    "id": "fc9da2a5-402f-4fe8-83a2-0a5f06a84d7c",
 302 |    "metadata": {},
 303 |    "source": [
 304 |     "Let's add some data to the in-memory shadow object:"
 305 |    ]
 306 |   },
 307 |   {
 308 |    "cell_type": "code",
 309 |    "execution_count": 10,
 310 |    "id": "21f291bd-7c5d-4ef3-a034-c0030dabdb60",
 311 |    "metadata": {
 312 |     "slideshow": {
 313 |      "slide_type": "fragment"
 314 |     },
 315 |     "tags": []
 316 |    },
 317 |    "outputs": [],
 318 |    "source": [
 319 |     "mdata[\"rna\"].obsm[\"X_pca_copy\"] = mdata[\"rna\"].obsm[\"X_pca\"].copy()"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "markdown",
 324 |    "id": "b03108f5-0e8a-4646-af12-ef5fc934885b",
 325 |    "metadata": {
 326 |     "slideshow": {
 327 |      "slide_type": "subslide"
 328 |     },
 329 |     "tags": []
 330 |    },
 331 |    "source": [
 332 |     "We can also conveniently close and reopen the connection for a given in-memory shadow object:"
 333 |    ]
 334 |   },
 335 |   {
 336 |    "cell_type": "code",
 337 |    "execution_count": 11,
 338 |    "id": "e8ddb228-74b4-4f8e-8cdc-c84479f38d2d",
 339 |    "metadata": {},
 340 |    "outputs": [
 341 |     {
 342 |      "data": {
 343 |       "text/plain": [
 344 |        "'r+'"
 345 |       ]
 346 |      },
 347 |      "execution_count": 11,
 348 |      "metadata": {},
 349 |      "output_type": "execute_result"
 350 |     }
 351 |    ],
 352 |    "source": [
 353 |     "mdata.reopen(mode=\"r+\")\n",
 354 |     "mdata.file.mode"
 355 |    ]
 356 |   },
 357 |   {
 358 |    "cell_type": "markdown",
 359 |    "id": "48157734-adc0-4e7d-8157-64e1201b6fba",
 360 |    "metadata": {},
 361 |    "source": [
 362 |     "This way all the newly added elements are still available in memory:"
 363 |    ]
 364 |   },
 365 |   {
 366 |    "cell_type": "code",
 367 |    "execution_count": 12,
 368 |    "id": "043428b5-dc58-4d0c-b653-e1d8451b39f9",
 369 |    "metadata": {},
 370 |    "outputs": [
 371 |     {
 372 |      "data": {
 373 |       "text/plain": [
 374 |        "obsm:\tX_pcaᐁ, X_umap, X_pca_copy▲"
 375 |       ]
 376 |      },
 377 |      "execution_count": 12,
 378 |      "metadata": {},
 379 |      "output_type": "execute_result"
 380 |     }
 381 |    ],
 382 |    "source": [
 383 |     "mdata[\"rna\"].obsm"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": 13,
 389 |    "id": "50aba055-06e2-490d-a1a6-3307ef7ac6d0",
 390 |    "metadata": {
 391 |     "slideshow": {
 392 |      "slide_type": "fragment"
 393 |     },
 394 |     "tags": []
 395 |    },
 396 |    "outputs": [],
 397 |    "source": [
 398 |     "# Clean up\n",
 399 |     "mdata.close()\n",
 400 |     "del mdata"
 401 |    ]
 402 |   },
 403 |   {
 404 |    "cell_type": "markdown",
 405 |    "id": "991ccc6a-f182-4689-802d-a9ae70a490e4",
 406 |    "metadata": {},
 407 |    "source": [
 408 |     " "
 409 |    ]
 410 |   },
 411 |   {
 412 |    "cell_type": "markdown",
 413 |    "id": "2dbc52ad-6010-416f-810b-c60e5546ba7b",
 414 |    "metadata": {
 415 |     "slideshow": {
 416 |      "slide_type": "slide"
 417 |     },
 418 |     "tags": []
 419 |    },
 420 |    "source": [
 421 |     "### Individual modality access\n",
 422 |     "\n",
 423 |     "Individual modalities stored in the .h5mu files can be accessed as part of the `MuDataShadow` object:"
 424 |    ]
 425 |   },
 426 |   {
 427 |    "cell_type": "code",
 428 |    "execution_count": 14,
 429 |    "id": "d5ea1511-6f1b-4c51-9ec7-14365dc8d391",
 430 |    "metadata": {
 431 |     "slideshow": {
 432 |      "slide_type": "fragment"
 433 |     },
 434 |     "tags": []
 435 |    },
 436 |    "outputs": [
 437 |     {
 438 |      "data": {
 439 |       "text/plain": [
 440 |        "AnnData Shadow object with n_obs × n_vars = 3891 × 17806\n",
 441 |        "  X  \n",
 442 |        "  raw:\tX, var, varm\n",
 443 |        "  obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n",
 444 |        "  var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n",
 445 |        "  obsm:\tX_pca, X_umap\n",
 446 |        "  varm:\tPCs\n",
 447 |        "  obsp:\tconnectivities, distances\n",
 448 |        "  uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap"
 449 |       ]
 450 |      },
 451 |      "execution_count": 14,
 452 |      "metadata": {},
 453 |      "output_type": "execute_result"
 454 |     }
 455 |    ],
 456 |    "source": [
 457 |     "mdata = MuDataShadow(file, mode=\"r\")\n",
 458 |     "mdata[\"rna\"]"
 459 |    ]
 460 |   },
 461 |   {
 462 |    "cell_type": "markdown",
 463 |    "id": "60d08ba8-c7c4-4d13-a5fe-9f39c56dd86a",
 464 |    "metadata": {
 465 |     "slideshow": {
 466 |      "slide_type": "subslide"
 467 |     },
 468 |     "tags": []
 469 |    },
 470 |    "source": [
 471 |     "Moreover, one can also create a direct connection to a specific modality:"
 472 |    ]
 473 |   },
 474 |   {
 475 |    "cell_type": "code",
 476 |    "execution_count": 15,
 477 |    "id": "a853493d-5432-438f-bd8f-837cb63d151a",
 478 |    "metadata": {},
 479 |    "outputs": [
 480 |     {
 481 |      "data": {
 482 |       "text/plain": [
 483 |        "AnnData Shadow object with n_obs × n_vars = 3891 × 17806\n",
 484 |        "  X  \n",
 485 |        "  raw:\tX, var, varm\n",
 486 |        "  obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n",
 487 |        "  var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n",
 488 |        "  obsm:\tX_pca, X_umap\n",
 489 |        "  varm:\tPCs\n",
 490 |        "  obsp:\tconnectivities, distances\n",
 491 |        "  uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap"
 492 |       ]
 493 |      },
 494 |      "execution_count": 15,
 495 |      "metadata": {},
 496 |      "output_type": "execute_result"
 497 |     }
 498 |    ],
 499 |    "source": [
 500 |     "mdata.close()\n",
 501 |     "del mdata\n",
 502 |     "\n",
 503 |     "adata = AnnDataShadow(file / \"mod/rna\")\n",
 504 |     "adata"
 505 |    ]
 506 |   },
 507 |   {
 508 |    "cell_type": "code",
 509 |    "execution_count": 16,
 510 |    "id": "946d03a9-d0d1-4ebc-ae29-92d795f08073",
 511 |    "metadata": {
 512 |     "slideshow": {
 513 |      "slide_type": "fragment"
 514 |     },
 515 |     "tags": []
 516 |    },
 517 |    "outputs": [],
 518 |    "source": [
 519 |     "# Clean up\n",
 520 |     "adata.close()\n",
 521 |     "del adata"
 522 |    ]
 523 |   },
 524 |   {
 525 |    "cell_type": "markdown",
 526 |    "id": "14b8ad11-adad-4ea8-9146-3dd7cd9bd415",
 527 |    "metadata": {},
 528 |    "source": [
 529 |     " "
 530 |    ]
 531 |   },
 532 |   {
 533 |    "cell_type": "markdown",
 534 |    "id": "d3ae2a84-34fc-48b9-926e-a5d5f57e4e73",
 535 |    "metadata": {
 536 |     "slideshow": {
 537 |      "slide_type": "slide"
 538 |     },
 539 |     "tags": []
 540 |    },
 541 |    "source": [
 542 |     "### Class identity\n",
 543 |     "\n",
 544 |     "Many tools in the ecosystem including scanpy frequently check if the input object is an AnnData. For instance, [in `sc.pp.highly_variable_genes`](https://github.com/scverse/scanpy/blob/master/scanpy/preprocessing/_highly_variable_genes.py) it reads:\n",
 545 |     "\n",
 546 |     "```py\n",
 547 |     "if not isinstance(adata, AnnData):\n",
 548 |     "    raise ValueError(\n",
 549 |     "        '`pp.highly_variable_genes` expects an `AnnData` argument, '\n",
 550 |     "        'pass `inplace=False` if you want to return a `pd.DataFrame`.'\n",
 551 |     "    )\n",
 552 |     "```\n",
 553 |     "\n",
 554 |     "In order for shadow objects to be accepted by such functions, they mock their class identity:"
 555 |    ]
 556 |   },
 557 |   {
 558 |    "cell_type": "code",
 559 |    "execution_count": 17,
 560 |    "id": "f10b98ff-920f-4d46-924f-1cf3074236db",
 561 |    "metadata": {
 562 |     "slideshow": {
 563 |      "slide_type": "subslide"
 564 |     },
 565 |     "tags": []
 566 |    },
 567 |    "outputs": [],
 568 |    "source": [
 569 |     "mdata = MuDataShadow(file, mode=\"r\")\n",
 570 |     "\n",
 571 |     "from mudata import MuData\n",
 572 |     "assert isinstance(mdata, MuData), \"mdata is not a valid MuData object\""
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "code",
 577 |    "execution_count": 18,
 578 |    "id": "7796c156-b84e-46f9-90e4-fe18ad6b91d8",
 579 |    "metadata": {
 580 |     "slideshow": {
 581 |      "slide_type": "fragment"
 582 |     },
 583 |     "tags": []
 584 |    },
 585 |    "outputs": [],
 586 |    "source": [
 587 |     "from anndata import AnnData\n",
 588 |     "assert isinstance(mdata[\"rna\"], AnnData), \"mdata['rna'] is not a valid AnnData object\""
 589 |    ]
 590 |   },
 591 |   {
 592 |    "cell_type": "markdown",
 593 |    "id": "f8e2d4a9-eba2-45c0-88f6-35f69e7d0249",
 594 |    "metadata": {
 595 |     "slideshow": {
 596 |      "slide_type": "subslide"
 597 |     },
 598 |     "tags": []
 599 |    },
 600 |    "source": [
 601 |     "Checking for shadow identity still works:"
 602 |    ]
 603 |   },
 604 |   {
 605 |    "cell_type": "code",
 606 |    "execution_count": 19,
 607 |    "id": "51cd4264-e9d0-4e2c-a536-835a0d3a699d",
 608 |    "metadata": {},
 609 |    "outputs": [
 610 |     {
 611 |      "data": {
 612 |       "text/plain": [
 613 |        "True"
 614 |       ]
 615 |      },
 616 |      "execution_count": 19,
 617 |      "metadata": {},
 618 |      "output_type": "execute_result"
 619 |     }
 620 |    ],
 621 |    "source": [
 622 |     "isinstance(mdata, MuDataShadow)"
 623 |    ]
 624 |   },
 625 |   {
 626 |    "cell_type": "code",
 627 |    "execution_count": 20,
 628 |    "id": "efadd4ba-219c-4c84-a1eb-36baf135c82d",
 629 |    "metadata": {},
 630 |    "outputs": [
 631 |     {
 632 |      "data": {
 633 |       "text/plain": [
 634 |        "True"
 635 |       ]
 636 |      },
 637 |      "execution_count": 20,
 638 |      "metadata": {},
 639 |      "output_type": "execute_result"
 640 |     }
 641 |    ],
 642 |    "source": [
 643 |     "isinstance(mdata[\"rna\"], AnnDataShadow)"
 644 |    ]
 645 |   },
 646 |   {
 647 |    "cell_type": "code",
 648 |    "execution_count": 21,
 649 |    "id": "a32515de-7866-4229-a639-0818a0dbea3b",
 650 |    "metadata": {
 651 |     "slideshow": {
 652 |      "slide_type": "fragment"
 653 |     },
 654 |     "tags": []
 655 |    },
 656 |    "outputs": [],
 657 |    "source": [
 658 |     "mdata.close()"
 659 |    ]
 660 |   },
 661 |   {
 662 |    "cell_type": "markdown",
 663 |    "id": "8d4e683f-0a0b-426c-8cf7-5f5529a844d2",
 664 |    "metadata": {},
 665 |    "source": [
 666 |     " "
 667 |    ]
 668 |   },
 669 |   {
 670 |    "cell_type": "markdown",
 671 |    "id": "c29f18b0-717b-4821-b0f8-e81ca94426de",
 672 |    "metadata": {},
 673 |    "source": [
 674 |     "### Backends\n",
 675 |     "\n",
 676 |     "AnnData/MuData are based on a NumPy/Pandas stack. This is the default for the shadow objects in order to provide compatibility with AnnData/MuData objects.\n",
 677 |     "\n",
 678 |     "However, the nature of shadow files also simplifies loading individual matrices or tables with alternative backends, e.g. [JAX](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.array.html#jax.numpy.array) (`Array`), [PyTorch](https://pytorch.org/docs/stable/tensors.html) (`Tensor`) or [polars](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html) (`DataFrame`)."
 679 |    ]
 680 |   },
 681 |   {
 682 |    "cell_type": "code",
 683 |    "execution_count": 22,
 684 |    "id": "734d4e9e-3936-4911-96fe-1bed3de167eb",
 685 |    "metadata": {},
 686 |    "outputs": [],
 687 |    "source": [
 688 |     "mdata = MuDataShadow(file, array_backend=\"jax\", table_backend=\"polars\")"
 689 |    ]
 690 |   },
 691 |   {
 692 |    "cell_type": "code",
 693 |    "execution_count": 23,
 694 |    "id": "3d909ef6-92b7-40f4-b50e-641993469791",
 695 |    "metadata": {},
 696 |    "outputs": [
 697 |     {
 698 |      "name": "stdout",
 699 |      "output_type": "stream",
 700 |      "text": [
 701 |       "<class 'polars.internals.dataframe.frame.DataFrame'>\n"
 702 |      ]
 703 |     },
 704 |     {
 705 |      "data": {
 706 |       "text/html": [
 707 |        "<div>\n",
 708 |        "<style scoped>\n",
 709 |        "    .dataframe tbody tr th:only-of-type {\n",
 710 |        "        vertical-align: middle;\n",
 711 |        "    }\n",
 712 |        "\n",
 713 |        "    .dataframe tbody tr th {\n",
 714 |        "        vertical-align: top;\n",
 715 |        "    }\n",
 716 |        "\n",
 717 |        "    .dataframe thead th {\n",
 718 |        "        text-align: right;\n",
 719 |        "    }\n",
 720 |        "\n",
 721 |        "    .dataframe td {\n",
 722 |        "        white-space: pre;\n",
 723 |        "    }\n",
 724 |        "\n",
 725 |        "    .dataframe td {\n",
 726 |        "        padding-top: 0;\n",
 727 |        "    }\n",
 728 |        "\n",
 729 |        "    .dataframe td {\n",
 730 |        "        padding-bottom: 0;\n",
 731 |        "    }\n",
 732 |        "\n",
 733 |        "    .dataframe td {\n",
 734 |        "        line-height: 95%;\n",
 735 |        "    }\n",
 736 |        "</style>\n",
 737 |        "<table border=\"1\" class=\"dataframe\">\n",
 738 |        "<small>shape: (5, 7)</small>\n",
 739 |        "<thead>\n",
 740 |        "<tr>\n",
 741 |        "<th>\n",
 742 |        "_index\n",
 743 |        "</th>\n",
 744 |        "<th>\n",
 745 |        "celltype\n",
 746 |        "</th>\n",
 747 |        "<th>\n",
 748 |        "leiden\n",
 749 |        "</th>\n",
 750 |        "<th>\n",
 751 |        "n_genes_by_counts\n",
 752 |        "</th>\n",
 753 |        "<th>\n",
 754 |        "pct_counts_mt\n",
 755 |        "</th>\n",
 756 |        "<th>\n",
 757 |        "total_counts\n",
 758 |        "</th>\n",
 759 |        "<th>\n",
 760 |        "total_counts_mt\n",
 761 |        "</th>\n",
 762 |        "</tr>\n",
 763 |        "<tr>\n",
 764 |        "<td>\n",
 765 |        "object\n",
 766 |        "</td>\n",
 767 |        "<td>\n",
 768 |        "cat\n",
 769 |        "</td>\n",
 770 |        "<td>\n",
 771 |        "cat\n",
 772 |        "</td>\n",
 773 |        "<td>\n",
 774 |        "i32\n",
 775 |        "</td>\n",
 776 |        "<td>\n",
 777 |        "f32\n",
 778 |        "</td>\n",
 779 |        "<td>\n",
 780 |        "f32\n",
 781 |        "</td>\n",
 782 |        "<td>\n",
 783 |        "f32\n",
 784 |        "</td>\n",
 785 |        "</tr>\n",
 786 |        "</thead>\n",
 787 |        "<tbody>\n",
 788 |        "<tr>\n",
 789 |        "<td>\n",
 790 |        "AAACCCAAGAGACAAG-1\n",
 791 |        "</td>\n",
 792 |        "<td>\n",
 793 |        "&quot;intermediate m...\n",
 794 |        "</td>\n",
 795 |        "<td>\n",
 796 |        "&quot;3&quot;\n",
 797 |        "</td>\n",
 798 |        "<td>\n",
 799 |        "2363\n",
 800 |        "</td>\n",
 801 |        "<td>\n",
 802 |        "6.332204\n",
 803 |        "</td>\n",
 804 |        "<td>\n",
 805 |        "7375.0\n",
 806 |        "</td>\n",
 807 |        "<td>\n",
 808 |        "467.0\n",
 809 |        "</td>\n",
 810 |        "</tr>\n",
 811 |        "<tr>\n",
 812 |        "<td>\n",
 813 |        "AAACCCAAGGCCTAGA-1\n",
 814 |        "</td>\n",
 815 |        "<td>\n",
 816 |        "&quot;CD4+ naïve T&quot;\n",
 817 |        "</td>\n",
 818 |        "<td>\n",
 819 |        "&quot;0&quot;\n",
 820 |        "</td>\n",
 821 |        "<td>\n",
 822 |        "1259\n",
 823 |        "</td>\n",
 824 |        "<td>\n",
 825 |        "9.093319\n",
 826 |        "</td>\n",
 827 |        "<td>\n",
 828 |        "3772.0\n",
 829 |        "</td>\n",
 830 |        "<td>\n",
 831 |        "343.0\n",
 832 |        "</td>\n",
 833 |        "</tr>\n",
 834 |        "<tr>\n",
 835 |        "<td>\n",
 836 |        "AAACCCAGTCGTGCCA-1\n",
 837 |        "</td>\n",
 838 |        "<td>\n",
 839 |        "&quot;CD4+ memory T&quot;\n",
 840 |        "</td>\n",
 841 |        "<td>\n",
 842 |        "&quot;2&quot;\n",
 843 |        "</td>\n",
 844 |        "<td>\n",
 845 |        "1578\n",
 846 |        "</td>\n",
 847 |        "<td>\n",
 848 |        "13.178295\n",
 849 |        "</td>\n",
 850 |        "<td>\n",
 851 |        "4902.0\n",
 852 |        "</td>\n",
 853 |        "<td>\n",
 854 |        "646.0\n",
 855 |        "</td>\n",
 856 |        "</tr>\n",
 857 |        "<tr>\n",
 858 |        "<td>\n",
 859 |        "AAACCCATCGTGCATA-1\n",
 860 |        "</td>\n",
 861 |        "<td>\n",
 862 |        "&quot;CD4+ memory T&quot;\n",
 863 |        "</td>\n",
 864 |        "<td>\n",
 865 |        "&quot;2&quot;\n",
 866 |        "</td>\n",
 867 |        "<td>\n",
 868 |        "1908\n",
 869 |        "</td>\n",
 870 |        "<td>\n",
 871 |        "6.354415\n",
 872 |        "</td>\n",
 873 |        "<td>\n",
 874 |        "6704.0\n",
 875 |        "</td>\n",
 876 |        "<td>\n",
 877 |        "426.0\n",
 878 |        "</td>\n",
 879 |        "</tr>\n",
 880 |        "<tr>\n",
 881 |        "<td>\n",
 882 |        "AAACGAAAGACAAGCC-1\n",
 883 |        "</td>\n",
 884 |        "<td>\n",
 885 |        "&quot;CD14 mono&quot;\n",
 886 |        "</td>\n",
 887 |        "<td>\n",
 888 |        "&quot;1&quot;\n",
 889 |        "</td>\n",
 890 |        "<td>\n",
 891 |        "1589\n",
 892 |        "</td>\n",
 893 |        "<td>\n",
 894 |        "9.307693\n",
 895 |        "</td>\n",
 896 |        "<td>\n",
 897 |        "3900.0\n",
 898 |        "</td>\n",
 899 |        "<td>\n",
 900 |        "363.0\n",
 901 |        "</td>\n",
 902 |        "</tr>\n",
 903 |        "</tbody>\n",
 904 |        "</table>\n",
 905 |        "</div>"
 906 |       ],
 907 |       "text/plain": [
 908 |        "shape: (5, 7)\n",
 909 |        "┌──────────────┬──────────────┬────────┬──────────────┬──────────────┬──────────────┬──────────────┐\n",
 910 |        "│ _index       ┆ celltype     ┆ leiden ┆ n_genes_by_c ┆ pct_counts_m ┆ total_counts ┆ total_counts │\n",
 911 |        "│ ---          ┆ ---          ┆ ---    ┆ ounts        ┆ t            ┆ ---          ┆ _mt          │\n",
 912 |        "│ object       ┆ cat          ┆ cat    ┆ ---          ┆ ---          ┆ f32          ┆ ---          │\n",
 913 |        "│              ┆              ┆        ┆ i32          ┆ f32          ┆              ┆ f32          │\n",
 914 |        "╞══════════════╪══════════════╪════════╪══════════════╪══════════════╪══════════════╪══════════════╡\n",
 915 |        "│ AAACCCAAGAGA ┆ intermediate ┆ 3      ┆ 2363         ┆ 6.332204     ┆ 7375.0       ┆ 467.0        │\n",
 916 |        "│ CAAG-1       ┆ mono         ┆        ┆              ┆              ┆              ┆              │\n",
 917 |        "│ AAACCCAAGGCC ┆ CD4+ naïve T ┆ 0      ┆ 1259         ┆ 9.093319     ┆ 3772.0       ┆ 343.0        │\n",
 918 |        "│ TAGA-1       ┆              ┆        ┆              ┆              ┆              ┆              │\n",
 919 |        "│ AAACCCAGTCGT ┆ CD4+ memory  ┆ 2      ┆ 1578         ┆ 13.178295    ┆ 4902.0       ┆ 646.0        │\n",
 920 |        "│ GCCA-1       ┆ T            ┆        ┆              ┆              ┆              ┆              │\n",
 921 |        "│ AAACCCATCGTG ┆ CD4+ memory  ┆ 2      ┆ 1908         ┆ 6.354415     ┆ 6704.0       ┆ 426.0        │\n",
 922 |        "│ CATA-1       ┆ T            ┆        ┆              ┆              ┆              ┆              │\n",
 923 |        "│ AAACGAAAGACA ┆ CD14 mono    ┆ 1      ┆ 1589         ┆ 9.307693     ┆ 3900.0       ┆ 363.0        │\n",
 924 |        "│ AGCC-1       ┆              ┆        ┆              ┆              ┆              ┆              │\n",
 925 |        "└──────────────┴──────────────┴────────┴──────────────┴──────────────┴──────────────┴──────────────┘"
 926 |       ]
 927 |      },
 928 |      "execution_count": 23,
 929 |      "metadata": {},
 930 |      "output_type": "execute_result"
 931 |     }
 932 |    ],
 933 |    "source": [
 934 |     "obs = mdata[\"rna\"].obs\n",
 935 |     "print(type(obs))\n",
 936 |     "obs.head()"
 937 |    ]
 938 |   },
 939 |   {
 940 |    "cell_type": "code",
 941 |    "execution_count": 24,
 942 |    "id": "32286100-13e4-49af-8194-f53693c9b7f0",
 943 |    "metadata": {},
 944 |    "outputs": [
 945 |     {
 946 |      "name": "stdout",
 947 |      "output_type": "stream",
 948 |      "text": [
 949 |       "<class 'jaxlib.xla_extension.DeviceArray'>\n"
 950 |      ]
 951 |     },
 952 |     {
 953 |      "data": {
 954 |       "text/plain": [
 955 |        "DeviceArray([[ 20.551052  ,   0.36840764,  -1.6193684 , ...,\n",
 956 |        "                0.09656975,  -0.90912175,  -0.77955467],\n",
 957 |        "             [ -9.47144   ,  -5.5212517 ,  -5.107428  , ...,\n",
 958 |        "                0.64674896,  -0.892091  ,   1.7873902 ],\n",
 959 |        "             [ -9.913012  ,   2.766899  ,  -2.0684972 , ...,\n",
 960 |        "               -0.6454743 ,   1.615869  ,  -0.63476324],\n",
 961 |        "             ...,\n",
 962 |        "             [ -8.727723  ,   7.9196725 ,   1.3326805 , ...,\n",
 963 |        "                1.4592032 ,   0.91210324,   1.3184382 ],\n",
 964 |        "             [-10.792531  ,   3.2086673 ,  -2.0437238 , ...,\n",
 965 |        "                1.7311838 ,  -1.840564  ,   1.3253008 ],\n",
 966 |        "             [ 20.642431  ,   0.49294943,  -1.6694897 , ...,\n",
 967 |        "               -0.51208967,   0.60652566,  -0.75145006]], dtype=float32)"
 968 |       ]
 969 |      },
 970 |      "execution_count": 24,
 971 |      "metadata": {},
 972 |      "output_type": "execute_result"
 973 |     }
 974 |    ],
 975 |    "source": [
 976 |     "rna_pca = mdata[\"rna\"].obsm[\"X_pca\"]\n",
 977 |     "print(type(rna_pca))\n",
 978 |     "rna_pca"
 979 |    ]
 980 |   },
 981 |   {
 982 |    "cell_type": "markdown",
 983 |    "id": "6cdad910-a34c-49d2-bc03-87bfde9417c9",
 984 |    "metadata": {},
 985 |    "source": [
 986 |     "When alternative backends are being used, not all of the AnnData/MuData features can be supported, and many external tools might not work as expected as they anticipate NumPy/Pandas objects instead."
 987 |    ]
 988 |   },
 989 |   {
 990 |    "cell_type": "code",
 991 |    "execution_count": 25,
 992 |    "id": "b06a9071-0443-41e6-ac81-e3f0ce2653e9",
 993 |    "metadata": {},
 994 |    "outputs": [],
 995 |    "source": [
 996 |     "# Clean up\n",
 997 |     "mdata.clear_cache()\n",
 998 |     "mdata.close()\n",
 999 |     "del mdata, rna_pca, obs"
1000 |    ]
1001 |   },
1002 |   {
1003 |    "cell_type": "markdown",
1004 |    "id": "6c474c9e-dfea-406c-ace6-461e8d5438a4",
1005 |    "metadata": {},
1006 |    "source": [
1007 |     " "
1008 |    ]
1009 |   },
1010 |   {
1011 |    "cell_type": "markdown",
1012 |    "id": "16f9b372-a089-4aed-b91e-b368a2ddc13e",
1013 |    "metadata": {
1014 |     "slideshow": {
1015 |      "slide_type": "slide"
1016 |     },
1017 |     "tags": []
1018 |    },
1019 |    "source": [
1020 |     "### Partial writing\n",
1021 |     "\n",
1022 |     "> [!NOTE]\n",
1023 |     "> This feature is experimental.\n",
1024 |     "\n",
1025 |     "While the main use of shadows is to provide a low-memory, read-only solution for scverse datasets, the ability to add new embeddings or other items to the file can greatly extend their usage patterns."
1026 |    ]
1027 |   },
1028 |   {
1029 |    "cell_type": "code",
1030 |    "execution_count": 9,
1031 |    "id": "02245bc0-cc92-4fe7-b665-a4e2f424b353",
1032 |    "metadata": {
1033 |     "slideshow": {
1034 |      "slide_type": "fragment"
1035 |     },
1036 |     "tags": []
1037 |    },
1038 |    "outputs": [],
1039 |    "source": [
1040 |     "mdata = MuDataShadow(file, mode=\"r\")"
1041 |    ]
1042 |   },
1043 |   {
1044 |    "cell_type": "markdown",
1045 |    "id": "c7324f1c-c4a4-4561-9680-0ac5caacc79f",
1046 |    "metadata": {},
1047 |    "source": [
1048 |     "Add a new embedding to the in-memory object:"
1049 |    ]
1050 |   },
1051 |   {
1052 |    "cell_type": "code",
1053 |    "execution_count": 10,
1054 |    "id": "eb6f076f-0b26-428b-a824-a82b3d648c00",
1055 |    "metadata": {
1056 |     "slideshow": {
1057 |      "slide_type": "fragment"
1058 |     },
1059 |     "tags": []
1060 |    },
1061 |    "outputs": [
1062 |     {
1063 |      "data": {
1064 |       "text/plain": [
1065 |        "obsm:\tX_pcaᐁ, X_pca_copyᐁ, X_umap"
1066 |       ]
1067 |      },
1068 |      "execution_count": 10,
1069 |      "metadata": {},
1070 |      "output_type": "execute_result"
1071 |     }
1072 |    ],
1073 |    "source": [
1074 |     "mdata[\"rna\"].obsm[\"X_pca_copy\"] = mdata[\"rna\"].obsm[\"X_pca\"].copy()\n",
1075 |     "mdata[\"rna\"].obsm"
1076 |    ]
1077 |   },
1078 |   {
1079 |    "cell_type": "markdown",
1080 |    "id": "0a7a6374-cb13-4f3a-8f5b-e0c4b4f89363",
1081 |    "metadata": {
1082 |     "slideshow": {
1083 |      "slide_type": "subslide"
1084 |     },
1085 |     "tags": []
1086 |    },
1087 |    "source": [
1088 |     "For this, a family of methods is useful, including `.reopen()` and `.write()`. The `.write()` method only works if the connection is not read-only (e.g. opened with `'r+'`); however, a read-only file can be reopened in another mode.\n",
1089 |     "\n",
1090 |     "Internally, `.write()` pushes (`._push_changes()`) the in-memory changes (marked with ▲ in the object representation above) to the file and provides meaningful error messages when the file is not open for writing.\n",
1091 |     "\n",
1092 |     "This separation of concerns makes it transparent when the data is modified, and this workflow is recommended when only a small amount of data is added to the file. As the methods return the shadow itself, it is possible to chain them:"
1093 |    ]
1094 |   },
1095 |   {
1096 |    "cell_type": "code",
1097 |    "execution_count": 11,
1098 |    "id": "bcfa2982-4bf6-42eb-a604-d17d6496598b",
1099 |    "metadata": {
1100 |     "slideshow": {
1101 |      "slide_type": "fragment"
1102 |     },
1103 |     "tags": []
1104 |    },
1105 |    "outputs": [
1106 |     {
1107 |      "data": {
1108 |       "text/plain": [
1109 |        "obsm:\tX_pcaᐁ, X_pca_copyᐁ, X_umap"
1110 |       ]
1111 |      },
1112 |      "execution_count": 11,
1113 |      "metadata": {},
1114 |      "output_type": "execute_result"
1115 |     }
1116 |    ],
1117 |    "source": [
1118 |     "mdata.reopen(mode='r+').write(clear_cache=True).reopen(mode='r');  # clear pushed elements from cache\n",
1119 |     "mdata[\"rna\"].obsm"
1120 |    ]
1121 |   },
1122 |   {
1123 |    "cell_type": "code",
1124 |    "execution_count": 12,
1125 |    "id": "b03d8f00-6a61-44ec-aa69-fbd01b43c886",
1126 |    "metadata": {},
1127 |    "outputs": [
1128 |     {
1129 |      "data": {
1130 |       "text/plain": [
1131 |        "'r'"
1132 |       ]
1133 |      },
1134 |      "execution_count": 12,
1135 |      "metadata": {},
1136 |      "output_type": "execute_result"
1137 |     }
1138 |    ],
1139 |    "source": [
1140 |     "mdata.file.mode"
1141 |    ]
1142 |   },
1143 |   {
1144 |    "cell_type": "code",
1145 |    "execution_count": 13,
1146 |    "id": "1b794d6e-3cf2-4451-9a96-972aec79fc82",
1147 |    "metadata": {},
1148 |    "outputs": [],
1149 |    "source": [
1150 |     "mdata.clear_cache()"
1151 |    ]
1152 |   },
1153 |   {
1154 |    "cell_type": "markdown",
1155 |    "id": "af3d311e-0199-4dcf-b5a5-15b8e446fd08",
1156 |    "metadata": {},
1157 |    "source": [
1158 |     " "
1159 |    ]
1160 |   },
1161 |   {
1162 |    "cell_type": "markdown",
1163 |    "id": "1b128596-dbb5-4469-a346-bd14cda79eb3",
1164 |    "metadata": {},
1165 |    "source": [
1166 |     "The default mode is read-only, which protects the files from being modified while also allowing for multiple connections to the file:"
1167 |    ]
1168 |   },
1169 |   {
1170 |    "cell_type": "code",
1171 |    "execution_count": 17,
1172 |    "id": "8e817c96-ae69-49d7-a574-58481170f011",
1173 |    "metadata": {},
1174 |    "outputs": [
1175 |     {
1176 |      "name": "stdout",
1177 |      "output_type": "stream",
1178 |      "text": [
1179 |       "Not available for .write(): File is open in read-only mode. Changes can't be pushed. Reopen it with .reopen('r+') to enable writing.\n"
1180 |      ]
1181 |     }
1182 |    ],
1183 |    "source": [
1184 |     "try:\n",
1185 |     "    mdata.write()\n",
1186 |     "except OSError as e:\n",
1187 |     "    print(\"Not available for .write():\", e)"
1188 |    ]
1189 |   },
1190 |   {
1191 |    "cell_type": "markdown",
1192 |    "id": "2e68cef8-871f-49be-8829-f59ff9d93f99",
1193 |    "metadata": {},
1194 |    "source": [
1195 |     " "
1196 |    ]
1197 |   },
1198 |   {
1199 |    "cell_type": "markdown",
1200 |    "id": "8b5c17b8-98d1-42b6-a008-b3c3b6fbfb79",
1201 |    "metadata": {},
1202 |    "source": [
1203 |     "> [!NOTE]\n",
1204 |     "> Partial writing is currently intended to add new elements to the dataset on disk (e.g. a new embedding to .obsm) rather than to modify the dataset and delete or alter existing elements."
1205 |    ]
1206 |   },
1207 |   {
1208 |    "cell_type": "markdown",
1209 |    "id": "e841d95f-3f46-4902-b18f-eb4c7080e58d",
1210 |    "metadata": {},
1211 |    "source": [
1212 |     " "
1213 |    ]
1214 |   },
1215 |   {
1216 |    "cell_type": "markdown",
1217 |    "id": "e0c11265-8429-4a34-a552-759b1f07a0bc",
1218 |    "metadata": {
1219 |     "tags": []
1220 |    },
1221 |    "source": [
1222 |     "### Views\n",
1223 |     "\n",
1224 |     "Views for shadow objects are conceptually similar to [views in AnnData/MuData](https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.is_view.html): they provide a view into an existing object without creating a copy of it.\n",
1225 |     "\n",
1226 |     "As shadow objects inherently operate on the file they are connected to, their views behave slightly differently. Creating a view creates a new connection to the file and returns a new shadow object, which is aware of the part of the data (e.g. which cells) it is supposed to provide a view for."
1227 |    ]
1228 |   },
1229 |   {
1230 |    "cell_type": "code",
1231 |    "execution_count": 18,
1232 |    "id": "c3ea6e33-128a-48fd-a421-0c9f5801e47d",
1233 |    "metadata": {},
1234 |    "outputs": [
1235 |     {
1236 |      "data": {
1237 |       "text/plain": [
1238 |        "View of MuData Shadow object with n_obs × n_vars = 612 × 17838 (original 3891 × 17838)\n",
1239 |        "  obs:\t_index, leiden, leiden_wnn, louvain\n",
1240 |        "  var:\t_index, feature_types, gene_ids, highly_variable\n",
1241 |        "  obsm:\tX_mofa, X_mofa_umap, X_umap, X_wnn_umap, prot, rna\n",
1242 |        "  varm:\tLFs, prot, rna\n",
1243 |        "  obsp:\tconnectivities, distances, wnn_connectivities, wnn_distances\n",
1244 |        "  uns:\tleiden, leiden_wnn_colors, louvain, neighbors, rna:celltype_colors, umap, wnn\n",
1245 |        "  obsmap:\tprot, rna\n",
1246 |        "  varmap:\tprot, rna\n",
1247 |        "  mod:\t2 modalities\n",
1248 |        "    prot: 612 x 32\n",
1249 |        "        X  \n",
1250 |        "        layers:\tcounts\n",
1251 |        "        obs:\t_index\n",
1252 |        "        var:\t_index, feature_types, gene_ids, highly_variable\n",
1253 |        "        obsm:\tX_pca, X_umap\n",
1254 |        "        varm:\tPCs\n",
1255 |        "        obsp:\tconnectivities, distances\n",
1256 |        "        uns:\tneighbors, pca, umap\n",
1257 |        "    rna: 612 x 17806\n",
1258 |        "        X  \n",
1259 |        "        raw:\tX, var, varm\n",
1260 |        "        obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n",
1261 |        "        var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n",
1262 |        "        obsm:\tX_pca, X_pca_copy, X_umap\n",
1263 |        "        varm:\tPCs\n",
1264 |        "        obsp:\tconnectivities, distances\n",
1265 |        "        uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap"
1266 |       ]
1267 |      },
1268 |      "execution_count": 18,
1269 |      "metadata": {},
1270 |      "output_type": "execute_result"
1271 |     }
1272 |    ],
1273 |    "source": [
1274 |     "monocytes = mdata['rna'].obs['celltype'].values == \"CD14 mono\"\n",
1275 |     "monocytes_view = mdata[monocytes]\n",
1276 |     "monocytes_view"
1277 |    ]
1278 |   },
1279 |   {
1280 |    "cell_type": "markdown",
1281 |    "id": "2f115798-96d2-4660-889d-b3e9a2d154c3",
1282 |    "metadata": {},
1283 |    "source": [
1284 |     "Individual modalities of a MuData Shadow View are sliced accordingly:"
1285 |    ]
1286 |   },
1287 |   {
1288 |    "cell_type": "code",
1289 |    "execution_count": 19,
1290 |    "id": "13f4b379-e26d-4677-9de3-42b3754af15d",
1291 |    "metadata": {},
1292 |    "outputs": [
1293 |     {
1294 |      "data": {
1295 |       "text/plain": [
1296 |        "(612, 50)"
1297 |       ]
1298 |      },
1299 |      "execution_count": 19,
1300 |      "metadata": {},
1301 |      "output_type": "execute_result"
1302 |     }
1303 |    ],
1304 |    "source": [
1305 |     "monocytes_view['rna'].obsm[\"X_pca\"].shape"
1306 |    ]
1307 |   },
1308 |   {
1309 |    "cell_type": "code",
1310 |    "execution_count": 20,
1311 |    "id": "585fcbc6-9d5f-406f-99e1-6b91117e2bac",
1312 |    "metadata": {},
1313 |    "outputs": [
1314 |     {
1315 |      "data": {
1316 |       "text/plain": [
1317 |        "obsm:\tX_pcaᐁ, X_pca_copy, X_umap"
1318 |       ]
1319 |      },
1320 |      "execution_count": 20,
1321 |      "metadata": {},
1322 |      "output_type": "execute_result"
1323 |     }
1324 |    ],
1325 |    "source": [
1326 |     "monocytes_view['rna'].obsm"
1327 |    ]
1328 |   },
1329 |   {
1330 |    "cell_type": "markdown",
1331 |    "id": "8fbdbb1f-9e35-44aa-aad8-b1f67f827fbd",
1332 |    "metadata": {},
1333 |    "source": [
1334 |     "Cache is specific to each view:"
1335 |    ]
1336 |   },
1337 |   {
1338 |    "cell_type": "code",
1339 |    "execution_count": 21,
1340 |    "id": "d68cc6ea-de8d-4801-9667-4fa059609d85",
1341 |    "metadata": {},
1342 |    "outputs": [
1343 |     {
1344 |      "data": {
1345 |       "text/plain": [
1346 |        "obsm:\tX_pca, X_pca_copy, X_umap"
1347 |       ]
1348 |      },
1349 |      "execution_count": 21,
1350 |      "metadata": {},
1351 |      "output_type": "execute_result"
1352 |     }
1353 |    ],
1354 |    "source": [
1355 |     "mdata['rna'].obsm  # X_pca is not cached"
1356 |    ]
1357 |   },
1358 |   {
1359 |    "cell_type": "markdown",
1360 |    "id": "e511214b-52a4-4f63-9275-b267b779ecc9",
1361 |    "metadata": {},
1362 |    "source": [
1363 |     "Moreover, these semantics allow creating views of views of views..."
1364 |    ]
1365 |   },
1366 |   {
1367 |    "cell_type": "code",
1368 |    "execution_count": 22,
1369 |    "id": "229da4ce-df96-45b6-a6a4-4b44ee6749f5",
1370 |    "metadata": {},
1371 |    "outputs": [],
1372 |    "source": [
1373 |     "adata = AnnDataShadow(file / \"mod/rna\")"
1374 |    ]
1375 |   },
1376 |   {
1377 |    "cell_type": "code",
1378 |    "execution_count": 23,
1379 |    "id": "30cbefc7-1e59-447c-8413-de8ef34be30b",
1380 |    "metadata": {},
1381 |    "outputs": [
1382 |     {
1383 |      "data": {
1384 |       "text/plain": [
1385 |        "View of AnnData Shadow object with n_obs × n_vars = 7 × 30 (original 3891 × 17806)\n",
1386 |        "  X  \n",
1387 |        "  raw:\tX, var, varm\n",
1388 |        "  obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n",
1389 |        "  var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n",
1390 |        "  obsm:\tX_pca, X_pca_copy, X_umap\n",
1391 |        "  varm:\tPCs\n",
1392 |        "  obsp:\tconnectivities, distances\n",
1393 |        "  uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap"
1394 |       ]
1395 |      },
1396 |      "execution_count": 23,
1397 |      "metadata": {},
1398 |      "output_type": "execute_result"
1399 |     }
1400 |    ],
1401 |    "source": [
1402 |     "view = adata[3:10,:30]\n",
1403 |     "view"
1404 |    ]
1405 |   },
1406 |   {
1407 |    "cell_type": "code",
1408 |    "execution_count": 24,
1409 |    "id": "bfa15c8a-f4a8-4907-939f-5cb80ef50abc",
1410 |    "metadata": {},
1411 |    "outputs": [
1412 |     {
1413 |      "data": {
1414 |       "text/plain": [
1415 |        "View of AnnData Shadow object with n_obs × n_vars = 2 × 3 (original 3891 × 17806)\n",
1416 |        "  X  \n",
1417 |        "  raw:\tX, var, varm\n",
1418 |        "  obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n",
1419 |        "  var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n",
1420 |        "  obsm:\tX_pca, X_pca_copy, X_umap\n",
1421 |        "  varm:\tPCs\n",
1422 |        "  obsp:\tconnectivities, distances\n",
1423 |        "  uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap"
1424 |       ]
1425 |      },
1426 |      "execution_count": 24,
1427 |      "metadata": {},
1428 |      "output_type": "execute_result"
1429 |     }
1430 |    ],
1431 |    "source": [
1432 |     "nested_view = view[:2,-3:]\n",
1433 |     "nested_view"
1434 |    ]
1435 |   },
1436 |   {
1437 |    "cell_type": "markdown",
1438 |    "id": "6e3ce502-40e6-4b40-b78e-cf86e527bf18",
1439 |    "metadata": {},
1440 |    "source": [
1441 |     "Getting attributes from views is no different than for shadow objects:"
1442 |    ]
1443 |   },
1444 |   {
1445 |    "cell_type": "code",
1446 |    "execution_count": 25,
1447 |    "id": "216d5cd3-5457-4145-952b-61bed2be9f7d",
1448 |    "metadata": {},
1449 |    "outputs": [
1450 |     {
1451 |      "data": {
1452 |       "text/html": [
1453 |        "<div>\n",
1454 |        "<style scoped>\n",
1455 |        "    .dataframe tbody tr th:only-of-type {\n",
1456 |        "        vertical-align: middle;\n",
1457 |        "    }\n",
1458 |        "\n",
1459 |        "    .dataframe tbody tr th {\n",
1460 |        "        vertical-align: top;\n",
1461 |        "    }\n",
1462 |        "\n",
1463 |        "    .dataframe thead th {\n",
1464 |        "        text-align: right;\n",
1465 |        "    }\n",
1466 |        "</style>\n",
1467 |        "<table border=\"1\" class=\"dataframe\">\n",
1468 |        "  <thead>\n",
1469 |        "    <tr style=\"text-align: right;\">\n",
1470 |        "      <th></th>\n",
1471 |        "      <th>n_genes_by_counts</th>\n",
1472 |        "      <th>total_counts</th>\n",
1473 |        "      <th>total_counts_mt</th>\n",
1474 |        "      <th>pct_counts_mt</th>\n",
1475 |        "      <th>leiden</th>\n",
1476 |        "      <th>celltype</th>\n",
1477 |        "    </tr>\n",
1478 |        "  </thead>\n",
1479 |        "  <tbody>\n",
1480 |        "    <tr>\n",
1481 |        "      <th>AAACCCATCGTGCATA-1</th>\n",
1482 |        "      <td>1908</td>\n",
1483 |        "      <td>6704.0</td>\n",
1484 |        "      <td>426.0</td>\n",
1485 |        "      <td>6.354415</td>\n",
1486 |        "      <td>2</td>\n",
1487 |        "      <td>CD4+ memory T</td>\n",
1488 |        "    </tr>\n",
1489 |        "    <tr>\n",
1490 |        "      <th>AAACGAAAGACAAGCC-1</th>\n",
1491 |        "      <td>1589</td>\n",
1492 |        "      <td>3900.0</td>\n",
1493 |        "      <td>363.0</td>\n",
1494 |        "      <td>9.307693</td>\n",
1495 |        "      <td>1</td>\n",
1496 |        "      <td>CD14 mono</td>\n",
1497 |        "    </tr>\n",
1498 |        "  </tbody>\n",
1499 |        "</table>\n",
1500 |        "</div>"
1501 |       ],
1502 |       "text/plain": [
1503 |        "                    n_genes_by_counts  total_counts  total_counts_mt  \\\n",
1504 |        "AAACCCATCGTGCATA-1               1908        6704.0            426.0   \n",
1505 |        "AAACGAAAGACAAGCC-1               1589        3900.0            363.0   \n",
1506 |        "\n",
1507 |        "                    pct_counts_mt leiden       celltype  \n",
1508 |        "AAACCCATCGTGCATA-1       6.354415      2  CD4+ memory T  \n",
1509 |        "AAACGAAAGACAAGCC-1       9.307693      1      CD14 mono  "
1510 |       ]
1511 |      },
1512 |      "execution_count": 25,
1513 |      "metadata": {},
1514 |      "output_type": "execute_result"
1515 |     }
1516 |    ],
1517 |    "source": [
1518 |     "nested_view.obs"
1519 |    ]
1520 |   },
1521 |   {
1522 |    "cell_type": "markdown",
1523 |    "id": "9dbacf34-247e-4ac9-995b-f39656491973",
1524 |    "metadata": {},
1525 |    "source": [
1526 |     "... as they are shadow objects themselves:"
1527 |    ]
1528 |   },
1529 |   {
1530 |    "cell_type": "code",
1531 |    "execution_count": 26,
1532 |    "id": "c0921236-cc65-43fc-a9a1-557d4ab0a1c6",
1533 |    "metadata": {},
1534 |    "outputs": [
1535 |     {
1536 |      "data": {
1537 |       "text/plain": [
1538 |        "shadows.anndatashadow.AnnDataShadow"
1539 |       ]
1540 |      },
1541 |      "execution_count": 26,
1542 |      "metadata": {},
1543 |      "output_type": "execute_result"
1544 |     }
1545 |    ],
1546 |    "source": [
1547 |     "type(nested_view)"
1548 |    ]
1549 |   },
1550 |   {
1551 |    "cell_type": "code",
1552 |    "execution_count": 27,
1553 |    "id": "e70179b3-da72-4155-bbf9-b6f9d1fa8d47",
1554 |    "metadata": {},
1555 |    "outputs": [],
1556 |    "source": [
1557 |     "# Clean up\n",
1558 |     "nested_view.close()\n",
1559 |     "view.close()\n",
1560 |     "del nested_view, view\n",
1561 |     "\n",
1562 |     "monocytes_view.close()\n",
1563 |     "mdata.close()\n",
1564 |     "del monocytes_view, mdata"
1565 |    ]
1566 |   },
1567 |   {
1568 |    "cell_type": "markdown",
1569 |    "id": "ed55ed1b-1d8e-4250-9352-75f59cd5551a",
1570 |    "metadata": {},
1571 |    "source": [
1572 |     " "
1573 |    ]
1574 |   },
1575 |   {
1576 |    "cell_type": "markdown",
1577 |    "id": "ab4a745e-df8c-46f5-9c3d-d2d3678fff5f",
1578 |    "metadata": {
1579 |     "slideshow": {
1580 |      "slide_type": "slide"
1581 |     },
1582 |     "tags": []
1583 |    },
1584 |    "source": [
1585 |     "### Per-feature access to datasets on disk\n",
1586 |     "\n",
1587 |     "This is currently not possible as caching works at the level of individual HDF5 datasets.\n",
1588 |     "\n",
1589 |     "Views may read only the necessary parts of the arrays into memory; however, this behaviour is currently not universal.\n",
1590 |     "\n",
1591 |     "E.g.:"
1592 |    ]
1593 |   },
1594 |   {
1595 |    "cell_type": "code",
1596 |    "execution_count": 28,
1597 |    "id": "ff5c4052-0929-43c3-947f-6de72b78d69e",
1598 |    "metadata": {},
1599 |    "outputs": [
1600 |     {
1601 |      "data": {
1602 |       "text/plain": [
1603 |        "(10, 100)"
1604 |       ]
1605 |      },
1606 |      "execution_count": 28,
1607 |      "metadata": {},
1608 |      "output_type": "execute_result"
1609 |     }
1610 |    ],
1611 |    "source": [
1612 |     "adata_subset = adata[:10,:100]\n",
1613 |     "adata_subset.X.shape"
1614 |    ]
1615 |   },
1616 |   {
1617 |    "cell_type": "code",
1618 |    "execution_count": 29,
1619 |    "id": "e410e6e1-34c8-48f5-88b5-a45a0545e342",
1620 |    "metadata": {},
1621 |    "outputs": [
1622 |     {
1623 |      "data": {
1624 |       "text/plain": [
1625 |        "View of AnnData Shadow object with n_obs × n_vars = 10 × 100 (original 3891 × 17806)\n",
1626 |        "  X ᐁ \n",
1627 |        "  raw:\tX, var, varm\n",
1628 |        "  obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n",
1629 |        "  var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n",
1630 |        "  obsm:\tX_pca, X_pca_copy, X_umap\n",
1631 |        "  varm:\tPCs\n",
1632 |        "  obsp:\tconnectivities, distances\n",
1633 |        "  uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap"
1634 |       ]
1635 |      },
1636 |      "execution_count": 29,
1637 |      "metadata": {},
1638 |      "output_type": "execute_result"
1639 |     }
1640 |    ],
1641 |    "source": [
1642 |     "adata_subset"
1643 |    ]
1644 |   },
1645 |   {
1646 |    "cell_type": "code",
1647 |    "execution_count": 30,
1648 |    "id": "bf2a317a-ca82-4a73-b0ef-07d0cfac2128",
1649 |    "metadata": {},
1650 |    "outputs": [],
1651 |    "source": [
1652 |     "# Clean up\n",
1653 |     "adata.close()\n",
1654 |     "adata_subset.close()\n",
1655 |     "del adata, adata_subset"
1656 |    ]
1657 |   },
1658 |   {
1659 |    "cell_type": "markdown",
1660 |    "id": "bb50af6a-4ee2-4a8f-b022-9b0daa63e81e",
1661 |    "metadata": {},
1662 |    "source": [
1663 |     " "
1664 |    ]
1665 |   },
1666 |   {
1667 |    "cell_type": "markdown",
1668 |    "id": "fec4c262-5bbf-4393-b082-f208f7997a7a",
1669 |    "metadata": {
1670 |     "slideshow": {
1671 |      "slide_type": "slide"
1672 |     },
1673 |     "tags": []
1674 |    },
1675 |    "source": [
1676 |     "---\n",
1677 |     "\n",
1678 |     "In order to return the data to its original state, let's manually remove the items we wrote to the file:"
1679 |    ]
1680 |   },
1681 |   {
1682 |    "cell_type": "code",
1683 |    "execution_count": 31,
1684 |    "id": "46550ff4-39e1-40e6-80d0-4fd45d99af84",
1685 |    "metadata": {
1686 |     "slideshow": {
1687 |      "slide_type": "fragment"
1688 |     },
1689 |     "tags": []
1690 |    },
1691 |    "outputs": [],
1692 |    "source": [
1693 |     "import h5py\n",
1694 |     "\n",
1695 |     "f = h5py.File(file, \"a\")\n",
1696 |     "#                    ^\n",
1697 |     "#        ____________|\n",
1698 |     "# if this works,     \n",
1699 |     "# no dangling read-only connections!\n",
1700 |     "# \n",
1701 |     "\n",
1702 |     "del f[\"mod/rna/obsm/X_pca_copy\"]\n",
1703 |     "f.close()"
1704 |    ]
1705 |   },
1706 |   {
1707 |    "cell_type": "markdown",
1708 |    "id": "6bc6a57c-39d0-45ad-be01-8cadde33da83",
1709 |    "metadata": {},
1710 |    "source": [
1711 |     " "
1712 |    ]
1713 |   },
1714 |   {
1715 |    "cell_type": "markdown",
1716 |    "id": "752bd981-1cbd-43ec-b707-9308afb7e55f",
1717 |    "metadata": {},
1718 |    "source": [
1719 |     " "
1720 |    ]
1721 |   }
1722 |  ],
1723 |  "metadata": {
1724 |   "kernelspec": {
1725 |    "display_name": "Python 3 (ipykernel)",
1726 |    "language": "python",
1727 |    "name": "python3"
1728 |   },
1729 |   "language_info": {
1730 |    "codemirror_mode": {
1731 |     "name": "ipython",
1732 |     "version": 3
1733 |    },
1734 |    "file_extension": ".py",
1735 |    "mimetype": "text/x-python",
1736 |    "name": "python",
1737 |    "nbconvert_exporter": "python",
1738 |    "pygments_lexer": "ipython3",
1739 |    "version": "3.10.11"
1740 |   }
1741 |  },
1742 |  "nbformat": 4,
1743 |  "nbformat_minor": 5
1744 | }
1745 | 


--------------------------------------------------------------------------------