├── docs ├── img │ └── shadows_header.jpg ├── references.md ├── api.md ├── references.bib ├── examples │ ├── index.md │ ├── shadows-zarr.ipynb │ └── shadows-features.ipynb ├── Makefile ├── make.bat ├── index.md └── conf.py ├── tests ├── conftest.py ├── test_shadows_zarr.py └── test_shadows_hdf5.py ├── src └── shadows │ ├── __init__.py │ ├── compat.py │ ├── anndatashadow.py │ ├── mudatashadow.py │ ├── elemshadow.py │ └── datashadow.py ├── .github └── workflows │ ├── pythonpackage.yml │ └── docs.yml ├── LICENSE ├── README.md ├── .gitignore └── pyproject.toml /docs/img/shadows_header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scverse/shadows/HEAD/docs/img/shadows_header.jpg -------------------------------------------------------------------------------- /docs/references.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | ```{bibliography} 4 | :cited: 5 | ``` 6 | 7 | 8 | ```{autosummary} 9 | :toctree: generated 10 | :recursive: 11 | 12 | * 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | 4 | ```{eval-rst} 5 | .. 
# Resolve the package version. A lookup through setuptools_scm is tried first;
# if that package is missing or cannot determine a version, fall back to the
# ``_version`` module (pyproject.toml configures the hatch VCS build hook to
# write it to src/shadows/_version.py).
try:  # See https://github.com/maresb/hatch-vcs-footgun-example
    from setuptools_scm import get_version

    __version__ = get_version(root="../..", relative_to=__file__)
except (ImportError, LookupError):
    try:
        from ._version import __version__
    except ModuleNotFoundError as err:
        # Name this package in the message (it previously said "pqdata") and
        # keep the original import failure attached as the cause.
        raise RuntimeError(
            "shadows is not correctly installed. Please install it, e.g. with pip."
        ) from err
def read_elem(*args, **kwargs):
    """Read a stored element, dispatching on the optional ``_format`` keyword.

    ``_format`` is removed from ``kwargs`` before delegating. The value
    ``"parquet"`` routes the call to ``pqdata.core.read_elem``; any other
    value, or no ``_format`` at all, routes it to anndata's ``read_elem``.
    All remaining positional and keyword arguments are passed through as is.
    """
    backend = kwargs.pop("_format", None)
    if backend != "parquet":
        return ad_read_elem(*args, **kwargs)

    # Imported only on the parquet path; pqdata is an optional dependency.
    from pqdata.core import read_elem as pq_read_elem

    return pq_read_elem(*args, **kwargs)
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ["3.10", "3.11", "3.12"] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install uv 23 | uv venv 24 | source .venv/bin/activate 25 | uv pip install ruff pytest 26 | uv pip install '.[dev,test,all]' 27 | - name: Ruff check 28 | run: | 29 | source .venv/bin/activate 30 | ruff check src/shadows 31 | - name: Test with pytest 32 | run: | 33 | source .venv/bin/activate 34 | pytest 35 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: [push] 3 | 4 | jobs: 5 | docs: 6 | 7 | runs-on: ubuntu-latest 8 | 9 | permissions: 10 | contents: write 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 3.11 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: "3.11" 17 | - name: Install 18 | run: | 19 | python -m pip install --upgrade pip 20 | python -m pip install '.[doc]' 21 | - name: Install pandoc 22 | run: sudo apt-get install -y pandoc 23 | - name: Build HTML 24 | working-directory: 
docs 25 | run: | 26 | make html -e 27 | - name: Upload artifacts 28 | uses: actions/upload-artifact@v4 29 | with: 30 | name: html-docs 31 | path: docs/_build/html/ 32 | - name: Deploy 33 | uses: peaceiris/actions-gh-pages@v3 34 | if: github.ref == 'refs/heads/main' 35 | with: 36 | github_token: ${{ secrets.GITHUB_TOKEN }} 37 | publish_dir: docs/_build/html 38 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Shadows 2 | 3 | ```{toctree} 4 | :hidden: true 5 | :maxdepth: 2 6 | 7 | examples/index.md 8 | api.md 9 | references.md 10 | ``` 11 | 12 | `shadows` is a Python library with low-memory interfaces for [scverse](https://scverse.org) data structures such as [AnnData](https://github.com/scverse/anndata) and [MuData](https://github.com/scverse/mudata). 13 | 14 | [//]: # (numfocus-fiscal-sponsor-attribution) 15 | 16 | shadows is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). 17 | If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs. 18 | 19 |
20 | 21 | 25 | 26 |
27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Danila Bredikhin 4 | Copyright (c) 2025, scverse® 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | 3. Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Shadows 4 | 5 | Shadows are on-disk interfaces for scverse data standards such as [AnnData](https://github.com/scverse/anndata) and [MuData](https://github.com/scverse/mudata). 6 | 7 | It is an experimental project. 8 | 9 | [![PyPi version](https://img.shields.io/pypi/v/shadows)](https://pypi.org/project/shadows) 10 | 11 | ## Installation 12 | 13 | ``` 14 | pip install shadows 15 | # or 16 | pip install git+https://github.com/scverse/shadows 17 | ``` 18 | 19 | ## Features 20 | 21 | The shadows library aims to implement the following features: 22 | 23 | - [x] **Shadow objects**: Read-only AnnDataShadow and MuDataShadow for HDF5 files. 24 | 25 | - [x] AnnDataShadow and MuDataShadow for Zarr files. 26 | 27 | - [x] AnnDataShadow and MuDataShadow for Parquet-based serialization ([pqdata](https://github.com/gtca/pqdata)). 28 | 29 | - [x] Data shadows for `.pqdata` and `.zarr` files on S3 storage. 30 | 31 | 32 | ### Shadow objects 33 | 34 | Briefly, shadow objects simply work like this: 35 | 36 | ```py 37 | from shadows import * 38 | ash = AnnDataShadow("pbmc3k.h5ad") 39 | msh = MuDataShadow("pbmc5k_citeseq.h5mu") 40 | ``` 41 | 42 | All the various features are showcased in the following tutorials: 43 | 44 | - [Getting started with shadow objects](/docs/examples/shadow-objects.ipynb) 45 | 46 | - [Advanced features of shadow objects](/docs/examples/shadows-features.ipynb) 47 | 48 | [//]: # (numfocus-fiscal-sponsor-attribution) 49 | 50 | shadows is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). 
51 | If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs. 52 | 53 |
54 | 55 | 59 | 60 |
def matrix(sparse_x: bool = False, n: Optional[int] = None, d: Optional[int] = None):
    """Build a reproducible random ``n`` x ``d`` matrix for the tests.

    The global NumPy random state is re-seeded with 100 on every call, so
    repeated calls with the same arguments return the same values.

    Parameters
    ----------
    sparse_x
        If True, return a SciPy CSR matrix built from 200 randomly placed
        normal draws (entries landing on the same cell are summed by the
        COO -> CSR conversion). Otherwise return a dense array of normal draws.
    n
        Number of rows; defaults to the module-level ``N``.
    d
        Number of columns; defaults to the module-level ``D``.
    """
    np.random.seed(100)

    if n is None:
        n = N
    if d is None:
        d = D

    if sparse_x:
        sparsity = 0.2
        # ``1000 * sparsity`` is the float 200.0, and NumPy rejects a float
        # ``size``; convert it to an integer count once and reuse it.
        n_entries = int(round(1000 * sparsity))
        row = np.random.choice(n, n_entries)
        col = np.random.choice(d, n_entries)
        data = np.random.normal(size=n_entries)

        x = coo_matrix((data, (row, col)), shape=(n, d)).tocsr()
    else:
        x = np.random.normal(size=(n, d))
    return x
76 | def test_anndata_inside_mudata(self, mdata, filepath_mudata_zarr): 77 | filename = filepath_mudata_zarr 78 | mdata.write_zarr(filename) 79 | 80 | mod_x = Path(filename) / "mod" / "x" 81 | mod_y = Path(filename) / "mod" / "y" 82 | 83 | ash_x = AnnDataShadow(mod_x) 84 | ash_y = AnnDataShadow(mod_y) 85 | 86 | assert ash_x.shape == mdata["x"].shape 87 | assert ash_y.shape == mdata["y"].shape 88 | 89 | ash_x.close() 90 | ash_y.close() 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # macOS 132 | .DS_Store 133 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "hatchling.build" 3 | requires = ["hatchling", "hatch-vcs"] 4 | 5 | [project] 6 | name = "shadows" 7 | description = "Low-memory data interfaces for scverse" 8 | requires-python = ">= 3.10" 9 | license = { file = "LICENSE" } 10 | authors = [ 11 | { name = "Danila Bredikhin" }, 12 | ] 13 | maintainers = [ 14 | { name = "Danila Bredikhin", email = "danila@stanford.edu" }, 15 | ] 16 | readme = "README.md" 17 | classifiers = [ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: BSD License", 20 | "Operating System :: OS Independent", 21 | "Development Status :: 3 - Alpha", 22 | "Topic :: Scientific/Engineering :: Bio-Informatics", 23 | "Intended Audience :: Science/Research" 24 | ] 25 | dependencies = [ 26 | ] 27 | dynamic = ["version"] 28 | 29 | [project.urls] 30 | Documentation = "https://scverse.github.io/shadows" 31 | Source = "https://github.com/scverse/shadows" 32 | Home-page = "https://github.com/scverse/shadows" 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "setuptools_scm", 37 | ] 38 | doc = [ 39 | "sphinx", 40 | "myst-parser", 41 | "sphinx-book-theme", 42 | "readthedocs-sphinx-search", 43 | 
"nbsphinx", 44 | "sphinx-automodapi", 45 | "sphinxcontrib-bibtex", 46 | "sphinx-autodoc-typehints", 47 | "furo", 48 | "ipython", # fix nbsphinx syntax highlighting 49 | "pygments", 50 | ] 51 | test = [ 52 | "scipy", 53 | "pytest", 54 | "pytest-cov", 55 | "zarr", 56 | "pqdata", 57 | "mudata", 58 | "anndata", 59 | ] 60 | all = [ 61 | "scipy", 62 | "anndata >= 0.8", 63 | "mudata >= 0.2", 64 | ] 65 | 66 | [tool.pytest.ini_options] 67 | python_files = "test_*.py" 68 | testpaths = [ 69 | "./tests", # unit tests 70 | ] 71 | 72 | [tool.black] 73 | line-length = 100 74 | target-version = ['py39'] 75 | 76 | [tool.hatch.version] 77 | source = "vcs" 78 | 79 | [tool.hatch.build.hooks.vcs] 80 | version-file = "src/shadows/_version.py" 81 | 82 | [tool.hatch.build.targets.wheel] 83 | packages = ["src/shadows"] 84 | 85 | [tool.hatch.build.targets.sdist] 86 | exclude = [ 87 | "/.github", 88 | "/docs", 89 | ] 90 | 91 | [tool.ruff] 92 | src = ["src"] 93 | exclude = ["src/shadows/_version.py"] 94 | 95 | [tool.ruff.format] 96 | docstring-code-format = true 97 | 98 | [tool.ruff.lint] 99 | select = [ 100 | "E", # Error detected by Pycodestyle 101 | "F", # Errors detected by Pyflakes 102 | "W", # Warning detected by Pycodestyle 103 | "UP", # pyupgrade 104 | "I", # isort 105 | "TCH", # manage type checking blocks 106 | "ICN", # Follow import conventions 107 | "PTH", # Pathlib instead of os.path 108 | "PT", # Pytest conventions 109 | ] 110 | ignore = [ 111 | # line too long -> we accept long comment lines; formatter gets rid of long code lines 112 | "E501", 113 | # Do not assign a lambda expression, use a def -> AnnData allows lambda expression assignments, 114 | "E731", 115 | # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation 116 | "E741", 117 | ] 118 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # 
Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import sys 10 | from pathlib import Path 11 | from datetime import datetime 12 | from importlib.metadata import metadata 13 | 14 | HERE = Path(__file__).parent 15 | sys.path.insert(0, str(HERE / "extensions")) 16 | 17 | # -- Project information ----------------------------------------------------- 18 | 19 | project = "shadows" 20 | author = "Danila Bredikhin" 21 | copyright = f"{datetime.now():%Y}, {author}" 22 | info = metadata("shadows") 23 | version = info["Version"] 24 | 25 | bibtex_bibfiles = ["references.bib"] 26 | bibtex_reference_style = "author_year" 27 | templates_path = ["_templates"] 28 | nitpicky = True # Warn about broken links 29 | needs_sphinx = "4.0" 30 | 31 | html_context = { 32 | "display_github": True, # Integrate GitHub 33 | "github_user": "gtca", # Username 34 | "github_repo": project, # Repo name 35 | "github_version": "main", # Version 36 | "conf_py_path": "/docs/", # Path in the checkout to the docs root 37 | } 38 | 39 | # -- General configuration --------------------------------------------------- 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 43 | # ones. 
44 | extensions = [ 45 | "myst_parser", 46 | "sphinx.ext.autodoc", 47 | "sphinx.ext.intersphinx", 48 | "sphinx.ext.autosummary", 49 | "sphinx.ext.napoleon", 50 | "sphinxcontrib.bibtex", 51 | "sphinx_autodoc_typehints", 52 | "nbsphinx", 53 | "sphinx.ext.mathjax", 54 | *[p.stem for p in (HERE / "extensions").glob("*.py")], 55 | ] 56 | 57 | 58 | autosummary_generate = True 59 | autodoc_member_order = "groupwise" 60 | default_role = "literal" 61 | napoleon_google_docstring = False 62 | napoleon_numpy_docstring = True 63 | napoleon_include_init_with_doc = False 64 | napoleon_use_rtype = True # having a separate entry generally helps readability 65 | napoleon_use_param = True 66 | 67 | intersphinx_mapping = { 68 | "anndata": ("https://anndata.readthedocs.io/en/stable/", None), 69 | "numpy": ("https://numpy.org/doc/stable/", None), 70 | } 71 | 72 | nbsphinx_execute = "never" 73 | 74 | # Add any paths that contain templates here, relative to this directory. 75 | templates_path = ["_templates"] 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This pattern also affects html_static_path and html_extra_path. 80 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] 81 | 82 | 83 | # -- Options for HTML output ------------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # 88 | html_theme = "furo" 89 | 90 | # Add any paths that contain custom static files (such as style sheets) here, 91 | # relative to this directory. They are copied after the builtin static files, 92 | # so a file named "default.css" will overwrite the builtin "default.css". 
93 | html_static_path = ["_static"] 94 | html_theme_options = { 95 | "logo_only": True, 96 | } 97 | -------------------------------------------------------------------------------- /src/shadows/anndatashadow.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | from anndata import AnnData 6 | 7 | # For simplicity, use AnnData read_elem/write_elem 8 | from anndata._core.index import _normalize_indices 9 | 10 | from .datashadow import DataShadow 11 | from .elemshadow import ElemShadow, RawElemShadow, _get_backend_reader 12 | 13 | RUNECACHED = "\u1401" 14 | RUNECACHEDALT = "\u25bc" 15 | RUNENEW = "\u25b2" 16 | 17 | 18 | class AnnDataShadow(DataShadow): 19 | def __init__(self, filepath, *args, **kwargs): 20 | super().__init__(filepath, *args, **kwargs) 21 | 22 | @classmethod 23 | def _init_as_view(cls, shadow, oidx, vidx): 24 | if shadow._format == "zarr": 25 | filename = shadow.file.store.path 26 | mode = "r+" if not shadow.file.read_only else "r" 27 | elif shadow._format == "parquet": 28 | filename = shadow.file.path 29 | mode = "r+" # FIXME 30 | # raise NotImplementedError("Parquet format is not supported for views.") 31 | else: 32 | filename = shadow.file.filename 33 | mode = shadow.file.mode 34 | 35 | if shadow.root != "/": 36 | filename = ( 37 | str(Path(filename) / shadow.root[1:]) 38 | if shadow.root.startswith("/") 39 | else str(Path(filename) / shadow.root) 40 | ) 41 | view = AnnDataShadow( 42 | filename, 43 | array_backend=shadow._array_backend, 44 | table_backend=shadow._table_backend, 45 | mode=mode, 46 | format=shadow._format, 47 | ) 48 | 49 | # NOTE: Cache is not preserved in a new object 50 | 51 | view._is_view = True 52 | view._ref = shadow 53 | view._oidx = oidx 54 | view._vidx = vidx 55 | 56 | if shadow.is_view: 57 | view._ref = shadow._ref 58 | for attr, idx in (("_oidx", oidx), ("_vidx", vidx)): 59 | shadow_idx = 
getattr(shadow, attr) 60 | if shadow_idx is not None: 61 | n_attr = shadow._ref.n_obs if attr == "_oidx" else shadow._ref.n_vars 62 | if isinstance(shadow_idx, slice) and isinstance(idx, int | np.integer | slice): 63 | r = range(*shadow_idx.indices(n_attr)).__getitem__(idx) 64 | if isinstance(r, int | np.integer): 65 | setattr(view, attr, np.array([r])) 66 | setattr(view, attr, slice(r.start, r.stop, r.step)) 67 | elif isinstance(shadow_idx, slice): 68 | setattr(view, attr, np.arange(*shadow_idx.indices(shadow._ref.n_obs))[idx]) 69 | elif hasattr(shadow_idx.dtype, "type") and issubclass( 70 | shadow_idx.dtype.type, np.bool_ 71 | ): 72 | if hasattr(idx.dtype, "type") and issubclass(idx.dtype.type, np.bool_): 73 | view_idx = shadow_idx.copy() 74 | view_idx[view_idx] = idx 75 | setattr(view, attr, view_idx) 76 | else: 77 | setattr(view, attr, shadow_idx[np.where(idx)[0]]) 78 | else: 79 | setattr(view, attr, shadow_idx[idx]) 80 | return view 81 | 82 | @cached_property 83 | def _X(self): 84 | reader = _get_backend_reader(self._array_backend, self._lazy) 85 | if self.is_view: 86 | if ( 87 | isinstance(self._vidx, slice) 88 | and self._vidx.start is None 89 | and self._vidx.stop is None 90 | ): 91 | x = reader(self.file[self.root]["X"][self._oidx]) 92 | elif ( 93 | isinstance(self._oidx, slice) 94 | and self._oidx.start is None 95 | and self._oidx.stop is None 96 | ): 97 | x = reader(self.file[self.root]["X"][:, self._vidx]) 98 | else: 99 | # Only one indexing array at a time is possible 100 | x = reader(self.file[self.root]["X"][self._oidx][:, self._vidx]) 101 | else: 102 | x = reader(self.file[self.root]["X"]) 103 | self._ids["X"] = id(x) 104 | return x 105 | 106 | @property 107 | def X(self): 108 | return self._X 109 | 110 | @cached_property 111 | def _layers(self): 112 | group_storage = ( 113 | self.file[self.root]["layers"] if "layers" in self.file[self.root] else dict() 114 | ) 115 | return ElemShadow( 116 | group_storage, 117 | key=str(Path(self.root) / "layers"), 
118 | cache=self.__dict__, 119 | n_obs=self.n_obs, 120 | n_vars=self.n_vars, 121 | array_backend=self._array_backend, 122 | table_backend=self._table_backend, 123 | is_view=self.is_view, 124 | idx=(self._oidx, self._vidx), 125 | ) 126 | 127 | @property 128 | def layers(self): 129 | return self._layers 130 | 131 | @cached_property 132 | def _raw(self): 133 | """ 134 | Legacy support. New objects should not use .raw. 135 | """ 136 | if "raw" in self.file[self.root]: 137 | group_storage = self.file[self.root]["raw"] 138 | else: 139 | group_storage = dict() 140 | 141 | return RawElemShadow( 142 | group_storage, 143 | key=str(Path(self.root) / "raw"), 144 | cache=self.__dict__, 145 | n_obs=self.n_obs, 146 | n_vars=None, 147 | array_backend=self._array_backend, 148 | table_backend=self._table_backend, 149 | file=self.file, 150 | is_view=self.is_view, 151 | idx=(self._oidx, None), 152 | ) 153 | 154 | @property 155 | def raw(self): 156 | return self._raw 157 | 158 | def __repr__(self): 159 | if self.is_view: 160 | if self._ref is not None: 161 | s = f"View of AnnData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars} (original {self._ref.n_obs} × {self._ref.n_vars})\n" 162 | else: 163 | s = f"View of AnnData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n" 164 | else: 165 | s = f"AnnData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n" 166 | 167 | # X 168 | key_cached = "X" in self.__dict__ or "_X" in self.__dict__ 169 | key_cached_str = RUNECACHED if key_cached else "" 170 | if key_cached: 171 | if "X" in self._ids and self._ids["X"] != id(self.X): 172 | key_cached_str = RUNECACHEDALT 173 | elif "_X" in self._ids and self._ids["_X"] != id(self.X): 174 | key_cached_str = RUNECACHEDALT 175 | 176 | s += f" X {key_cached_str} \n" 177 | 178 | # raw 179 | if self.raw and len(self.raw.keys()) > 0: 180 | s += " " + self.raw.__repr__() 181 | 182 | # layers 183 | if len(self.layers) > 0: 184 | s += " " + self.layers.__repr__() 185 
| 186 | s += "\n".join([" " + line for line in super().__repr__().strip().split("\n")]) + "\n" 187 | 188 | return s 189 | 190 | def obs_vector(self, key: str, layer: str | None = None): 191 | if key not in self.obs.columns and key not in self.var_names: 192 | key = str.encode(key) 193 | if key in self.var_names: 194 | # Assume unique var_names 195 | key_i = np.where(self.var_names == key)[0][0] 196 | if layer is not None: 197 | return self.layers[layer][:, key_i] 198 | else: 199 | return self.X[:, key_i] 200 | 201 | return self.obs[key].values 202 | 203 | def var_vector(self, key: str, layer: str | None = None): 204 | if key not in self.var.columns and key not in self.obs_names: 205 | key = str.encode(key) 206 | if key in self.obs_names: 207 | # Assume unique obs_names 208 | key_i = np.where(self.obs_names == key)[0][0] 209 | if layer is not None: 210 | return self.layers[layer][key_i, :] 211 | else: 212 | return self.X[key_i, :] 213 | 214 | return self.var[key].values 215 | 216 | # Views 217 | 218 | def __getitem__(self, index): 219 | oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names) 220 | return AnnDataShadow._init_as_view(self, oidx, vidx) 221 | 222 | # 223 | # It is either this or duck typing. 224 | # 225 | # Frequently used tools like scanpy 226 | # check if the object is an AnnData instance 227 | # inside quite a few functions. 228 | # 229 | # Until those instances are replaced with duck typing, 230 | # the remedy is to mock the class name. 
def matrix(sparse_x: bool = False, n: Optional[int] = None, d: Optional[int] = None):
    """Create a reproducible random ``n`` x ``d`` test matrix.

    Parameters
    ----------
    sparse_x
        If true, return a SciPy CSR matrix built from randomly placed normal
        values; otherwise return a dense NumPy array of normal values.
    n, d
        Number of rows / columns. Default to the module-level ``N`` / ``D``.

    The NumPy global random state is re-seeded with 100 on every call, so
    repeated calls with the same arguments give the same matrix.
    """
    np.random.seed(100)

    if n is None:
        n = N
    if d is None:
        d = D

    if sparse_x:
        sparsity = 0.2
        # NumPy only accepts integer sizes; 1000 * 0.2 is the float 200.0
        n_entries = int(round(1000 * sparsity))
        row = np.random.choice(n, n_entries)
        col = np.random.choice(d, n_entries)
        data = np.random.normal(size=n_entries)

        # Duplicate (row, col) pairs are summed during the COO -> CSR conversion
        x = coo_matrix((data, (row, col)), shape=(n, d)).tocsr()
    else:
        x = np.random.normal(size=(n, d))
    return x
def test_anndata_obs(self, adata, filepath_h5ad):
    """Columns of several dtypes written to ``.obs`` are read back unchanged."""
    filename = filepath_h5ad.replace(".h5ad", "_obs.h5ad")

    adata.obs["logical"] = np.random.choice([True, False], size=N)
    adata.obs["integers"] = np.arange(N)
    adata.obs["floats"] = np.random.normal(size=N)
    adata.obs["strings"] = np.random.choice(["abc", "def"], size=N)
    adata.obs["categories"] = adata.obs["strings"].astype("category")

    adata.write(filename)

    ash = AnnDataShadow(filename)
    try:
        for key in ["logical", "integers", "floats", "strings", "categories"]:
            assert key in ash.obs.columns
            assert ash.obs[key].equals(adata.obs[key])

        assert adata.obs.shape == ash.obs.shape
    finally:
        # Release the file handle even when an assertion fails,
        # as the other tests in this class do with ash.close()
        ash.close()
def test_anndata_uns(self, adata, filepath_h5ad):
    """A string and a dict stored in ``.uns`` are read back equal through the shadow."""
    target = filepath_h5ad.replace(".h5ad", "_uns.h5ad")

    # Scalars of several kinds plus a nested mapping
    unstructured = {
        "logical": np.random.choice([True, False]),
        "integer": 1,
        "float": 0.1,
        "string": "abc",
        "dict": {"a": 1, "b": 2},
    }
    for name, entry in unstructured.items():
        adata.uns[name] = entry

    adata.write(target)

    shadow = AnnDataShadow(target)

    for name in ("string", "dict"):
        assert adata.uns[name] == shadow.uns[name]

    shadow.close()
def test_nested_views(self, adata, filepath_h5ad):
    """A view of a view has the same shape and names as the equivalent AnnData view."""
    adata.write(filepath_h5ad)

    # Outer bounds are drawn first, then inner bounds strictly within them
    np.random.seed(42)
    outer_obs = np.random.choice(N, 1)[0]
    outer_var = np.random.choice(D, 1)[0]
    inner_obs = np.random.choice(outer_obs, 1)[0]
    inner_var = np.random.choice(outer_var, 1)[0]

    shadow = AnnDataShadow(filepath_h5ad)

    expected = adata[:outer_obs, :outer_var]
    expected = expected[:inner_obs, :inner_var]
    nested = shadow[:outer_obs, :outer_var]
    nested = nested[:inner_obs, :inner_var]

    assert nested.shape == expected.shape
    assert nested.shape == (inner_obs, inner_var)
    assert nested.X.shape == (inner_obs, inner_var)

    assert nested.obs_names.equals(expected.obs_names)
    assert nested.var_names.equals(expected.var_names)

    shadow.close()
class MuDataShadow(DataShadow):
    """Shadow of a multimodal container.

    Combines a ``DataShadow`` over the container itself with one
    ``AnnDataShadow`` per entry of the ``mod`` group, kept in ``self.mod``.
    """

    def __init__(self, filepath, *args, **kwargs):
        """Open the container and create a shadow for every modality.

        ``args`` / ``kwargs`` are forwarded both to ``DataShadow.__init__`` and
        to each ``AnnDataShadow``; the container's format is passed down to the
        modalities as ``parent_format``.

        If building the modalities from plain paths fails with
        ``FileNotFoundError`` or ``TypeError``, ``filepath`` is retried as an
        ``fsspec`` ``FSMap`` on an ``S3FileSystem``. Any failure of that
        fallback re-raises the original error.
        """
        super().__init__(filepath, *args, **kwargs)
        mods = list(self.file["mod"].keys())

        # Use the stored modality order, but only when every name in it
        # is an existing modality; otherwise keep the storage order.
        modorder = mods
        if "mod-order" in self.file["mod"].attrs:
            modorder_raw = self.file["mod"].attrs["mod-order"]
            if all(m in mods for m in modorder_raw):
                modorder = list(modorder_raw)

        kwargs["parent_format"] = self._format
        try:
            self.mod = {
                k: AnnDataShadow(Path(filepath) / "mod" / k, *args, **kwargs) for k in modorder
            }
        except (FileNotFoundError, TypeError) as e:
            # fsspec.mapping.FSMap
            try:
                from fsspec.mapping import FSMap

                if not isinstance(filepath, FSMap):
                    raise NotImplementedError(
                        "remote storage support has only been implemented for FSMap interface"
                    )
                if filepath.fs.__class__.__name__ != "S3FileSystem":
                    raise NotImplementedError(
                        "fsspec.mapping.FSMap has only been implemented for S3FileSystem"
                    )

                mapper = filepath.fs.get_mapper
                self.mod = {
                    k: AnnDataShadow(
                        mapper(str(Path(filepath.root) / "mod" / k)),
                        format=self._format,
                        *args,
                        **kwargs,
                    )
                    for k in modorder
                }
            except Exception:
                # Whatever went wrong in the fallback, report the original failure
                raise e

        self.n_mod = len(self.mod)
        self.mask = None

        self._axis = 0
        if self.file:
            if "axis" in self.file[self.root].attrs:
                self._axis = self.file[self.root].attrs["axis"]

        # To handle scanpy plotting calls and other tools
        self.raw = None

    @classmethod
    def _init_as_view(cls, shadow, oidx, vidx):
        """Create a view of ``shadow`` restricted to ``oidx`` x ``vidx``.

        A new ``MuDataShadow`` is opened on the same storage and marked as a
        view. When ``shadow`` is itself a view, the new indices are composed
        with the existing ones so that they refer to the original object
        (``shadow._ref``). Each modality is then subset through ``.obsmap`` /
        ``.varmap``.
        """
        if shadow._format == "zarr":
            filename = shadow.file.store.path
            mode = "r+" if not shadow.file.read_only else "r"
        elif shadow._format == "parquet":
            filename = shadow.file.path
            mode = "r+"  # FIXME
        else:
            filename = shadow.file.filename
            mode = shadow.file.mode

        if shadow.root != "/":
            filename = str(Path(filename) / shadow.root)
        view = MuDataShadow(
            filename,
            array_backend=shadow._array_backend,
            table_backend=shadow._table_backend,
            mode=mode,
            format=shadow._format,
        )

        # NOTE: Cache is not preserved in a new object

        view._is_view = True
        view._ref = shadow
        view._oidx = oidx
        view._vidx = vidx

        if shadow.is_view:
            view._ref = shadow._ref
            for attr, idx in (("_oidx", oidx), ("_vidx", vidx)):
                shadow_idx = getattr(shadow, attr)
                if shadow_idx is not None:
                    # Length of the axis this index applies to in the original object
                    n_attr = shadow._ref.n_obs if attr == "_oidx" else shadow._ref.n_vars
                    if isinstance(shadow_idx, slice) and isinstance(idx, int | np.integer | slice):
                        r = range(*shadow_idx.indices(n_attr)).__getitem__(idx)
                        if isinstance(r, int | np.integer):
                            setattr(view, attr, np.array([r]))
                        else:
                            # Only a range result has start/stop/step
                            setattr(view, attr, slice(r.start, r.stop, r.step))
                    elif isinstance(shadow_idx, slice):
                        setattr(view, attr, np.arange(*shadow_idx.indices(n_attr))[idx])
                    elif hasattr(shadow_idx.dtype, "type") and issubclass(
                        shadow_idx.dtype.type, np.bool_
                    ):
                        if hasattr(idx.dtype, "type") and issubclass(idx.dtype.type, np.bool_):
                            # Mask of a mask: write the new mask into the selected positions
                            view_idx = shadow_idx.copy()
                            view_idx[view_idx] = idx
                            setattr(view, attr, view_idx)
                        else:
                            setattr(view, attr, shadow_idx[np.where(idx)[0]])
                    else:
                        setattr(view, attr, shadow_idx[idx])

        for mod, modality in view.mod.items():
            # Subsetting doesn't depend on axis:
            # axis implicitly influences .obsmap / .varmap
            if isinstance(oidx, slice) and oidx.start is None and oidx.stop is None:
                mod_obs = oidx
            else:
                mod_obs = shadow.obsmap[mod][oidx]
                if hasattr(mod_obs, "columns") and mod in mod_obs.columns:
                    mod_obs = mod_obs[mod].values
                # Map values are 1-based; 0 entries are dropped
                mod_obs = mod_obs[mod_obs != 0] - 1

            if isinstance(vidx, slice) and vidx.start is None and vidx.stop is None:
                mod_vars = vidx
            else:
                mod_vars = shadow.varmap[mod][vidx]
                if hasattr(mod_vars, "columns") and mod in mod_vars.columns:
                    mod_vars = mod_vars[mod].values
                mod_vars = mod_vars[mod_vars != 0] - 1

            view.mod[mod] = modality[mod_obs, mod_vars]
            view.mod[mod]._ref = shadow[mod]
            if hasattr(modality.file, "close") and callable(modality.file.close):
                modality.file.close()

        # TODO: avoid creating a non-view AnnData connection
        # in the MuDataShadow() constructor above

        return view

    @cached_property
    def _obsmap(self):
        """``ElemShadow`` over the ``obsmap`` group (empty when the group is absent)."""
        group_storage = (
            self.file[self.root]["obsmap"] if "obsmap" in self.file[self.root] else dict()
        )
        return ElemShadow(
            group_storage,
            key=str(Path(self.root) / "obsmap"),
            cache=self.__dict__,
            n_obs=self.n_obs,
            n_vars=self.n_vars,
            array_backend=self._array_backend,
            table_backend=self._table_backend,
            is_view=self.is_view,
            idx=(self._oidx, None),
        )

    @property
    def obsmap(self):
        return self._obsmap

    @cached_property
    def _varmap(self):
        """``ElemShadow`` over the ``varmap`` group (empty when the group is absent)."""
        group_storage = (
            self.file[self.root]["varmap"] if "varmap" in self.file[self.root] else dict()
        )
        return ElemShadow(
            group_storage,
            key=str(Path(self.root) / "varmap"),
            cache=self.__dict__,
            n_obs=self.n_obs,
            n_vars=self.n_vars,
            array_backend=self._array_backend,
            table_backend=self._table_backend,
            is_view=self.is_view,
            idx=(None, self._vidx),
        )

    @property
    def varmap(self):
        return self._varmap

    def clear_cache(self):
        """Clear the cache of the container and of every modality."""
        super().clear_cache()
        for modality in self.mod.values():
            modality.clear_cache()

    def close(self, close_modalities: bool = True):
        """Close the container, closing the modalities first unless told not to."""
        if close_modalities:
            for modality in self.mod.values():
                modality.close()
        super().close()

    def reopen(self, mode: str):
        """Reopen the container and its modalities in ``mode``; return ``self``.

        Nothing is done when the file is open in the requested mode already.
        """
        if not self.file or mode != self.file.mode:
            file = self.file.filename
            super().reopen(mode=mode)
            for modality in self.mod.values():
                modality.reopen(mode=mode, file=file)
        else:
            return self

        # Update ._group in all elements
        for key in ["mod"]:
            elem = getattr(self, key)
            if isinstance(elem, ElemShadow):
                elem._update_group(self.file[str(Path(self.root) / key)])

        return self

    def __repr__(self):
        if self.is_view:
            if self._ref is not None:
                s = f"View of MuData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars} (original {self._ref.n_obs} × {self._ref.n_vars})\n"
            else:
                s = f"View of MuData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n"
        else:
            s = f"MuData Shadow object with n_obs × n_vars = {self.n_obs} × {self.n_vars}\n"

        s += "\n".join([" " + line for line in super().__repr__().strip().split("\n")]) + "\n"

        # obsmap and varmap
        for k in ["obsmap", "varmap"]:
            item = getattr(self, k)
            if len(item) > 0:
                s += " " + item.__repr__()

        s += f" mod:\t{self.n_mod} modalit{'ies' if self.n_mod > 1 else 'y'}\n"
        for m, modality in self.mod.items():
            # Drop the modality's own header line; a compact one is written instead
            m_repr = modality.__repr__().strip().split("\n")[1:]
            s += f" {m}: {modality.n_obs} x {modality.n_vars}\n"
            s += "\n".join([" " + line for line in m_repr]) + "\n"
        return s

    # Writing

    def _push_changes(self, clear_cache: bool = False):
        """Push pending changes of the container and of every modality."""
        super()._push_changes(clear_cache=clear_cache)
        for modality in self.mod.values():
            modality._push_changes(
                clear_cache=clear_cache,
            )

    # Views

    def __getitem__(self, index):
        """Return the modality named ``index`` (str) or a view for any other index."""
        if isinstance(index, str):
            return self.mod[index]
        oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
        return MuDataShadow._init_as_view(self, oidx, vidx)

    #
    # Same as for AnnData above:
    # in the absence of duck typing in most tools,
    # the solution is to mock the class.
    #

    @property
    def __class__(self):
        try:
            from mudata import MuData

            return MuData
        except ModuleNotFoundError:
            return MuDataShadow
class LazyReader:
    """Defer conversion of ``data`` until it is indexed or called.

    ``reader`` is applied to ``data[index]`` each time the object is called
    with, or subscripted by, ``index``.
    """

    def __init__(self, reader, data):
        self.reader = reader
        self.data = data
        self.f = lambda data, index: reader(data[index])
        self.partial = partial(self.f, self.data)

    def __call__(self, value):
        return self.partial(value)

    def __getitem__(self, value):
        return self.partial(value)


def _get_backend_reader(backend, lazy: bool = False):
    """Return the callable that converts stored data for ``backend``.

    Parameters
    ----------
    backend
        Either a callable, which is used as the reader directly, or one of
        ``"numpy"``, ``"jax"``, ``"torch"`` / ``"pytorch"``, ``"pandas"``,
        ``"polars"``, ``"arrow"`` / ``"pyarrow"``. The backend library is
        imported only when it is requested.
    lazy
        If true, return a function that wraps the data in a ``LazyReader``
        instead of converting it immediately.

    Raises
    ------
    NotImplementedError
        If ``backend`` is neither callable nor a known backend name.
    """
    if callable(backend):
        reader = backend
    elif backend == "numpy":
        import numpy as np

        # TODO: Handle sparsity
        reader = np.array
    elif backend == "jax":
        import jax.numpy as jnp

        reader = jnp.array
    elif backend == "torch" or backend == "pytorch":
        import torch

        reader = torch.Tensor
    elif backend == "pandas":
        import pandas as pd

        reader = pd.DataFrame
    elif backend == "polars":
        import polars as pl

        reader = pl.from_dict
    elif backend == "arrow" or backend == "pyarrow":
        import pyarrow as pa

        reader = pa.Table.from_pydict
    else:
        # Raise rather than return the exception class: a returned class
        # would be called by the caller as if it were a reader.
        raise NotImplementedError(f"Backend {backend!r} is not supported")

    if lazy:
        base_reader = reader

        def reader(data):
            return LazyReader(base_reader, data)

    return reader
119 | import numpy as np 120 | 121 | in_memory = np.array(self._group) 122 | fields = in_memory.dtype.fields 123 | if fields is not None: 124 | self._elems = list(fields.keys()) 125 | if self._key not in cache: 126 | self._cache[self._key] = dict() 127 | for value in self._elems: 128 | value_path = str(Path(self._key) / value) 129 | value_out = in_memory[value] 130 | 131 | key_name = Path(self._key).name 132 | if is_view: 133 | oidx, vidx = idx 134 | if self._key.endswith("layers"): 135 | if oidx is not None and vidx is not None: 136 | value_out = value_out[oidx, vidx] 137 | elif oidx is not None: 138 | value_out = value_out.__getitem__(oidx) 139 | elif vidx is not None: 140 | value_out = value_out[:, vidx] 141 | elif key_name.startswith("obs"): 142 | if oidx is not None: 143 | value_out = value_out.__getitem__(oidx) 144 | if key_name == "obsp": 145 | value_out = value_out[:, oidx] 146 | elif key_name.startswith("var"): 147 | if vidx is not None: 148 | value_out = value_out.__getitem__(vidx) 149 | if key_name == "varp": 150 | value_out = value_out[:, vidx] 151 | 152 | self._cache[value_path] = value_out 153 | else: 154 | raise AttributeError("Cannot handle this legacy file: " + str(e)) from e 155 | 156 | self._newelems = dict() 157 | self._nested = dict() 158 | 159 | self._array_backend = array_backend 160 | self._table_backend = table_backend 161 | 162 | self.is_view = is_view 163 | self._idx = idx 164 | 165 | def __getitem__(self, value): 166 | value_path = str(Path(self._key) / value) 167 | if value_path in self._cache: 168 | return self._cache[value_path] 169 | elif value in self._newelems: 170 | return self._newelems[value] 171 | else: 172 | value_elem = self._group[value] 173 | # is_group = type(value_elem).__name__ == 'Group' # h5py.Group, zarr.hierarchy.Group 174 | is_group = isinstance(value_elem, get_args(GroupStorageType)) 175 | 176 | # Return the nested ElemShadow 177 | if value_path in self._nested: 178 | return self._nested[value_path] 179 | 180 | # 
Directly read it if it is a scalar dataset 181 | # NOTE: Sparse matrices and data frames are groups 182 | elif not is_group and value_elem.shape == (): 183 | value_out = self._group[value][()] 184 | if isinstance(value_out, bytes): 185 | try: 186 | # bytes -> string 187 | value_out = value_out.decode() 188 | except AttributeError: 189 | pass 190 | 191 | elif self._array_backend == "numpy" and self._table_backend == "pandas": 192 | # HOTFIX 193 | if self._group[value].__class__.__module__ == "pqdata.core": 194 | value_out = read_elem(self._group[value], _format="parquet") 195 | else: 196 | value_out = read_elem(self._group[value]) 197 | 198 | else: 199 | if ( 200 | "encoding-type" in value_elem.attrs 201 | and value_elem.attrs["encoding-type"] == "array" 202 | ): 203 | reader = _get_backend_reader(self._array_backend) 204 | elif ( 205 | "encoding-type" in value_elem.attrs 206 | and value_elem.attrs["encoding-type"] == "dataframe" 207 | ): 208 | reader = _get_backend_reader(self._table_backend) 209 | else: 210 | reader = _get_backend_reader(self._array_backend) 211 | # TODO: avoid reading the whole dataset 212 | if isinstance(self._group, PqGroup): 213 | value_out = read_elem(self._group[value], _format="parquet") 214 | try: 215 | value_out = reader(value_out) 216 | except ValueError as e: 217 | if hasattr(value_out, "todense") and callable(value_out.todense): 218 | value_out = reader(value_out.todense()) 219 | else: 220 | raise e 221 | else: 222 | try: 223 | value_out = reader(self._group[value][:]) 224 | except TypeError: 225 | # e.g. 
def __setitem__(self, key, value):
    """Stage ``value`` under ``key`` after checking its shape.

    For ``obsm`` / ``obsp`` / ``layers`` the first axis must match ``n_obs``
    (and the second one too for ``obsp``). For ``varm`` / ``varp`` the first
    axis must match ``n_vars`` (and the second one too for ``varp``); for
    ``layers`` the second axis must match ``n_vars``. An unknown dimension is
    taken from the stored element with the same key, if there is one.

    A value for an existing key goes to the nested map (stored groups) or to
    the cache (stored arrays); a value for a new key is kept as a new element.

    Raises
    ------
    AssertionError
        On a shape mismatch.
    """
    value_path = str(Path(self._key) / key)
    is_layers = self._key.endswith("layers")

    if self._key.endswith(("obsm", "obsp")) or is_layers:
        if self._n_obs is None and key in self._elems:
            self._n_obs = self._group[key].shape[0]

        if self._n_obs is not None:
            assert value.shape[0] == self._n_obs, "Shape mismatch"
            if self._key.endswith("obsp"):
                assert value.shape[1] == self._n_obs, "Shape mismatch"

    if self._key.endswith(("varm", "varp")) or is_layers:
        if self._n_vars is None and key in self._elems:
            # Layers are checked against n_vars on axis 1 below,
            # so that is also the axis to infer n_vars from.
            self._n_vars = self._group[key].shape[1 if is_layers else 0]

        if self._n_vars is not None:
            if is_layers:
                assert value.shape[1] == self._n_vars, "Shape mismatch"
            else:  # varm, varp
                assert value.shape[0] == self._n_vars, "Shape mismatch"
                if self._key.endswith("varp"):
                    assert value.shape[1] == self._n_vars, "Shape mismatch"

    if key in self._elems:
        if isinstance(self._group[key], get_args(GroupStorageType)):
            self._nested[value_path] = value
        else:
            self._cache[value_path] = value
    else:
        self._newelems[key] = value


def __delitem__(self, key):
    """Drop a not-yet-written element; stored elements cannot be deleted."""
    if key in self._newelems:
        del self._newelems[key]
    else:
        raise NotImplementedError("Cannot delete data that already exists in the file")


def __contains__(self, value):
    """Whether ``value`` is the key of a stored or a new element."""
    return value in self._elems or value in self._newelems


def __iter__(self):
    """Iterate over the keys (stored first, then new), as a mapping must."""
    return iter(self.keys())


def keys(self):
    """List of stored keys followed by new keys."""
    return self._elems + list(self._newelems.keys())


def values(self):
    """Yield the value for every key, in ``keys()`` order."""
    for key in self.keys():
        yield self[key]


def items(self):
    """Yield ``(key, value)`` pairs, stored elements first, then new ones."""
    for key in self._elems:
        yield key, self[key]

    yield from self._newelems.items()


def __len__(self):
    """Number of stored plus new elements."""
    return len(self._elems) + len(self._newelems)
new_elems_str 344 | if len(all_elems_str) > 0: 345 | s += f"{Path(self._key).name}:\t{', '.join(all_elems_str)}\n" 346 | 347 | return s 348 | 349 | # Writing 350 | 351 | def _push_changes(self, clear_cache: bool = False): 352 | if len(self._newelems) > 0: 353 | keys = list(self._newelems.keys()) 354 | for key in keys: 355 | write_elem(self._group, key, self._newelems[key]) 356 | if not clear_cache: 357 | self._cache[str(Path(self._key) / key)] = self._newelems[key] 358 | del self._newelems[key] 359 | self._elems = list(self._group.keys()) 360 | 361 | def _update_group(self, group): 362 | self._group = group 363 | for elem in self._nested.values(): 364 | elem._update_group(group) 365 | 366 | 367 | class RawElemShadow(ElemShadow): 368 | def __init__( 369 | self, 370 | group_storage, 371 | key: str, 372 | file: str, 373 | cache: dict | None = None, 374 | n_obs: int | None = None, 375 | n_vars: int | None = None, 376 | array_backend: str = "numpy", 377 | table_backend: str = "pandas", 378 | is_view: bool = False, 379 | idx=None, 380 | ): 381 | super().__init__( 382 | group_storage=group_storage, 383 | key=key, 384 | cache=cache, 385 | n_obs=n_obs, 386 | n_vars=n_vars, 387 | array_backend=array_backend, 388 | table_backend=table_backend, 389 | is_view=is_view, 390 | idx=idx, 391 | ) 392 | self.file = file 393 | self._ids = {"self": id(self)} 394 | 395 | @cached_property 396 | def _X(self): 397 | return self.__getitem__("X") 398 | 399 | @property 400 | def X(self): 401 | return self._X 402 | 403 | @cached_property 404 | def _var(self): 405 | return self.__getitem__("var") 406 | 407 | @property 408 | def var(self): 409 | return self._var 410 | 411 | @cached_property 412 | def _var_names(self): 413 | index = "_index" 414 | var = self._group["var"] 415 | if "_index" in var.attrs: 416 | index = var.attrs["_index"] 417 | if self.is_view and len(self._idx) > 1 and self._idx[1] is not None: 418 | return self._group["var"][index][self._idx[1]] 419 | return 
self._group["var"][index][:] 420 | 421 | @property 422 | def var_names(self): 423 | return self._var_names 424 | 425 | @cached_property 426 | def __n_obs(self): 427 | x = self._group["X"] 428 | if isinstance(x, get_args(ArrayStorageType)): 429 | n_obs = x.shape[0] 430 | else: 431 | n_obs = x.attrs["shape"][0] 432 | 433 | if self.is_view and self._idx[0] is not None: 434 | oidx = self._idx[0] 435 | if isinstance(oidx, slice): 436 | n_obs = len(range(n_obs).__getitem__(oidx)) 437 | else: 438 | n_obs = len(oidx) 439 | 440 | return n_obs 441 | 442 | @property 443 | def n_obs(self): 444 | if self._n_obs is None: 445 | return self.__n_obs 446 | return self._n_obs 447 | 448 | @cached_property 449 | def __n_vars(self): 450 | if "var" in self._group: 451 | var = self._group["var"] 452 | if isinstance(var, get_args(ArrayStorageType)): 453 | n_vars = var.shape[0] 454 | 455 | else: 456 | index = "_index" 457 | if "_index" in var.attrs: 458 | index = var.attrs["_index"] 459 | 460 | n_vars = var[index].shape[0] 461 | else: 462 | x = self._group["X"] 463 | if isinstance(x, get_args(ArrayStorageType)): 464 | n_vars = x.shape[1] 465 | else: 466 | n_vars = x.attrs["shape"][1] 467 | 468 | self._n_vars = n_vars 469 | return n_vars 470 | 471 | @property 472 | def n_vars(self): 473 | if self._n_vars is None: 474 | return self.__n_vars 475 | return self._n_vars 476 | 477 | @property 478 | def shape(self): 479 | return self.n_obs, self.n_vars 480 | 481 | @cached_property 482 | def _varm(self): 483 | storage_group = self._group["varm"] if "varm" in self._elems else dict() 484 | return ElemShadow( 485 | storage_group, 486 | key=str(Path(self._group.name) / "varm"), 487 | cache=self.__dict__, 488 | n_obs=self.n_obs, 489 | n_vars=self.n_vars, 490 | array_backend=self._array_backend, 491 | table_backend=self._table_backend, 492 | is_view=self.is_view, 493 | idx=self.idx, 494 | ) 495 | 496 | @property 497 | def varm(self): 498 | return self._varm 499 | 500 | # No writing: .raw is always 
def __init__(
    self,
    filepath: PathLike,
    array_backend: str = "numpy",
    table_backend: str = "pandas",
    mode: str = "r",
    format: Literal["hdf5", "zarr", "parquet", "h5", "pq", "pqdata"] | None = None,
    lazy: bool = False,
    parent_format: str | None = None,
):
    """Open the storage behind the shadow and record the backend settings.

    Parameters
    ----------
    filepath
        Path to a file/store, or to a group nested inside one
        (e.g. ``m.zarr/mod/x``). Objects whose class is named ``OpenFile``,
        ``FSMap`` or ``BufferedReader`` are handled as fsspec handles.
    array_backend, table_backend
        Stored unchanged on the instance.
    mode
        Mode handed to the backend's open call.
    format
        Storage format; shorthands are resolved through ``FORMAT_MAP``.
        When ``None``, it is inferred from the file suffix, then from
        ``parent_format``, and finally defaults to ``"hdf5"``.
    lazy
        Stored as the laziness flag.
    parent_format
        Fallback format used when the suffix is neither ``.zarr`` nor
        ``.pqdata`` and ``format`` is ``None``.

    Raises
    ------
    FileNotFoundError
        If neither the path nor any of its parent prefixes exists and the
        input is not an fsspec handle.
    ImportError
        If an fsspec handle is passed but fsspec is not installed.
    NotImplementedError
        If an fsspec handle is used with a format other than zarr/parquet.
    """
    # unify types
    handle_kind = filepath.__class__.__name__
    fpstr = str(filepath)
    if handle_kind == "OpenFile":
        # OpenFile<'file_path'>
        fpstr = str(filepath.path)
    elif handle_kind == "FSMap":
        fpstr = str(filepath.root)
    fpath = Path(fpstr)
    max_depth = fpstr.count("/")

    def _split_nested(depth: int) -> tuple[str, str]:
        """Split ``fpstr`` into (container path, last ``depth`` components)."""
        container, *inner = fpstr.rsplit("/", depth)
        return container, str(Path(inner[0]).joinpath(*inner[1:]))

    def _open(target, fmt):
        """Open ``target`` with the backend for ``fmt``.

        Backends are imported here, at the point of use, because the format
        can still change after the initial inference (see the container
        suffix lookup below).
        """
        if fmt == "zarr":
            import zarr

            return zarr.open(target, mode=mode)
        if fmt == "parquet":
            import pqdata

            return pqdata.open(target, mode=mode)
        # fallback to hdf5 by default
        if fmt != "hdf5":
            warn(f"Falling back to hdf5, provided format is '{fmt}' and not 'hdf5' or 'zarr'")
        import h5py

        return h5py.File(target, mode=mode)

    if format is None:
        logging.info("No format provided, trying to infer from the file extension")
        if fpath.suffix == ".zarr":
            format = "zarr"
        elif fpath.suffix == ".pqdata":
            format = "parquet"
        else:
            # NOTE: prioritizing the file extension over the parent format
            # allows to mix formats, e.g. store modalities in .zarr or .hdf5 files
            format = parent_format if parent_format is not None else "hdf5"

    # map the shorthands to the full names
    format = FORMAT_MAP.get(format, format)

    # Auto-detect the format for nested modalities
    # (e.g. m.zarr/mod/x, m.pqdata/mod/y).
    # The parentheses matter: without them `and` binds tighter than `or`
    # and the suffix check only applies to the "pqdata" case.
    if ("zarr" in fpstr or "pqdata" in fpstr) and fpath.suffix not in (".zarr", ".pqdata"):
        for depth in range(1, max_depth + 1):
            container, _ = _split_nested(depth)
            container_suffix = Path(container).suffix
            if container_suffix == ".zarr":
                format = "zarr"
                break
            if container_suffix == ".pqdata":
                format = "parquet"
                break

    if fpath.exists():
        self.file = _open(fpath, format)
        self.root = "/"
    else:
        # Walk up the path until an existing container is found; the
        # remainder of the path is the group inside that container.
        container, root = None, "/"
        for depth in range(1, max_depth + 1):
            candidate, inner = _split_nested(depth)
            # An empty prefix (path starting with "/") is not a container
            if candidate and Path(candidate).exists():
                container, root = candidate, inner
                break

        if container is not None:
            format = FORMAT_MAP.get(Path(container).suffix[1:], format)
            self.file = _open(container, format)
            self.root = root
            # Maybe prepend /mod to the modality name
            if root not in self.file and f"/mod/{root}" in self.file:
                self.root = f"/mod/{root}"
        elif handle_kind in ("BufferedReader", "OpenFile", "FSMap"):
            # fsspec support
            try:
                from fsspec.core import OpenFile
            except ImportError as e:
                raise ImportError(
                    "To read from remote storage or cache, install fsspec: pip install fsspec"
                ) from e

            # Reject unsupported formats before entering the handle so that
            # nothing is left open when the error is raised.
            if format not in ("zarr", "parquet"):
                raise NotImplementedError(
                    "Only zarr and parquet formats are supported for remote files. "
                    "HDF5 files have to be downloaded first."
                )

            fname = filepath
            if isinstance(filepath, OpenFile):
                fname = filepath.__enter__()
                # Keep the cleanup callable for .close(); calling it here
                # would close the handle before it is ever read.
                self._callback = filepath.close

            self.file = _open(fname, format)
            self.root = "/"
        else:
            raise FileNotFoundError(f"File {fpstr} does not seem to exist")

    self._array_backend = array_backend
    self._table_backend = table_backend
    self._ids = {"self": id(self)}
    self._format = format

    # View-related attributes
    self._is_view = False
    self._oidx = None
    self._vidx = None

    # Laziness behaviour
    self._lazy = lazy
must be 'obs' or 'var', not {axis}") 229 | 230 | if isinstance(axis, int): 231 | axis = "obs" if axis == 0 else "var" 232 | 233 | idx = self._oidx if axis == "obs" else self._vidx 234 | 235 | # Use anndata v0.8 spec reader 236 | reader = _get_backend_reader(self._table_backend, self._lazy) 237 | annot = self.file[self.root][axis] 238 | columns = {} 239 | 240 | if isinstance(annot, get_args(ArrayStorageType)): 241 | # Deal with legacy or parquet files 242 | 243 | # For legacy files, 244 | # correct the categories for different backends. 245 | categories = {} 246 | if "uns" in self.file: 247 | uns_keys = list(self.file["uns"]) 248 | cat_keys = [key for key in uns_keys if key.endswith("_categories")] 249 | categories = { 250 | key.removesuffix("_categories"): [e.decode() for e in self.file["uns"][key]] 251 | for key in cat_keys 252 | } 253 | 254 | if self._table_backend == "pandas": 255 | from pandas import Categorical, DataFrame 256 | 257 | table = DataFrame(read_elem(annot, _format=self._format)) 258 | if "_index" in annot.attrs: 259 | table = table.set_index(annot.attrs["_index"]) 260 | elif self._format == "hdf5" and "index" in (e[0] for e in annot.dtype.descr): 261 | table = table.set_index("index") 262 | 263 | for column in table.columns: 264 | if column in categories: 265 | table[column] = Categorical.from_codes( 266 | table[column], categories=categories[column] 267 | ) 268 | 269 | if self.is_view: 270 | return table.iloc[idx] 271 | 272 | return table 273 | elif self._table_backend == "polars": 274 | import polars as pl 275 | 276 | cat_map = lambda col: lambda x: pl.Series(categories[col])[x] 277 | 278 | table = read_elem(annot, _format=self._format) 279 | table = pl.DataFrame(table) 280 | 281 | for column in table.columns: 282 | if column in categories: 283 | table = table.with_columns( 284 | [pl.col(column).map(cat_map(column)).cast(pl.Categorical).alias(column)] 285 | ) 286 | 287 | if self.is_view: 288 | import numpy as np 289 | 290 | if not isinstance(idx, 
slice) and ( 291 | isinstance(idx.dtype, pl.Boolean) 292 | or hasattr(idx.dtype, "type") 293 | and issubclass(idx.dtype.type, np.bool_) 294 | ): 295 | return table.filter(idx) 296 | return table.__getitem__(idx) 297 | 298 | return table 299 | elif self._table_backend == "pyarrow": 300 | import pandas as pd 301 | import pyarrow as pa 302 | 303 | table = read_elem(annot, _format=self._format) 304 | table = pd.DataFrame(table) 305 | 306 | for column in table.columns: 307 | if column in categories: 308 | table[column] = pd.Categorical.from_codes( 309 | table[column], categories=categories[column] 310 | ) 311 | 312 | table = pa.Table.from_pandas(table) 313 | 314 | if self.is_view: 315 | import numpy as np 316 | 317 | if ( 318 | not isinstance(idx, slice) 319 | and hasattr(idx.dtype, "type") 320 | and issubclass(idx.dtype.type, np.bool_) 321 | ): 322 | return table.filter(idx) 323 | return table.__getitem__(idx) 324 | 325 | return table 326 | else: 327 | raise NotImplementedError( 328 | "Alternative backends are not available " 329 | "for the legacy AnnData/MuData specification." 
330 | ) 331 | 332 | if self._table_backend == "pandas": 333 | table = read_elem(annot, _format=self._format) 334 | 335 | if self.is_view: 336 | return table.iloc[idx] 337 | 338 | return table 339 | 340 | # else (only for AnnData >=0.8) 341 | for key, value in annot.items(): 342 | if key == "__categories": 343 | continue 344 | col = read_elem(value, _format=self._format) 345 | if self._table_backend == "polars": 346 | if "encoding-type" in value.attrs and value.attrs["encoding-type"] == "categorical": 347 | import polars as pl 348 | 349 | col = pl.Series(col.astype(str)).cast(pl.Categorical) 350 | else: 351 | raise NotImplementedError("Alternative backends are not fully supported just yet.") 352 | columns[key] = col 353 | 354 | table = reader(columns) 355 | 356 | if self.is_view: 357 | if self._table_backend == "pandas": 358 | return table.iloc[idx] 359 | return table.__getitem__(idx) 360 | 361 | return table 362 | 363 | @cached_property 364 | def _obs(self): 365 | return self._annot("obs") 366 | 367 | @property 368 | def obs(self): 369 | return self._obs 370 | 371 | @cached_property 372 | def _var(self): 373 | return self._annot("var") 374 | 375 | @property 376 | def var(self): 377 | return self._var 378 | 379 | def __names(self, axis: str): 380 | """ 381 | Internal method to get the names of the obs or var axis 382 | """ 383 | assert axis in ["obs", "var"], "axis must be 'obs' or 'var'" 384 | 385 | from pandas import Index 386 | 387 | attr = self.file[self.root][axis] 388 | 389 | # Handle legacy 390 | if isinstance(attr, get_args(ArrayStorageType)): 391 | attr_df = getattr(self, axis) 392 | if hasattr(attr_df, "index"): 393 | names = attr_df.index 394 | elif hasattr(attr_df, "column_names"): # pyarrow 395 | if "index" in attr_df.column_names: 396 | names = Index(attr_df["index"]) 397 | elif "__index_level_0__" in attr_df.column_names: 398 | names = Index(attr_df["__index_level_0__"]) 399 | elif hasattr(attr_df, "schema"): 400 | if hasattr(attr_df.schema, 
"metadata") and b"pandas" in attr_df.schema.metadata: 401 | import json 402 | 403 | pd_meta = json.loads(attr_df.schema.metadata[b"pandas"]) 404 | names = Index(attr_df[pd_meta["index_columns"][0]].to_numpy()) 405 | else: 406 | raise ValueError(f"Empty {axis}_names") 407 | elif hasattr(attr_df, "columns"): 408 | if "index" in attr_df.columns: 409 | names = Index(attr_df["index"]) 410 | elif "__index_level_0__" in attr_df.columns: 411 | names = Index(attr_df["__index_level_0__"]) 412 | else: 413 | from pyarrow import parquet as pq 414 | 415 | # TODO: Refactor e.g. by implementing read_elem_schema 416 | filename = self.file[self.root][axis].path 417 | schema = pq.read_schema(filename) 418 | 419 | import json 420 | 421 | try: 422 | pd_meta = json.loads(schema.metadata[b"pandas"]) 423 | except KeyError as e: 424 | raise KeyError(f"Metadata from pandas not found in the schema: {e}") 425 | 426 | names = Index(attr_df[pd_meta["index_columns"][0]]) 427 | else: 428 | raise ValueError(f"Empty {axis}_names") 429 | 430 | else: 431 | index = "_index" 432 | if "_index" in attr.attrs: 433 | index = attr.attrs["_index"] 434 | 435 | try: 436 | if self.is_view: 437 | indices = self._oidx if axis == "obs" else self._vidx 438 | names = Index(self.file[self.root][axis][index][:][indices]) 439 | else: 440 | names = Index(self.file[self.root][axis][index][:]) 441 | except KeyError: 442 | index = "__index_level_0__" 443 | if self.is_view: 444 | indices = self._oidx if axis == "obs" else self._vidx 445 | names = Index(self.file[self.root][axis][index][:][indices]) 446 | else: 447 | names = Index(self.file[self.root][axis][index][:]) 448 | 449 | # only string index 450 | if all(isinstance(e, bytes) for e in names): 451 | try: 452 | names = names.str.decode("utf-8") 453 | except AttributeError: 454 | pass 455 | 456 | return names 457 | 458 | @cached_property 459 | def _obs_names(self): 460 | """ 461 | Note: currently, anndata relies on pd.Index here 462 | """ 463 | return self.__names("obs") 
@cached_property
def _n_obs(self):
    """Number of observations, taking the current view into account.

    The full count is read from the stored ``obs``: its first dimension
    when it is array storage, otherwise the first dimension of its index
    column (named by the ``_index`` attribute, ``"_index"`` by default).
    For a view with an observation index, the count of selected
    observations is returned instead.
    """
    obs_storage = self.file[self.root]["obs"]
    if isinstance(obs_storage, get_args(ArrayStorageType)):
        total = obs_storage.shape[0]
    else:
        index_key = obs_storage.attrs["_index"] if "_index" in obs_storage.attrs else "_index"
        total = obs_storage[index_key].shape[0]

    # Not a view, or a view that does not restrict observations
    if not self.is_view or self._oidx is None:
        return total

    selection = self._oidx
    if isinstance(selection, slice):
        return len(range(total)[selection])

    import numpy as np

    # Boolean masks count their selected entries; other indexers their length
    if issubclass(selection.dtype.type, np.bool_):
        return selection.sum()
    return len(selection)
self.file[self.root] else dict() 545 | return ElemShadow( 546 | group_storage, 547 | key=str(Path(self.root) / "obsm"), 548 | cache=self.__dict__, 549 | n_obs=self.n_obs, 550 | n_vars=self.n_vars, 551 | array_backend=self._array_backend, 552 | table_backend=self._table_backend, 553 | is_view=self.is_view, 554 | idx=(self._oidx, None), 555 | ) 556 | 557 | @property 558 | def obsm(self): 559 | return self._obsm 560 | 561 | def obsm_keys(self) -> list[str]: 562 | return list(self._obsm.keys()) 563 | 564 | @cached_property 565 | def _varm(self): 566 | group_storage = self.file[self.root]["varm"] if "varm" in self.file[self.root] else dict() 567 | return ElemShadow( 568 | group_storage, 569 | key=str(Path(self.root) / "varm"), 570 | cache=self.__dict__, 571 | n_obs=self.n_obs, 572 | n_vars=self.n_vars, 573 | array_backend=self._array_backend, 574 | table_backend=self._table_backend, 575 | is_view=self.is_view, 576 | idx=(None, self._vidx), 577 | ) 578 | 579 | @property 580 | def varm(self): 581 | return self._varm 582 | 583 | def varm_keys(self) -> list[str]: 584 | return list(self._varm.keys()) 585 | 586 | @cached_property 587 | def _obsp(self): 588 | group_storage = self.file[self.root]["obsp"] if "obsp" in self.file[self.root] else dict() 589 | return ElemShadow( 590 | group_storage, 591 | key=str(Path(self.root) / "obsp"), 592 | cache=self.__dict__, 593 | n_obs=self.n_obs, 594 | n_vars=self.n_vars, 595 | array_backend=self._array_backend, 596 | table_backend=self._table_backend, 597 | is_view=self.is_view, 598 | idx=(self._oidx, self._oidx), 599 | ) 600 | 601 | @property 602 | def obsp(self): 603 | return self._obsp 604 | 605 | @cached_property 606 | def _varp(self): 607 | # if "varp" not in self.file[self.root]: 608 | # return EmptySlot() 609 | group_storage = self.file[self.root]["varp"] if "varp" in self.file[self.root] else dict() 610 | return ElemShadow( 611 | group_storage, 612 | key=str(Path(self.root) / "varp"), 613 | cache=self.__dict__, 614 | 
def close(self):
    """Close the underlying storage and run the cleanup callback, if any.

    For the zarr format the store of ``self.file`` is closed; for every
    other format ``self.file`` itself is closed. Afterwards the optional
    ``_callback`` attribute is invoked when it is callable, regardless of
    the format, so that a handle registered at construction is released.
    """
    if self._format == "zarr":
        self.file.store.close()
    else:
        self.file.close()

    # The callback used to be skipped for zarr because of an early return,
    # which left the registered handle open.
    callback = getattr(self, "_callback", None)
    if callable(callback):
        callback()
694 | if file is None: 695 | raise ValueError("The connection is closed but no new file name is provided.") 696 | self.close() 697 | if self._format == "zarr": 698 | self.file = zarr.open(file, mode=mode) 699 | else: 700 | self.file = h5py.File(file, mode=mode) 701 | elif self._format == "zarr": 702 | if self.file.read_only and mode != "r" or mode == "r" and not self.file.read_only: 703 | file = file or self.file.store.path 704 | self.close() 705 | self.file = zarr.open(file, mode=mode) 706 | elif mode != self.file.mode: 707 | file = file or self.file.filename 708 | self.close() 709 | self.file = h5py.File(file, mode=mode) 710 | else: 711 | return 712 | 713 | # FIXME: parquet support 714 | 715 | # Update ._group in all elements 716 | for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "uns", "layers"]: 717 | if key in ["obs", "var"]: 718 | # In the current implementation attributes are not ElemShadows 719 | pass 720 | elif hasattr(self, key): 721 | elem = getattr(self, key) 722 | if isinstance(elem, ElemShadow): 723 | elem._update_group(self.file[str(Path(self.root) / key)]) 724 | 725 | return 726 | 727 | def __repr__(self): 728 | s = "" 729 | for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "uns"]: 730 | key_cached = key in self.__dict__ 731 | key_cached_str = RUNECACHED if key_cached else "" 732 | 733 | if key in ["obs", "var"]: 734 | if key in self.__dict__: 735 | s += f"{key}{key_cached_str}:\t" 736 | s += f"{', '.join(map(str, getattr(self, key).columns))}\n" 737 | else: 738 | try: 739 | key_elems = self.file[self.root][key].keys() 740 | except AttributeError: 741 | # Do not extract column names from the pre-0.8 AnnData 742 | key_elems = ["..."] 743 | # For parquet files, keys can be read from the schema 744 | if self._format == "parquet" or self._format == "pyarrow": 745 | try: 746 | from pyarrow import parquet as pq 747 | 748 | filename = self.file[self.root][key].path 749 | schema = pq.read_schema(filename) 750 | key_elems = schema.names 751 
def _push_changes(self, clear_cache: bool = False):
    """Forward ``_push_changes`` to every slot that is an ``ElemShadow``.

    Slots that are missing on this object, or that are not ``ElemShadow``
    instances, are skipped. ``clear_cache`` is passed through unchanged.
    """
    slot_names = ["obs", "var", "obsm", "varm", "obsp", "varp", "uns", "layers"]
    for slot_name in slot_names:
        if not hasattr(self, slot_name):
            continue
        slot = getattr(self, slot_name)
        if not isinstance(slot, ElemShadow):
            continue
        slot._push_changes(
            clear_cache=clear_cache,
        )
def reopen_and_write(self, mode: str = "r+", *args, **kwargs) -> None:
    """Reopen the file in ``mode``, write pending changes, restore the mode.

    Parameters
    ----------
    mode
        Mode used while writing (``"r+"`` by default).
    *args, **kwargs
        Forwarded to :meth:`write`.

    Any exception raised while writing is reported as a warning instead of
    being propagated; the file is reopened in its original mode either way.
    """
    # Zarr groups are queried through `read_only` throughout this class
    # (they are not asked for `.mode`), so derive the mode to restore from it.
    if self._format == "zarr":
        original_mode = "r" if self.file.read_only else "r+"
    else:
        original_mode = self.file.mode

    self.reopen(mode)
    try:
        self.write(*args, **kwargs)
    except Exception as e:
        # Deliberate best-effort: report the failure but still restore the mode
        warn(f"An error occurred while writing the changes:\n{e}")
    finally:
        self.reopen(original_mode)
"934b8d69-b812-422f-b718-080bb8508348", 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | }, 64 | "tags": [] 65 | }, 66 | "source": [ 67 | "## Shadows for zarr storage\n", 68 | "\n", 69 | "Beyond H5AD and H5MU files, shadow objects also work with [Zarr](https://zarr.dev/) files." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "65462d07-01b0-4395-8891-eda01e472f38", 75 | "metadata": {}, 76 | "source": [ 77 | " " 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "4a38075c-8da2-4193-af1a-c52e18176f92", 83 | "metadata": { 84 | "slideshow": { 85 | "slide_type": "fragment" 86 | }, 87 | "tags": [] 88 | }, 89 | "source": [ 90 | "Import classes for these shadow objects:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "079454ed-10dc-47ef-9de2-ef70f95dbed6", 97 | "metadata": { 98 | "slideshow": { 99 | "slide_type": "fragment" 100 | }, 101 | "tags": [] 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "from shadows import AnnDataShadow, MuDataShadow" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "564f7b2b-063d-4f0e-8333-c178565ee2d2", 111 | "metadata": {}, 112 | "source": [ 113 | " " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "6b819452-470f-47b7-8fa0-0c8304fd557c", 119 | "metadata": { 120 | "slideshow": { 121 | "slide_type": "fragment" 122 | }, 123 | "tags": [] 124 | }, 125 | "source": [ 126 | "Initialise a multimodal shadow object:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "id": "3ff358c0-2c77-460a-97a9-398f615a0e17", 133 | "metadata": { 134 | "slideshow": { 135 | "slide_type": "fragment" 136 | }, 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "file = data / \"pbmc5k_citeseq/minipbcite_prot.zarr\"\n", 142 | "adata = AnnDataShadow(file, format=\"zarr\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "1747c671-ffc2-4d4d-8a04-7dc44432b2fb", 148 | 
"metadata": {}, 149 | "source": [ 150 | " " 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "b8ae6d73-9a74-48ed-9d41-7e92bfee8f71", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "slide" 159 | }, 160 | "tags": [] 161 | }, 162 | "source": [ 163 | "### File\n", 164 | "\n", 165 | "As with HDF5 files, file connection that the shadow is using can be accessed via the `.file` attribute:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "id": "33c47ede-e566-43ac-8596-470263d21b3a", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "" 178 | ] 179 | }, 180 | "execution_count": 7, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "adata.file" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "a43127df-c330-4104-bbf6-399c7392c373", 192 | "metadata": {}, 193 | "source": [ 194 | "The path to the file can then be accessed via `adata.file.store.path`:" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 8, 200 | "id": "2f7dec24-9cc3-4cf2-a044-a6e487c17315", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "'minipbcite_prot.zarr'" 207 | ] 208 | }, 209 | "execution_count": 8, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "os.path.basename(adata.file.store.path)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "0574136f-7aa4-4a1e-9312-eee5fc9c6744", 221 | "metadata": { 222 | "slideshow": { 223 | "slide_type": "subslide" 224 | }, 225 | "tags": [] 226 | }, 227 | "source": [ 228 | "Zarr store will be closed upon calling the `adata.close()` method:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 9, 234 | "id": "0b14eda7-0343-4f8d-82d9-ffc7257a1a11", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "adata.close()" 239 | 
] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "a87e0e96-86c2-4623-b239-892e92b04a5a", 244 | "metadata": { 245 | "slideshow": { 246 | "slide_type": "fragment" 247 | }, 248 | "tags": [] 249 | }, 250 | "source": [ 251 | "... or until the file has to be re-opened for modification (see below)." 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "5a064df4-b533-4124-a85a-f7b20fcc1091", 257 | "metadata": {}, 258 | "source": [ 259 | " " 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "9beb85a9-e226-4b9a-949b-2351432558f7", 265 | "metadata": { 266 | "slideshow": { 267 | "slide_type": "slide" 268 | }, 269 | "tags": [] 270 | }, 271 | "source": [ 272 | "### Permissions\n", 273 | "\n", 274 | "We can open Zarr files in different modes including purely read-only (`'r'`) and read/write (`'r+'`). The mode can be provided to the constructor:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 10, 280 | "id": "9f297beb-97b5-46ad-97b9-2dedc5c40b53", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "True" 287 | ] 288 | }, 289 | "execution_count": 10, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "adata = AnnDataShadow(file, format=\"zarr\", mode=\"r\")\n", 296 | "adata.file.read_only" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "fc9da2a5-402f-4fe8-83a2-0a5f06a84d7c", 302 | "metadata": {}, 303 | "source": [ 304 | "Let's add some data to the in-memory shadow object:" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 11, 310 | "id": "21f291bd-7c5d-4ef3-a034-c0030dabdb60", 311 | "metadata": { 312 | "slideshow": { 313 | "slide_type": "fragment" 314 | }, 315 | "tags": [] 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "adata.obsm[\"X_pca_copy\"] = adata.obsm[\"X_pca\"].copy()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "id": 
"b03108f5-0e8a-4646-af12-ef5fc934885b", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "subslide" 328 | }, 329 | "tags": [] 330 | }, 331 | "source": [ 332 | "We can also conveniently close and reopen the connection for a given in-memory shadow object:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 12, 338 | "id": "e8ddb228-74b4-4f8e-8cdc-c84479f38d2d", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "False" 345 | ] 346 | }, 347 | "execution_count": 12, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "adata.reopen(mode=\"r+\")\n", 354 | "adata.file.read_only" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "48157734-adc0-4e7d-8157-64e1201b6fba", 360 | "metadata": {}, 361 | "source": [ 362 | "This way all the newly added elements are still available in memory:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 13, 368 | "id": "043428b5-dc58-4d0c-b653-e1d8451b39f9", 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "obsm:\tX_pcaᐁ, X_umap, X_pca_copy▲" 375 | ] 376 | }, 377 | "execution_count": 13, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "adata.obsm" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 14, 389 | "id": "50aba055-06e2-490d-a1a6-3307ef7ac6d0", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "fragment" 393 | }, 394 | "tags": [] 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "# Clean up\n", 399 | "adata.close()\n", 400 | "del adata" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "991ccc6a-f182-4689-802d-a9ae70a490e4", 406 | "metadata": {}, 407 | "source": [ 408 | " " 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "2dbc52ad-6010-416f-810b-c60e5546ba7b", 414 | "metadata": { 415 | 
"slideshow": { 416 | "slide_type": "slide" 417 | }, 418 | "tags": [] 419 | }, 420 | "source": [ 421 | "### Individual modality access\n", 422 | "\n", 423 | "Individual modalities stored in the .h5mu files can be accessed as part of the `MuDataShadow` object:" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 15, 429 | "id": "d5ea1511-6f1b-4c51-9ec7-14365dc8d391", 430 | "metadata": { 431 | "slideshow": { 432 | "slide_type": "fragment" 433 | }, 434 | "tags": [] 435 | }, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "AnnData Shadow object with n_obs × n_vars = 411 × 29\n", 441 | " X \n", 442 | " layers:\tcounts\n", 443 | " obs:\t_index\n", 444 | " var:\t_index, feature_types, gene_ids, highly_variable\n", 445 | " obsm:\tX_pca, X_umap\n", 446 | " varm:\tPCs\n", 447 | " obsp:\tconnectivities, distances\n", 448 | " uns:\tneighbors, pca, umap" 449 | ] 450 | }, 451 | "execution_count": 15, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "adata = AnnDataShadow(file, format=\"zarr\")\n", 458 | "adata" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 16, 464 | "id": "946d03a9-d0d1-4ebc-ae29-92d795f08073", 465 | "metadata": { 466 | "slideshow": { 467 | "slide_type": "fragment" 468 | }, 469 | "tags": [] 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "# Clean up\n", 474 | "adata.close()\n", 475 | "del adata" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "id": "14b8ad11-adad-4ea8-9146-3dd7cd9bd415", 481 | "metadata": {}, 482 | "source": [ 483 | " " 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "id": "d3ae2a84-34fc-48b9-926e-a5d5f57e4e73", 489 | "metadata": { 490 | "slideshow": { 491 | "slide_type": "slide" 492 | }, 493 | "tags": [] 494 | }, 495 | "source": [ 496 | "### Class identity\n", 497 | "\n", 498 | "Many tools in the ecosystem including scanpy frequently check if the input object is an AnnData. 
For instance, [in `sc.pp.highly_variable_genes`](https://github.com/scverse/scanpy/blob/master/scanpy/preprocessing/_highly_variable_genes.py) it reads:\n", 499 | "\n", 500 | "```py\n", 501 | "if not isinstance(adata, AnnData):\n", 502 | " raise ValueError(\n", 503 | " '`pp.highly_variable_genes` expects an `AnnData` argument, '\n", 504 | " 'pass `inplace=False` if you want to return a `pd.DataFrame`.'\n", 505 | " )\n", 506 | "```\n", 507 | "\n", 508 | "In order for shadow objects to be accepted by such functions, they mock their class identity:" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 17, 514 | "id": "f10b98ff-920f-4d46-924f-1cf3074236db", 515 | "metadata": { 516 | "slideshow": { 517 | "slide_type": "subslide" 518 | }, 519 | "tags": [] 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "adata = AnnDataShadow(file, format=\"zarr\")\n", 524 | "\n", 525 | "from anndata import AnnData\n", 526 | "assert isinstance(adata, AnnData), \"adata is not a valid AnnData object\"" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "id": "f8e2d4a9-eba2-45c0-88f6-35f69e7d0249", 532 | "metadata": { 533 | "slideshow": { 534 | "slide_type": "subslide" 535 | }, 536 | "tags": [] 537 | }, 538 | "source": [ 539 | "Checking for shadow identity still works:" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 18, 545 | "id": "efadd4ba-219c-4c84-a1eb-36baf135c82d", 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "True" 552 | ] 553 | }, 554 | "execution_count": 18, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "isinstance(adata, AnnDataShadow)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 19, 566 | "id": "a32515de-7866-4229-a639-0818a0dbea3b", 567 | "metadata": { 568 | "slideshow": { 569 | "slide_type": "fragment" 570 | }, 571 | "tags": [] 572 | }, 573 | "outputs": [], 574 | "source": [ 
575 | "adata.close()" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "id": "8d4e683f-0a0b-426c-8cf7-5f5529a844d2", 581 | "metadata": {}, 582 | "source": [ 583 | " " 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "id": "c29f18b0-717b-4821-b0f8-e81ca94426de", 589 | "metadata": {}, 590 | "source": [ 591 | "### Backends\n", 592 | "\n", 593 | "AnnData/MuData are based on a NumPy/Pandas stack. This is the default for the shadow objects in order to provide compatibility with AnnData/MuData objects.\n", 594 | "\n", 595 | "However the nature of shadow files also simplifies loading individual matrices or tables with alternative backends, e.g. [JAX](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.array.html#jax.numpy.array) (`Array`), [PyTorch](https://pytorch.org/docs/stable/tensors.html) (`Tensor`) or [polars](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html) (`DataFrame`)." 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 20, 601 | "id": "734d4e9e-3936-4911-96fe-1bed3de167eb", 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "adata = AnnDataShadow(file, format=\"zarr\", array_backend=\"jax\", table_backend=\"polars\")" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 21, 611 | "id": "3d909ef6-92b7-40f4-b50e-641993469791", 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "name": "stdout", 616 | "output_type": "stream", 617 | "text": [ 618 | "\n" 619 | ] 620 | }, 621 | { 622 | "data": { 623 | "text/html": [ 624 | "
\n", 625 | "\n", 654 | "\n", 655 | "shape: (5, 1)\n", 656 | "\n", 657 | "\n", 658 | "\n", 661 | "\n", 662 | "\n", 663 | "\n", 666 | "\n", 667 | "\n", 668 | "\n", 669 | "\n", 670 | "\n", 673 | "\n", 674 | "\n", 675 | "\n", 678 | "\n", 679 | "\n", 680 | "\n", 683 | "\n", 684 | "\n", 685 | "\n", 688 | "\n", 689 | "\n", 690 | "\n", 693 | "\n", 694 | "\n", 695 | "
\n", 659 | "_index\n", 660 | "
\n", 664 | "object\n", 665 | "
\n", 671 | "CAGCCAGGTCTCGACG-1\n", 672 | "
\n", 676 | "TTCTTCCTCTCGGTAA-1\n", 677 | "
\n", 681 | "CGGGTCAAGAGAGGTA-1\n", 682 | "
\n", 686 | "TACCCGTCATAATCCG-1\n", 687 | "
\n", 691 | "TGGGTTAGTGAATTAG-1\n", 692 | "
\n", 696 | "
" 697 | ], 698 | "text/plain": [ 699 | "shape: (5, 1)\n", 700 | "┌────────────────────┐\n", 701 | "│ _index │\n", 702 | "│ --- │\n", 703 | "│ object │\n", 704 | "╞════════════════════╡\n", 705 | "│ CAGCCAGGTCTCGACG-1 │\n", 706 | "│ TTCTTCCTCTCGGTAA-1 │\n", 707 | "│ CGGGTCAAGAGAGGTA-1 │\n", 708 | "│ TACCCGTCATAATCCG-1 │\n", 709 | "│ TGGGTTAGTGAATTAG-1 │\n", 710 | "└────────────────────┘" 711 | ] 712 | }, 713 | "execution_count": 21, 714 | "metadata": {}, 715 | "output_type": "execute_result" 716 | } 717 | ], 718 | "source": [ 719 | "obs = adata.obs\n", 720 | "print(type(obs))\n", 721 | "obs.head()" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 22, 727 | "id": "32286100-13e4-49af-8194-f53693c9b7f0", 728 | "metadata": {}, 729 | "outputs": [ 730 | { 731 | "name": "stdout", 732 | "output_type": "stream", 733 | "text": [ 734 | "\n" 735 | ] 736 | }, 737 | { 738 | "data": { 739 | "text/plain": [ 740 | "Array([[ 17.051027 , 1.2865539 , -1.2715828 , ..., -0.05060111,\n", 741 | " -1.8431426 , -1.0410113 ],\n", 742 | " [ 15.563506 , -2.1941857 , -1.351732 , ..., -1.0639406 ,\n", 743 | " -0.1610156 , 2.1454387 ],\n", 744 | " [ 20.369316 , -8.03503 , 0.3842825 , ..., 0.52950376,\n", 745 | " -0.38589898, -0.7488529 ],\n", 746 | " ...,\n", 747 | " [-11.894565 , 9.380491 , -0.87732434, ..., -0.40848297,\n", 748 | " 0.4135897 , -0.710097 ],\n", 749 | " [-13.12094 , 9.734974 , -3.345742 , ..., 1.049644 ,\n", 750 | " 0.28707528, -1.8128693 ],\n", 751 | " [-12.875325 , 11.512296 , -4.9828258 , ..., -0.82176274,\n", 752 | " -2.06324 , -0.14073044]], dtype=float32)" 753 | ] 754 | }, 755 | "execution_count": 22, 756 | "metadata": {}, 757 | "output_type": "execute_result" 758 | } 759 | ], 760 | "source": [ 761 | "rna_pca = adata.obsm[\"X_pca\"]\n", 762 | "print(type(rna_pca))\n", 763 | "rna_pca" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "id": "6cdad910-a34c-49d2-bc03-87bfde9417c9", 769 | "metadata": {}, 770 | "source": [ 771 | "When 
alternative backends are being used, not all of the AnnData/MuData features can be supported, and many external tools might not work as expected as they anticipate NumPy/Pandas objects instead." 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 23, 777 | "id": "b06a9071-0443-41e6-ac81-e3f0ce2653e9", 778 | "metadata": {}, 779 | "outputs": [], 780 | "source": [ 781 | "# Clean up\n", 782 | "adata.clear_cache()\n", 783 | "adata.close()\n", 784 | "del adata, rna_pca, obs" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "id": "6c474c9e-dfea-406c-ace6-461e8d5438a4", 790 | "metadata": {}, 791 | "source": [ 792 | " " 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "id": "16f9b372-a089-4aed-b91e-b368a2ddc13e", 798 | "metadata": { 799 | "slideshow": { 800 | "slide_type": "slide" 801 | }, 802 | "tags": [] 803 | }, 804 | "source": [ 805 | "### Partial writing\n", 806 | "\n", 807 | "> [!NOTE]\n", 808 | "> This feature is experimental.\n", 809 | "\n", 810 | "While the main use of the shadows is to provide a low-memory read-only solution to scverse datasets, the ability to add new embeddings or other items to the file can greatly extend their usage patterns." 
811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": 24, 816 | "id": "02245bc0-cc92-4fe7-b665-a4e2f424b353", 817 | "metadata": { 818 | "slideshow": { 819 | "slide_type": "fragment" 820 | }, 821 | "tags": [] 822 | }, 823 | "outputs": [], 824 | "source": [ 825 | "adata = AnnDataShadow(file, format=\"zarr\")" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "id": "c7324f1c-c4a4-4561-9680-0ac5caacc79f", 831 | "metadata": {}, 832 | "source": [ 833 | "Add a new embedding to the in-memory object:" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 25, 839 | "id": "eb6f076f-0b26-428b-a824-a82b3d648c00", 840 | "metadata": { 841 | "slideshow": { 842 | "slide_type": "fragment" 843 | }, 844 | "tags": [] 845 | }, 846 | "outputs": [ 847 | { 848 | "data": { 849 | "text/plain": [ 850 | "obsm:\tX_pcaᐁ, X_umap, X_pca_copy▲" 851 | ] 852 | }, 853 | "execution_count": 25, 854 | "metadata": {}, 855 | "output_type": "execute_result" 856 | } 857 | ], 858 | "source": [ 859 | "adata.obsm[\"X_pca_copy\"] = adata.obsm[\"X_pca\"].copy()\n", 860 | "adata.obsm" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "id": "0a7a6374-cb13-4f3a-8f5b-e0c4b4f89363", 866 | "metadata": { 867 | "slideshow": { 868 | "slide_type": "subslide" 869 | }, 870 | "tags": [] 871 | }, 872 | "source": [ 873 | "For this, a family of methods is useful, including `.reopen()` and `.write()`. The `.write()` method will only work if the connection is not read-only, e.g. `'r+'`, however it is possible to reopen the file in another mode.\n", 874 | "\n", 875 | "Internally, `.write()` pushes (`._push_changes()`) the in-memory changes (marked with ▲ in the object representation above) to the file and provides meaningful error messages when the file is not open for writing.\n", 876 | "\n", 877 | "This separation of concern makes it transparent when the data is modified, and this workflow can be recommended when barely any data are added to the file. 
As the methods return the shadow itself, it is possible to chain them:" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 26, 883 | "id": "bcfa2982-4bf6-42eb-a604-d17d6496598b", 884 | "metadata": { 885 | "slideshow": { 886 | "slide_type": "fragment" 887 | }, 888 | "tags": [] 889 | }, 890 | "outputs": [ 891 | { 892 | "data": { 893 | "text/plain": [ 894 | "obsm:\tX_pcaᐁ, X_pca_copy, X_umap" 895 | ] 896 | }, 897 | "execution_count": 26, 898 | "metadata": {}, 899 | "output_type": "execute_result" 900 | } 901 | ], 902 | "source": [ 903 | "adata.reopen(mode='r+').write(clear_cache=True).reopen(mode='r'); # clear pushed elements from cache\n", 904 | "adata.obsm" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": 27, 910 | "id": "1b794d6e-3cf2-4451-9a96-972aec79fc82", 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "adata.clear_cache()" 915 | ] 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "id": "af3d311e-0199-4dcf-b5a5-15b8e446fd08", 920 | "metadata": {}, 921 | "source": [ 922 | " " 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "id": "1b128596-dbb5-4469-a346-bd14cda79eb3", 928 | "metadata": {}, 929 | "source": [ 930 | "Default mode is read-only, and it protects the files from being modified while also allowing for multiple connections to the file:" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": 28, 936 | "id": "8e817c96-ae69-49d7-a574-58481170f011", 937 | "metadata": {}, 938 | "outputs": [ 939 | { 940 | "name": "stdout", 941 | "output_type": "stream", 942 | "text": [ 943 | "Not available for .write(): File is open in read-only mode. Changes can't be pushed. 
Reopen it with .reopen('r+') to enable writing.\n" 944 | ] 945 | } 946 | ], 947 | "source": [ 948 | "try:\n", 949 | " adata.write()\n", 950 | "except OSError as e:\n", 951 | " print(\"Not available for .write():\", e)" 952 | ] 953 | }, 954 | { 955 | "cell_type": "markdown", 956 | "id": "2e68cef8-871f-49be-8829-f59ff9d93f99", 957 | "metadata": {}, 958 | "source": [ 959 | " " 960 | ] 961 | }, 962 | { 963 | "cell_type": "markdown", 964 | "id": "8b5c17b8-98d1-42b6-a008-b3c3b6fbfb79", 965 | "metadata": {}, 966 | "source": [ 967 | "> [!NOTE]\n", 968 | "> Partial writing is currently intended to add new elements to the dataset on disk; it does not allow deleting or modifying existing elements." 969 | ] 970 | }, 971 | { 972 | "cell_type": "markdown", 973 | "id": "e841d95f-3f46-4902-b18f-eb4c7080e58d", 974 | "metadata": {}, 975 | "source": [ 976 | " " 977 | ] 978 | }, 979 | { 980 | "cell_type": "markdown", 981 | "id": "e0c11265-8429-4a34-a552-759b1f07a0bc", 982 | "metadata": { 983 | "tags": [] 984 | }, 985 | "source": [ 986 | "### Views\n", 987 | "\n", 988 | "Views for shadow objects are conceptually similar to [views in AnnData/MuData](https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.is_view.html): they provide a view into an existing object without creating its copy.\n", 989 | "\n", 990 | "As shadow objects inherently operate on the file they are connected to, their views behave slightly differently. Creating a view creates a new connection to the file and returns a new shadow object, which is aware of the part of the data (e.g. which cells) it is supposed to provide a view for." 
991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": 29, 996 | "id": "c3ea6e33-128a-48fd-a421-0c9f5801e47d", 997 | "metadata": {}, 998 | "outputs": [ 999 | { 1000 | "data": { 1001 | "text/plain": [ 1002 | "View of AnnData Shadow object with n_obs × n_vars = 100 × 29 (original 411 × 29)\n", 1003 | " X \n", 1004 | " layers:\tcounts\n", 1005 | " obs:\t_index\n", 1006 | " var:\t_index, feature_types, gene_ids, highly_variable\n", 1007 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1008 | " varm:\tPCs\n", 1009 | " obsp:\tconnectivities, distances\n", 1010 | " uns:\tneighbors, pca, umap" 1011 | ] 1012 | }, 1013 | "execution_count": 29, 1014 | "metadata": {}, 1015 | "output_type": "execute_result" 1016 | } 1017 | ], 1018 | "source": [ 1019 | "head = 100\n", 1020 | "head_view = adata[0:head]\n", 1021 | "head_view" 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "markdown", 1026 | "id": "2f115798-96d2-4660-889d-b3e9a2d154c3", 1027 | "metadata": {}, 1028 | "source": [ 1029 | "Individual modalities of a MuData Shadow View are sliced accordingly:" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "execution_count": 30, 1035 | "id": "13f4b379-e26d-4677-9de3-42b3754af15d", 1036 | "metadata": {}, 1037 | "outputs": [ 1038 | { 1039 | "data": { 1040 | "text/plain": [ 1041 | "(100, 31)" 1042 | ] 1043 | }, 1044 | "execution_count": 30, 1045 | "metadata": {}, 1046 | "output_type": "execute_result" 1047 | } 1048 | ], 1049 | "source": [ 1050 | "head_view.obsm[\"X_pca\"].shape" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "code", 1055 | "execution_count": 31, 1056 | "id": "585fcbc6-9d5f-406f-99e1-6b91117e2bac", 1057 | "metadata": {}, 1058 | "outputs": [ 1059 | { 1060 | "data": { 1061 | "text/plain": [ 1062 | "obsm:\tX_pcaᐁ, X_pca_copy, X_umap" 1063 | ] 1064 | }, 1065 | "execution_count": 31, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "head_view.obsm" 1072 | ] 1073 | }, 1074 | { 1075 | 
"cell_type": "code", 1076 | "execution_count": 32, 1077 | "id": "bfa15c8a-f4a8-4907-939f-5cb80ef50abc", 1078 | "metadata": {}, 1079 | "outputs": [ 1080 | { 1081 | "data": { 1082 | "text/plain": [ 1083 | "View of AnnData Shadow object with n_obs × n_vars = 2 × 3 (original 411 × 29)\n", 1084 | " X \n", 1085 | " layers:\tcounts\n", 1086 | " obs:\t_index\n", 1087 | " var:\t_index, feature_types, gene_ids, highly_variable\n", 1088 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1089 | " varm:\tPCs\n", 1090 | " obsp:\tconnectivities, distances\n", 1091 | " uns:\tneighbors, pca, umap" 1092 | ] 1093 | }, 1094 | "execution_count": 32, 1095 | "metadata": {}, 1096 | "output_type": "execute_result" 1097 | } 1098 | ], 1099 | "source": [ 1100 | "nested_view = head_view[:2,-3:]\n", 1101 | "nested_view" 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "markdown", 1106 | "id": "6e3ce502-40e6-4b40-b78e-cf86e527bf18", 1107 | "metadata": {}, 1108 | "source": [ 1109 | "Getting attributes from views is no different than for shadow objects:" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "execution_count": 33, 1115 | "id": "216d5cd3-5457-4145-952b-61bed2be9f7d", 1116 | "metadata": {}, 1117 | "outputs": [ 1118 | { 1119 | "data": { 1120 | "text/html": [ 1121 | "
\n", 1122 | "\n", 1135 | "\n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | "
CAGCCAGGTCTCGACG-1
TTCTTCCTCTCGGTAA-1
\n", 1150 | "
" 1151 | ], 1152 | "text/plain": [ 1153 | "Empty DataFrame\n", 1154 | "Columns: []\n", 1155 | "Index: [CAGCCAGGTCTCGACG-1, TTCTTCCTCTCGGTAA-1]" 1156 | ] 1157 | }, 1158 | "execution_count": 33, 1159 | "metadata": {}, 1160 | "output_type": "execute_result" 1161 | } 1162 | ], 1163 | "source": [ 1164 | "nested_view.obs" 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "markdown", 1169 | "id": "9dbacf34-247e-4ac9-995b-f39656491973", 1170 | "metadata": {}, 1171 | "source": [ 1172 | "... as they are shadow objects themselves:" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 34, 1178 | "id": "c0921236-cc65-43fc-a9a1-557d4ab0a1c6", 1179 | "metadata": {}, 1180 | "outputs": [ 1181 | { 1182 | "data": { 1183 | "text/plain": [ 1184 | "shadows.anndatashadow.AnnDataShadow" 1185 | ] 1186 | }, 1187 | "execution_count": 34, 1188 | "metadata": {}, 1189 | "output_type": "execute_result" 1190 | } 1191 | ], 1192 | "source": [ 1193 | "type(nested_view)" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": 35, 1199 | "id": "e70179b3-da72-4155-bbf9-b6f9d1fa8d47", 1200 | "metadata": {}, 1201 | "outputs": [], 1202 | "source": [ 1203 | "# Clean up\n", 1204 | "nested_view.close()\n", 1205 | "del nested_view\n", 1206 | "\n", 1207 | "head_view.close()\n", 1208 | "del head_view" 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "markdown", 1213 | "id": "ed55ed1b-1d8e-4250-9352-75f59cd5551a", 1214 | "metadata": {}, 1215 | "source": [ 1216 | " " 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "markdown", 1221 | "id": "ab4a745e-df8c-46f5-9c3d-d2d3678fff5f", 1222 | "metadata": { 1223 | "slideshow": { 1224 | "slide_type": "slide" 1225 | }, 1226 | "tags": [] 1227 | }, 1228 | "source": [ 1229 | "### Per-feature access to datasets on disk\n", 1230 | "\n", 1231 | "This is currently not possible as caching works at the level of individual HDF5 datasets.\n", 1232 | "\n", 1233 | "Views may read only the necessary parts of the arrays to memory however 
this behaviour is currently not universal.\n", 1234 | "\n", 1235 | "E.g.:" 1236 | ] 1237 | }, 1238 | { 1239 | "cell_type": "code", 1240 | "execution_count": 36, 1241 | "id": "ff5c4052-0929-43c3-947f-6de72b78d69e", 1242 | "metadata": {}, 1243 | "outputs": [ 1244 | { 1245 | "data": { 1246 | "text/plain": [ 1247 | "(10, 29)" 1248 | ] 1249 | }, 1250 | "execution_count": 36, 1251 | "metadata": {}, 1252 | "output_type": "execute_result" 1253 | } 1254 | ], 1255 | "source": [ 1256 | "adata_subset = adata[:10,:100]\n", 1257 | "adata_subset.X.shape" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": 37, 1263 | "id": "e410e6e1-34c8-48f5-88b5-a45a0545e342", 1264 | "metadata": {}, 1265 | "outputs": [ 1266 | { 1267 | "data": { 1268 | "text/plain": [ 1269 | "View of AnnData Shadow object with n_obs × n_vars = 10 × 29 (original 411 × 29)\n", 1270 | " X ᐁ \n", 1271 | " layers:\tcounts\n", 1272 | " obs:\t_index\n", 1273 | " var:\t_index, feature_types, gene_ids, highly_variable\n", 1274 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1275 | " varm:\tPCs\n", 1276 | " obsp:\tconnectivities, distances\n", 1277 | " uns:\tneighbors, pca, umap" 1278 | ] 1279 | }, 1280 | "execution_count": 37, 1281 | "metadata": {}, 1282 | "output_type": "execute_result" 1283 | } 1284 | ], 1285 | "source": [ 1286 | "adata_subset" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "code", 1291 | "execution_count": 38, 1292 | "id": "bf2a317a-ca82-4a73-b0ef-07d0cfac2128", 1293 | "metadata": {}, 1294 | "outputs": [], 1295 | "source": [ 1296 | "# Clean up\n", 1297 | "adata.close()\n", 1298 | "adata_subset.close()\n", 1299 | "del adata, adata_subset" 1300 | ] 1301 | }, 1302 | { 1303 | "cell_type": "markdown", 1304 | "id": "bb50af6a-4ee2-4a8f-b022-9b0daa63e81e", 1305 | "metadata": {}, 1306 | "source": [ 1307 | " " 1308 | ] 1309 | }, 1310 | { 1311 | "cell_type": "markdown", 1312 | "id": "fec4c262-5bbf-4393-b082-f208f7997a7a", 1313 | "metadata": { 1314 | "slideshow": { 1315 | "slide_type": 
"slide" 1316 | }, 1317 | "tags": [] 1318 | }, 1319 | "source": [ 1320 | "---\n", 1321 | "\n", 1322 | "In order to return the data to its original state, let's manually remove the items we wrote to the file:" 1323 | ] 1324 | }, 1325 | { 1326 | "cell_type": "code", 1327 | "execution_count": 39, 1328 | "id": "46550ff4-39e1-40e6-80d0-4fd45d99af84", 1329 | "metadata": { 1330 | "slideshow": { 1331 | "slide_type": "fragment" 1332 | }, 1333 | "tags": [] 1334 | }, 1335 | "outputs": [], 1336 | "source": [ 1337 | "import zarr\n", 1338 | "\n", 1339 | "f = zarr.open(file, \"a\")\n", 1340 | "# ^\n", 1341 | "# ____________|\n", 1342 | "# if this works, \n", 1343 | "# no dangling read-only connections!\n", 1344 | "# \n", 1345 | "\n", 1346 | "del f[\"obsm/X_pca_copy\"]\n", 1347 | "f.store.close()" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "markdown", 1352 | "id": "6bc6a57c-39d0-45ad-be01-8cadde33da83", 1353 | "metadata": {}, 1354 | "source": [ 1355 | " " 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "markdown", 1360 | "id": "752bd981-1cbd-43ec-b707-9308afb7e55f", 1361 | "metadata": {}, 1362 | "source": [ 1363 | " " 1364 | ] 1365 | } 1366 | ], 1367 | "metadata": { 1368 | "kernelspec": { 1369 | "display_name": "Python 3 (ipykernel)", 1370 | "language": "python", 1371 | "name": "python3" 1372 | }, 1373 | "language_info": { 1374 | "codemirror_mode": { 1375 | "name": "ipython", 1376 | "version": 3 1377 | }, 1378 | "file_extension": ".py", 1379 | "mimetype": "text/x-python", 1380 | "name": "python", 1381 | "nbconvert_exporter": "python", 1382 | "pygments_lexer": "ipython3", 1383 | "version": "3.10.11" 1384 | } 1385 | }, 1386 | "nbformat": 4, 1387 | "nbformat_minor": 5 1388 | } 1389 | -------------------------------------------------------------------------------- /docs/examples/shadows-features.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"b6eae7bd-1091-480f-8c95-551eefe5c53c", 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "slide" 9 | }, 10 | "tags": [] 11 | }, 12 | "source": [ 13 | "# Shadows features" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "id": "b17e6265-4c91-4d30-a232-20e6a627c07d", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "%load_ext autoreload\n", 24 | "%autoreload 2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "4aa723fb-6a8d-4d43-913c-a31f2316b02f", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "os.chdir(\"../../\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "id": "f1c3418a-3a90-41b0-baa6-c6ad340dc75f", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from pathlib import Path\n", 46 | "data = Path(\"data/\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "b9e3bb66-3928-45f4-ba98-fded629de018", 52 | "metadata": {}, 53 | "source": [ 54 | " " 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "934b8d69-b812-422f-b718-080bb8508348", 60 | "metadata": { 61 | "slideshow": { 62 | "slide_type": "slide" 63 | }, 64 | "tags": [] 65 | }, 66 | "source": [ 67 | "## Shadow objects and their features\n", 68 | "\n", 69 | "While shadow objects provide a convenient read-only drop-in replacement for AnnData/MuData objects when needed, they also have additional features that can help users make the most of *shadows*." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "65462d07-01b0-4395-8891-eda01e472f38", 75 | "metadata": {}, 76 | "source": [ 77 | " " 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "4a38075c-8da2-4193-af1a-c52e18176f92", 83 | "metadata": { 84 | "slideshow": { 85 | "slide_type": "fragment" 86 | }, 87 | "tags": [] 88 | }, 89 | "source": [ 90 | "Import classes for these shadow objects:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "079454ed-10dc-47ef-9de2-ef70f95dbed6", 97 | "metadata": { 98 | "slideshow": { 99 | "slide_type": "fragment" 100 | }, 101 | "tags": [] 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "from shadows import AnnDataShadow, MuDataShadow" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "564f7b2b-063d-4f0e-8333-c178565ee2d2", 111 | "metadata": {}, 112 | "source": [ 113 | " " 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "6b819452-470f-47b7-8fa0-0c8304fd557c", 119 | "metadata": { 120 | "slideshow": { 121 | "slide_type": "fragment" 122 | }, 123 | "tags": [] 124 | }, 125 | "source": [ 126 | "Initialise a multimodal shadow object:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "id": "3ff358c0-2c77-460a-97a9-398f615a0e17", 133 | "metadata": { 134 | "slideshow": { 135 | "slide_type": "fragment" 136 | }, 137 | "tags": [] 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "file = data / \"pbmc5k_citeseq/pbmc5k_citeseq_processed.h5mu\"\n", 142 | "mdata = MuDataShadow(file)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "1747c671-ffc2-4d4d-8a04-7dc44432b2fb", 148 | "metadata": {}, 149 | "source": [ 150 | " " 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "b8ae6d73-9a74-48ed-9d41-7e92bfee8f71", 156 | "metadata": { 157 | "slideshow": { 158 | "slide_type": "slide" 159 | }, 160 | "tags": [] 161 | }, 162 | "source": [ 163 | "### File\n", 164 | "\n", 165 | "The 
file connection that the shadow is using can be accessed via the `.file` attribute:" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "id": "33c47ede-e566-43ac-8596-470263d21b3a", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "" 178 | ] 179 | }, 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "mdata.file" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "a43127df-c330-4104-bbf6-399c7392c373", 192 | "metadata": {}, 193 | "source": [ 194 | "The name of the file can then be accessed via `mdata.file.filename`:" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 7, 200 | "id": "a7d549f2-ec47-4744-a744-e2f7884638d7", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "'data/pbmc5k_citeseq/pbmc5k_citeseq_processed.h5mu'" 207 | ] 208 | }, 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "mdata.file.filename" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "0574136f-7aa4-4a1e-9312-eee5fc9c6744", 221 | "metadata": { 222 | "slideshow": { 223 | "slide_type": "subslide" 224 | }, 225 | "tags": [] 226 | }, 227 | "source": [ 228 | "The connection stays open until `mdata.close()` is called" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 8, 234 | "id": "1c1f47db-f933-4999-8fae-cb088b56dab5", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "mdata.close()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "a87e0e96-86c2-4623-b239-892e92b04a5a", 244 | "metadata": { 245 | "slideshow": { 246 | "slide_type": "fragment" 247 | }, 248 | "tags": [] 249 | }, 250 | "source": [ 251 | "... or until the file has to be re-opened for modification (see below)." 
252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "id": "5a064df4-b533-4124-a85a-f7b20fcc1091", 257 | "metadata": {}, 258 | "source": [ 259 | " " 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "9beb85a9-e226-4b9a-949b-2351432558f7", 265 | "metadata": { 266 | "slideshow": { 267 | "slide_type": "slide" 268 | }, 269 | "tags": [] 270 | }, 271 | "source": [ 272 | "### Permissions\n", 273 | "\n", 274 | "We can open HDF5 files in different modes including purely read-only (`'r'`) and read/write (`'r+'`). The mode can be provided to the constructor:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "id": "9f297beb-97b5-46ad-97b9-2dedc5c40b53", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "'r'" 287 | ] 288 | }, 289 | "execution_count": 9, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "mdata = MuDataShadow(file, mode=\"r\")\n", 296 | "mdata.file.mode" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "fc9da2a5-402f-4fe8-83a2-0a5f06a84d7c", 302 | "metadata": {}, 303 | "source": [ 304 | "Let's add some data to the in-memory shadow object:" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 10, 310 | "id": "21f291bd-7c5d-4ef3-a034-c0030dabdb60", 311 | "metadata": { 312 | "slideshow": { 313 | "slide_type": "fragment" 314 | }, 315 | "tags": [] 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "mdata[\"rna\"].obsm[\"X_pca_copy\"] = mdata[\"rna\"].obsm[\"X_pca\"].copy()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "id": "b03108f5-0e8a-4646-af12-ef5fc934885b", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "subslide" 328 | }, 329 | "tags": [] 330 | }, 331 | "source": [ 332 | "We can also conveniently close and reopen the connection for a given in-memory shadow object:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | 
"execution_count": 11, 338 | "id": "e8ddb228-74b4-4f8e-8cdc-c84479f38d2d", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "'r+'" 345 | ] 346 | }, 347 | "execution_count": 11, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "mdata.reopen(mode=\"r+\")\n", 354 | "mdata.file.mode" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "48157734-adc0-4e7d-8157-64e1201b6fba", 360 | "metadata": {}, 361 | "source": [ 362 | "This way all the newly added elements are still available in memory:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 12, 368 | "id": "043428b5-dc58-4d0c-b653-e1d8451b39f9", 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "obsm:\tX_pcaᐁ, X_umap, X_pca_copy▲" 375 | ] 376 | }, 377 | "execution_count": 12, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "mdata[\"rna\"].obsm" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 13, 389 | "id": "50aba055-06e2-490d-a1a6-3307ef7ac6d0", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "fragment" 393 | }, 394 | "tags": [] 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "# Clean up\n", 399 | "mdata.close()\n", 400 | "del mdata" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "id": "991ccc6a-f182-4689-802d-a9ae70a490e4", 406 | "metadata": {}, 407 | "source": [ 408 | " " 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "id": "2dbc52ad-6010-416f-810b-c60e5546ba7b", 414 | "metadata": { 415 | "slideshow": { 416 | "slide_type": "slide" 417 | }, 418 | "tags": [] 419 | }, 420 | "source": [ 421 | "### Individual modality access\n", 422 | "\n", 423 | "Individual modalities stored in the .h5mu files can be accessed as part of the `MuDataShadow` object:" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | 
"execution_count": 14, 429 | "id": "d5ea1511-6f1b-4c51-9ec7-14365dc8d391", 430 | "metadata": { 431 | "slideshow": { 432 | "slide_type": "fragment" 433 | }, 434 | "tags": [] 435 | }, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "AnnData Shadow object with n_obs × n_vars = 3891 × 17806\n", 441 | " X \n", 442 | " raw:\tX, var, varm\n", 443 | " obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n", 444 | " var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n", 445 | " obsm:\tX_pca, X_umap\n", 446 | " varm:\tPCs\n", 447 | " obsp:\tconnectivities, distances\n", 448 | " uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap" 449 | ] 450 | }, 451 | "execution_count": 14, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "mdata = MuDataShadow(file, mode=\"r\")\n", 458 | "mdata[\"rna\"]" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "id": "60d08ba8-c7c4-4d13-a5fe-9f39c56dd86a", 464 | "metadata": { 465 | "slideshow": { 466 | "slide_type": "subslide" 467 | }, 468 | "tags": [] 469 | }, 470 | "source": [ 471 | "Moreover, one can also create a direct connection to a specific modality:" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 15, 477 | "id": "a853493d-5432-438f-bd8f-837cb63d151a", 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "AnnData Shadow object with n_obs × n_vars = 3891 × 17806\n", 484 | " X \n", 485 | " raw:\tX, var, varm\n", 486 | " obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n", 487 | " var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, 
std, total_counts\n", 488 | " obsm:\tX_pca, X_umap\n", 489 | " varm:\tPCs\n", 490 | " obsp:\tconnectivities, distances\n", 491 | " uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap" 492 | ] 493 | }, 494 | "execution_count": 15, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "mdata.close()\n", 501 | "del mdata\n", 502 | "\n", 503 | "adata = AnnDataShadow(file / \"mod/rna\")\n", 504 | "adata" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 16, 510 | "id": "946d03a9-d0d1-4ebc-ae29-92d795f08073", 511 | "metadata": { 512 | "slideshow": { 513 | "slide_type": "fragment" 514 | }, 515 | "tags": [] 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "# Clean up\n", 520 | "adata.close()\n", 521 | "del adata" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "id": "14b8ad11-adad-4ea8-9146-3dd7cd9bd415", 527 | "metadata": {}, 528 | "source": [ 529 | " " 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "id": "d3ae2a84-34fc-48b9-926e-a5d5f57e4e73", 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "slide" 538 | }, 539 | "tags": [] 540 | }, 541 | "source": [ 542 | "### Class identity\n", 543 | "\n", 544 | "Many tools in the ecosystem including scanpy frequently check if the input object is an AnnData. 
For instance, [in `sc.pp.highly_variable_genes`](https://github.com/scverse/scanpy/blob/master/scanpy/preprocessing/_highly_variable_genes.py) it reads:\n", 545 | "\n", 546 | "```py\n", 547 | "if not isinstance(adata, AnnData):\n", 548 | " raise ValueError(\n", 549 | " '`pp.highly_variable_genes` expects an `AnnData` argument, '\n", 550 | " 'pass `inplace=False` if you want to return a `pd.DataFrame`.'\n", 551 | " )\n", 552 | "```\n", 553 | "\n", 554 | "In order for shadow objects to be accepted by such functions, they mock their class identity:" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 17, 560 | "id": "f10b98ff-920f-4d46-924f-1cf3074236db", 561 | "metadata": { 562 | "slideshow": { 563 | "slide_type": "subslide" 564 | }, 565 | "tags": [] 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "mdata = MuDataShadow(file, mode=\"r\")\n", 570 | "\n", 571 | "from mudata import MuData\n", 572 | "assert isinstance(mdata, MuData), \"mdata is not a valid MuData object\"" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 18, 578 | "id": "7796c156-b84e-46f9-90e4-fe18ad6b91d8", 579 | "metadata": { 580 | "slideshow": { 581 | "slide_type": "fragment" 582 | }, 583 | "tags": [] 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "from anndata import AnnData\n", 588 | "assert isinstance(mdata[\"rna\"], AnnData), \"mdata['rna'] is not a valid AnnData object\"" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "id": "f8e2d4a9-eba2-45c0-88f6-35f69e7d0249", 594 | "metadata": { 595 | "slideshow": { 596 | "slide_type": "subslide" 597 | }, 598 | "tags": [] 599 | }, 600 | "source": [ 601 | "Checking for shadow identity still works:" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 19, 607 | "id": "51cd4264-e9d0-4e2c-a536-835a0d3a699d", 608 | "metadata": {}, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/plain": [ 613 | "True" 614 | ] 615 | }, 616 | "execution_count": 19, 617 | 
"metadata": {}, 618 | "output_type": "execute_result" 619 | } 620 | ], 621 | "source": [ 622 | "isinstance(mdata, MuDataShadow)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 20, 628 | "id": "efadd4ba-219c-4c84-a1eb-36baf135c82d", 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "data": { 633 | "text/plain": [ 634 | "True" 635 | ] 636 | }, 637 | "execution_count": 20, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "isinstance(mdata[\"rna\"], AnnDataShadow)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 21, 649 | "id": "a32515de-7866-4229-a639-0818a0dbea3b", 650 | "metadata": { 651 | "slideshow": { 652 | "slide_type": "fragment" 653 | }, 654 | "tags": [] 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "mdata.close()" 659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "id": "8d4e683f-0a0b-426c-8cf7-5f5529a844d2", 664 | "metadata": {}, 665 | "source": [ 666 | " " 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "id": "c29f18b0-717b-4821-b0f8-e81ca94426de", 672 | "metadata": {}, 673 | "source": [ 674 | "### Backends\n", 675 | "\n", 676 | "AnnData/MuData are based on a NumPy/Pandas stack. This is the default for the shadow objects in order to provide compatibility with AnnData/MuData objects.\n", 677 | "\n", 678 | "However the nature of shadow files also simplifies loading individual matrices or tables with alternative backends, e.g. [JAX](https://jax.readthedocs.io/en/latest/_autosummary/jax.numpy.array.html#jax.numpy.array) (`Array`), [PyTorch](https://pytorch.org/docs/stable/tensors.html) (`Tensor`) or [polars](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html) (`DataFrame`)." 
679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 22, 684 | "id": "734d4e9e-3936-4911-96fe-1bed3de167eb", 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "mdata = MuDataShadow(file, array_backend=\"jax\", table_backend=\"polars\")" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 23, 694 | "id": "3d909ef6-92b7-40f4-b50e-641993469791", 695 | "metadata": {}, 696 | "outputs": [ 697 | { 698 | "name": "stdout", 699 | "output_type": "stream", 700 | "text": [ 701 | "\n" 702 | ] 703 | }, 704 | { 705 | "data": { 706 | "text/html": [ 707 | "
\n", 708 | "\n", 737 | "\n", 738 | "shape: (5, 7)\n", 739 | "\n", 740 | "\n", 741 | "\n", 744 | "\n", 747 | "\n", 750 | "\n", 753 | "\n", 756 | "\n", 759 | "\n", 762 | "\n", 763 | "\n", 764 | "\n", 767 | "\n", 770 | "\n", 773 | "\n", 776 | "\n", 779 | "\n", 782 | "\n", 785 | "\n", 786 | "\n", 787 | "\n", 788 | "\n", 789 | "\n", 792 | "\n", 795 | "\n", 798 | "\n", 801 | "\n", 804 | "\n", 807 | "\n", 810 | "\n", 811 | "\n", 812 | "\n", 815 | "\n", 818 | "\n", 821 | "\n", 824 | "\n", 827 | "\n", 830 | "\n", 833 | "\n", 834 | "\n", 835 | "\n", 838 | "\n", 841 | "\n", 844 | "\n", 847 | "\n", 850 | "\n", 853 | "\n", 856 | "\n", 857 | "\n", 858 | "\n", 861 | "\n", 864 | "\n", 867 | "\n", 870 | "\n", 873 | "\n", 876 | "\n", 879 | "\n", 880 | "\n", 881 | "\n", 884 | "\n", 887 | "\n", 890 | "\n", 893 | "\n", 896 | "\n", 899 | "\n", 902 | "\n", 903 | "\n", 904 | "
\n", 742 | "_index\n", 743 | "\n", 745 | "celltype\n", 746 | "\n", 748 | "leiden\n", 749 | "\n", 751 | "n_genes_by_counts\n", 752 | "\n", 754 | "pct_counts_mt\n", 755 | "\n", 757 | "total_counts\n", 758 | "\n", 760 | "total_counts_mt\n", 761 | "
\n", 765 | "object\n", 766 | "\n", 768 | "cat\n", 769 | "\n", 771 | "cat\n", 772 | "\n", 774 | "i32\n", 775 | "\n", 777 | "f32\n", 778 | "\n", 780 | "f32\n", 781 | "\n", 783 | "f32\n", 784 | "
\n", 790 | "AAACCCAAGAGACAAG-1\n", 791 | "\n", 793 | ""intermediate m...\n", 794 | "\n", 796 | ""3"\n", 797 | "\n", 799 | "2363\n", 800 | "\n", 802 | "6.332204\n", 803 | "\n", 805 | "7375.0\n", 806 | "\n", 808 | "467.0\n", 809 | "
\n", 813 | "AAACCCAAGGCCTAGA-1\n", 814 | "\n", 816 | ""CD4+ naïve T"\n", 817 | "\n", 819 | ""0"\n", 820 | "\n", 822 | "1259\n", 823 | "\n", 825 | "9.093319\n", 826 | "\n", 828 | "3772.0\n", 829 | "\n", 831 | "343.0\n", 832 | "
\n", 836 | "AAACCCAGTCGTGCCA-1\n", 837 | "\n", 839 | ""CD4+ memory T"\n", 840 | "\n", 842 | ""2"\n", 843 | "\n", 845 | "1578\n", 846 | "\n", 848 | "13.178295\n", 849 | "\n", 851 | "4902.0\n", 852 | "\n", 854 | "646.0\n", 855 | "
\n", 859 | "AAACCCATCGTGCATA-1\n", 860 | "\n", 862 | ""CD4+ memory T"\n", 863 | "\n", 865 | ""2"\n", 866 | "\n", 868 | "1908\n", 869 | "\n", 871 | "6.354415\n", 872 | "\n", 874 | "6704.0\n", 875 | "\n", 877 | "426.0\n", 878 | "
\n", 882 | "AAACGAAAGACAAGCC-1\n", 883 | "\n", 885 | ""CD14 mono"\n", 886 | "\n", 888 | ""1"\n", 889 | "\n", 891 | "1589\n", 892 | "\n", 894 | "9.307693\n", 895 | "\n", 897 | "3900.0\n", 898 | "\n", 900 | "363.0\n", 901 | "
\n", 905 | "
" 906 | ], 907 | "text/plain": [ 908 | "shape: (5, 7)\n", 909 | "┌──────────────┬──────────────┬────────┬──────────────┬──────────────┬──────────────┬──────────────┐\n", 910 | "│ _index ┆ celltype ┆ leiden ┆ n_genes_by_c ┆ pct_counts_m ┆ total_counts ┆ total_counts │\n", 911 | "│ --- ┆ --- ┆ --- ┆ ounts ┆ t ┆ --- ┆ _mt │\n", 912 | "│ object ┆ cat ┆ cat ┆ --- ┆ --- ┆ f32 ┆ --- │\n", 913 | "│ ┆ ┆ ┆ i32 ┆ f32 ┆ ┆ f32 │\n", 914 | "╞══════════════╪══════════════╪════════╪══════════════╪══════════════╪══════════════╪══════════════╡\n", 915 | "│ AAACCCAAGAGA ┆ intermediate ┆ 3 ┆ 2363 ┆ 6.332204 ┆ 7375.0 ┆ 467.0 │\n", 916 | "│ CAAG-1 ┆ mono ┆ ┆ ┆ ┆ ┆ │\n", 917 | "│ AAACCCAAGGCC ┆ CD4+ naïve T ┆ 0 ┆ 1259 ┆ 9.093319 ┆ 3772.0 ┆ 343.0 │\n", 918 | "│ TAGA-1 ┆ ┆ ┆ ┆ ┆ ┆ │\n", 919 | "│ AAACCCAGTCGT ┆ CD4+ memory ┆ 2 ┆ 1578 ┆ 13.178295 ┆ 4902.0 ┆ 646.0 │\n", 920 | "│ GCCA-1 ┆ T ┆ ┆ ┆ ┆ ┆ │\n", 921 | "│ AAACCCATCGTG ┆ CD4+ memory ┆ 2 ┆ 1908 ┆ 6.354415 ┆ 6704.0 ┆ 426.0 │\n", 922 | "│ CATA-1 ┆ T ┆ ┆ ┆ ┆ ┆ │\n", 923 | "│ AAACGAAAGACA ┆ CD14 mono ┆ 1 ┆ 1589 ┆ 9.307693 ┆ 3900.0 ┆ 363.0 │\n", 924 | "│ AGCC-1 ┆ ┆ ┆ ┆ ┆ ┆ │\n", 925 | "└──────────────┴──────────────┴────────┴──────────────┴──────────────┴──────────────┴──────────────┘" 926 | ] 927 | }, 928 | "execution_count": 23, 929 | "metadata": {}, 930 | "output_type": "execute_result" 931 | } 932 | ], 933 | "source": [ 934 | "obs = mdata[\"rna\"].obs\n", 935 | "print(type(obs))\n", 936 | "obs.head()" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": 24, 942 | "id": "32286100-13e4-49af-8194-f53693c9b7f0", 943 | "metadata": {}, 944 | "outputs": [ 945 | { 946 | "name": "stdout", 947 | "output_type": "stream", 948 | "text": [ 949 | "\n" 950 | ] 951 | }, 952 | { 953 | "data": { 954 | "text/plain": [ 955 | "DeviceArray([[ 20.551052 , 0.36840764, -1.6193684 , ...,\n", 956 | " 0.09656975, -0.90912175, -0.77955467],\n", 957 | " [ -9.47144 , -5.5212517 , -5.107428 , ...,\n", 958 | " 0.64674896, -0.892091 , 1.7873902 
],\n", 959 | " [ -9.913012 , 2.766899 , -2.0684972 , ...,\n", 960 | " -0.6454743 , 1.615869 , -0.63476324],\n", 961 | " ...,\n", 962 | " [ -8.727723 , 7.9196725 , 1.3326805 , ...,\n", 963 | " 1.4592032 , 0.91210324, 1.3184382 ],\n", 964 | " [-10.792531 , 3.2086673 , -2.0437238 , ...,\n", 965 | " 1.7311838 , -1.840564 , 1.3253008 ],\n", 966 | " [ 20.642431 , 0.49294943, -1.6694897 , ...,\n", 967 | " -0.51208967, 0.60652566, -0.75145006]], dtype=float32)" 968 | ] 969 | }, 970 | "execution_count": 24, 971 | "metadata": {}, 972 | "output_type": "execute_result" 973 | } 974 | ], 975 | "source": [ 976 | "rna_pca = mdata[\"rna\"].obsm[\"X_pca\"]\n", 977 | "print(type(rna_pca))\n", 978 | "rna_pca" 979 | ] 980 | }, 981 | { 982 | "cell_type": "markdown", 983 | "id": "6cdad910-a34c-49d2-bc03-87bfde9417c9", 984 | "metadata": {}, 985 | "source": [ 986 | "When alternative backends are being used, not all of the AnnData/MuData features can be supported, and many external tools might not work as expected as they anticipate NumPy/Pandas objects instead." 
987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": 25, 992 | "id": "b06a9071-0443-41e6-ac81-e3f0ce2653e9", 993 | "metadata": {}, 994 | "outputs": [], 995 | "source": [ 996 | "# Clean up\n", 997 | "mdata.clear_cache()\n", 998 | "mdata.close()\n", 999 | "del mdata, rna_pca, obs" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "markdown", 1004 | "id": "6c474c9e-dfea-406c-ace6-461e8d5438a4", 1005 | "metadata": {}, 1006 | "source": [ 1007 | " " 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "markdown", 1012 | "id": "16f9b372-a089-4aed-b91e-b368a2ddc13e", 1013 | "metadata": { 1014 | "slideshow": { 1015 | "slide_type": "slide" 1016 | }, 1017 | "tags": [] 1018 | }, 1019 | "source": [ 1020 | "### Partial writing\n", 1021 | "\n", 1022 | "> [!NOTE]\n", 1023 | "> This feature is experimental.\n", 1024 | "\n", 1025 | "While the main use of the shadows is to provide a low-memory read-only solution to scverse datasets, ability to add new embeddings or other items to the file can greatly extend its usage patterns." 
1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 9, 1031 | "id": "02245bc0-cc92-4fe7-b665-a4e2f424b353", 1032 | "metadata": { 1033 | "slideshow": { 1034 | "slide_type": "fragment" 1035 | }, 1036 | "tags": [] 1037 | }, 1038 | "outputs": [], 1039 | "source": [ 1040 | "mdata = MuDataShadow(file, mode=\"r\")" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "markdown", 1045 | "id": "c7324f1c-c4a4-4561-9680-0ac5caacc79f", 1046 | "metadata": {}, 1047 | "source": [ 1048 | "Add a new embedding to the in-memory object:" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": 10, 1054 | "id": "eb6f076f-0b26-428b-a824-a82b3d648c00", 1055 | "metadata": { 1056 | "slideshow": { 1057 | "slide_type": "fragment" 1058 | }, 1059 | "tags": [] 1060 | }, 1061 | "outputs": [ 1062 | { 1063 | "data": { 1064 | "text/plain": [ 1065 | "obsm:\tX_pcaᐁ, X_pca_copyᐁ, X_umap" 1066 | ] 1067 | }, 1068 | "execution_count": 10, 1069 | "metadata": {}, 1070 | "output_type": "execute_result" 1071 | } 1072 | ], 1073 | "source": [ 1074 | "mdata[\"rna\"].obsm[\"X_pca_copy\"] = mdata[\"rna\"].obsm[\"X_pca\"].copy()\n", 1075 | "mdata[\"rna\"].obsm" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "markdown", 1080 | "id": "0a7a6374-cb13-4f3a-8f5b-e0c4b4f89363", 1081 | "metadata": { 1082 | "slideshow": { 1083 | "slide_type": "subslide" 1084 | }, 1085 | "tags": [] 1086 | }, 1087 | "source": [ 1088 | "For this, a family of methods is useful, including `.reopen()` and `.write()`. The `.write()` method will only work if the connection is not read-only, e.g. 
`'r+'`; however, it is possible to reopen the file in another mode.\n", 1089 | "\n", 1090 | "Internally, `.write()` pushes (`._push_changes()`) the in-memory changes (marked with ▲ in the object representation above) to the file and provides meaningful error messages when the file is not open for writing.\n", 1091 | "\n", 1092 | "This separation of concerns makes it transparent when the data is modified, and this workflow is recommended when only a small amount of data is added to the file. As the methods return the shadow itself, it is possible to chain them:" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": 11, 1098 | "id": "bcfa2982-4bf6-42eb-a604-d17d6496598b", 1099 | "metadata": { 1100 | "slideshow": { 1101 | "slide_type": "fragment" 1102 | }, 1103 | "tags": [] 1104 | }, 1105 | "outputs": [ 1106 | { 1107 | "data": { 1108 | "text/plain": [ 1109 | "obsm:\tX_pcaᐁ, X_pca_copyᐁ, X_umap" 1110 | ] 1111 | }, 1112 | "execution_count": 11, 1113 | "metadata": {}, 1114 | "output_type": "execute_result" 1115 | } 1116 | ], 1117 | "source": [ 1118 | "mdata.reopen(mode='r+').write(clear_cache=True).reopen(mode='r'); # clear pushed elements from cache\n", 1119 | "mdata[\"rna\"].obsm" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": 12, 1125 | "id": "b03d8f00-6a61-44ec-aa69-fbd01b43c886", 1126 | "metadata": {}, 1127 | "outputs": [ 1128 | { 1129 | "data": { 1130 | "text/plain": [ 1131 | "'r'" 1132 | ] 1133 | }, 1134 | "execution_count": 12, 1135 | "metadata": {}, 1136 | "output_type": "execute_result" 1137 | } 1138 | ], 1139 | "source": [ 1140 | "mdata.file.mode" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": 13, 1146 | "id": "1b794d6e-3cf2-4451-9a96-972aec79fc82", 1147 | "metadata": {}, 1148 | "outputs": [], 1149 | "source": [ 1150 | "mdata.clear_cache()" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "markdown", 1155 | "id": "af3d311e-0199-4dcf-b5a5-15b8e446fd08", 1156 | "metadata": 
{}, 1157 | "source": [ 1158 | " " 1159 | ] 1160 | }, 1161 | { 1162 | "cell_type": "markdown", 1163 | "id": "1b128596-dbb5-4469-a346-bd14cda79eb3", 1164 | "metadata": {}, 1165 | "source": [ 1166 | "Default mode is read-only, and it protects the files from being modified while also allowing for multiple connections to the file:" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": 17, 1172 | "id": "8e817c96-ae69-49d7-a574-58481170f011", 1173 | "metadata": {}, 1174 | "outputs": [ 1175 | { 1176 | "name": "stdout", 1177 | "output_type": "stream", 1178 | "text": [ 1179 | "Not available for .write(): File is open in read-only mode. Changes can't be pushed. Reopen it with .reopen('r+') to enable writing.\n" 1180 | ] 1181 | } 1182 | ], 1183 | "source": [ 1184 | "try:\n", 1185 | " mdata.write()\n", 1186 | "except OSError as e:\n", 1187 | " print(\"Not available for .write():\", e)" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "markdown", 1192 | "id": "2e68cef8-871f-49be-8829-f59ff9d93f99", 1193 | "metadata": {}, 1194 | "source": [ 1195 | " " 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "markdown", 1200 | "id": "8b5c17b8-98d1-42b6-a008-b3c3b6fbfb79", 1201 | "metadata": {}, 1202 | "source": [ 1203 | "> [!NOTE]\n", 1204 | "> Partial writing is currently intended to add new elements to the dataset on disk (e.g. a new embedding to .obsm) rather than to modify the dataset and delete or alter existing elements." 
1205 | ] 1206 | }, 1207 | { 1208 | "cell_type": "markdown", 1209 | "id": "e841d95f-3f46-4902-b18f-eb4c7080e58d", 1210 | "metadata": {}, 1211 | "source": [ 1212 | " " 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "markdown", 1217 | "id": "e0c11265-8429-4a34-a552-759b1f07a0bc", 1218 | "metadata": { 1219 | "tags": [] 1220 | }, 1221 | "source": [ 1222 | "### Views\n", 1223 | "\n", 1224 | "Views for shadow objects are conceptually similar to [views in AnnData/MuData](https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.is_view.html): they provide a view into an existing object without creating its copy.\n", 1225 | "\n", 1226 | "As shadow objects inherently operate on the file they are connected to, their views behave slightly differently. Creating a view creates a new connection to the file and returns a new shadow object, which is aware of the part of the data (e.g. which cells) it is supposed to provide a view for." 1227 | ] 1228 | }, 1229 | { 1230 | "cell_type": "code", 1231 | "execution_count": 18, 1232 | "id": "c3ea6e33-128a-48fd-a421-0c9f5801e47d", 1233 | "metadata": {}, 1234 | "outputs": [ 1235 | { 1236 | "data": { 1237 | "text/plain": [ 1238 | "View of MuData Shadow object with n_obs × n_vars = 612 × 17838 (original 3891 × 17838)\n", 1239 | " obs:\t_index, leiden, leiden_wnn, louvain\n", 1240 | " var:\t_index, feature_types, gene_ids, highly_variable\n", 1241 | " obsm:\tX_mofa, X_mofa_umap, X_umap, X_wnn_umap, prot, rna\n", 1242 | " varm:\tLFs, prot, rna\n", 1243 | " obsp:\tconnectivities, distances, wnn_connectivities, wnn_distances\n", 1244 | " uns:\tleiden, leiden_wnn_colors, louvain, neighbors, rna:celltype_colors, umap, wnn\n", 1245 | " obsmap:\tprot, rna\n", 1246 | " varmap:\tprot, rna\n", 1247 | " mod:\t2 modalities\n", 1248 | " prot: 612 x 32\n", 1249 | " X \n", 1250 | " layers:\tcounts\n", 1251 | " obs:\t_index\n", 1252 | " var:\t_index, feature_types, gene_ids, highly_variable\n", 1253 | " obsm:\tX_pca, X_umap\n", 1254 | " 
varm:\tPCs\n", 1255 | " obsp:\tconnectivities, distances\n", 1256 | " uns:\tneighbors, pca, umap\n", 1257 | " rna: 612 x 17806\n", 1258 | " X \n", 1259 | " raw:\tX, var, varm\n", 1260 | " obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n", 1261 | " var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n", 1262 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1263 | " varm:\tPCs\n", 1264 | " obsp:\tconnectivities, distances\n", 1265 | " uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap" 1266 | ] 1267 | }, 1268 | "execution_count": 18, 1269 | "metadata": {}, 1270 | "output_type": "execute_result" 1271 | } 1272 | ], 1273 | "source": [ 1274 | "monocytes = mdata['rna'].obs['celltype'].values == \"CD14 mono\"\n", 1275 | "monocytes_view = mdata[monocytes]\n", 1276 | "monocytes_view" 1277 | ] 1278 | }, 1279 | { 1280 | "cell_type": "markdown", 1281 | "id": "2f115798-96d2-4660-889d-b3e9a2d154c3", 1282 | "metadata": {}, 1283 | "source": [ 1284 | "Individual modalities of a MuData Shadow View are sliced accordingly:" 1285 | ] 1286 | }, 1287 | { 1288 | "cell_type": "code", 1289 | "execution_count": 19, 1290 | "id": "13f4b379-e26d-4677-9de3-42b3754af15d", 1291 | "metadata": {}, 1292 | "outputs": [ 1293 | { 1294 | "data": { 1295 | "text/plain": [ 1296 | "(612, 50)" 1297 | ] 1298 | }, 1299 | "execution_count": 19, 1300 | "metadata": {}, 1301 | "output_type": "execute_result" 1302 | } 1303 | ], 1304 | "source": [ 1305 | "monocytes_view['rna'].obsm[\"X_pca\"].shape" 1306 | ] 1307 | }, 1308 | { 1309 | "cell_type": "code", 1310 | "execution_count": 20, 1311 | "id": "585fcbc6-9d5f-406f-99e1-6b91117e2bac", 1312 | "metadata": {}, 1313 | "outputs": [ 1314 | { 1315 | "data": { 1316 | "text/plain": [ 1317 | "obsm:\tX_pcaᐁ, X_pca_copy, X_umap" 1318 | ] 1319 | }, 1320 | "execution_count": 20, 
1321 | "metadata": {}, 1322 | "output_type": "execute_result" 1323 | } 1324 | ], 1325 | "source": [ 1326 | "monocytes_view['rna'].obsm" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "markdown", 1331 | "id": "8fbdbb1f-9e35-44aa-aad8-b1f67f827fbd", 1332 | "metadata": {}, 1333 | "source": [ 1334 | "Cache is specific to each view:" 1335 | ] 1336 | }, 1337 | { 1338 | "cell_type": "code", 1339 | "execution_count": 21, 1340 | "id": "d68cc6ea-de8d-4801-9667-4fa059609d85", 1341 | "metadata": {}, 1342 | "outputs": [ 1343 | { 1344 | "data": { 1345 | "text/plain": [ 1346 | "obsm:\tX_pca, X_pca_copy, X_umap" 1347 | ] 1348 | }, 1349 | "execution_count": 21, 1350 | "metadata": {}, 1351 | "output_type": "execute_result" 1352 | } 1353 | ], 1354 | "source": [ 1355 | "mdata['rna'].obsm # X_pca is not cached" 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "markdown", 1360 | "id": "e511214b-52a4-4f63-9275-b267b779ecc9", 1361 | "metadata": {}, 1362 | "source": [ 1363 | "Moreover, these semantics make it possible to create views of views of views..." 
1364 | ] 1365 | }, 1366 | { 1367 | "cell_type": "code", 1368 | "execution_count": 22, 1369 | "id": "229da4ce-df96-45b6-a6a4-4b44ee6749f5", 1370 | "metadata": {}, 1371 | "outputs": [], 1372 | "source": [ 1373 | "adata = AnnDataShadow(file / \"mod/rna\")" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": 23, 1379 | "id": "30cbefc7-1e59-447c-8413-de8ef34be30b", 1380 | "metadata": {}, 1381 | "outputs": [ 1382 | { 1383 | "data": { 1384 | "text/plain": [ 1385 | "View of AnnData Shadow object with n_obs × n_vars = 7 × 30 (original 3891 × 17806)\n", 1386 | " X \n", 1387 | " raw:\tX, var, varm\n", 1388 | " obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n", 1389 | " var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n", 1390 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1391 | " varm:\tPCs\n", 1392 | " obsp:\tconnectivities, distances\n", 1393 | " uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap" 1394 | ] 1395 | }, 1396 | "execution_count": 23, 1397 | "metadata": {}, 1398 | "output_type": "execute_result" 1399 | } 1400 | ], 1401 | "source": [ 1402 | "view = adata[3:10,:30]\n", 1403 | "view" 1404 | ] 1405 | }, 1406 | { 1407 | "cell_type": "code", 1408 | "execution_count": 24, 1409 | "id": "bfa15c8a-f4a8-4907-939f-5cb80ef50abc", 1410 | "metadata": {}, 1411 | "outputs": [ 1412 | { 1413 | "data": { 1414 | "text/plain": [ 1415 | "View of AnnData Shadow object with n_obs × n_vars = 2 × 3 (original 3891 × 17806)\n", 1416 | " X \n", 1417 | " raw:\tX, var, varm\n", 1418 | " obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n", 1419 | " var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, 
total_counts\n", 1420 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1421 | " varm:\tPCs\n", 1422 | " obsp:\tconnectivities, distances\n", 1423 | " uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap" 1424 | ] 1425 | }, 1426 | "execution_count": 24, 1427 | "metadata": {}, 1428 | "output_type": "execute_result" 1429 | } 1430 | ], 1431 | "source": [ 1432 | "nested_view = view[:2,-3:]\n", 1433 | "nested_view" 1434 | ] 1435 | }, 1436 | { 1437 | "cell_type": "markdown", 1438 | "id": "6e3ce502-40e6-4b40-b78e-cf86e527bf18", 1439 | "metadata": {}, 1440 | "source": [ 1441 | "Getting attributes from views is no different than for shadow objects:" 1442 | ] 1443 | }, 1444 | { 1445 | "cell_type": "code", 1446 | "execution_count": 25, 1447 | "id": "216d5cd3-5457-4145-952b-61bed2be9f7d", 1448 | "metadata": {}, 1449 | "outputs": [ 1450 | { 1451 | "data": { 1452 | "text/html": [ 1453 | "
\n", 1454 | "\n", 1467 | "\n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | "
n_genes_by_countstotal_countstotal_counts_mtpct_counts_mtleidencelltype
AAACCCATCGTGCATA-119086704.0426.06.3544152CD4+ memory T
AAACGAAAGACAAGCC-115893900.0363.09.3076931CD14 mono
\n", 1500 | "
" 1501 | ], 1502 | "text/plain": [ 1503 | " n_genes_by_counts total_counts total_counts_mt \\\n", 1504 | "AAACCCATCGTGCATA-1 1908 6704.0 426.0 \n", 1505 | "AAACGAAAGACAAGCC-1 1589 3900.0 363.0 \n", 1506 | "\n", 1507 | " pct_counts_mt leiden celltype \n", 1508 | "AAACCCATCGTGCATA-1 6.354415 2 CD4+ memory T \n", 1509 | "AAACGAAAGACAAGCC-1 9.307693 1 CD14 mono " 1510 | ] 1511 | }, 1512 | "execution_count": 25, 1513 | "metadata": {}, 1514 | "output_type": "execute_result" 1515 | } 1516 | ], 1517 | "source": [ 1518 | "nested_view.obs" 1519 | ] 1520 | }, 1521 | { 1522 | "cell_type": "markdown", 1523 | "id": "9dbacf34-247e-4ac9-995b-f39656491973", 1524 | "metadata": {}, 1525 | "source": [ 1526 | "... as they are shadow objects themselves:" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": 26, 1532 | "id": "c0921236-cc65-43fc-a9a1-557d4ab0a1c6", 1533 | "metadata": {}, 1534 | "outputs": [ 1535 | { 1536 | "data": { 1537 | "text/plain": [ 1538 | "shadows.anndatashadow.AnnDataShadow" 1539 | ] 1540 | }, 1541 | "execution_count": 26, 1542 | "metadata": {}, 1543 | "output_type": "execute_result" 1544 | } 1545 | ], 1546 | "source": [ 1547 | "type(nested_view)" 1548 | ] 1549 | }, 1550 | { 1551 | "cell_type": "code", 1552 | "execution_count": 27, 1553 | "id": "e70179b3-da72-4155-bbf9-b6f9d1fa8d47", 1554 | "metadata": {}, 1555 | "outputs": [], 1556 | "source": [ 1557 | "# Clean up\n", 1558 | "nested_view.close()\n", 1559 | "view.close()\n", 1560 | "del nested_view, view\n", 1561 | "\n", 1562 | "monocytes_view.close()\n", 1563 | "mdata.close()\n", 1564 | "del monocytes_view, mdata" 1565 | ] 1566 | }, 1567 | { 1568 | "cell_type": "markdown", 1569 | "id": "ed55ed1b-1d8e-4250-9352-75f59cd5551a", 1570 | "metadata": {}, 1571 | "source": [ 1572 | " " 1573 | ] 1574 | }, 1575 | { 1576 | "cell_type": "markdown", 1577 | "id": "ab4a745e-df8c-46f5-9c3d-d2d3678fff5f", 1578 | "metadata": { 1579 | "slideshow": { 1580 | "slide_type": "slide" 1581 | }, 1582 | "tags": [] 
1583 | }, 1584 | "source": [ 1585 | "### Per-feature access to datasets on disk\n", 1586 | "\n", 1587 | "Per-feature access is currently not possible, as caching works at the level of individual HDF5 datasets.\n", 1588 | "\n", 1589 | "Views may read only the necessary parts of the arrays into memory; however, this behaviour is currently not universal.\n", 1590 | "\n", 1591 | "E.g.:" 1592 | ] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": 28, 1597 | "id": "ff5c4052-0929-43c3-947f-6de72b78d69e", 1598 | "metadata": {}, 1599 | "outputs": [ 1600 | { 1601 | "data": { 1602 | "text/plain": [ 1603 | "(10, 100)" 1604 | ] 1605 | }, 1606 | "execution_count": 28, 1607 | "metadata": {}, 1608 | "output_type": "execute_result" 1609 | } 1610 | ], 1611 | "source": [ 1612 | "adata_subset = adata[:10,:100]\n", 1613 | "adata_subset.X.shape" 1614 | ] 1615 | }, 1616 | { 1617 | "cell_type": "code", 1618 | "execution_count": 29, 1619 | "id": "e410e6e1-34c8-48f5-88b5-a45a0545e342", 1620 | "metadata": {}, 1621 | "outputs": [ 1622 | { 1623 | "data": { 1624 | "text/plain": [ 1625 | "View of AnnData Shadow object with n_obs × n_vars = 10 × 100 (original 3891 × 17806)\n", 1626 | " X ᐁ \n", 1627 | " raw:\tX, var, varm\n", 1628 | " obs:\t_index, celltype, leiden, n_genes_by_counts, pct_counts_mt, total_counts, total_counts_mt\n", 1629 | " var:\t_index, dispersions, dispersions_norm, feature_types, gene_ids, highly_variable, mean, mean_counts, means, mt, n_cells_by_counts, pct_dropout_by_counts, std, total_counts\n", 1630 | " obsm:\tX_pca, X_pca_copy, X_umap\n", 1631 | " varm:\tPCs\n", 1632 | " obsp:\tconnectivities, distances\n", 1633 | " uns:\tcelltype_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, umap" 1634 | ] 1635 | }, 1636 | "execution_count": 29, 1637 | "metadata": {}, 1638 | "output_type": "execute_result" 1639 | } 1640 | ], 1641 | "source": [ 1642 | "adata_subset" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "code", 1647 | "execution_count": 30, 1648 | 
"id": "bf2a317a-ca82-4a73-b0ef-07d0cfac2128", 1649 | "metadata": {}, 1650 | "outputs": [], 1651 | "source": [ 1652 | "# Clean up\n", 1653 | "adata.close()\n", 1654 | "adata_subset.close()\n", 1655 | "del adata, adata_subset" 1656 | ] 1657 | }, 1658 | { 1659 | "cell_type": "markdown", 1660 | "id": "bb50af6a-4ee2-4a8f-b022-9b0daa63e81e", 1661 | "metadata": {}, 1662 | "source": [ 1663 | " " 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "markdown", 1668 | "id": "fec4c262-5bbf-4393-b082-f208f7997a7a", 1669 | "metadata": { 1670 | "slideshow": { 1671 | "slide_type": "slide" 1672 | }, 1673 | "tags": [] 1674 | }, 1675 | "source": [ 1676 | "---\n", 1677 | "\n", 1678 | "In order to return the data to its original state, let's manually remove the items we wrote to the file:" 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "code", 1683 | "execution_count": 31, 1684 | "id": "46550ff4-39e1-40e6-80d0-4fd45d99af84", 1685 | "metadata": { 1686 | "slideshow": { 1687 | "slide_type": "fragment" 1688 | }, 1689 | "tags": [] 1690 | }, 1691 | "outputs": [], 1692 | "source": [ 1693 | "import h5py\n", 1694 | "\n", 1695 | "f = h5py.File(file, \"a\")\n", 1696 | "# ^\n", 1697 | "# ____________|\n", 1698 | "# if this works, \n", 1699 | "# no dangling read-only connections!\n", 1700 | "# \n", 1701 | "\n", 1702 | "del f[\"mod/rna/obsm/X_pca_copy\"]\n", 1703 | "f.close()" 1704 | ] 1705 | }, 1706 | { 1707 | "cell_type": "markdown", 1708 | "id": "6bc6a57c-39d0-45ad-be01-8cadde33da83", 1709 | "metadata": {}, 1710 | "source": [ 1711 | " " 1712 | ] 1713 | }, 1714 | { 1715 | "cell_type": "markdown", 1716 | "id": "752bd981-1cbd-43ec-b707-9308afb7e55f", 1717 | "metadata": {}, 1718 | "source": [ 1719 | " " 1720 | ] 1721 | } 1722 | ], 1723 | "metadata": { 1724 | "kernelspec": { 1725 | "display_name": "Python 3 (ipykernel)", 1726 | "language": "python", 1727 | "name": "python3" 1728 | }, 1729 | "language_info": { 1730 | "codemirror_mode": { 1731 | "name": "ipython", 1732 | "version": 3 1733 | }, 1734 | 
"file_extension": ".py", 1735 | "mimetype": "text/x-python", 1736 | "name": "python", 1737 | "nbconvert_exporter": "python", 1738 | "pygments_lexer": "ipython3", 1739 | "version": "3.10.11" 1740 | } 1741 | }, 1742 | "nbformat": 4, 1743 | "nbformat_minor": 5 1744 | } 1745 | --------------------------------------------------------------------------------