68 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "hatchling.build"
3 | requires = [ "hatchling" ]
4 |
5 | [project]
6 | name = "scverse-tutorials"
7 | version = "0.0.1"
8 | description = "Tutorials for single-cell analysis with scverse packages"
9 | readme = "README.md"
10 | license = "BSD-3-Clause"
11 | maintainers = [ { name = "scverse team", email = "core-team@scverse.org" } ]
12 | authors = [ { name = "scverse team" } ]
13 | requires-python = ">=3.13"
14 | classifiers = [
15 | "Private :: Do Not Upload", # Prevent uploading to PyPI
16 | "Programming Language :: Python :: 3 :: Only",
17 | "Programming Language :: Python :: 3.13",
18 | "Programming Language :: Python :: 3.14",
19 | ]
20 | optional-dependencies.dev = [ "pre-commit" ]
21 | optional-dependencies.docs = [
22 | # For notebooks
23 | "ipykernel",
24 | "ipython",
25 | "myst-nb>=1.1",
26 | "sphinx>=7",
27 | "sphinx-autodoc-typehints",
28 | "sphinx-book-theme>=1.1",
29 | "sphinx-copybutton",
30 | "sphinx-issues>=5.0.1",
31 | "sphinx-tabs",
32 | "sphinxcontrib-bibtex>=1",
33 | "sphinxext-opengraph",
34 | ]
35 | optional-dependencies.registry = [ "httpx", "jsonschema", "pillow", "pyyaml" ]
36 | urls.Documentation = "https://scverse.org/scverse-tutorials"
37 | urls.Home-page = "https://github.com/scverse/scverse-tutorials"
38 | urls.Source = "https://github.com/scverse/scverse-tutorials"
39 |
40 | [tool.hatch.envs.default]
41 | installer = "uv"
42 | features = [ "dev" ]
43 |
44 | [tool.hatch.envs.registry]
45 | features = [ "registry" ]
46 | scripts.validate = "python tutorial-registry/validate.py {args}"
47 |
48 | [tool.hatch.envs.docs]
49 | features = [ "docs" ]
50 | extra-dependencies = [
51 | "setuptools", # undeclared dependency in pybtex
52 | # fix from here: https://github.com/executablebooks/MyST-NB/pull/597
53 | "myst-nb @ git+https://github.com/flying-sheep/MyST-NB.git@eval-metadata",
54 | ]
55 | scripts.build = "sphinx-build -M html docs docs/_build {args}"
56 | scripts.open = "python3 -m webbrowser -t docs/_build/html/index.html"
57 | scripts.clean = "git clean -fdX -- {args:docs}"
58 |
59 | [tool.hatch.build.targets.wheel]
60 | bypass-selection = true # This is not a package
61 |
62 | [tool.ruff]
63 | line-length = 120
64 | src = [ "src" ]
65 | extend-include = [ "*.ipynb" ]
66 |
67 | format.docstring-code-format = true
68 |
69 | lint.select = [
70 | "B", # flake8-bugbear
71 | "BLE", # flake8-blind-except
72 | "C4", # flake8-comprehensions
73 | "D", # pydocstyle
74 | "E", # Error detected by Pycodestyle
75 | "F", # Errors detected by Pyflakes
76 | "I", # isort
77 | "RUF100", # Report unused noqa directives
78 | "TID", # flake8-tidy-imports
79 | "UP", # pyupgrade
80 | "W", # Warning detected by Pycodestyle
81 | ]
82 | lint.ignore = [
83 | "B008", # Errors from function calls in argument defaults. These are fine when the result is immutable.
84 | "C408", # dict() is nice for some use cases
85 | "D100", # Missing docstring in public module
86 | "D104", # Missing docstring in public package
87 | "D105", # __magic__ methods are often self-explanatory, allow missing docstrings
88 | "D107", # Missing docstring in __init__
89 | # Disable one in each pair of mutually incompatible rules
90 | "D203", # We don’t want a blank line before a class docstring
91 | "D213", # <> We want docstrings to start immediately after the opening triple quote
92 | "D400", # first line should end with a period [Bug: doesn’t work with single-line docstrings]
93 | "D401", # First line should be in imperative mood; try rephrasing
94 | "E501", # line too long -> we accept long comment lines; formatter gets rid of long code lines
95 | "E731", # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
96 | "E741", # allow I, O, l as variable names -> I is the identity matrix
97 | ]
98 | lint.per-file-ignores."docs/**" = [
99 | "B018", # Trailing expressions in notebooks are not “useless”
100 | "D103", # No need for docstrings in functions, we use literate programming
101 | "E402", # Imports in non-top cells are fine
102 | ]
103 | lint.pydocstyle.convention = "numpy"
104 |
105 | [tool.cruft]
106 | skip = [
107 | ".github/ISSUE_TEMPLATE",
108 | "environment.yml",
109 | "*.md",
110 | "src",
111 | "docs/*.md",
112 | "docs/notebooks",
113 | "docs/references.bib",
114 | "tests",
115 | "tutorial-registry",
116 | ]
117 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 |
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 | import sys
9 | from datetime import datetime
10 | from importlib.metadata import metadata
11 | from pathlib import Path
12 |
13 | HERE = Path(__file__).parent
14 | sys.path.insert(0, str(HERE / "extensions"))
15 |
16 |
17 | # -- Project information -----------------------------------------------------
18 |
19 | # NOTE: If you installed your project in editable mode, this might be stale.
20 | # If this is the case, reinstall it to refresh the metadata
21 | info = metadata("scverse-tutorials")
22 | project_name = info["Name"]
23 | author = info["Author"]
24 | copyright = f"{datetime.now():%Y}, {author}."
25 | version = info["Version"]
26 | urls = dict(pu.split(", ") for pu in info.get_all("Project-URL"))
27 | repository_url = urls["Source"]
28 |
29 | # The full version, including alpha/beta/rc tags
30 | release = info["Version"]
31 |
32 | bibtex_bibfiles = ["references.bib"]
33 | templates_path = ["_templates"]
34 | nitpicky = True # Warn about broken links
35 | needs_sphinx = "4.0"
36 |
37 | html_context = {
38 | "display_github": True, # Integrate GitHub
39 | "github_user": "scverse",
40 | "github_repo": project_name,
41 | "github_version": "main",
42 | "conf_py_path": "/docs/",
43 | }
44 |
45 | # -- General configuration ---------------------------------------------------
46 |
47 | # Add any Sphinx extension module names here, as strings.
48 | # They can be extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
49 | extensions = [
50 | "myst_nb",
51 | "sphinx_copybutton",
52 | "sphinx.ext.autodoc",
53 | "sphinx.ext.intersphinx",
54 | "sphinx.ext.autosummary",
55 | "sphinx.ext.napoleon",
56 | "sphinx_issues",
57 | "sphinxcontrib.bibtex",
58 | "sphinx_autodoc_typehints",
59 | "sphinx_tabs.tabs",
60 | "sphinx.ext.mathjax",
61 | "IPython.sphinxext.ipython_console_highlighting",
62 | "sphinxext.opengraph",
63 | *[p.stem for p in (HERE / "extensions").glob("*.py")],
64 | ]
65 |
66 | autosummary_generate = True
67 | autodoc_member_order = "groupwise"
68 | default_role = "literal"
69 | napoleon_google_docstring = False
70 | napoleon_numpy_docstring = True
71 | napoleon_include_init_with_doc = False
72 | napoleon_use_rtype = True # having a separate entry generally helps readability
73 | napoleon_use_param = True
74 | myst_heading_anchors = 6 # create anchors for h1-h6
75 | myst_enable_extensions = [
76 | "amsmath",
77 | "colon_fence",
78 | "deflist",
79 | "dollarmath",
80 | "html_image",
81 | "html_admonition",
82 | ]
83 | myst_url_schemes = ("http", "https", "mailto")
84 | nb_output_stderr = "remove"
85 | nb_execution_mode = "off"
86 | nb_merge_streams = True
87 | typehints_defaults = "braces"
88 |
89 | source_suffix = {
90 | ".rst": "restructuredtext",
91 | ".ipynb": "myst-nb",
92 | ".myst": "myst-nb",
93 | }
94 |
95 | intersphinx_mapping = {
96 | "python": ("https://docs.python.org/3", None),
97 | "anndata": ("https://anndata.readthedocs.io/en/latest/", None), # TODO: change back to stable after 0.12 release
98 | "numpy": ("https://numpy.org/doc/stable/", None),
99 | "scanpy": ("https://scanpy.readthedocs.io/en/stable/", None),
100 | "fast-array-utils": ("https://icb-fast-array-utils.readthedocs-hosted.com/en/stable", None),
101 | "dask": ("https://docs.dask.org/en/stable", None),
102 | "scipy": ("https://docs.scipy.org/doc/scipy", None),
103 | "rapids-singlecell": ("https://rapids-singlecell.readthedocs.io/en/stable/", None),
104 | }
105 |
106 | # List of patterns, relative to source directory, that match files and
107 | # directories to ignore when looking for source files.
108 | # This pattern also affects html_static_path and html_extra_path.
109 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints", ".jupyter_cache"]
110 |
111 |
112 | # -- Options for HTML output -------------------------------------------------
113 |
114 | # The theme to use for HTML and HTML Help pages. See the documentation for
115 | # a list of builtin themes.
116 | #
117 | html_theme = "sphinx_book_theme"
118 | html_static_path = ["_static"]
119 | html_css_files = ["css/custom.css"]
120 |
121 | html_title = project_name
122 |
123 | html_theme_options = {
124 | "repository_url": repository_url,
125 | "use_repository_button": True,
126 | "path_to_docs": "docs/",
127 | "navigation_with_keys": False,
128 | }
129 |
130 | pygments_style = "default"
131 |
132 | nitpick_ignore = [
133 | # If building the documentation fails because of a missing link that is outside your control,
134 | # you can add an exception to this list.
135 | # ("py:class", "igraph.Graph"),
136 | ]
137 |
--------------------------------------------------------------------------------
/docs/references.bib:
--------------------------------------------------------------------------------
1 | @article{Wolf2018,
2 | author = {Wolf, F. Alexander
3 | and Angerer, Philipp
4 | and Theis, Fabian J.},
5 | title = {SCANPY: large-scale single-cell gene expression data analysis},
6 | journal = {Genome Biology},
7 | year = {2018},
8 | month = {Feb},
9 | day = {06},
10 | volume = {19},
11 | number = {1},
12 | pages = {15},
13 | abstract = {Scanpy is a scalable toolkit for analyzing single-cell gene expression data. It includes methods for preprocessing, visualization, clustering, pseudotime and trajectory inference, differential expression testing, and simulation of gene regulatory networks. Its Python-based implementation efficiently deals with data sets of more than one million cells (https://github.com/theislab/Scanpy). Along with Scanpy, we present AnnData, a generic class for handling annotated data matrices (https://github.com/theislab/anndata).},
14 | issn = {1474-760X},
15 | doi = {10.1186/s13059-017-1382-0},
16 | url = {https://doi.org/10.1186/s13059-017-1382-0}
17 | }
18 | @inproceedings{luecken2021,
19 | author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and DE KUMAR, BONY and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M},
20 | booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks},
21 | editor = {J. Vanschoren and S. Yeung},
22 | pages = {},
23 | publisher = {Curran},
24 | title = {A sandbox for prediction and integration of DNA, RNA, and proteins in single cells},
25 | url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf},
26 | volume = {1},
27 | year = {2021}
28 | }
29 | @article{McCarthy2017,
30 | doi = {10.1093/bioinformatics/btw777},
31 | url = {https://doi.org/10.1093/bioinformatics/btw777},
32 | year = {2017},
33 | month = jan,
34 | publisher = {Oxford University Press ({OUP})},
35 | volume = {33},
36 | number = {8},
37 | pages = {1179--1186},
38 | author = {Davis J McCarthy and Kieran R Campbell and Aaron T L Lun and Quin F Wills},
39 | editor = {Ivo Hofacker},
40 | title = {Scater: pre-processing, quality control, normalization and visualization of single-cell {RNA}-seq data in R},
41 | journal = {Bioinformatics}
42 | }
43 | @article{Wolock2019,
44 | doi = {10.1016/j.cels.2018.11.005},
45 | url = {https://doi.org/10.1016/j.cels.2018.11.005},
46 | year = {2019},
47 | month = apr,
48 | publisher = {Elsevier {BV}},
49 | volume = {8},
50 | number = {4},
51 | pages = {281--291.e9},
52 | author = {Samuel L. Wolock and Romain Lopez and Allon M. Klein},
53 | title = {Scrublet: Computational Identification of Cell Doublets in Single-Cell Transcriptomic Data},
54 | journal = {Cell Systems}
55 | }
56 | @article{Satija2015,
57 | doi = {10.1038/nbt.3192},
58 | url = {https://doi.org/10.1038/nbt.3192},
59 | year = {2015},
60 | month = apr,
61 | publisher = {Springer Science and Business Media {LLC}},
62 | volume = {33},
63 | number = {5},
64 | pages = {495--502},
65 | author = {Rahul Satija and Jeffrey A Farrell and David Gennert and Alexander F Schier and Aviv Regev},
66 | title = {Spatial reconstruction of single-cell gene expression data},
67 | journal = {Nature Biotechnology}
68 | }
69 | @article{Zheng2017,
70 | doi = {10.1038/ncomms14049},
71 | url = {https://doi.org/10.1038/ncomms14049},
72 | year = {2017},
73 | month = jan,
74 | publisher = {Springer Science and Business Media {LLC}},
75 | volume = {8},
76 | number = {1},
77 | author = {Grace X. Y. Zheng and Jessica M. Terry and Phillip Belgrader and Paul Ryvkin and Zachary W. Bent and Ryan Wilson and Solongo B. Ziraldo and Tobias D. Wheeler and Geoff P. McDermott and Junjie Zhu and Mark T. Gregory and Joe Shuga and Luz Montesclaros and Jason G. Underwood and Donald A. Masquelier and Stefanie Y. Nishimura and Michael Schnall-Levin and Paul W. Wyatt and Christopher M. Hindson and Rajiv Bharadwaj and Alexander Wong and Kevin D. Ness and Lan W. Beppu and H. Joachim Deeg and Christopher McFarland and Keith R. Loeb and William J. Valente and Nolan G. Ericson and Emily A. Stevens and Jerald P. Radich and Tarjei S. Mikkelsen and Benjamin J. Hindson and Jason H. Bielas},
78 | title = {Massively parallel digital transcriptional profiling of single cells},
79 | journal = {Nature Communications}
80 | }
81 | @article{stuart2019comprehensive,
82 | title = {Comprehensive integration of single-cell data},
83 | author = {Stuart, Tim and Butler, Andrew and Hoffman, Paul and Hafemeister, Christoph and Papalexi, Efthymia and Mauck, William M and Hao, Yuhan and Stoeckius, Marlon and Smibert, Peter and Satija, Rahul},
84 | journal = {Cell},
85 | volume = {177},
86 | number = {7},
87 | pages = {1888--1902},
88 | year = {2019},
89 | publisher = {Elsevier}
90 | }
91 | @article{traag2019louvain,
92 | title = {From Louvain to Leiden: guaranteeing well-connected communities},
93 | author = {Traag, Vincent A and Waltman, Ludo and Van Eck, Nees Jan},
94 | journal = {Scientific reports},
95 | volume = {9},
96 | number = {1},
97 | pages = {5233},
98 | year = {2019},
99 | publisher = {Nature Publishing Group UK London}
100 | }
101 |
--------------------------------------------------------------------------------
/tutorial-registry/validate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Validate tutorials' meta.yaml and generate an output directory with json/images to be uploaded on github pages."""
3 |
4 | from __future__ import annotations
5 |
6 | import argparse
7 | import json
8 | import logging
9 | import shutil
10 | import sys
11 | from pathlib import Path
12 | from textwrap import dedent
13 | from typing import TYPE_CHECKING, Any, Literal
14 |
15 | import httpx
16 | import jsonschema
17 | import yaml
18 | from PIL import Image
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 | if TYPE_CHECKING:
23 | from collections.abc import Generator, Iterable, Mapping
24 |
25 | HERE = Path(__file__).absolute().parent
26 |
27 |
28 | def _check_url_exists(url: str) -> None:
29 | logger.info(f"Testing URL: {url}")
30 | response = httpx.head(url, follow_redirects=True)
31 | if response.status_code != 200:
32 | raise ValueError(f"URL {url} is not reachable (error {response.status_code}). ")
33 |
34 |
35 | def _check_image(img_path: Path) -> None:
36 | """Check that the image exists and that it is either SVG or fits into the 512x512 bounding box."""
37 | if not img_path.exists():
38 | raise ValueError(f"Image does not exist: {img_path}")
39 | if img_path.suffix == ".svg":
40 | return
41 | with Image.open(img_path) as img:
42 | width, height = img.size
43 | if not ((width == 512 and height <= 512) or (width <= 512 and height == 512)):
44 | raise ValueError(
45 | dedent(
46 | f"""\
47 | When validating {img_path}: Image must fit in a 512x512px bounding box and one dimension must be
48 | exactly 512 px. Actual dimensions (width, height): ({width}, {height}).
49 | """
50 | )
51 | )
52 |
53 |
54 | def validate_tutorials(schema_file: Path, tutorials_dir: Path) -> Generator[dict]:
55 | """Find all tutorial `meta.yaml` files in the tutorials dir and yield tutorial records."""
56 | schema = json.loads(schema_file.read_bytes())
57 | known_links = set()
58 | known_primary_to_orders: dict[str, set[int]] = {}
59 |
60 | for tmp_meta_file in tutorials_dir.rglob("meta.yaml"):
61 | tutorial_id = tmp_meta_file.parent.name
62 | with tmp_meta_file.open() as f:
63 | tmp_tutorial = yaml.load(f, yaml.SafeLoader)
64 |
65 | jsonschema.validate(tmp_tutorial, schema)
66 |
67 | link = tmp_tutorial["link"]
68 | if link in known_links:
69 | raise ValueError(f"When validating {tmp_meta_file}: Duplicate link: {link}")
70 | known_links.add(link)
71 |
72 | # Check for duplicate orders within the same primary category
73 | primary_category = tmp_tutorial.get("primary_category")
74 | order = tmp_tutorial.get("order")
75 |
76 | if primary_category and order is not None:
77 | if primary_category not in known_primary_to_orders:
78 | known_primary_to_orders[primary_category] = set()
79 |
80 | if order in known_primary_to_orders[primary_category]:
81 | raise ValueError(
82 | f"When validating {tmp_meta_file}: Duplicate order {order} "
83 | f"for primary category '{primary_category}'"
84 | )
85 |
86 | known_primary_to_orders[primary_category].add(order)
87 |
88 | _check_url_exists(link)
89 |
90 | # replace image path by absolute local path to image
91 | img_path = tutorials_dir / tutorial_id / tmp_tutorial["image"]
92 | _check_image(img_path)
93 | tmp_tutorial["image"] = str(img_path)
94 |
95 | yield tmp_tutorial
96 |
97 |
98 | def load_categories(categories_file: Path) -> dict[str, Any]:
99 | """Load the categories JSON."""
100 | with open(categories_file) as f:
101 | return yaml.load(f, yaml.SafeLoader)
102 |
103 |
104 | def make_output(
105 | categories: Iterable[Mapping[str, Mapping[Literal["description"], str]]],
106 | tutorials: Iterable[Mapping[str, str | Iterable[str]]],
107 | *,
108 | outdir: Path | None = None,
109 | ) -> None:
110 | """Create the output directory.
111 |
112 | Structure:
113 | outdir
114 | - tutorials.json # contains categories and tutorials
115 | - tutorialxxx/icon.svg # original icon filenames under a folder for each tutorial. The path of the icon is listed in the json.
116 | - tutorialyyy/icon.png
117 | """
118 | if outdir:
119 | outdir.mkdir(parents=True)
120 |
121 | tutorials_rel = []
122 | for tutorial in tutorials:
123 | img_srcpath = Path(tutorial["image"])
124 | img_localpath = Path(img_srcpath.parent.name) / img_srcpath.name
125 | tut_rel = dict(tutorial)
126 | tut_rel["image"] = str(img_localpath)
127 | tutorials_rel.append(tut_rel)
128 | if outdir:
129 | img_outpath = outdir / img_localpath
130 | img_outpath.parent.mkdir()
131 | shutil.copy(img_srcpath, img_outpath)
132 |
133 | result = {"categories": categories, "tutorials": tutorials_rel}
134 |
135 | if outdir:
136 | with (outdir / "tutorials.json").open("w") as f:
137 | json.dump(result, f)
138 | else:
139 | json.dump(result, sys.stdout, indent=2)
140 |
141 |
142 | def main(schema_file: Path, meta_dir: Path, categories_file: Path, *, outdir: Path | None = None):
143 | """Validate and create output directory."""
144 | tutorials = list(validate_tutorials(schema_file, meta_dir))
145 | categories = load_categories(categories_file)
146 | make_output(categories, tutorials, outdir=outdir)
147 |
148 |
149 | if __name__ == "__main__":
150 | logging.basicConfig(level=logging.INFO, format="%(message)s")
151 |
152 | parser = argparse.ArgumentParser(
153 | prog="validate.py",
154 | description="Validate tutorials' meta.yaml and generate an output directory with json/images to be uploaded on github pages.",
155 | )
156 | parser.add_argument("--outdir", type=Path, help="outdir that will contain the data to be uploaded on github pages")
157 | args = parser.parse_args()
158 |
159 | SCHEMA = HERE / "schema.json"
160 | META_DIR = HERE / "tutorials"
161 | CATEGORIES = HERE / "categories.yml"
162 |
163 | main(SCHEMA, META_DIR, CATEGORIES, outdir=args.outdir)
164 |
--------------------------------------------------------------------------------
/docs/how-to-dask.md:
--------------------------------------------------------------------------------
1 | # Dask Q&A
2 |
3 | Here we will go through some common questions and answers about `dask`, with a special focus on its integration with `scanpy` and `anndata`. For more comprehensive tutorials or other topics like {doc}`launching a cluster `, head over to their documentation.
4 |
5 | ## Quickstart
6 |
7 | ### How do I monitor the {doc}`dask dashboard `?
8 |
9 | If you are in a Jupyter notebook, rendering the `repr` of your `client` will show a link, usually something like `http://localhost:8787/status`.
10 | If you are working locally, this link alone should suffice.
11 |
12 | If you are working on a remote notebook from a web browser, you will need to replace `http://localhost` with the root URL of the notebook.
13 |
14 | If you are in VS Code, there is a [`dask` extension] that lets you monitor the dashboard there.
15 |
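As a minimal sketch (the worker count and memory limit are placeholder values), you can also print the dashboard address directly from a local client:

```python
from dask.distributed import Client, LocalCluster

# start a local cluster; the dashboard address is chosen automatically
cluster = LocalCluster(n_workers=4, memory_limit="8GiB")
client = Client(cluster)

# in a notebook, displaying `client` shows this link as part of its repr
print(client.dashboard_link)  # e.g. http://127.0.0.1:8787/status
```
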
16 | ### How do I know how to allocate resources?
17 |
18 | In `dask`, every worker will receive an equal share of the memory available.
19 | So if you request, e.g., a SLURM job with 256 GB of RAM and then start 8 workers, each will have 32 GB of memory.
20 |
21 | `dask` generally distributes work to workers based on the chunking of the array.
22 | So if you have dense chunks of `(30_000, 30_000)` with 32-bit integers, each worker will need at least 3.6 GB just to load its chunk.
23 | If you then do something like matrix multiplication, you may need double that or even more.
24 |
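As a quick sanity check of that number (a back-of-the-envelope sketch, not `dask` API):

```python
import numpy as np

# memory footprint of one dense (30_000, 30_000) chunk of 32-bit integers
chunk_nbytes = 30_000 * 30_000 * np.dtype(np.int32).itemsize
print(f"{chunk_nbytes / 1e9:.1f} GB per chunk")  # 3.6 GB
```
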
25 | ### How do I read my data into a `dask` array?
26 |
27 | {func}`anndata.experimental.read_elem_lazy` or {func}`anndata.experimental.read_lazy` can help you if you already have data on-disk that was written to the `anndata` file format.
28 | If you use {func}`dask.array.to_zarr`, the data _cannot_ be read in using `anndata`'s functionality as `anndata` will look for its {doc}`specified file format metadata `.
29 |
30 | If you need to implement custom IO, we have generally found that {func}`dask.array.map_blocks` provides a good way to do so.
31 | See [our custom h5 io code] for an example.
32 |
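As a minimal sketch (the store path is a placeholder, and this assumes the data was previously written by `anndata`, e.g. via `AnnData.write_zarr`, with a recent `anndata` release installed):

```python
import anndata as ad

# lazily open a store written by anndata; X, layers, obs, etc. are loaded on demand
adata = ad.experimental.read_lazy("pbmc.zarr")  # placeholder path
print(adata.X)  # a lazy (dask-backed) array; nothing is computed yet
```
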
33 | ## Advanced use and how-to-contribute
34 |
35 | ### How do `scanpy` and `anndata` handle sparse matrices?
36 |
37 | While there is some {class}`scipy.sparse.csr_matrix` and {class}`scipy.sparse.csc_matrix` support in `dask`, it is not comprehensive and is missing key functions like summation and mean.
38 | We have implemented custom functionality, much of which lives in {mod}`fast_array_utils`, although we have also had to implement custom algorithms like `pca` for sparse-in-dask.
39 | In the future, an [`array-api`]-compatible sparse matrix like [`finch`] would help us considerably, as `dask` supports the [`array-api`].
40 |
41 | Therefore, if you run into a puzzling error after trying to run a function like {func}`numpy.sum` (or similar) on a sparse-in-dask array, consider checking {mod}`fast_array_utils`.
42 | If you need to implement the function yourself, see the next point.
43 |
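As a hedged sketch of that workflow (the function names in `fast_array_utils.stats` are taken from its documentation; double-check them against the version you have installed):

```python
import dask.array as da
import numpy as np
import scipy.sparse as sp
from fast_array_utils import stats

# a small CSR matrix wrapped as a dask array with sparse chunks;
# `meta` tells dask that the chunks are scipy CSR matrices, not ndarrays
csr = sp.random(1_000, 50, density=0.1, format="csr", dtype=np.float32, random_state=0)
x = da.from_array(csr, chunks=(250, 50), meta=sp.csr_matrix(np.empty((0, 0), dtype=np.float32)))

# plain np.sum / .mean are not reliably implemented for sparse-in-dask,
# so use the scverse helpers instead
gene_sums = stats.sum(x, axis=0)
gene_means = stats.mean(x, axis=0)
```
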
44 | ### Custom block-wise array operations
45 |
46 | Sometimes you may want to perform an operation on an array that is not implemented anywhere.
47 | Generally, we have found {func}`dask.array.map_blocks` to be versatile enough that most operations can be expressed with it. Click on the link to see `dask`'s own documentation of the function.
48 |
49 | Take this (simplified) example of calculating a gram matrix from {func}`scanpy.pp.pca` for sparse-in-dask (here `da` is `dask.array`, `np` is `numpy`, and `x` is a CSR sparse-in-dask array chunked along the first axis):
50 |
51 | ```python
52 | def gram_block(x_part):
53 | gram_matrix = x_part.T @ x_part
54 | return gram_matrix[None, ...]
55 |
56 | gram_matrix_dask = da.map_blocks(
57 | gram_block,
58 | x,
59 | new_axis=(1,),
60 | chunks=((1,) * x.blocks.size, (x.shape[1],), (x.shape[1],)),
61 | meta=np.array([], dtype=x.dtype),
62 | dtype=x.dtype,
63 | ).sum(axis=0)
64 | ```
65 |
66 | This algorithm goes through the rows `chunk_size` at a time and calculates the gram matrix for each block of rows, producing a collection of `(n_vars, n_vars)`-sized matrices.
67 | These are then summed together to produce a single `(n_vars, n_vars)` matrix, which is the gram matrix.
68 |
69 | Because `dask` does not implement matrix multiplication for sparse-in-dask, we do it ourselves.
70 | We use `map_blocks` over a CSR sparse-in-dask array where the chunking looks something like `(chunk_size, n_vars)`.
71 | When we compute an individual block's gram matrix, we add an axis via `[None, ...]` so that we can sum over that axis, i.e., the `da.map_blocks` call produces a `(n_obs // chunk_size, n_vars, n_vars)`-sized array, which is then summed over the first dimension.
72 | However, to make this work, we need to be very specific about what `da.map_blocks` should expect the result to look like, which is done via `new_axis` and `chunks`.
73 | `new_axis` indicates that the output has one more axis than the input; each block gains a new leading axis of length 1.
74 | The `chunks` argument specifies that the output of `da.map_blocks` should consist of `x.blocks.size` blocks, each of shape `(1, n_vars, n_vars)`.
75 | This in turn lets `dask` infer the overall shape of the output.
76 |
77 | While this example is a bit complicated, it shows how you can go from a matrix of one shape and chunking to another by operating cleanly over blocks.
78 |
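To see the reshaping in action, here is a small self-contained sanity check on dense random data (this is not the sparse production code, just the same `map_blocks` pattern):

```python
import dask.array as da
import numpy as np


def gram_block(x_part):
    # per-block gram matrix with an extra leading axis to sum over
    return (x_part.T @ x_part)[None, ...]


x = da.random.random((1_000, 20), chunks=(250, 20))

gram = da.map_blocks(
    gram_block,
    x,
    new_axis=(1,),
    chunks=((1,) * x.blocks.size, (x.shape[1],), (x.shape[1],)),
    meta=np.array([], dtype=x.dtype),
    dtype=x.dtype,
).sum(axis=0)

# for dense data this matches the direct computation
np.testing.assert_allclose(gram.compute(), (x.T @ x).compute(), rtol=1e-10)
```
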
79 | ## FAQ
80 |
81 | ### What is `persist` used for in RSC notebooks?
82 |
83 | In the {doc}`multi-gpu showcase notebook for rapids-singlecell `, {meth}`dask.array.Array.persist` appears throughout the notebook.
84 | This loads the entire dataset into memory while keeping the representation as a dask array.
85 | Thus, lazy computation still works, but the data only has to be read into memory once.
86 | The catch is that you need enough memory to use `persist`, but if you do, it greatly speeds up the computation.
87 |
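A minimal sketch of the pattern (the array sizes are placeholders; in the notebook this is applied to the data matrix rather than random data):

```python
import dask.array as da

x = da.random.random((100_000, 2_000), chunks=(10_000, 2_000))

# persist() computes all chunks now and keeps them in (worker) memory,
# while `x` remains a dask array with the same lazy interface
x = x.persist()

# subsequent operations reuse the in-memory chunks instead of recomputing them
gene_means = x.mean(axis=0).compute()
```
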
88 | ### I'm out of memory, what now?
89 |
90 | You can always reduce the number of workers you use, which will cause more memory to be allocated per worker.
91 | Some algorithms may have limitations with loading all data onto a single node; see {issue}`dask/dask-ml#985` for an example.
92 |
93 | ### How do I choose chunk sizes?
94 |
95 | Have a look at the {doc}`dask docs for chunking `; the general rule of thumb there is to use larger chunks in memory than on disk.
96 | In this sense, it is probably a good idea to use the largest in-memory chunk size that your memory limits (and the algorithms you use) allow, in order to make the most of any thread-level parallelism within algorithms.
97 | For sparse data, where the in-memory chunks do not map directly to those on disk, maxing out the available memory by choosing a large chunk size becomes even more important.
98 |
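For example, a sketch of rechunking to larger in-memory chunks than the (often small) on-disk ones; the sizes here are placeholder values:

```python
import dask.array as da

# pretend these are the small chunks the data was stored with on disk
x = da.random.random((1_000_000, 2_000), chunks=(10_000, 2_000))

# fewer, larger chunks in memory -> less scheduler overhead, better per-chunk parallelism
x = x.rechunk((100_000, 2_000))
```
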
99 | [`dask` extension]: https://marketplace.visualstudio.com/items?itemName=joyceerhl.vscode-das
100 | [our custom h5 io code]: https://github.com/scverse/anndata/blob/089ed929393a02200b389395f278b7c920e5bc4a/src/anndata/_io/specs/lazy_methods.py#L179-L205
101 | [`array-api`]: https://data-apis.org/array-api/latest/index.html
102 | [`finch`]: https://github.com/finch-tensor/finch-tensor-python
103 |
--------------------------------------------------------------------------------
/docs/notebooks/scverse_data_interoperability.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "fifth-grammar",
6 | "metadata": {},
7 | "source": [
8 | "# Interoperability between scverse data structures and other languages \n",
9 | "\n",
10 | "Here we provide a list of resources that can be used to work with scverse data structures from your language of choice.\n",
11 | "\n",
12 | "A more detailed tutorial on interoperability with other languages can be found in the [Single-cell analysis best-practices book](https://www.sc-best-practices.org/introduction/interoperability.html)."
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "id": "pending-grenada",
18 | "metadata": {},
19 | "source": [
20 | "## Conversion between python and R structures for single-cell analysis\n",
21 | "\n",
22 | "Several toolkits for single-cell analysis in R build upon [SingleCellExperiment](http://bioconductor.org/books/3.16/OSCA.intro/the-singlecellexperiment-class.html) objects or [Seurat](https://satijalab.org/seurat/) objects. The following table provides an indication of which objects slots store the same data in AnnData and R objects.\n",
23 | "\n",
24 | "| | `AnnData` | `SingleCellExperiment` | `Seurat` |\n",
25 | "|--------------------------------------|--------------------------|------------------------|------------------------------------|\n",
26 | "| **Active expression matrix** | `adata.X` | `assay(sce)` | `GetAssayData(seu)` |\n",
27 | "| **Alternative expression matrices** | `adata.layers['counts']` | `counts(sce)` | `GetAssay(seu)@counts` |\n",
28 | "| **Cell-level metadata** | `adata.obs` | `colData(sce)` | `seu@meta.data` |\n",
29 | "| **Gene-level metadata** | `adata.var` | `rowData(sce)` | `GetAssay(seu)@meta.features` |\n",
30 | "| **Dimensionality reductions** | `adata.obsm` | `reducedDim(sce)` | `seu@reductions` |\n",
31 | "| **cell IDs** | `adata.obs_names` | `colnames(sce)` | `colnames(seu)` |\n",
32 | "| **gene IDs** | `adata.var_names` | `rownames(sce)` | `rownames(seu)` |\n",
33 | "| **Cell-cell similarity graphs** | `adata.obsp` | --- | `seu@graphs` |"
34 | ]
35 | },
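{
 "cell_type": "markdown",
 "id": "anndata-slot-access",
 "metadata": {},
 "source": [
  "As a minimal sketch on the Python side (the file path is a placeholder), the `AnnData` column of the table corresponds to attribute access like this:\n",
  "\n",
  "```python\n",
  "import anndata as ad\n",
  "\n",
  "adata = ad.read_h5ad(\"example.h5ad\")  # placeholder path\n",
  "\n",
  "adata.X  # active expression matrix\n",
  "adata.layers[\"counts\"]  # alternative expression matrices\n",
  "adata.obs  # cell-level metadata\n",
  "adata.var  # gene-level metadata\n",
  "adata.obsm  # dimensionality reductions\n",
  "adata.obs_names  # cell IDs\n",
  "adata.var_names  # gene IDs\n",
  "adata.obsp  # cell-cell similarity graphs\n",
  "```"
 ]
},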
36 | {
37 | "cell_type": "markdown",
38 | "id": "executed-authority",
39 | "metadata": {},
40 | "source": [
41 | "### AnnData ⇄ Seurat objects\n",
42 | "\n",
43 | "See [Seurat documentation](https://satijalab.org/seurat/) for more details about Seurat objects.\n",
44 | "\n",
45 | "- [MuDataSeurat](https://pmbio.github.io/MuDataSeurat/) - R package to read and write `h5ad` files to and from Seurat objects\n",
46 | "- [sceasy](https://github.com/cellgeni/sceasy#usage) - R package to convert between objects within a session or saving `h5ad` or `rds` files \n",
47 | "- Using [reticulate](https://theislab.github.io/scanpy-in-R/#converting-from-python-to-r-1) - tutorial for conversion within R/Rmd sessions \n",
48 | "\n",
49 | "\n",
50 | "\n",
51 | "### AnnData ⇄ SingleCellExperiment objects\n",
52 | "\n",
53 | "See [OSCA book](http://bioconductor.org/books/3.16/OSCA.intro/the-singlecellexperiment-class.html) for more details about SingleCellExperiment objects.\n",
54 | "\n",
55 | "- [zellconverter](https://theislab.github.io/zellkonverter/articles/zellkonverter.html) - R/Bioconductor package to read and write `h5ad` files and to convert objects within R sessions using [basilisk](https://bioconductor.org/packages/release/bioc/html/basilisk.html) \n",
56 | "- [anndata2ri](https://github.com/theislab/anndata2ri#anndata--singlecellexperiment) - python package to convert between objects within python sessions using [rpy2](https://github.com/rpy2/rpy2#readme) \n",
57 | "- [sceasy](https://github.com/cellgeni/sceasy#usage) - R package to convert between objects within a session or saving `h5ad` or `rds` files \n",
58 | "- Using [reticulate](https://theislab.github.io/scanpy-in-R/#converting-from-python-to-r-1) - tutorial for conversion within R/Rmd sessions \n",
59 | "\n",
60 | "### AnnData ⇄ Loom objects\n",
61 | "\n",
62 | "See [Loompy documentation](http://linnarssonlab.org/loompy/index.html) for more details about Loom objects.\n",
63 | "\n",
64 | "- Using [anndata](https://anndata.readthedocs.io/en/latest/generated/anndata.read_loom.html#anndata.read_loom) - function to read `loom` files as AnnData objects\n",
65 | "- [sceasy](https://github.com/cellgeni/sceasy#usage) - R package to convert between objects within a session or saving `h5ad` or `loom` files \n",
66 | "\n",
67 | "### MuData ⇄ Seurat objects\n",
68 | "\n",
69 | "See [Seurat documentation](https://satijalab.org/seurat/) for more details about Seurat objects.\n",
70 | "\n",
71 | "- [MuDataSeurat](https://pmbio.github.io/MuDataSeurat/) - R package to read and write `h5mu` files to and from Seurat objects\n",
72 | "\n",
73 | "### MuData ⇄ MultiAssayExperiment objects\n",
74 | "\n",
75 | "See [documentation](http://waldronlab.io/MultiAssayExperiment/) for more details about MultiAssayExperiment objects.\n",
76 | "\n",
77 | "- [MuData for MultiAssayExperiment](https://ilia-kats.github.io/MuData/articles/Getting-Started.html) - R package to read and write `h5mu` files to and from `MultiAssayExperiment` objects \n",
78 | "\n",
79 | "### MuData ⇄ ArchR objects\n",
80 | "\n",
81 | "See [ArchR documentation](https://www.archrproject.com/bookdown/what-is-an-arrow-file-archrproject.html) for more details about ArchR objects.\n",
82 | "\n",
83 | "- Using [chame](https://gtca.github.io/chame/examples/archr_io.html) - python package providing functionality to read Arrow files "
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "id": "virtual-street",
89 | "metadata": {},
90 | "source": [
91 | "## Read h5ad/h5mu in other languages\n",
92 | "\n",
93 | "### Julia\n",
94 | "\n",
95 | "- [Muon.jl](https://docs.juliahub.com/Muon/QfqCh/0.1.1/objects/) provides Julia implementations of ``AnnData`` and ``MuData`` objects, as well as IO for the HDF5 format\n",
96 | "- [scVI.jl](https://maren-ha.github.io/scVI.jl/index.html) provides a Julia implementation of ``AnnData`` as well as IO for the HDF5 format.\n",
97 | "\n",
98 | "### Javascript\n",
99 | "\n",
100 | "- [Vitessce](https://github.com/vitessce/vitessce) -contains loaders from ``AnnData``s stored as Zarr\n",
101 | "\n",
102 | "### Rust\n",
103 | "\n",
104 | "- [anndata-rs](https://github.com/kaizhang/anndata-rs) provides a Rust implementation of ``AnnData`` as well as advanced IO support for the HDF5 storage format."
105 | ]
106 | }
107 | ],
108 | "metadata": {
109 | "kernelspec": {
110 | "display_name": "Python 3 (ipykernel)",
111 | "language": "python",
112 | "name": "python3"
113 | },
114 | "language_info": {
115 | "codemirror_mode": {
116 | "name": "ipython",
117 | "version": 3
118 | },
119 | "file_extension": ".py",
120 | "mimetype": "text/x-python",
121 | "name": "python",
122 | "nbconvert_exporter": "python",
123 | "pygments_lexer": "ipython3",
124 | "version": "3.12.12"
125 | }
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 5
129 | }
130 |
--------------------------------------------------------------------------------
/tutorial-registry/tutorials/scirpy-tcr/icon.svg:
--------------------------------------------------------------------------------
(icon.svg markup omitted in this dump; the original file is a 147-line SVG icon for the scirpy-tcr tutorial)
--------------------------------------------------------------------------------
/docs/notebooks/tutorial_concatenation_anndata_mudata.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Concatenating multimodal experiments"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import warnings\n",
17 | "\n",
18 | "import anndata as ad\n",
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "from mudata import MuData\n",
22 | "\n",
23 | "warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n",
24 | "\n",
25 | "np.random.seed(1979)"
26 | ]
27 | },
28 | {
29 | "attachments": {},
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Sometimes, you may want to concatenate 2 `MuData` objects because they represent complementary slices of the same dataset on which you have applied different processing. Think of analysing B and T cells separately for your PBMC typical dataset. \n",
34 | "Other times instead you need to concatenate 2 modalities into one `AnnData` because the tool you're working with doesn't currently support `MuData` (yeah we know, how dare they?).\n",
35 | "We will showcase here these 2 scenarios of concatenation.\n",
36 | "\n",
37 | "\n",
38 | ":::{note}\n",
39 | "Native concatenation of two `MuData` objects is currently discussed in \n",
40 | "[scverse/mudata#20](https://github.com/scverse/mudata/issues/20) and may\n",
41 | "eventually make parts of this tutorial obsolete. \n",
42 | "\n",
43 | "Note that for some modalities, concatenation requires extra care. For instance, \n",
44 | "in the case of ATAC-seq, concatenation does not make sense unless fragments are aggregated first. \n",
45 | ":::"
46 | ]
47 | },
48 | {
49 | "attachments": {},
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "First, we need to import the raw data for a dataset of our choice. We use mudatasets package that conveniently collects some useful 10X single cell datasets that are publicly available. For this example we need a multimodal dataset, so select the *citeseq 5k* dataset, a collection of healthy PBMCs for which 2 modalities were profiled, RNA and PROTEINS."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/plain": [
64 | "['pbmc10k_multiome',\n",
65 | " 'brain9k_multiome',\n",
66 | " 'brain3k_multiome',\n",
67 | " 'pbmc5k_citeseq',\n",
68 | " 'pbmc3k_multiome']"
69 | ]
70 | },
71 | "execution_count": 2,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "import mudatasets as mds\n",
78 | "\n",
79 | "mds.list_datasets()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "■ File filtered_feature_bc_matrix.h5 from pbmc5k_citeseq has been found at /home/runner/mudatasets/pbmc5k_citeseq/filtered_feature_bc_matrix.h5\n",
92 | "■ Checksum is validated (md5) for filtered_feature_bc_matrix.h5\n"
93 | ]
94 | },
95 | {
96 | "name": "stderr",
97 | "output_type": "stream",
98 | "text": [
99 | "/home/runner/miniconda3/envs/tutorials/lib/python3.12/site-packages/mudatasets/core.py:203: UserWarning: Dataset is in the 10X .h5 format and can't be loaded as backed.\n",
100 | " warn(\"Dataset is in the 10X .h5 format and can't be loaded as backed.\")\n"
101 | ]
102 | },
103 | {
104 | "name": "stderr",
105 | "output_type": "stream",
106 | "text": [
107 | "/home/runner/miniconda3/envs/tutorials/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
108 | " from .autonotebook import tqdm as notebook_tqdm\n"
109 | ]
110 | },
111 | {
112 | "name": "stdout",
113 | "output_type": "stream",
114 | "text": [
115 | "■ Loading filtered_feature_bc_matrix.h5...\n"
116 | ]
117 | },
118 | {
119 | "name": "stderr",
120 | "output_type": "stream",
121 | "text": [
122 | "/home/runner/miniconda3/envs/tutorials/lib/python3.12/site-packages/anndata/_core/anndata.py:1798: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n",
123 | " utils.warn_names_duplicates(\"var\")\n",
124 | "/home/runner/miniconda3/envs/tutorials/lib/python3.12/site-packages/anndata/_core/anndata.py:1798: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n",
125 | " utils.warn_names_duplicates(\"var\")\n",
126 | "/home/runner/miniconda3/envs/tutorials/lib/python3.12/site-packages/mudata/_core/mudata.py:947: UserWarning: var_names are not unique. To make them unique, call `.var_names_make_unique`.\n",
127 | " warnings.warn(\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "mds.info(\"pbmc5k_citeseq\")\n",
133 | "pbmc5k = mds.load(\"pbmc5k_citeseq\", files=[\"filtered_feature_bc_matrix.h5\"])"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 4,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/html": [
144 | "