├── .github
    └── workflows
    │   └── test_and_deploy.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── data
    ├── DLPFC.h5ad
    ├── corresptissues.json
    ├── data_test.h5ad
    └── tissuescolor.json
├── notebooks
    ├── Embryo-Registration.ipynb
    ├── Spatial-differential-expression.ipynb
    └── Test-embryo.ipynb
├── pyproject.toml
├── setup.cfg
├── src
    └── sc3D
    │   ├── __init__.py
    │   ├── _tests
    │       ├── __init__.py
    │       └── test_sc3D.py
    │   ├── sc3D.py
    │   └── transformations.py
├── tox.ini
└── txt
    ├── figures
        ├── interpolation.ai
        ├── interpolation.pdf
        ├── sc3D_vs_PASTE_DLPFC.pdf
        ├── sc3D_vs_PASTE_Mouse.pdf
        └── sc3D_vs_PASTE_time.pdf
    ├── references.bib
    ├── scSpatial.docx
    ├── scSpatial.pdf
    └── scSpatial.tex


/.github/workflows/test_and_deploy.yml:
--------------------------------------------------------------------------------
 1 | # This workflows will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: tests
 5 | 
 6 | on:
 7 |   push:
 8 |     branches:
 9 |       - main
10 |     tags:
11 |       - "v*" # Push events to matching v*, i.e. v1.0, v20.15.10
12 |   pull_request:
13 |     branches:
14 |       - main
15 |   workflow_dispatch:
16 | 
17 | jobs:
18 |   test:
19 |     name: ${{ matrix.platform }} py${{ matrix.python-version }}
20 |     runs-on: ${{ matrix.platform }}
21 |     strategy:
22 |       matrix:
23 |         platform: [ubuntu-latest, windows-latest, macos-latest]
24 |         python-version: [3.8, 3.9, '3.10']
25 | 
26 |     steps:
27 |       - uses: actions/checkout@v2
28 | 
29 |       - name: Set up Python ${{ matrix.python-version }}
30 |         uses: actions/setup-python@v2
31 |         with:
32 |           python-version: ${{ matrix.python-version }}
33 | 
34 |       # note: if you need dependencies from conda, considering using
35 |       # setup-miniconda: https://github.com/conda-incubator/setup-miniconda
36 |       # and
37 |       # tox-conda: https://github.com/tox-dev/tox-conda
38 |       - name: Install dependencies
39 |         run: |
40 |           python -m pip install --upgrade pip
41 |           python -m pip install setuptools tox tox-gh-actions
42 | 
43 |       # this runs the platform-specific tests declared in tox.ini
44 |       - name: Test with tox
45 |         uses: GabrielBB/xvfb-action@v1
46 |         with:
47 |           run: python -m tox
48 |         env:
49 |           PLATFORM: ${{ matrix.platform }}
50 | 
51 |       - name: Coverage
52 |         uses: codecov/codecov-action@v2
53 | 
54 |   deploy:
55 |     # this will run when you have tagged a commit, starting with "v*"
56 |     # and requires that you have put your twine API key in your
57 |     # github secrets (see readme for details)
58 |     needs: [test]
59 |     runs-on: ubuntu-latest
60 |     if: contains(github.ref, 'tags')
61 |     steps:
62 |       - uses: actions/checkout@v2
63 |       - name: Set up Python
64 |         uses: actions/setup-python@v2
65 |         with:
66 |           python-version: "3.x"
67 |       - name: Install dependencies
68 |         run: |
69 |           python -m pip install --upgrade pip
70 |           pip install -U setuptools setuptools_scm wheel twine build
71 |       - name: Build and publish
72 |         env:
73 |           TWINE_USERNAME: __token__
74 |           TWINE_PASSWORD: ${{ secrets.TWINE_API_KEY }}
75 |         run: |
76 |           git tag
77 |           python -m build .
78 |           twine upload dist/*
79 | 
80 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints
 2 | build
 3 | dist
 4 | env
 5 | out
 6 | sc-3D
 7 | 
 8 | src/*.egg-info
 9 | src/sc3D/__pycache__
10 | 
11 | _*.ipynb
12 | data/_*.h5ad
13 | data/_*.json
14 | data/*.png
15 | data/_*
16 | .DS_Store
17 | */.DS_Store
18 | */*/.DS_Store
19 | 
20 | .coverage
21 | 
22 | .vscode
23 | src/sc3D/_tests/__pycache__
24 | *coverage*
25 | txt/*.aux
26 | txt/*.bbl
27 | txt/*.blg
28 | txt/*.log
29 | txt/*.synctex.gz
30 | txt/*.docx
31 | *.ai
32 | test.csv
33 | *.fdb_latexmk
34 | *.fls


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.1
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Guignard"
 5 |   given-names: "Léo"
 6 |   orcid: "https://orcid.org/0000-0002-3686-1385"
 7 | title: "sc3D"
 8 | version: 1.2.1
 9 | date-released: 2023-07-01
10 | url: "https://github.com/GuignardLab/sc3D"
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 leoguignard
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | [![License MIT](https://img.shields.io/pypi/l/sc-3D.svg?color=green)](https://github.com/GuignardLab/sc3D/raw/main/LICENSE)
  3 | [![PyPI](https://img.shields.io/pypi/v/sc-3D.svg?color=green)](https://pypi.org/project/sc-3D)
  4 | [![Python Version](https://img.shields.io/pypi/pyversions/sc-3D.svg?color=green)](https://python.org)
  5 | [![tests](https://github.com/GuignardLab/sc3D/workflows/tests/badge.svg)](https://github.com/GuignardLab/sc3D/actions)
  6 | [![codecov](https://codecov.io/gh/GuignardLab/sc3D/branch/main/graph/badge.svg)](https://codecov.io/gh/GuignardLab/sc3D)
  7 | [![Downloads](https://static.pepy.tech/badge/sc-3d)](https://pepy.tech/project/sc-3d)
  8 | 
  9 | # sc3D
 10 | 
 11 | __sc3D__ is a Python library to handle 3D spatial transcriptomic datasets.
 12 | 
 13 | __To access the 3D viewer for sc3D datasets, you can go to the following link: [GuignardLab/napari-sc3D-viewer](https://github.com/GuignardLab/napari-sc3D-viewer)__
 14 | 
 15 | You can find it on the Guignard Lab GitHub page: [GuignardLab/sc3D](https://github.com/GuignardLab/sc3D). In there you will be able to find jupyter notebooks giving examples about how to use the datasets.
 16 | 
 17 | This code was developed in the context of the following study:
 18 | 
 19 | [__Spatial transcriptomic maps of whole mouse embryos.__](https://www.nature.com/articles/s41588-023-01435-6) *Abhishek Sampath Kumar, Luyi Tian, Adriano Bolondi et al.*
 20 | 
 21 | The __sc3D__ code is based on the [anndata](https://anndata.readthedocs.io/en/latest/) and [Scanpy](https://scanpy.readthedocs.io/en/stable/) libraries and allows to read, register arrays and compute 3D differential expression.
 22 | 
 23 | The dataset necessary to run the tests and look at the results can be downloaded [there](https://figshare.com/s/9c73df7fd39e3ca5422d) for the unregistered dataset (and test the provided algorithms) and [there](https://figshare.com/s/1c29d867bc8b90d754d2) for the already registered atlas to browse with our visualiser. You can find the visualiser [there](https://www.github.com/guignardlab/napari-sc3d-viewer).
 24 | 
 25 | ## Description of the GitHub repository
 26 | 
 27 | - data: a folder containing examples for the tissue color and tissue name files
 28 | 
 29 | - src: a folder containing the source code
 30 | 
 31 | - txt: a folder containing the text describing the method (LaTeX, pdf and docx formats are available)
 32 | 
 33 | - README.md: this file
 34 | 
 35 | - notebooks/Test-embryo.ipynb: Basic read and write examples (many different ways of writing)
 36 | 
 37 | - notebooks/Spatial-differential-expression.ipynb: a jupyter notebook with some examples on how to perform the spatial differential expression
 38 | 
 39 | - notebooks/Embryo-registration.ipynb: a jupyter notebook with an example on how to do the array registration
 40 | 
 41 | - setup.py: Setup file to install the library
 42 | 
 43 | ## Installation
 44 | 
 45 | We strongly advise to use virtual environments to install this package. For example using conda or miniconda:
 46 | 
 47 | ```shell
 48 | conda create -n sc-3D
 49 | conda activate sc-3D
 50 | ```
 51 | 
 52 | If necessary, install `pip`:
 53 | 
 54 | ```shell
 55 | conda install pip
 56 | ```
 57 | 
 58 | Then, to install the latest stable version:
 59 | 
 60 | ```shell
 61 | pip install sc-3D
 62 | ```
 63 | 
 64 | or to install the latest version from the GitHub repository:
 65 | 
 66 | ```shell
 67 | git clone https://github.com/GuignardLab/sc3D.git
 68 | cd sc3D
 69 | pip install .
 70 | ```
 71 | 
 72 | ### Troubleshooting for latest M1 MacOs chips
 73 | 
 74 | If working with an M1 chip, it is possible that all the necessary libraries are not yet available from the usual channels.
 75 | 
 76 | To overcome this issue we recommand to manually install the latest, GitHub version of __sc3D__ using [miniforge](https://github.com/conda-forge/miniforge) instead of anaconda or miniconda.
 77 | 
 78 | Once miniforge is installed and working, you can run the following commands:
 79 | 
 80 | ```shell
 81 | conda create -n sc-3D
 82 | conda activate sc-3D
 83 | ```
 84 | 
 85 | to create your environment, then:
 86 | 
 87 | ```shell
 88 | git clone https://github.com/GuignardLab/sc3D.git
 89 | cd sc3D
 90 | conda install pip scipy numpy matplotlib pandas seaborn anndata napari
 91 | pip install .
 92 | ```
 93 | 
 94 | If the previous commands are still not working, it is possible that you need to install the `pkg-config` package. You can find some information on how to do it there: [install pkg-config](https://gist.github.com/jl/9e5ebbc9ccf44f3c804e)
 95 | 
 96 | ## Basic usage
 97 | 
 98 | Once installed, the library can be called the following way:
 99 | 
100 | ```python
101 | from sc3D import Embryo
102 | ```
103 | 
104 | To import some data:
105 | 
106 | **Note: at the time being, the following conventions are expected:**
107 | 
108 | - **the x-y coordinates are stored in `data.obsm['X_spatial']`**
109 | - **the array number should be stored in `data.obs['orig.ident']` in the format `".*_[0-9]*"` where the digits after the underscore (`_`) are the id of the array**
110 | - **the tissue type has to be stored in `data.obs['predicted.id']`**
111 | - **the gene names have to be stored as indices or in `data.var['feature_name']`**
112 | 
113 | Since version `0.1.2`, one can specify the name of the columns where the different necessary informations are stored using the following parameters:
114 | 
115 | - `tissue_id` to inform the tissue id column
116 | - `array_id` to inform the array/puck/slice id column
117 | - `pos_id` to inform the position column (an `x, y` position is expected within this given column)
118 | - `gene_name_id` to inform the gene name column
119 | - `pos_reg_id` when to inform the registered position column (an `x, y, z` position is expected within this given column)
120 | 
121 | ```python
122 | # To read the data
123 | embryo = Embryo('path/to/data.h5ad')
124 | 
125 | # To remove potential spatial outliers
126 | embryo.removing_spatial_outliers(th=outlier_threshold)
127 | 
128 | # To register the arrays and compute the
129 | # spline interpolations
130 | embryo.reconstruct_intermediate(embryo, th_d=th_d,
131 |                                 genes=genes_of_interest)
132 | 
133 | # To save the dataset as a registered dataset (to then look at it in the 3D visualizer)
134 | embryo.save_anndata('path/to/out/registered.h5ad')
135 | 
136 | # To compute the 3D differential expression for selected tissues
137 | tissues_to_process = [5, 10, 12, 18, 21, 24, 30, 31, 33, 34, 39]
138 | th_vol = .025
139 | _ = embryo.get_3D_differential_expression(tissues_to_process, th_vol)
140 | ```
141 | 
142 | The dataset used for the project this code is from can be downloaded [there](https://cellxgene.cziscience.com/collections/d74b6979-efba-47cd-990a-9d80ccf29055/private) (under the name `mouse_embryo_E8.5_merged_data`)
143 | 
144 | Many other functions are available that can be found used in the two provided jupyter notebooks.
145 | 
146 | ## Running the notebooks
147 | 
148 | Two example notebooks are provided.
149 | To run them one wants to first install the jupyter notebook:
150 | 
151 | ```shell
152 | conda install jupyter
153 | ```
154 | 
155 | or
156 | 
157 | ```shell
158 | pip install jupyter
159 | ```
160 | 
161 | The notebooks can the be started from a terminal in the folder containing the `.ipynb` files with the following command:
162 | 
163 | ```shell
164 | jupyter notebook
165 | ```
166 | 
167 | The notebooks should be self content.
168 | 
169 | Note that the test dataset is not included in this repository put can be downloaded from [there](https://cellxgene.cziscience.com/collections/d74b6979-efba-47cd-990a-9d80ccf29055/private).
170 | 


--------------------------------------------------------------------------------
/data/DLPFC.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/data/DLPFC.h5ad


--------------------------------------------------------------------------------
/data/corresptissues.json:
--------------------------------------------------------------------------------
 1 | {"0": "Extraembryonic ectoderm 1",
 2 | "1": "Anterior neuroectoderm",
 3 | "10": "Preplacodal ectoderm",
 4 | "11": "Posterior neuroectoderm",
 5 | "12": "Lateral plate mesoderm",
 6 | "13": "Hematopoietic progenitors",
 7 | "14": "Parietal endoderm",
 8 | "15": "Amnion mesoderm early",
 9 | "16": "Surface ectoderm",
10 | "18": "Somites",
11 | "2": "Primitive streak late",
12 | "20": "Splanchnic mesoderm",
13 | "21": "Heart",
14 | "22": "Primitive blood late",
15 | "23": "Notochord",
16 | "24": "Brain",
17 | "25": "Extraembryonic ectoderm 2",
18 | "26": "Neuromesodermal progenitors (NMPs)",
19 | "27": "Primordial germ cells (PGCs)",
20 | "28": "Differentiated trophoblast cells",
21 | "29": "Visceral endoderm early",
22 | "30": "Presomitic mesoderm (PSM)",
23 | "31": "Neuromesodermal Progenitors (NMPs)",
24 | "32": "Angioblasts",
25 | "33": "Neural crest",
26 | "34": "Pharyngeal arch",
27 | "35": "Mesenchyme",
28 | "36": "Primitive blood progenitors",
29 | "38": "Node",
30 | "39": "Neural tube",
31 | "4": "Primitive and definitive endoderm",
32 | "40": "Visceral endoderm late",
33 | "41": "Amnion mesoderm late",
34 | "5": "Allantois",
35 | "6": "Secondary heart field (SHF)",
36 | "7": "Gut",
37 | "9": "Primitive blood early"}


--------------------------------------------------------------------------------
/data/data_test.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/data/data_test.h5ad


--------------------------------------------------------------------------------
/data/tissuescolor.json:
--------------------------------------------------------------------------------
 1 | {"5": [0.7411764705882353, 0.803921568627451, 1.0],
 2 |  "6": [0.19607843137254902, 0.35294117647058826, 0.6078431372549019],
 3 |  "7": [0.996078431372549, 0.6862745098039216, 0.08627450980392157],
 4 |  "9": [0.7686274509803922, 0.27058823529411763, 0.10980392156862745],
 5 |  "10": [0.10980392156862745, 1.0, 0.807843137254902],
 6 |  "12": [0.7529411764705882, 0.4588235294117647, 0.6509803921568628],
 7 |  "13": [0.9647058823529412, 0.13333333333333333, 0.1803921568627451],
 8 |  "14": [0.7411764705882353, 0.43529411764705883, 0.6705882352941176],
 9 |  "15": [0.9686274509803922, 0.8823529411764706, 0.6274509803921569],
10 |  "16": [1.0, 0.9803921568627451, 0.9803921568627451],
11 |  "18": [0.47058823529411764, 0.16470588235294117, 0.7137254901960784],
12 |  "20": [0.5019607843137255, 0.5019607843137255, 0.5019607843137255],
13 |  "21": [0.9803921568627451, 0.0, 0.5294117647058824],
14 |  "22": [0.5098039215686274, 0.1803921568627451, 0.10980392156862745],
15 |  "23": [0.5215686274509804, 0.4, 0.050980392156862744],
16 |  "24": [0.803921568627451, 0.1607843137254902, 0.5647058823529412],
17 |  "27": [0.6588235294117647, 0.6588235294117647, 0.6588235294117647],
18 |  "29": [0.0, 0.0, 0.5450980392156862],
19 |  "30": [0.5450980392156862, 0.2784313725490196, 0.36470588235294116],
20 |  "31": [1.0, 0.7568627450980392, 0.1450980392156863],
21 |  "32": [0.8705882352941177, 0.6274509803921569, 0.9921568627450981],
22 |  "33": [0.19607843137254902, 0.5137254901960784, 0.996078431372549],
23 |  "34": [0.9725490196078431, 0.6313725490196078, 0.6235294117647059],
24 |  "35": [0.7098039215686275, 0.9372549019607843, 0.7098039215686275],
25 |  "36": [0.1803921568627451, 0.8509803921568627, 1.0],
26 |  "39": [0.10980392156862745, 0.5137254901960784, 0.33725490196078434],
27 |  "40": [1.0, 0.6470588235294118, 0.30980392156862746],
28 |  "41": [0.8470588235294118, 0.7490196078431373, 0.8470588235294118]}


--------------------------------------------------------------------------------
/notebooks/Embryo-Registration.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "1f3d24be",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import json\n",
 11 |     "from matplotlib import pyplot as plt\n",
 12 |     "import numpy as np\n",
 13 |     "from sc3D import Embryo\n",
 14 |     "\n",
 15 |     "%matplotlib inline"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "id": "a4252ded",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "# Setting up parameters\n",
 24 |     "Note that it is necessary to download the E8.5.h5ad dataset!\n",
 25 |     "\n",
 26 |     "The dataset can be found [there](https://figshare.com/s/9c73df7fd39e3ca5422d)."
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "id": "52716110",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "### Tissue name mapping and tissue colors can be loaded here"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "id": "06060a02",
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "corres_tissues = \"data/corresptissues.json\"\n",
 45 |     "with open(\"data/tissuescolor.json\") as f:\n",
 46 |     "    colors_paper = json.load(f)\n",
 47 |     "    colors_paper = {eval(k): v for k, v in colors_paper.items()}"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "markdown",
 52 |    "id": "17b256ba",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "### Definition of the variables"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "id": "5018ec89",
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "# Path to the input data\n",
 66 |     "data_path = \"data/srt90.h5ad\"\n",
 67 |     "# Path to the output folder\n",
 68 |     "output_folder = \"out/\"\n",
 69 |     "\n",
 70 |     "# Set of genes that will be interpolated\n",
 71 |     "genes_of_interest = [\"T\", \"Sox2\"]\n",
 72 |     "\n",
 73 |     "# List of tissues that are ignored to do coverslip registration\n",
 74 |     "tissues_to_ignore = []  # 13, 15, 16, 22, 27, 29, 32, 36, 40, 41]\n",
 75 |     "\n",
 76 |     "# Coverslips to ignore\n",
 77 |     "# on the starting side\n",
 78 |     "nb_CS_begin_ignore = 0\n",
 79 |     "# on the ending side\n",
 80 |     "nb_CS_end_ignore = 0\n",
 81 |     "\n",
 82 |     "# Gives more weight to some tissues to help the alignment\n",
 83 |     "tissue_weight = {21: 2000, 18: 2000}\n",
 84 |     "# xy resolution\n",
 85 |     "xy_resolution = 0.6\n",
 86 |     "# Distance max that two beads can be linked together between coverslips\n",
 87 |     "th_d = 150\n",
 88 |     "# Threshold bellow which the beads will be considered noise.\n",
 89 |     "# Value between 0 (all beads taken) and 1 (almost no beads taken)\n",
 90 |     "outlier_threshold = 0.6\n",
 91 |     "\n",
 92 |     "# Number of interpolated layers between two consecutive coverslips\n",
 93 |     "nb_interp = 5"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "id": "f26e9ac7",
 99 |    "metadata": {},
100 |    "source": [
101 |     "### Loading the embryo and removing some spatial outliers"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "id": "feca794e",
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "embryo = Embryo(\n",
112 |     "    data_path,\n",
113 |     "    tissues_to_ignore,\n",
114 |     "    corres_tissues,\n",
115 |     "    tissue_weight=tissue_weight,\n",
116 |     "    xy_resolution=xy_resolution,\n",
117 |     "    genes_of_interest=genes_of_interest,\n",
118 |     "    nb_CS_begin_ignore=nb_CS_begin_ignore,\n",
119 |     "    nb_CS_end_ignore=nb_CS_end_ignore,\n",
120 |     "    store_anndata=True,\n",
121 |     "    tissue_id=\"first_type\",\n",
122 |     ")\n",
123 |     "embryo.removing_spatial_outliers(th=outlier_threshold)"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "id": "d77bc007",
129 |    "metadata": {},
130 |    "source": [
131 |     "### Doing the spatial registration"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "id": "c30b6fb2",
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "embryo.registration_3d()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "id": "f22267c7",
147 |    "metadata": {},
148 |    "source": [
149 |     "## Saving the dataset\n",
150 |     "The following line allows to save the dataset so you can then open it using our viewer."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "id": "4441412c",
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "embryo.save_anndata(output_folder + \"/E9.0_registered.h5ad\")"
161 |    ]
162 |   }
163 |  ],
164 |  "metadata": {
165 |   "kernelspec": {
166 |    "display_name": "Python 3 (ipykernel)",
167 |    "language": "python",
168 |    "name": "python3"
169 |   },
170 |   "language_info": {
171 |    "codemirror_mode": {
172 |     "name": "ipython",
173 |     "version": 3
174 |    },
175 |    "file_extension": ".py",
176 |    "mimetype": "text/x-python",
177 |    "name": "python",
178 |    "nbconvert_exporter": "python",
179 |    "pygments_lexer": "ipython3",
180 |    "version": "3.9.13"
181 |   }
182 |  },
183 |  "nbformat": 4,
184 |  "nbformat_minor": 5
185 | }
186 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [tool.briefcase]
 6 | project_name = "sc-3D"
 7 | author = "Leo Guignard"
 8 | license = "MIT"
 9 | 
10 | [tool.black]
11 | line-length = 79
12 | 
13 | [tool.isort]
14 | profile = "black"
15 | line_length = 79
16 | 
17 | [tool.bumpver]
18 | current_version = "1.2.1"
19 | version_pattern = "MAJOR.MINOR.PATCH[-TAG]"
20 | commit_message = "bump version {old_version} -> {new_version}"
21 | commit = true
22 | tag = true
23 | push = false
24 | 
25 | [tool.bumpver.file_patterns]
26 | "pyproject.toml" = [
27 |     'current_version = "{version}"',
28 | ]
29 | "src/sc3D/__init__.py" = [
30 |     '__version__ = "{version}"',
31 | ]
32 | "setup.cfg" = [
33 |     'version = {version}',
34 | ]
35 | "CITATION.cff" = [
36 |     "version: {version}",
37 | ]


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = sc-3D
 3 | version = 1.2.1
 4 | author = Leo Guignard
 5 | author_email = leo.guignard@univ-amu.fr
 6 | url = https://github.com/GuignardLab/sc3D
 7 | license = MIT
 8 | description = Array alignment and 3D differential expression for 3D sc omics
 9 | long_description = file: README.md
10 | long_description_content_type = text/markdown
11 | summary = Array alignment and 3D differential expression for 3D sc omics
12 | classifiers =
13 |     Development Status :: 4 - Beta
14 |     Intended Audience :: Developers
15 |     Topic :: Software Development :: Testing
16 |     Programming Language :: Python
17 |     Programming Language :: Python :: 3
18 |     Programming Language :: Python :: 3.8
19 |     Programming Language :: Python :: 3.9
20 |     Programming Language :: Python :: 3.10
21 |     Operating System :: OS Independent
22 |     License :: OSI Approved :: MIT License
23 | project_urls =
24 |     Bug Tracker = https://github.com/GuignardLab/sc3D/issues
25 |     Documentation = https://github.com/GuignardLab/sc3D#README.md
26 |     Source Code = https://github.com/GuignardLab/sc3D
27 |     User Support = https://github.com/GuignardLab/sc3D/issues
28 |     Twitter = https://twitter.com/guignardlab
29 | python_requires = >=3.8
30 | classifier =
31 |     Operating System :: OS Independent
32 | 
33 | [options]
34 | packages = find:
35 | include_package_data = True
36 | python_requires = >=3.8
37 | package_dir =
38 |     =src
39 | 
40 | # add your package requirements here
41 | install_requires =
42 |     scipy
43 |     numpy
44 |     matplotlib
45 |     pandas
46 |     seaborn
47 |     scikit-learn
48 |     anndata
49 | 
50 | [options.extras_require]
51 | testing =
52 |     tox
53 |     pytest  # https://docs.pytest.org/en/latest/contents.html
54 |     pytest-cov  # https://pytest-cov.readthedocs.io/en/latest/
55 | 
56 | [options.packages.find]
57 | where = src


--------------------------------------------------------------------------------
/src/sc3D/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.2.1"
2 | 
3 | from .sc3D import Embryo
4 | from .transformations import transformations
5 | 


--------------------------------------------------------------------------------
/src/sc3D/_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/src/sc3D/_tests/__init__.py


--------------------------------------------------------------------------------
/src/sc3D/_tests/test_sc3D.py:
--------------------------------------------------------------------------------
 1 | from sc3D import Embryo
 2 | import numpy as np
 3 | 
 4 | 
 5 | def test_sc3D():
 6 |     em = Embryo("data/data_test.h5ad", store_anndata=True)
 7 |     assert len(em.all_cells) == 120
 8 |     em.smooth_data()
 9 |     em.plot_coverslip(7)
10 |     em.get_3D_differential_expression([21])
11 |     em.plot_top_3D_diff_expr_genes([21, 23], repetition_allowed=True)
12 |     em.plot_top_3D_diff_expr_genes([21, 23], repetition_allowed=False)
13 |     em.plot_volume_vs_neighbs(21)
14 |     em.removing_spatial_outliers()
15 |     em.compute_volumes()
16 |     em.set_zpos()
17 | 
18 |     em = Embryo(
19 |         f"data/DLPFC.h5ad",
20 |         tissue_id="layer_guess",
21 |         pos_id="spatial",
22 |         array_id="z",
23 |         z_space=30,
24 |         store_anndata=True,
25 |     )
26 |     em.produce_em()
27 |     em.registration_3d()
28 |     origin = np.mean([em.final[c] for c in em.all_cells], axis=0)
29 |     origin = np.hstack([origin, 80])
30 |     angles = np.array([-5.0, 5.0, 0.0])
31 |     points_to_plot = em.plot_slice(
32 |         angles, color_map="viridis", origin=origin, thickness=30, nb_interp=5
33 |     )
34 | 


--------------------------------------------------------------------------------
/src/sc3D/sc3D.py:
--------------------------------------------------------------------------------
   1 | #!python
   2 | """
   3 | This file is subject to the terms and conditions defined in
   4 | file 'LICENCE', which is part of this source code package.
   5 | Author: Leo Guignard (leo.guignard...@AT@...univ-amu.fr)
   6 | """
   7 | from collections import Counter
   8 | from itertools import combinations
   9 | import re
  10 | 
  11 | import numpy as np
  12 | import scipy as sp
  13 | from matplotlib import pyplot as plt
  14 | import pandas as pd
  15 | 
  16 | from scipy.spatial import KDTree, Delaunay
  17 | from scipy.spatial.distance import cdist
  18 | from scipy.optimize import linear_sum_assignment
  19 | from scipy.interpolate import InterpolatedUnivariateSpline, interp1d
  20 | from scipy.stats import zscore, linregress
  21 | from seaborn import scatterplot
  22 | import json
  23 | from pathlib import Path
  24 | from time import time
  25 | 
  26 | import anndata
  27 | from sc3D.transformations import transformations as tr
  28 | 
  29 | 
  30 | class Embryo:
  31 |     """
  32 |     Embryo class to handle samples from 3D spatial
  33 |     single cell omics. It was initially designed with
  34 |     a specific dataset in mind but it should work
  35 |     for other kinds of datasets.
  36 |     """
  37 | 
  38 |     def set_zpos(self):
  39 |         """
  40 |         Creates the dictionary containing
  41 |         the z position of the different beads
  42 |         """
  43 |         self.z_pos = {}
  44 |         self.pos_3D = {}
  45 |         cs_conversion = {
  46 |             b: a * self.z_space for a, b in enumerate(self.all_cover_slips)
  47 |         }
  48 |         for c in self.all_cells:
  49 |             self.z_pos[c] = cs_conversion[self.cover_slip[c]]
  50 |             x, y = self.pos[c]
  51 |             self.pos_3D[c] = np.array([x, y, self.z_pos[c]])
  52 | 
  53 |     def read_anndata(
  54 |         self,
  55 |         path,
  56 |         xy_resolution=1,
  57 |         genes_of_interest=None,
  58 |         store_anndata=False,
  59 |         tissue_id="predicted.id",
  60 |         array_id="orig.ident",
  61 |         array_id_num_pos=-1,
  62 |         pos_id="X_spatial",
  63 |         pos_reg_id="X_spatial_registered",
  64 |         gene_name_id="feature_name",
  65 |         sample_list=None,
  66 |     ):
  67 |         """
  68 |         Reads and loads a 3D spatial single cell
  69 |         omics dataset from an anndata file.
  70 | 
  71 |         Args:
  72 |             path (str): path to the csv or h5ad file
  73 |             xy_resolution (float): resolution of the xy coordinates
  74 |             genes_of_interest (list of str): list of genes to load
  75 |                 genes_of_interest lists the genes that can then be
  76 |                 interpolated between slices
  77 |             tissues_to_ignore (list of int): list of tissue ids that
  78 |                 will be ignored. The beads that have been assigned these
  79 |                 tissue types will not be loaded
  80 |             store_anndata (bool): whether or not to store the anndata
  81 |                 matrix. The matrix is necessary when looking for
  82 |                 differentially expressed genes
  83 |             tissue_id (str): string naming the column containing the tissue ids. The
  84 |                 tissue ids will be contained in `data.obs[tissue_id]`.
  85 |                 Default: 'predicted.id'
  86 |             array_id (str): string naming the column containing the array/puck/slice
  87 |                 id. It will determine the `z` position of the cell.
  88 |                 The array id will be contained in `data.obs[array_id]` as a number.
  89 |                 The number has to always be at the same position in the `str`,
  90 |                 a position that is determined by `array_id_num_pos`. For example
  91 |                 '20220101_4' with a `array_id_num_pos==1` will give an array id
  92 |                 of `4`.
  93 |                 Default: 'predicted.id'
  94 |             array_id_num_pos (int): See `array_id`.
  95 |                 Default: -1 (last number in the sequence)
  96 |             pos_id (str): string naming the column containing the x, y positions. The
  97 |                 x, y positions will be contained in `data.obsm[pos_id]`.
  98 |                 Default: 'X_spatial'
  99 |             pos_reg_id (str): string naming the column containing the x, y, z registered
 100 |                 positions. The x, y, z registered positions will be contained
 101 |                 in `data.obsm[pos_reg_id]`.
 102 |                 Default: 'X_spatial_registered'
 103 |             gene_name_id (str): string naming the column containing the gene names.
 104 |                 The gene names will be contained in `data.var[gene_name_id]`.
 105 |                 Default: 'feature_name'
 106 |             sample_list (list): list of samples when the dataset is split in multiple
 107 |                 h5/h5ad files. It can either be the path to each file or the id of
 108 |                 the slice. When it is only the id of the slices, it is then expected
 109 |                 to have the file pattern in the `path` as follow for example:
 110 |                 `path/to/file_{0}.h5ad` where the slice id should be at the position
 111 |                 `{0}`. The code will call `path.format(sample_list[0])` for example.
 112 |         """
 113 |         if sample_list is None or len(sample_list) <= 1:
 114 |             data = anndata.read(str(path))
 115 |         else:
 116 |             path = str(path)
 117 |             if path[path.find("{") :].find("}") != -1:
 118 |                 pathes = [path.format(s) for s in sample_list]
 119 |             elif Path(sample_list[0]).exists():
 120 |                 pathes = sample_list
 121 |             else:
 122 |                 for s in sample_list:
 123 |                     if s in path:
 124 |                         path = path.replace(s, "{0}")
 125 |                 pathes = [path.format(s) for s in sample_list]
 126 |             data = anndata.read_h5ad(pathes[0])
 127 |             data.obs[array_id] = [
 128 |                 1,
 129 |             ] * data.shape[0]
 130 |             for i, p in enumerate(pathes[1:]):
 131 |                 to_add = anndata.read_h5ad(p)
 132 |                 to_add.obs[array_id] = [
 133 |                     i + 2,
 134 |                 ] * to_add.shape[0]
 135 |                 data = anndata.concat([data, to_add])
 136 | 
 137 |         # if tissues_to_ignore is not None:
 138 |         #     data = data[~(data.obs[tissue_id].astype(int).isin(tissues_to_ignore))]
 139 |         if self.nb_CS_begin_ignore != 0 or self.nb_CS_end_ignore != 0:
 140 |             orig = sorted(set(data.obs[array_id]))
 141 |             cs_to_remove = (
 142 |                 orig[: self.nb_CS_begin_ignore]
 143 |                 + orig[-self.nb_CS_end_ignore :]
 144 |             )
 145 |             data = data[~(data.obs[array_id].isin(cs_to_remove))]
 146 |         if data.raw is not None:
 147 |             data.raw = data.raw.to_adata()
 148 |         else:
 149 |             data.raw = data.copy()
 150 |         ids = range(len(data))
 151 |         self.all_cells = list(ids)
 152 |         self.cell_names = dict(
 153 |             zip(
 154 |                 ids,
 155 |                 map(
 156 |                     lambda x, y: str.split(x, y)[-1],
 157 |                     data.obs_names,
 158 |                     "_" * len(data),
 159 |                 ),
 160 |             )
 161 |         )
 162 |         if pos_id in data.obsm:
 163 |             self.pos = dict(zip(ids, data.obsm[pos_id] * xy_resolution))
 164 |         try:
 165 |             self.tissue = dict(zip(ids, data.obs[tissue_id].astype(int)))
 166 |         except:
 167 |             tissues = data.obs[tissue_id].unique()
 168 |             tissue_map = dict(zip(tissues, range(len(tissues))))
 169 |             self.tissue = dict(
 170 |                 zip(ids, map(tissue_map.get, data.obs[tissue_id]))
 171 |             )
 172 |             self.corres_tissue = {v: k for k, v in tissue_map.items()}
 173 | 
 174 |         if gene_name_id in data.var:
 175 |             data.var.set_index(gene_name_id, inplace=True)
 176 |             if gene_name_id in data.raw.var:
 177 |                 data.raw.var.set_index(gene_name_id, inplace=True)
 178 |             else:
 179 |                 data.raw.var.set_index(data.var.index, inplace=True)
 180 |         if genes_of_interest is None:
 181 |             genes_of_interest = []
 182 |         elif genes_of_interest == "all":
 183 |             genes_of_interest = data.var_names
 184 |         self.all_genes = sorted(genes_of_interest)
 185 |         if 0 < len(genes_of_interest):
 186 |             self.gene_expression = dict(
 187 |                 zip(ids, np.array(data.raw[:, self.all_genes].X.A))
 188 |             )
 189 |             self.data = data.raw[:, self.all_genes].X.A
 190 |         else:
 191 |             self.gene_expression = {id_: [] for id_ in ids}
 192 | 
 193 |         self.array_id_num_pos = array_id_num_pos
 194 |         if array_id in data.obs_keys():
 195 |             if data.obs[array_id].dtype != int:
 196 |                 exp = re.compile("[0-9]+")
 197 |                 cs = list(
 198 |                     map(
 199 |                         lambda x: int(exp.findall(x)[array_id_num_pos]),
 200 |                         data.obs[array_id],
 201 |                     )
 202 |                 )
 203 |             else:
 204 |                 cs = data.obs[array_id]
 205 |             self.cover_slip = dict(zip(ids, cs))
 206 |             for c, cs in self.cover_slip.items():
 207 |                 self.cells_from_cover_slip.setdefault(cs, set()).add(c)
 208 |             self.all_cover_slips = sorted(set(self.cells_from_cover_slip))
 209 | 
 210 |         for c, T in self.tissue.items():
 211 |             self.cells_from_tissue.setdefault(T, set()).add(c)
 212 |         self.all_tissues = set(self.cells_from_tissue)
 213 |         if store_anndata:
 214 |             self.anndata = data
 215 |         if pos_reg_id in data.obsm:
 216 |             self.pos_3D = dict(zip(ids, data.obsm[pos_reg_id]))
 217 |         else:
 218 |             self.set_zpos()
 219 | 
 220 |     @staticmethod
 221 |     def rigid_transform_2D(A, B):
 222 |         """
 223 |         Given to lists of paired positions, computes the rigid
 224 |         transformation that minimizes between the paired positions.
 225 |         Shamefully copied from there:
 226 |         https://github.com/nghiaho12/rigid_transform_3D
 227 | 
 228 |         Args:
 229 |             A (2 x n ndarray): list of 2D positions
 230 |             B (2 x n ndarray): list of 2D positions
 231 |         Returns:
 232 |             M (4x4 ndarray): resulting rigid matrix
 233 |         """
 234 |         assert A.shape == B.shape
 235 | 
 236 |         num_rows, num_cols = A.shape
 237 |         if num_rows != 2:
 238 |             raise Exception(
 239 |                 f"matrix A is not 2xN, it is {num_rows}x{num_cols}"
 240 |             )
 241 | 
 242 |         num_rows, num_cols = B.shape
 243 |         if num_rows != 2:
 244 |             raise Exception(
 245 |                 f"matrix B is not 2xN, it is {num_rows}x{num_cols}"
 246 |             )
 247 | 
 248 |         # find mean column wise
 249 |         centroid_A = np.mean(A, axis=1)
 250 |         centroid_B = np.mean(B, axis=1)
 251 | 
 252 |         # ensure centroids are 3x1
 253 |         centroid_A = centroid_A.reshape(-1, 1)
 254 |         centroid_B = centroid_B.reshape(-1, 1)
 255 | 
 256 |         # subtract mean
 257 |         Am = A - centroid_A
 258 |         Bm = B - centroid_B
 259 | 
 260 |         H = Am @ np.transpose(Bm)
 261 | 
 262 |         # find rotation
 263 |         U, _, Vt = np.linalg.svd(H)
 264 |         R = Vt.T @ U.T
 265 | 
 266 |         # special reflection case
 267 |         if np.linalg.det(R) < 0:
 268 |             Vt[1, :] *= -1
 269 |             R = Vt.T @ U.T
 270 | 
 271 |         t = -R @ centroid_A + centroid_B
 272 |         M = np.identity(3)
 273 |         M[:2, :2] = R
 274 |         M[:2, -1:] = t
 275 | 
 276 |         return M
 277 | 
 278 |     def register(self, pos_ref, pos_flo, apply=False, rigid=False):
 279 |         """
 280 |         Computes and if asked, apply the transformation that minizes the
 281 |         distances between two sets of paired points. The computed transformation
 282 |         is always linear but can be rigid (rotation+translation) or
 283 |         affine (rigid+shearing)
 284 | 
 285 |         Args:
 286 |             pos_ref (2 x n ndarray): list of the reference 2D positions
 287 |             pos_flo (2 x n ndarray): list of 2D positions to transform
 288 |             apply (bool): if true, on top of returning the transformation
 289 |                 matrix, the function returns the transformed points.
 290 |                 Default: False
 291 |             rigid (bool): if true a rigid transformation is computed
 292 |                 otherwise an affine function is computed
 293 |         Returns:
 294 |             M (4 x 4 ndarray): resulting rigid matrix
 295 |             new_pos (2 x n ndarray): list of transformed `pos_flo`
 296 |                 positions. Only returned if `apply` is `True`
 297 |         """
 298 |         if rigid:
 299 |             M = self.rigid_transform_2D(pos_flo.T, pos_ref.T)
 300 |         else:
 301 |             try:
 302 |                 M = tr.affine_matrix_from_points(pos_flo.T, pos_ref.T)
 303 |             except Exception:
 304 |                 M = self.rigid_transform_2D(pos_flo.T, pos_ref.T)
 305 |         if apply:
 306 |             pos = np.pad(
 307 |                 pos_flo, ((0, 0), (0, 1)), "constant", constant_values=1
 308 |             ).T
 309 |             new_pos = np.dot(M, pos)[:2].T
 310 |             return (M, new_pos)
 311 |         return M
 312 | 
 313 |     def center_data(self):
 314 |         """
 315 |         Centers the dataset on 0.
 316 |         Stores the result in `self.centered_pos`
 317 |         Returns:
 318 |             (dict, int:[float, float]): a dictionnary that maps beads id to
 319 |                 their centered positions
 320 |         """
 321 |         for cells in self.cells_from_cover_slip.values():
 322 |             pos = np.array([self.pos[c] for c in cells])
 323 |             avg = np.mean(pos, axis=0)
 324 |             self.centered_pos.update(zip(cells, pos - avg))
 325 |         return self.centered_pos
 326 | 
 327 |     def get_tissue_centers(self):
 328 |         """
 329 |         Computes the center of mass of the different tissues
 330 |         within each puck. Stores the result in `self.tissue_centers`
 331 |         Returns:
 332 |             (dict puck_id:(dict (tissue_id, tissue_weight): float)):
 333 |                 dictionary that maps a puck id to another dictionary.
 334 |                 The second dictionary maps a tissue id and its weight
 335 |                 to the center of mass of the tissue in that puck
 336 |         """
 337 |         for cs, cells in self.cells_from_cover_slip.items():
 338 |             self.tissue_centers[cs] = {}
 339 |             tissues = {
 340 |                 t: cells.intersection(T)
 341 |                 for t, T in self.cells_from_tissue.items()
 342 |                 if not t in self.tissues_to_ignore
 343 |             }
 344 |             tissues[-1] = cells
 345 |             for tissue, c_tissue in tissues.items():
 346 |                 if len(c_tissue) > 2:
 347 |                     pos = [self.centered_pos[ci] for ci in c_tissue]
 348 |                     for w in range(self.tissue_weight.get(tissue, 1)):
 349 |                         self.tissue_centers[cs][(tissue, w)] = np.mean(
 350 |                             pos, axis=0
 351 |                         )
 352 |         return self.tissue_centers
 353 | 
 354 |     def build_and_apply_trsf_matrix(self, cs_ref, cs_flo):
 355 |         """
 356 |         Prepare the data, compute and apply the transformation that
 357 |         matches two pucks.
 358 | 
 359 |         Args:
 360 |             cs_ref (int): id of the reference puck
 361 |             cs_flo (int): if of the floating puck (that will be transformed)
 362 |         """
 363 |         # getting the shared tissue between the consecutive coverslips
 364 |         tissues_ref = set(self.tissue_centers[cs_ref].keys())
 365 |         tissues_flo = set(self.tissue_centers[cs_flo].keys())
 366 |         tissues_common = list(tissues_ref.intersection(tissues_flo))
 367 |         # getting the average position of the tissue to register
 368 |         pos_flo = np.array(
 369 |             [self.tissue_centers[cs_flo][t] for t in tissues_common]
 370 |         )
 371 |         # getting the average position of the reference tissue
 372 |         pos_ref = np.array(
 373 |             [self.tissue_centers_reg[cs_ref][t] for t in tissues_common]
 374 |         )
 375 |         # computing the transformation
 376 |         M = self.rigid_transform_2D(pos_flo.T, pos_ref.T)
 377 |         # M = self.register(pos_flo, pos_ref)
 378 | 
 379 |         # preping the floating positions for the trsf
 380 |         pos = np.pad(
 381 |             [
 382 |                 self.centered_pos[ci]
 383 |                 for ci in self.cells_from_cover_slip[cs_flo]
 384 |             ],
 385 |             ((0, 0), (0, 1)),
 386 |             "constant",
 387 |             constant_values=1,
 388 |         ).T
 389 |         if not hasattr(self, "init_trsf"):
 390 |             self.init_trsf = {}
 391 |         self.init_trsf[cs_flo] = M
 392 |         # applying the trsf
 393 |         new_pos = np.dot(M, pos)[:2].T
 394 |         # updating the position dictionary
 395 |         self.registered_pos.update(
 396 |             dict(zip(self.cells_from_cover_slip[cs_flo], new_pos))
 397 |         )
 398 | 
 399 |         # preping the floating tissue centers
 400 |         pos = np.pad(
 401 |             [
 402 |                 self.tissue_centers[cs_flo][t]
 403 |                 for t in self.tissue_centers[cs_flo]
 404 |             ],
 405 |             ((0, 0), (0, 1)),
 406 |             "constant",
 407 |             constant_values=1,
 408 |         ).T
 409 |         new_pos = np.dot(M, pos)[:2].T
 410 |         self.tissue_centers_reg[cs_flo] = dict(
 411 |             zip(self.tissue_centers[cs_flo], new_pos)
 412 |         )
 413 | 
 414 |     def register_with_tissues(self):
 415 |         """
 416 |         Register together all the pucks using tissue center of masses.
 417 |         """
 418 |         if self.centered_pos is None:
 419 |             self.centered_pos = {}
 420 |             self.center_data()
 421 |         if self.tissue_centers is None:
 422 |             self.tissue_centers = {}
 423 |             self.get_tissue_centers()
 424 |         cs_ref = self.all_cover_slips[0]
 425 |         self.tissue_centers_reg[cs_ref] = self.tissue_centers[cs_ref]
 426 |         self.registered_pos = {
 427 |             c: self.centered_pos[c] for c in self.cells_from_cover_slip[cs_ref]
 428 |         }
 429 |         for cs_flo in self.all_cover_slips[1:]:
 430 |             self.build_and_apply_trsf_matrix(cs_ref, cs_flo)
 431 |             cs_ref = cs_flo
 432 | 
 433 |     def build_pairing(self, cs1, cs2, rebuild=False, refine=False, th_d=None):
 434 |         """
 435 |         Build the pairing between beads from two pucks and stores it in the
 436 |         dictionary `pairing` that maps a bead id to the id of its paired bead.
 437 | 
 438 |         Args:
 439 |             cs1 (int): id of the first puck
 440 |             cs2 (int): id of the second puck
 441 |             rebuild (bool): if true the previously computed pairings are erased
 442 |                 Default: False (you should probably keep it that way)
 443 |             refine (bool): if true, uses the previously computed registration to
 444 |                 do the pairing (usually kept at False).
 445 |                 Default: False
 446 |             th_d (bool | float): threshold above which a pairing is discarded.
 447 |                 If th_d is a boolean, then the threshold is the median of the
 448 |                 distribution of all the distances. If th_d is a float the value
 449 |                 given is used as a threshold.
 450 |                 Usually used as a float.
 451 |         Returns:
 452 |             pos_ref (2 x n ndarray): list of positions that have been paired from
 453 |                 the first puck (`cs1`)
 454 |             pos_flo (2 x n ndarray): list of positions that have been paired from
 455 |                 the second puck (`cs2`)
 456 |         """
 457 |         if rebuild:
 458 |             self.pairing = {}
 459 |         pos_ref = []
 460 |         pos_flo = []
 461 |         tissues_to_treat = [
 462 |             t for t in self.all_tissues if not t in self.tissues_to_ignore
 463 |         ]
 464 |         for tissue in tissues_to_treat:
 465 |             cells_cs1 = np.array(
 466 |                 [
 467 |                     c
 468 |                     for c in self.cells_from_cover_slip[cs1]
 469 |                     if self.tissue[c] == tissue
 470 |                 ]
 471 |             )
 472 |             cells_cs2 = np.array(
 473 |                 [
 474 |                     c
 475 |                     for c in self.cells_from_cover_slip[cs2]
 476 |                     if self.tissue[c] == tissue
 477 |                 ]
 478 |             )
 479 |             positions_cs1 = np.array(
 480 |                 [self.final.get(c, self.registered_pos[c]) for c in cells_cs1]
 481 |             )
 482 |             if refine:
 483 |                 positions_cs2 = np.array(
 484 |                     [self.pos_reg_aff[c] for c in cells_cs2]
 485 |                 )
 486 |             else:
 487 |                 positions_cs2 = np.array(
 488 |                     [self.registered_pos[c] for c in cells_cs2]
 489 |                 )
 490 |             if len(positions_cs1) > 0 and len(positions_cs2) > 0:
 491 |                 distance = cdist(positions_cs1, positions_cs2)
 492 |                 copy_d = distance.copy()
 493 |                 if isinstance(th_d, bool):
 494 |                     th_d_tissue = np.max(distance) / 2
 495 |                     distance[th_d_tissue < distance] = np.inf
 496 |                 elif isinstance(th_d, (int, float)):
 497 |                     th_d_tissue = th_d
 498 |                     distance[th_d_tissue < distance] = np.inf
 499 |                 else:
 500 |                     th_d_tissue = np.inf
 501 |                 try:
 502 |                     pairing = linear_sum_assignment(distance)
 503 |                     pos_ref += list(positions_cs1[pairing[0]])
 504 |                     pos_flo += list(positions_cs2[pairing[1]])
 505 |                     self.pairing.update(
 506 |                         zip(cells_cs1[pairing[0]], cells_cs2[pairing[1]])
 507 |                     )
 508 |                 except Exception as e:
 509 |                     print("re-doing linear sum assignment :(")
 510 |                     pairing = linear_sum_assignment(copy_d)
 511 |                     pos_ref_tmp = positions_cs1[pairing[0]]
 512 |                     pos_flo_tmp = positions_cs2[pairing[1]]
 513 |                     distance_paired = np.linalg.norm(
 514 |                         np.array(pos_ref_tmp) - np.array(pos_flo_tmp), axis=1
 515 |                     ).reshape(-1, 1)
 516 |                     to_keep = (distance_paired < th_d_tissue).reshape(-1)
 517 |                     pos_ref_tmp = pos_ref_tmp[to_keep]
 518 |                     pos_flo_tmp = pos_flo_tmp[to_keep]
 519 |                     pos_ref += list(pos_ref_tmp)
 520 |                     pos_flo += list(pos_flo_tmp)
 521 |                     self.pairing.update(
 522 |                         zip(
 523 |                             cells_cs1[pairing[0][to_keep]],
 524 |                             cells_cs2[pairing[1][to_keep]],
 525 |                         )
 526 |                     )
 527 |         return pos_ref, pos_flo
 528 | 
 529 |     def register_cs(
 530 |         self,
 531 |         cs1,
 532 |         cs2,
 533 |         refine=False,
 534 |         rigid=False,
 535 |         final=False,
 536 |         th_d=None,
 537 |         timing=False,
 538 |     ):
 539 |         """
 540 |         Registers the puck `cs2` onto the puck `cs1`.
 541 | 
 542 |         Args:
 543 |             cs1 (int): id of the first puck
 544 |             cs2 (int): id of the second puck
 545 |             refine (bool): if true, uses the previously computed registration to
 546 |                 do the pairing (usually kept at False).
 547 |                 Default: False
 548 |             rebuild (bool): if true the previously computed pairings are erased
 549 |                 Default: False (you should probably keep it that way)
 550 |             final (bool): if True assumes that it is the final registration between
 551 |                 the two considered pucks (legacy, always True now).
 552 |                 Default: True
 553 |             th_d (bool | float): threshold above which a pairing is discarded.
 554 |                 If th_d is a boolean, then the threshold is the median of the
 555 |                 distribution of all the distances. If th_d is a float the value
 556 |                 given is used as a threshold.
 557 |                 Usually used as a float.
 558 | 
 559 |         """
 560 |         if timing:
 561 |             start = time()
 562 |             if not hasattr(self, "timing"):
 563 |                 self.timing = {}
 564 |             current_cs_timing = self.timing.setdefault((cs1, cs2), {})
 565 |         if self.registered_pos is None:
 566 |             self.register_with_tissues()
 567 |             if timing:
 568 |                 current_cs_timing["register_with_tissues"] = time() - start
 569 |                 start = time()
 570 |         if (self.final is None) and final:
 571 |             self.final = {
 572 |                 c: self.centered_pos[c]
 573 |                 for c in self.cells_from_cover_slip[cs1]
 574 |             }
 575 |         pos_ref, pos_flo = self.build_pairing(
 576 |             cs1, cs2, rebuild=False, refine=refine, th_d=th_d
 577 |         )
 578 |         if timing:
 579 |             current_cs_timing["build_pairing"] = time() - start
 580 |             start = time()
 581 |         M = self.register(
 582 |             np.array(pos_ref), np.array(pos_flo), apply=False, rigid=rigid
 583 |         )
 584 |         if timing:
 585 |             current_cs_timing["register"] = time() - start
 586 |             start = time()
 587 |         cells_cs2 = self.cells_from_cover_slip[cs2]
 588 |         if refine:
 589 |             positions_cs2 = np.array([self.pos_reg_aff[c] for c in cells_cs2])
 590 |         else:
 591 |             positions_cs2 = np.array(
 592 |                 [self.registered_pos[c] for c in cells_cs2]
 593 |             )
 594 |         pos = np.pad(
 595 |             positions_cs2, ((0, 0), (0, 1)), "constant", constant_values=1
 596 |         ).T
 597 |         new_pos = np.dot(M, pos)[:2].T
 598 |         self.pos_reg_aff.update(zip(cells_cs2, new_pos))
 599 |         if final:
 600 |             self.final.update(zip(cells_cs2, new_pos))
 601 |         if timing:
 602 |             current_cs_timing["apply"] = time() - start
 603 |             start = time()
 604 |         return M
 605 | 
 606 |     @staticmethod
 607 |     def build_gabriel_graph(node_ids, pos, data_struct="adj-dict", dist=False):
 608 |         """
 609 |         Build the gabriel graph of a set of nodes with
 610 |         associtated positions.
 611 | 
 612 |         Args:
 613 |             node_ids ([int, ] (size n)): list of node ids
 614 |             pos (n x m ndarray): ndarray of the positions where n is
 615 |                 the number of nodes and m is the spatial dimension
 616 |             data_struct (str): in which type of data structure will
 617 |                 the graph be saved, currently either 'adj-dict' and
 618 |                 'adj-mat' are supported.
 619 |                 'adj-dict': Adjacency dictionary
 620 |                 'adj-mat' : Adjacency matrix
 621 |             dist (bool)
 622 |         Returns:
 623 |             final_GG (dict id: set([ids, ])): the gabriel graph as
 624 |                 an adjacency list, a dictionary that maps node ids
 625 |                 to the list of neighboring node ids
 626 |         """
 627 |         if not data_struct in ["adj-dict", "adj-mat"]:
 628 |             raise ValueError(
 629 |                 "Data structure for the Gabriel graph not understood"
 630 |             )
 631 |         tmp = Delaunay(pos)
 632 |         delaunay_graph = {}
 633 | 
 634 |         for N in tmp.simplices:
 635 |             for e1, e2 in combinations(np.sort(N), 2):
 636 |                 delaunay_graph.setdefault(e1, set()).add(e2)
 637 |                 delaunay_graph.setdefault(e2, set()).add(e1)
 638 | 
 639 |         if data_struct.lower() == "adj-dict":
 640 |             Gabriel_graph = {}
 641 |             for e1, neighbs in delaunay_graph.items():
 642 |                 for ni in neighbs:
 643 |                     if not any(
 644 |                         np.linalg.norm((pos[ni] + pos[e1]) / 2 - pos[i])
 645 |                         < np.linalg.norm(pos[ni] - pos[e1]) / 2
 646 |                         for i in neighbs.intersection(delaunay_graph[ni])
 647 |                     ):
 648 |                         Gabriel_graph.setdefault(e1, set()).add(ni)
 649 |                         Gabriel_graph.setdefault(ni, set()).add(e1)
 650 | 
 651 |             final_GG = {}
 652 |             for e1, neighbs in Gabriel_graph.items():
 653 |                 neighbs = np.array(list(neighbs))
 654 |                 distances = np.linalg.norm(
 655 |                     pos[e1] - [pos[ni] for ni in neighbs], axis=1
 656 |                 )
 657 |                 final_GG[node_ids[e1]] = {
 658 |                     node_ids[ni]
 659 |                     for ni in neighbs[distances <= 5 * np.median(distances)]
 660 |                 }
 661 | 
 662 |         elif data_struct.lower() == "adj-mat":
 663 |             X, Y, val = [], [], []
 664 |             for e1, neighbs in delaunay_graph.items():
 665 |                 for ni in [n for n in neighbs if e1 < n]:
 666 |                     D = np.linalg.norm(pos[e1] - pos[ni])
 667 |                     if not any(
 668 |                         np.linalg.norm((pos[ni] + pos[e1]) / 2 - pos[i])
 669 |                         < D / 2
 670 |                         for i in neighbs.intersection(delaunay_graph[ni])
 671 |                     ):
 672 |                         X.append(node_ids[e1])
 673 |                         Y.append(node_ids[ni])
 674 |                         X.append(node_ids[ni])
 675 |                         Y.append(node_ids[e1])
 676 |                         if dist:
 677 |                             val.append(D)
 678 |                             val.append(D)
 679 |                         else:
 680 |                             val.append(True)
 681 |                             val.append(True)
 682 |             final_GG = sp.sparse.coo_array(
 683 |                 (val, (X, Y)), shape=(max(node_ids) + 1, max(node_ids) + 1)
 684 |             )
 685 |             final_GG = final_GG.tocsr()
 686 | 
 687 |         return final_GG
 688 | 
 689 |     def smooth_data(self, inplace=True):
 690 |         """
 691 |         Smooth the gene expression according to the spatial neighborhood relationship.
 692 |         The spatial neighborhood relationship is computed as the Gabriel graph.
 693 |         The smoothed expression (\(s_c \) of the gene $g$ of a cell $c$ which has
 694 |         a set of neighbors $N_c = \{n_i\}$ is computed as follow:
 695 |             $$s_c = \\frac{\sum_{n_i \in N_c} ||n_i - c||.g_{n_i}}{\sum_{n_i \in N_c} ||n_i - c||}$$
 696 |         where $||n_i - c||$ is the distance between $n_i$ and $c$ and $g_{n_i}$ is the measured
 697 |         expression intensity of the gene $g$ in the cell $n_i$.
 698 |         The result is stored in `self.anndata.raw` in place of the previous raw data if required.
 699 |         Otherwise, the smoothed matrix is returned.
 700 | 
 701 |         :WARNING: This function can be high CPU and memory taxing since it is multiplying
 702 |             the neighborhood adjacency matrix (nb_beadxnb_beads) by the gene expression matrix
 703 |             (nb_beadsxnb_genes)
 704 | 
 705 |         Args:
 706 |             inplace (bool): whether or not to replace the current data with the smoothed one
 707 |                 If `True`, the data will be saved in place of self.anndata.raw otherwise the
 708 |                 new smoothed matrix will be returned. Default: `True`
 709 | 
 710 |         Returns:
 711 | 
 712 |         """
 713 |         ids, pos = list(zip(*self.pos_3D.items()))
 714 |         GG = self.build_gabriel_graph(ids, pos, "adj-mat", dist=True)
 715 |         GG = GG.astype(
 716 |             np.float32
 717 |         ).toarray()  # Matrix multiplication "optimisation"
 718 |         gene_expr = self.anndata.raw.X.toarray()
 719 |         product = np.dot(GG, gene_expr)
 720 |         dist_sum = GG.sum(axis=1)
 721 |         product_n = product / dist_sum.reshape(-1, 1)
 722 |         product_sparse = sp.sparse.csr_array(product_n)
 723 |         tmp_raw = self.anndata.raw.to_adata()
 724 |         tmp_raw.X = product_sparse.toarray()
 725 |         if inplace:
 726 |             self.anndata.raw = tmp_raw
 727 |         else:
 728 |             return tmp_raw
 729 | 
 730 |     def downsample(self, spacing=10, pos_id="pos_3D"):
 731 |         """
 732 |         Downsample the sample slice by slice on a grid defined by `spacing`.
 733 |         For example, downsampling with a spacing of 10
 734 |         will create new slices with a distance between beads of 10 units.
 735 |         The expression of the new "beads" will be the average of the expression
 736 |         of the beads within a radius `spacing/2` from the coordinate of the new bead.
 737 |         The tissue assigned to the new bead is the tissue that is the most present within
 738 |         the orginal beads in the radius `spacing/2`.
 739 |         The function returns a anndata object (`out`) that can then be saved with the
 740 |         anndata `write` function: out.write("path_to_file.h5ad").
 741 | 
 742 |         Args:
 743 |             spacing (int): the spacing between the x and y coordinates in the new grid
 744 |             pos_id (str | dict): the position dictionary to take into account
 745 | 
 746 |         Returns:
 747 |             (anndata): the resampled anndata dataset with the following `.obs`:
 748 |                 - "nb_cells": The number of cells from the original dataset
 749 |                               collected for the "bead"
 750 |                 - `self.pos_reg_id`: the new 3D position of the "bead"
 751 |                 - `self.tissue_id`: the tissue/cluster of the "bead"
 752 |                 - `self.array_id`: the array/puck of the "bead"
 753 |         """
 754 |         from itertools import product
 755 | 
 756 |         first = True
 757 |         mapping_from_removed = np.arange(max(self.all_cells) + 1)
 758 |         mapping_from_removed[sorted(self.all_cells)] = np.arange(
 759 |             len(self.all_cells)
 760 |         )
 761 |         if isinstance(pos_id, str):
 762 |             pos_vals = self.__getattribute__(pos_id)
 763 |         else:
 764 |             pos_vals = pos_id
 765 |         for s, s_cells in self.cells_from_cover_slip.items():
 766 |             cells = [c for c in s_cells]
 767 |             pos = np.array([pos_vals[c][:2] for c in s_cells])
 768 |             min_x, min_y = np.min(pos, axis=0)
 769 |             max_x, max_y = np.max(pos, axis=0)
 770 |             x_coords = np.linspace(
 771 |                 min_x, max_x, int((max_x - min_x) // spacing)
 772 |             )
 773 |             y_coords = np.linspace(
 774 |                 min_y, max_y, int((max_y - min_y) // spacing)
 775 |             )
 776 |             coordinates = np.array(list(product(x_coords, y_coords)))
 777 |             z = self.pos_3D[list(s_cells)[0]][-1] / 4
 778 |             kdtree = KDTree(pos)
 779 |             mapping = kdtree.query_ball_point(coordinates, spacing / 2)
 780 |             final_positions = np.array(
 781 |                 [
 782 |                     list(coordinates[i]) + [z]
 783 |                     for i, v in enumerate(mapping)
 784 |                     if 0 < len(v)
 785 |                 ]
 786 |             )
 787 |             final_expr = np.array(
 788 |                 [
 789 |                     np.mean(
 790 |                         self.anndata.raw[
 791 |                             mapping_from_removed[[cells[vi] for vi in v]]
 792 |                         ].X.A,
 793 |                         axis=0,
 794 |                     )
 795 |                     for v in mapping
 796 |                     if 0 < len(v)
 797 |                 ]
 798 |             )
 799 |             nb_cells = np.array([len(v) for v in mapping if 0 < len(v)])
 800 |             tissue = [
 801 |                 np.unique(
 802 |                     [self.tissue[cells[vi]] for vi in v], return_counts=True
 803 |                 )
 804 |                 for v in mapping
 805 |                 if 0 < len(v)
 806 |             ]
 807 |             tissue = np.array(
 808 |                 [self.corres_tissue[v[0][np.argmax(v[1])]] for v in tissue]
 809 |             )
 810 |             if first:
 811 |                 out = anndata.AnnData(final_expr, var=self.anndata.raw.var)
 812 |                 out.obs["nb_cells"] = nb_cells
 813 |                 out.obsm[self.pos_reg_id] = final_positions
 814 |                 out.obs[self.tissue_id] = tissue
 815 |                 out.obs[self.array_id] = s
 816 |                 first = False
 817 |             else:
 818 |                 out_new = anndata.AnnData(final_expr, var=self.anndata.raw.var)
 819 |                 out_new.obs["nb_cells"] = nb_cells
 820 |                 out_new.obsm[self.pos_reg_id] = final_positions
 821 |                 out_new.obs[self.tissue_id] = tissue
 822 |                 out_new.obs[self.array_id] = s
 823 |                 out = anndata.concat([out, out_new])
 824 |         return out
 825 | 
 826 |     def plot_coverslip(
 827 |         self,
 828 |         cs,
 829 |         pos="pos",
 830 |         ax=None,
 831 |         tissues_to_plot=None,
 832 |         legend=False,
 833 |         color=None,
 834 |         cells=None,
 835 |         show=False,
 836 |         **kwargs,
 837 |     ):
 838 |         """
 839 |         Plot a puck
 840 | 
 841 |         Args:
 842 |             cs (int): id of the puck to plot
 843 |             pos (str): attribute defining the positions to plot.
 844 |                 Probably want to use 'final' since it is the registered
 845 |                 positions. Despite that, default is 'pos', the original
 846 |                 positions
 847 |             ax (matplotlib.AxesSubplot): can be provided to constrain the
 848 |                 plot
 849 |             tissues_to_plot ([t_id, ]): list of tissue ids to plot
 850 |             legend (bool): if True a legend is ploted.
 851 |                 Default: False
 852 |             color (dict t_id: [float, float, float]): a dictionary that
 853 |                 maps a tissue id to a given color. If `None`, then the default
 854 |                 matplotlib colors are used.
 855 |                 Default: None
 856 |             cells ([id, ]): list of bead ids to plot. If `cells` is provided
 857 |                 `tissues_to_plot` and `cs` are ignored
 858 |             kwargs : the kwargs are passed down to the matplotlib.scatterplot call
 859 |         Returns:
 860 |             fig (matplotlib.Figure): the created figure
 861 |             ax (matplotlib.AxesSubplot): the working axis
 862 |         """
 863 |         pre_existing_ax = ax is not None
 864 |         if not pre_existing_ax:
 865 |             fig, ax = plt.subplots()
 866 |         else:
 867 |             fig = ax.get_figure()
 868 |         if isinstance(pos, str):
 869 |             positions_attr = self.__getattribute__(pos)
 870 |         else:
 871 |             positions_attr = pos
 872 |         if tissues_to_plot is None and cells is None:
 873 |             cells = self.cells_from_cover_slip[cs]
 874 |         elif cells is None:
 875 |             cells = [
 876 |                 c
 877 |                 for c in self.cells_from_cover_slip[cs]
 878 |                 if self.tissue[c] in tissues_to_plot
 879 |             ]
 880 |         positions = np.array([positions_attr[c][:2] for c in cells])
 881 |         tissues = [self.tissue[c] for c in cells]
 882 |         if len(positions) < 1:
 883 |             return fig, ax
 884 |         scatter_args = {
 885 |             "marker": ".",
 886 |             "s": 25,
 887 |             "cmap": "tab20",
 888 |             "vmin": min(self.all_tissues),
 889 |             "vmax": max(self.all_tissues),
 890 |         }
 891 |         scatter_args.update(kwargs)
 892 |         if color is None:
 893 |             color = tissues
 894 |         elif isinstance(color, dict):
 895 |             color = [
 896 |                 color.get(
 897 |                     t,
 898 |                     [
 899 |                         0.8,
 900 |                     ]
 901 |                     * 3,
 902 |                 )
 903 |                 for t in tissues
 904 |             ]
 905 |         scatter = ax.scatter(*positions.T, c=color, **scatter_args)
 906 |         if legend:
 907 |             from matplotlib import colormaps
 908 | 
 909 |             cmap = colormaps[scatter_args["cmap"]]
 910 |             mapping = lambda v: (v - scatter_args["vmin"]) / (
 911 |                 scatter_args["vmax"] - scatter_args["vmin"]
 912 |             )
 913 |             for t in np.unique(tissues):
 914 |                 ax.plot(
 915 |                     [],
 916 |                     [],
 917 |                     linestyle="",
 918 |                     marker=scatter_args["marker"],
 919 |                     color=cmap(mapping(t)),
 920 |                     label=self.corres_tissue.get(t, t),
 921 |                 )
 922 |             ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
 923 |         if not pre_existing_ax:
 924 |             ax.set_aspect("equal")
 925 |             fig.tight_layout()
 926 |         if show:
 927 |             plt.show()
 928 |         return fig, ax
 929 | 
 930 |     def removing_spatial_outliers(self, th=0.2, n_components=3):
 931 |         """
 932 |         Removes spatial outliers given a threshold and a number of components
 933 | 
 934 |         Args:
 935 |             th (float): Likelyhood below which a bead is discarded.
 936 |                 Default: 0.2
 937 |             n_components (int): number of components for the gaussian mixture
 938 |                 model.
 939 |                 Default: 3 (probably should keep it that way. Lesser than 2 will
 940 |                             crash things)
 941 |         """
 942 |         from sklearn import mixture
 943 | 
 944 |         for t in self.all_tissues:
 945 |             c_to_d = {}
 946 |             cells_final = []
 947 |             for cells in self.cells_from_cover_slip.values():
 948 |                 cells_t = np.array(list(cells & self.cells_from_tissue[t]))
 949 |                 if len(cells_t) < 2:
 950 |                     continue
 951 |                 cells_final.extend(list(cells_t))
 952 |                 pos = [self.pos[c] for c in cells_t]
 953 |                 kdtree = KDTree(pos)
 954 |                 dist = list(kdtree.query(pos, k=2, workers=-1)[0][:, 1])
 955 |                 c_to_d.update(zip(cells_t, dist))
 956 |             if len(cells_final) < 10:
 957 |                 continue
 958 |             cells_final = np.array(cells_final)
 959 | 
 960 |             D = np.array([d for c, d in c_to_d.items()])
 961 |             gmm = mixture.GaussianMixture(
 962 |                 n_components=n_components,
 963 |                 max_iter=1000,
 964 |                 covariance_type="full",
 965 |             ).fit(D.reshape(-1, 1))
 966 |             order = np.argsort(gmm.means_, axis=0)
 967 |             proba0 = gmm.predict_proba(D.reshape(-1, 1))[:, order[0, 0]]
 968 |             proba1 = gmm.predict_proba(D.reshape(-1, 1))[:, order[1, 0]]
 969 |             self.filtered_cells.update(
 970 |                 cells_final[(th < proba0) | (th < proba1)]
 971 |             )
 972 |         self.all_cells = set(self.all_cells).intersection(self.filtered_cells)
 973 |         self.pos = {k: self.pos[k] for k in self.filtered_cells}
 974 |         self.tissue = {k: self.tissue[k] for k in self.filtered_cells}
 975 |         self.cover_slip = {k: self.cover_slip[k] for k in self.filtered_cells}
 976 |         self.cell_names = {k: self.cell_names[k] for k in self.filtered_cells}
 977 |         self.gene_expression = {
 978 |             k: self.gene_expression[k] for k in self.filtered_cells
 979 |         }
 980 |         l_all = list(self.all_cells)
 981 |         if hasattr(self, "anndata"):
 982 |             self.anndata = self.anndata[l_all]
 983 |             self.anndata.raw = self.anndata.raw.to_adata()
 984 |         for t, c in self.cells_from_cover_slip.items():
 985 |             c.intersection_update(self.filtered_cells)
 986 |         for t, c in self.cells_from_tissue.items():
 987 |             c.intersection_update(self.filtered_cells)
 988 | 
 989 |     @staticmethod
 990 |     def __apply_trsf(M, trans, points):
 991 |         """
 992 |         Apply a rotation from a 2x2 rotation matrix `M` together with
 993 |         a translation from a translation vector of length 2 `trans` to a list of
 994 |         `points`
 995 | 
 996 |         Args:
 997 |             M (nd.array): a 2x2 rotation matrix
 998 |             trans (nd.array): a translation vector of length 2
 999 |             points (nd.array): a nx2 array of `n` points 2D positions
1000 | 
1001 |         Returns:
1002 |             (nd.array) a 2xn matrix of the `n` points transformed
1003 |         """
1004 |         trsf = np.identity(3)
1005 |         trsf[:-1, :-1] = M
1006 |         tr = np.identity(3)
1007 |         tr[:-1, -1] = -trans
1008 |         trsf = trsf @ tr
1009 | 
1010 |         flo = points.T
1011 |         flo_pad = np.pad(flo, ((0, 1), (0, 0)), constant_values=1)
1012 |         return (trsf @ flo_pad)[:-1]
1013 | 
1014 |     def registration_3d(
1015 |         self,
1016 |         rigid=True,
1017 |         th_d=True,
1018 |         cs=None,
1019 |         timing=False,
1020 |         method=None,
1021 |         min_counts_genes=15,
1022 |         min_counts_cells=100,
1023 |         work_with_raw=True,
1024 |         alpha=0.1,
1025 |         pre_registration=True,
1026 |     ):
1027 |         """
1028 |         Compute the 3D registration of the dataset and store the result in
1029 |         `self.pos_3D`
1030 | 
1031 |         Args:
1032 |             rigid (bool): whether looking for a rigid transformation or an affine
1033 |                 one. Note that it is NOT recommended to look for an affine
1034 |                 transformation since it will change the geometry of the slices.
1035 |                 Default: True
1036 |             th_d (bool | float): threshold above which a pairing is discarded.
1037 |                 If th_d is a boolean, then the threshold is the median of the
1038 |                 distribution of all the distances. If th_d is a float the value
1039 |                 given is used as a threshold.
1040 |                 Usually used as a float.
1041 |                 Default: True
1042 |             cs ([p_id, ]): list of array ids to treat. If None, then all the arrays
1043 |                 are treated.
1044 |                 Default: None
1045 |             timing (bool | str | Path): whether to save the timing of the array alignments.
1046 |                 If False it is not saved. If True it is saved in a file named `timing.txt`.
1047 |                 If it is a string it is save in the folder described by the string
1048 |                 (creating it/them if necessary).
1049 |                 Default: False
1050 |             method (str [None, 'sc3D', 'paste']): method to use to perform the alignment.
1051 |                 If `'paste'` is asked, the `paste` library must be installed first (from
1052 |                 Zeira et al. Nat Methods 19, 567–575 (2022).
1053 |                 https://doi.org/10.1038/s41592-022-01459-6).
1054 |                 If None then the default method used is `'sc3D'`.
1055 |                 Default: None
1056 |             min_counts_genes (int): threshold for the PASTE method
1057 |             min_counts_cells (int): threshold for the PASTE method
1058 |             work_with_raw (bool): whether to work with the raw data or not.
1059 |                 Only useful for the PASTE method
1060 |         """
1061 |         if cs is not None:
1062 |             cs_to_treat = cs
1063 |         else:
1064 |             cs_to_treat = self.all_cover_slips
1065 |         if self.z_pos is None or set(self.z_pos) != set(self.all_cells):
1066 |             self.set_zpos()
1067 |         if timing:
1068 |             start = current_time = time()
1069 |             times = []
1070 |         self.trsfs = {}
1071 |         if isinstance(method, str) and method.lower() == "paste":
1072 |             try:
1073 |                 import paste as pst
1074 |             except:
1075 |                 print("could not import PASTE, aborting")
1076 |                 return
1077 |             try:
1078 |                 import scanpy as sc
1079 | 
1080 |                 sc_imp = True
1081 |             except:
1082 |                 sc_imp = False
1083 |                 print(
1084 |                     "scanpy could not be loaded\n"
1085 |                     "no filtering will be applied"
1086 |                 )
1087 |             if work_with_raw:
1088 |                 raw_data = self.anndata.raw.to_adata()
1089 |             else:
1090 |                 raw_data = self.anndata.copy()
1091 |             if sc_imp:
1092 |                 if min_counts_genes != None:
1093 |                     filter_1 = sc.pp.filter_genes(
1094 |                         raw_data, min_counts=min_counts_genes, inplace=False
1095 |                     )
1096 |                 if min_counts_cells != None:
1097 |                     if min_counts_genes != None:
1098 |                         filter_2 = sc.pp.filter_cells(
1099 |                             raw_data[:, filter_1[0]],
1100 |                             min_counts=min_counts_cells,
1101 |                             inplace=False,
1102 |                         )
1103 |                     else:
1104 |                         filter_2 = sc.pp.filter_cells(
1105 |                             raw_data,
1106 |                             min_counts=min_counts_cells,
1107 |                             inplace=False,
1108 |                         )
1109 |                 if min_counts_genes != None:
1110 |                     if min_counts_cells != None:
1111 |                         M_raw_filtered = raw_data[
1112 |                             filter_2[0], filter_1[0]
1113 |                         ].copy()
1114 |                     else:
1115 |                         M_raw_filtered = raw_data[:, filter_1[0]].copy()
1116 |                 elif min_counts_cells != None:
1117 |                     M_raw_filtered = raw_data[filter_2[0], :].copy()
1118 |                 else:
1119 |                     M_raw_filtered = raw_data.copy()
1120 |             else:
1121 |                 M_raw_filtered = raw_data.copy()
1122 | 
1123 |             all_cs_names = sorted(self.anndata.obs[self.array_id].unique())
1124 |             exp = re.compile("[0-9]+")
1125 |             init_cs_numbers = [
1126 |                 int(exp.findall(x)[self.array_id_num_pos])
1127 |                 for x in all_cs_names
1128 |             ]
1129 |             num_to_name = dict(zip(init_cs_numbers, all_cs_names))
1130 |             M_raw_filtered.obsm["spatial"] = M_raw_filtered.obsm[self.pos_id]
1131 |             slices_id = sorted(cs_to_treat)
1132 |             slices = []
1133 |             for s_id in slices_id:
1134 |                 slices.append(
1135 |                     M_raw_filtered[
1136 |                         M_raw_filtered.obs[self.array_id] == num_to_name[s_id]
1137 |                     ]
1138 |                 )
1139 |             if timing:
1140 |                 start = time()
1141 |             times = []
1142 |             pis = []
1143 |             if timing:
1144 |                 current_time = time()
1145 |             for i, (s1, s2) in enumerate(zip(slices[:-1], slices[1:])):
1146 |                 if pre_registration:
1147 |                     pi0 = pst.match_spots_using_spatial_heuristic(
1148 |                         s1.obsm[self.pos_id], s2.obsm[self.pos_id], use_ot=True
1149 |                     )
1150 |                 else:
1151 |                     pi0 = None
1152 |                 pis.append(
1153 |                     pst.pairwise_align(
1154 |                         s1, s2, alpha=alpha, G_init=pi0, norm=True
1155 |                     )
1156 |                 )
1157 |                 if timing:
1158 |                     times.append([i, i + 1, time() - current_time])
1159 |                     current_time = time()
1160 | 
1161 |             _, M, trans = pst.stack_slices_pairwise(
1162 |                 slices, pis, output_params=True, matrix=True
1163 |             )
1164 |             if timing:
1165 |                 times.append([-1, -1, time() - start])
1166 |             for i, s_id in enumerate(slices_id):
1167 |                 print(f"Applying {i}th in {s_id}")
1168 |                 if i == 0:
1169 |                     m = np.identity(2)
1170 |                 else:
1171 |                     m = M[i - 1]
1172 |                 trsf = np.identity(3)
1173 |                 trsf[:-1, :-1] = M
1174 |                 tr = np.identity(3)
1175 |                 tr[:-1, -1] = -trans
1176 |                 trsf = trsf @ tr
1177 |                 self.trsfs[s_id] = trsf
1178 |                 cell_slice = list(self.cells_from_cover_slip[s_id])
1179 |                 points = np.array([self.pos[c] for c in cell_slice])
1180 |                 registered = self.__apply_trsf(m, trans[i], points)
1181 |                 z_space = self.z_pos[cell_slice[0]]
1182 |                 reg_pos = np.vstack(
1183 |                     (
1184 |                         registered,
1185 |                         [
1186 |                             z_space,
1187 |                         ]
1188 |                         * registered.shape[1],
1189 |                     )
1190 |                 ).T
1191 |                 self.pos_3D.update(dict(zip(cell_slice, reg_pos)))
1192 | 
1193 |         else:
1194 |             self.GG_cs = {}
1195 |             self.KDT_cs = {}
1196 |             for i, cs1 in enumerate(cs_to_treat[:-1]):
1197 |                 cs2 = cs_to_treat[i + 1]
1198 |                 self.trsfs[cs2] = self.register_cs(
1199 |                     cs1, cs2, rigid=rigid, final=True, th_d=th_d, timing=timing
1200 |                 )
1201 |                 if timing:
1202 |                     times.append([cs1, cs2, time() - current_time])
1203 |                     current_time = time()
1204 | 
1205 |             if timing:
1206 |                 times.append([-1, -1, time() - start])
1207 | 
1208 |             self.pos_3D = {
1209 |                 c: np.array(list(self.final[c]) + [self.z_pos[c]])
1210 |                 for c in self.all_cells
1211 |                 if self.cover_slip[c] in cs_to_treat
1212 |             }
1213 | 
1214 |         if timing:
1215 |             if isinstance(timing, str) or isinstance(timing, Path):
1216 |                 p = Path(timing)
1217 |                 if p.is_dir() and p.exists():
1218 |                     np.savetxt(p / "timing.txt", times)
1219 |                 elif p.parent.exists():
1220 |                     np.savetxt(p, times)
1221 |                 elif p.parent.mkdir(parents=True):
1222 |                     np.savetxt(p, times)
1223 |             else:
1224 |                 np.savetxt("timing.txt", times)
1225 | 
1226 |     def reconstruct_intermediate(
1227 |         self, rigid=True, th_d=True, cs=None, multicore=True, genes=None
1228 |     ):
1229 |         """
1230 |         Register all pucks against each other and build the interpolation splines
1231 | 
1232 |         Args:
1233 |             rigid (bool): if True, a rigid transformation is computed and applied.
1234 |                 Otherwise it is an affine transformation.
1235 |                 Default: True
1236 |             th_d (bool | float): threshold above which a pairing is discarded.
1237 |                 If th_d is a boolean, then the threshold is the median of the
1238 |                 distribution of all the distances. If th_d is a float the value
1239 |                 given is used as a threshold. Usually used as a float.
1240 |             cs ([p_id, ]): list of puck ids to treat. If None, then all the pucks
1241 |                 are treated.
1242 |                 Default: None
1243 |             multicore (bool): useless at the time being. Maybe one day ...
1244 |             genes ([str, ]): gene names that will be interpolated
1245 |         """
1246 |         disapear_bounds = (0.1, 0.5, 0.9)
1247 |         if not hasattr(self, "final"):
1248 |             self.registration_3d(rigid=rigid, th_d=th_d, cs=cs)
1249 |         if cs is not None:
1250 |             cs_to_treat = cs
1251 |         else:
1252 |             cs_to_treat = self.all_cover_slips
1253 |         if self.z_pos is None or set(self.z_pos) != set(self.all_cells):
1254 |             self.set_zpos()
1255 |         self.GG_cs = {}
1256 |         self.KDT_cs = {}
1257 |         for i, cs1 in enumerate(cs_to_treat[:-1]):
1258 |             cs2 = cs_to_treat[i + 1]
1259 |             self.register_cs(cs1, cs2, rigid=rigid, final=True, th_d=th_d)
1260 |         for csi in cs_to_treat:
1261 |             cids = list(self.cells_from_cover_slip[csi])
1262 |             pos = [self.final[c] for c in cids]
1263 |             self.GG_cs[csi] = self.build_gabriel_graph(cids, pos)
1264 |         paths = []
1265 |         inv_pairing = {v: k for k, v in self.pairing.items()}
1266 |         roots = set(self.pairing).difference(inv_pairing)
1267 |         for c in roots:
1268 |             p = [c]
1269 |             while p[-1] in self.pairing:
1270 |                 p.append(self.pairing[p[-1]])
1271 |             paths.append(p)
1272 | 
1273 |         unmapped_down = set(self.all_cells) - set(inv_pairing)
1274 |         unmapped_down.difference_update(
1275 |             self.cells_from_cover_slip[min(self.all_cover_slips)]
1276 |         )
1277 |         unmapped_up = set(self.all_cells).difference(self.pairing)
1278 |         unmapped_up.difference_update(
1279 |             self.cells_from_cover_slip[max(self.all_cover_slips)]
1280 |         )
1281 | 
1282 |         self.KDT_cs_down = {}
1283 |         self.paired_cs_down = {}
1284 |         for csi in cs_to_treat[1:]:
1285 |             self.paired_cs_down[csi] = set(
1286 |                 self.cells_from_cover_slip[csi]
1287 |             ) & set(inv_pairing)
1288 |             self.paired_cs_down[csi] = np.array(list(self.paired_cs_down[csi]))
1289 |             pos = [self.final[c] for c in self.paired_cs_down[csi]]
1290 |             self.KDT_cs_down[csi] = KDTree(pos)
1291 | 
1292 |         arrival_down = {}
1293 |         d_to_closest_down = {}
1294 |         for c in unmapped_down:
1295 |             csi = self.cover_slip[c]
1296 |             neighbs = self.GG_cs[csi].get(c).difference(unmapped_down)
1297 |             if len(neighbs) < 1:
1298 |                 neighbs = [
1299 |                     self.paired_cs_down[csi][
1300 |                         self.KDT_cs_down[csi].query(self.final[c], 1)[1]
1301 |                     ]
1302 |                 ]
1303 |             arrival_down[c] = np.mean(
1304 |                 [self.final[inv_pairing[ni]] for ni in neighbs], axis=0
1305 |             )
1306 |             d_to_closest_down[c] = np.mean(
1307 |                 [
1308 |                     np.linalg.norm(self.final[c] - self.final[ni])
1309 |                     for ni in neighbs
1310 |                 ]
1311 |             )
1312 | 
1313 |         self.KDT_cs_up = {}
1314 |         self.paired_cs_up = {}
1315 |         for csi in cs_to_treat[:-1]:
1316 |             self.paired_cs_up[csi] = set(
1317 |                 self.cells_from_cover_slip[csi]
1318 |             ) & set(self.pairing)
1319 |             self.paired_cs_up[csi] = np.array(list(self.paired_cs_up[csi]))
1320 |             pos = [self.final[c] for c in self.paired_cs_up[csi]]
1321 |             self.KDT_cs_up[csi] = KDTree(pos)
1322 | 
1323 |         arrival_up = {}
1324 |         d_to_closest_up = {}
1325 |         for c in unmapped_up:
1326 |             csi = self.cover_slip[c]
1327 |             neighbs = self.GG_cs[csi].get(c).difference(unmapped_up)
1328 |             if len(neighbs) < 1:
1329 |                 neighbs = [
1330 |                     self.paired_cs_up[csi][
1331 |                         self.KDT_cs_up[csi].query(self.final[c], 1)[1]
1332 |                     ]
1333 |                 ]
1334 |             arrival_up[c] = np.mean(
1335 |                 [self.final[self.pairing[ni]] for ni in neighbs], axis=0
1336 |             )
1337 |             d_to_closest_up[c] = np.mean(
1338 |                 [
1339 |                     np.linalg.norm(self.final[c] - self.final[ni])
1340 |                     for ni in neighbs
1341 |                 ]
1342 |             )
1343 | 
1344 |         d_to_closest_vals = list(d_to_closest_down.values()) + list(
1345 |             d_to_closest_up.values()
1346 |         )
1347 |         med_to_closest = np.median(d_to_closest_vals)
1348 |         min_to_closest = np.percentile(d_to_closest_vals, 1)
1349 |         max_to_closest = np.percentile(d_to_closest_vals, 99)
1350 |         end, mid, start = disapear_bounds
1351 |         dist_to_disapear = interp1d(
1352 |             [min_to_closest, med_to_closest, max_to_closest],
1353 |             [start, mid, end],
1354 |             bounds_error=False,
1355 |             fill_value=(start, end),
1356 |         )
1357 | 
1358 |         cells_to_treat = set(self.all_cells)
1359 |         all_trajs = {}
1360 |         if genes is not None and isinstance(genes, list):
1361 |             all_expr = {}
1362 |         elif genes is not None and not isinstance(genes, list):
1363 |             print("The genes to process have to be in a `list`")
1364 |             genes = None
1365 |             all_expr = []
1366 | 
1367 |         nb_skipped = 0
1368 |         while 0 < len(cells_to_treat):
1369 |             curr_cell = cells_to_treat.pop()
1370 |             traj = [curr_cell]
1371 |             while traj[0] in inv_pairing:
1372 |                 traj.insert(0, inv_pairing[traj[0]])
1373 |             while traj[-1] in self.pairing:
1374 |                 traj.append(self.pairing[traj[-1]])
1375 |             if len(traj) <= 1:
1376 |                 nb_skipped += 1
1377 |                 continue
1378 |             pos_traj = [self.final[c] for c in traj]
1379 |             z_traj = [self.z_pos[c] for c in traj]
1380 |             if traj[-1] in arrival_up:
1381 |                 pos_traj.append(arrival_up[traj[-1]])
1382 |                 D = dist_to_disapear(d_to_closest_up[traj[-1]])
1383 |                 z_traj.append(z_traj[-1] + D * self.z_space)
1384 |             if traj[0] in arrival_down:
1385 |                 pos_traj.insert(0, arrival_down[traj[0]])
1386 |                 D = dist_to_disapear(d_to_closest_down[traj[0]])
1387 |                 z_traj.insert(0, z_traj[0] - D * self.z_space)
1388 |             pos_traj_x, pos_traj_y = zip(*pos_traj)
1389 |             k_interp = min(3, len(pos_traj_x) - 1)
1390 |             f_traj_x = InterpolatedUnivariateSpline(
1391 |                 z_traj, pos_traj_x, k=k_interp, ext="const"
1392 |             )
1393 |             f_traj_y = InterpolatedUnivariateSpline(
1394 |                 z_traj, pos_traj_y, k=k_interp, ext="const"
1395 |             )
1396 |             if genes is not None:
1397 |                 for i, g in enumerate(genes):
1398 |                     if g in self.all_genes:
1399 |                         index = self.all_genes.index(g)
1400 |                         value_traj = [
1401 |                             self.gene_expression[c][index] for c in traj
1402 |                         ]
1403 |                         z_traj_g = [self.z_pos[c] for c in traj]
1404 |                         k_interp = min(3, len(z_traj_g) - 1)
1405 |                         f_traj_v = InterpolatedUnivariateSpline(
1406 |                             z_traj_g, value_traj, k=1, ext="const"
1407 |                         )
1408 |                     all_expr.setdefault(g, {}).update(
1409 |                         {traj[0]: [min(z_traj), max(z_traj), f_traj_v]}
1410 |                     )
1411 | 
1412 |             all_trajs[traj[0]] = [min(z_traj), max(z_traj), f_traj_x, f_traj_y]
1413 |             cells_to_treat -= set(traj)
1414 |         self.all_trajs = all_trajs
1415 |         if genes is not None:
1416 |             self.all_expr = all_expr
1417 | 
1418 |     def plot_slice(
1419 |         self,
1420 |         angle,
1421 |         color_map=None,
1422 |         rot_orig=None,
1423 |         origin=None,
1424 |         thickness=30,
1425 |         tissues=None,
1426 |         angle_unit="degree",
1427 |         nb_interp=5,
1428 |         output_path=None,
1429 |         gene=None,
1430 |         min_g1=None,
1431 |         min_g2=None,
1432 |         max_g1=None,
1433 |         max_g2=None,
1434 |         main_bi_color="g",
1435 |         figsize=(5, 5),
1436 |         path_scale=None,
1437 |         **kwargs,
1438 |     ):
1439 |         """
1440 |         Plot an arbitrarly oriented slice according to an angle a direction and an origin.
1441 | 
1442 |         Args:
1443 |             angle (float): angle of the rotation of the slice
1444 |             color_map (matplotlib.cmap): color map that will be applied
1445 |             rot_origin ([int, int, int]): 3D vector of the normal of the
1446 |                 rotation plan. If [0, 0, 1] is given the rotation will be
1447 |                 around the z axis
1448 |             origin ([int, int, int]): coordinates of center of the rotation
1449 |             thickness (float): thickness of the slice
1450 |             tissues ([t_id, ]): list of tissue ids to plot
1451 |             angle_unit (str): if `'degree'` the angle is treated as degrees.
1452 |                 Otherwise it is treated a radii
1453 |             nb_interp (int): number of pucks to interpolate in between
1454 |                 existing pucks
1455 |             output_path (str): path to the output figure
1456 |             gene (str | [str, str]): gene name to interpolate. If a list
1457 |                 of 2 strings is given gene colocalization is plotted
1458 |             min_g1/g2 (float): minimum threshold value for the first and
1459 |                 second genes when colocalization. If `None`, the 2nd
1460 |                 percentile of the gene expression is used as a threshold
1461 |             max_g1/g2 (float): maximum threshold value for the first and
1462 |                 second genes when colocalization. If `None`, the 98th
1463 |                 percentile of the gene expression is used as a threshold
1464 |             main_bi_color ('g' | 'r' | 'b'): when colocalization, two
1465 |                 colors are used green and red+blue ('g'), etc
1466 |             figsize (float, float): width and height of the figure given
1467 |                 to the function plt.figure
1468 |             path_scale (str): path to the figure that will contain the
1469 |                 scale for colocalization figures
1470 |             kwargs : the keyword args are forwarded to the scatterplot function
1471 |         Returns:
1472 |             points_to_plot (n x 2 ndarray): list of the positions of the points
1473 |                 that have been plotted
1474 |         """
1475 |         if tissues is None:
1476 |             tissues = self.all_tissues
1477 |         if angle_unit == "degree":
1478 |             angle = np.deg2rad(angle)
1479 |         if rot_orig is None:
1480 |             rot_orig = [0, 0, 1]
1481 |         if origin is None:
1482 |             origin = [0, 0, 0]
1483 |         x_angle, y_angle, z_angle = angle
1484 |         print(tr)
1485 |         rot_x = tr.rotation_matrix(x_angle, [1, 0, 0], origin)
1486 |         rot_y = tr.rotation_matrix(y_angle, [0, 1, 0], origin)
1487 |         rot_z = tr.rotation_matrix(z_angle, [0, 0, 1], origin)
1488 |         rot_composed = rot_x @ rot_y @ rot_z
1489 |         new_axis = (np.hstack([rot_orig, 1]) @ rot_composed)[:-1]
1490 |         equation = (
1491 |             lambda pos: np.sum(new_axis * pos, axis=1) - origin @ new_axis
1492 |         )
1493 |         if gene is not None and not isinstance(gene, str):
1494 |             if len(gene) == 1:
1495 |                 gene = gene[0]
1496 |                 points, color, *_ = self.produce_em(
1497 |                     nb_interp, tissues, gene=gene
1498 |                 )
1499 |                 color = np.array(color)
1500 |             else:
1501 |                 colors = []
1502 |                 for g in gene:
1503 |                     points, color, *_ = self.produce_em(
1504 |                         nb_interp, tissues, gene=g
1505 |                     )
1506 |                     colors.append(color)
1507 |                 C = np.array(colors)
1508 |                 if min_g1 is None:
1509 |                     min_g1 = np.percentile(C, 2, axis=1)[0]
1510 |                 if min_g2 is None:
1511 |                     min_g2 = np.percentile(C, 2, axis=1)[1]
1512 |                 if max_g1 is None:
1513 |                     max_g1 = np.percentile(C, 98, axis=1)[0]
1514 |                 if max_g2 is None:
1515 |                     max_g2 = np.percentile(C, 98, axis=1)[1]
1516 |                 norm = lambda C: (C - [[min_g1], [min_g2]]) / [
1517 |                     [max_g1 - min_g1],
1518 |                     [max_g2 - min_g2],
1519 |                 ]
1520 |                 V = norm(C)
1521 |                 V[V < 0] = 0
1522 |                 V[1 < V] = 1
1523 |                 final_C = np.zeros((len(colors[0]), 3))
1524 |                 on_channel = (
1525 |                     np.array(["r", "g", "b"]) == main_bi_color.lower()
1526 |                 ).astype(int)
1527 |                 final_C[:, 0] = V[on_channel[0]]
1528 |                 final_C[:, 1] = V[on_channel[1]]
1529 |                 final_C[:, 2] = V[on_channel[2]]
1530 |                 if path_scale:
1531 |                     scale_square = np.zeros((256, 256, 3))
1532 |                     V1 = np.linspace(0, max_g1, 256)
1533 |                     V2 = np.linspace(0, max_g2, 256)
1534 |                     VS = np.array([V1, V2])
1535 |                     VS = norm(VS)
1536 |                     VS[VS < 0] = 0
1537 |                     VS[1 < VS] = 1
1538 |                     scale_square[..., np.where(on_channel)[0][0]] = VS[0]
1539 |                     for ax in np.where(1 - on_channel)[0]:
1540 |                         scale_square[..., ax] = VS[1].reshape(-1, 1)
1541 |                     fig, ax = plt.subplots(figsize=(5, 5))
1542 |                     ax.imshow(scale_square.swapaxes(1, 0), origin="lower")
1543 |                     recap_g1 = lambda x: x * 255 / max_g1
1544 |                     recap_g2 = lambda x: x * 255 / max_g2
1545 |                     vals_g1 = np.arange(np.floor(max_g1) + 1, dtype=int)
1546 |                     vals_g2 = np.arange(np.floor(max_g2) + 1, dtype=int)
1547 |                     ax.set_xticks(recap_g1(vals_g1))
1548 |                     ax.set_yticks(recap_g2(vals_g2))
1549 |                     ax.set_xticklabels(vals_g1)
1550 |                     ax.set_yticklabels(vals_g2)
1551 |                     ax.set_xlabel(gene[0])
1552 |                     ax.set_ylabel(gene[1])
1553 |                     fig.tight_layout()
1554 |                     fig.savefig(path_scale)
1555 |         else:
1556 |             points, color, *_ = self.produce_em(nb_interp, tissues, gene=gene)
1557 |             color = np.array(color)
1558 |         points = np.array(points)
1559 |         dist_to_plan = equation(points)
1560 |         plan = np.abs(dist_to_plan) < thickness
1561 |         dist_to_plan = dist_to_plan[plan]
1562 |         points_to_plot = points[plan]
1563 |         points_to_plot = (
1564 |             np.hstack([points_to_plot, [[1]] * points_to_plot.shape[0]])
1565 |             @ rot_composed
1566 |         )[:, :-1]
1567 |         if gene is None:
1568 |             if isinstance(color_map, dict):
1569 |                 color_to_plot = np.array([color_map[c] for c in color[plan]])
1570 |             else:
1571 |                 color_to_plot = np.array([(0, 0, 0)] * len(color[plan]))
1572 |         elif not isinstance(gene, str):
1573 |             color_to_plot = final_C[plan]
1574 |         else:
1575 |             color_to_plot = color[plan]
1576 |         p_order = np.argsort(dist_to_plan)
1577 |         points_to_plot = points_to_plot[p_order]
1578 |         color_to_plot = color_to_plot[p_order]
1579 |         fig = plt.figure(figsize=figsize)
1580 |         ax = fig.add_subplot(1, 1, 1)  # , projection='3d')
1581 |         if gene is None:
1582 |             kwargs_scatter = {"s": 5, "color": color_to_plot}
1583 |         else:
1584 |             kwargs_scatter = {"s": 5, "c": color_to_plot}
1585 |         kwargs_scatter.update(kwargs)
1586 |         ax.scatter(*(points_to_plot.T[:-1]), **kwargs_scatter)
1587 |         ax.axis("equal")
1588 |         if output_path is not None:
1589 |             output_path = Path(output_path)
1590 |             if not output_path.parent.exists():
1591 |                 Path.mkdir(output_path.parent)
1592 |             fig.savefig(output_path)
1593 |         return points_to_plot
1594 | 
1595 |     def get_full_transformation(self, cs: int, homogeneous=True) -> np.ndarray:
1596 |         """
1597 |         Get the full transformation matrix for a slide
1598 | 
1599 |         Args:
1600 |             cs (int): cover slip id
1601 |             homogeneous (bool): whether to return the transformation matrix
1602 |                 as an homogeneous matrix or the decomposed version of the matrix
1603 |                 Default: True
1604 | 
1605 |         Returns:
1606 |             M (nd.array): 3x3 homogeneous coordinates transformation matrix
1607 |                 if `homogeneous` is True. Otherwise it dictionary with the
1608 |                 set of transformations (translation, rotation, scaling, shearing)
1609 |         """
1610 |         init_pos = np.array(
1611 |             [self.pos[c] for c in self.cells_from_cover_slip[cs]]
1612 |         )
1613 |         final_pos = np.array(
1614 |             [self.pos_3D[c][:-1] for c in self.cells_from_cover_slip[cs]]
1615 |         )
1616 | 
1617 |         M = self.rigid_transform_2D(init_pos.T, final_pos.T)
1618 |         pos = np.pad(
1619 |             init_pos, ((0, 0), (0, 1)), "constant", constant_values=1
1620 |         ).T
1621 |         new_pos = np.dot(M, pos)[:2].T
1622 |         if not np.allclose(new_pos, final_pos):
1623 |             print("Careful the transformation might not be correct")
1624 |         if homogeneous:
1625 |             return M
1626 |         else:
1627 |             tx, ty = M[0, 2], M[1, 2]
1628 | 
1629 |             # Extract the upper-left 2x2 matrix (contains rotation, scale, and shear)
1630 |             M = M[:2, :2]
1631 | 
1632 |             # Compute scale
1633 |             scale_x = np.linalg.norm(M[:, 0])
1634 |             scale_y = np.linalg.norm(M[:, 1])
1635 | 
1636 |             # Compute shear
1637 |             shear = np.dot(M[:, 0] / scale_x, M[:, 1] / scale_y)
1638 | 
1639 |             # Remove scale and shear to isolate rotation
1640 |             rotation_trsf = M / [scale_x, scale_y]
1641 |             rotation = np.arctan2(rotation_trsf[1, 0], rotation_trsf[0, 0])
1642 |             return {
1643 |                 "translation": [tx, ty],
1644 |                 "rotation": np.degrees(rotation),
1645 |                 "scale": [scale_x, scale_y],
1646 |                 "shear": shear,
1647 |             }
1648 | 
1649 |     def anndata_slice(
1650 |         self,
1651 |         output_path,
1652 |         angle,
1653 |         gene_list,
1654 |         rot_orig=None,
1655 |         origin=None,
1656 |         thickness=30,
1657 |         tissues=None,
1658 |         angle_unit="degree",
1659 |     ):
1660 |         """
1661 |         Build a anndata file containing a slice
1662 | 
1663 |         Args:
1664 |             output_path (str): path to the output ply file
1665 |             angle (float): angle of the rotation of the slice
1666 |             color_map (matplotlib.cmap): color map that will be applied
1667 |             rot_origin ([int, int, int]): 3D vector of the normal of the
1668 |                 rotation plan. If [0, 0, 1] is given the rotation will be
1669 |                 around the z axis
1670 |             origin ([int, int, int]): coordinates of center of the rotation
1671 |             thickness (float): thickness of the slice
1672 |             tissues ([t_id, ]): list of tissue ids to plot
1673 |             angle_unit (str): if `'degree'` the angle is treated as degrees.
1674 |                 Otherwise it is treated a radii
1675 |             gene_list ([str, ]): list of the gene names to interpolate
1676 |                 (only selected genes can be inputed)
1677 |         Returns:
1678 |             points_to_plot (n x 2 ndarray): list of the positions of the points
1679 |                 that have been plotted
1680 |         """
1681 |         if tissues is None:
1682 |             tissues = self.all_tissues
1683 |         if angle_unit == "degree":
1684 |             angle = np.deg2rad(angle)
1685 |         if rot_orig is None:
1686 |             rot_orig = [0, 0, 1]
1687 |         if origin is None:
1688 |             origin = [0, 0, 0]
1689 |         x_angle, y_angle, z_angle = angle
1690 |         rot_x = tr.rotation_matrix(x_angle, [1, 0, 0], origin)
1691 |         rot_y = tr.rotation_matrix(y_angle, [0, 1, 0], origin)
1692 |         rot_z = tr.rotation_matrix(z_angle, [0, 0, 1], origin)
1693 |         rot_composed = rot_x @ rot_y @ rot_z
1694 |         new_axis = (np.hstack([rot_orig, 1]) @ rot_composed)[:-1]
1695 |         equation = (
1696 |             lambda pos: np.sum(new_axis * pos, axis=1) - origin @ new_axis
1697 |         )
1698 |         points, colors, genes = self.produce_em(
1699 |             5, tissues_to_plot=None, gene_list=gene_list
1700 |         )
1701 |         points = np.array(points)
1702 |         colors = np.array(colors)
1703 |         genes = np.array(genes)
1704 |         plan = np.abs(equation(points)) < thickness
1705 |         points_to_plot = points[plan]
1706 |         points_to_plot = (
1707 |             np.hstack([points_to_plot, [[1]] * points_to_plot.shape[0]])
1708 |             @ rot_composed
1709 |         )[:, :-1]
1710 |         color_to_plot = colors[plan]
1711 |         genes_to_plot = genes.T[plan]
1712 |         df = pd.DataFrame(genes_to_plot, columns=gene_list)
1713 |         D = anndata.AnnData(df)
1714 |         D.obsm["X_Spatial"] = points_to_plot
1715 |         D.obs["predicted.id"] = [str(k) for k in color_to_plot]
1716 |         output_path = Path(output_path)
1717 |         if not output_path.parent.exists():
1718 |             Path.mkdir(output_path.parent)
1719 |         D.write(output_path)
1720 | 
1721 |         return points_to_plot
1722 | 
1723 |     def anndata_no_extra(
1724 |         self,
1725 |         output_path,
1726 |         angle,
1727 |         rot_orig=None,
1728 |         origin=None,
1729 |         thickness=30,
1730 |         angle_unit="degree",
1731 |     ):
1732 |         """
1733 |         Build a anndata file containing a slice without doing interpolation
1734 |         but any gene can be requested
1735 | 
1736 |         Args:
1737 |             output_path (str): path to the output `h5ad` file
1738 |             angle (float): angle of the rotation of the slice
1739 |             color_map (matplotlib.cmap): color map that will be applied
1740 |             rot_origin ([int, int, int]): 3D vector of the normal of the
1741 |                 rotation plan. If [0, 0, 1] is given the rotation will be
1742 |                 around the z axis
1743 |             origin ([int, int, int]): coordinates of center of the rotation
1744 |             thickness (float): thickness of the slice
1745 |             tissues ([t_id, ]): list of tissue ids to plot
1746 |             angle_unit (str): if `'degree'` the angle is treated as degrees.
1747 |                 Otherwise it is treated a radii
1748 |             nb_interp (int): number of pucks to interpolate in between
1749 |                 existing pucks
1750 |             gene_list ([str, ]): list of the gene names to interpolate
1751 |         """
1752 |         if angle_unit == "degree":
1753 |             angle = np.deg2rad(angle)
1754 |         if rot_orig is None:
1755 |             rot_orig = [0, 0, 1]
1756 |         if origin is None:
1757 |             origin = [0, 0, 0]
1758 |         x_angle, y_angle, z_angle = angle
1759 |         rot_x = tr.rotation_matrix(x_angle, [1, 0, 0], origin)
1760 |         rot_y = tr.rotation_matrix(y_angle, [0, 1, 0], origin)
1761 |         rot_z = tr.rotation_matrix(z_angle, [0, 0, 1], origin)
1762 |         rot_composed = rot_x @ rot_y @ rot_z
1763 |         new_axis = (np.hstack([rot_orig, 1]) @ rot_composed)[:-1]
1764 |         equation = (
1765 |             lambda pos: np.sum(new_axis * pos, axis=1) - origin @ new_axis
1766 |         )
1767 |         cells = np.array(sorted(self.all_cells))
1768 |         pos = np.array([list(self.final[c]) + [self.z_pos[c]] for c in cells])
1769 |         kept = cells[(np.abs(equation(pos)) < thickness)]
1770 |         data_tmp = self.anndata.copy()
1771 |         data_tmp = data_tmp[kept]
1772 |         pos_final = np.array(
1773 |             [list(self.final[c]) + [self.z_pos[c]] for c in kept]
1774 |         )
1775 |         pos_final = (
1776 |             np.hstack([pos_final, [[1]] * pos_final.shape[0]]) @ rot_composed
1777 |         )[:, :-1]
1778 |         data_tmp.obsm["X_spatial_registered"] = pos_final
1779 |         output_path = Path(output_path)
1780 |         if not output_path.parent.exists():
1781 |             Path.mkdir(output_path.parent)
1782 |         data_tmp.write(output_path)
1783 | 
1784 |     def save_anndata(self, output_path):
1785 |         """
1786 |         Save the registered dataset as an anndata file
1787 | 
1788 |         Args:
1789 |             output_path (str): path to the output anndata file ('.h5ad' file)
1790 |         """
1791 |         data_tmp = self.anndata.copy()
1792 |         all_c_sorted = sorted(self.all_cells)
1793 |         pos_final = np.array([self.pos_3D[c] for c in all_c_sorted])
1794 |         data_tmp.obsm["X_spatial_registered"] = pos_final
1795 |         output_path = Path(output_path)
1796 |         if not output_path.parent.exists():
1797 |             Path.mkdir(output_path.parent)
1798 |         data_tmp.write(output_path)
1799 | 
1800 |     def produce_em(
1801 |         self, nb_intra=5, tissues_to_plot=None, gene=None, gene_list=None
1802 |     ):
1803 |         """
1804 |         Interpolates beads from the previously computed splines and returns
1805 |         the list of the interpolated positions together with a list of values
1806 |         for each position corresponding either to the tissue id of the position
1807 |         or to the gene expression value if a gene name is provided.
1808 | 
1809 |         Args:
1810 |             nb_intra (int): number of interpolated slices to add between
1811 |                 real slices
1812 |             tissues_to_plot ([t_id, ]): list of tissue ids to interpolate
1813 |                 if `None` all tissues are interpolated
1814 |             gene (str): name of a gene to output its interpolated value
1815 |                 for each bead
1816 |             gene_list ([str, ]): list of gene names to interpolate. If
1817 |                 a gene list is given, the list gene_expr is returned.
1818 |                 The list contains list of gene expressions for the interpolated
1819 |                 beads. Only pre-selected genes can be inputed
1820 |         Returns:
1821 |             points (n x 3 ndarray): ndarray containing `n` bead positions
1822 |             colors (ndarray of length n): list of bead values. Tissue id
1823 |                 by default gene expression value if `gene` is not `None`.
1824 |             gene_expr (`len(gene_list)` x n ndarray): array of `colors` like
1825 |                 arrays containing gene expression of the genes queried in
1826 |                 `gene_list`
1827 |         """
1828 |         if len(self.z_pos) == 0:
1829 |             self.set_zpos
1830 |         old_spacing = sorted(set(self.z_pos.values()))
1831 |         new_spacing = np.linspace(
1832 |             min(old_spacing),
1833 |             max(old_spacing),
1834 |             len(old_spacing) + (len(old_spacing) - 1) * nb_intra,
1835 |         )
1836 | 
1837 |         if self.all_trajs is None:
1838 |             self.reconstruct_intermediate()
1839 |         points = []
1840 |         colors = []
1841 |         if gene_list is not None:
1842 |             gene_expr = [[] for _ in range(len(gene_list))]
1843 |         for c, (min_z, max_z, traj_x, traj_y) in self.all_trajs.items():
1844 |             if tissues_to_plot is None or self.tissue[c] in tissues_to_plot:
1845 |                 spacing = new_spacing[
1846 |                     (min_z <= new_spacing) & (new_spacing <= max_z)
1847 |                 ]
1848 |                 points.extend(zip(traj_x(spacing), traj_y(spacing), spacing))
1849 |                 if self.all_expr == {} or gene is None:
1850 |                     colors.extend([self.tissue[c]] * len(spacing))
1851 |                 else:
1852 |                     min_z, max_z, traj_expr = self.all_expr[gene][c]
1853 |                     colors.extend(traj_expr(spacing))
1854 |                 if gene_list is not None:
1855 |                     for g, L in zip(gene_list, gene_expr):
1856 |                         min_z, max_z, traj_expr = self.all_expr[g][c]
1857 |                         L.extend(traj_expr(spacing))
1858 |         if gene_list is not None:
1859 |             return points, colors, gene_expr
1860 |         return points, colors
1861 | 
1862 |     @staticmethod
1863 |     def threshold_otsu(values, nbins=256):
1864 |         """Return threshold value based on Otsu's method.
1865 |         Parameters
1866 |         ----------
1867 |         image : array
1868 |         Input image.
1869 |         nbins : int
1870 |         Number of bins used to calculate histogram. This value is ignored for
1871 |         integer arrays.
1872 |         Returns
1873 |         -------
1874 |         threshold : float
1875 |         Threshold value.
1876 |         References
1877 |         ----------
1878 |         .. [1] Wikipedia, http://en.wikipedia.org/wiki/Otsu's_Method
1879 |         """
1880 |         hist, bin_edges = np.histogram(values, nbins)
1881 |         bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.0
1882 | 
1883 |         hist = hist.astype(float)
1884 | 
1885 |         # class probabilities for all possible thresholds
1886 |         weight1 = np.cumsum(hist)
1887 |         weight2 = np.cumsum(hist[::-1])[::-1]
1888 |         # class means for all possible thresholds
1889 |         mean1 = np.cumsum(hist * bin_centers) / weight1
1890 |         mean2 = (np.cumsum((hist * bin_centers)[::-1]) / weight2[::-1])[::-1]
1891 | 
1892 |         # Clip ends to align class 1 and class 2 variables:
1893 |         # The last value of `weight1`/`mean1` should pair with zero values in
1894 |         # `weight2`/`mean2`, which do not exist.
1895 |         variance12 = weight1[:-1] * weight2[1:] * (mean1[:-1] - mean2[1:]) ** 2
1896 | 
1897 |         idx = np.argmax(variance12)
1898 |         threshold = bin_centers[:-1][idx]
1899 |         return threshold
1900 | 
1901 |     def compute_expr_thresholds(self):
1902 |         """
1903 |         Compute the expression threshold for all genes
1904 | 
1905 |         Returns:
1906 |             th ([float, ] ndarray): list of thresholds for each genes
1907 |                 following the same order as the gene order in `self.anndata`
1908 |         """
1909 |         if self.all_genes:
1910 |             out = map(self.threshold_otsu, self.anndata.raw.X.toarray().T)
1911 |         elif sp.sparse.issparse(self.anndata.X):
1912 |             out = map(self.threshold_otsu, self.anndata.X.toarray().T)
1913 |         else:
1914 |             out = map(self.threshold_otsu, self.anndata.X.T)
1915 |         th = []
1916 |         for o in out:
1917 |             th += [o]
1918 |         th = np.array(th)
1919 |         return th
1920 | 
1921 |     def neighbs(self, gene, sub_data, cells):
1922 |         """
1923 |         Compute the average number of positive neighbors for the positive cells
1924 |         within a given tissue, given a gene
1925 | 
1926 |         Args:
1927 |             gene (int): gene id (position in the `self.anndata` array)
1928 |             sub_data (ndarray): sliced version of `self.anndata` only containing
1929 |                 the beads corresponding to the tissue to analyse
1930 |             cells (ndarray): ids of the cells in `Embryo` ordered similarly to
1931 |                 the `self.anndata` array (to do correspondancy)
1932 |         Returns:
1933 |             avg_nb_neighbs (float): average number of positive neighbors per
1934 |                 positive cells
1935 |         """
1936 | 
1937 |         # Position of positive cells in `self.anndata`
1938 |         positive_cells = np.where(self.gene_expr_th[gene] < sub_data[:, gene])[
1939 |             0
1940 |         ]
1941 | 
1942 |         # Ids of positive cells
1943 |         positive_cells = cells[positive_cells].reshape(1, -1)
1944 | 
1945 |         avg_nb_neighbs = self.full_GG[positive_cells.T, positive_cells].sum()
1946 |         avg_nb_neighbs /= positive_cells.shape[1]
1947 |         return avg_nb_neighbs
1948 | 
1949 |     def cell_groups(self, t, th_vol=0.025, cells=None, nb_N=None):
1950 |         """
1951 |         Compute the local expression metric for each gene in a given tissue `t`
1952 | 
1953 |         Args:
1954 |             t (int): tissue id to process
1955 |             th_vol (float 0<th_vol<1): high and low volume threshold.
1956 |                 Any gene expression that covers more that 1-th_vol
1957 |                 fraction of the tissue volume or less that th_vol fraction
1958 |                 of the tissue volume is discarded.
1959 |         Returns:
1960 |             data_plot (pandas.DataFrame): pandas DataFrame containing most
1961 |                 of the computed information for gene localization of the tissue
1962 |                 `t`. The main value is in the column `Distance_to_reg`
1963 |         """
1964 |         if self.all_genes:
1965 |             data = self.anndata.raw.X
1966 |         elif sp.sparse.issparse(self.anndata.X):
1967 |             data = self.anndata.X.toarray()
1968 |         else:
1969 |             data = self.anndata.copy().X
1970 |         if cells is None:
1971 |             cells = np.array(
1972 |                 [c for c in self.all_cells if self.tissue[c] == t]
1973 |             )
1974 |         if nb_N is None:
1975 |             nb_N = self.whole_tissue_nb_N[t]
1976 | 
1977 |         # Spliting the array to only have tissue *t* cells
1978 |         sub_data = data[cells]
1979 |         if self.all_genes:
1980 |             sub_data = np.array(sub_data.toarray())
1981 | 
1982 |         # Occupied volume for the cells of tissue *t*
1983 |         volume_total = len(cells)
1984 | 
1985 |         # Volume ratio for cell expressing in a tissue for each gene
1986 |         sub_volumes = (
1987 |             np.sum(self.gene_expr_th < sub_data, axis=0) / volume_total
1988 |         )
1989 | 
1990 |         # Mask for the list of genes that are expressing enough within the tissue
1991 |         mask_expr = (th_vol < sub_volumes) & (sub_volumes < 1 - th_vol)
1992 | 
1993 |         # List of genes that are expressing enough within the tissue
1994 |         interesting_genes = np.where(mask_expr)[0]
1995 | 
1996 |         # Computing the number of cells expressing
1997 |         avg_nb_neighbs = []
1998 |         for g in interesting_genes:
1999 |             nb_N_for_g = self.neighbs(g, sub_data, cells)
2000 |             avg_nb_neighbs.append(nb_N_for_g / nb_N)
2001 |         avg_nb_neighbs = np.array(avg_nb_neighbs)
2002 | 
2003 |         # Build a dataframe with the previously computed metrics
2004 |         data_plot = {
2005 |             "Volume ratio": sub_volumes[mask_expr],
2006 |             "Avg #Neighbors ratio": avg_nb_neighbs,
2007 |         }
2008 | 
2009 |         # Compute the linear regression
2010 |         # Value against which the linear regression is done
2011 |         # It is important that the relationship between x and y is linear!!!
2012 |         regression_x = "Avg #Neighbors ratio"
2013 |         regression_y = "Volume ratio"
2014 | 
2015 |         regression = linregress(
2016 |             data_plot[regression_x], data_plot[regression_y]
2017 |         )
2018 |         b = regression.intercept
2019 |         a = regression.slope
2020 |         f = lambda x: a * x + b
2021 |         data_plot["Localization score"] = (
2022 |             f(data_plot[regression_x]) - data_plot[regression_y]
2023 |         )
2024 |         data_plot["Interesting gene row ID"] = interesting_genes
2025 |         if self.all_genes:
2026 |             data_plot["Gene names"] = np.array(
2027 |                 self.anndata.raw.var_names[
2028 |                     data_plot["Interesting gene row ID"]
2029 |                 ]
2030 |             )
2031 |         else:
2032 |             data_plot["Gene names"] = np.array(
2033 |                 self.anndata.var_names[data_plot["Interesting gene row ID"]]
2034 |             )
2035 |         data_plot = pd.DataFrame(data_plot)
2036 | 
2037 |         return data_plot
2038 | 
2039 |     def get_3D_differential_expression(
2040 |         self,
2041 |         tissues_to_process,
2042 |         th_vol=0.025,
2043 |         all_genes=True,
2044 |         do_2D=False,
2045 |     ):
2046 |         """
2047 |         Compute the 3D spatial differential expression for a list of tissues and
2048 |         stores it in `self.diff_expressed_3D`.
2049 | 
2050 |         Args:
2051 |             tissues_to_process ([t_ids, ]): list of tissue ids to process
2052 |             th_vol (float 0<th_vol<1): high and low volume threshold.
2053 |                 Any gene expression that covers more that 1-th_vol
2054 |                 fraction of the tissue volume or less that th_vol fraction
2055 |                 of the tissue volume is discarded.
2056 |             all_genes (bool): True if all the genes should be considered.
2057 |                 Otherwise only the previously computed variable genes are
2058 |                 considered
2059 |             do_2D (bool): Do the computation per 2D slice instead of in 3D
2060 |         Returns:
2061 |             self.diff_expressed_3D (dict t_id: pandas.DataFrame):
2062 |                 dictionary that maps a tissue to a pandas DataFrame containing
2063 |                 most of the computed information for gene localization of
2064 |                 the tissue `t_id`. The main value is in the column `Distance_to_reg`
2065 |         """
2066 |         if self.all_genes is None:
2067 |             self.all_genes = all_genes
2068 |         cells = list(self.all_cells)
2069 |         pos_3D = [self.pos_3D[c] for c in cells]
2070 |         if self.full_GG is None:
2071 |             self.full_GG = self.build_gabriel_graph(
2072 |                 cells, pos_3D, data_struct="adj-mat"
2073 |             )
2074 | 
2075 |         if self.gene_expr_th is None:
2076 |             self.gene_expr_th = self.compute_expr_thresholds()
2077 | 
2078 |         if self.whole_tissue_nb_N is None:
2079 |             self.whole_tissue_nb_N = {}
2080 |             for t in self.all_tissues:
2081 |                 cells = np.array(
2082 |                     [c for c in self.all_cells if self.tissue[c] == t]
2083 |                 )
2084 |                 if 0 < len(cells):
2085 |                     self.whole_tissue_nb_N[t] = (
2086 |                         self.full_GG[cells].nnz
2087 |                     ) / len(cells)
2088 |                 else:
2089 |                     self.whole_tissue_nb_N[t] = 0
2090 | 
2091 |         for t in tissues_to_process:
2092 |             if not (t, th_vol) in self.__diff_expr_processed:
2093 |                 self.__diff_expr_processed[(t, th_vol)] = self.cell_groups(
2094 |                     t, th_vol=th_vol
2095 |                 )
2096 |             self.diff_expressed_3D[t] = self.__diff_expr_processed[
2097 |                 (t, th_vol)
2098 |             ].copy()
2099 | 
2100 |         if self.tissues_diff_expre_processed is None:
2101 |             self.tissues_diff_expre_processed = tissues_to_process
2102 |         else:
2103 |             self.tissues_diff_expre_processed.extend(tissues_to_process)
2104 | 
2105 |         return self.diff_expressed_3D
2106 | 
2107 |     def plot_top_3D_diff_expr_genes(
2108 |         self,
2109 |         tissues_to_process,
2110 |         nb_genes=20,
2111 |         repetition_allowed=False,
2112 |         compute_z_score=True,
2113 |         fig=None,
2114 |         ax=None,
2115 |         output_path=None,
2116 |     ):
2117 |         """
2118 |         Plot the top `nb_genes` genes for 3D differentially expressed genes for a
2119 |         list of tissues.
2120 | 
2121 |         Args:
2122 |             tissues_to_process ([t_ids, ]): list of tissue ids to process
2123 |             nb_genes (int): number of genes in the top gene list
2124 |             repetition_allowed (bool): if true, a gene can be in the top
2125 |                 `nb_genes` of multiple tissues. Otherwise it is only kept
2126 |                 for the tissue it has the highest localization score.
2127 |                 Default: False
2128 |             compute_z_score (bool): if true, the z-score of gene expression is computed
2129 |                 for each gene independently, otherwise the initial value from `self.anndata`
2130 |                 is kept
2131 |                 Default: True
2132 |             fig (matplotlib.Figure): figure onto which ploting the output. If fig
2133 |                 is given ax should be given too. If None, a new figure is created
2134 |                 or recovered from ax.
2135 |                 Default: None
2136 |             ax (matplotlib.AxesSubplot): the working figure axis
2137 |                 Default: None
2138 |             output_path (str): path to the desired output figure. If None, the figure
2139 |                 is not saved
2140 |                 Default: None
2141 |         """
2142 |         if self.tissues_diff_expre_processed is None:
2143 |             self.tissues_diff_expre_processed = []
2144 |         tmp_T = set(tissues_to_process).difference(
2145 |             self.tissues_diff_expre_processed
2146 |         )
2147 |         if len(tmp_T) != 0:
2148 |             print(
2149 |                 "You asked to plot tissue(s) that were not already processed"
2150 |             )
2151 |             print("The following tissue(s) will be ignored:")
2152 |             for t in tmp_T:
2153 |                 print(f"\t - id: {t}, name: {self.corres_tissue.get(t, t)}")
2154 |         tissues_to_process = list(
2155 |             set(tissues_to_process).intersection(
2156 |                 self.tissues_diff_expre_processed
2157 |             )
2158 |         )
2159 |         genes_of_interest = []
2160 |         gene_dict = {}
2161 |         tissue_genes = {}
2162 |         genes_in = {}
2163 |         added_genes = 1 if repetition_allowed else 4
2164 |         if nb_genes is None:
2165 |             nb_genes = min(
2166 |                 [len(genes) for genes in self.diff_expressed_3D.values()]
2167 |             )
2168 |         for t in tissues_to_process:
2169 |             data_t = self.diff_expressed_3D[t]
2170 |             G_N = data_t.sort_values("Localization score")[
2171 |                 "Interesting gene row ID"
2172 |             ][: -nb_genes * added_genes - 1 : -1]
2173 |             G_V = data_t.sort_values("Localization score")[
2174 |                 "Localization score"
2175 |             ][: -nb_genes * added_genes - 1 : -1]
2176 |             genes_of_interest.extend(G_N[:nb_genes])
2177 |             for g, v in zip(G_N, G_V):
2178 |                 tissue_genes.setdefault(g, []).append(t)
2179 |                 gene_dict[(t, g)] = v
2180 |             genes_in[t] = list(G_N)
2181 |         if not repetition_allowed:
2182 |             dict_counter = Counter(genes_of_interest)
2183 |             acc = 0
2184 |             while any(1 < k for k in dict_counter.values()):
2185 |                 t = tissues_to_process[acc % len(tissues_to_process)]
2186 |                 for g in genes_in[t]:
2187 |                     if 1 < dict_counter[g]:
2188 |                         tissues = np.array(tissue_genes[g])
2189 |                         values = [gene_dict[(t, g)] for t in tissues]
2190 |                         if tissues[np.argsort(values)][-1] != t:
2191 |                             genes_in[t].remove(g)
2192 |                 genes_of_interest = []
2193 |                 for t in tissues_to_process:
2194 |                     genes_of_interest.extend(genes_in[t][:nb_genes])
2195 |                 dict_counter = Counter(genes_of_interest)
2196 |                 acc += 1
2197 |         nb_genes = min(nb_genes, len(genes_of_interest))
2198 |         values = np.zeros(
2199 |             (nb_genes * len(tissues_to_process), len(tissues_to_process))
2200 |         )
2201 |         tissue_order = []
2202 |         for i, g in enumerate(genes_of_interest):
2203 |             for j, t in enumerate(tissues_to_process):
2204 |                 data_t = self.diff_expressed_3D[t]
2205 |                 if g in data_t["Interesting gene row ID"].values:
2206 |                     values[i, j] = data_t[
2207 |                         data_t["Interesting gene row ID"] == g
2208 |                     ]["Localization score"]
2209 |                 if i == 0:
2210 |                     tissue_order.append(t)
2211 |         # z_score = (values - np.mean(values, axis=1).reshape(-1, 1))/np.std(values, axis=1).reshape(-1, 1)
2212 |         if compute_z_score:
2213 |             values = zscore(values, axis=0)
2214 |         if ax is None:
2215 |             fig, ax = plt.subplots(figsize=(5, max(5, round(1.5 * nb_genes))))
2216 |         if fig is None:
2217 |             fig = ax.get_figure()
2218 |         ax.imshow(
2219 |             values,
2220 |             interpolation="nearest",
2221 |             cmap="Reds",
2222 |             vmin=np.percentile(values, 2),
2223 |         )
2224 |         ax.set_xticks(range(len(tissue_order)))
2225 |         ax.set_xticklabels(
2226 |             [self.corres_tissue.get(t, t) for t in tissue_order], rotation=90
2227 |         )
2228 |         ax.set_yticks(range(values.shape[0]))
2229 |         if self.all_genes:
2230 |             ax.set_yticklabels(
2231 |                 list(self.anndata.raw[:, genes_of_interest].var_names)
2232 |             )
2233 |         else:
2234 |             ax.set_yticklabels(
2235 |                 list(self.anndata[:, genes_of_interest].var_names)
2236 |             )
2237 |         fig.tight_layout()
2238 |         if output_path is not None:
2239 |             output_path = Path(output_path)
2240 |             if not output_path.parent.exists():
2241 |                 Path.mkdir(output_path.parent)
2242 |             fig.savefig(output_path)
2243 |         return fig, ax
2244 | 
2245 |     def plot_volume_vs_neighbs(
2246 |         self,
2247 |         t,
2248 |         print_top=None,
2249 |         print_genes=None,
2250 |         fig=None,
2251 |         ax=None,
2252 |         output_path=None,
2253 |         **kwargs,
2254 |     ):
2255 |         """
2256 |         Plot volume of expressing cells versus the average number of expressing neighbors
2257 |         for a given tissue.
2258 | 
2259 |         Args:
2260 |             t (int): tissue id to treat
2261 |             print_top (int): number of gene names to plot onto the figure (slows down)
2262 |                 the function significantly
2263 |             fig (matplotlib.Figure): figure onto which ploting the output. If fig
2264 |                 is given ax should be given too. If None, a new figure is created
2265 |                 or recovered from ax.
2266 |                 Default: None
2267 |             ax (matplotlib.AxesSubplot): the working figure axis
2268 |                 Default: None
2269 |             output_path (str): path to the desired output figure. If None, the figure
2270 |                 is not saved
2271 |                 Default: None
2272 |             kwargs are forwarded to the seaborn.scatterplot
2273 |         """
2274 |         if (
2275 |             self.diff_expressed_3D is not None
2276 |             and t not in self.diff_expressed_3D
2277 |         ):
2278 |             print(
2279 |                 f"The tissue {t} ({self.corres_tissue.get(t, t)}) has not been processed yet."
2280 |             )
2281 |             print("No figure can be made.")
2282 |             return
2283 |         data_plot = self.diff_expressed_3D[t]
2284 |         if ax is None:
2285 |             fig, ax = plt.subplots(figsize=(10, 8))
2286 |         if fig is None:
2287 |             fig = ax.get_figure()
2288 |         x = "Avg #Neighbors ratio"
2289 |         y = "Volume ratio"
2290 |         g = scatterplot(
2291 |             data=data_plot, x=x, y=y, ax=ax, hue="Localization score", **kwargs
2292 |         )
2293 |         legend = g.axes.get_legend()
2294 |         legend.set_title("Localization score")
2295 |         ax.set_ylabel("Relative volume (to total tissue volume)")
2296 |         ax.set_xlabel(
2297 |             "Relative cell density (to the average cell density within the tissue)"
2298 |         )
2299 |         if print_top is not None:
2300 |             top_X = data_plot.sort_values(
2301 |                 "Localization score", ascending=False
2302 |             )[:print_top]
2303 |             x_values = top_X[x]
2304 |             y_values = top_X[y]
2305 |             names = top_X["Gene names"]
2306 |             for name, x, y in zip(names, x_values, y_values):
2307 |                 plt.text(
2308 |                     x=x,
2309 |                     y=y,
2310 |                     s=name,
2311 |                     fontdict=dict(color="red", size=8, fontweight="bold"),
2312 |                     va="baseline",
2313 |                 )
2314 |         if print_genes is not None:
2315 |             for gene in print_genes:
2316 |                 gene_num_all = np.where(self.anndata.var_names == gene)[0][0]
2317 |                 gene_num = np.where(
2318 |                     data_plot["Interesting gene row ID"] == gene_num_all
2319 |                 )[0]
2320 |                 if gene_num.any():
2321 |                     gene_num = gene_num[0]
2322 |                     plt.text(
2323 |                         x=data_plot[x][gene_num],
2324 |                         y=data_plot[y][gene_num],
2325 |                         s=gene,
2326 |                         fontdict=dict(color="red", size=8, fontweight="bold"),
2327 |                         va="baseline",
2328 |                     )
2329 |         ax.set_title(self.corres_tissue.get(t, t))
2330 |         fig.tight_layout()
2331 |         if output_path is not None:
2332 |             output_path = Path(output_path)
2333 |             if not output_path.parent.exists():
2334 |                 Path.mkdir(output_path.parent)
2335 |             fig.savefig(output_path)
2336 | 
2337 |     def print_diff_expr_genes(self, tissue, nb):
2338 |         """
2339 |         Extract from the DataFrame `self.diff_expressed_3D`
2340 |         the genes that are locally expressed for a given tissue
2341 | 
2342 |         Args:
2343 |             tissue (int): id of the tissue to look at
2344 |             nb (int): number of genes to extract
2345 |         Returns
2346 |             order (`nb` x m pandas.DataFrame): DataFrame containing
2347 |                 the top `nb` localized genes.
2348 |         """
2349 |         if (
2350 |             self.diff_expressed_3D is None
2351 |             or tissue not in self.diff_expressed_3D
2352 |         ):
2353 |             print(
2354 |                 f"The tissue {tissue} ({self.corres_tissue.get(tissue, tissue)}) has not been processed yet."
2355 |             )
2356 |             print("No figure can be made.")
2357 |             return
2358 |         data_plot = self.diff_expressed_3D[tissue]
2359 |         order = data_plot.sort_values("Localization score", ascending=False)[
2360 |             :nb
2361 |         ]
2362 |         return order
2363 | 
2364 |     def compute_volumes(self, tissues_to_treat=None, bead_size=(10, 10, 30)):
2365 |         """
2366 |         Computes the volumes of the tissues based on the number of beads and their size.
2367 | 
2368 |         Args:
2369 |             tissues_to_treat (list): Optional, default: `None`, list of tissues for which to compute the volumes.
2370 |                 If left to `None`, the volume for all the tissues are computed.
2371 |             bead_size (tuple): Optional, default: `(10, 10, 30)`, distance between beads. Will be use to compute
2372 |                 the total volume.
2373 | 
2374 |         Returns:
2375 |             {tissue_id (str): float}: dictionary that maps a tissue to its volume.
2376 |         """
2377 |         if tissues_to_treat is None:
2378 |             tissues_to_treat = self.all_tissues
2379 | 
2380 |         volumes = {}
2381 |         bead_volume = np.prod(bead_size)
2382 |         for t in tissues_to_treat:
2383 |             volumes[self.corres_tissue.get(t, t)] = (
2384 |                 len(self.cells_from_tissue.get(t, [])) * bead_volume
2385 |             )
2386 | 
2387 |         return pd.DataFrame(volumes, index=["Volume"]).T
2388 | 
2389 |     def __init__(
2390 |         self,
2391 |         data_path="",
2392 |         tissues_to_ignore=None,
2393 |         corres_tissue=None,
2394 |         tissue_weight=None,
2395 |         xy_resolution=1,
2396 |         genes_of_interest=None,
2397 |         nb_CS_begin_ignore=0,
2398 |         nb_CS_end_ignore=0,
2399 |         store_anndata=False,
2400 |         z_space=30.0,
2401 |         tissue_id="predicted.id",
2402 |         array_id="orig.ident",
2403 |         pos_id="X_spatial",
2404 |         pos_reg_id="X_spatial_registered",
2405 |         gene_name_id="feature_name",
2406 |         umap_id="X_umap",
2407 |         sample_list=None,
2408 |     ):
2409 |         """
2410 |         Initialize an spatial single cell embryo
2411 | 
2412 |         Args:
2413 |             data_path (str): path to the file containing the sc data (h5ad format)
2414 |             tissues_to_ignore ([t_ids, ]): list of tissues to ignore. Beads belonging
2415 |                 to these tissues will be discarded
2416 |             corres_tissue ({t_id: str}): dictionary that maps a tissue id to a tissue
2417 |                 name
2418 |             tissue_weight ({t_id: int}): dictionary that maps a tissue id to a weight
2419 |                 that will be used for the puck registration. The higher the value is
2420 |                 the more aligned the tissue will be. The default value is 1
2421 |             xy_resolution (float): resolution in x and y (assumed to be isotrope)
2422 |             gene_of_interest ([str, ]): list of gene names to be selected. For some
2423 |                 applications, they will be the only ones that can be processed.
2424 |             nb_CS_begin_ignore (int): number of pucks to ignore at the begining of
2425 |                 the stack.
2426 |                 Default: 0
2427 |             nb_CS_end_ignore (int): number of pucks to ignore at the end of the stack
2428 |                 Default: 0
2429 |             store_anndata (bool): if true the anndata array is stored. Necessary when
2430 |                 doing 3D differential expression analysis
2431 |                 Default: False
2432 |             tissue_id (str): string naming the column containing the tissue ids. The
2433 |                 tissue ids will be contained in `data.obs[tissue_id]`.
2434 |                 Default: 'predicted.id'
2435 |             array_id (str): string naming the column containing the array/puck/slice
2436 |                 id. It will determine the `z` position of the cell.
2437 |                 The array id will be contained in `data.obs[array_id]` in the format
2438 |                 '.*_[0-9]*' where everything after the underscore (`_`) is considered
2439 |                 as the id number of the array.
2440 |                 Default: 'orig.ident'
2441 |             pos_id (str): string naming the column containing the x, y positions. The
2442 |                 x, y positions will be contained in `data.obsm[pos_id]`.
2443 |                 Default: 'X_spatial'
2444 |             pos_reg_id (str): string naming the column containing the x, y, z registered
2445 |                 positions. The x, y, z registered positions will be contained
2446 |                 in `data.obsm[pos_reg_id]`.
2447 |                 Default: 'X_spatial_registered'
2448 |             gene_name_id (str): string naming the column containing the gene names.
2449 |                 The gene names will be contained in `data.var[gene_name_id]`.
2450 |                 Default: 'feature_name'
2451 |             umap_id (str): string naming the column containing the umap coordinates.
2452 |                 The umap coordinates will be contained in `data.obsm[umap_id]`.
2453 |                 Default: 'X_umap'
2454 |         """
2455 |         self.cells = set()
2456 |         self.pos = {}
2457 |         self.cover_slip = {}
2458 |         self.tissue = {}
2459 |         self.all_tissues = set()
2460 |         self.cells_from_cover_slip = {}
2461 |         self.cells_from_tissue = {}
2462 |         self.all_cover_slips = []
2463 |         self.nb_CS_begin_ignore = nb_CS_begin_ignore
2464 |         self.nb_CS_end_ignore = nb_CS_end_ignore
2465 |         self.tissues_to_ignore = (
2466 |             [] if tissues_to_ignore is None else tissues_to_ignore
2467 |         )
2468 |         if corres_tissue is None:
2469 |             self.corres_tissue = {}
2470 |         elif isinstance(corres_tissue, str) or hasattr(
2471 |             corres_tissue, "exists"
2472 |         ):
2473 |             with open(corres_tissue) as f:
2474 |                 self.corres_tissue = json.load(f)
2475 |                 self.corres_tissue = {
2476 |                     eval(k): v for k, v in self.corres_tissue.items()
2477 |                 }
2478 |         else:
2479 |             self.corres_tissue = corres_tissue
2480 |         self.tissue_weight = {} if tissue_weight is None else tissue_weight
2481 |         self.z_space = z_space
2482 |         self.z_pos = {}
2483 |         self.all_cells = None
2484 |         self.cell_names = None
2485 |         self.all_genes = None
2486 |         self.gene_expression = None
2487 |         self.centered_pos = None
2488 |         self.tissue_centers = None
2489 |         self.tissue_centers_reg = {}
2490 |         self.registered_pos = None
2491 |         self.pairing = {}
2492 |         self.pos_reg_aff = {}
2493 |         self.final = None
2494 |         self.filtered_cells = set()
2495 |         self.GG_cs = {}
2496 |         self.KDT_cs = {}
2497 |         self.KDT_cs_down = {}
2498 |         self.paired_cs_down = {}
2499 |         self.KDT_cs_up = {}
2500 |         self.paired_cs_up = {}
2501 |         self.all_trajs = None
2502 |         self.all_expr = None
2503 |         self.full_GG = None
2504 |         self.gene_expr_th = None
2505 |         self.whole_tissue_nb_N = None
2506 |         self.diff_expressed_3D = {}
2507 |         self.tissues_diff_expre_processed = None
2508 |         self.umap_id = umap_id
2509 |         self.array_id = array_id
2510 |         self.tissue_id = tissue_id
2511 |         self.pos_id = pos_id
2512 |         self.__diff_expr_processed = {}
2513 |         self.pos_reg_id = pos_reg_id
2514 | 
2515 |         if Path(data_path).suffix in [".h5ad", ".h5", ".csv"] or (
2516 |             0 < len(sample_list)
2517 |             and Path(sample_list[0]).suffix in [".h5ad", ".h5", ".csv"]
2518 |         ):
2519 |             self.read_anndata(
2520 |                 data_path,
2521 |                 xy_resolution=xy_resolution,
2522 |                 genes_of_interest=genes_of_interest,
2523 |                 store_anndata=store_anndata,
2524 |                 tissue_id=tissue_id,
2525 |                 array_id=array_id,
2526 |                 pos_id=pos_id,
2527 |                 pos_reg_id=pos_reg_id,
2528 |                 gene_name_id=gene_name_id,
2529 |                 sample_list=sample_list,
2530 |             )
2531 | 


--------------------------------------------------------------------------------
/src/sc3D/transformations.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file is a copy of the useful function
  3 | from Christophe Gohlke's amazing package transformations
  4 | that can be found there: https://pypi.org/project/transformations/
  5 | 
  6 | For a reason that I don't really understand, the package
  7 | does not install on newer version of Python therefore
  8 | I decided to, for the time being at least, copy some of its work here.
  9 | 
 10 | Léo
 11 | """
 12 | 
 13 | import numpy as np
 14 | import math
 15 | 
 16 | 
 17 | class transformations:
 18 |     _EPS = np.finfo(float).eps * 4.0
 19 | 
 20 |     @classmethod
 21 |     def unit_vector(clf, data, axis=None, out=None):
 22 |         """Return ndarray normalized by length, i.e. Euclidean norm, along axis.
 23 | 
 24 |         >>> v0 = numpy.random.random(3)
 25 |         >>> v1 = unit_vector(v0)
 26 |         >>> numpy.allclose(v1, v0 / numpy.linalg.norm(v0))
 27 |         True
 28 |         >>> v0 = numpy.random.rand(5, 4, 3)
 29 |         >>> v1 = unit_vector(v0, axis=-1)
 30 |         >>> v2 = v0 / numpy.expand_dims(numpy.sqrt(numpy.sum(v0*v0, axis=2)), 2)
 31 |         >>> numpy.allclose(v1, v2)
 32 |         True
 33 |         >>> v1 = unit_vector(v0, axis=1)
 34 |         >>> v2 = v0 / numpy.expand_dims(numpy.sqrt(numpy.sum(v0*v0, axis=1)), 1)
 35 |         >>> numpy.allclose(v1, v2)
 36 |         True
 37 |         >>> v1 = numpy.empty((5, 4, 3))
 38 |         >>> unit_vector(v0, axis=1, out=v1)
 39 |         >>> numpy.allclose(v1, v2)
 40 |         True
 41 |         >>> list(unit_vector([]))
 42 |         []
 43 |         >>> list(unit_vector([1]))
 44 |         [1.0]
 45 | 
 46 |         """
 47 |         if out is None:
 48 |             data = np.array(data, dtype=np.float64, copy=True)
 49 |             if data.ndim == 1:
 50 |                 data /= math.sqrt(np.dot(data, data))
 51 |                 return data
 52 |         else:
 53 |             if out is not data:
 54 |                 out[:] = np.array(data, copy=False)
 55 |             data = out
 56 |         length = np.atleast_1d(np.sum(data * data, axis))
 57 |         np.sqrt(length, length)
 58 |         if axis is not None:
 59 |             length = np.expand_dims(length, axis)
 60 |         data /= length
 61 |         if out is None:
 62 |             return data
 63 |         return None
 64 | 
 65 |     @classmethod
 66 |     def rotation_matrix(clf, angle, direction, point=None):
 67 |         """Return matrix to rotate about axis defined by point and direction.
 68 | 
 69 |         >>> R = rotation_matrix(math.pi/2, [0, 0, 1], [1, 0, 0])
 70 |         >>> numpy.allclose(numpy.dot(R, [0, 0, 0, 1]), [1, -1, 0, 1])
 71 |         True
 72 |         >>> angle = (random.random() - 0.5) * (2*math.pi)
 73 |         >>> direc = numpy.random.random(3) - 0.5
 74 |         >>> point = numpy.random.random(3) - 0.5
 75 |         >>> R0 = rotation_matrix(angle, direc, point)
 76 |         >>> R1 = rotation_matrix(angle-2*math.pi, direc, point)
 77 |         >>> is_same_transform(R0, R1)
 78 |         True
 79 |         >>> R0 = rotation_matrix(angle, direc, point)
 80 |         >>> R1 = rotation_matrix(-angle, -direc, point)
 81 |         >>> is_same_transform(R0, R1)
 82 |         True
 83 |         >>> I = numpy.identity(4, numpy.float64)
 84 |         >>> numpy.allclose(I, rotation_matrix(math.pi*2, direc))
 85 |         True
 86 |         >>> numpy.allclose(2, numpy.trace(rotation_matrix(math.pi/2,
 87 |         ...                                               direc, point)))
 88 |         True
 89 | 
 90 |         """
 91 |         import math
 92 | 
 93 |         sina = math.sin(angle)
 94 |         cosa = math.cos(angle)
 95 |         direction = clf.unit_vector(direction[:3])
 96 |         # rotation matrix around unit vector
 97 |         R = np.diag([cosa, cosa, cosa])
 98 |         R += np.outer(direction, direction) * (1.0 - cosa)
 99 |         direction *= sina
100 |         R += np.array(
101 |             [
102 |                 [0.0, -direction[2], direction[1]],
103 |                 [direction[2], 0.0, -direction[0]],
104 |                 [-direction[1], direction[0], 0.0],
105 |             ]
106 |         )
107 |         M = np.identity(4)
108 |         M[:3, :3] = R
109 |         if point is not None:
110 |             # rotation not around origin
111 |             point = np.array(point[:3], dtype=np.float64, copy=False)
112 |             M[:3, 3] = point - np.dot(R, point)
113 |         return M
114 | 
115 |     @classmethod
116 |     def vector_norm(clf, data, axis=None, out=None):
117 |         """Return length, i.e. Euclidean norm, of ndarray along axis.
118 | 
119 |         >>> v = numpy.random.random(3)
120 |         >>> n = vector_norm(v)
121 |         >>> numpy.allclose(n, numpy.linalg.norm(v))
122 |         True
123 |         >>> v = numpy.random.rand(6, 5, 3)
124 |         >>> n = vector_norm(v, axis=-1)
125 |         >>> numpy.allclose(n, numpy.sqrt(numpy.sum(v*v, axis=2)))
126 |         True
127 |         >>> n = vector_norm(v, axis=1)
128 |         >>> numpy.allclose(n, numpy.sqrt(numpy.sum(v*v, axis=1)))
129 |         True
130 |         >>> v = numpy.random.rand(5, 4, 3)
131 |         >>> n = numpy.empty((5, 3))
132 |         >>> vector_norm(v, axis=1, out=n)
133 |         >>> numpy.allclose(n, numpy.sqrt(numpy.sum(v*v, axis=1)))
134 |         True
135 |         >>> vector_norm([])
136 |         0.0
137 |         >>> vector_norm([1])
138 |         1.0
139 | 
140 |         """
141 |         data = np.array(data, dtype=np.float64, copy=True)
142 |         if out is None:
143 |             if data.ndim == 1:
144 |                 return math.sqrt(np.dot(data, data))
145 |             data *= data
146 |             out = np.atleast_1d(np.sum(data, axis=axis))
147 |             np.sqrt(out, out)
148 |             return out
149 |         data *= data
150 |         np.sum(data, axis=axis, out=out)
151 |         np.sqrt(out, out)
152 |         return None
153 | 
154 |     @classmethod
155 |     def quaternion_matrix(clf, quaternion):
156 |         """Return homogeneous rotation matrix from quaternion.
157 | 
158 |         >>> M = quaternion_matrix([0.99810947, 0.06146124, 0, 0])
159 |         >>> numpy.allclose(M, rotation_matrix(0.123, [1, 0, 0]))
160 |         True
161 |         >>> M = quaternion_matrix([1, 0, 0, 0])
162 |         >>> numpy.allclose(M, numpy.identity(4))
163 |         True
164 |         >>> M = quaternion_matrix([0, 1, 0, 0])
165 |         >>> numpy.allclose(M, numpy.diag([1, -1, -1, 1]))
166 |         True
167 | 
168 |         """
169 |         q = np.array(quaternion, dtype=np.float64, copy=True)
170 |         n = np.dot(q, q)
171 |         if n < clf._EPS:
172 |             return np.identity(4)
173 |         q *= math.sqrt(2.0 / n)
174 |         q = np.outer(q, q)
175 |         return np.array(
176 |             [
177 |                 [
178 |                     1.0 - q[2, 2] - q[3, 3],
179 |                     q[1, 2] - q[3, 0],
180 |                     q[1, 3] + q[2, 0],
181 |                     0.0,
182 |                 ],
183 |                 [
184 |                     q[1, 2] + q[3, 0],
185 |                     1.0 - q[1, 1] - q[3, 3],
186 |                     q[2, 3] - q[1, 0],
187 |                     0.0,
188 |                 ],
189 |                 [
190 |                     q[1, 3] - q[2, 0],
191 |                     q[2, 3] + q[1, 0],
192 |                     1.0 - q[1, 1] - q[2, 2],
193 |                     0.0,
194 |                 ],
195 |                 [0.0, 0.0, 0.0, 1.0],
196 |             ]
197 |         )
198 | 
199 |     @classmethod
200 |     def affine_matrix_from_points(
201 |         clf, v0, v1, shear=True, scale=True, usesvd=True
202 |     ):
203 |         """Return affine transform matrix to register two point sets.
204 | 
205 |         v0 and v1 are shape (ndims, -1) arrays of at least ndims non-homogeneous
206 |         coordinates, where ndims is the dimensionality of the coordinate space.
207 | 
208 |         If shear is False, a similarity transformation matrix is returned.
209 |         If also scale is False, a rigid/Euclidean transformation matrix
210 |         is returned.
211 | 
212 |         By default the algorithm by Hartley and Zissermann [15] is used.
213 |         If usesvd is True, similarity and Euclidean transformation matrices
214 |         are calculated by minimizing the weighted sum of squared deviations
215 |         (RMSD) according to the algorithm by Kabsch [8].
216 |         Otherwise, and if ndims is 3, the quaternion based algorithm by Horn [9]
217 |         is used, which is slower when using this Python implementation.
218 | 
219 |         The returned matrix performs rotation, translation and uniform scaling
220 |         (if specified).
221 | 
222 |         >>> v0 = [[0, 1031, 1031, 0], [0, 0, 1600, 1600]]
223 |         >>> v1 = [[675, 826, 826, 677], [55, 52, 281, 277]]
224 |         >>> affine_matrix_from_points(v0, v1)
225 |         array([[  0.14549,   0.00062, 675.50008],
226 |             [  0.00048,   0.14094,  53.24971],
227 |             [  0.     ,   0.     ,   1.     ]])
228 |         >>> T = translation_matrix(numpy.random.random(3)-0.5)
229 |         >>> R = random_rotation_matrix(numpy.random.random(3))
230 |         >>> S = scale_matrix(random.random())
231 |         >>> M = concatenate_matrices(T, R, S)
232 |         >>> v0 = (numpy.random.rand(4, 100) - 0.5) * 20
233 |         >>> v0[3] = 1
234 |         >>> v1 = numpy.dot(M, v0)
235 |         >>> v0[:3] += numpy.random.normal(0, 1e-8, 300).reshape(3, -1)
236 |         >>> M = affine_matrix_from_points(v0[:3], v1[:3])
237 |         >>> numpy.allclose(v1, numpy.dot(M, v0))
238 |         True
239 | 
240 |         More examples in superimposition_matrix()
241 | 
242 |         """
243 |         v0 = np.array(v0, dtype=np.float64, copy=True)
244 |         v1 = np.array(v1, dtype=np.float64, copy=True)
245 | 
246 |         ndims = v0.shape[0]
247 |         if ndims < 2 or v0.shape[1] < ndims or v0.shape != v1.shape:
248 |             raise ValueError("input arrays are of wrong shape or type")
249 | 
250 |         # move centroids to origin
251 |         t0 = -np.mean(v0, axis=1)
252 |         M0 = np.identity(ndims + 1)
253 |         M0[:ndims, ndims] = t0
254 |         v0 += t0.reshape(ndims, 1)
255 |         t1 = -np.mean(v1, axis=1)
256 |         M1 = np.identity(ndims + 1)
257 |         M1[:ndims, ndims] = t1
258 |         v1 += t1.reshape(ndims, 1)
259 | 
260 |         if shear:
261 |             # Affine transformation
262 |             A = np.concatenate((v0, v1), axis=0)
263 |             u, s, vh = np.linalg.svd(A.T)
264 |             vh = vh[:ndims].T
265 |             B = vh[:ndims]
266 |             C = vh[ndims : 2 * ndims]
267 |             t = np.dot(C, np.linalg.pinv(B))
268 |             t = np.concatenate((t, np.zeros((ndims, 1))), axis=1)
269 |             M = np.vstack((t, ((0.0,) * ndims) + (1.0,)))
270 |         elif usesvd or ndims != 3:
271 |             # Rigid transformation via SVD of covariance matrix
272 |             u, s, vh = np.linalg.svd(np.dot(v1, v0.T))
273 |             # rotation matrix from SVD orthonormal bases
274 |             R = np.dot(u, vh)
275 |             if np.linalg.det(R) < 0.0:
276 |                 # R does not constitute right handed system
277 |                 R -= np.outer(u[:, ndims - 1], vh[ndims - 1, :] * 2.0)
278 |                 s[-1] *= -1.0
279 |             # homogeneous transformation matrix
280 |             M = np.identity(ndims + 1)
281 |             M[:ndims, :ndims] = R
282 |         else:
283 |             # Rigid transformation matrix via quaternion
284 |             # compute symmetric matrix N
285 |             xx, yy, zz = np.sum(v0 * v1, axis=1)
286 |             xy, yz, zx = np.sum(v0 * np.roll(v1, -1, axis=0), axis=1)
287 |             xz, yx, zy = np.sum(v0 * np.roll(v1, -2, axis=0), axis=1)
288 |             N = [
289 |                 [xx + yy + zz, 0.0, 0.0, 0.0],
290 |                 [yz - zy, xx - yy - zz, 0.0, 0.0],
291 |                 [zx - xz, xy + yx, yy - xx - zz, 0.0],
292 |                 [xy - yx, zx + xz, yz + zy, zz - xx - yy],
293 |             ]
294 |             # quaternion: eigenvector corresponding to most positive eigenvalue
295 |             w, V = np.linalg.eigh(N)
296 |             q = V[:, np.argmax(w)]
297 |             q /= clf.vector_norm(q)  # unit quaternion
298 |             # homogeneous transformation matrix
299 |             M = clf.quaternion_matrix(q)
300 | 
301 |         if scale and not shear:
302 |             # Affine transformation; scale is ratio of RMS deviations from centroid
303 |             v0 *= v0
304 |             v1 *= v1
305 |             M[:ndims, :ndims] *= math.sqrt(np.sum(v1) / np.sum(v0))
306 | 
307 |         # move centroids back
308 |         M = np.dot(np.linalg.inv(M1), np.dot(M, M0))
309 |         M /= M[ndims, ndims]
310 |         return M
311 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # For more information about tox, see https://tox.readthedocs.io/en/latest/
 2 | [tox]
 3 | envlist = py{38,39,310}-{linux,macos,windows}
 4 | isolated_build=true
 5 | 
 6 | [gh-actions]
 7 | python =
 8 |     3.8: py38
 9 |     3.9: py39
10 |     3.10: py310
11 | 
12 | [gh-actions:env]
13 | PLATFORM =
14 |     ubuntu-latest: linux
15 |     macos-latest: macos
16 |     windows-latest: windows
17 | 
18 | [testenv]
19 | platform =
20 |     macos: darwin
21 |     linux: linux
22 |     windows: win32
23 | passenv =
24 |     CI
25 |     GITHUB_ACTIONS
26 |     DISPLAY
27 |     XAUTHORITY
28 |     NUMPY_EXPERIMENTAL_ARRAY_FUNCTION
29 |     PYVISTA_OFF_SCREEN
30 | extras =
31 |     testing
32 | commands = pytest -v --color=yes --cov=sc3D --cov-report=xml
33 | 


--------------------------------------------------------------------------------
/txt/figures/interpolation.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/figures/interpolation.ai


--------------------------------------------------------------------------------
/txt/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/figures/interpolation.pdf


--------------------------------------------------------------------------------
/txt/figures/sc3D_vs_PASTE_DLPFC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/figures/sc3D_vs_PASTE_DLPFC.pdf


--------------------------------------------------------------------------------
/txt/figures/sc3D_vs_PASTE_Mouse.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/figures/sc3D_vs_PASTE_Mouse.pdf


--------------------------------------------------------------------------------
/txt/figures/sc3D_vs_PASTE_time.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/figures/sc3D_vs_PASTE_time.pdf


--------------------------------------------------------------------------------
/txt/references.bib:
--------------------------------------------------------------------------------
 1 | %% This BibTeX bibliography file was created using BibDesk.
 2 | %% https://bibdesk.sourceforge.io/
 3 | 
 4 | %% Created for Léo Guignard at 2022-11-11 14:25:44 +0100 
 5 | 
 6 | 
 7 | %% Saved with string encoding Unicode (UTF-8) 
 8 | 
 9 | 
10 | 
11 | @article{Maynard:2021,
12 | 	abstract = {We used the 10x Genomics Visium platform to define the spatial topography of gene expression in the six-layered human dorsolateral prefrontal cortex. We identified extensive layer-enriched expression signatures and refined associations to previous laminar markers. We overlaid our laminar expression signatures on large-scale single nucleus RNA-sequencing data, enhancing spatial annotation of expression-driven clusters. By integrating neuropsychiatric disorder gene sets, we showed differential layer-enriched expression of genes associated with schizophrenia and autism spectrum disorder, highlighting the clinical relevance of spatially defined expression. We then developed a data-driven framework to define unsupervised clusters in spatial transcriptomics data, which can be applied to other tissues or brain regions in which morphological architecture is not as well defined as cortical laminae. Last, we created a web application for the scientific community to explore these raw and summarized data to augment ongoing neuroscience and spatial transcriptomics research (http://research.libd.org/spatialLIBD).},
13 | 	author = {Maynard, Kristen R. and Collado-Torres, Leonardo and Weber, Lukas M. and Uytingco, Cedric and Barry, Brianna K. and Williams, Stephen R. and Catallini, Joseph L. and Tran, Matthew N. and Besich, Zachary and Tippani, Madhavi and Chew, Jennifer and Yin, Yifeng and Kleinman, Joel E. and Hyde, Thomas M. and Rao, Nikhil and Hicks, Stephanie C. and Martinowich, Keri and Jaffe, Andrew E.},
14 | 	date = {2021/03/01},
15 | 	date-added = {2022-11-11 14:25:04 +0100},
16 | 	date-modified = {2022-11-11 14:25:07 +0100},
17 | 	doi = {10.1038/s41593-020-00787-0},
18 | 	id = {Maynard2021},
19 | 	isbn = {1546-1726},
20 | 	journal = {Nature Neuroscience},
21 | 	number = {3},
22 | 	pages = {425--436},
23 | 	title = {Transcriptome-scale spatial gene expression in the human dorsolateral prefrontal cortex},
24 | 	url = {https://doi.org/10.1038/s41593-020-00787-0},
25 | 	volume = {24},
26 | 	year = {2021},
27 | 	bdsk-url-1 = {https://doi.org/10.1038/s41593-020-00787-0}}
28 | 
29 | @article{Zeira:2022,
30 | 	abstract = {Spatial transcriptomics (ST) measures mRNA expression across thousands of spots from a tissue slice while recording the two-dimensional (2D) coordinates of each spot. We introduce probabilistic alignment of ST experiments (PASTE), a method to align and integrate ST data from multiple adjacent tissue slices. PASTE computes pairwise alignments of slices using an optimal transport formulation that models both transcriptional similarity and physical distances between spots. PASTE further combines pairwise alignments to construct a stacked 3D alignment of a tissue. Alternatively, PASTE can integrate multiple ST slices into a single consensus slice. We show that PASTE accurately aligns spots across adjacent slices in both simulated and real ST data, demonstrating the advantages of using both transcriptional similarity and spatial information. We further show that the PASTE integrated slice improves the identification of cell types and differentially expressed genes compared with existing approaches that either analyze single ST slices or ignore spatial information.},
31 | 	author = {Zeira, Ron and Land, Max and Strzalkowski, Alexander and Raphael, Benjamin J.},
32 | 	date = {2022/05/01},
33 | 	date-added = {2022-11-11 14:20:00 +0100},
34 | 	date-modified = {2022-11-11 14:20:09 +0100},
35 | 	doi = {10.1038/s41592-022-01459-6},
36 | 	id = {Zeira2022},
37 | 	isbn = {1548-7105},
38 | 	journal = {Nature Methods},
39 | 	number = {5},
40 | 	pages = {567--575},
41 | 	title = {Alignment and integration of spatial transcriptomics data},
42 | 	url = {https://doi.org/10.1038/s41592-022-01459-6},
43 | 	volume = {19},
44 | 	year = {2022},
45 | 	bdsk-url-1 = {https://doi.org/10.1038/s41592-022-01459-6}}
46 | 
47 | @article{Otsu:1979,
48 | 	author = {Otsu, Nobuyuki},
49 | 	doi = {10.1109/TSMC.1979.4310076},
50 | 	journal = {IEEE Transactions on Systems, Man, and Cybernetics},
51 | 	number = {1},
52 | 	pages = {62-66},
53 | 	title = {A Threshold Selection Method from Gray-Level Histograms},
54 | 	volume = {9},
55 | 	year = {1979},
56 | 	bdsk-url-1 = {https://doi.org/10.1109/TSMC.1979.4310076}}
57 | 
58 | @article{mcdole:2018,
59 | 	author = {McDole, Katie and Guignard, L{\'e}o and Amat, Fernando and Berger, Andrew and Malandain, Gr{\'e}goire and Royer, Lo\"{\i}c A. and Turaga, Srinivas C. and Branson, Kristin and Keller, Philipp J.},
60 | 	date-modified = {2019-12-30 19:43:40 +0100},
61 | 	file = {Full Text:/Users/guignardl/Zotero/storage/YB5W8THL/S0092867418312431.html:text/html},
62 | 	journal = {Cell},
63 | 	number = {3},
64 | 	pages = {859--876},
65 | 	title = {In toto imaging and reconstruction of post-implantation mouse development at the single-cell level},
66 | 	volume = {175},
67 | 	year = {2018}}
68 | 


--------------------------------------------------------------------------------
/txt/scSpatial.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/scSpatial.docx


--------------------------------------------------------------------------------
/txt/scSpatial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GuignardLab/sc3D/e08b067d930a109c666a5af191cac97529258673/txt/scSpatial.pdf


--------------------------------------------------------------------------------
/txt/scSpatial.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[10pt,a4paper]{article}
  2 | \usepackage[utf8]{inputenc}
  3 | \usepackage{amsmath}
  4 | \usepackage{amsfonts}
  5 | \usepackage{amssymb}
  6 | \usepackage{xcolor}
  7 | \usepackage{natbib}
  8 | \usepackage{graphicx}
  9 | %\usepackage[left=1cm,right=1cm,top=1cm,bottom=1.5cm]{geometry}
 10 | \usepackage{geometry}
 11 | \author{L\'eo Guignard}
 12 | \title{Array alignment and interpolation}
 13 | 
 14 | \DeclareMathOperator*{\argmax}{arg\,max}
 15 | \DeclareMathOperator*{\argmin}{arg\,min}
 16 | 
 17 | \begin{document}
 18 | \maketitle
 19 | \paragraph{}When doing spatial single-cell transcriptomics, beads are recorded from arrays.
 20 | Beads are placed on a 2D matrix where each bead is spaced by a given distance $x_{res}$ (resp.
 21 | $y_{res}$) along the $x$ (resp.
 22 | $y$) dimension (in our case $x_{res}=y_{res}=6 \mu m$).
 23 | These distances define the $xy$ resolution (or lateral resolution) of the slice or array.
 24 | Then, consecutive arrays are spaced by a given distance $z_{res}$ defining the $z$ resolution (or axial resolution) of the dataset (in our case $z_{res}=30\mu m$).
 25 | \paragraph{}Because the arrays are physically moved between their slicing and their acquisition, they are not acquired within the same frame (meaning that they are not aligned).
 26 | In order to reconstruct a 3D representation of the single cell transcriptomics of the acquired embryo and to do full 3D spatial analysis, it is necessary to align consecutive arrays to recover the spatial integrity of the specimen.
 27 | Moreover, because the axial resolution is significantly greater than the lateral one, in some cases, it is necessary to interpolate the data between the arrays.
 28 | \paragraph{}In the following section we will describe how this alignment was performed together with how the beads were interpolated between arrays.
 29 | \section{Notation}
 30 | \paragraph{}We define our dataset as a set of arrays \(\mathcal{P}=\{P_i\}\).
 31 | The function $c_P$ maps a array to its height coordinate $z_i$: \(c_P: P_i\in \mathcal{P} \rightarrow z_i \in \mathbb{R}\).
 32 | Each array $P_i$ is itself a set of beads, \(P_i=\{b_{ij}\}\) and similarly to the arrays, the function \(c_b\) maps a bead \(b_{ij}\) to its \(xy\) coordinate within the array: \(c_b:b_{ij}\in P_i\rightarrow (x,y)\in \mathbb{R}^2\).
 33 | From \(c_P\) and \(c_b\) we define the function \(c:b_{ij}\in P_i\rightarrow (x,y,z)\in\mathbb{R}\) which maps a bead to its 3D spatial coordinate.
 34 | Note that \(c_P\) defines a total order on the arrays.
 35 | Let then \(\mathcal{P}\) be ordered such that \(\forall i,j,~P_i<P_j\iff c_P(P_i)<c_P(P_j)\).
 36 | %%
 37 | \paragraph{}Moreover, using the previously described analysis, we can associate each bead to the tissue it most likely belongs to, the function \(T:b_{ij}\in P_i\rightarrow t\in\mathcal{T}\) where \(t\in\mathcal{T}\) maps each bead to a unique identifier for the tissue it belongs to. \(\mathcal{T}\) is the set of possible tissues previously identified.
 38 | Similarly, to each bead is associated a value for each given gene analysed in the dataset.
 39 | This value is correlated to the level of expression of said gene for said bead.
 40 | Given a gene \(g\), we define the function \(E_g:b_{ij}\in P_i\rightarrow e\in\mathbb{R}\) which maps the value \(e\) of expression of the gene \(g\) to the bead \(b_{ij}\).
 41 | %%
 42 | \section{Pre-processing}
 43 | \subsection{Removing the remaining outliers}
 44 | \paragraph{}Before aligning the arrays it is possible to remove beads that are likely to be noise.
 45 | Within a given tissue, noisy beads were detected as the beads that were spatially further away from their neighbors than the normal distribution of the spatial distances between beads.
 46 | To assess the normal distribution of spatial distances between beads of a given tissue, we first computed the distance between any given bead and its closest bead from the same tissue type.
 47 | We then analysed the distribution of these distances by fitting a gaussian mixture model, with a number of components equal to \(n_{components}\) (in our case we used \(n_{components}=3\).
 48 | When using 3 components, the \(1^{st}\) and \(2^{nd}\) components are assumed to be real.
 49 | The \(3^{rd}\) component, with the higher mean, is the distribution of distances of noisy beads.
 50 | We then discarded all the beads that had a distance to their closest neighbor from the same tissue which had a probability to belong to the first or second component were lower than \(th_{gmm}\) (in our case we used \(th_{gmm}=0.6\%\)).
 51 | This pre-processing step is not mandatory but it can help getting more accurate results. The value might vary depending on the sample and tissue analyzed.
 52 | \section{Aligning the arrays}
 53 | \paragraph{}As previously mentioned, due to the nature of the acquisition process, consecutive arrays do not live within the same frame.
 54 | They are therefore not spatially comparable.
 55 | \paragraph{}To align the arrays and register them onto the same frame, we first chose our first array in \(\mathcal{P}\), \(P_0\), as the reference array.
 56 | We then registered each following slide to its preceding one: \(P_1\) is registered onto \(P_0\), \(P_{i+1}\) is registered onto \(P_{i}\) and so on and so forth.
 57 | Ultimately we can compose all the transformations together to register any array onto the first array.
 58 | To compute the transformation necessary to register two consecutive arrays, we first performed a coarse grain alignment using the center of mass of a subset of the different tissue types.
 59 | We then refined the alignment by pairing beads across the arrays and by aligning them.
 60 | \subsection{Coarse grain alignment}
 61 | \paragraph{}We first chose a subset of tissues \(\mathcal{L}\subset\mathcal{T}\) that are spatially localised (for example heart tube precursor beads or somite precursor beads).
 62 | We then discarded (only for the coarse grain alignment) tissues that were spread in space (for example blood precursor beads).
 63 | An exhaustive list of the discarded tissue types can be found below.
 64 | \paragraph{}Then we computed the alignment transformation (\(r_{i\leftarrow j}^*\), registering the array \(P_j\) onto the array \(P_i\)) as the rigid transformation (translation plus rotation) that minimizes the sum of the squared distances between corresponding tissue types center of mass:
 65 | \begin{eqnarray}\label{eq:rigid}
 66 | r_{i\leftarrow j}^*&=&\argmax_{r\in\mathcal{R}} \big\{\sum_{t\in \mathcal{L}}||COM_i(t)-r[COM_j(t)]||_2\big\}
 67 | \end{eqnarray}
 68 | where \(COM_i(t)\) is the position of the center of mass of the tissue \(t\) in the array \(i\), \(||\cdot||_2\) is the L2 norm and \(\mathcal{R}\) is the set of all rigid transformations.
 69 | \(r[COM_j(t)]\) is therefore the position of the center of mass of the tissue \(t\) in the array \(j\) after application of the rigid transformation \(r\).
 70 | \paragraph{}We then applied the composition of the necessary transformations to register all arrays onto the first array.
 71 | For example, to register the array \(j\) onto the array \(0\), we applied the transformation \(r_{0\leftarrow j}^*=r_{0\leftarrow 1}^*\circ r_{1\leftarrow 2}^*\circ \cdots \circ r_{j-1\leftarrow j}^*\).
 72 | \subsection{Alignment refinement}\label{subsec:ali-ref}
 73 | \paragraph{}To refine the alignment, we then paired beads from consecutive arrays.
 74 | Only beads from the same tissue types could be paired.
 75 | The pairing was the one that minimizes the sum of the distances between paired beads (using the solution of the linear sum assignment optimization).
 76 | The distances were computed after applying the coarse grain transformation.
 77 | From this pairing, as previously, we computed the rigid transformation \(R^*_{i\leftarrow j}\) that minimizes the sum of the squares of the distances between the paired beads (see eq.\eqref{eq:rigid}).
 78 | \section{In between array interpolation}
 79 | \paragraph{}As shown earlier, the distance between two consecutive beads within the same array is significantly smaller than the one between two arrays (one order of magnitude in our case, 6\(\mu m\) versus 30\(\mu m\)). This property makes it that some analysis of the 3D volume of the embryo is difficult or even not possible. If one wants to study a plane that is tilted by a small angle to the original array planes, the beads become too scarce. To overcome this issue, we designed an algorithm to interpolate beads in between arrays. The goal is to create beads where there would likely be and to assign to these interpolated beads gene expression values that are the most likely values.
 80 | \paragraph{}To interpolate beads between arrays, we first paired beads belonging to the same tissues (similarly to the pairing that was done in \ref{subsec:ali-ref}) to construct paths of beads (see Fig. \ref{fig:interp} A-B). We then assume a smooth continuity of gene expression and position to compute the interpolation. This smooth and continuous hypothesis on the system is the basis of the interpolation. It is important to keep this hypothesis in mind when doing any further analysis since it might not apply to all samples or tissue types.
 81 | \subsection{Bead pairings and paths}
 82 | \paragraph{}The first step towards in-between array interpolation is to build a path of beads from consecutive arrays.
 83 | These paths were built by pairing beads from consecutive arrays.
 84 | Beads were paired based on their spatial proximity.
 85 | As previously, we used a linear assignment algorithm to pair beads.
 86 | Linear sum alignment requires that a positive cost is computed between any beads that can be paired together.
 87 | Here, we used as our cost function the Euclidean distance (hence the spatial proximity bead pairing).
 88 | The bead pairing between consecutive arrays creates paths of beads. A path of beads is a series of beads that are paired sequentially by the array to array pairing.
 89 | For example \(b_{i,j}\) is paired to \(b_{i+1,k}\) which is paired to \(b_{i+2,l}\). These paired beads then create the path \(Pa_{i,j}=(b_{i,j}, b_{i+1,k},b_{i+2,l})\).
 90 | \paragraph{}Because each path does not start (resp. end) at the first (reps. last) array of the sample, to avoid abrupt and artificial stops of the paths, we computed intermediate beads which would mark the most likely end of the path (see Fig. \ref{fig:interp} C). The position of these beads added in between arrays (let them be \(b^{proj}_{i,j}\)) was computed as follow:
 91 | \begin{equation}\label{eq:proj}
 92 | c(b^{proj}_{i,j})=c({b_{i,j}})+\frac{n}{|\mathcal{N}^p_{i,j}|}\sum_{b_{i,k}\in \mathcal{N}^p_{i,j}}\overrightarrow{b_{i,k},p(b_{i,k})}
 93 | \end{equation}
 94 | where \(p(b_{i,k})\) is the projection of \(b_{i,k}\) in the following array. \(\mathcal{N}_{i,j}\) is the set of beads that are neighbors of \(b_{i,j}\) according to the Gabriel graph tessellation of the set of beads for a given array.
 95 | \(\mathcal{N} ^p_{i,j}\subset \mathcal{N}_{i,j}\) is the subset of neighbors of the bead \(b_{i,j}\) that have a projection, meaning that \(p(b_{i,j})\) exists.
 96 | \(n\) is the fraction of the average projection to consider. \(n\) is computed with regard to the distance of the bead \(b_{i,j}\) to its closest neighbor that has a projection.
 97 | The closer a bead is to a bead with a projection, the larger \(n\) will be, meaning that if a bead is close to a bead that is linked to another bead in the following array, the longer it will exist in between the arrays.
 98 | Note that the Gabriel graph tessellation is a decimation of the more classical vorono\"i tessellation which avoids most of the usual issues of the vorono\"i tessellation especially when it comes to concave set of positions (see \cite{mcdole:2018}).
 99 | \begin{figure}
100 | \center
101 | \includegraphics[width=0.8\textwidth]{figures/interpolation}
102 | \caption{\textbf{Schematics of inter array bead interpolation.} A. Schematics of original data. Dashed lines represent the arrays, circles represent the beads. B. Pairing between beads of consecutive arrays forming the different paths. C. When necessary, addition of the in-between array projections. D. Resulting splines from the bead paring. E. Interpolation of beads in between the arrays, the grey beads are the interpolated ones.}\label{fig:interp}
103 | \end{figure}
104 | \subsection{Interpolation from the paths}
105 | \paragraph{}Once the paths are created, all existing beads belong to a single path.
106 | From these paths, positions and gene expressions can be interpolated.
107 | Here, we interpolated the positions and gene expressions in between arrays as univariate cubic splines. Therefore, it is possible to retrieve the set of beads that live at any given \(z\) position together with the interpolated gene expression (see Fig. \ref{fig:interp} D-E).
108 | \section{Spatial differential expression}
109 | \paragraph{}The goal here was to score genes on whether they were locally expressed or not within a given tissue.
110 | To do so, we quantified if the set of genes that are considered expressing are spatially positioned next to each other.
111 | The metric we decided to use is based on the fact that the average degree of a graph (or network) is related to the average degree of that graph with randomly removed nodes.
112 | If a graph \(G\) has a density of \(d(G)=k\), then if a fraction \(f\) of nodes are removed randomly, building the new graph \(G'\) , the density of \(G'\)  is \(d(G')\sim k(1-f)\).
113 | But, if the nodes are not distributed randomly but are rather spatially localised, then the density of the new graph will be higher than the expected value: \(d(G')>k(1-f)\).
114 | 
115 | \paragraph{}Having the previous paragraph in mind, it means that the more a gene expression will be localised, the further away the density of the graph of expressing beads will be from the expected value of $k(1-f)$
116 | \paragraph{}In the context of our study, \(G = (V, E)\) is the graph where the vertices are the beads with their spatial positions.
117 | Then, to build the set of edges \(E\) between the vertices we computed the Gabriel graph on \(V\).
118 | Finally, the fraction of removed nodes (\(f\)) is the fraction of the total number of beads in which beads are not expressing a given gene.
119 | \paragraph{}To compute \(f\) we first computed the expression threshold above which a bead is considered expressing.
120 | The threshold was computed independently for each gene-tissue pair as the value that splits the distribution of expression values for each gene within each tissue into two classes (expressing and not expressing beads).
121 | For that we used the Otsu method \citep{Otsu:1979} which splits a distribution into two classes such that the intra-class variance is minimum (and therefore maximising the inter-class variance).
122 | For each gene \(g\), we therefore have computed a threshold \(O_{th}(g)\).
123 | \paragraph{}Having split the beads in two separate classes, we create the new graph \(G'_{g} = (V'_g, E'_g)\) where \(V'_g=\{v\in V~|~O_{th}(g)<ge(g, v)\}\) with \(ge(g,v)\) is the expression of the gene \(g\) of the vertice (or bead) \(v\).
124 | The set of edges \(E'_g\) is then the set of edges included in \(E_g\) such that both vertices of the edges in \(E'_g\) are in the set of expressing beads (\(V'_g\)): \(E'_g=\{(v_1, v_2) \in E_g~|~(v_1, v_2)\in {V'}_g^{2}\}\).
125 | \(G'_g\) is the graph of connections between beads expressing a given gene \(g\).
126 | \paragraph{}We then computed the densities of \(G_g\) and \(G'_g\) as followed:
127 | \begin{equation}
128 | d(G_g)=\frac{2.|E_g|}{|V_g|(|V_g|-1)}.
129 | \end{equation}
130 | The fraction of removed nodes is computed as \(f_g=1-\frac{|V'_g|}{|V_g|}\).
131 | This value can be seen as the ratio of volume expressing a given gene over the total volume.
132 | We also normalised the values so they are comparable across different genes and tissues, we are therefore looking at \(f_g\) (which is already normalised) and \(d_g=\frac{d(G'_g)}{d(G_g)}\).
133 | \paragraph{}We can then look at \(d_g\) against \(f_g\) knowing that \(d(G'_g)\sim d(G_g)f_g\) and therefore \(d_g\sim f_g\).
134 | Because of the potential noise of the dataset, instead of taking the theoretical value, we computed the linear regression between the distributions \(d_g\) and \(f_g\) for each gene within a given tissue.
135 | Then, for each gene, the distance to the linear regression is calculated which is the score for spatially expressed genes.
136 | The higher the score, the further the gene is from the expected value, and therefore the further it is from being randomly expressed in space, meaning that it is locally expressed.
137 | 
138 | \section{Comparison to PASTE}
139 | \paragraph{}We compared the results of our inter-array registration algorithm to the ones of PASTE described in \cite{Zeira:2022}.
140 | The main difference between sc3D registration method and PASTE's is that our method relies heavily on the pre-annotation of the tissues of the dataset.
141 | PASTE on the other hand only uses RNAI counts per bead and finds the transformation that minimises the difference between paired beads across arrays.
142 | For that reason PASTE allows to perform array registration without having to pre-process the data as much as sc3D but sc3D is usually significantly faster since the optimisation part is done at a higher level.
143 | \paragraph{}To compare PASTE to sc3D we used both our dataset and the human dorsolateral prefrontal cortex (DLPFC) from \cite{Maynard:2021} that was used in the PASTE article.
144 | When we ran PASTE on the DLPFC dataset we used the optimal parameters provided by the article.
145 | When we ran PASTE on our dataset, we tried different parameterisation and kept the best.
146 | To compare our results to those of PASTE we used a similar metric than the one used in the PASTE article for non simulated datasets: the fraction of beads that have their closest bead in the neighbouring arrays belonging to the same tissue.
147 | To compute our accuracy metric, we first compute a pairing between beads of consecutive arrays.
148 | The pairing is a spatial proximity pairing where two beads are paired if they are closed spatially.
149 | Effectively we computed the optimal pairing \(\pi^\star\) over the set of all the possible pairings \(\Pi\) that minimises the sum of the distances between paired beads:
150 | \begin{equation}
151 | \pi^\star = \argmin_{\pi\in \Pi}\sum_{(b_{i,j}, b_{i+1,k})\in \pi} \left\Vert b_{i,j}, b_{i+1,k}\right\Vert_2
152 | \end{equation}
153 | where \((b_{i,j}, b_{i+1,k})\in \pi\) are two paired beads in \(\pi\). To do so we used the Hungarian algorithm that gives an optimal result.
154 | Given this pairing, we can now count the percentage of pairs that have the same tissue over the total number of pairs. The higher this percentage is, the better the alignment.
155 | Figures \ref{fig:qual-DLPFC} and \ref{fig:qual-Mouse} show the accuracy of the alignment between our method and PASTE.
156 | Note that for the DLPFC the results are similar and that for the mouse dataset sc3D performs better.
157 | It is important to keep in mind that the parameterization that we found for PASTE on our dataset might not be the best even though we tried to optimise it.
158 | \begin{figure}
159 | \center
160 | \includegraphics[width=0.8\textwidth]{figures/sc3D_vs_PASTE_DLPFC}
161 | \caption{\textbf{Quality of alignment for PASTE and sc3D methods for the DLPFC dataset.}}\label{fig:qual-DLPFC}
162 | \end{figure}
163 | \begin{figure}
164 | \center
165 | \includegraphics[width=0.8\textwidth]{figures/sc3D_vs_PASTE_Mouse}
166 | \caption{\textbf{Quality of alignment for PASTE and sc3D methods for our Mouse dataset at E8.5.} The subset of tissues for the second histogram is the following: Allantois, PP ectoderm, LPM, Somites, Heart, Brain, PSM, NMPs, Neural crest, Pharyngeal arch, Neural tube}\label{fig:qual-Mouse}
167 | \end{figure}
168 | \paragraph{}As another point of comparison, we recorded the time it took the two methods to align multiple arrays. We found our method to be one to two orders of magnitude faster (Figure \ref{fig:time-mouse}) on our dataset.
169 | \begin{figure}
170 | \center
171 | \includegraphics[width=0.8\textwidth]{figures/sc3D_vs_PASTE_time}
172 | \caption{\textbf{Processing time for the Mouse dataset}}\label{fig:time-mouse}
173 | \end{figure}
174 | \bibliographystyle{apalike}
175 | \bibliography{references}
176 | \end{document}


--------------------------------------------------------------------------------