├── .gitignore
├── ENACT_demo.ipynb
├── ENACT_outputs_demo.ipynb
├── LICENSE.md
├── MANIFEST.in
├── Makefile
├── README.md
├── config
└── configs.yaml
├── figs
├── pipelineflow.png
└── tissuumaps.png
├── pyproject.toml
├── reproduce_paper_results.sh
├── requirements.txt
├── run_cell_ann_eval.sh
├── run_enact.sh
├── sample_enact_output.html
├── setup_py_env.sh
├── src
├── enact
│ ├── __init__.py
│ ├── assignment_methods
│ │ ├── __init__.py
│ │ ├── naive.py
│ │ ├── weight_by_area.py
│ │ └── weight_by_gene.py
│ ├── cellassign.py
│ ├── celltypist.py
│ ├── package_results.py
│ ├── pipeline.py
│ └── utils
│ │ └── logging.py
├── eval
│ ├── cell_annotation_eval.py
│ └── paper_eval-cellassign-methods-highlevel.ipynb
├── main.py
└── synthetic_data
│ ├── generate_synthetic_data_Xenium.ipynb
│ └── generate_synthetic_data_seqFISH.ipynb
└── templates
└── tmap_template.tmap
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .hypothesis/
3 | .pytest_cache/
4 | .coverage
5 | SpatialOneHD.log
6 | ENACT.log
7 | cache/*
8 | .ipynb_checkpoints/
9 | src/.ipynb_checkpoints/
10 | templates/.ipynb_checkpoints/
11 | data/
12 | ENACT_supporting_files/
13 | ENACT_supporting_files.zip
14 | .idea/*
15 | .checkmarx/scan_results/*
16 | src/cache/*
17 | src/cache-pathologist/*
18 | src/binned_outputs/*
19 | src/binned_outputs-mouse/*
--------------------------------------------------------------------------------
/ENACT_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "88dfe185-575f-484b-87a5-662d54a8aa14",
6 | "metadata": {},
7 | "source": [
8 | "## ENACT Demo Notebook - Human Colorectal Cancer"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "ef3a94f5-4189-4c46-b4fa-570989cb78e9",
14 | "metadata": {},
15 | "source": [
16 | "This notebook provides a demo for running ENACT on the Human Colorectal Cancer sample provided on 10X Genomics' website."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "31994db6-6997-4124-a4d5-bf09dbf64f69",
22 | "metadata": {},
23 | "source": [
24 | "### Download VisiumHD data from the 10X Genomics website"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "e56081b4-2eb0-45e4-9f46-7ed118b51551",
30 | "metadata": {},
31 | "source": [
32 | "Whole slide image: full resolution tissue image"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "712a9e76-d7e1-4cc1-b0ae-afad223a1713",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "!curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_tissue_image.btf"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "bfa4bd8e-4b4d-4593-b5f2-8cc881c1a2b1",
48 | "metadata": {},
49 | "source": [
50 | "Visium HD output file. The transcript counts are provided in a .tar.gz file that needs to be extracted:"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "6f7bc5a4-6f56-4ffa-8b1c-9c178d5c6022",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "!curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz\n",
61 | "!tar -xvzf Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "id": "c838e47e-0e91-4462-a099-ff061cd4f94f",
67 | "metadata": {},
68 | "source": [
69 | "Locate the following two files from the extracted outputs file. These are the files we will use later as input to ENACT.\n"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "id": "08ea9a56-14c5-4ebc-bb09-e7535bbc1fee",
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | ".\n",
80 | "└── binned_outputs/\n",
81 | " └── square_002um/\n",
82 | " ├── filtered_feature_bc_matrix.h5 <---- Transcript counts file (2um resolution)\n",
83 | " └── spatial/\n",
84 | " └── tissue_positions.parquet <---- Bin locations relative to the full resolution image\n"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "id": "8d760ee8-f5a0-4a0f-ace6-c0b91176f4e1",
90 | "metadata": {},
91 | "source": [
92 | "### Install ENACT\n",
93 | "This will install the ENACT package and its dependencies.\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "id": "d555ae41-0776-4047-bfe2-1ee3ebc475bb",
100 | "metadata": {
101 | "scrolled": true,
102 | "tags": []
103 | },
104 | "outputs": [],
105 | "source": [
106 | "!pip install enact-SO"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "id": "39736162-ba27-435f-8dc6-876b2f507315",
112 | "metadata": {},
113 | "source": [
114 | "### Access and update the `configs.yaml` file"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "id": "56f81b7a-e58e-498b-8589-0fd9cfc82c08",
120 | "metadata": {},
121 | "source": [
122 | "To run the ENACT pipeline, you will need a configuration file that specifies all the required settings. You can download the template configuration file from the GitHub repository.\n",
123 | "\n",
124 | "Refer to [Defining ENACT Configurations](https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact/tree/release/ospo-new?tab=readme-ov-file#defining-enact-configurations) for a full list of parameters to configure."
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "19207eb1-f22a-48d7-80ba-08b6fc118872",
130 | "metadata": {},
131 | "source": [
132 | "#### Step 1\n",
133 | "Download the `configs.yaml` template from the `config` folder of [this repository](https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact), and save it in your working directory."
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "id": "8996161d-5164-4931-bfdc-ca0065686d44",
139 | "metadata": {},
140 | "source": [
141 | "#### Step 2\n",
142 | "Edit the input file locations in `configs.yaml` to the downloaded Visium HD files' location."
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "id": "ff15bcbd-681e-4947-8277-cffc30f69df4",
148 | "metadata": {},
149 | "source": [
150 | "```yaml\n",
151 | "analysis_name: \"demo-colon\" \n",
152 | "cache_dir: \"enact_output\" \n",
153 | "paths:\n",
154 | " wsi_path: \"Visium_HD_Human_Colon_Cancer_tissue_image.btf\" \n",
155 | " visiumhd_h5_path: \"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\" \n",
156 | " tissue_positions_path: \"binned_outputs/square_002um/spatial/tissue_positions.parquet\" \n",
157 | "```"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "id": "81daa91f-e34e-4018-8a46-89ddb9b6cf99",
163 | "metadata": {},
164 | "source": [
165 | "#### Step 3\n",
166 | "Next, we set all the steps in the `configs.yaml` file to `True`, in order to run the whole ENACT pipeline later"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "id": "8a4eb5a6-2436-4cf3-8b52-e06d431fc3a0",
172 | "metadata": {},
173 | "source": [
174 | "```yaml\n",
175 | "steps:\n",
176 | " segmentation: True \n",
177 | " bin_to_geodataframes: True \n",
178 | " bin_to_cell_assignment: True \n",
179 | " cell_type_annotation: True \n",
180 | "```"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "id": "4af04737-6ece-431f-b5d5-1eaefe63efca",
186 | "metadata": {},
187 | "source": [
188 | "#### Step 4\n",
189 | "Lastly, choose the `bin_to_cell_method` and `cell_annotation_method` we want to run with. In this demo, we will go with `\"weighted_by_area\"`, and `\"celltypist\"`.\n",
190 | "\n",
191 | "To run Celltypist as our cell annotation method, we also need to input the `cell_typist_model` parameter based on the type of sample we use."
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "id": "9b495d82-cfce-4973-aed0-84aec7d2ac31",
197 | "metadata": {},
198 | "source": [
199 | "```yaml\n",
200 | " params:\n",
201 | " bin_to_cell_method: \"weighted_by_area\" \n",
202 | " cell_annotation_method: \"celltypist\" \n",
203 | " cell_typist_model: \"Human_Colorectal_Cancer.pkl\" \n",
204 | " seg_method: \"stardist\" \n",
205 | " patch_size: 4000 \n",
206 | " use_hvg: True \n",
207 | " n_hvg: 1000 \n",
208 | "```"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "id": "13a165e4-7f63-4cfd-80ed-52a0823692f9",
214 | "metadata": {},
215 | "source": [
216 | "### Run ENACT"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "id": "2aadbd97-ddf2-4252-9bb0-c59ba4600c4c",
222 | "metadata": {},
223 | "source": [
224 | "Running ENACT on the whole sample image will take around 40 minutes. Output of the pipeline will be stored in the `\"enact_output\"` directory"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "id": "393087e7-9598-4ebe-a628-14cc0ac673a8",
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "from enact.pipeline import ENACT\n",
235 | "import yaml\n",
236 | "\n",
237 | "configs_path = \"config/configs.yaml\" # Change this to the location of the configs.yaml file that you just edited\n",
238 | "with open(configs_path, \"r\") as stream:\n",
239 | " configs = yaml.safe_load(stream)\n",
240 | "\n",
241 | "so_hd = ENACT(configs_dict=configs)\n",
242 | "so_hd.run_enact()"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "id": "cfc23f74",
248 | "metadata": {},
249 | "source": [
250 | "New! Alternatively, users can specify ENACT configurations directly in the class constructor with the following *minimum* configurations. Refer to Readme for full list of ENACT parameters."
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "id": "87648e0c",
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "from enact.pipeline import ENACT\n",
261 | "\n",
262 | "# Running ENACT with `weighted-by-area` bin-to-cell assignment, and `celltypist` for cell type annotation\n",
263 | "so_hd = ENACT(\n",
264 | " cache_dir=\"/home/oneai/test_cache\",\n",
265 | " wsi_path=\"Visium_HD_Human_Colon_Cancer_tissue_image.btf\",\n",
266 | " visiumhd_h5_path=\"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\",\n",
267 | " tissue_positions_path=\"binned_outputs/square_002um/spatial/tissue_positions.parquet\",\n",
268 | " analysis_name=\"demo-colon\", #optional\n",
269 | " bin_to_cell_method=\"weighted_by_area\", #optional \n",
270 | " cell_annotation_method=\"celltypist\", #optional \n",
271 | " cell_typist_model=\"Human_Colorectal_Cancer.pkl\" #optional \n",
272 | ")\n",
273 | "so_hd.run_enact()"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "id": "ac1595f3",
279 | "metadata": {},
280 | "source": [
281 | "Example: Only running the cell segmentation step and disabling all the other steps"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "id": "1fbef539",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "from enact.pipeline import ENACT\n",
292 | "\n",
293 | "so_hd = ENACT(\n",
294 | " cache_dir=\"/home/oneai/test_cache\",\n",
295 | " wsi_path=\"Visium_HD_Human_Colon_Cancer_tissue_image.btf\",\n",
296 | " visiumhd_h5_path=\"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\",\n",
297 | " tissue_positions_path=\"binned_outputs/square_002um/spatial/tissue_positions.parquet\",\n",
298 | " analysis_name=\"demo-colon\",\n",
299 | " bin_to_cell_method=\"weighted_by_area\", \n",
300 | " cell_annotation_method=\"celltypist\", \n",
301 | " cell_typist_model=\"Human_Colorectal_Cancer.pkl\",\n",
302 | " segmentation=True,\n",
303 | " bin_to_geodataframes=False,\n",
304 | " bin_to_cell_assignment=False,\n",
305 | " cell_type_annotation=False\n",
306 | ")\n",
307 | "so_hd.run_enact()"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "id": "2ae423f9",
313 | "metadata": {},
314 | "source": [
315 | "Example: Running ENACT with `naive` bin-to-cell assignment and `cellassign` for cell type annotation"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "from enact.pipeline import ENACT\n",
325 | "\n",
326 | "so_hd = ENACT(\n",
327 | " cache_dir=\"/home/oneai/test_cache\",\n",
328 | " wsi_path=\"Visium_HD_Human_Colon_Cancer_tissue_image.btf\",\n",
329 | " visiumhd_h5_path=\"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\",\n",
330 | " tissue_positions_path=\"binned_outputs/square_002um/spatial/tissue_positions.parquet\",\n",
331 | " analysis_name=\"demo-colon\",\n",
332 | " bin_to_cell_method=\"naive\", \n",
333 | " cell_annotation_method=\"cellassign\"\n",
334 | ")\n",
335 | "so_hd.run_enact()"
336 | ]
337 | }
338 | ],
339 | "metadata": {
340 | "kernelspec": {
341 | "display_name": "Python 3 (ipykernel)",
342 | "language": "python",
343 | "name": "python3"
344 | },
345 | "language_info": {
346 | "codemirror_mode": {
347 | "name": "ipython",
348 | "version": 3
349 | },
350 | "file_extension": ".py",
351 | "mimetype": "text/x-python",
352 | "name": "python",
353 | "nbconvert_exporter": "python",
354 | "pygments_lexer": "ipython3",
355 | "version": "3.10.14"
356 | }
357 | },
358 | "nbformat": 4,
359 | "nbformat_minor": 5
360 | }
361 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | **Copyright Sanofi 2024**
2 |
3 | Permission is hereby granted, free of charge, for academic research purposes only and for non-commercial uses only, to any person from academic research or non-profit organizations obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, or merge the Software, subject to the following conditions: this permission notice shall be included in all copies of the Software or of substantial portions of the Software.
4 |
5 | For purposes of this license, “non-commercial use” excludes uses foreseeably resulting in a commercial benefit. To use this software for other purposes (such as the development of a commercial product, including but not limited to software, service, or pharmaceuticals, or in a collaboration with a private company), please contact SANOFI at patent.gos@sanofi.com.
6 |
7 | All other rights are reserved. The Software is provided “as is”, without warranty of any kind, express or implied, including the warranties of noninfringement.
8 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE.md
3 | include config/configs.yaml
4 |
5 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ENV_DIR := /home/oneai/envs/
2 |
3 | PY_ENV_NAME := enact_py_env
4 |
5 | PY_ENV_PATH := $(ENV_DIR)$(PY_ENV_NAME)
6 |
7 | CONFIG_PATH ?= config/configs.yaml
8 |
9 | create-env:
10 | conda create --prefix $(PY_ENV_PATH) python=3.10
11 |
12 | run_enact:
13 | bash setup_py_env.sh $(PY_ENV_PATH)
14 | bash run_enact.sh $(PY_ENV_PATH) ${CONFIG_PATH}
15 |
16 | setup_py_env:
17 | bash setup_py_env.sh $(PY_ENV_PATH)
18 |
19 | run_cell_ann_eval:
20 | bash setup_py_env.sh $(PY_ENV_PATH)
21 | bash run_cell_ann_eval.sh $(PY_ENV_PATH)
22 |
23 | reproduce_results:
24 | bash setup_py_env.sh $(PY_ENV_PATH)
25 | bash reproduce_paper_results.sh $(PY_ENV_PATH)
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ENACT: End-to-End Analysis and Cell Type Annotation for Visium High Definition (HD) Slides
2 |
3 | >[!NOTE]
4 | >This is the official repo for [ENACT](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btaf094/8063614). The manuscript can be accessed through [Bioinformatics Journal](https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btaf094/62340410/btaf094.pdf).
5 |
6 | Spatial transcriptomics (ST) enables the study of gene expression within its spatial context in histopathology samples. To date, a limiting factor has been the resolution of sequencing based ST products. The introduction of the Visium High Definition (HD) technology opens the door to cell resolution ST studies. However, challenges remain in the ability to accurately map transcripts to cells and in cell type assignment based on spot data.
7 |
8 | ENACT is the first tissue-agnostic pipeline that integrates advanced cell segmentation with Visium HD transcriptomics data to infer cell types across whole tissue sections. Our pipeline incorporates novel bin-to-cell assignment methods, enhancing the accuracy of single-cell transcript estimates. Validated on diverse synthetic and real datasets, our approach demonstrates high effectiveness at predicting cell types and scalability, offering a robust solution for spatially resolved transcriptomics analysis.
9 |
10 | This repository has the code for inferring cell types from the sub-cellular transcript counts provided by VisiumHD.
11 |
12 | This can be achieved through the following steps:
13 |
14 | 1. **Cell segmentation**: segment high resolution image using NN-based image segmentation networks such as Stardist.
15 | 2. **Bin-to-cell assignment**: Obtain cell-wise transcript counts by aggregating the VisiumHD bins that are associated with each cell
16 | 3. **Cell type inference**: Use the cell-wise transcript counts to infer the cell labels/phenotypes using methods used for single-cell RNA seq analysis ([CellAssign](https://www.nature.com/articles/s41592-019-0529-1#:~:text=CellAssign%20uses%20a%20probabilistic%20model%20to%20assign%20single) or [CellTypist](https://pubmed.ncbi.nlm.nih.gov/35549406/#:~:text=To%20systematically%20resolve%20immune%20cell%20heterogeneity%20across%20tissues,) or [Sargent](https://www.sciencedirect.com/science/article/pii/S2215016123001966#:~:text=We%20present%20Sargent,%20a%20transformation-free,%20cluster-free,%20single-cell%20annotation) if installed) or novel approaches, and use comprehensive cell marker databases ([Panglao](https://panglaodb.se/index.html) or [CellMarker](http://xteam.xbio.top/CellMarker/) can be used as reference).
17 |
18 | >[!NOTE]
19 | > [Sargent](https://doi.org/10.1016/j.mex.2023.102196) (doi: https://doi.org/10.1016/j.mex.2023.102196) needs to be installed and set up independently. [Sargent](https://doi.org/10.1016/j.mex.2023.102196) is currently available on the [author's GitHub page](https://github.com/nourin-nn/sargent/). For additional information on Sargent's usage and license, please contact the paper's corresponding authors (nima.nouri@sanofi.com) or check their GitHub page.
20 | >
21 | > We provide the results obtained by Sargent in [ENACT's Zenodo page](https://doi.org/10.5281/zenodo.15211043) under the following folders:
22 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/naive/sargent_results/
23 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_area/sargent_results/
24 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_transcript/sargent_results/
25 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_cluster/sargent_results/
26 |
27 |
31 | 
32 |
33 | ## Index of Instructions:
34 | 1. Installation
35 | - [System Requirements](#system-requirements)
36 | - [Install ENACT from Source](#install-enact-from-source)
37 | - [Install ENACT with Pip](#install-enact-with-pip)
38 | 2. Inputs and Outputs
39 | - [Input Files for ENACT](#input-files-for-enact)
40 | - [Defining ENACT Configurations](#defining-enact-configurations)
41 | - [Output Files for ENACT](#output-files-for-enact)
42 | 3. Running ENACT
43 | - [Basic Example: Running ENACT from Notebook](#basic-example-running-enact-from-notebook)
44 | - [Basic Example: Running ENACT from Terminal](#basic-example-running-enact-from-terminal)
45 | - [Running Instructions](#running-instructions)
46 | 4. Visualizing Outputs
47 | - [Working with ENACT Output](#working-with-enact-output)
48 | - [Visualizing Results on TissUUmaps](#visualizing-results-on-tissuumaps)
49 | 5. Reproducing Paper Results
50 | - [Reproducing Paper Results](#reproducing-paper-results)
51 | - [Creating Synthetic VisiumHD Datasets](#creating-synthetic-visiumhd-datasets)
52 | 6. [Citing ENACT](#citing-enact)
53 |
54 | ## System Requirements
55 | ENACT was tested with the following specifications:
56 | * Hardware Requirements: 32 CPU, 64GB RAM, 100 GB (hard disk and memory requirements may vary depending on whole slide image size; if the whole slide image is small, the memory requirements can be significantly decreased)
57 |
58 | * Software: Python 3.10, (Optional) GPU (CUDA 11)
59 |
60 | ## Install ENACT from Source
61 | ### Step 1: Clone ENACT repository
62 | ```
63 | git clone https://github.com/Sanofi-Public/enact-pipeline.git
64 | cd enact-pipeline
65 | ```
66 | ### Step 2: Setup Python environment
67 | Start by defining the location and the name of the Conda environment in the `Makefile`:
68 | ```
69 | ENV_DIR := /home/oneai/envs/ <---- Conda environment location
70 | PY_ENV_NAME := enact_py_env <---- Conda environment name
71 | ```
72 | Next, run the following Make command to create a Conda environment with all of ENACT's dependencies
73 | ```
74 | make setup_py_env
75 | ```
76 |
77 | ## Install ENACT with Pip
78 | ENACT can be installed from [Pypi](https://pypi.org/project/enact-SO/) using:
79 | ```
80 | pip install enact-SO
81 | ```
82 |
83 | ## Input Files for ENACT
84 | ENACT requires only three files, which can be obtained from SpaceRanger’s outputs for each experiment:
85 |
86 | 1. **Whole resolution tissue image**. This will be segmented to obtain the cell boundaries that will be used to aggregate the transcript counts.
87 | 2. **tissue_positions.parquet**. This is the file that specifies the *2um* Visium HD bin locations relative to the full resolution image.
88 | 3. **filtered_feature_bc_matrix.h5**. This is the .h5 file with the *2um* Visium HD bin counts.
89 |
90 | ## Defining ENACT Configurations
91 | ENACT users can choose to specify the configurations via one of two ways:
92 |
93 | 1. Passing them within the class constructor:
94 | ```
95 | from enact.pipeline import ENACT
96 |
97 | so_hd = ENACT(
98 | cache_dir="/home/oneai/test_cache",
99 | wsi_path="Visium_HD_Human_Colon_Cancer_tissue_image.btf",
100 | visiumhd_h5_path="binned_outputs/square_002um/filtered_feature_bc_matrix.h5",
101 | tissue_positions_path="binned_outputs/square_002um/spatial/tissue_positions.parquet",
102 | )
103 | ```
104 |
105 | Full list of ENACT parameters (click to expand)
106 |
107 | ## Parameters
108 |
109 | - **cache_dir (str)**:
110 | Directory to cache ENACT results. This must be specified by the user.
111 |
112 | - **wsi_path (str)**:
113 | Path to the Whole Slide Image (WSI) file. This must be provided by the user.
114 |
115 | - **visiumhd_h5_path (str)**:
116 | Path to the Visium HD h5 file containing spatial transcriptomics data. This
117 | must be provided by the user.
118 |
119 | - **tissue_positions_path (str)**:
120 | Path to the tissue positions file that contains spatial locations of barcodes.
121 | This must be provided by the user.
122 |
123 | - **analysis_name (str)**:
124 | Name of the analysis, used for output directories and results.
125 | *Default*: `"enact_demo"`.
126 |
127 | - **seg_method (str)**:
128 | Cell segmentation method.
129 | *Default*: `"stardist"`.
130 | *Options*: `["stardist"]`.
131 |
132 | - **patch_size (int)**:
133 | Size of patches (in pixels) to process the image. Use a smaller patch size to
134 | reduce memory requirements.
135 | *Default*: `4000`.
136 |
137 | - **use_hvg (bool)**:
138 | Whether to use highly variable genes (HVG) during the analysis.
139 | *Default*: `True`.
140 | *Options*: `[True]`.
141 |
142 | - **n_hvg (int)**:
143 | Number of highly variable genes to use if `use_hvg` is `True`.
144 | *Default*: `1000`.
145 |
146 | - **n_clusters (int)**:
147 | Number of clusters. Used only if `bin_to_cell_method` is `"weighted_by_cluster"`.
148 | *Default*: `4`.
149 |
150 | - **bin_representation (str)**:
151 | Representation type for VisiumHD bins.
152 | *Default*: `"polygon"`.
153 | *Options*: `["polygon"]`.
154 |
155 | - **bin_to_cell_method (str)**:
156 | Method to assign bins to cells.
157 | *Default*: `"weighted_by_cluster"`.
158 | *Options*: `["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"]`.
159 |
160 | - **cell_annotation_method (str)**:
161 | Method for annotating cell types.
162 | *Default*: `"celltypist"`.
163 | *Options*: `["celltypist", "sargent" (if installed), "cellassign"]`.
164 |
165 | - **cell_typist_model (str)**:
166 | Path to the pre-trained CellTypist model for cell type annotation. Only used if
167 | `cell_annotation_method` is `"celltypist"`.
168 | Refer to [CellTypist Models](https://www.celltypist.org/models) for a list of
169 | available models.
170 | *Default*: `""` (empty string).
171 |
172 | - **run_synthetic (bool)**:
173 | Whether to run synthetic data generation for testing purposes.
174 | *Default*: `False`.
175 |
176 | - **segmentation (bool)**:
177 | Flag to run the image segmentation step.
178 | *Default*: `True`.
179 |
180 | - **bin_to_geodataframes (bool)**:
181 | Flag to convert the bins to GeoDataFrames.
182 | *Default*: `True`.
183 |
184 | - **bin_to_cell_assignment (bool)**:
185 | Flag to run bin-to-cell assignment.
186 | *Default*: `True`.
187 |
188 | - **cell_type_annotation (bool)**:
189 | Flag to run cell type annotation.
190 | *Default*: `True`.
191 |
192 | - **cell_markers (dict)**:
193 | A dictionary of cell markers used for annotation. Only used if `cell_annotation_method`
194 | is one of `["sargent", "cellassign"]`.
195 |
196 | - **chunks_to_run (list)**:
197 | Specific chunks of data to run the analysis on, typically for debugging.
198 | *Default*: `[]` (runs all chunks).
199 |
200 | - **configs_dict (dict)**:
201 | Dictionary containing ENACT configuration parameters. If provided, the values
202 | in `configs_dict` will override any corresponding parameters passed directly
203 | to the class constructor. This is useful for running ENACT with a predefined
204 | configuration for convenience and consistency.
205 | *Default*: `{}` (uses the parameters specified in the class constructor).
206 |
207 |
208 |
209 | 2. Specifying configurations in a `yaml` file: (sample file located under `config/configs.yaml`):
210 | ```yaml
211 | analysis_name: <---- custom name for analysis. Will create a folder with that name to store the results
212 | run_synthetic: False <---- True if you want to run bin to cell assignment on synthetic dataset, False otherwise
213 | cache_dir: <---- path to store pipeline outputs
214 | paths:
215 | wsi_path: <---- path to whole slide image
216 | visiumhd_h5_path: <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics
217 | tissue_positions_path: <---- location of the tissue_positions.parquet file from 10X Genomics
218 | steps:
219 | segmentation: True <---- True if you want to run segmentation
220 | bin_to_geodataframes: True <---- True to convert bin to geodataframes
221 | bin_to_cell_assignment: True <---- True to bin-to-cell assignment
222 | cell_type_annotation: True <---- True to run cell type annotation
223 | params:
224 | bin_to_cell_method: "weighted_by_cluster" <---- bin-to-cell assignment method. Pick one of ["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"]
225 | cell_annotation_method: "celltypist" <---- cell annotation method. Pick one of ["cellassign", "celltypist"]
226 | cell_typist_model: "Human_Colorectal_Cancer.pkl" <---- CellTypist model weights to use. Update based on organ of interest if cell_annotation_method is set to "celltypist"
227 | seg_method: "stardist" <---- cell segmentation method. Stardist is the only option for now
228 | image_type: "he" <---- image type. Options are ["he", "if"] (for H&E image or IF image, respectively.)
229 | nucleus_expansion: True <---- flag to enable nuclei expansion to get cell boundaries. Default is True.
230 | expand_by_nbins: 2 <---- number of bins to expand the nuclei by to get cell boundaries. Default is 2 bins.
231 | patch_size: 4000 <---- defines the patch size. The whole resolution image will be broken into patches of this size. Reduce if you run into memory issues
232 | use_hvg: True <---- True only run analysis on top n highly variable genes. Setting it to False runs ENACT on all genes in the counts file
233 | n_hvg: 1000 <---- number of highly variable genes to use. Default is 1000.
234 | destripe_norm: False <---- flag to enable destripe normalization (Bin2cell normalization). Recommend enable only for CellTypist. Disable for Sargent.
235 | n_clusters: 4 <---- number of cell clusters to use for the "weighted_by_cluster" method. Default is 4.
236 | n_pcs: 250 <---- number of principal components before clustering for Weighted-by-Cluster. Default is 250.
237 | stardist:
238 | block_size: 4096 <---- the size of image blocks the model processes at a time
239 | prob_thresh: 0.005 <---- value between 0 and 1, higher values lead to fewer segmented objects, but will likely avoid false positives
240 | overlap_thresh: 0.001 <---- value between 0 and 1, higher values allow segmented objects to overlap substantially
241 | min_overlap: 128 <---- overlap between blocks; should be larger than the size of a cell
242 | context: 128 <---- context pixels around the blocks to be included during prediction
243 | n_tiles: (4,4,1) <---- the input image is broken up into (overlapping) tiles that are processed independently and re-assembled. This parameter denotes a tuple of the number of tiles for every image axis
244 | stardist_modelname: "2D_versatile_he" <---- Specify one of the available Stardist models. 2D_versatile_fluo (for IF images) or 2D_versatile_he (for H&E images)
245 | channel_to_segment: 2 <---- Only applicable for IF images. This is the image channel to segment (usually the DAPI channel)
246 | cell_markers: <---- cell-gene markers to use for cell annotation. Only applicable if params/cell_annotation_method is "cellassign" or "sargent". No need to specify for "CellTypist"
247 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"]
248 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"]
249 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"]
250 | ```
251 |
252 | ## Output Files for ENACT
253 | ENACT outputs all its results under the `cache` directory which gets automatically created at run time:
254 | ```
255 | .
256 | └── cache/
257 | └── /
258 | ├── chunks/ # ENACT results at a chunk level
259 | │ ├── bins_gdf/
260 | │ │ └── patch_.csv
261 | │ ├── cells_gdf/
262 | │ │ └── patch_.csv
263 | │ └── /
264 | │ ├── bin_to_cell_assign/
265 | │ │ └── patch_.csv
266 | │ ├── cell_ix_lookup/
267 | │ │ └── patch_.csv
268 | │ └── _results/
269 | │ ├── cells_adata.csv
270 | │ └── merged_results.csv
271 | ├── tmap/ # Directory storing files to visualize results on TissUUmaps
272 | │ ├── _adata.h5
273 | │ ├── _tmap.tmap
274 | │ ├── cells_layer.png
275 | │ └── wsi.tif
276 | └── cells_df.csv # cells dataframe, each row is a cell with its coordinates
277 | ```
278 | ENACT breaks down the whole resolution image into "chunks" (or patches) of size `patch_size`. Results are provided per-chunk under the `chunks` directory.
279 | * `bins_gdf`: Folder containing GeoPandas dataframes representing the 2um Visium HD bins within a given patch
280 | * `cells_gdf`: Folder containing GeoPandas dataframes representing cells segmented in the tissue
281 | * `/bin_to_cell_assign`: Folder contains dataframes with the transcripts assigned to each cells
282 | * `/cell_ix_lookup`: Folder contains dataframes defining the indices and coordinates of the cells
283 | * `/_results/cells_adata.csv`: Anndata object containing the results from ENACT (cell coordinates, cell types, transcript counts)
284 | * `<bin_to_cell_method>/<cell_annotation_method>_results/merged_results.csv`: Dataframe (.csv) containing the results from ENACT (cell coordinates, cell types)
285 |
286 | ## Basic Example: Running ENACT from Notebook
287 | The **[demo notebook](ENACT_demo.ipynb)** provides a step-by-step guide on how to install and run ENACT on VisiumHD public data using notebook. The **[output processing demo notebook](ENACT_outputs_demo.ipynb)** provides a comprehensive, step-by-step guide on how the user can use the generated data for further downstream analysis (see [Working with ENACT Output](#working-with-enact-output) for additional details)
288 |
289 | ## Basic Example: Running ENACT from Terminal
290 | This section provides a guide for running ENACT on the [Human Colorectal Cancer sample](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-human-crc) provided on 10X Genomics' website.
291 | ### Step 1: Install ENACT from Source
292 | Refer to [Install ENACT from Source](#install-enact-from-source)
293 |
294 | ### Step 2: Download the necessary files from the 10X Genomics website:
295 |
296 | 1. Whole slide image: full resolution tissue image
297 | ```
298 | curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_tissue_image.btf
299 | ```
300 |
301 | 2. Visium HD output file. The transcript counts are provided in a .tar.gz file that needs to be extracted:
302 | ```
303 | curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz
304 | tar -xvzf Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz
305 | ```
306 | Locate the following two files from the extracted outputs file.
307 | ```
308 | .
309 | └── binned_outputs/
310 | └── square_002um/
311 | ├── filtered_feature_bc_matrix.h5 <---- Transcript counts file (2um resolution)
312 | └── spatial/
313 | └── tissue_positions.parquet <---- Bin locations relative to the full resolution image
314 | ```
315 |
316 | ### Step 3: Update input file locations and parameters under `config/configs.yaml`
317 |
318 | Refer to [Running Instructions](#running-instructions) for a full list of ENACT parameters to change.
319 |
320 | Below is a sample configuration file to use to run ENACT on the Human Colorectal cancer sample:
321 |
322 | ```yaml
323 | analysis_name: "colon-demo"
324 | run_synthetic: False # True if you want to run bin to cell assignment on synthetic dataset, False otherwise.
325 | cache_dir: "cache/ENACT_outputs" # Change according to your desired output location
326 | paths:
327 | wsi_path: "/Visium_HD_Human_Colon_Cancer_tissue_image.btf" # whole slide image path
328 | visiumhd_h5_path: "/binned_outputs/square_002um/filtered_feature_bc_matrix.h5" # location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics.
329 |   tissue_positions_path: "/binned_outputs/square_002um/spatial/tissue_positions.parquet" # location of the tissue_positions.parquet file from 10X Genomics
330 | steps:
331 | segmentation: True # True if you want to run segmentation
332 | bin_to_geodataframes: True # True to convert bin to geodataframes
333 | bin_to_cell_assignment: True # True to assign cells to bins
334 | cell_type_annotation: True # True to run cell type annotation
335 | params:
336 | seg_method: "stardist" # Stardist is the only option for now
337 | image_type: "if" # Image type: Options: ["he", "if"] (for H&E image or IF image, respectively.)
338 | nucleus_expansion: True # Flag to enable nuclei expansion to get cell boundaries
339 | expand_by_nbins: 2 # Number of bins to expand the nuclei by to get cell boundaries
340 | patch_size: 4000 # Defines the patch size. The whole resolution image will be broken into patches of this size
341 | bin_representation: "polygon" # or point TODO: Remove support for anything else
342 | bin_to_cell_method: "weighted_by_cluster" # or naive
343 | cell_annotation_method: "celltypist"
344 | cell_typist_model: "Human_Colorectal_Cancer.pkl"
345 | use_hvg: True # Only run analysis on highly variable genes + cell markers specified
346 | n_hvg: 1000 # Number of highly variable genes to use
347 | n_clusters: 4 # Number of clusters for Weighted-by-Cluster
348 | n_pcs: 250 # Number of principal components before clustering for Weighted-by-Cluster
349 | chunks_to_run: []
350 | stardist:
351 | block_size: 4096 # the size of image blocks the model processes at a time
352 | prob_thresh: 0.005 # value between 0 and 1, higher values lead to fewer segmented objects, but will likely avoid false positives
353 | overlap_thresh: 0.001 # value between 0 and 1, higher values allow segmented objects to overlap substantially
354 |     min_overlap: 128 # overlap between blocks, should be larger than the size of a cell
355 | context: 128 # context pixels around the blocks to be included during prediction
356 | n_tiles: (4,4,1) #the input image is broken up into (overlapping) tiles that are processed independently and re-assembled. This parameter denotes a tuple of the number of tiles for every image axis
357 |     stardist_modelname: "2D_versatile_he" # Specify one of the available Stardist models: 2D_versatile_fluo (for IF images) or 2D_versatile_he (for H&E images)
358 | channel_to_segment: 2 # Only applicable for IF images. This is the image channel to segment (usually the DAPI channel)
359 | cell_markers: # Only needed if cell_annotation_method is one of "Sargent" or "CellAssign"
360 | # Human Colon
361 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"]
362 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"]
363 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"]
364 | Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"]
365 | Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"]
366 | Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"]
367 | Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"]
368 | Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"]
369 | B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"]
370 | T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"]
371 | NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"]
372 |
373 | ```
374 |
375 | ## Running Instructions
376 | This section provides a guide on running ENACT on your own data
377 | ### Step 1: Install ENACT from Source
378 | Refer to [Install ENACT from Source](#install-enact-from-source)
379 |
380 | ### Step 2: Define the Location of ENACT's Required Files
381 | Define the locations of ENACT's required files in the `config/configs.yaml` file. Refer to [Input Files for ENACT](#input-files-for-enact)
382 | ```yaml
383 | analysis_name: <---- custom name for analysis. Will create a folder with that name to store the results
384 | cache_dir: <---- path to store pipeline outputs
385 | paths:
386 | wsi_path: <---- path to whole slide image
387 | visiumhd_h5_path: <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics.
388 |   tissue_positions_path: <---- location of the tissue_positions.parquet file from 10X Genomics
389 | ```
390 |
391 | ### Step 3: Define ENACT configurations
392 | Define the following core parameters in the `config/configs.yaml` file:
393 | ```yaml
394 | params:
395 | bin_to_cell_method: "weighted_by_cluster" <---- bin-to-cell assignment method. Pick one of ["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"]
396 | cell_annotation_method: "celltypist" <---- cell annotation method. Pick one of ["cellassign", "celltypist", "sargent" (if installed)]
397 |   cell_typist_model: "Human_Colorectal_Cancer.pkl" <---- CellTypist model weights to use. Update based on the organ of interest. Only needed if cell_annotation_method is set to "celltypist"
398 | ```
399 | Refer to [Defining ENACT Configurations](#defining-enact-configurations) for a full list of parameters to configure. If using CellTypist, set `cell_typist_model` to one of the following models based on the organ and species under study: [CellTypist models](https://www.celltypist.org/models#:~:text=CellTypist%20was%20first%20developed%20as%20a%20platform%20for).
400 |
401 | ### Step 4: Define Cell Gene Markers
402 | >[!NOTE]
403 | >Only applies if cell_annotation_method is "cellassign" or "sargent". Skip this step if using CellTypist
404 |
405 | Define the cell gene markers in `config/configs.yaml` file. Those can be expert annotated or obtained from open-source databases such as [Panglao](https://panglaodb.se/index.html) or [CellMarker](http://xteam.xbio.top/CellMarker/). Example cell markers for human colorectal cancer samples:
406 | ```yaml
407 | cell_markers:
408 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"]
409 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"]
410 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"]
411 | Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"]
412 | Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"]
413 | Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"]
414 | Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"]
415 | Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"]
416 | B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"]
417 | T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"]
418 | NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"]
419 | ```
420 | ### Step 5: Run ENACT
421 | ```
422 | make run_enact
423 | ```
424 |
425 | ## Working with ENACT Output
426 |
427 | The **[output demo notebook](ENACT_outputs_demo.ipynb)** provides a comprehensive, step-by-step guide on how to access and analyze output data from ENACT. The notebook covers the following topics:
428 |
429 | - **Loading the AnnData object in Python**
430 | Learn how to load the main data structure for single-cell analysis.
431 |
432 | - **Extracting cell types and their spatial coordinates**
433 | Access information about cell types and their positions in the tissue.
434 |
435 | - **Determining the number of shared and unique bins per cell**
436 | Explore metrics that characterize the bin and cell relationships.
437 |
438 | - **Accessing and visualizing the number of transcripts per cell**
439 | Visualize and analyze transcriptional activity across cells.
440 |
441 | - **Identifying the top-n expressed genes in the sample**
442 | Retrieve the most highly expressed genes in your dataset.
443 |
444 | - **Generating interactive plots**
445 | Visualize cell boundaries and cell types within the tissue using interactive visualizations.
446 |
447 | - **Performing downstream analysis**
448 | Run a sample analysis, such as neighborhood enrichment analysis, using external packages like **Squidpy**.
449 |
450 | This notebook serves as a helpful resource for navigating and analyzing ENACT output data effectively.
451 |
452 |
453 | ## Visualizing Results on TissUUmaps
454 | To view results on [TissUUmaps](https://tissuumaps.github.io), begin by installing TissUUmaps by following the instructions at:
455 | https://tissuumaps.github.io/TissUUmaps-docs/docs/intro/installation.html#.
456 |
457 | Once installed, follow the instructions at: https://tissuumaps.github.io/TissUUmaps-docs/docs/starting/projects.html#loading-projects
458 |
459 | For convenience, ENACT creates a TissUUmaps project file (.tmap extension) located under the `/tmap/` folder.
460 |
464 | 
465 |
466 | ## Reproducing Paper Results
467 | This section provides a guide on how to reproduce the ENACT paper results on the [10X Genomics Human Colorectal Cancer VisumHD sample](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-human-crc).
468 | Here, ENACT is run on various combinations of bin-to-cell assignment methods and cell annotation algorithms.
469 |
470 | ### Step 1: Install ENACT from Source
471 | Refer to [Install ENACT from Source](#install-enact-from-source)
472 |
473 | ### Step 2: Run ENACT on combinations of bin-to-cell assignment methods and cell annotation algorithms
474 | Run the following command, which will download all the supplementary files from [ENACT's Zenodo page](https://doi.org/10.5281/zenodo.15211043) and programmatically run ENACT with various combinations of bin-to-cell assignment methods and cell annotation algorithms:
475 | ```
476 | make reproduce_results
477 | ```
478 |
479 | ## Creating Synthetic VisiumHD Datasets
480 |
481 | 1. To create synthetic VisiumHD dataset from Xenium or seqFISH+ data, run and follow the instructions of the notebooks in [src/synthetic_data](src/synthetic_data).
482 |
483 | 2. To run the ENACT pipeline with the synthetic data, set the following parameters in the `config/configs.yaml` file:
484 |
485 | ```yaml
486 | run_synthetic: True <---- True if you want to run bin to cell assignment on synthetic dataset, False otherwise.
487 | ```
488 |
489 | 3. Run ENACT:
490 | ```
491 | make run_enact
492 | ```
493 |
494 | ## Citing ENACT
495 | If you use this repository or its tools in your research, please cite the following:
496 | ```
497 | @article{10.1093/bioinformatics/btaf094,
498 |     author = {Kamel, Mena and Song, Yiwen and Solbas, Ana and Villordo, Sergio and Sarangi, Amrut and Senin, Pavel and Sunaal, Mathew and Ayestas, Luis Cano and Levin, Clement and Wang, Seqian and Classe, Marion and Bar-Joseph, Ziv and Pla Planas, Albert},
499 | title = {ENACT: End-to-end Analysis of Visium High Definition (HD) Data},
500 | journal = {Bioinformatics},
501 | pages = {btaf094},
502 | year = {2025},
503 | month = {03},
504 |     abstract = {Spatial transcriptomics (ST) enables the study of gene expression within its spatial context in histopathology samples. To date, a limiting factor has been the resolution of sequencing based ST products. The introduction of the Visium High Definition (HD) technology opens the door to cell resolution ST studies. However, challenges remain in the ability to accurately map transcripts to cells and in assigning cell types based on the transcript data. We developed ENACT, a self-contained pipeline that integrates advanced cell segmentation with Visium HD transcriptomics data to infer cell types across whole tissue sections. Our pipeline incorporates novel bin-to-cell assignment methods, enhancing the accuracy of single-cell transcript estimates. Validated on diverse synthetic and real datasets, our approach is both scalable to samples with hundreds of thousands of cells and effective, offering a robust solution for spatially resolved transcriptomics analysis. ENACT source code is available at https://github.com/Sanofi-Public/enact-pipeline. Experimental data is available at https://doi.org/10.5281/zenodo.15211043. Supplementary data are available at Bioinformatics online.},
505 | issn = {1367-4811},
506 | doi = {10.1093/bioinformatics/btaf094},
507 | url = {https://doi.org/10.1093/bioinformatics/btaf094},
508 | eprint = {https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btaf094/62340410/btaf094.pdf},
509 | }
510 | ```
511 |
--------------------------------------------------------------------------------
/config/configs.yaml:
--------------------------------------------------------------------------------
1 | analysis_name: "colon-demo"
2 | run_synthetic: False # True if you want to run bin to cell assignment on synthetic dataset, False otherwise.
3 | cache_dir: "/home/oneai/enact-pipeline/ENACT_supporting_files/output_files"
4 | paths:
5 | wsi_path: "/home/oneai/enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/Visium_HD_Human_Colon_Cancer_tissue_image.btf"
6 | visiumhd_h5_path: "/home/oneai/enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/filtered_feature_bc_matrix.h5"
7 | tissue_positions_path: "/home/oneai/enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/tissue_positions.parquet"
8 | steps:
9 | segmentation: True # True if you want to run segmentation
10 | bin_to_geodataframes: True # True to convert bin to geodataframes
11 | bin_to_cell_assignment: True # True to assign cells to bins
12 | cell_type_annotation: True # True to run cell type annotation
13 | params:
14 | seg_method: "stardist" # Stardist is the only option for now
15 | image_type: "he" # Image type: Options: ["he", "if"] (for H&E image or IF image, respectively.)
16 | nucleus_expansion: True # Flag to enable nuclei expansion to get cell boundaries
17 | expand_by_nbins: 2 # Number of bins to expand the nuclei by to get cell boundaries
18 | patch_size: 4000 # Defines the patch size. The whole resolution image will be broken into patches of this size
19 | bin_representation: "polygon" # or point TODO: Remove support for anything else
20 | bin_to_cell_method: "weighted_by_area" # or naive/ weighted_by_cluster/ weighted_by_gene
21 | cell_annotation_method: "celltypist"
22 | cell_typist_model: "Human_Colorectal_Cancer.pkl" # only needed if using cell_annotation_method = "celltypist"
23 | use_hvg: True # Only run analysis on highly variable genes + cell markers specified
24 | n_hvg: 1000 # Number of highly variable genes to use
25 | destripe_norm: False # Flag to enable destripe normalization (Bin2cell normalization)
26 | n_clusters: 4 # Number of clusters for Weighted-by-Cluster
27 | n_pcs: 250 # Number of principal components before clustering for Weighted-by-Cluster
28 | chunks_to_run: [] # Chunks to run ENACT on specific patches
29 | stardist:
30 | block_size: 4096 # the size of image blocks the model processes at a time
31 | prob_thresh: 0.005 # value between 0 and 1, higher values lead to fewer segmented objects, but will likely avoid false positives
32 | overlap_thresh: 0.001 # value between 0 and 1, higher values allow segmented objects to overlap substantially
33 |     min_overlap: 128 # overlap between blocks, should be larger than the size of a cell
34 | context: 128 # context pixels around the blocks to be included during prediction
35 | n_tiles: (4,4,1) # the input image is broken up into (overlapping) tiles that are processed independently and re-assembled. This parameter denotes a tuple of the number of tiles for every image axis
36 | stardist_modelname: "2D_versatile_he" # Specify one of the available Stardist models: 2D_versatile_fluo (for IF images) or 2D_versatile_he (for H&E images)
37 | channel_to_segment: 2 # Only applicable for IF images. This is the image channel to segment (usually the DAPI channel)
38 | cell_markers:
39 | # Human Colon
40 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"]
41 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"]
42 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"]
43 | Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"]
44 | Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"]
45 | Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"]
46 | Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"]
47 | Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"]
48 | B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"]
49 | T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"]
50 | NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"]
51 |
52 | # # Human Pancreas
53 | # Acinar_cell: ["PRSS1", "KLK1","CTRC", "PNLIP"]
54 | # Alpha_cell: ["GCG", "ARX", "CLIM1", "CRYBA2", "FEV", "GBA", "HMGB3"]
55 | # Beta_cell: ["INS", "BMP-5", "CDKN1C", "CRTR1", "DLK1", "NPTX2", "PACAP"]
56 | # Delta_cell: ["SST", "CHE1", "ESE3B", "ETV1", "GABRG2", "HER4", "ISL1"]
57 | # Ductal_cell: ["PROM1"]
58 | # Epsilon cell: ["GHRL", "TM4SF5"]
59 | # Mesenchymal_cell: ["THY1"]
60 | # Pancreatic_polypeptide_cell: [
61 | # "AQP3", "ARHGAP3", "ARX", "BHLHB26", "BHLHB27",
62 | # "CARTPT", "EGR3", "ENTPD2", "ETV1", "MEIS1",
63 | # "MEIS2", "PAX6", "PTGFR", "RBTN3", "SERTM1",
64 | # "SLITRK6", "THSD7A", "ZNF506"
65 | # ]
66 | # PP_cell: ["PPY"]
67 |
68 | # # Human breast cancer
69 | # Cancer stem cell: ["CD133", "ALDH1", "SOX2", "OCT4", "CD44"]
70 | # Epithelial cell: ["EPCAM", "KRT8", "KRT18", "CDH1", "CLDN1", "MUC1"]
71 | # Immune cell: ["CD45", "CD3", "CD19", "CD14", "CD56"]
72 | # Natural killer cell: ["CD56", "CD16", "NKp46", "NKG2D", "CD94"]
73 | # Progenitor cell: ["Nestin", "CD34", "Sox2", "GATA2", "LGR5"]
74 | # Stem cell: ["OCT4", "SOX2", "NANOG", "KLF4", "CD34"]
75 |
76 |
77 | # # Mouse intestine
78 | # Enterocytes: ["Cbr1", "Plin2", "Gls", "Plin3", "Dab1", "Pmepa1", "Acsl5", "Hmox1", "Abcg2", "Cd36"]
79 | # Goblet cells: ["Manf", "Krt7", "Ccl9", "Muc13", "Phgr1", "Cdx2", "Aqp3", "Creb3L1", "Guca2A", "Klk1"]
80 | # Enteroendocrine cells: ["Fabp5", "Cpe", "Enpp2", "Chgb", "Alcam", "Chga", "Pax6", "Neurod1", "Cck", "Isl1"]
81 | # Paneth cells: ["Gpx2", "Fabp4", "Lyz1", "Kcnn4", "Lgals2", "Guca2B", "Lgr4", "Defa24", "Il4Ra", "Guca2A"]
82 | # Crypt cells: ["Prom1", "Hopx", "Msi1", "Olfm4", "Kcne3", "Bmi1", "Axin2", "Kcnq1", "Ascl2", "Lrig1"]
83 | # Smooth muscle cells: ["Bgn", "Myl9", "Pcp4L1", "Itga1", "Nrp2", "Mylk", "Ehd2", "Fabp4", "Acta2", "Ogn"]
84 | # B cells: ["Cd52", "Bcl11A", "Ebf1", "Cd74", "Ptprc", "Pold4", "Ighm", "Cd14", "Creld2", "Fli1"]
85 | # T cells: ["Cd81", "Junb", "Cd52", "Ptprcap", "H2-Q7", "Ccl6", "Bcl2", "Maff", "Ccl4", "Ccl3"]
86 | # NK cells: ["Ctla2A", "Ccl4", "Cd3G", "Ccl3", "Nkg7", "Lat", "Dusp2", "Itgam", "Fhl2", "Ccl5"]
87 |
88 | # # Mouse embryo
89 | # 1-cell stage cell (Blastomere): ['Accsl', 'Acvr1b', 'Asf1b', 'Bcl2l10', 'Blcap', 'Cdk2ap2', 'Ciapin1', 'Dclk2', 'Dusp7', 'H1foo']
90 | # Blood progenitor cell: ['Flk1', 'Runx1', 'Tal1', 'Runx1']
91 | # Cardiomyocyte: ['Bmp4', 'Emcn', 'Fbn1', 'Gata4', 'Hand1', 'Hand2', 'Mef2c', 'Myl4', 'Neb', 'Nid1']
92 | # Fibroblast: ['Col5a2', 'Thy1']
93 | # Oocyte: ['Abi3bp', 'Ampd3', 'Ankra2', 'Cep78', 'Cnn3', 'Dclre1a', 'Dcun1d5', 'Depdc7', 'Dnajc3', 'Dpy30']
94 | # Pharyngeal mesoderm cell: ['Prdm1', 'Tbx1']
95 | # Pre-haematopoietic stem cell: ['2410004N09Rik', '9030617O03Rik', '9030619P08Rik', 'Ablim1', 'Acot11', 'Akr1c14', 'Angpt1', 'Ank', 'Anpep', 'Art4']
96 | # Primitive erythroid cell: ['Gata1', 'Hbb-bh1', 'Klf1']
97 | # Primitive streak cell: ['Nanog', 'Pou5f1']
98 | # Venous cell: ['Apj', 'Coup-tf2', 'Dab2', 'EphB4', 'Nrp2', 'Tie-2']
99 |
100 | # # Human Tonsil
101 | # Epithelial: ["EPCAM"]
102 | # Endothelial: ["PECAM1", "CD34", "KDR", "CDH5", "PROM1", "PDPN", "TEK", "FLT1", "VCAM1", "PTPRC", "VWF", "ENG", "MCAM", "ICAM1", "FLT4"]
103 | # Fibroblast: ["COL1A1", "COL3A1", "COL5A2", "PDGFRA", "ACTA2", "TCF21", "FN"]
104 | # B_cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"]
105 | # T_cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"]
106 | # NK_cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"]
--------------------------------------------------------------------------------
/figs/pipelineflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/figs/pipelineflow.png
--------------------------------------------------------------------------------
/figs/tissuumaps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/figs/tissuumaps.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "enact-SO"
7 | version = "0.2.3"
8 | description = "ENACT is a self-contained pipeline designed to streamline Visium HD analysis from cell segmentation to annotation, enabling integration with advanced spatial analysis tools."
9 | license ={ file = "LICENSE.md" }
10 | readme = "README.md"
11 | requires-python = ">=3.9"
12 | keywords = ["spatial", "omics", "bioinformatics", "transcriptomics", "VisiumHD", ]
13 | authors = [
14 | { name = "Mena Kamel", email = "mena.kamel@sanofi.com" },
15 | { name = "Yiwen Song", email = "yiwen.song@sanofi.com" },
16 | ]
17 | classifiers = [
18 |
19 | "Programming Language :: Python",
20 | "Programming Language :: Python :: 3.10",
21 | "Programming Language :: Python :: 3.11",
22 | "Programming Language :: Python :: 3.12",
23 | ]
24 |
25 | # Core dependencies required for running the ENACT pipeline
26 | dependencies = [
27 | "anndata==0.10.8",
28 | "fastparquet==2024.5.0",
29 | "shapely==2.0.5",
30 | "stardist==0.9.1",
31 | "tifffile==2024.7.24",
32 | "scvi-tools==1.1.6.post2",
33 | "scanpy==1.10.2",
34 | "geopandas==1.0.1",
35 | "tensorflow==2.17.0",
36 | "plotly==5.24.0",
37 | "imagecodecs==2024.9.22",
38 | "pyyaml==6.0",
39 | "pandas",
40 | "numpy",
41 | "tqdm",
42 | "Pillow",
43 | "scipy",
44 | "celltypist-SO==1.6.5",
45 | "python-multipart==0.0.19"
46 | ]
47 |
48 | # Documentation and other URLs related to the project
49 | [project.urls]
50 | Documentation = "https://github.com/Sanofi-Public/enact-pipeline#readme"
51 | Source = "https://github.com/Sanofi-Public/enact-pipeline"
52 |
53 | # Scripts and linting tools
54 | [tool.hatch.scripts]
55 | check = "mypy --install-types --non-interactive {args:src/enact tests}"
56 |
57 | [tool.hatch.build.targets.wheel]
58 | packages = ["src/enact"]
59 |
60 | [tool.setuptools.packages.find]
61 | where = ["src"]
62 | include = ["enact*"]
63 |
64 | [tool.coverage.report]
65 | exclude_lines = [
66 | "no cov",
67 | "if TYPE_CHECKING:",
68 | ]
69 |
70 | [tool.hatch.publish.test]
71 | disable = true
72 |
73 | # Include important files like README and LICENSE
74 | [tool.setuptools]
75 | include-package-data = true
76 |
--------------------------------------------------------------------------------
/reproduce_paper_results.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Reproduces the ENACT paper results on the 10X Human Colorectal Cancer sample.
# Usage: ./reproduce_paper_results.sh <conda-env-path>
eval "$(conda shell.bash hook)"

set -e

PY_ENV_PATH=$1

# Quote the env path so paths with spaces don't break activation.
conda activate "$PY_ENV_PATH"

FILE_URL="https://zenodo.org/records/14748859/files/ENACT_supporting_files.zip"
OUTPUT_FILE="ENACT_supporting_files.zip"

# Download ENACT supporting files if they are not present
if [ -f "$OUTPUT_FILE" ]; then
    echo "$OUTPUT_FILE already exists. Skipping download."
else
    echo "$OUTPUT_FILE is downloading."
    wget -O "$OUTPUT_FILE" "$FILE_URL"
    unzip "$OUTPUT_FILE"
fi

# Run ENACT pipeline to test all combinations of bin-to-cell assignment and
# cell annotation methods. Order of experiments matters, don't change!
CONFIG_DIR="ENACT_supporting_files/public_data/human_colorectal/config_files"
for config in \
    naive-celltypist \
    naive-cellassign \
    weighted_by_area-celltypist \
    weighted_by_area-cellassign \
    weighted_by_transcript-celltypist \
    weighted_by_transcript-cellassign \
    weighted_by_cluster-celltypist \
    weighted_by_cluster-cellassign
do
    python -m src.enact.pipeline --configs_path "${CONFIG_DIR}/${config}.yaml"
done
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | anndata==0.10.8
2 | fastparquet==2024.5.0
3 | shapely==2.0.5
4 | stardist==0.9.1
5 | tifffile==2024.7.24
6 | scvi-tools==1.1.6.post2
7 | celltypist-SO==1.6.5
8 | scanpy==1.10.2
9 | geopandas==1.0.1
10 | tensorflow==2.17.0
11 | plotly==5.24.0
12 | imagecodecs==2024.9.22
13 |
14 | pytest==7.3.2
15 | pytest-cov==4.1.0
16 | python-multipart==0.0.19
17 |
--------------------------------------------------------------------------------
/run_cell_ann_eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs the cell annotation evaluation module inside the given conda environment.
# Usage: ./run_cell_ann_eval.sh <conda-env-path>
eval "$(conda shell.bash hook)"

set -e

PY_ENV_PATH=$1

# Run ENACT cell annotation evaluation (quote the path so spaces don't break activation)
conda activate "$PY_ENV_PATH"
python -m src.eval.cell_annotation_eval
--------------------------------------------------------------------------------
/run_enact.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs the ENACT pipeline with a given config file inside a conda environment.
# Usage: ./run_enact.sh <conda-env-path> <config-yaml-path>
eval "$(conda shell.bash hook)"

set -e

PY_ENV_PATH=$1
CONFIG_PATH=$2

# Run ENACT pipeline (both paths quoted so spaces don't break the commands)
conda activate "$PY_ENV_PATH"
python -m src.enact.pipeline --configs_path "$CONFIG_PATH"
--------------------------------------------------------------------------------
/setup_py_env.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Creates (if missing) a conda environment at the given prefix and installs
# the ENACT requirements into it.
# Usage: ./setup_py_env.sh <conda-env-path>
eval "$(conda shell.bash hook)"

set -e

PY_ENV_PATH=$1

# Create Python environment.
# -F: match the path as a literal string (paths contain '.' which grep would
# otherwise treat as a regex metacharacter).
if ! conda info --envs | grep -Fq "$PY_ENV_PATH"; then
    echo "Environment $PY_ENV_PATH does not exist. Creating..."
    # -y: non-interactive, so the script does not hang waiting for confirmation
    conda create -y --prefix "$PY_ENV_PATH" python=3.10
    conda activate "$PY_ENV_PATH"
    pip install -r requirements.txt
fi
15 |
--------------------------------------------------------------------------------
/src/enact/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/src/enact/__init__.py
--------------------------------------------------------------------------------
/src/enact/assignment_methods/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/src/enact/assignment_methods/__init__.py
--------------------------------------------------------------------------------
/src/enact/assignment_methods/naive.py:
--------------------------------------------------------------------------------
# Naive method. Only using the bins unique to each cell (overlapping bins omitted)


def naive_assignment(result_spatial_join):
    """Keep only bins unique to a single cell and give each a weight of 1.

    Args:
        result_spatial_join (pd.DataFrame): bin-to-cell spatial join with a
            boolean "unique_bin" column

    Returns:
        pd.DataFrame: the rows where "unique_bin" is True, with a new
        "weight" column set to 1; the caller's frame is left unmodified
    """
    # Work on an explicit copy: assigning a column on the boolean-mask slice
    # would otherwise write into a view (SettingWithCopyWarning) and is not
    # guaranteed to stick under pandas copy-on-write
    result_spatial_join = result_spatial_join[result_spatial_join["unique_bin"]].copy()
    result_spatial_join["weight"] = 1
    return result_spatial_join
--------------------------------------------------------------------------------
/src/enact/assignment_methods/weight_by_area.py:
--------------------------------------------------------------------------------
1 | # Weighted by area method
2 | import anndata
3 | import numpy as np
4 | from scipy import sparse
5 |
6 |
def apply_weights_to_adata_counts(adata):
    """Scale each row of the counts matrix by its per-observation weight.

    Args:
        adata (AnnData): counts AnnData with a "weight" entry per
            observation in ``adata.obs``

    Returns:
        AnnData: the same AnnData with ``adata.X`` replaced by the
        weight-scaled counts, stored as a sparse CSR matrix
    """
    # Column vector of per-row weights so the multiply broadcasts across genes
    row_weights = np.asarray(adata.obs["weight"]).reshape(-1, 1)

    # Element-wise scale of the (sparse) counts matrix
    scaled_counts = adata.X.multiply(row_weights)

    # .multiply() returns a COO result; store it back in CSR form
    adata.X = sparse.csr_matrix(scaled_counts)
    return adata
28 |
29 |
def weight_by_area_assignment(result_spatial_join, expanded_adata, cell_gdf_chunk):
    """Weight each bin's counts by the fraction of the bin covered by its cell.

    Args:
        result_spatial_join: bin-to-cell spatial join with "geometry",
            "index_right" and "unique_bin" columns
        expanded_adata (AnnData): bin-level counts to be weight-adjusted
        cell_gdf_chunk (GeoDataFrame): cell polygons, indexed by "index_right"

    Returns:
        tuple: (result_spatial_join with "area"/"weight" columns,
        weight-adjusted expanded_adata)
    """
    def _overlap_area(row):
        # Area of intersection between the bin and its candidate cell
        cell_geom = cell_gdf_chunk.loc[row["index_right"], "geometry"]
        return row["geometry"].intersection(cell_geom).area

    result_spatial_join["area"] = result_spatial_join.apply(_overlap_area, axis=1)

    # All bins share the same footprint, so the first row's area is the bin area
    bin_area = result_spatial_join.iloc[0]["geometry"].area
    result_spatial_join["weight"] = result_spatial_join["area"] / bin_area

    # Bins fully owned by a single cell contribute all of their counts
    result_spatial_join.loc[result_spatial_join["unique_bin"], "weight"] = 1

    expanded_adata.obs["weight"] = result_spatial_join["weight"].tolist()
    expanded_adata = apply_weights_to_adata_counts(expanded_adata)
    return result_spatial_join, expanded_adata
--------------------------------------------------------------------------------
/src/enact/assignment_methods/weight_by_gene.py:
--------------------------------------------------------------------------------
1 | # Weighted-by-gene and weighted-by-cluster assignment methods
2 | import anndata
3 | import numpy as np
4 | import pandas as pd
5 | from scipy import sparse
6 | from tqdm import tqdm
7 | from sklearn.cluster import KMeans
8 | from sklearn.decomposition import PCA
9 | from sklearn.preprocessing import StandardScaler
10 |
11 |
def apply_weights_to_adata_counts(expanded_adata, weights_df):
    """Applies per-gene weights to the counts matrix of the expanded AnnData.

    Args:
        expanded_adata (AnnData): bin-level counts; .obs carries an "index"
            (bin id) column and an "id" (cell id) column per row
        weights_df (pd.DataFrame): per-gene weights indexed by bin id, with
            an "id" column naming the cell each weight row belongs to

    Returns:
        AnnData: Weight-adjusted AnnData (X replaced by the weighted counts
        in sparse CSR form); returned unchanged when weights_df is empty
    """
    if weights_df.empty:
        return expanded_adata
    # Applying the weighting: only rows whose bin AND cell appear in weights_df
    mask = (expanded_adata.obs_names.isin(weights_df.index)) & (
        expanded_adata.obs["id"].isin(weights_df["id"])
    )
    indices = np.where(mask)[0]
    # Apply weights to the entries in the expression matrix
    # (rows without a matching weight entry keep an implicit weight of 1)
    # NOTE(review): this dense ones-matrix is n_obs x n_vars and can be very
    # large for big chunks - confirm memory headroom before scaling up
    weights_matrix = np.ones(expanded_adata.shape)

    for idx in tqdm(indices, total=len(indices)):
        bin_id = expanded_adata.obs.iloc[idx]["index"]
        cell_id = expanded_adata.obs.iloc[idx]["id"]
        # All weight rows for this bin, then the one matching this cell
        bin_rows = weights_df.loc[bin_id]
        weights = bin_rows[bin_rows["id"] == cell_id][expanded_adata.var_names]
        weights_matrix[idx] = weights.iloc[0].tolist()
    weighted_counts = expanded_adata.X.multiply(weights_matrix)
    # convert back to sparse
    expanded_adata.X = sparse.csr_matrix(weighted_counts)
    return expanded_adata
41 |
42 |
def weight_by_gene_assignment(
    result_spatial_join, expanded_adata, unique_cell_by_gene_adata
):
    """Splits counts of shared bins across cells in proportion to each cell's
    own gene signature (derived from its non-overlapping bins).

    Args:
        result_spatial_join: bin-to-cell spatial join with "index" (bin id),
            "id" (cell id) and boolean "unique_bin" columns
        expanded_adata (AnnData): bin-level counts to be weight-adjusted
        unique_cell_by_gene_adata (AnnData): counts from bins unique to a
            single cell, used to build each cell's gene signature

    Returns:
        tuple: (result_spatial_join unchanged, weight-adjusted expanded_adata)
    """
    # Getting the gene counts of the cells (unique signature for each cell)
    gene_counts_non_overlap = (
        pd.DataFrame(
            unique_cell_by_gene_adata.X.toarray(),
            index=unique_cell_by_gene_adata.obs_names,
            columns=unique_cell_by_gene_adata.var_names,
        )
        .groupby(unique_cell_by_gene_adata.obs["id"])
        .sum()
        .reset_index()
    )

    # Getting the bins that overlap with multiple cells
    overlapping_bins = result_spatial_join[~result_spatial_join["unique_bin"]]

    # Getting a table of bins with the parent cell and the parent cell's gene content
    overlap_merge = pd.merge(
        overlapping_bins[["index", "id"]], gene_counts_non_overlap, on="id", how="left"
    )
    overlap_merge.set_index("index", inplace=True)

    # Grouping the bins by the bin id
    grouped_overlap = overlap_merge.groupby("index")

    # Initialize progress bar for processing overlapping bins
    pbar = tqdm(grouped_overlap, desc="Processing overlapping bins", unit="bin")
    gene_columns = overlap_merge.columns.drop(["id"]).tolist()
    weights_list = []
    # Looping through the bins and splitting the counts
    for bin_index, group_rows in pbar:
        # getting total gene counts from the cells that share a bin
        gene_total = group_rows[gene_columns].sum(axis=0)
        # Dividing the cells gene counts by the total gene counts to get the weight
        # (genes with a zero total produce NaN, which is mapped to weight 0)
        gene_weights = group_rows[gene_columns].div(gene_total, axis=1).fillna(0)
        gene_weights["id"] = group_rows["id"]
        weights_list.append(gene_weights)
    # Getting a weights dataframe
    if weights_list:
        weights_df = pd.concat(weights_list, axis=0)
    else:
        # No overlapping bins: nothing to reweight
        weights_df = pd.DataFrame()
    pbar.close()
    expanded_adata = apply_weights_to_adata_counts(expanded_adata, weights_df)
    return result_spatial_join, expanded_adata
90 |
91 |
def weight_by_cluster_assignment(
    result_spatial_join, expanded_adata, unique_cell_by_gene_adata, n_clusters=4, n_pcs=250
):
    """Splits counts of shared bins across cells using cluster-level gene
    signatures instead of each cell's own (often sparse) signature.

    Cells are clustered (standardize -> PCA -> KMeans) on the gene counts
    from their non-overlapping bins; each cell's cluster mean expression is
    then used to weight how shared-bin counts are divided between cells.

    Args:
        result_spatial_join: bin-to-cell spatial join with "index" (bin id),
            "id" (cell id) and boolean "unique_bin" columns
        expanded_adata (AnnData): bin-level counts to be weight-adjusted
        unique_cell_by_gene_adata (AnnData): counts from bins unique to a
            single cell
        n_clusters (int): number of KMeans clusters (capped at n_pcs)
        n_pcs (int): number of principal components (capped at the data size)

    Returns:
        tuple: (result_spatial_join unchanged, weight-adjusted expanded_adata)
    """
    # Getting the gene counts of the cells (unique signature for each cell)
    gene_counts_non_overlap = (
        pd.DataFrame(
            unique_cell_by_gene_adata.X.toarray(),
            index=unique_cell_by_gene_adata.obs_names,
            columns=unique_cell_by_gene_adata.var_names,
        )
        .groupby(unique_cell_by_gene_adata.obs["id"])
        .sum()
        .reset_index()
    )

    # Getting the bins that overlap with multiple cells
    overlapping_bins = result_spatial_join[~result_spatial_join["unique_bin"]]

    gene_columns = gene_counts_non_overlap.columns.drop(["id"]).tolist()

    # Standardize the data
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(gene_counts_non_overlap[gene_columns])

    # Apply PCA for dimensionality reduction
    # (n_components may not exceed the number of samples or features)
    n_pcs = np.min([data_scaled.shape[0], data_scaled.shape[1], n_pcs])
    pca = PCA(n_components=n_pcs)
    data_pca = pca.fit_transform(data_scaled)

    # clustering on gene counts from non-overlapping bins
    # (cluster count likewise capped so KMeans has enough dimensions)
    n_clusters = np.min([n_clusters, n_pcs])
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(data_pca)
    gene_counts_non_overlap["cluster"] = clusters
    cluster_means = gene_counts_non_overlap.groupby("cluster")[gene_columns].mean()

    # Getting a table of bins with the parent cell and the parent cell's gene content
    # index = bin index, id: cell index
    # table has the bin, the cells that share them, and cell transcript counts
    overlap_merge = pd.merge(
        overlapping_bins[["index", "id"]], gene_counts_non_overlap, on="id", how="left"
    )
    # merge cluster mean gene counts with overlapping bins -
    # using cluster gene counts instead of the bins's gene counts
    overlap_merge = pd.merge(
        overlap_merge[["index", "id", "cluster"]],
        cluster_means,
        left_on="cluster",
        right_index=True,
        how="left",
    )
    overlap_merge.set_index("index", inplace=True)

    grouped_overlap = overlap_merge.groupby("index")

    # Initialize progress bar for processing overlapping bins
    pbar = tqdm(grouped_overlap, desc="Processing overlapping bins", unit="bin")
    weights_list = []
    # Looping through the bins and splitting the counts
    for bin_index, group_rows in pbar:
        # getting total gene counts from the cells that share a bin
        gene_total = group_rows[gene_columns].sum(axis=0)
        # Dividing the cells gene counts by the total gene counts to get the weight
        gene_weights = group_rows[gene_columns].div(gene_total, axis=1)
        # Genes whose total is zero across the sharing cells are split evenly
        num_cells = len(group_rows)
        gene_weights = gene_weights.fillna(1/num_cells)
        gene_weights = gene_weights.copy()
        gene_weights["id"] = group_rows["id"]
        weights_list.append(gene_weights)
    # Getting a weights dataframe
    if weights_list:
        weights_df = pd.concat(weights_list, axis=0)
    else:
        # No overlapping bins: nothing to reweight
        weights_df = pd.DataFrame()
    pbar.close()
    expanded_adata = apply_weights_to_adata_counts(expanded_adata, weights_df)
    return result_spatial_join, expanded_adata
--------------------------------------------------------------------------------
/src/enact/cellassign.py:
--------------------------------------------------------------------------------
1 | """Class for annotating ENACT pipeline outputs with the CellAssign algorithm
2 | """
3 |
4 | import os
5 | import pandas as pd
6 | import anndata
7 | import scanpy as sc
8 | import scvi
9 | import seaborn as sns
10 | from scvi.external import CellAssign
11 | import numpy as np
12 | import torch
13 |
14 | from .pipeline import ENACT
15 |
seed = 42  # fixed RNG seed passed to torch.manual_seed for reproducible CellAssign training
17 |
18 |
class CellAssignPipeline(ENACT):
    """Class for running CellAssign algorithm

    Builds a binary marker-gene matrix from the "cell_markers" section of
    the configs and uses scvi-tools' CellAssign to annotate the ENACT
    cell-by-gene outputs with cell types.
    """

    def __init__(self, **kwargs):
        # Configuration parsing and directory setup happen in the ENACT base class
        super().__init__(**kwargs)

    def format_markers_to_df(self):
        """Method to format marker genes to a pandas dataframe
        num gene x num cell_types

        Populates ``self.markers_df``: a (gene x cell type) matrix with 1
        where the gene is a marker for that cell type, 0 otherwise.
        """
        markers_dict = self.configs["cell_markers"]
        # Union of all marker genes across every cell type
        genes_set = set([item for sublist in markers_dict.values() for item in sublist])
        markers_df = pd.DataFrame(columns=markers_dict.keys(), index=sorted(genes_set))
        markers_df = markers_df.fillna(0)
        for cell_type, gene_markers in markers_dict.items():
            markers_df.loc[gene_markers, cell_type] = 1
        self.markers_df = markers_df

    def run_cell_assign(self):
        """Runs CellAssign

        Requires ``format_markers_to_df()`` to have been called first (reads
        ``self.markers_df``). Writes the per-cell predictions to
        ``<cellannotation_results_dir>/merged_results.csv``.
        """
        bin_assign_results = self.merge_files_sparse(self.bin_assign_dir)
        cell_lookup_df = self.merge_files(self.cell_ix_lookup_dir, save=False)

        spatial_cols = ["cell_x", "cell_y"]
        stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"]
        cell_lookup_df.loc[:, "id"] = cell_lookup_df["id"].astype(str)
        cell_lookup_df = cell_lookup_df.set_index("id")
        cell_lookup_df["num_transcripts"] = cell_lookup_df["num_transcripts"].fillna(0)

        bin_assign_result_sparse, gene_columns = bin_assign_results
        adata = anndata.AnnData(X=bin_assign_result_sparse, obs=cell_lookup_df.copy())
        adata.var_names = gene_columns

        adata.obsm["spatial"] = cell_lookup_df[spatial_cols].astype(int)
        adata.obsm["stats"] = cell_lookup_df[stat_columns].astype(int)

        # Per-cell library size; the normalized size factor is handed to
        # CellAssign through setup_anndata below
        lib_size = adata.X.sum(1)
        adata.obs["size_factor"] = lib_size / np.mean(lib_size)
        adata.obs["lib_size"] = lib_size

        # Restrict the marker matrix to marker genes actually present in the data
        marker_gene_mat = self.markers_df.copy()
        marker_gene_mat = marker_gene_mat.loc[
            sorted(list(set(self.markers_df.index) & set(gene_columns)))
        ]
        bdata = adata[:, marker_gene_mat.index].copy()

        # Seed torch so model training is reproducible across runs
        torch.manual_seed(seed)
        scvi.external.CellAssign.setup_anndata(bdata, size_factor_key="size_factor")
        model = CellAssign(bdata, marker_gene_mat, random_b_g_0=False)
        model.train()
        predictions = model.predict()

        # Hard assignment: take the most probable cell type per cell
        bdata.obs["cell_type"] = predictions.idxmax(axis=1).values
        # Copy spatial coordinates and bin statistics into .obs so they are
        # part of the CSV output
        bdata.obs[adata.obsm["spatial"].columns] = adata.obsm["spatial"]
        bdata.obs[adata.obsm["stats"].columns] = adata.obsm["stats"]
        bdata.obs["chunk_name"] = cell_lookup_df["chunk_name"]
        bdata.obs.to_csv(
            os.path.join(self.cellannotation_results_dir, "merged_results.csv")
        )
        print(
            f"saved to : {os.path.join(self.cellannotation_results_dir, 'merged_results.csv')}"
        )
81 |
82 |
if __name__ == "__main__":
    # Creating CellAssignPipeline object
    cell_assign = CellAssignPipeline(configs_path="config/configs.yaml")
    cell_assign.format_markers_to_df()
    # Run the annotation step itself: format_markers_to_df() only prepares
    # self.markers_df and produces no output on its own (mirrors how
    # celltypist.py's entry point runs its pipeline)
    cell_assign.run_cell_assign()
87 |
--------------------------------------------------------------------------------
/src/enact/celltypist.py:
--------------------------------------------------------------------------------
1 | """Class for annotating ENACT pipeline outputs with CellTypist
2 | """
3 |
4 | import os
5 | import pandas as pd
6 | import anndata
7 | import scanpy as sc
8 | import seaborn as sns
9 | import numpy as np
10 |
11 | ## Attempt to import celltypist, and prompt installation if not found
12 | import celltypist
13 | from celltypist import models
14 |
15 | from .pipeline import ENACT
16 |
17 |
class CellTypistPipeline(ENACT):
    """Class for running the CellTypist cell annotation algorithm"""

    def __init__(self, **kwargs):
        # Configuration parsing and directory setup happen in the ENACT base class
        super().__init__(**kwargs)

    def run_cell_typist(self):
        """Runs CellTypist

        Downloads the configured CellTypist model (``self.cell_typist_model``)
        if necessary, annotates the ENACT cell-by-gene outputs and writes the
        predictions to ``<cellannotation_results_dir>/merged_results.csv``.
        """
        bin_assign_results = self.merge_files_sparse(self.bin_assign_dir)
        cell_lookup_df = self.merge_files(self.cell_ix_lookup_dir, save=False)

        spatial_cols = ["cell_x", "cell_y"]
        stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"]
        cell_lookup_df.loc[:, "id"] = cell_lookup_df["id"].astype(str)
        cell_lookup_df = cell_lookup_df.set_index("id")
        cell_lookup_df["num_transcripts"] = cell_lookup_df["num_transcripts"].fillna(0)

        bin_assign_result_sparse, gene_columns = bin_assign_results
        adata = anndata.AnnData(X=bin_assign_result_sparse, obs=cell_lookup_df.copy())
        adata.var_names = gene_columns

        adata.obsm["spatial"] = cell_lookup_df[spatial_cols].astype(int)
        adata.obsm["stats"] = cell_lookup_df[stat_columns].astype(int)

        lib_size = adata.X.sum(1)
        adata.obs["size_factor"] = lib_size / np.mean(lib_size)
        adata.obs["lib_size"] = lib_size

        # normalize adata to the log1p normalised format (to 10,000 counts per cell)
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # download celltypist model and predict cell type
        # (the configured model name may omit the ".pkl" extension)
        if ".pkl" not in self.cell_typist_model:
            self.cell_typist_model = self.cell_typist_model + ".pkl"
        models.download_models(model=self.cell_typist_model)
        predictions = celltypist.annotate(adata, model=self.cell_typist_model)
        predictions = predictions.to_adata(
            insert_labels=True, insert_conf=True, insert_prob=True
        )

        adata.obs.rename(columns={"predicted_labels": "cell_type"}, inplace=True)
        adata.obs[adata.obsm["spatial"].columns] = adata.obsm["spatial"]
        adata.obs[adata.obsm["stats"].columns] = adata.obsm["stats"]
        adata.obs["chunk_name"] = cell_lookup_df["chunk_name"]
        # Drop the per-cell-type columns before saving - presumably the
        # probability columns inserted by insert_prob=True above; verify the
        # column names match the cell type labels for the chosen model
        results_df = adata.obs.drop(columns=adata.obs["cell_type"].unique().tolist())
        results_df.to_csv(
            os.path.join(self.cellannotation_results_dir, "merged_results.csv")
        )
67 |
68 |
if __name__ == "__main__":
    # Creating CellTypistPipeline object
    cell_typist = CellTypistPipeline(configs_path="config/configs.yaml")
    cell_typist.run_cell_typist()
73 |
--------------------------------------------------------------------------------
/src/enact/package_results.py:
--------------------------------------------------------------------------------
1 | """Class for defining methods to package pipeline outputs into AnnData objects
2 | """
3 |
4 | import os
5 | import yaml
6 | import json
7 | import shutil
8 | import anndata
9 | import pandas as pd
10 | from PIL import Image
11 | import numpy as np
12 | from scipy.sparse import csr_matrix
13 |
14 | # import squidpy as sq
15 |
16 | from .pipeline import ENACT
17 |
18 |
class PackageResults(ENACT):
    """Class for packaging ENACT pipeline outputs"""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Non-chunk files that can appear in the results directories and must
        # be skipped when iterating over the per-chunk CSV files
        self.files_to_ignore = [
            "merged_results.csv",
            "merged_results_old.csv",
            "cells_adata.h5",
            ".ipynb_checkpoints",
        ]

    def merge_cellassign_output_files(self):
        """Merges the CellAssign results with gene counts

        Returns:
            pd.DataFrame: cell-by-gene counts with a leading "id" column,
            concatenated across all processed chunks
        """
        # Use the explicitly-requested chunks if configured, else everything on disk
        if self.configs["params"]["chunks_to_run"]:
            chunk_list = self.configs["params"]["chunks_to_run"]
        else:
            chunk_list = os.listdir(self.bin_assign_dir)
        cell_by_gene_list = []
        for chunk_name in chunk_list:
            if chunk_name in self.files_to_ignore:
                continue
            index_lookup = pd.read_csv(
                os.path.join(self.cell_ix_lookup_dir, chunk_name)
            )
            # Drop the unnamed CSV index column
            trancript_counts = pd.read_csv(
                os.path.join(self.bin_assign_dir, chunk_name)
            ).drop(columns=["Unnamed: 0"])
            cell_by_gene_chunk = pd.concat(
                [index_lookup["id"], trancript_counts], axis=1
            )
            cell_by_gene_list.append(cell_by_gene_chunk)
        cell_by_gene_df = pd.concat(cell_by_gene_list, axis=0)
        return cell_by_gene_df

    def merge_sargent_output_files(self):
        """Merges the Sargent chunk results into a single results file

        Returns:
            tuple: (sargent_results_df, cell_by_gene_df) - per-cell results
            with a "cell_type" column, and the matching cell-by-gene counts
        """
        os.makedirs(self.sargent_results_dir, exist_ok=True)
        # Merge the sargent_results_chunks data and gene_to_cell_assignment_chunks_ix_lookup
        # NOTE(review): if the directory was only just created (i.e. empty),
        # the pd.concat calls below raise on the empty lists - this assumes a
        # prior Sargent step has populated it; confirm the expected ordering.
        chunks = os.listdir(self.sargent_results_dir)
        sargent_results_list = []
        cell_by_gene_list = []
        for chunk_name in chunks:
            if chunk_name in self.files_to_ignore:
                continue
            cell_labels = pd.read_csv(
                os.path.join(self.sargent_results_dir, chunk_name)
            )
            index_lookup = pd.read_csv(
                os.path.join(self.cell_ix_lookup_dir, chunk_name)
            )
            trancript_counts = pd.read_csv(
                os.path.join(self.bin_assign_dir, chunk_name)
            ).drop(columns=["Unnamed: 0"])

            # Sargent stores the predicted label in column "x"; renamed below
            sargent_result_chunk = pd.concat([index_lookup, cell_labels["x"]], axis=1)
            cell_by_gene_chunk = pd.concat(
                [index_lookup["id"], trancript_counts], axis=1
            )
            sargent_result_chunk.drop("Unnamed: 0", axis=1, inplace=True)
            sargent_results_list.append(sargent_result_chunk)
            cell_by_gene_list.append(cell_by_gene_chunk)
        sargent_results_df = pd.concat(sargent_results_list, axis=0)
        sargent_results_df = sargent_results_df.rename(columns={"x": "cell_type"})
        cell_by_gene_df = pd.concat(cell_by_gene_list, axis=0)
        sargent_results_df.to_csv(
            os.path.join(self.sargent_results_dir, "merged_results.csv"), index=False
        )
        return sargent_results_df, cell_by_gene_df

    def df_to_adata(self, results_df, cell_by_gene_df):
        """Converts pd.DataFrame object with pipeline results to AnnData

        Args:
            results_df (pd.DataFrame): per-cell results with "id",
                "cell_type", spatial ("cell_x"/"cell_y"), bin-statistics and
                "chunk_name" columns
            cell_by_gene_df (pd.DataFrame): cell-by-gene counts with an "id" column

        Returns:
            anndata.AnnData: Anndata with pipeline outputs
        """
        # NOTE(review): file_columns is unused
        file_columns = results_df.columns
        spatial_cols = ["cell_x", "cell_y"]
        stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"]
        results_df.loc[:, "id"] = results_df["id"].astype(str)
        results_df = results_df.set_index("id")
        results_df["num_transcripts"] = results_df["num_transcripts"].fillna(0)
        # Lower-case labels so downstream grouping is case-insensitive
        results_df["cell_type"] = results_df["cell_type"].str.lower()
        adata = anndata.AnnData(cell_by_gene_df.set_index("id"))
        adata.obs = adata.obs.merge(results_df, on="id").drop_duplicates(keep='first')

        adata.obsm["spatial"] = adata.obs[spatial_cols].astype(int)
        adata.obsm["stats"] = adata.obs[stat_columns].astype(int)

        # This column is the output of cell type inference pipeline
        adata.obs["cell_type"] = adata.obs[["cell_type"]].astype("category")
        adata.obs["patch_id"] = adata.obs[["chunk_name"]]
        adata.obs = adata.obs[["cell_type", "patch_id"]]

        # Converting the Anndata cell transcript counts to sparse format for more efficient storage
        adata.X = csr_matrix(adata.X).astype(np.float32)
        return adata

    def create_tmap_file(self):
        """Creates a tmap file for the sample being run on ENACT

        Fills the TissUUmaps project template with this run's names/paths,
        then copies the AnnData, the cell layer image (if present) and a
        cropped WSI into a "tmap" directory under the cache dir.
        """
        # The following three files need to be in the same directory:
        # cells_adata.h5, wsi file, experiment_tmap.tmap
        tmap_template_path = "./templates/tmap_template.tmap"
        with open(tmap_template_path, "r") as stream:
            tmap_template = yaml.safe_load(stream)
        tmap_template["filename"] = self.configs["analysis_name"]
        bin_to_cell_method = self.configs["params"]["bin_to_cell_method"]
        cell_annotation_method = self.configs["params"]["cell_annotation_method"]
        # NOTE(review): wsi_src_path is unused; the cropped image comes from
        # self.load_image() below
        wsi_src_path = self.configs["paths"]["wsi_path"]
        wsi_fname = "wsi.tif"
        # Run identifier, e.g. "weighted_by_area|cellassign"
        run_name = f"{bin_to_cell_method}|{cell_annotation_method}"
        tmap_template["markerFiles"][0]["title"] = f"ENACT Results: {run_name.replace('|', ' | ')}"
        # Paths inside the saved AnnData (HDF5) that TissUUmaps should read
        tmap_template["markerFiles"][0]["expectedHeader"].update(
            {
                "X": "/obsm/spatial/cell_x",
                "Y": "/obsm/spatial/cell_y",
                "gb_col": "/obs/cell_type/",
            }
        )
        tmap_template["layers"][0].update(
            {"name": wsi_fname, "tileSource": f"{wsi_fname}.dzi"}
        )
        tmap_template["markerFiles"][0]["path"] = f"{run_name}_cells_adata.h5"

        # save tmap file at a separate directory "tmap"
        tmap_output_dir = os.path.join(self.cache_dir, "tmap")
        os.makedirs(tmap_output_dir, exist_ok=True)
        tmap_file_path = os.path.join(tmap_output_dir, f"{run_name}_tmap.tmap")
        with open(tmap_file_path, "w") as outfile:
            outfile.write(json.dumps(tmap_template, indent=4))

        # Copy the anndata file to the "tmap" directory
        adata_src_path = os.path.join(
            self.cellannotation_results_dir, "cells_adata.h5"
        )
        adata_dst_path = os.path.join(tmap_output_dir, f"{run_name}_cells_adata.h5")
        shutil.copy(adata_src_path, adata_dst_path)

        # Copy the cells_layer.png file to the "tmap" directory
        layer_src_path = os.path.join(
            self.cache_dir, "cells_layer.png"
        )
        layer_dst_path = os.path.join(tmap_output_dir, "cells_layer.png")
        if os.path.exists(layer_src_path):
            shutil.copy(layer_src_path, layer_dst_path)

        # Saving a cropped version (lite version) of the image file to the "tmap" directory
        wsi_dst_path = os.path.join(tmap_output_dir, wsi_fname)
        cropped_image, _ = self.load_image()
        cropped_image = Image.fromarray(cropped_image)
        cropped_image.save(wsi_dst_path)

        message = f"""
        Sample ready to visualize on TissUUmaps. To install TissUUmaps, follow the instructions at:\n
        https://tissuumaps.github.io/TissUUmaps-docs/docs/intro/installation.html#.

        To view the the sample, follow the instructions at:\n
        https://tissuumaps.github.io/TissUUmaps-docs/docs/starting/projects.html#loading-projects

        TissUUmaps project file is located here:\n
        {tmap_file_path}
        """
        print (message)

    # def run_neighborhood_enrichment(self, adata):
    #     """Sample function to run Squidpy operations on AnnData object

    #     Args:
    #         adata (_type_): _description_

    #     Returns:
    #         _type_: _description_
    #     """
    #     sq.gr.spatial_neighbors(adata)
    #     sq.gr.nhood_enrichment(adata, cluster_key="cell_type")
    #     return adata

    def save_adata(self, adata):
        """Save the anndata object to disk

        Args:
            adata (anndata.AnnData): packaged results, written gzip-compressed
                to <cellannotation_results_dir>/cells_adata.h5
        """
        adata.write(
            os.path.join(self.cellannotation_results_dir, "cells_adata.h5"),
            compression="gzip",
        )
218 |
219 |
if __name__ == "__main__":
    # Creating PackageResults object
    so_hd = PackageResults(configs_path="config/configs.yaml")
    results_df, cell_by_gene_df = so_hd.merge_sargent_output_files()
    adata = so_hd.df_to_adata(results_df, cell_by_gene_df)
    # adata = so_hd.run_neighborhood_enrichment(adata) # Example integration with Squidpy
    so_hd.save_adata(adata)
--------------------------------------------------------------------------------
/src/enact/utils/logging.py:
--------------------------------------------------------------------------------
1 | """
2 | Created By : ...
3 | Created Date: DD/MM/YYYY
4 | Description : ...
5 | """
6 | import os
7 | import logging
8 |
9 |
def get_logger(app_name, cache_dir):
    """Create and configure an application logger.

    INFO+ records go to ``<cache_dir>/<app_name>.log``; DEBUG+ records go to
    the console. Calling again with the same ``app_name`` returns the
    already-configured logger without adding duplicate handlers.

    :param app_name: logger name, also used for the log file name
    :param cache_dir: directory in which the log file is created
    :return: logger
    :rtype: Logger
    """
    log_file = os.path.join(cache_dir, f"{app_name}.log")

    logger = logging.getLogger(app_name)
    # Check only THIS logger's handlers: hasHandlers() also walks ancestor
    # loggers, so it would skip configuration entirely (no file handler at
    # all) whenever e.g. the root logger is already configured.
    if logger.handlers:
        return logger

    logger.setLevel(logging.DEBUG)

    # Create file handler (INFO and above go to the log file)
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    # Create stream handler (DEBUG and above go to the console)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)

    # Add handlers to the logger
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

    return logger
--------------------------------------------------------------------------------
/src/eval/cell_annotation_eval.py:
--------------------------------------------------------------------------------
1 | # Script runs the evaluation to compare ENACT cell annotations versus pathologist cell annotations
2 |
3 | from shapely.geometry import shape
4 | import plotly.express as px
5 | import geopandas as gpd
6 | import json
7 | from shapely.geometry import Polygon, Point
8 | from shapely import wkt
9 | import pandas as pd
10 | from sklearn.metrics import precision_recall_fscore_support, accuracy_score
11 | from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
12 | import os
13 |
14 | # from src.pipelines.enact_pipeline import ENACT
15 |
16 | # so_hd = ENACT(configs_path="config/configs.yaml")
17 |
# NOTE(review): machine-specific absolute paths - edit these (or move them
# into the YAML config) before running the evaluation on another machine.
geojson_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Human_Colon_Cancer-wsi-40598_0_65263_22706.geojson"
segmentation_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/cells_df.csv"
# Active predictions file; the commented alternatives below cover the other
# bin-to-cell method / cell annotation method combinations.
predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/cellassign_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/sargent_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_area/cellassign_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_area/sargent_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_gene/sargent_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/naive/sargent_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_gene/cellassign_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/naive/cellassign_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/naive/celltypist_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_area/celltypist_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_gene/celltypist_results/merged_results.csv"
# predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/celltypist_results/merged_results.csv"


# Evaluation artifacts are written next to the predictions file
results_eval_dir = os.path.join("/".join(predictions_df_path.split("/")[:-1]), "eval")
os.makedirs(results_eval_dir, exist_ok=True)
36 |
37 |
# Maps fine-grained predicted / pathologist labels onto the coarse evaluation
# classes ("immune cells", "epithelial cells", "stromal cells") plus
# "no label" for anything ambiguous. Duplicate keys that the original dict
# carried ('Goblet cells', 'NK cells') are listed exactly once here - in a
# dict literal the later entry silently overrides the earlier one.
name_map = {
    'unclassified': "no label",
    'Immune': "immune cells",
    'Crypt cells': "epithelial cells",
    'Enterocytes': "epithelial cells",
    'Epithelial': "epithelial cells",
    'Smooth muscle cell': "stromal cells",
    'Fibroblast': "stromal cells",
    'Endothelial': "stromal cells",
    'Paneth cells': "epithelial cells",
    'Enteroendocrine cells': "epithelial cells",
    'Goblet cells': "epithelial cells",  # Goblet cells are epithelial cells
    'Neuronal': "stromal cells",
    'ephitelial cells': "epithelial cells",  # NOTE(review): typo'd key kept - presumably matches a typo'd label in the data
    'no label': "no label",
    "Ignore*": "no label",
    "B cells": "immune cells",
    "T cells": "immune cells",
    "NK cells": "immune cells",  # NK cells are immune cells
    "Macrophages": "immune cells",
    "Neutrophils": "immune cells",
    "Eosinophils": "immune cells",
    'CD19+CD20+ B': "immune cells",  # B cells are immune cells
    'CD4+ T cells': "immune cells",  # CD4+ T cells are immune cells
    'CD8+ T cells': "immune cells",  # CD8+ T cells are immune cells
    'CMS1': "epithelial cells",  # CMS (Consensus Molecular Subtypes) refer to tumor/epithelial cells
    'CMS2': "epithelial cells",  # Same as above
    'CMS3': "epithelial cells",  # Same as above
    'CMS4': "epithelial cells",  # Same as above
    'Enteric glial cells': "stromal cells",  # Glial cells are part of the stromal tissue
    'IgA+ Plasma': "immune cells",  # Plasma cells are immune cells (B-cell derivatives)
    'IgG+ Plasma': "immune cells",  # Same as above
    'Intermediate': "no label",  # Ambiguous, no clear label
    'Lymphatic ECs': "stromal cells",  # Endothelial cells are considered stromal
    'Mast cells': "immune cells",  # Mast cells are immune cells
    'Mature Enterocytes type 1': "epithelial cells",  # Enterocytes are epithelial cells
    'Mature Enterocytes type 2': "epithelial cells",  # Same as above
    'Myofibroblasts': "stromal cells",  # Fibroblasts are stromal cells
    'Pericytes': "stromal cells",  # Pericytes are part of the vasculature (stromal)
    'Pro-inflammatory': "immune cells",  # Inflammation implies immune function
    'Proliferating': "no label",  # Too vague to classify, no label
    'Proliferative ECs': "stromal cells",  # Endothelial cells are stromal
    'Regulatory T cells': "immune cells",  # T cells are immune cells
    'SPP1+': "no label",  # Ambiguous, no clear label
    'Smooth muscle cells': "stromal cells",  # Smooth muscle cells are stromal cells
    'Stalk-like ECs': "stromal cells",  # Endothelial cells are stromal
    'Stem-like/TA': "epithelial cells",  # Stem cells in this context are usually epithelial
    'Stromal 1': "stromal cells",  # Explicitly stromal
    'Stromal 2': "stromal cells",  # Same as above
    'Stromal 3': "stromal cells",  # Same as above
    'T follicular helper cells': "immune cells",  # T cells are immune cells
    'T helper 17 cells': "immune cells",  # Same as above
    'Tip-like ECs': "stromal cells",  # Endothelial cells are stromal
    'Unknown': "no label",  # No clear label
    'cDC': "immune cells",  # Conventional dendritic cells are immune cells
    'gamma delta T cells': "immune cells"  # T cells are immune cells
}
97 |
98 |
# Load the cell segmentation polygons and the predicted cell types, then
# join them on the cell "id" so each prediction carries its cell geometry.
segmentation_df = pd.read_csv(segmentation_df_path)
predictions_df = pd.read_csv(predictions_df_path)
predictions_df = predictions_df.merge(segmentation_df[["id", "geometry"]], how="left", on="id")
# The geometry column is stored as WKT text in the CSV - parse into shapely objects
predictions_df["geometry"] = predictions_df["geometry"].apply(wkt.loads)
pred_gpd = gpd.GeoDataFrame(predictions_df,geometry="geometry")
104 |
105 | def load_path_annotations():
106 | annotation_names = []
107 | annotation_geometries = []
108 | with open(geojson_path) as f:
109 | regions = json.load(f)
110 | for region in regions["features"]:
111 | ann_type = region["properties"]["objectType"]
112 | if ann_type == "annotation":
113 | annotation_name = region["properties"]["classification"]["name"]
114 | if annotation_name in ["Region*"]:
115 | continue
116 | annotation_geometries.append(shape(region["geometry"]))
117 | annotation_names.append(annotation_name)
118 | annotations_gpd = gpd.GeoDataFrame({"geometry": annotation_geometries, "gt_label": annotation_names})
119 | annotations_gpd["ann_ix"] = [f"ID_{i}" for i in range(len(annotations_gpd))]
120 | return annotations_gpd
121 |
122 | def get_gt_annotations(annotations_gpd):
123 | try:
124 | cells_within_ann_gpd = gpd.sjoin(annotations_gpd, pred_gpd[["cell_type", "cell_x", "cell_y", "geometry", "id"]], how='left', predicate='intersects')
125 | except:
126 | cells_within_ann_gpd = gpd.sjoin(annotations_gpd, pred_gpd[["cell_assign_results", "cell_x", "cell_y", "geometry", "id"]], how='left', predicate='intersects')
127 | cells_within_ann_gpd = cells_within_ann_gpd.drop_duplicates("ann_ix")
128 | try:
129 | cells_within_ann_gpd["cell_type"] = cells_within_ann_gpd["cell_type"].fillna("unclassified")
130 | except:
131 | cells_within_ann_gpd["cell_assign_results"] = cells_within_ann_gpd["cell_assign_results"].fillna("unclassified")
132 | return cells_within_ann_gpd
133 |
134 | def validate_labels(cells_within_ann_gpd):
135 | try:
136 | cell_types_in_pred = set(cells_within_ann_gpd.cell_type.unique())
137 | except:
138 | cell_types_in_pred = set(cells_within_ann_gpd.cell_assign_results.unique())
139 | print(f"Cells in pred dataset: {cell_types_in_pred}")
140 | print (f"All cells are in the mapping!: {cell_types_in_pred.issubset(set(name_map.keys()))}")
141 |
142 | def relabel_cells(cells_within_ann_gpd):
143 | # Renaming cell types
144 | for granular_name, generic_name in name_map.items():
145 | cells_within_ann_gpd.loc[cells_within_ann_gpd.gt_label == granular_name, "gt_label"] = generic_name
146 | try:
147 | cells_within_ann_gpd.loc[cells_within_ann_gpd.cell_type == granular_name, "pred_label_clean"] = generic_name
148 | except:
149 | cells_within_ann_gpd.loc[cells_within_ann_gpd.cell_assign_results == granular_name, "pred_label_clean"] = generic_name
150 | return cells_within_ann_gpd
151 |
152 | def eval_annotations(results_table):
153 | cell_types = sorted(set(results_table.gt_label.unique().tolist() + results_table.pred_label_clean.unique().tolist()))
154 | cm = confusion_matrix(
155 | results_table.gt_label,
156 | results_table.pred_label_clean,
157 | labels=cell_types
158 | )
159 | cm_plot = ConfusionMatrixDisplay(
160 | confusion_matrix=cm,
161 | display_labels=cell_types
162 | )
163 | cm_plot.plot()
164 |
165 | averaging_methods = ["micro", "macro", "weighted"]
166 | eval_dict = {}
167 | for method in averaging_methods:
168 | eval_metrics = precision_recall_fscore_support(results_table.gt_label, results_table.pred_label_clean, average=method)
169 | precision, recall, fbeta_score, support = eval_metrics
170 | eval_dict[method] = eval_metrics
171 | num_correct_samples = accuracy_score(results_table.gt_label, results_table.pred_label_clean, normalize=False)
172 | accuracy = accuracy_score(results_table.gt_label, results_table.pred_label_clean, normalize=True)
173 | print(f"Experiment name: {predictions_df_path}")
174 | print (f"Number of GT annotations: {len(results_table)}\nNumber of correct predictions: {num_correct_samples}\nAccuracy: {accuracy}")
175 | print("__________")
176 | try:
177 | print(pd.DataFrame(results_table.cell_type.value_counts()))
178 | except:
179 | print(pd.DataFrame(results_table.cell_assign_results.value_counts()))
180 | print("__________")
181 | print(pd.DataFrame(results_table.pred_label_clean.value_counts()))
182 | print("__________")
183 | metrics_df = pd.DataFrame(eval_dict, index=["Precision", "Recall", "F-Score", "Support"])
184 | results_table.to_csv(os.path.join(results_eval_dir, "cell_annotation_eval.csv"), index=False)
185 | metrics_df.to_csv(os.path.join(results_eval_dir, "cell_annotation_eval_metrics.csv"), index=True)
186 | cm_plot.figure_.savefig(os.path.join(results_eval_dir, "confusion_matrix.png"),dpi=300)
187 | print (metrics_df)
188 | return results_table, metrics_df
189 |
190 | if __name__ == "__main__":
191 | annotations_gpd = load_path_annotations()
192 | cells_within_ann_gpd = get_gt_annotations(annotations_gpd)
193 | validate_labels(cells_within_ann_gpd)
194 | cells_within_ann_gpd = relabel_cells(cells_within_ann_gpd)
195 | results_table = cells_within_ann_gpd[(cells_within_ann_gpd["gt_label"] != "no label")]
196 | results_table, metrics_df = eval_annotations(results_table)
--------------------------------------------------------------------------------
/src/eval/paper_eval-cellassign-methods-highlevel.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "8cd6edbb-e2e7-4474-9b6c-05203f97e7dc",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# !pip install shapely\n",
11 | "# !pip install plotly\n",
12 | "!pip install geopandas"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "id": "74ee8420-da7a-4d00-91d9-e4273e83d21f",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "from shapely.geometry import shape\n",
23 | "import plotly.express as px\n",
24 | "import geopandas as gpd\n",
25 | "import json\n",
26 | "from shapely.geometry import Polygon, Point\n",
27 | "from shapely import wkt\n",
28 | "import pandas as pd\n",
29 | "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
30 | "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
31 | "import os"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 15,
37 | "id": "6f641cb0-1d6a-4cfb-bc35-2fa58302b28f",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# geojson_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Human_Colon_Cancer-wsi-40598_0_65263_22706-landmarks.geojson\"\n",
42 | "geojson_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Mouse_Small_Intestine-wsi-156_4_23459_24009_all_for_one.geojson\"\n",
43 | "\n",
44 | "segmentation_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/stardist_cells_df.csv\"\n",
45 | "results_eval_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/mouse_anatomical_landmark_eval\"\n",
46 | "os.makedirs(results_eval_dir, exist_ok=True)\n",
47 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+naive.csv\"\n",
48 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+weighted.csv\"\n",
49 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/cellassign+weighted.csv\"\n",
50 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/cellassign+naive.csv\"\n",
51 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+weighted-full.csv\"\n",
52 | "predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/sargent+weighted+mouse.csv\"\n",
53 | "\n",
54 | "\n",
55 | "method = predictions_df_path.split(\"/\")[-1].split(\".\")[0]\n",
56 | "\n",
57 | "segmentation_df = pd.read_csv(segmentation_df_path)\n",
58 | "predictions_df = pd.read_csv(predictions_df_path)\n",
59 | "predictions_df = predictions_df.merge(segmentation_df[[\"id\", \"geometry\"]], how=\"left\", on=\"id\")\n",
60 | "predictions_df = predictions_df[~predictions_df.geometry.isna()]\n",
61 | "try:\n",
62 | " predictions_df[\"geometry\"] = predictions_df[\"geometry\"].apply(wkt.loads)\n",
63 | "except:\n",
64 | " pass\n",
65 | "pred_gpd = gpd.GeoDataFrame(predictions_df,geometry=\"geometry\")"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 16,
71 | "id": "9d745a36-0221-450f-9c35-b5c099a8d189",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "annotation_names = []\n",
76 | "annotation_geometries = []\n",
77 | "with open(geojson_path) as f:\n",
78 | " regions = json.load(f)\n",
79 | "for region in regions[\"features\"]:\n",
80 | " ann_type = region[\"properties\"][\"objectType\"]\n",
81 | " if ann_type == \"annotation\":\n",
82 | " annotation_name = region[\"properties\"][\"classification\"][\"name\"]\n",
83 | " if annotation_name in [\"Region*\"]:\n",
84 | " continue\n",
85 | " annotation_geometries.append(shape(region[\"geometry\"]))\n",
86 | " annotation_names.append(annotation_name)\n",
87 | "annotations_gpd = gpd.GeoDataFrame({\"geometry\": annotation_geometries, \"label\": annotation_names})\n",
88 | "annotations_gpd[\"ann_ix\"] = [f\"ID_{i}\" for i in range(len(annotations_gpd))]\n",
89 | "cells_within_ann_gpd = gpd.sjoin(pred_gpd[[\"cell_type\", \"cell_x\", \"cell_y\", \"geometry\", \"id\"]], annotations_gpd, how='left', predicate='within')\n",
90 | "cells_within_ann_gpd = cells_within_ann_gpd.drop_duplicates(subset=[\"id\"])"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "id": "099d30dc-a404-4662-8fec-7b0275079e42",
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "for annotation_name in annotation_names:\n",
101 | " df = cells_within_ann_gpd[cells_within_ann_gpd.label == annotation_name]\n",
102 | " # df = df[~(df.cell_type == \"unclassified\")]\n",
103 | " df = df.groupby([\"cell_type\"]).agg(\"count\").reset_index()\n",
104 | " df = df.sort_values(\"id\", ascending=False)\n",
105 | " fig = px.bar(df, x='cell_type', y='id', title=f\"Region: {annotation_name}\")\n",
106 | " fig.update_layout(\n",
107 | " xaxis_title=\"cell type\", yaxis_title=\"# cells\"\n",
108 | " )\n",
109 | " fig.show()\n",
110 | " fig.write_html(os.path.join(results_eval_dir, f\"{method}_{annotation_name}_cell_counts.html\"))"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "f7a3204e-89bd-4e6b-b14a-ea98a8fe9f5d",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "results_eval_dir"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "id": "159a4cc2-32a4-49d9-9a87-f186a0d255de",
127 | "metadata": {},
128 | "outputs": [],
129 | "source": []
130 | }
131 | ],
132 | "metadata": {
133 | "kernelspec": {
134 | "display_name": "Python 3 (ipykernel)",
135 | "language": "python",
136 | "name": "python3"
137 | },
138 | "language_info": {
139 | "codemirror_mode": {
140 | "name": "ipython",
141 | "version": 3
142 | },
143 | "file_extension": ".py",
144 | "mimetype": "text/x-python",
145 | "name": "python",
146 | "nbconvert_exporter": "python",
147 | "pygments_lexer": "ipython3",
148 | "version": "3.10.14"
149 | }
150 | },
151 | "nbformat": 4,
152 | "nbformat_minor": 5
153 | }
154 |
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Created By : ...
3 | Created Date: DD/MM/YYYY
4 | Description : ...
5 | """
6 |
7 | import argparse
8 | from utils.logging import get_logger
9 |
10 |
11 | APP_NAME = 'MyProject'
12 | LOGGER = get_logger(APP_NAME)
13 |
14 |
15 | def dummy(dum):
16 | """Example function
17 |
18 | :param dum: Text to log.
19 |     :type dum: str
20 | :return: The entry text.
21 | :rtype: str
22 | """
23 | LOGGER.info(f'{dum} in progress')
24 | return dum
25 |
26 |
--------------------------------------------------------------------------------
/src/synthetic_data/generate_synthetic_data_Xenium.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "018887fd-e9e8-495b-872f-fefbd9cd6cb5",
6 | "metadata": {},
7 | "source": [
8 | "To generate synthetic VisiumHD data from Xenium, please read and run all the cells below. Thanks!"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "bd0c610b-e1b5-43e6-a35d-3548588cb652",
14 | "metadata": {},
15 | "source": [
16 | "## Download Xenium output from 10X website\n",
17 | "Paste the URL for the binned_outputs.tar.gz for the sample you want to analyze.\n",
18 | "\n",
19 | "1. Go to Xenium public datasets page:https://www.10xgenomics.com/datasets?query=&page=1&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000&refinementList%5Bproduct.name%5D%5B0%5D=In%20Situ%20Gene%20Expression&refinementList%5Bspecies%5D%5B0%5D=Human&refinementList%5BdiseaseStates%5D%5B0%5D=colorectal%20cancer\n",
20 | "\n",
21 | "2. Select sample to analyze scrolling down to downloads section, click \"Batch download\"\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "5f721b2b-4314-4528-9c01-185726147728",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import zipfile\n",
32 | "xenium_outputs_url = \"https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE_outs.zip\"\n",
33 | "# Step 1: Download the raw Xenium output\n",
34 | "!curl -O {xenium_outputs_url}\n",
35 | "\n",
36 | "# Extract the ZIP file\n",
37 | "zip_file_path = xenium_outputs_url.split(\"/\")[-1]\n",
38 | "\n",
39 | "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n",
40 | " zip_ref.extractall(\"extracted_files\")\n",
41 | "\n",
42 | "print(\"Extraction completed.\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "a9fcd48a-2f55-43b4-befd-8d646ea634cf",
48 | "metadata": {},
49 | "source": [
50 | "### Install prerequisite libraries"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "7453e3e3-a55c-47fb-ab83-2c3743833b89",
57 | "metadata": {
58 | "scrolled": true,
59 | "tags": []
60 | },
61 | "outputs": [],
62 | "source": [
63 | "!pip install --upgrade pip\n",
64 | "!pip install scipy\n",
65 | "!pip install shapely\n",
66 | "!pip install tifffile\n",
67 | "!pip install plotly\n",
68 | "!pip install tensorflow-gpu==2.10.0\n",
69 | "!pip install stardist\n",
70 | "!pip install geopandas\n",
71 | "!pip install scanpy\n",
72 | "!pip install fastparquet\n",
73 | "!pip install opencv-python\n",
74 | "!pip install geojson\n",
75 | "!pip install scikit-learn"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "id": "1f79fb2c-0fd9-4bd4-8be9-4d1bd04d8733",
81 | "metadata": {},
82 | "source": [
83 | "### Import Relevant Libraries"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "16e4dc02-2b8d-4e00-9cbd-8a4d151ca5af",
90 | "metadata": {
91 | "scrolled": true,
92 | "tags": []
93 | },
94 | "outputs": [],
95 | "source": [
96 | "import geopandas as gpd # Geopandas for storing Shapely objects\n",
97 | "from matplotlib.colors import ListedColormap\n",
98 | "import matplotlib.pyplot as plt\n",
99 | "import scanpy as sc\n",
100 | "import pandas as pd\n",
101 | "from scipy import sparse\n",
102 | "import anndata\n",
103 | "import os\n",
104 | "import gzip\n",
105 | "import numpy as np\n",
106 | "import re\n",
107 | "import shapely\n",
108 | "from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point objects\n",
109 | "from shapely import wkt"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "id": "46a8d90a-65dd-4e93-b4e2-4a257d6e1dc7",
115 | "metadata": {},
116 | "source": [
117 | "### Load Cell & Transcripts Info"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "id": "feb54b91-6757-467c-81d3-7a4f6916fcda",
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "# Load the transcript data\n",
128 | "transcripts_path = \"extracted_files/transcripts.csv.gz\"\n",
129 | "with gzip.open(transcripts_path, 'rt') as f:\n",
130 | " transcripts_df = pd.read_csv(f)\n",
131 | "\n",
132 | "# Load cell info\n",
133 | "cells_path = \"extracted_files/cells.csv.gz\"\n",
134 | "with gzip.open(cells_path, 'rt') as f:\n",
135 | " cells_df = pd.read_csv(f)\n"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "ccac1dea-7855-4af4-8989-c2b63deed2f1",
141 | "metadata": {},
142 | "source": [
143 | "### Load Cell Boundary Info"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "25d2bdf0-8871-4bb0-a38e-3f9c31c7b3ea",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "import zarr\n",
154 | "\n",
155 | "zarr_file = zarr.open('extracted_files/cells.zarr.zip', mode='r')\n",
156 | "print(zarr_file.tree())"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "id": "c092c013-dd0d-47f5-a6cc-3491f1f62dfe",
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "file = zarr_file['polygon_sets/0/vertices'][:]\n",
167 | "# 1 is whole cell, 0 is nucleus"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "0da5ff74-6269-42b4-9a9e-604f520a7528",
173 | "metadata": {
174 | "tags": []
175 | },
176 | "source": [
177 | "### Create folders to store synthetic data"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "id": "a6839176-4a75-4f1e-b4f7-13899b946963",
183 | "metadata": {},
184 | "source": [
185 | "For both the `xenium_dir` and `enact_data_dir`, change `\"/home/oneai/\"` to the directory that stores this repo."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "7ec69f53-4a93-491a-b6f0-652b27ffaaf1",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "xenium_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/synthetic_data/xenium\" # Update it to the directory where you want to save the synthetic data\n",
196 | "enact_data_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/xenium_nuclei/chunks\" # Directory that saves all the input and results of the enact pipeline, \n",
197 | "# should end with \"oneai-dda-spatialtr-visiumhd_analysis/cache/xenium_nuclei/chunks\"\n",
198 | "\n",
199 | "transcripts_df_chunks_dir = os.path.join(xenium_dir, \"transcripts_patches\") # Directory to store the files that contain the transcripts info for each chunk\n",
200 | "output_dir = os.path.join(enact_data_dir, \"bins_gdf\") # Directory to store the results of gene-to-bin assignment for each chunk\n",
201 | "cells_df_chunks_dir = os.path.join(enact_data_dir,\"cells_gdf\") \n",
202 | "ground_truth_dir = os.path.join(xenium_dir, \"ground_truth_nuclei\")\n",
203 | "\n",
204 | "# Making relevant directories\n",
205 | "os.makedirs(xenium_dir, exist_ok=True)\n",
206 | "os.makedirs(enact_data_dir, exist_ok=True)\n",
207 | "os.makedirs(transcripts_df_chunks_dir, exist_ok=True)\n",
208 | "os.makedirs(output_dir, exist_ok=True)\n",
209 | "os.makedirs(cells_df_chunks_dir, exist_ok=True)\n",
210 | "os.makedirs(ground_truth_dir, exist_ok=True)"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "id": "dafe70a1-ed23-4cb6-a7b6-d35e4c01f895",
216 | "metadata": {},
217 | "source": [
218 | "### Generate Synthetic VisiumHD Dataset"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "id": "5bdd8461-7bcc-4101-b26b-765daf975916",
224 | "metadata": {
225 | "tags": []
226 | },
227 | "source": [
228 | "#### Break transcripts df to patches (based on location)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "id": "042b4ce0-30d1-4c23-9b2d-0622db0a4f8c",
234 | "metadata": {},
235 | "source": [
236 | "Break transcripts df to patches of size 1000um x 1000um (larger patch size may result in memory issue)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "id": "60fb886a-5893-40ba-b187-650d6cfb4ed6",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "# patch size: 1000 um x 1000 um\n",
247 | "\n",
248 | "patch_size = 1000\n",
249 | "\n",
250 | "# patch indices\n",
251 | "transcripts_df['x_patch'] = (transcripts_df['x_location'] // patch_size).astype(int)\n",
252 | "transcripts_df['y_patch'] = (transcripts_df['y_location'] // patch_size).astype(int)\n",
253 | "transcripts_df[\"patch_id\"] = transcripts_df[\"x_patch\"].astype(str) + \"_\" + transcripts_df[\"y_patch\"].astype(str)\n",
254 | "\n",
255 | "# Create a df for each patch\n",
256 | "grouped = transcripts_df.groupby(['x_patch', 'y_patch'])\n",
257 | "for (x_patch, y_patch), group in grouped:\n",
258 | " # Calculate the start and end locations for each patch\n",
259 | " # x_start = x_patch * patch_size\n",
260 | " # x_end = (x_patch + 1) * patch_size\n",
261 | " # y_start = y_patch * patch_size\n",
262 | " # y_end = (y_patch + 1) * patch_size\n",
263 | " \n",
264 | " filename = f\"patch_{x_patch}_{y_patch}.csv\"\n",
265 | "    output_loc = os.path.join(transcripts_df_chunks_dir, filename)\n",
266 | " group.to_csv(output_loc)\n",
267 | "\n",
268 | "    print(f\"Saved {filename}\")"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "id": "a7bbc9ec-675b-4b25-8448-334ed317798a",
274 | "metadata": {
275 | "tags": []
276 | },
277 | "source": [
278 | "#### Generate synthetic VisiumHD for each patch"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "id": "ceebc8fd-88e9-4b14-8470-a474085dee64",
284 | "metadata": {},
285 | "source": [
286 | "Each patch is broken into bins of size 2um x 2um. The synthetic data contains transcript counts organized by bin_id. Each row contains transcript counts for a unique bin. Bins with no transcript counts are not included. \n",
287 | "\n",
288 | "In addition to all the gene features, there are two additional columns that represent the row number and column number of the bin, and a column that contains the Shapely polygon representing the bin. The first column is the bin_id."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "d19155a0-5646-49bd-915c-94737e251bb0",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "def generate_synthetic_VesiumHD_data(transcripts_df, bin_size=2, whole_cell=True, QScore20=True):\n",
299 | " filtered_df = transcripts_df.copy()\n",
300 | " # only count transcripts in the nucleus\n",
301 | " if not whole_cell:\n",
302 | " filtered_df = transcripts_df[transcripts_df['overlaps_nucleus'] == 1].copy()\n",
303 | " \n",
304 | "    # only count transcripts with QScore >= 20\n",
305 | " if QScore20:\n",
306 | " filtered_df = filtered_df[filtered_df['qv'] >= 20].copy()\n",
307 | " \n",
308 | "    # assign a bin to each transcript\n",
309 | " filtered_df.loc[:, 'row'] =np.ceil(filtered_df['y_location'] / bin_size).astype(int)\n",
310 | " filtered_df.loc[:, 'column'] = np.ceil(filtered_df['x_location'] / bin_size).astype(int)\n",
311 | " filtered_df.loc[:, 'assigned_bin_id'] = filtered_df.apply(\n",
312 | " lambda row: f\"{bin_size}um_\" + str(row['row']).zfill(5) +\"_\"+ str(row['column']).zfill(5),\n",
313 | " axis=1)\n",
314 | " \n",
315 | " bin_coordinates = filtered_df[['assigned_bin_id', 'row', 'column']].drop_duplicates().set_index('assigned_bin_id')\n",
316 | " bin_gene_matrix = filtered_df.groupby(['assigned_bin_id', 'feature_name']).size().unstack(fill_value=0)\n",
317 | " bin_gene_matrix_with_coords = bin_gene_matrix.merge(bin_coordinates, left_index=True, right_index=True)\n",
318 | " \n",
319 | " return bin_gene_matrix_with_coords"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "id": "bd804c49-dc85-4fa9-85d4-a621cf0598ae",
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "# Extract row and column number from the bin_id\n",
330 | "def extract_numbers(entry):\n",
331 | " match = re.search(r'_(\\d{5})_(\\d{5})', entry)\n",
332 | " if match:\n",
333 | " number1 = int(match.group(1).lstrip('0')) \n",
334 | " number2 = int(match.group(2).lstrip('0')) \n",
335 | " return number2*2-1, number1*2-1\n",
336 | " else:\n",
337 | " return None, None"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "id": "f8d45c22-2776-4b80-a29b-37d07f6b06c5",
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "from tqdm import tqdm\n",
348 | "def generate_bin_polys(bins_df, x_col, y_col, bin_size):\n",
349 | " \"\"\"Represents the bins as Shapely polygons\n",
350 | "\n",
351 | " Args:\n",
352 | " bins_df (pd.DataFrame): bins dataframe\n",
353 | " x_col (str): column with the bin centre x-coordinate\n",
354 | " y_col (str): column with the bin centre y-coordinate\n",
355 | " bin_size (int): bin size in pixels\n",
356 | "\n",
357 | " Returns:\n",
358 | " list: list of Shapely polygons\n",
359 | " \"\"\"\n",
360 | " geometry = []\n",
361 | " # Generates Shapely polygons to represent each bin\n",
362 | "\n",
363 | " if True:\n",
364 | " half_bin_size = bin_size / 2\n",
365 | " bbox_coords = pd.DataFrame(\n",
366 | " {\n",
367 | " \"min_x\": bins_df[x_col] - half_bin_size,\n",
368 | " \"min_y\": bins_df[y_col] - half_bin_size,\n",
369 | " \"max_x\": bins_df[x_col] + half_bin_size,\n",
370 | " \"max_y\": bins_df[y_col] + half_bin_size,\n",
371 | " }\n",
372 | " )\n",
373 | " geometry = [\n",
374 | " shapely.geometry.box(min_x, min_y, max_x, max_y)\n",
375 | " for min_x, min_y, max_x, max_y in tqdm(\n",
376 | " zip(\n",
377 | " bbox_coords[\"min_x\"],\n",
378 | " bbox_coords[\"min_y\"],\n",
379 | " bbox_coords[\"max_x\"],\n",
380 | " bbox_coords[\"max_y\"],\n",
381 | " ),\n",
382 | " total=len(bins_df),\n",
383 | " )\n",
384 | " ]\n",
385 | "\n",
386 | " return geometry"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "id": "9f1c4071-ff50-4ec1-bd0d-37c8ddecaa54",
393 | "metadata": {
394 | "tags": []
395 | },
396 | "outputs": [],
397 | "source": [
398 | "# Loop through all the transcripts_df chunks and generate gene-to-bin assignments \n",
399 | "patch_size = 1000\n",
400 | "bin_size = 2\n",
401 | "transcripts_df_chunks = os.listdir(transcripts_df_chunks_dir)\n",
402 | "for chunk_fname in transcripts_df_chunks:\n",
403 | " output_loc = os.path.join(output_dir, chunk_fname)\n",
404 | " # if os.path.exists(output_loc):\n",
405 | " # continue\n",
406 | " if chunk_fname in [\".ipynb_checkpoints\"]:\n",
407 | " continue\n",
408 | "    transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_chunks_dir, chunk_fname))\n",
409 | " bin_df_chunk = generate_synthetic_VesiumHD_data(transcripts_df_chunk, bin_size, whole_cell=True, QScore20=True)\n",
410 | " bin_df_chunk['column'] = bin_df_chunk['column']*2-1\n",
411 | " bin_df_chunk['row'] = bin_df_chunk['row']*2-1\n",
412 | " bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', 2)\n",
413 | " bin_gdf_chunk = gpd.GeoDataFrame( bin_df_chunk, geometry = bin_df_chunk['geometry'])\n",
414 | " bin_df_chunk.to_csv(output_loc)\n",
415 | " print(f\"Successfully assigned transcripts to bins for {chunk_fname}\")\n"
416 | ]
417 | },
418 | {
419 | "cell_type": "markdown",
420 | "id": "105e310d-2a9d-41b5-9450-23ab3e57e7f7",
421 | "metadata": {
422 | "tags": []
423 | },
424 | "source": [
425 | "### Generate cell_gdf as enact_pipeline input"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "id": "428d33fd-45be-4dde-b4b9-acc3de13f9e0",
431 | "metadata": {},
432 | "source": [
433 | "This section generates the cell_df patches required to run the ENACT pipeline. The main purpose is to create Shapely polygons that represent the cell outline."
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": null,
439 | "id": "5a4bff77-1b7a-4921-a4c2-0b66cf800468",
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "def create_polygons(coords_array):\n",
444 | " polygons = []\n",
445 | " for row in coords_array:\n",
446 | " reshaped_coords = row.reshape(-1, 2)\n",
447 | " polygon = Polygon(reshaped_coords)\n",
448 | " polygons.append(polygon)\n",
449 | " return polygons\n",
450 | "\n",
451 | "# Create the polygons\n",
452 | "polygons = create_polygons(file)\n",
453 | "cells_df['polygons'] = polygons"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "id": "22875d42-5489-4ed0-b370-d693f26318e9",
460 | "metadata": {},
461 | "outputs": [],
462 | "source": [
463 | "cell_gdf_chunk = gpd.GeoDataFrame(cells_df, geometry = cells_df['polygons'])\n",
464 | "cell_gdf_chunk.rename(columns={'x_centroid': 'cell_x', 'y_centroid': 'cell_y'}, inplace=True)\n",
465 | "cell_gdf_chunk.drop(\"Unnamed: 0\", axis=1, inplace=True)\n",
466 | "cell_gdf_chunk[['cell_id','cell_x','cell_y','geometry']].to_csv(os.path.join(enact_data_dir, \"cells_gdf\"))"
467 | ]
468 | },
469 | {
470 | "cell_type": "markdown",
471 | "id": "5e38a13d-3dfa-45e2-abc3-1b40c382a1db",
472 | "metadata": {
473 | "tags": []
474 | },
475 | "source": [
476 | "### Run ENACT bin-to-cell pipeline\n",
477 | "In the configs.yaml file: \n",
478 | "\n",
479 | "    Set \"analysis_name\" in the configs.yaml file to \"xenium\" (or \"xenium_nuclei\").\n",
480 | " Set \"run_synthetic\" to True and all other steps to False.\n",
481 | " Set \"bin_to_cell_method\" to one of these four: \"naive\", \"weighted_by_area\", \"weighted_by_gene\", or \"weighted_by_cluster\"\n",
482 | "\n",
483 | "Run `make run_enact`"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "id": "2ae8aa8e-0a17-48ae-86ed-81a04ec203dc",
489 | "metadata": {
490 | "tags": []
491 | },
492 | "source": [
493 | "### Generate Ground Truth"
494 | ]
495 | },
496 | {
497 | "cell_type": "markdown",
498 | "id": "670974eb-8dae-4d67-b735-1cd53858d560",
499 | "metadata": {},
500 | "source": [
501 | "The following cell will generate and save the ground truth of the synthetic VisiumHD data for use in evaluating bin-to-cell assignment methods. The ground truth dataframe consists of rows representing the transcript counts of each cell. Each column represents a gene feature (the gene feature name is also the column name)."
502 | ]
503 | },
504 | {
505 | "cell_type": "markdown",
506 | "id": "8224ea02-5701-450c-9efb-c38de7492764",
507 | "metadata": {
508 | "tags": []
509 | },
510 | "source": [
511 | "#### Generate Cell-gene matrix for evaluation"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "id": "8f23be59-86ef-4ed0-b9fd-b22b203fa769",
518 | "metadata": {},
519 | "outputs": [],
520 | "source": [
521 | "def generate_ground_truth_table(transcripts_df, cells_df, whole_cell=True, QScore20=True, include_unassigned_transcript=False):\n",
522 | " filtered_df = transcripts_df\n",
523 | " \n",
524 | " # only count transcripts in the nucleus\n",
525 | " if not whole_cell:\n",
526 | " filtered_df = transcripts_df[transcripts_df['overlaps_nucleus'] == 1]\n",
527 | " \n",
528 | " # only count transcripts with QScore >= 20\n",
529 | " if QScore20:\n",
530 | " filtered_df = filtered_df[filtered_df['qv'] >= 20]\n",
531 | " \n",
532 | " # only count transcripts that are assigned to specific cells\n",
533 | " if not include_unassigned_transcript:\n",
534 | " filtered_df = filtered_df[filtered_df['cell_id'] != 'UNASSIGNED']\n",
535 | " \n",
536 | " pivot_df = filtered_df.pivot_table(index='cell_id', columns='feature_name', aggfunc='size', fill_value=0)\n",
537 | " \n",
538 | " merged_df = pivot_df.merge(cells_df[['cell_id']], left_index=True, right_on='cell_id', how='right')\n",
539 | " columns = ['cell_id'] + [col for col in merged_df.columns if col not in ['cell_id', 'x_centroid', 'y_centroid','polygons']]\n",
540 | " merged_df = merged_df[columns]\n",
541 | " merged_df.set_index('cell_id', inplace=True)\n",
542 | " #merged_df['total_gene_counts'] = merged_df.iloc[:, 3:].sum(axis=1)\n",
543 | " \n",
544 | " return merged_df"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "id": "389f2644-5496-4286-961c-fa74ea32e97f",
551 | "metadata": {},
552 | "outputs": [],
553 | "source": [
554 | "bin_size = 2\n",
555 | "cell_df_chunks = os.listdir(cells_df_chunks_dir)\n",
556 | "for chunk_fname in cell_df_chunks:\n",
557 | " output_loc = os.path.join(ground_truth_dir,chunk_fname)\n",
558 | " if os.path.exists(output_loc):\n",
559 | " continue\n",
560 | " if chunk_fname in [\".ipynb_checkpoints\"]:\n",
561 | " continue\n",
562 | " cell_df_chunk = pd.read_csv(os.path.join(cell_dir, chunk_fname))\n",
563 | " groundtruth_chunk = generate_ground_truth_table(transcripts_df, cell_df_chunk, whole_cell=False, QScore20=False, include_unassigned_transcript=False)\n",
564 | " groundtruth_chunk.to_csv(output_loc)\n",
565 | " print(f\"Successfully generated groundthuth for {chunk_fname}\")"
566 | ]
567 | },
568 | {
569 | "cell_type": "markdown",
570 | "id": "f7a648b8-c2d9-4489-951e-dc0c443b489d",
571 | "metadata": {
572 | "tags": []
573 | },
574 | "source": [
575 | "### Evaluation of ENACT bin-to-cell results"
576 | ]
577 | },
578 | {
579 | "cell_type": "markdown",
580 | "id": "3759f86f-8498-41b1-a7ea-ca934b102d22",
581 | "metadata": {
582 | "tags": []
583 | },
584 | "source": [
585 | "#### Overall precision, recall, and f1"
586 | ]
587 | },
588 | {
589 | "cell_type": "markdown",
590 | "id": "20f300f2-73fb-4c86-9bf0-704f053d5299",
591 | "metadata": {},
592 | "source": [
593 |     "Run this section with all the methods you have run with ENACT; change 'method' in the cell below to the one you want to evaluate."
594 | ]
595 | },
596 | {
597 | "cell_type": "code",
598 | "execution_count": null,
599 | "id": "5061ee46-1591-4a96-8643-5e96d7c55a44",
600 | "metadata": {},
601 | "outputs": [],
602 | "source": [
603 | "import pandas as pd\n",
604 | "import numpy as np\n",
605 | "\n",
606 | "method = \"weighted_by_cluster\"\n",
607 | "results_dir = os.path.join(enact_data_dir, method, \"bin_to_cell_assign\")\n",
608 | "\n",
609 | "# Initialize variables to accumulate weighted precision, recall, and F1\n",
610 | "total_cells = 0\n",
611 | "precision_sum = 0\n",
612 | "recall_sum = 0\n",
613 | "missing_cells_count = 0\n",
614 | "total_cells_count = 0\n",
615 | "results_chunks = os.listdir(results_dir)\n",
616 | "\n",
617 | "for chunk_fname in results_chunks:\n",
618 | " if chunk_fname in [\".ipynb_checkpoints\"]:\n",
619 | " continue\n",
620 | "\n",
621 | " generated = pd.read_csv(os.path.join(results_dir, chunk_fname))\n",
622 | " ground_truth = pd.read_csv(os.path.join(ground_truth_dir, chunk_fname))\n",
623 | " if len(generated) ==0:\n",
624 | " print(chunk_fname)\n",
625 | " continue\n",
626 | " generated.rename(columns={'id': 'cell_id'}, inplace=True)\n",
627 | " \n",
628 | " # Align both dataframes by 'cell_id', filling missing cells in generated with 0\n",
629 | " merged = pd.merge(ground_truth, generated, on='cell_id', how='left', suffixes=('_gt', '_gen')).fillna(0)\n",
630 | " num_cells = (ground_truth.iloc[:, 1:] != 0).any(axis=1).sum()\n",
631 | " missing_cells_count += num_cells - len(generated)\n",
632 | " total_cells_count += num_cells\n",
633 | "\n",
634 | " ground_truth_aligned = merged.filter(like='_gt').values\n",
635 | " generated_aligned = merged.filter(like='_gen').values\n",
636 | " assert ground_truth_aligned.shape == generated_aligned.shape, \"Aligned matrices must have the same shape!\"\n",
637 | "\n",
638 | " num_cells = ground_truth_aligned.shape[0]\n",
639 | "\n",
640 | " # Compute precision for the current patch\n",
641 | " patch_precision = np.sum(np.minimum(generated_aligned, ground_truth_aligned)) / np.sum(generated_aligned)\n",
642 | "\n",
643 | " # Compute recall for the current patch\n",
644 | " patch_recall = np.sum(np.minimum(generated_aligned, ground_truth_aligned)) / np.sum(ground_truth_aligned)\n",
645 | "\n",
646 | " # F1 score for the current patch\n",
647 | " if patch_precision + patch_recall > 0:\n",
648 | " patch_f1 = 2 * (patch_precision * patch_recall) / (patch_precision + patch_recall)\n",
649 | " else:\n",
650 | " patch_f1 = 0\n",
651 | "\n",
652 | " # Accumulate the weighted precision, recall, and number of aligned cells\n",
653 | " precision_sum += patch_precision * num_cells\n",
654 | " recall_sum += patch_recall * num_cells\n",
655 | " total_cells += num_cells\n",
656 | " \n",
657 | "# Compute overall weighted precision, recall, and F1 score\n",
658 | "overall_precision = precision_sum / total_cells\n",
659 | "overall_recall = recall_sum / total_cells\n",
660 | "\n",
661 | "if overall_precision + overall_recall > 0:\n",
662 | " overall_f1_score = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall)\n",
663 | "else:\n",
664 | " overall_f1_score = 0 \n",
665 | "\n",
666 | "# Print results\n",
667 | "print(f\"Overall Precision: {overall_precision}\")\n",
668 | "print(f\"Overall Recall: {overall_recall}\")\n",
669 | "print(f\"Overall F1 Score: {overall_f1_score}\")\n",
670 | "print(f\"Total missing cells in the generated data compared to ground truth: {missing_cells_count}\")\n",
671 | "print(f\"Total cells : {total_cells_count}\")"
672 | ]
673 | },
674 | {
675 | "cell_type": "markdown",
676 | "id": "eef397d4-ce75-4459-869e-7141fb72ba79",
677 | "metadata": {
678 | "tags": []
679 | },
680 | "source": [
681 | "#### Visualize the distribution using violin plots "
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "id": "e0b763a9-4dce-48c3-9e43-40d2fbfd7c88",
687 | "metadata": {},
688 | "source": [
689 |     "The following cells will create violin plots for all four methods in order to better compare the results. You can choose to compare only the ones you have run by changing the 'methods' list below to include only those."
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": null,
695 | "id": "b5e2326d-d85e-4075-afa5-2edf492eef0b",
696 | "metadata": {},
697 | "outputs": [],
698 | "source": [
699 | "import pandas as pd\n",
700 | "import numpy as np\n",
701 | "import os\n",
702 | "import seaborn as sns\n",
703 | "import matplotlib.pyplot as plt\n",
704 | "\n",
705 | "# Define methods and their directories\n",
706 | "methods = [\n",
707 | " {\n",
708 | " 'name': 'Naive',\n",
709 | " 'results_dir': os.path.join(enact_data_dir, \"naive\", \"bin_to_cell_assign\"), \n",
710 | " 'ground_truth_dir':ground_truth_dir\n",
711 | " },\n",
712 | " {\n",
713 | " 'name': 'Weighted_by_area',\n",
714 | " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_area\", \"bin_to_cell_assign\"), \n",
715 | " 'ground_truth_dir':ground_truth_dir\n",
716 | " },\n",
717 | " {\n",
718 | " 'name': 'Weighted_by_gene',\n",
719 | " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_gene\", \"bin_to_cell_assign\"), \n",
720 | " 'ground_truth_dir': ground_truth_dir\n",
721 | " },\n",
722 | " {\n",
723 | " 'name': 'Weighted_by_cluster',\n",
724 | " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_cluster\", \"bin_to_cell_assign\"), \n",
725 | " 'ground_truth_dir': ground_truth_dir\n",
726 | " }\n",
727 | "]\n",
728 | "\n",
729 | "# Initialize a list to store per-patch metrics for all methods\n",
730 | "metrics_list = []\n",
731 | "\n",
732 | "# Loop through each method to compute per-patch metrics\n",
733 | "for method in methods:\n",
734 | " method_name = method['name']\n",
735 | " results_dir = method['results_dir']\n",
736 | " ground_truth_dir = method['ground_truth_dir']\n",
737 | " \n",
738 | " print(f\"Processing {method_name}...\")\n",
739 | " \n",
740 | " # Get list of generated and ground truth files\n",
741 | " generated_files = [f for f in os.listdir(results_dir) if f.endswith('.csv') and f not in [\".ipynb_checkpoints\"]]\n",
742 | " ground_truth_files = [f for f in os.listdir(ground_truth_dir) if f.endswith('.csv') and f not in [\".ipynb_checkpoints\"]]\n",
743 | " \n",
744 | " # Find common files between generated results and ground truth\n",
745 | " common_files = set(generated_files) & set(ground_truth_files)\n",
746 | " \n",
747 | " if not common_files:\n",
748 | " print(f\"No common files found for {method_name}. Skipping method.\")\n",
749 | " continue\n",
750 | " \n",
751 | " # Loop through each common file (patch)\n",
752 | " for fname in common_files:\n",
753 | " ground_truth_path = os.path.join(ground_truth_dir, fname)\n",
754 | " generated_path = os.path.join(results_dir, fname)\n",
755 | " \n",
756 | " # Load ground truth and generated data\n",
757 | " ground_truth = pd.read_csv(ground_truth_path)\n",
758 | " generated = pd.read_csv(generated_path)\n",
759 | " \n",
760 | " # Skip if generated data is empty\n",
761 | " if generated.empty:\n",
762 | " print(f\"No data in generated file {fname} for {method_name}. Skipping patch.\")\n",
763 | " continue\n",
764 | " \n",
765 | " # Rename columns for consistency\n",
766 | " if 'id' in generated.columns:\n",
767 | " generated.rename(columns={'id': 'cell_id'}, inplace=True)\n",
768 | " \n",
769 | " # Merge ground truth and generated data on 'cell_id', filling missing values with 0\n",
770 | " merged = pd.merge(\n",
771 | " ground_truth, generated, on='cell_id', how='outer', suffixes=('_gt', '_gen')\n",
772 | " ).fillna(0)\n",
773 | " \n",
774 | " # Extract aligned matrices for ground truth and generated data\n",
775 | " ground_truth_aligned = merged.filter(regex='_gt$').values\n",
776 | " generated_aligned = merged.filter(regex='_gen$').values\n",
777 | " \n",
778 | " # Ensure matrices are aligned\n",
779 | " if ground_truth_aligned.shape != generated_aligned.shape:\n",
780 | " print(f\"Shape mismatch in patch {fname} for {method_name}. Skipping patch.\")\n",
781 | " continue\n",
782 | " \n",
783 | " # Compute counts for this patch\n",
784 | " tp = np.sum(np.minimum(generated_aligned, ground_truth_aligned))\n",
785 | " predicted = np.sum(generated_aligned)\n",
786 | " actual = np.sum(ground_truth_aligned)\n",
787 | " \n",
788 | " # Compute metrics for this patch\n",
789 | " precision = tp / predicted if predicted > 0 else 0\n",
790 | " recall = tp / actual if actual > 0 else 0\n",
791 | " f1_score = (\n",
792 | " 2 * (precision * recall) / (precision + recall)\n",
793 | " if (precision + recall) > 0 else 0\n",
794 | " )\n",
795 | " \n",
796 | " # Store metrics for this patch\n",
797 | " metrics_list.append({\n",
798 | " 'Method': method_name,\n",
799 | " 'Patch': fname,\n",
800 | " 'Precision': precision,\n",
801 | " 'Recall': recall,\n",
802 | " 'F1 Score': f1_score\n",
803 | " })\n",
804 | "\n",
805 | "# Create a DataFrame with per-patch metrics\n",
806 | "metrics_df = pd.DataFrame(metrics_list)\n",
807 | "\n",
808 | "# Display the first few rows of the DataFrame\n",
809 | "print(\"\\nPer-Patch Metrics:\")\n",
810 | "print(metrics_df.head())"
811 | ]
812 | },
813 | {
814 | "cell_type": "code",
815 | "execution_count": null,
816 | "id": "6ad2f5b2-4b89-4480-85b5-6af2cc6bfb56",
817 | "metadata": {},
818 | "outputs": [],
819 | "source": [
820 | "# plotting\n",
821 | "sns.set(style=\"whitegrid\")\n",
822 | "\n",
823 | "# Create a figure with subplots for each metric\n",
824 | "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n",
825 | "\n",
826 | "# Precision Violin Plot\n",
827 | "sns.violinplot(x='Method', y='Precision', data=metrics_df, ax=axes[0], inner='quartile', palette='Set2')\n",
828 | "axes[0].set_title('Precision')\n",
829 | "axes[0].set_xlabel('Method')\n",
830 | "axes[0].set_ylabel('value')\n",
831 | "axes[0].set_ylim(0,1)\n",
832 | "axes[0].tick_params(axis='x', labelsize=8) # Adjust the font size here\n",
833 | "\n",
834 | "# Recall Violin Plot\n",
835 | "sns.violinplot(x='Method', y='Recall', data=metrics_df, ax=axes[1], inner='quartile', palette='Set2')\n",
836 | "axes[1].set_title('Recall')\n",
837 | "axes[1].set_xlabel('Method')\n",
838 | "axes[1].set_ylabel('value')\n",
839 | "axes[1].set_ylim(0,1)\n",
840 | "axes[1].tick_params(axis='x', labelsize=8) # Adjust the font size here\n",
841 | "\n",
842 | "# F1 Score Violin Plot\n",
843 | "sns.violinplot(x='Method', y='F1 Score', data=metrics_df, ax=axes[2], inner='quartile', palette='Set2')\n",
844 | "axes[2].set_title('F1 Score')\n",
845 | "axes[2].set_xlabel('Method')\n",
846 | "axes[2].set_ylabel('value')\n",
847 | "axes[2].set_ylim(0,1)\n",
848 | "axes[2].tick_params(axis='x', labelsize=8) # Adjust the font size here\n",
849 | "\n",
850 | "plt.tight_layout()\n",
851 | "plt.show()"
852 | ]
853 | }
854 | ],
855 | "metadata": {
856 | "kernelspec": {
857 | "display_name": "Python 3 (ipykernel)",
858 | "language": "python",
859 | "name": "python3"
860 | },
861 | "language_info": {
862 | "codemirror_mode": {
863 | "name": "ipython",
864 | "version": 3
865 | },
866 | "file_extension": ".py",
867 | "mimetype": "text/x-python",
868 | "name": "python",
869 | "nbconvert_exporter": "python",
870 | "pygments_lexer": "ipython3",
871 | "version": "3.10.14"
872 | }
873 | },
874 | "nbformat": 4,
875 | "nbformat_minor": 5
876 | }
877 |
--------------------------------------------------------------------------------
/src/synthetic_data/generate_synthetic_data_seqFISH.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4d2cee82-d84e-4c57-af58-6ce961a3f819",
6 | "metadata": {},
7 | "source": [
8 | "To generate synthetic VisiumHD data from seqFISH+, please read and run all the cells below. Thanks!"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "a9fcd48a-2f55-43b4-befd-8d646ea634cf",
14 | "metadata": {
15 | "tags": []
16 | },
17 | "source": [
18 | "### Install prerequisite libraries"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "7453e3e3-a55c-47fb-ab83-2c3743833b89",
25 | "metadata": {
26 | "scrolled": true,
27 | "tags": []
28 | },
29 | "outputs": [
30 | {
31 | "name": "stdout",
32 | "output_type": "stream",
33 | "text": [
34 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
35 | "Requirement already satisfied: pip in /opt/conda/lib/python3.10/site-packages (24.2)\n",
36 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
37 | "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (1.10.0)\n",
38 | "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /home/oneai/.local/lib/python3.10/site-packages (from scipy) (1.22.4)\n",
39 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
40 | "Requirement already satisfied: shapely in /home/oneai/.local/lib/python3.10/site-packages (2.0.0)\n",
41 | "Requirement already satisfied: numpy>=1.14 in /home/oneai/.local/lib/python3.10/site-packages (from shapely) (1.22.4)\n",
42 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
43 | "Requirement already satisfied: tifffile in /home/oneai/.local/lib/python3.10/site-packages (2022.10.10)\n",
44 | "Requirement already satisfied: numpy>=1.19.2 in /home/oneai/.local/lib/python3.10/site-packages (from tifffile) (1.22.4)\n",
45 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
46 | "Requirement already satisfied: plotly in /home/oneai/.local/lib/python3.10/site-packages (5.13.1)\n",
47 | "Requirement already satisfied: tenacity>=6.2.0 in /home/oneai/.local/lib/python3.10/site-packages (from plotly) (9.0.0)\n",
48 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
49 | "Requirement already satisfied: tensorflow-gpu==2.10.0 in /opt/conda/lib/python3.10/site-packages (2.10.0)\n",
50 | "Requirement already satisfied: absl-py>=1.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.1.0)\n",
51 | "Requirement already satisfied: astunparse>=1.6.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.6.3)\n",
52 | "Requirement already satisfied: flatbuffers>=2.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (24.3.25)\n",
53 | "Requirement already satisfied: gast<=0.4.0,>=0.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.4.0)\n",
54 | "Requirement already satisfied: google-pasta>=0.1.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.2.0)\n",
55 | "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.66.1)\n",
56 | "Requirement already satisfied: h5py>=2.9.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.11.0)\n",
57 | "Requirement already satisfied: keras<2.11,>=2.10.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.0)\n",
58 | "Requirement already satisfied: keras-preprocessing>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.1.2)\n",
59 | "Requirement already satisfied: libclang>=13.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (18.1.1)\n",
60 | "Requirement already satisfied: numpy>=1.20 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.22.4)\n",
61 | "Requirement already satisfied: opt-einsum>=2.3.2 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.3.0)\n",
62 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (24.0)\n",
63 | "Requirement already satisfied: protobuf<3.20,>=3.9.2 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.19.6)\n",
64 | "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (69.5.1)\n",
65 | "Requirement already satisfied: six>=1.12.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.16.0)\n",
66 | "Requirement already satisfied: tensorboard<2.11,>=2.10 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.1)\n",
67 | "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.37.1)\n",
68 | "Requirement already satisfied: tensorflow-estimator<2.11,>=2.10.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.0)\n",
69 | "Requirement already satisfied: termcolor>=1.1.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.4.0)\n",
70 | "Requirement already satisfied: typing-extensions>=3.6.6 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (4.12.2)\n",
71 | "Requirement already satisfied: wrapt>=1.11.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.14.1)\n",
72 | "Requirement already satisfied: wheel<1.0,>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from astunparse>=1.6.0->tensorflow-gpu==2.10.0) (0.43.0)\n",
73 | "Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.33.0)\n",
74 | "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.4.6)\n",
75 | "Requirement already satisfied: markdown>=2.6.8 in /home/oneai/.local/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.7)\n",
76 | "Requirement already satisfied: requests<3,>=2.21.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.31.0)\n",
77 | "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.6.1)\n",
78 | "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (1.8.1)\n",
79 | "Requirement already satisfied: werkzeug>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.0.4)\n",
80 | "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (5.4.0)\n",
81 | "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.4.0)\n",
82 | "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (4.9)\n",
83 | "Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.10/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.0.0)\n",
84 | "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.3.2)\n",
85 | "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.6)\n",
86 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (1.26.19)\n",
87 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2024.7.4)\n",
88 | "Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/conda/lib/python3.10/site-packages (from werkzeug>=1.0.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.1.5)\n",
89 | "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.6.0)\n",
90 | "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.2.2)\n",
91 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
92 | "Requirement already satisfied: stardist in /home/oneai/.local/lib/python3.10/site-packages (0.9.1)\n",
93 | "Requirement already satisfied: csbdeep>=0.8.0 in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.8.0)\n",
94 | "Requirement already satisfied: scikit-image in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.19.3)\n",
95 | "Requirement already satisfied: numba in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.55.2)\n",
96 | "Requirement already satisfied: imageio in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (2.35.1)\n",
97 | "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.22.4)\n",
98 | "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.10.0)\n",
99 | "Requirement already satisfied: matplotlib in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (3.6.2)\n",
100 | "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.16.0)\n",
101 | "Requirement already satisfied: tifffile in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (2022.10.10)\n",
102 | "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (4.66.2)\n",
103 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (24.0)\n",
104 | "Requirement already satisfied: pillow>=8.3.2 in /home/oneai/.local/lib/python3.10/site-packages (from imageio->stardist) (10.4.0)\n",
105 | "Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in /home/oneai/.local/lib/python3.10/site-packages (from numba->stardist) (0.38.1)\n",
106 | "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from numba->stardist) (69.5.1)\n",
107 | "Requirement already satisfied: networkx>=2.2 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-image->stardist) (3.3)\n",
108 | "Requirement already satisfied: PyWavelets>=1.1.1 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-image->stardist) (1.6.0)\n",
109 | "Requirement already satisfied: contourpy>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (1.2.1)\n",
110 | "Requirement already satisfied: cycler>=0.10 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (0.12.1)\n",
111 | "Requirement already satisfied: fonttools>=4.22.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (4.53.1)\n",
112 | "Requirement already satisfied: kiwisolver>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (1.4.5)\n",
113 | "Requirement already satisfied: pyparsing>=2.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (3.1.4)\n",
114 | "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (2.9.0)\n",
115 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
116 | "Requirement already satisfied: geopandas in /home/oneai/.local/lib/python3.10/site-packages (0.12.2)\n",
117 | "Requirement already satisfied: pandas>=1.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (1.5.2)\n",
118 | "Requirement already satisfied: shapely>=1.7 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (2.0.0)\n",
119 | "Requirement already satisfied: fiona>=1.8 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (1.9.6)\n",
120 | "Requirement already satisfied: pyproj>=2.6.1.post1 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (3.6.1)\n",
121 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from geopandas) (24.0)\n",
122 | "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (24.2.0)\n",
123 | "Requirement already satisfied: certifi in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (2024.7.4)\n",
124 | "Requirement already satisfied: click~=8.0 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (8.1.7)\n",
125 | "Requirement already satisfied: click-plugins>=1.0 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (1.1.1)\n",
126 | "Requirement already satisfied: cligj>=0.5 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (0.7.2)\n",
127 | "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (1.16.0)\n",
128 | "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (2.9.0)\n",
129 | "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (2022.7.1)\n",
130 | "Requirement already satisfied: numpy>=1.21.0 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (1.22.4)\n",
131 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
132 | "Requirement already satisfied: scanpy in /home/oneai/.local/lib/python3.10/site-packages (1.9.1)\n",
133 | "Requirement already satisfied: anndata>=0.7.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.8.0)\n",
134 | "Requirement already satisfied: numpy>=1.17.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.22.4)\n",
135 | "Requirement already satisfied: matplotlib>=3.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.6.2)\n",
136 | "Requirement already satisfied: pandas>=1.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.5.2)\n",
137 | "Requirement already satisfied: scipy>=1.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.10.0)\n",
138 | "Requirement already satisfied: seaborn in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.13.2)\n",
139 | "Requirement already satisfied: h5py>=3 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.11.0)\n",
140 | "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from scanpy) (4.66.2)\n",
141 | "Requirement already satisfied: scikit-learn>=0.22 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.2.0)\n",
142 | "Requirement already satisfied: statsmodels>=0.10.0rc2 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.14.2)\n",
143 | "Requirement already satisfied: patsy in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.5.6)\n",
144 | "Requirement already satisfied: networkx>=2.3 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.3)\n",
145 | "Requirement already satisfied: natsort in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (8.4.0)\n",
146 | "Requirement already satisfied: joblib in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.4.2)\n",
147 | "Requirement already satisfied: numba>=0.41.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.55.2)\n",
148 | "Requirement already satisfied: umap-learn>=0.3.10 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.5.6)\n",
149 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from scanpy) (24.0)\n",
150 | "Requirement already satisfied: session-info in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.0.0)\n",
151 | "Requirement already satisfied: contourpy>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.2.1)\n",
152 | "Requirement already satisfied: cycler>=0.10 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (0.12.1)\n",
153 | "Requirement already satisfied: fonttools>=4.22.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (4.53.1)\n",
154 | "Requirement already satisfied: kiwisolver>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.4.5)\n",
155 | "Requirement already satisfied: pillow>=6.2.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (10.4.0)\n",
156 | "Requirement already satisfied: pyparsing>=2.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (3.1.4)\n",
157 | "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (2.9.0)\n",
158 | "Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in /home/oneai/.local/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (0.38.1)\n",
159 | "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (69.5.1)\n",
160 | "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0->scanpy) (2022.7.1)\n",
161 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-learn>=0.22->scanpy) (3.1.0)\n",
162 | "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from patsy->scanpy) (1.16.0)\n",
163 | "Requirement already satisfied: pynndescent>=0.5 in /home/oneai/.local/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy) (0.5.13)\n",
164 | "Requirement already satisfied: stdlib-list in /home/oneai/.local/lib/python3.10/site-packages (from session-info->scanpy) (0.10.0)\n",
165 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
166 | "Requirement already satisfied: fastparquet in /home/oneai/.local/lib/python3.10/site-packages (2024.5.0)\n",
167 | "Requirement already satisfied: pandas>=1.5.0 in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (1.5.2)\n",
168 | "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (1.22.4)\n",
169 | "Requirement already satisfied: cramjam>=2.3 in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (2.8.3)\n",
170 | "Requirement already satisfied: fsspec in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (2024.6.1)\n",
171 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from fastparquet) (24.0)\n",
172 | "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2.9.0)\n",
173 | "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2022.7.1)\n",
174 | "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas>=1.5.0->fastparquet) (1.16.0)\n",
175 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
176 | "Requirement already satisfied: imagecodecs in /home/oneai/.local/lib/python3.10/site-packages (2024.6.1)\n",
177 | "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from imagecodecs) (1.22.4)\n",
178 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
179 | "Requirement already satisfied: zarr in /home/oneai/.local/lib/python3.10/site-packages (2.17.1)\n",
180 | "Requirement already satisfied: asciitree in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.3.3)\n",
181 | "Requirement already satisfied: numpy>=1.21.1 in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (1.22.4)\n",
182 | "Requirement already satisfied: numcodecs>=0.10.0 in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.13.0)\n",
183 | "Requirement already satisfied: fasteners in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.19)\n",
184 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
185 | "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (1.10.0)\n",
186 | "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /home/oneai/.local/lib/python3.10/site-packages (from scipy) (1.22.4)\n",
187 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n",
188 | "Requirement already satisfied: h5py in /home/oneai/.local/lib/python3.10/site-packages (3.11.0)\n",
189 | "Requirement already satisfied: numpy>=1.17.3 in /home/oneai/.local/lib/python3.10/site-packages (from h5py) (1.22.4)\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "!pip install --upgrade pip\n",
195 | "!pip install scipy\n",
196 | "!pip install shapely\n",
197 | "!pip install tifffile\n",
198 | "!pip install plotly\n",
199 | "!pip install tensorflow-gpu==2.10.0\n",
200 | "!pip install stardist\n",
201 | "!pip install geopandas\n",
202 | "!pip install scanpy\n",
203 | "!pip install fastparquet\n",
204 | "!pip install imagecodecs\n",
205 | "!pip install zarr\n",
206 | "!pip install scipy\n",
207 | "!pip install h5py"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "id": "1f79fb2c-0fd9-4bd4-8be9-4d1bd04d8733",
213 | "metadata": {
214 | "tags": []
215 | },
216 | "source": [
217 | "### Import Relevant Libraries"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "16e4dc02-2b8d-4e00-9cbd-8a4d151ca5af",
224 | "metadata": {
225 | "scrolled": true,
226 | "tags": []
227 | },
228 | "outputs": [],
229 | "source": [
230 | "import tifffile as tifi # Package to read the WSI (whole slide image)\n",
231 | "from csbdeep.utils import normalize # Image normalization\n",
232 | "from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point objects\n",
233 | "from shapely import wkt\n",
234 | "import geopandas as gpd # Geopandas for storing Shapely objects\n",
235 | "from matplotlib.colors import ListedColormap\n",
236 | "import matplotlib.pyplot as plt\n",
237 | "import scanpy as sc\n",
238 | "import pandas as pd\n",
239 | "from scipy import sparse\n",
240 | "import anndata\n",
241 | "import os\n",
242 | "import gzip\n",
243 | "import numpy as np\n",
244 | "import re\n",
245 | "import shapely\n",
246 | "import zarr\n"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "id": "a91a092e-781d-4a9e-8777-d3bb9c99309c",
252 | "metadata": {
253 | "tags": []
254 | },
255 | "source": [
256 | "### Create folders to store synthetic data"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "id": "37e30a8b-77f8-4d2c-97c2-8274eb0d23a3",
262 | "metadata": {},
263 | "source": [
264 | "For both the `seqfish_dir` and `enact_data_dir`, change `\"/home/oneai/\"` to the directory that stores this repo."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "id": "01f77ecd-3f9a-4a39-bbb2-e90e851ec360",
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "seqfish_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/synthetic_data/seqFISH\" # Update it to the directory where you want to save the synthetic data\n",
275 | "enact_data_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\" # Directory that saves all the input and results of the enact pipeline, \n",
276 | "# should end with \"oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\"\n",
277 | "\n",
278 | "transcripts_df_chunks_dir = os.path.join(seqfish_dir, \"transcripts_patches\") # Directory to store the files that contain the transcripts info for each chunk\n",
279 | "output_dir = os.path.join(enact_data_dir, \"bins_gdf\") # Directory to store the generated synthetic binned transcript counts\n",
280 | "cells_df_chunks_dir = os.path.join(enact_data_dir,\"cells_gdf\") # Directory to store the generated synthetic binned transcript counts\n",
281 | "\n",
282 | "# Making relevant directories\n",
283 | "os.makedirs(seqfish_dir, exist_ok=True)\n",
284 | "os.makedirs(enact_data_dir, exist_ok=True)\n",
285 | "os.makedirs(transcripts_df_chunks_dir, exist_ok=True)\n",
286 | "os.makedirs(output_dir, exist_ok=True)\n",
287 | "os.makedirs(cells_df_chunks_dir, exist_ok=True)"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "id": "0048c41f-18ee-4b92-b7ea-680956330667",
293 | "metadata": {
294 | "tags": []
295 | },
296 | "source": [
297 | "### Download seqFISH+ data\n",
298 | "\n",
299 | "1. Download \"ROIs_Experiment1_NIH3T3.zip\" from https://zenodo.org/records/2669683#.Xqi1w5NKg6g to seqfish_dir. The zipfile contains cell segmentation files\n",
300 | "2. Download \"run1.csv.gz\" from https://github.com/MonashBioinformaticsPlatform/seqfish-hack. It contains the tidy format of \"seqFISH+_NIH3T3_point_locations.zip\" from the official seqFISH+ zenodo site"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "id": "46a8d90a-65dd-4e93-b4e2-4a257d6e1dc7",
306 | "metadata": {
307 | "tags": []
308 | },
309 | "source": [
310 | "### Load Cell & Transcripts Info"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "id": "a40feb4c-1510-4222-bdec-a5e419758f32",
316 | "metadata": {},
317 | "source": [
318 | "The following cells first unzip \"ROIs_Experiment1_NIH3T3.zip\" to extract the cell segmentation information. Then they load the transcripts dataframe from \"run1.csv.gz\""
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "id": "e7bb6152-3999-4ccf-9a08-8fad268ab972",
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "import zipfile\n",
329 | "import os\n",
330 | "zip_file_path = os.path.join(seqfish_dir, \"ROIs_Experiment1_NIH3T3.zip\")\n",
331 | "\n",
332 | "# Open the ZIP file and extract all the contents\n",
333 | "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n",
334 | " zip_ref.extractall(seqfish_dir)\n",
335 | "\n",
336 | "print(f'Files extracted to {seqfish_dir}')"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "id": "062ee054-6e0e-4c2b-9782-9e2b328c18e0",
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "file_path = os.path.join(seqfish_dir, \"run1.csv.gz\")\n",
347 | "\n",
348 | "transcripts_df = pd.read_csv(file_path, compression='gzip')\n",
349 | "print(transcripts_df)"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "id": "eb2be572-8903-4539-8306-087cf61aa82d",
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "# convert from pixel to um\n",
360 | "transcripts_df.x = transcripts_df.x*0.103\n",
361 | "transcripts_df.y = transcripts_df.y*0.103\n",
362 | "# label cell to include fov and cell number\n",
363 | "transcripts_df['new_cell_name'] = transcripts_df.apply(lambda x: f\"{x['fov']}_Cell_{x['cell']}\", axis=1)"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "id": "5c8134a2-9a2b-41b6-81ab-4e292609e2f2",
369 | "metadata": {
370 | "tags": []
371 | },
372 | "source": [
373 | "### Generate Ground Truth"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "id": "9ae7d39d-f065-454a-bea6-7d31f57139fd",
379 | "metadata": {},
380 | "source": [
381 | "The following cell will generate and save the ground truth of the synthetic VisiumHD data for the use of bin-to-cell assignment methods evaluation. Ground truth dataframe consists of rows representing the transcript counts of each cell. Each column represents a gene feature (gene feature name is also the column name)."
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "id": "b9bc5483-2357-40aa-a5b8-1a140b08967a",
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "groundtruth_df = transcripts_df.pivot_table(index=['new_cell_name'], columns='gene', aggfunc='size', fill_value=0)\n",
392 | "ground_truth_file = os.path.join(seqfish_dir, \"groundtruth.csv\")\n",
393 | "groundtruth_df.to_csv(ground_truth_file)"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "id": "dafe70a1-ed23-4cb6-a7b6-d35e4c01f895",
399 | "metadata": {
400 | "tags": []
401 | },
402 | "source": [
403 | "### Generate Synthetic VisiumHD Dataset"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "id": "5bdd8461-7bcc-4101-b26b-765daf975916",
409 | "metadata": {
410 | "tags": []
411 | },
412 | "source": [
413 | "#### Break transcripts df to patches (based on fov)"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "id": "b0d353b0-74cc-44f1-9a5b-ab5533d5d76a",
419 | "metadata": {},
420 | "source": [
421 | "Break transcripts df to patches based on their field of view (fov), since cell segmentation is done on each individual fov separately."
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "id": "60fb886a-5893-40ba-b187-650d6cfb4ed6",
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "# Create a df for each fov\n",
432 | "grouped = transcripts_df.groupby(['fov'])\n",
433 | "for fov, group in grouped:\n",
434 | " filename = f\"patch_{fov}.csv\"\n",
435 | " output_loc = os.path.join(transcripts_df_chunks_dir, filename)\n",
436 | " group.to_csv(output_loc)\n",
437 | "\n",
438 | "    print(f\"Saved {filename}\")"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "id": "a7bbc9ec-675b-4b25-8448-334ed317798a",
444 | "metadata": {
445 | "tags": []
446 | },
447 | "source": [
448 | "#### Generate synthetic VisiumHD for each patch"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "id": "99052790-7e12-4851-b9a4-e9ead3a55d0f",
454 | "metadata": {},
455 | "source": [
456 | "Each fov is broken into bins of size 2um x 2um. The synthetic data contains transcript counts organized by bin_id. Each row contains transcript counts for a unique bin. Bins with no transcript counts are not included. \n",
457 | "\n",
458 | "In addition to all the gene features, there are two additional columns that represent the row number and column number of the bin, and a column that contains the Shapely polygon object that represents the bin. The first column is the bin_id."
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "id": "d19155a0-5646-49bd-915c-94737e251bb0",
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "def generate_synthetic_VesiumHD_data(transcripts_df, bin_size=2):\n",
469 | " \n",
470 | " filtered_df = transcripts_df.copy()\n",
471 | " \n",
472 | "    # assign a bin to each transcript\n",
473 | " filtered_df.loc[:, 'row'] =np.ceil(filtered_df['y'] / bin_size).astype(int)\n",
474 | " filtered_df.loc[:, 'column'] = np.ceil(filtered_df['x'] / bin_size).astype(int)\n",
475 | " filtered_df.loc[:, 'assigned_bin_id'] = filtered_df.apply(\n",
476 | " lambda row: f\"{bin_size}um_\" + str(row['row']).zfill(5) +\"_\"+ str(row['column']).zfill(5),\n",
477 | " axis=1)\n",
478 | " bin_coordinates = filtered_df[['assigned_bin_id', 'row', 'column']].drop_duplicates().set_index('assigned_bin_id')\n",
479 | " bin_gene_matrix = filtered_df.groupby(['assigned_bin_id', 'gene']).size().unstack(fill_value=0)\n",
480 | " bin_gene_matrix_with_coords = bin_gene_matrix.merge(bin_coordinates, left_index=True, right_index=True)\n",
481 | " return bin_gene_matrix_with_coords"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": null,
487 | "id": "bd804c49-dc85-4fa9-85d4-a621cf0598ae",
488 | "metadata": {},
489 | "outputs": [],
490 | "source": [
491 | "# Extract row and column number from the bin_id\n",
492 | "def extract_numbers(entry):\n",
493 | " match = re.search(r'_(\\d{5})_(\\d{5})', entry)\n",
494 | " if match:\n",
495 | " number1 = int(match.group(1).lstrip('0')) \n",
496 | " number2 = int(match.group(2).lstrip('0')) \n",
497 | " return number2*2-1, number1*2-1\n",
498 | " else:\n",
499 | " return None, None"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "id": "ee921e47-70e4-4bee-92e3-6ce40a0fb50d",
506 | "metadata": {},
507 | "outputs": [],
508 | "source": [
509 | "from tqdm import tqdm\n",
510 | "def generate_bin_polys(bins_df, x_col, y_col, bin_size):\n",
511 | " \"\"\"Represents the bins as Shapely polygons\n",
512 | "\n",
513 | " Args:\n",
514 | " bins_df (pd.DataFrame): bins dataframe\n",
515 | " x_col (str): column with the bin centre x-coordinate\n",
516 | " y_col (str): column with the bin centre y-coordinate\n",
517 | " bin_size (int): bin size in pixels\n",
518 | "\n",
519 | " Returns:\n",
520 | " list: list of Shapely polygons\n",
521 | " \"\"\"\n",
522 | " geometry = []\n",
523 | " # Generates Shapely polygons to represent each bin\n",
524 | "\n",
525 | " if True:\n",
526 | " half_bin_size = bin_size / 2\n",
527 | " bbox_coords = pd.DataFrame(\n",
528 | " {\n",
529 | " \"min_x\": bins_df[x_col] - half_bin_size,\n",
530 | " \"min_y\": bins_df[y_col] - half_bin_size,\n",
531 | " \"max_x\": bins_df[x_col] + half_bin_size,\n",
532 | " \"max_y\": bins_df[y_col] + half_bin_size,\n",
533 | " }\n",
534 | " )\n",
535 | " geometry = [\n",
536 | " shapely.geometry.box(min_x, min_y, max_x, max_y)\n",
537 | " for min_x, min_y, max_x, max_y in tqdm(\n",
538 | " zip(\n",
539 | " bbox_coords[\"min_x\"],\n",
540 | " bbox_coords[\"min_y\"],\n",
541 | " bbox_coords[\"max_x\"],\n",
542 | " bbox_coords[\"max_y\"],\n",
543 | " ),\n",
544 | " total=len(bins_df),\n",
545 | " )\n",
546 | " ]\n",
547 | "\n",
548 | " return geometry"
549 | ]
550 | },
551 | {
552 | "cell_type": "code",
553 | "execution_count": null,
554 | "id": "9f1c4071-ff50-4ec1-bd0d-37c8ddecaa54",
555 | "metadata": {
556 | "tags": []
557 | },
558 | "outputs": [],
559 | "source": [
560 | "# Loop through all the transcripts_df patches and generate gene-to-bin assignments \n",
561 | "bin_size = 2\n",
562 | "transcripts_df_chunks = os.listdir(transcripts_df_chunks_dir)\n",
563 | "for chunk_fname in transcripts_df_chunks:\n",
564 | " output_loc = os.path.join(output_dir, chunk_fname)\n",
565 | " if chunk_fname in [\".ipynb_checkpoints\"]:\n",
566 | " continue\n",
567 | " # if os.path.exists(output_loc):\n",
568 | " # continue\n",
569 | " transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_chunks_dir, chunk_fname))\n",
570 | " bin_df_chunk = generate_synthetic_VesiumHD_data(transcripts_df_chunk, bin_size)\n",
571 | " bin_df_chunk['column'] = bin_df_chunk['column']*2-1\n",
572 | " bin_df_chunk['row'] = bin_df_chunk['row']*2-1\n",
573 | " bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', 2)\n",
574 | " bin_gdf_chunk = gpd.GeoDataFrame( bin_df_chunk, geometry = bin_df_chunk['geometry'])\n",
575 | " bin_gdf_chunk.to_csv(output_loc)\n",
576 | " \n",
577 | " print(f\"Successfully assigned transcripts to bins for {chunk_fname}\")"
578 | ]
579 | },
580 | {
581 | "cell_type": "markdown",
582 | "id": "2ae8aa8e-0a17-48ae-86ed-81a04ec203dc",
583 | "metadata": {
584 | "tags": []
585 | },
586 | "source": [
587 | "### Generate ENACT pipeline cell segmentation input"
588 | ]
589 | },
590 | {
591 | "cell_type": "markdown",
592 | "id": "a2fc8e57-23d4-4f71-971b-6e4e1d9f0267",
593 | "metadata": {},
594 | "source": [
595 | "This section generates the cell_df patches required to run the ENACT pipeline. The main purpose is to create Shapely polygons that represent the cell outline."
596 | ]
597 | },
598 | {
599 | "cell_type": "markdown",
600 | "id": "57c34d0c-029c-482f-bc27-fc39e52adf4a",
601 | "metadata": {
602 | "tags": []
603 | },
604 | "source": [
605 | "#### Load cell boundary data and create cell polygons"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": null,
611 | "id": "b140bc6d-f120-4d18-b302-844bb3b79a63",
612 | "metadata": {},
613 | "outputs": [],
614 | "source": [
615 | "import read_roi\n",
616 | "def process_roi_file(key, roi_file_path):\n",
617 | " roi_data = read_roi.read_roi_file(roi_file_path)\n",
618 | " data = roi_data[key]\n",
619 | " # Apply the scaling factor to each coordinate separately\n",
620 | " scaled_x = [x * 0.103 for x in data['x']]\n",
621 | " scaled_y = [y * 0.103 for y in data['y']]\n",
622 | " # Create the list of points using zip on the scaled coordinates\n",
623 | " points = [(x, y) for x, y in zip(scaled_x, scaled_y)]\n",
624 | " # Create and return the polygon\n",
625 | " polygon = Polygon(points)\n",
626 | " return polygon"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": null,
632 | "id": "b5295212-8548-44a1-b15f-1234bdf28b88",
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "def extract_fov_from_string(s):\n",
637 | " # Search for one or more digits in the string\n",
638 | " match = re.search(r'\\d+', s)\n",
639 | " if match:\n",
640 | " return int(match.group(0))+1 # Convert the found number to an integer\n",
641 | " else:\n",
642 | " return None # Return None if no number is found"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": null,
648 | "id": "fef3e532-d471-4635-b743-947c402dbe35",
649 | "metadata": {},
650 | "outputs": [],
651 | "source": [
652 | "base_path = os.path.join(seqfish_dir, \"ALL_Roi\") # Change this to the path where your fov folders are stored\n",
653 | "fov_data = []\n",
654 | "\n",
655 | "for fov_folder in os.listdir(base_path):\n",
656 | " fov_folder_path = os.path.join(base_path, fov_folder)\n",
657 | " if os.path.isdir(fov_folder_path):\n",
658 | " # Loop through each ROI file in the fov folder\n",
659 | " for roi_file in os.listdir(fov_folder_path):\n",
660 | " if roi_file.endswith('.roi'):\n",
661 | " key = roi_file.replace('.roi', '')\n",
662 | " roi_file_path = os.path.join(fov_folder_path, roi_file)\n",
663 | " polygon = process_roi_file(key, roi_file_path)\n",
664 | " fov_data.append({\n",
665 | " 'fov': extract_fov_from_string(fov_folder),\n",
666 | " 'cell': roi_file.replace('.roi', ''),\n",
667 | " 'geometry': polygon\n",
668 | " })\n",
669 | "\n",
670 | "cell_boundary_df = pd.DataFrame(fov_data)"
671 | ]
672 | },
673 | {
674 | "cell_type": "markdown",
675 | "id": "1f3b01b4-c042-4e70-9dd0-7ef88741b833",
676 | "metadata": {
677 | "tags": []
678 | },
679 | "source": [
680 | "#### relabel cell name of polygons df to the standard name"
681 | ]
682 | },
683 | {
684 | "cell_type": "code",
685 | "execution_count": null,
686 | "id": "15c59f4d-6fce-4702-861a-176516f518b3",
687 | "metadata": {},
688 | "outputs": [],
689 | "source": [
690 | "df_sorted = cell_boundary_df.sort_values(by=['fov', 'cell'])\n",
691 | "df_sorted['cell_id'] = df_sorted.groupby('fov').cumcount() + 1\n",
692 | "df_sorted['cell_id'] = df_sorted.apply(lambda x: f\"{x['fov']}_Cell_{x['cell_id']}\", axis=1)\n",
693 | "df_sorted.to_csv(\"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/cells_df.csv\")"
694 | ]
695 | },
696 | {
697 | "cell_type": "markdown",
698 | "id": "8c9e51e0-b001-4b31-a6cb-d9a9c8f32eb4",
699 | "metadata": {
700 | "tags": []
701 | },
702 | "source": [
703 | "#### Break cell polygons df to patches (based on fov)"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": null,
709 | "id": "13e7bc10-1903-46ef-9042-9086b35259a5",
710 | "metadata": {},
711 | "outputs": [
712 | {
713 | "name": "stdout",
714 | "output_type": "stream",
715 | "text": [
716 | "Saved patch_1.csv\n",
717 | "Saved patch_2.csv\n",
718 | "Saved patch_3.csv\n",
719 | "Saved patch_4.csv\n",
720 | "Saved patch_5.csv\n",
721 | "Saved patch_6.csv\n",
722 | "Saved patch_7.csv\n"
723 | ]
724 | },
725 | {
726 | "name": "stderr",
727 | "output_type": "stream",
728 | "text": [
729 | "/tmp/ipykernel_1563651/2577681905.py:3: FutureWarning: In a future version of pandas, a length 1 tuple will be returned when iterating over a groupby with a grouper equal to a list of length 1. Don't supply a list with a single grouper to avoid this warning.\n",
730 | " for fov, group in grouped:\n"
731 | ]
732 | }
733 | ],
734 | "source": [
735 | "\n",
736 | "# Create a df for each patch\n",
737 | "grouped = df_sorted.groupby(['fov'])\n",
738 | "for fov, group in grouped:\n",
739 | " filename = f\"patch_{fov}.csv\"\n",
740 | " output_loc = os.path.join(cells_df_chunks_dir, filename)\n",
741 | " group.to_csv(output_loc)\n",
742 | "\n",
743 | "    print(f\"Saved {filename}\")\n"
744 | ]
745 | },
746 | {
747 | "cell_type": "markdown",
748 | "id": "eb4bebd9-bc07-44da-a02f-28d5ddc3c1ed",
749 | "metadata": {
750 | "tags": []
751 | },
752 | "source": [
753 | "### Run ENACT bin-to-cell pipeline\n",
754 | "In the configs.yaml file: \n",
755 | "\n",
756 | " Set \"analysis_name\" in the configs.yaml file to \"seqfish\".\n",
757 | " Set \"run_synthetic\" to True.\n",
758 | " Set \"bin_to_cell_method\" to one of these four: \"naive\", \"weighted_by_area\", \"weighted_by_gene\", or \"weighted_by_cluster\"\n",
759 | "\n",
760 | "Run `make run_enact`"
761 | ]
762 | },
763 | {
764 | "cell_type": "markdown",
765 | "id": "2a8fc042-2406-4db3-9617-7e3968ce8d28",
766 | "metadata": {
767 | "tags": []
768 | },
769 | "source": [
770 | "### Evaluation of ENACT bin-to-cell results"
771 | ]
772 | },
773 | {
774 | "cell_type": "markdown",
775 | "id": "01ff50d0-2993-42e9-98e9-fe478c32d605",
776 | "metadata": {},
777 | "source": [
778 | "To evaluate and compare the four bin-to-cell methods, please first complete the step above with all four methods. You can also only run the methods you are interested in and change the following code accordingly."
779 | ]
780 | },
781 | {
782 | "cell_type": "markdown",
783 | "id": "1ef3fb7f-cc99-4f9e-b5cc-321412b08ddb",
784 | "metadata": {
785 | "tags": []
786 | },
787 | "source": [
788 | "#### Calculate precision, recall, and F1 for each bin2cell method"
789 | ]
790 | },
791 | {
792 | "cell_type": "markdown",
793 | "id": "4d11287c-611d-49d2-a1e6-e12c14a973f5",
794 | "metadata": {},
795 | "source": [
796 | "Run this section with all the methods you have run with ENACT, changing 'method' in the cell below to the one you want to evaluate."
797 | ]
798 | },
799 | {
800 | "cell_type": "code",
801 | "execution_count": null,
802 | "id": "f3c684f3-5b10-4bd2-8e1c-81d4cdb68ee4",
803 | "metadata": {},
804 | "outputs": [],
805 | "source": [
806 | "# Concatenate all patches of ENACT results file \n",
807 | "method = \"weighted_by_gene\" # other methods: \"naive\", \"weighted_by_area\", \"weighted_by_cluster\" \n",
808 | "directory_path = os.path.join(enact_data_dir,method,\"bin_to_cell_assign\") \n",
809 | "output_file = os.path.join(enact_data_dir,method,\"bin_to_cell_assign/merged.csv\") \n",
810 | "\n",
811 | "concatenate_csv_files(directory_path, output_file)"
812 | ]
813 | },
814 | {
815 | "cell_type": "code",
816 | "execution_count": null,
817 | "id": "4580e62f-e2f3-4d1e-9a25-c483304a119e",
818 | "metadata": {},
819 | "outputs": [],
820 | "source": [
821 | "import os\n",
822 | "import pandas as pd\n",
823 | "\n",
824 | "def concatenate_csv_files(directory_path, output_file):\n",
825 | " dataframes = []\n",
826 | "\n",
827 | " for filename in os.listdir(directory_path):\n",
828 | " if filename.endswith('.csv'):\n",
829 | " file_path = os.path.join(directory_path, filename)\n",
830 | " df = pd.read_csv(file_path)\n",
831 | " dataframes.append(df)\n",
832 | " \n",
833 | " concatenated_df = pd.concat(dataframes, ignore_index=True)\n",
834 | " concatenated_df = concatenated_df.drop(columns = ['Unnamed: 0.1','Unnamed: 0'])\n",
835 | " sorted_df = concatenated_df.sort_values(by='id')\n",
836 | " sorted_df.to_csv(output_file, index=False)\n",
837 | " print(f\"All CSV files have been concatenated into {output_file}\")"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": null,
843 | "id": "263c024a-821e-4d15-9abd-d3463a8e34f1",
844 | "metadata": {},
845 | "outputs": [],
846 | "source": [
847 | "import pandas as pd\n",
848 | "import numpy as np\n",
849 | "from shapely.geometry import Polygon\n",
850 | "\n",
851 | "def calculate_metrics(ground_truth_file, generated_file, eval_file):\n",
852 | " # Load ground truth and generated data\n",
853 | " ground_truth = pd.read_csv(ground_truth_file)\n",
854 | " generated = pd.read_csv(generated_file)\n",
855 | " generated.fillna(0)\n",
856 | " # Ensure 'cell_id' is properly handled\n",
857 | " if 'id' in generated.columns:\n",
858 | " generated.rename(columns={'id': 'new_cell_name'}, inplace=True)\n",
859 | "\n",
860 | " # Merge data on 'cell_id'\n",
861 | " merged = pd.merge(\n",
862 | " ground_truth, generated, on='new_cell_name', how='outer', suffixes=('_gt', '_gen')\n",
863 | " ).fillna(0)\n",
864 | " # print(merged)\n",
865 | "\n",
866 | " # Identify common gene features\n",
867 | " gt_columns = merged.filter(like='_gt').columns\n",
868 | " gen_columns = merged.filter(like='_gen').columns\n",
869 | "\n",
870 | " common_genes = set(gt_columns).intersection(gen_columns)\n",
871 | "\n",
872 | " # Reorder columns based on common genes\n",
873 | " ordered_gt_columns = sorted(gt_columns)\n",
874 | " ordered_gen_columns = sorted(gen_columns)\n",
875 | " \n",
876 | "\n",
877 | " # Extract aligned matrices for ground truth and generated data\n",
878 | " ground_truth_aligned = merged[['new_cell_name'] + [col for col in ordered_gt_columns if col in gt_columns]].values\n",
879 | " generated_aligned = merged[['new_cell_name'] + [col for col in ordered_gen_columns if col in gen_columns]].values\n",
880 | " \n",
881 | " print(ground_truth_aligned)\n",
882 | " print(generated_aligned)\n",
883 | " # Ensure matrices are aligned and have the same shape\n",
884 | " if ground_truth_aligned.shape[1] != generated_aligned.shape[1]:\n",
885 | " raise ValueError(\"The aligned matrices must have the same shape!\")\n",
886 | "\n",
887 | " ground_truth_aligned = ground_truth_aligned[:, 1:] # Exclude cell_ids\n",
888 | " generated_aligned = generated_aligned[:, 1:] \n",
889 | "\n",
890 | " num_cells = (ground_truth.iloc[:, 1:] != 0).any(axis=1).sum()\n",
891 | " tp = np.sum(np.minimum(generated_aligned, ground_truth_aligned), axis=1)\n",
892 | " predicted = np.sum(generated_aligned, axis=1)\n",
893 | " actual = np.sum(ground_truth_aligned, axis=1)\n",
894 | "\n",
895 | " # Calculate precision, recall, and F1 score for each row\n",
896 | " precision = tp / predicted\n",
897 | " recall = tp / actual\n",
898 | " f1_score = 2 * (precision * recall) / (precision + recall)\n",
899 | " \n",
900 | "\n",
901 | " # Add a column called 'Method' where all rows have the same entry\n",
902 | " method_column = np.full((precision.shape[0],), 'Naive') # Replace 'YourMethodName' with the actual method name\n",
903 | "\n",
904 | " df = pd.DataFrame({\n",
905 | " 'Precision': precision,\n",
906 | " 'Recall': recall,\n",
907 | " 'F1 Score': f1_score,\n",
908 | " 'Method': method_column\n",
909 | " })\n",
910 | "\n",
911 | "\n",
912 | " df.to_csv(eval_file)\n"
913 | ]
914 | },
915 | {
916 | "cell_type": "code",
917 | "execution_count": null,
918 | "id": "db707de6-8da9-495e-82a0-69c009cf1475",
919 | "metadata": {},
920 | "outputs": [],
921 | "source": [
922 | "ground_truth_file = os.path.join(seqfish_dir, \"groundtruth.csv\")\n",
923 | "generated_file = os.path.join(enact_data_dir,method,\"bin_to_cell_assign/merged.csv\")\n",
924 | "eval_file = os.path.join(enact_data_dir,method,\"eval.csv\") \n",
925 | "\n",
926 | "calculate_metrics(ground_truth_file, generated_file, eval_file)"
927 | ]
928 | },
929 | {
930 | "cell_type": "markdown",
931 | "id": "5cd9470e-8410-4510-9165-cebd466ab343",
932 | "metadata": {
933 | "tags": []
934 | },
935 | "source": [
936 | "#### Create violin plots comparing four bin2cell methods"
937 | ]
938 | },
939 | {
940 | "cell_type": "markdown",
941 | "id": "b78b6e7d-0e57-46fd-bddb-c701750a625b",
942 | "metadata": {},
943 | "source": [
944 | "The following cells would create violin plots for all four methods in order to better compare the results. You can choose to only compare the ones you have run by changing the 'file_names' list to only include those."
945 | ]
946 | },
947 | {
948 | "cell_type": "code",
949 | "execution_count": null,
950 | "id": "7dc3b4e5-798b-4d0f-a243-bc619daa6f50",
951 | "metadata": {},
952 | "outputs": [],
953 | "source": [
954 | "file_names = [os.path.join(enact_data_dir,\"naive/eval.csv\"), \n",
955 | " os.path.join(enact_data_dir,\"weighted_by_area/eval.csv\"), \n",
956 | " os.path.join(enact_data_dir,\"weighted_by_gene/eval.csv\"),\n",
957 | " os.path.join(enact_data_dir,\"weighted_by_cluster/eval.csv\")] # Replace with actual file paths\n",
958 | "\n",
959 | "# Read and concatenate all files\n",
960 | "df_list = [pd.read_csv(file) for file in file_names]\n",
961 | "metrics_df = pd.concat(df_list, ignore_index=True)"
962 | ]
963 | },
964 | {
965 | "cell_type": "code",
966 | "execution_count": null,
967 | "id": "5a9f977f-6530-446a-9aa4-57d0cbbca85b",
968 | "metadata": {},
969 | "outputs": [],
970 | "source": [
971 | "# Visualize the distributions using violin plots\n",
972 | "sns.set(style=\"whitegrid\")\n",
973 | "\n",
974 | "# Create a figure with subplots for each metric\n",
975 | "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n",
976 | "\n",
977 | "# Precision Violin Plot\n",
978 | "sns.violinplot(x='Method', y='Precision', data=metrics_df, ax=axes[0], inner='quartile', palette='Set2')\n",
979 | "axes[0].set_title('Precision')\n",
980 | "axes[0].set_xlabel('Method')\n",
981 | "axes[0].set_ylabel('value')\n",
982 | "axes[0].set_ylim(0.8,1)\n",
983 | "axes[0].tick_params(axis='x', labelsize=8) # Adjust the font size here\n",
984 | "\n",
985 | "# Recall Violin Plot\n",
986 | "sns.violinplot(x='Method', y='Recall', data=metrics_df, ax=axes[1], inner='quartile', palette='Set2')\n",
987 | "axes[1].set_title('Recall')\n",
988 | "axes[1].set_xlabel('Method')\n",
989 | "axes[1].set_ylabel('value')\n",
990 | "axes[1].set_ylim(0.8,1)\n",
991 | "axes[1].tick_params(axis='x', labelsize=8) # Adjust the font size here\n",
992 | "\n",
993 | "# F1 Score Violin Plot\n",
994 | "sns.violinplot(x='Method', y='F1 Score', data=metrics_df, ax=axes[2], inner='quartile', palette='Set2')\n",
995 | "axes[2].set_title('F1 Score')\n",
996 | "axes[2].set_xlabel('Method')\n",
997 | "axes[2].set_ylabel('value')\n",
998 | "axes[2].set_ylim(0.8,1)\n",
999 | "axes[2].tick_params(axis='x', labelsize=8) # Adjust the font size here\n",
1000 | "\n",
1001 | "plt.tight_layout()\n",
1002 | "plt.show()"
1003 | ]
1004 | }
1005 | ],
1006 | "metadata": {
1007 | "kernelspec": {
1008 | "display_name": "Python 3 (ipykernel)",
1009 | "language": "python",
1010 | "name": "python3"
1011 | },
1012 | "language_info": {
1013 | "codemirror_mode": {
1014 | "name": "ipython",
1015 | "version": 3
1016 | },
1017 | "file_extension": ".py",
1018 | "mimetype": "text/x-python",
1019 | "name": "python",
1020 | "nbconvert_exporter": "python",
1021 | "pygments_lexer": "ipython3",
1022 | "version": "3.10.14"
1023 | }
1024 | },
1025 | "nbformat": 4,
1026 | "nbformat_minor": 5
1027 | }
1028 |
--------------------------------------------------------------------------------
/templates/tmap_template.tmap:
--------------------------------------------------------------------------------
1 | {
2 | "compositeMode": "source-over",
3 | "filename": "",
4 | "filters": [
5 | "Saturation",
6 | "Brightness",
7 | "Contrast"
8 | ],
9 | "layerFilters": {
10 | "0": [
11 | {
12 | "name": "Saturation",
13 | "value": "0"
14 | },
15 | {
16 | "name": "Brightness",
17 | "value": "0"
18 | },
19 | {
20 | "name": "Contrast",
21 | "value": "1"
22 | }
23 | ],
24 | "1": [
25 | {
26 | "name": "Saturation",
27 | "value": "0"
28 | },
29 | {
30 | "name": "Brightness",
31 | "value": "0"
32 | },
33 | {
34 | "name": "Contrast",
35 | "value": "1"
36 | }
37 | ]
38 | },
39 | "layerOpacities": {
40 | "0": "1",
41 | "1": "0.5"
42 | },
43 | "layerVisibilities": {
44 | "0": true,
45 | "1": true
46 | },
47 | "layers": [
48 | {
49 | "name": "wsi.tif",
50 | "tileSource": "wsi.tif.dzi"
51 | },
52 | {
53 | "name": "cells_layer.png",
54 | "tileSource": "cells_layer.png.dzi"
55 | }
56 | ],
57 | "markerFiles": [
58 | {
59 | "autoLoad": false,
60 | "comment": "Displays the cell centroids color-coded by their cell type as predicted by Sargent.",
61 | "expectedHeader": {
62 | "X": "cell_x",
63 | "Y": "cell_y",
64 | "cb_cmap": "",
65 | "cb_col": "",
66 | "cb_gr_dict": "",
67 | "collectionItem_col": "",
68 | "collectionItem_fixed": "0",
69 | "coord_factor": "1",
70 | "gb_col": "cell_type",
71 | "gb_name": "",
72 | "opacity": "0.7",
73 | "opacity_col": "",
74 | "pie_col": "",
75 | "pie_dict": "",
76 | "scale_col": "",
77 | "scale_factor": "0.2",
78 | "shape_col": "",
79 | "shape_fixed": "disc",
80 | "shape_gr_dict": "",
81 | "tooltip_fmt": ""
82 | },
83 | "expectedRadios": {
84 | "_no_outline": true,
85 | "cb_col": false,
86 | "cb_gr": true,
87 | "cb_gr_dict": false,
88 | "cb_gr_key": false,
89 | "cb_gr_rand": true,
90 | "collectionItem_col": false,
91 | "collectionItem_fixed": false,
92 | "opacity_check": false,
93 | "pie_check": false,
94 | "scale_check": false,
95 | "shape_col": false,
96 | "shape_fixed": true,
97 | "shape_gr": false,
98 | "shape_gr_dict": false,
99 | "shape_gr_rand": true
100 | },
101 | "fromButton": 0,
102 | "hideSettings": true,
103 | "name": "Cell centroids",
104 | "path": "",
105 | "title": "Sargent results",
106 | "uid": "U48505"
107 | }
108 | ],
109 | "plugins": [
110 | "Experiment_Data_Export",
111 | "Feature_Space",
112 | "Plot_Histogram",
113 | "Live_Region_Analysis"
114 | ],
115 | "regionFiles": [
116 | {
117 | "path": "",
118 | "title": "Load Pathologist Annotation"
119 | }
120 | ],
121 | "regions": {},
122 | "schemaVersion": "1.3"
123 | }
--------------------------------------------------------------------------------