├── .gitignore ├── ENACT_demo.ipynb ├── ENACT_outputs_demo.ipynb ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── config └── configs.yaml ├── figs ├── pipelineflow.png └── tissuumaps.png ├── pyproject.toml ├── reproduce_paper_results.sh ├── requirements.txt ├── run_cell_ann_eval.sh ├── run_enact.sh ├── sample_enact_output.html ├── setup_py_env.sh ├── src ├── enact │ ├── __init__.py │ ├── assignment_methods │ │ ├── __init__.py │ │ ├── naive.py │ │ ├── weight_by_area.py │ │ └── weight_by_gene.py │ ├── cellassign.py │ ├── celltypist.py │ ├── package_results.py │ ├── pipeline.py │ └── utils │ │ └── logging.py ├── eval │ ├── cell_annotation_eval.py │ └── paper_eval-cellassign-methods-highlevel.ipynb ├── main.py └── synthetic_data │ ├── generate_synthetic_data_Xenium.ipynb │ └── generate_synthetic_data_seqFISH.ipynb └── templates └── tmap_template.tmap /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .hypothesis/ 3 | .pytest_cache/ 4 | .coverage 5 | SpatialOneHD.log 6 | ENACT.log 7 | cache/* 8 | .ipynb_checkpoints/ 9 | src/.ipynb_checkpoints/ 10 | templates/.ipynb_checkpoints/ 11 | data/ 12 | ENACT_supporting_files/ 13 | ENACT_supporting_files.zip 14 | .idea/* 15 | .checkmarx/scan_results/* 16 | src/cache/* 17 | src/cache-pathologist/* 18 | src/binned_outputs/* 19 | src/binned_outputs-mouse/* -------------------------------------------------------------------------------- /ENACT_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "88dfe185-575f-484b-87a5-662d54a8aa14", 6 | "metadata": {}, 7 | "source": [ 8 | "## ENACT Demo Notebook - Human Colorectal Cancer" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "ef3a94f5-4189-4c46-b4fa-570989cb78e9", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook provides a demo for running ENACT on the Human Colorectal Cancer sample 
provided on 10X Genomics' website." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "31994db6-6997-4124-a4d5-bf09dbf64f69", 22 | "metadata": {}, 23 | "source": [ 24 | "### Download VisiumHD data from the 10X Genomics website" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "e56081b4-2eb0-45e4-9f46-7ed118b51551", 30 | "metadata": {}, 31 | "source": [ 32 | "Whole slide image: full resolution tissue image" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "712a9e76-d7e1-4cc1-b0ae-afad223a1713", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_tissue_image.btf" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "bfa4bd8e-4b4d-4593-b5f2-8cc881c1a2b1", 48 | "metadata": {}, 49 | "source": [ 50 | "Visium HD output file. The transcript counts are provided in a .tar.gz file that needs to be extracted:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "6f7bc5a4-6f56-4ffa-8b1c-9c178d5c6022", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz\n", 61 | "!tar -xvzf Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "c838e47e-0e91-4462-a099-ff061cd4f94f", 67 | "metadata": {}, 68 | "source": [ 69 | "Locate the following two files from the extracted outputs file. 
These are the files we will use later as input to ENACT.\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "08ea9a56-14c5-4ebc-bb09-e7535bbc1fee", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | ".\n", 80 | "└── binned_outputs/\n", 81 | " └── square_002um/\n", 82 | " ├── filtered_feature_bc_matrix.h5 <---- Transcript counts file (2um resolution)\n", 83 | " └── spatial/\n", 84 | " └── tissue_positions.parquet <---- Bin locations relative to the full resolution image\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "8d760ee8-f5a0-4a0f-ace6-c0b91176f4e1", 90 | "metadata": {}, 91 | "source": [ 92 | "### Install ENACT\n", 93 | "This will install the ENACT package and its dependencies.\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "d555ae41-0776-4047-bfe2-1ee3ebc475bb", 100 | "metadata": { 101 | "scrolled": true, 102 | "tags": [] 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "!pip install enact-SO" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "39736162-ba27-435f-8dc6-876b2f507315", 112 | "metadata": {}, 113 | "source": [ 114 | "### Access and update the `configs.yaml` file" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "56f81b7a-e58e-498b-8589-0fd9cfc82c08", 120 | "metadata": {}, 121 | "source": [ 122 | "To run the ENACT pipeline, you will need a configuration file that specifies all the required settings. You can download the template configuration file from the GitHub repository.\n", 123 | "\n", 124 | "Refer to [Defining ENACT Configurations](https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact/tree/release/ospo-new?tab=readme-ov-file#defining-enact-configurations) for a full list of parameters to configure." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "19207eb1-f22a-48d7-80ba-08b6fc118872", 130 | "metadata": {}, 131 | "source": [ 132 | "#### Step 1\n", 133 | "Download the `configs.yaml` template from the `config` folder of [this repository](https://github.com/Sanofi-OneAI/oneai-dda-spatialtr-enact), and save it in your working directory." 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "8996161d-5164-4931-bfdc-ca0065686d44", 139 | "metadata": {}, 140 | "source": [ 141 | "#### Step 2\n", 142 | "Edit the input file locations in `configs.yaml` to the downloaded Visium HD files' location." 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "ff15bcbd-681e-4947-8277-cffc30f69df4", 148 | "metadata": {}, 149 | "source": [ 150 | "```yaml\n", 151 | "analysis_name: \"demo-colon\" \n", 152 | "cache_dir: \"enact_output\" \n", 153 | "paths:\n", 154 | " wsi_path: \"Visium_HD_Human_Colon_Cancer_tissue_image.btf\" \n", 155 | " visiumhd_h5_path: \"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\" \n", 156 | " tissue_positions_path: \"binned_outputs/square_002um/spatial/tissue_positions.parquet\" \n", 157 | "```" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "81daa91f-e34e-4018-8a46-89ddb9b6cf99", 163 | "metadata": {}, 164 | "source": [ 165 | "#### Step 3\n", 166 | "Next, we set all the steps in the `configs.yaml` file to `True`, in order to run the whole ENACT pipeline later" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "8a4eb5a6-2436-4cf3-8b52-e06d431fc3a0", 172 | "metadata": {}, 173 | "source": [ 174 | "```yaml\n", 175 | "steps:\n", 176 | " segmentation: True \n", 177 | " bin_to_geodataframes: True \n", 178 | " bin_to_cell_assignment: True \n", 179 | " cell_type_annotation: True \n", 180 | "```" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "4af04737-6ece-431f-b5d5-1eaefe63efca", 186 | "metadata": {}, 187 | "source": [ 188 | "#### Step 4\n", 189 | 
"Lastly, choose the `bin_to_cell_method` and `cell_annotation_method` we want to run with. In this demo, we will go with `\"weighted_by_area\"`, and `\"celltypist\"`.\n", 190 | "\n", 191 | "To run Celltypist as our cell annotation method, we also need to input the `cell_typist_model` parameter based on the type of sample we use." 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "9b495d82-cfce-4973-aed0-84aec7d2ac31", 197 | "metadata": {}, 198 | "source": [ 199 | "```yaml\n", 200 | " params:\n", 201 | " bin_to_cell_method: \"weighted_by_area\" \n", 202 | " cell_annotation_method: \"celltypist\" \n", 203 | " cell_typist_model: \"Human_Colorectal_Cancer.pkl\" \n", 204 | " seg_method: \"stardist\" \n", 205 | " patch_size: 4000 \n", 206 | " use_hvg: True \n", 207 | " n_hvg: 1000 \n", 208 | "```" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "13a165e4-7f63-4cfd-80ed-52a0823692f9", 214 | "metadata": {}, 215 | "source": [ 216 | "### Run ENACT" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "2aadbd97-ddf2-4252-9bb0-c59ba4600c4c", 222 | "metadata": {}, 223 | "source": [ 224 | "Running ENACT on the whole sample image will take around 40 minutes. 
Output of the pipeline will be stored in the `\"enact_output\"` directory" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "393087e7-9598-4ebe-a628-14cc0ac673a8", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "from enact.pipeline import ENACT\n", 235 | "import yaml\n", 236 | "\n", 237 | "configs_path = \"config/configs.yaml\" # Change this to the location of the configs.yaml file that you just edited\n", 238 | "with open(configs_path, \"r\") as stream:\n", 239 | " configs = yaml.safe_load(stream)\n", 240 | "\n", 241 | "so_hd = ENACT(configs_dict=configs)\n", 242 | "so_hd.run_enact()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "cfc23f74", 248 | "metadata": {}, 249 | "source": [ 250 | "New! Alternatively, users can specify ENACT configurations directly in the class constructor with the following *minimum* configurations. Refer to Readme for full list of ENACT parameters." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "87648e0c", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "from enact.pipeline import ENACT\n", 261 | "\n", 262 | "# Running ENACT with `weighted-by-area` bin-to-cell assignment, and `celltypist` for cell type annotation\n", 263 | "so_hd = ENACT(\n", 264 | " cache_dir=\"/home/oneai/test_cache\",\n", 265 | " wsi_path=\"Visium_HD_Human_Colon_Cancer_tissue_image.btf\",\n", 266 | " visiumhd_h5_path=\"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\",\n", 267 | " tissue_positions_path=\"binned_outputs/square_002um/spatial/tissue_positions.parquet\",\n", 268 | " analysis_name=\"demo-colon\", #optional\n", 269 | " bin_to_cell_method=\"weighted_by_area\", #optional \n", 270 | " cell_annotation_method=\"celltypist\", #optional \n", 271 | " cell_typist_model=\"Human_Colorectal_Cancer.pkl\" #optional \n", 272 | ")\n", 273 | "so_hd.run_enact()" 274 | ] 275 | }, 276 | { 277 | "cell_type": 
"markdown", 278 | "id": "ac1595f3", 279 | "metadata": {}, 280 | "source": [ 281 | "Example: Only running the cell segmentation step and disabling all the other steps" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "1fbef539", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "from enact.pipeline import ENACT\n", 292 | "\n", 293 | "so_hd = ENACT(\n", 294 | " cache_dir=\"/home/oneai/test_cache\",\n", 295 | " wsi_path=\"Visium_HD_Human_Colon_Cancer_tissue_image.btf\",\n", 296 | " visiumhd_h5_path=\"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\",\n", 297 | " tissue_positions_path=\"binned_outputs/square_002um/spatial/tissue_positions.parquet\",\n", 298 | " analysis_name=\"demo-colon\",\n", 299 | " bin_to_cell_method=\"weighted_by_area\", \n", 300 | " cell_annotation_method=\"celltypist\", \n", 301 | " cell_typist_model=\"Human_Colorectal_Cancer.pkl\",\n", 302 | " segmentation=True,\n", 303 | " bin_to_geodataframes=False,\n", 304 | " bin_to_cell_assignment=False,\n", 305 | " cell_type_annotation=False\n", 306 | ")\n", 307 | "so_hd.run_enact()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "2ae423f9", 313 | "metadata": {}, 314 | "source": [ 315 | "Example: Running ENACT with `naive` bin-to-cell assignment and `cellassign` for cell type annotation" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "from enact.pipeline import ENACT\n", 325 | "\n", 326 | "so_hd = ENACT(\n", 327 | " cache_dir=\"/home/oneai/test_cache\",\n", 328 | " wsi_path=\"Visium_HD_Human_Colon_Cancer_tissue_image.btf\",\n", 329 | " visiumhd_h5_path=\"binned_outputs/square_002um/filtered_feature_bc_matrix.h5\",\n", 330 | " tissue_positions_path=\"binned_outputs/square_002um/spatial/tissue_positions.parquet\",\n", 331 | " analysis_name=\"demo-colon\",\n", 332 | " bin_to_cell_method=\"naive\", \n", 333 | " 
cell_annotation_method=\"cellassign\"\n", 334 | ")\n", 335 | "so_hd.run_enact()" 336 | ] 337 | } 338 | ], 339 | "metadata": { 340 | "kernelspec": { 341 | "display_name": "Python 3 (ipykernel)", 342 | "language": "python", 343 | "name": "python3" 344 | }, 345 | "language_info": { 346 | "codemirror_mode": { 347 | "name": "ipython", 348 | "version": 3 349 | }, 350 | "file_extension": ".py", 351 | "mimetype": "text/x-python", 352 | "name": "python", 353 | "nbconvert_exporter": "python", 354 | "pygments_lexer": "ipython3", 355 | "version": "3.10.14" 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 5 360 | } 361 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | **Copyright Sanofi 2024** 2 | 3 | Permission is hereby granted, free of charge, for academic research purposes only and for non-commercial uses only, to any person from academic research or non-profit organizations obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, or merge the Software, subject to the following conditions: this permission notice shall be included in all copies of the Software or of substantial portions of the Software. 4 | 5 | For purposes of this license, “non-commercial use” excludes uses foreseeably resulting in a commercial benefit. To use this software for other purposes (such as the development of a commercial product, including but not limited to software, service, or pharmaceuticals, or in a collaboration with a private company), please contact SANOFI at patent.gos@sanofi.com. 6 | 7 | All other rights are reserved. The Software is provided “as is”, without warranty of any kind, express or implied, including the warranties of noninfringement. 
8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.md 3 | include config/configs.yaml 4 | 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ENV_DIR := /home/oneai/envs/ 2 | 3 | PY_ENV_NAME := enact_py_env 4 | 5 | PY_ENV_PATH := $(ENV_DIR)$(PY_ENV_NAME) 6 | 7 | CONFIG_PATH ?= config/configs.yaml 8 | 9 | create-env: 10 | conda create --prefix $(PY_ENV_PATH) python=3.10 11 | 12 | run_enact: 13 | bash setup_py_env.sh $(PY_ENV_PATH) 14 | bash run_enact.sh $(PY_ENV_PATH) ${CONFIG_PATH} 15 | 16 | setup_py_env: 17 | bash setup_py_env.sh $(PY_ENV_PATH) 18 | 19 | run_cell_ann_eval: 20 | bash setup_py_env.sh $(PY_ENV_PATH) 21 | bash run_cell_ann_eval.sh $(PY_ENV_PATH) 22 | 23 | reproduce_results: 24 | bash setup_py_env.sh $(PY_ENV_PATH) 25 | bash reproduce_paper_results.sh $(PY_ENV_PATH) 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ENACT: End-to-End Analysis and Cell Type Annotation for Visium High Definition (HD) Slides 2 | 3 | >[!NOTE] 4 | >This is the official repo for [ENACT](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btaf094/8063614). The manuscript can be accessed through [Bioinformatics Journal](https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btaf094/62340410/btaf094.pdf). 5 | 6 | Spatial transcriptomics (ST) enables the study of gene expression within its spatial context in histopathology samples. To date, a limiting factor has been the resolution of sequencing based ST products. 
The introduction of the Visium High Definition (HD) technology opens the door to cell resolution ST studies. However, challenges remain in the ability to accurately map transcripts to cells and in cell type assignment based on spot data. 7 | 8 | ENACT is the first tissue-agnostic pipeline that integrates advanced cell segmentation with Visium HD transcriptomics data to infer cell types across whole tissue sections. Our pipeline incorporates novel bin-to-cell assignment methods, enhancing the accuracy of single-cell transcript estimates. Validated on diverse synthetic and real datasets, our approach demonstrates high effectiveness at predicting cell types and scalability, offering a robust solution for spatially resolved transcriptomics analysis. 9 | 10 | This repository has the code for inferring cell types from the sub-cellular transcript counts provided by VisiumHD. 11 | 12 | This can be achieved through the following steps: 13 | 14 | 1. **Cell segmentation**: segment high resolution image using NN-based image segmentation networks such as Stardist. 15 | 2. **Bin-to-cell assignment**: Obtain cell-wise transcript counts by aggregating the VisiumHD bins that are associated with each cell 16 | 3. 
**Cell type inference**: Use the cell-wise transcript counts to infer the cell labels/phenotypes using methods used for single-cell RNA seq analysis ([CellAssign](https://www.nature.com/articles/s41592-019-0529-1#:~:text=CellAssign%20uses%20a%20probabilistic%20model%20to%20assign%20single) or [CellTypist](https://pubmed.ncbi.nlm.nih.gov/35549406/#:~:text=To%20systematically%20resolve%20immune%20cell%20heterogeneity%20across%20tissues,) or [Sargent](https://www.sciencedirect.com/science/article/pii/S2215016123001966#:~:text=We%20present%20Sargent,%20a%20transformation-free,%20cluster-free,%20single-cell%20annotation) if installed) or novel approaches, and use comprehensive cell marker databases ([Panglao](https://panglaodb.se/index.html) or [CellMarker](http://xteam.xbio.top/CellMarker/) can be used as reference). 17 | 18 | >[!NOTE] 19 | > [Sargent](https://doi.org/10.1016/j.mex.2023.102196) (doi: https://doi.org/10.1016/j.mex.2023.102196) needs to be installed and set up independently. [Sargent](https://doi.org/10.1016/j.mex.2023.102196) is currently available on the [author's GitHub page](https://github.com/nourin-nn/sargent/). For additional information on Sargent's usage and license, please contact the paper's corresponding authors (nima.nouri@sanofi.com) or check their GitHub page. 
20 | > 21 | > We provide the results obtained by Sargent in [ENACT's Zenodo page](https://doi.org/10.5281/zenodo.15211043) under the following folders: 22 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/naive/sargent_results/ 23 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_area/sargent_results/ 24 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_transcript/sargent_results/ 25 | >- ENACT_supporting_files/public_data/human_colorectal/paper_results/chunks/weighted_by_cluster/sargent_results/ 26 | 27 | 31 | ![plot](figs/pipelineflow.png) 32 | 33 | ## Index of Instructions: 34 | 1. Installation 35 | - [System Requirements](#system-requirements) 36 | - [Install ENACT from Source](#install-enact-from-source) 37 | - [Install ENACT with Pip](#install-enact-with-pip) 38 | 2. Inputs and Outputs 39 | - [Input Files for ENACT](#input-files-for-enact) 40 | - [Defining ENACT Configurations](#defining-enact-configurations) 41 | - [Output Files for ENACT](#output-files-for-enact) 42 | 3. Running ENACT 43 | - [Basic Example: Running ENACT from Notebook](#basic-example-running-enact-from-notebook) 44 | - [Basic Example: Running ENACT from Terminal](#basic-example-running-enact-from-terminal) 45 | - [Running Instructions](#running-instructions) 46 | 4. Visualizing Outputs 47 | - [Working with ENACT Output](#working-with-enact-output) 48 | - [Visualizing Results on TissUUmaps](#visualizing-results-on-tissuumaps) 49 | 5. Reproducing Paper Results 50 | - [Reproducing Paper Results](#reproducing-paper-results) 51 | - [Creating Synthetic VisiumHD Datasets](#creating-synthetic-visiumhd-datasets) 52 | 6. 
[Citing ENACT](#citing-enact) 53 | 54 | ## System Requirements 55 | ENACT was tested with the following specifications: 56 | * Hardware Requirements: 32 CPU, 64GB RAM, 100 GB (harddisk and memory requirements may vary depending on whole slide image size; if the weight of the wsi is small the memory requirements can be significantly decreased) 57 | 58 | * Software: Python 3.10, (Optional) GPU (CUDA 11) 59 | 60 | ## Install ENACT from Source 61 | ### Step 1: Clone ENACT repository 62 | ``` 63 | git clone https://github.com/Sanofi-Public/enact-pipeline.git 64 | cd enact-pipeline 65 | ``` 66 | ### Step 2: Setup Python environment 67 | Start by defining the location and the name of the Conda environment in the `Makefile`: 68 | ``` 69 | ENV_DIR := /home/oneai/envs/ <---- Conda environment location 70 | PY_ENV_NAME := enact_py_env <---- Conda environment name 71 | ``` 72 | Next, run the following Make command to create a Conda environment with all of ENACT's dependencies 73 | ``` 74 | make setup_py_env 75 | ``` 76 | 77 | ## Install ENACT with Pip 78 | ENACT can be installed from [Pypi](https://pypi.org/project/enact-SO/) using: 79 | ``` 80 | pip install enact-SO 81 | ``` 82 | 83 | ## Input Files for ENACT 84 | ENACT requires only three files, which can be obtained from SpaceRanger’s outputs for each experiment: 85 | 86 | 1. **Whole resolution tissue image**. This will be segmented to obtain the cell boundaries that will be used to aggregate the transcript counts. 87 | 2. **tissue_positions.parquet**. This is the file that specifies the *2um* Visium HD bin locations relative to the full resolution image. 88 | 3. **filtered_feature_bc_matrix.h5**. This is the .h5 file with the *2um* Visium HD bin counts. 89 | 90 | ## Defining ENACT Configurations 91 | ENACT users can choose to specify the configurations via one of two ways: 92 | 93 | 1. 
Passing them within the class constructor: 94 | ``` 95 | from enact.pipeline import ENACT 96 | 97 | so_hd = ENACT( 98 | cache_dir="/home/oneai/test_cache", 99 | wsi_path="Visium_HD_Human_Colon_Cancer_tissue_image.btf", 100 | visiumhd_h5_path="binned_outputs/square_002um/filtered_feature_bc_matrix.h5", 101 | tissue_positions_path="binned_outputs/square_002um/spatial/tissue_positions.parquet", 102 | ) 103 | ``` 104 |
105 | Full list of ENACT parameters (click to expand) 106 | 107 | ## Parameters 108 | 109 | - **cache_dir (str)**: 110 | Directory to cache ENACT results. This must be specified by the user. 111 | 112 | - **wsi_path (str)**: 113 | Path to the Whole Slide Image (WSI) file. This must be provided by the user. 114 | 115 | - **visiumhd_h5_path (str)**: 116 | Path to the Visium HD h5 file containing spatial transcriptomics data. This 117 | must be provided by the user. 118 | 119 | - **tissue_positions_path (str)**: 120 | Path to the tissue positions file that contains spatial locations of barcodes. 121 | This must be provided by the user. 122 | 123 | - **analysis_name (str)**: 124 | Name of the analysis, used for output directories and results. 125 | *Default*: `"enact_demo"`. 126 | 127 | - **seg_method (str)**: 128 | Cell segmentation method. 129 | *Default*: `"stardist"`. 130 | *Options*: `["stardist"]`. 131 | 132 | - **patch_size (int)**: 133 | Size of patches (in pixels) to process the image. Use a smaller patch size to 134 | reduce memory requirements. 135 | *Default*: `4000`. 136 | 137 | - **use_hvg (bool)**: 138 | Whether to use highly variable genes (HVG) during the analysis. 139 | *Default*: `True`. 140 | *Options*: `[True]`. 141 | 142 | - **n_hvg (int)**: 143 | Number of highly variable genes to use if `use_hvg` is `True`. 144 | *Default*: `1000`. 145 | 146 | - **n_clusters (int)**: 147 | Number of clusters. Used only if `bin_to_cell_method` is `"weighted_by_cluster"`. 148 | *Default*: `4`. 149 | 150 | - **bin_representation (str)**: 151 | Representation type for VisiumHD bins. 152 | *Default*: `"polygon"`. 153 | *Options*: `["polygon"]`. 154 | 155 | - **bin_to_cell_method (str)**: 156 | Method to assign bins to cells. 157 | *Default*: `"weighted_by_cluster"`. 158 | *Options*: `["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"]`. 159 | 160 | - **cell_annotation_method (str)**: 161 | Method for annotating cell types. 
162 | *Default*: `"celltypist"`. 163 | *Options*: `["celltypist", "sargent" (if installed), "cellassign"]`. 164 | 165 | - **cell_typist_model (str)**: 166 | Path to the pre-trained CellTypist model for cell type annotation. Only used if 167 | `cell_annotation_method` is `"celltypist"`. 168 | Refer to [CellTypist Models](https://www.celltypist.org/models) for a list of 169 | available models. 170 | *Default*: `""` (empty string). 171 | 172 | - **run_synthetic (bool)**: 173 | Whether to run synthetic data generation for testing purposes. 174 | *Default*: `False`. 175 | 176 | - **segmentation (bool)**: 177 | Flag to run the image segmentation step. 178 | *Default*: `True`. 179 | 180 | - **bin_to_geodataframes (bool)**: 181 | Flag to convert the bins to GeoDataFrames. 182 | *Default*: `True`. 183 | 184 | - **bin_to_cell_assignment (bool)**: 185 | Flag to run bin-to-cell assignment. 186 | *Default*: `True`. 187 | 188 | - **cell_type_annotation (bool)**: 189 | Flag to run cell type annotation. 190 | *Default*: `True`. 191 | 192 | - **cell_markers (dict)**: 193 | A dictionary of cell markers used for annotation. Only used if `cell_annotation_method` 194 | is one of `["sargent", "cellassign"]`. 195 | 196 | - **chunks_to_run (list)**: 197 | Specific chunks of data to run the analysis on, typically for debugging. 198 | *Default*: `[]` (runs all chunks). 199 | 200 | - **configs_dict (dict)**: 201 | Dictionary containing ENACT configuration parameters. If provided, the values 202 | in `configs_dict` will override any corresponding parameters passed directly 203 | to the class constructor. This is useful for running ENACT with a predefined 204 | configuration for convenience and consistency. 205 | *Default*: `{}` (uses the parameters specified in the class constructor). 206 | 207 |
208 | 209 | 2. Specifying configurations in a `yaml` file: (sample file located under `config/configs.yaml`): 210 | ```yaml 211 | analysis_name: <---- custom name for analysis. Will create a folder with that name to store the results 212 | run_synthetic: False <---- True if you want to run bin to cell assignment on synthetic dataset, False otherwise 213 | cache_dir: <---- path to store pipeline outputs 214 | paths: 215 | wsi_path: <---- path to whole slide image 216 | visiumhd_h5_path: <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics 217 | tissue_positions_path: <---- location of the tissue_positions.parquet file from 10X Genomics 218 | steps: 219 | segmentation: True <---- True if you want to run segmentation 220 | bin_to_geodataframes: True <---- True to convert bin to geodataframes 221 | bin_to_cell_assignment: True <---- True to run bin-to-cell assignment 222 | cell_type_annotation: True <---- True to run cell type annotation 223 | params: 224 | bin_to_cell_method: "weighted_by_cluster" <---- bin-to-cell assignment method. Pick one of ["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"] 225 | cell_annotation_method: "celltypist" <---- cell annotation method. Pick one of ["cellassign", "celltypist"] 226 | cell_typist_model: "Human_Colorectal_Cancer.pkl" <---- CellTypist model weights to use. Update based on organ of interest if cell_annotation_method is set to "celltypist" 227 | seg_method: "stardist" <---- cell segmentation method. Stardist is the only option for now 228 | image_type: "he" <---- image type. Options are ["he", "if"] (for H&E image or IF image, respectively.) 229 | nucleus_expansion: True <---- flag to enable nuclei expansion to get cell boundaries. Default is True. 230 | expand_by_nbins: 2 <---- number of bins to expand the nuclei by to get cell boundaries. Default is 2 bins. 231 | patch_size: 4000 <---- defines the patch size. 
The whole resolution image will be broken into patches of this size. Reduce if you run into memory issues 232 | use_hvg: True <---- True only run analysis on top n highly variable genes. Setting it to False runs ENACT on all genes in the counts file 233 | n_hvg: 1000 <---- number of highly variable genes to use. Default is 1000. 234 | destripe_norm: False <---- flag to enable destripe normalization (Bin2cell normalization). Recommend enable only for CellTypist. Disable for Sargent. 235 | n_clusters: 4 <---- number of cell clusters to use for the "weighted_by_cluster" method. Default is 4. 236 | n_pcs: 250 <---- number of principal components before clustering for Weighted-by-Cluster. Default is 250. 237 | stardist: 238 | block_size: 4096 <---- the size of image blocks the model processes at a time 239 | prob_thresh: 0.005 <---- value between 0 and 1, higher values lead to fewer segmented objects, but will likely avoid false positives 240 | overlap_thresh: 0.001 <---- value between 0 and 1, higher values allow segmented objects to overlap substantially 241 | min_overlap: 128 <---- overlap between blocks, should it be larger than the size of a cell 242 | context: 128 <---- context pixels around the blocks to be included during prediction 243 | n_tiles: (4,4,1) <---- the input image is broken up into (overlapping) tiles that are processed independently and re-assembled. This parameter denotes a tuple of the number of tiles for every image axis 244 | stardist_modelname: "2D_versatile_he" <---- Specify one of the available Stardist models. 2D_versatile_fluo (for IF images) or 2D_versatile_he (for H&E images) 245 | channel_to_segment: 2 <---- Only applicable for IF images. This is the image channel to segment (usually the DAPI channel) 246 | cell_markers: <---- cell-gene markers to use for cell annotation. Only applicable if params/cell_annotation_method is "cellassign" or "sargent". 
No need to specify for "CellTypist" 247 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"] 248 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"] 249 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"] 250 | ``` 251 | 252 | ## Output Files for ENACT 253 | ENACT outputs all its results under the `cache` directory which gets automatically created at run time: 254 | ``` 255 | . 256 | └── cache/ 257 | └── / 258 | ├── chunks/ # ENACT results at a chunck level 259 | │ ├── bins_gdf/ 260 | │ │ └── patch_.csv 261 | │ ├── cells_gdf/ 262 | │ │ └── patch_.csv 263 | │ └── / 264 | │ ├── bin_to_cell_assign/ 265 | │ │ └── patch_.csv 266 | │ ├── cell_ix_lookup/ 267 | │ │ └── patch_.csv 268 | │ └── _results/ 269 | │ ├── cells_adata.csv 270 | │ └── merged_results.csv 271 | ├── tmap/ # Directory storing files to visualize results on TissUUmaps 272 | │ ├── _adata.h5 273 | │ ├── _tmap.tmap 274 | │ ├── cells_layer.png 275 | │ └── wsi.tif 276 | └── cells_df.csv # cells dataframe, each row is a cell with its coordinates 277 | ``` 278 | ENACT breaks down the whole resolution image into "chunks" (or patches) of size `patch_size`. Results are provided per-chunk under the `chunks` directory. 
279 | * `bins_gdf`: Folder containing GeoPandas dataframes representing the 2um Visium HD bins within a given patch 280 | * `cells_gdf`: Folder containing GeoPandas dataframes representing cells segmented in the tissue 281 | * `/bin_to_cell_assign`: Folder contains dataframes with the transcripts assigned to each cell 282 | * `/cell_ix_lookup`: Folder contains dataframes defining the indices and coordinates of the cells 283 | * `/_results/cells_adata.csv`: Anndata object containing the results from ENACT (cell coordinates, cell types, transcript counts) 284 | * `<bin_to_cell_method>/_results/merged_results.csv`: Dataframe (.csv) containing the results from ENACT (cell coordinates, cell types) 285 | 286 | ## Basic Example: Running ENACT from Notebook 287 | The **[demo notebook](ENACT_demo.ipynb)** provides a step-by-step guide on how to install and run ENACT on VisiumHD public data using a notebook. The **[output processing demo notebook](ENACT_outputs_demo.ipynb)** provides a comprehensive, step-by-step guide on how the user can use the generated data for further downstream analysis (see [Working with ENACT Output](#working-with-enact-output) for additional details). 288 | 289 | ## Basic Example: Running ENACT from Terminal 290 | This section provides a guide for running ENACT on the [Human Colorectal Cancer sample](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-human-crc) provided on 10X Genomics' website. 291 | ### Step 1: Install ENACT from Source 292 | Refer to [Install ENACT from Source](#install-enact-from-source) 293 | 294 | ### Step 2: Download the necessary files from the 10X Genomics website: 295 | 296 | 1. Whole slide image: full resolution tissue image 297 | ``` 298 | curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_tissue_image.btf 299 | ``` 300 | 301 | 2. Visium HD output file. 
The transcript counts are provided in a .tar.gz file that needs to be extracted: 302 | ``` 303 | curl -O https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Colon_Cancer/Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz 304 | tar -xvzf Visium_HD_Human_Colon_Cancer_binned_outputs.tar.gz 305 | ``` 306 | Locate the following two files from the extracted outputs file. 307 | ``` 308 | . 309 | └── binned_outputs/ 310 | └── square_002um/ 311 | ├── filtered_feature_bc_matrix.h5 <---- Transcript counts file (2um resolution) 312 | └── spatial/ 313 | └── tissue_positions.parquet <---- Bin locations relative to the full resolution image 314 | ``` 315 | 316 | ### Step 3: Update input file locations and parameters under `config/configs.yaml` 317 | 318 | Refer to [Running Instructions](#running-instructions) for a full list of ENACT parameters to change. 319 | 320 | Below is a sample configuration file to use to run ENACT on the Human Colorectal cancer sample: 321 | 322 | ```yaml 323 | analysis_name: "colon-demo" 324 | run_synthetic: False # True if you want to run bin to cell assignment on synthetic dataset, False otherwise. 325 | cache_dir: "cache/ENACT_outputs" # Change according to your desired output location 326 | paths: 327 | wsi_path: "/Visium_HD_Human_Colon_Cancer_tissue_image.btf" # whole slide image path 328 | visiumhd_h5_path: "/binned_outputs/square_002um/filtered_feature_bc_matrix.h5" # location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics. 
329 | tissue_positions_path: "/binned_outputs/square_002um/spatial/tissue_positions.parquet" # location of the tissue of the tissue_positions.parquet file from 10X genomics 330 | steps: 331 | segmentation: True # True if you want to run segmentation 332 | bin_to_geodataframes: True # True to convert bin to geodataframes 333 | bin_to_cell_assignment: True # True to assign cells to bins 334 | cell_type_annotation: True # True to run cell type annotation 335 | params: 336 | seg_method: "stardist" # Stardist is the only option for now 337 | image_type: "if" # Image type: Options: ["he", "if"] (for H&E image or IF image, respectively.) 338 | nucleus_expansion: True # Flag to enable nuclei expansion to get cell boundaries 339 | expand_by_nbins: 2 # Number of bins to expand the nuclei by to get cell boundaries 340 | patch_size: 4000 # Defines the patch size. The whole resolution image will be broken into patches of this size 341 | bin_representation: "polygon" # or point TODO: Remove support for anything else 342 | bin_to_cell_method: "weighted_by_cluster" # or naive 343 | cell_annotation_method: "celltypist" 344 | cell_typist_model: "Human_Colorectal_Cancer.pkl" 345 | use_hvg: True # Only run analysis on highly variable genes + cell markers specified 346 | n_hvg: 1000 # Number of highly variable genes to use 347 | n_clusters: 4 # Number of clusters for Weighted-by-Cluster 348 | n_pcs: 250 # Number of principal components before clustering for Weighted-by-Cluster 349 | chunks_to_run: [] 350 | stardist: 351 | block_size: 4096 # the size of image blocks the model processes at a time 352 | prob_thresh: 0.005 # value between 0 and 1, higher values lead to fewer segmented objects, but will likely avoid false positives 353 | overlap_thresh: 0.001 # value between 0 and 1, higher values allow segmented objects to overlap substantially 354 | min_overlap: 128 # overlap between blocks, should it be larger than the size of a cell 355 | context: 128 # context pixels around the blocks 
to be included during prediction 356 | n_tiles: (4,4,1) #the input image is broken up into (overlapping) tiles that are processed independently and re-assembled. This parameter denotes a tuple of the number of tiles for every image axis 357 | stardist_modelname: "2D_versatile_fluo" # Specify one of the available Stardist models: 2D_versatile_fluo (for IF images) or 2D_versatile_he (for H&E images) 358 | channel_to_segment: 2 # Only applicable for IF images. This is the image channel to segment (usually the DAPI channel) 359 | cell_markers: # Only needed if cell_annotation_method is one of "Sargent" or "CellAssign" 360 | # Human Colon 361 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"] 362 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"] 363 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"] 364 | Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"] 365 | Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"] 366 | Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"] 367 | Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"] 368 | Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"] 369 | B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"] 370 | T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"] 371 | NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"] 372 | 373 | ``` 374 | 375 | ## Running Instructions 376 | This section provides a guide on running ENACT on your own data 377 | ### Step 1: Install ENACT from Source 378 | Refer to [Install ENACT from Source](#install-enact-from-source) 379 | 380 | ### Step 2: Define 
the Location of ENACT's Required Files 381 | Define the locations of ENACT's required files in the `config/configs.yaml` file. Refer to [Input Files for ENACT](#input-files-for-enact) 382 | ```yaml 383 | analysis_name: <---- custom name for analysis. Will create a folder with that name to store the results 384 | cache_dir: <---- path to store pipeline outputs 385 | paths: 386 | wsi_path: <---- path to whole slide image 387 | visiumhd_h5_path: <---- location of the 2um x 2um gene by bin file (filtered_feature_bc_matrix.h5) from 10X Genomics. 388 | tissue_positions_path: <---- location of the tissue of the tissue_positions.parquet file from 10X genomics 389 | ``` 390 | 391 | ### Step 3: Define ENACT configurations 392 | Define the following core parameters in the `config/configs.yaml` file: 393 | ```yaml 394 | params: 395 | bin_to_cell_method: "weighted_by_cluster" <---- bin-to-cell assignment method. Pick one of ["naive", "weighted_by_area", "weighted_by_gene", "weighted_by_cluster"] 396 | cell_annotation_method: "celltypist" <---- cell annotation method. Pick one of ["cellassign", "celltypist", "sargent" (if installed)] 397 | cell_typist_model: "Human_Colorectal_Cancer.pkl" <---- CellTypist model weights to use. Update based on organ of interest if using cell_annotation_method is set to 398 | ``` 399 | Refer to [Defining ENACT Configurations](#defining-enact-configurations) for a full list of parameters to configure. If using CellTypist, set `cell_typist_model` to one of the following models based on the organ and species under study: [CellTypist models](https://www.celltypist.org/models#:~:text=CellTypist%20was%20first%20developed%20as%20a%20platform%20for). 400 | 401 | ### Step 4: Define Cell Gene Markers 402 | >[!NOTE] 403 | >Only applies if cell_annotation_method is "cellassign" or "sargent". Skip this step if using CellTypist 404 | 405 | Define the cell gene markers in `config/configs.yaml` file. 
Those can be expert annotated or obtained from open-source databases such as [Panglao](https://panglaodb.se/index.html) or [CellMarker](http://xteam.xbio.top/CellMarker/). Example cell markers for human colorectal cancer samples: 406 | ```yaml 407 | cell_markers: 408 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"] 409 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"] 410 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"] 411 | Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"] 412 | Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"] 413 | Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"] 414 | Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"] 415 | Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"] 416 | B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"] 417 | T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"] 418 | NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"] 419 | ``` 420 | ### Step 5: Run ENACT 421 | ``` 422 | make run_enact 423 | ``` 424 | 425 | ## Working with ENACT Output 426 | 427 | The **[output demo notebook](ENACT_outputs_demo.ipynb)** provides a comprehensive, step-by-step guide on how to access and analyze output data from ENACT. The notebook covers the following topics: 428 | 429 | - **Loading the AnnData object in Python** 430 | Learn how to load the main data structure for single-cell analysis. 431 | 432 | - **Extracting cell types and their spatial coordinates** 433 | Access information about cell types and their positions in the tissue. 
434 | 435 | - **Determining the number of shared and unique bins per cell** 436 | Explore metrics that characterize the bin and cell relationships. 437 | 438 | - **Accessing and visualizing the number of transcripts per cell** 439 | Visualize and analyze transcriptional activity across cells. 440 | 441 | - **Identifying the top-n expressed genes in the sample** 442 | Retrieve the most highly expressed genes in your dataset. 443 | 444 | - **Generating interactive plots** 445 | Visualize cell boundaries and cell types within the tissue using interactive visualizations. 446 | 447 | - **Performing downstream analysis** 448 | Run a sample analysis, such as neighborhood enrichment analysis, using external packages like **Squidpy**. 449 | 450 | This notebook serves as a helpful resource for navigating and analyzing ENACT output data effectively. 451 | 452 | 453 | ## Visualizing Results on TissUUmaps 454 | To view results on [TissUUmaps](https://tissuumaps.github.io), begin by installing TissUUmaps by following the instructions at: 455 | https://tissuumaps.github.io/TissUUmaps-docs/docs/intro/installation.html#. 456 | 457 | Once installed, follow the instructions at: https://tissuumaps.github.io/TissUUmaps-docs/docs/starting/projects.html#loading-projects 458 | 459 | For convenience, ENACT creates a TissUUmaps project file (.tmap extension) located under the `/tmap/` folder. 460 | 464 | ![plot](figs/tissuumaps.png) 465 | 466 | ## Reproducing Paper Results 467 | This section provides a guide on how to reproduce the ENACT paper results on the [10X Genomics Human Colorectal Cancer VisiumHD sample](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-human-crc). 468 | Here, ENACT is run on various combinations of bin-to-cell assignment methods and cell annotation algorithms. 
469 | 470 | ### Step 1: Install ENACT from Source 471 | Refer to [Install ENACT from Source](#install-enact-from-source) 472 | 473 | ### Step 2: Run ENACT on combinations of bin-to-cell assignment methods and cell annotation algorithms 474 | Run the following command which will download all the supplementary files from [ENACT's Zenodo page](https://doi.org/10.5281/zenodo.15211043) and programmatically run ENACT with various combinations of bin-to-cell assignment methods and cell annotation algorithms: 475 | ``` 476 | make reproduce_results 477 | ``` 478 | 479 | ## Creating Synthetic VisiumHD Datasets 480 | 481 | 1. To create synthetic VisiumHD dataset from Xenium or seqFISH+ data, run and follow the instructions of the notebooks in [src/synthetic_data](src/synthetic_data). 482 | 483 | 2. To run the ENACT pipeline with the synthetic data, set the following parameters in the `config/configs.yaml` file: 484 | 485 | ```yaml 486 | run_synthetic: True <---- True if you want to run bin to cell assignment on synthetic dataset, False otherwise. 487 | ``` 488 | 489 | 3. Run ENACT: 490 | ``` 491 | make run_enact 492 | ``` 493 | 494 | ## Citing ENACT 495 | If you use this repository or its tools in your research, please cite the following: 496 | ``` 497 | @article{10.1093/bioinformatics/btaf094, 498 | author = {Kamel, Mena and Song, Yiwen and Solbas, Ana and Villordo, Sergio and Sarangi, Amrut and Senin, Pavel and Sunaal, Mathew and Ayestas, Luis Cano and Levin, Clement and Wang, Seqian and Classe, Marion and Bar-Joseph, Ziv and Pla Planas, Albert}, 499 | title = {ENACT: End-to-end Analysis of Visium High Definition (HD) Data}, 500 | journal = {Bioinformatics}, 501 | pages = {btaf094}, 502 | year = {2025}, 503 | month = {03}, 504 | abstract = {Spatial transcriptomics (ST) enables the study of gene expression within its spatial context in histopathology samples. To date, a limiting factor has been the resolution of sequencing based ST products. 
The introduction of the Visium High Definition (HD) technology opens the door to cell resolution ST studies. However, challenges remain in the ability to accurately map transcripts to cells and in assigning cell types based on the transcript data. We developed ENACT, a self-contained pipeline that integrates advanced cell segmentation with Visium HD transcriptomics data to infer cell types across whole tissue sections. Our pipeline incorporates novel bin-to-cell assignment methods, enhancing the accuracy of single-cell transcript estimates. Validated on diverse synthetic and real datasets, our approach is both scalable to samples with hundreds of thousands of cells and effective, offering a robust solution for spatially resolved transcriptomics analysis. ENACT source code is available at https://github.com/Sanofi-Public/enact-pipeline. Experimental data is available at https://doi.org/10.5281/zenodo.15211043. Supplementary data are available at Bioinformatics online.}, 505 | issn = {1367-4811}, 506 | doi = {10.1093/bioinformatics/btaf094}, 507 | url = {https://doi.org/10.1093/bioinformatics/btaf094}, 508 | eprint = {https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btaf094/62340410/btaf094.pdf}, 509 | } 510 | ``` 511 | -------------------------------------------------------------------------------- /config/configs.yaml: -------------------------------------------------------------------------------- 1 | analysis_name: "colon-demo" 2 | run_synthetic: False # True if you want to run bin to cell assignment on synthetic dataset, False otherwise. 
3 | cache_dir: "/home/oneai/enact-pipeline/ENACT_supporting_files/output_files" 4 | paths: 5 | wsi_path: "/home/oneai/enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/Visium_HD_Human_Colon_Cancer_tissue_image.btf" 6 | visiumhd_h5_path: "/home/oneai/enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/filtered_feature_bc_matrix.h5" 7 | tissue_positions_path: "/home/oneai/enact-pipeline/ENACT_supporting_files/public_data/human_colorectal/input_files/tissue_positions.parquet" 8 | steps: 9 | segmentation: True # True if you want to run segmentation 10 | bin_to_geodataframes: True # True to convert bin to geodataframes 11 | bin_to_cell_assignment: True # True to assign cells to bins 12 | cell_type_annotation: True # True to run cell type annotation 13 | params: 14 | seg_method: "stardist" # Stardist is the only option for now 15 | image_type: "he" # Image type: Options: ["he", "if"] (for H&E image or IF image, respectively.) 16 | nucleus_expansion: True # Flag to enable nuclei expansion to get cell boundaries 17 | expand_by_nbins: 2 # Number of bins to expand the nuclei by to get cell boundaries 18 | patch_size: 4000 # Defines the patch size. 
The whole resolution image will be broken into patches of this size 19 | bin_representation: "polygon" # or point TODO: Remove support for anything else 20 | bin_to_cell_method: "weighted_by_area" # or naive/ weighted_by_cluster/ weighted_by_gene 21 | cell_annotation_method: "celltypist" 22 | cell_typist_model: "Human_Colorectal_Cancer.pkl" # only needed if using cell_annotation_method = "celltypist" 23 | use_hvg: True # Only run analysis on highly variable genes + cell markers specified 24 | n_hvg: 1000 # Number of highly variable genes to use 25 | destripe_norm: False # Flag to enable destripe normalization (Bin2cell normalization) 26 | n_clusters: 4 # Number of clusters for Weighted-by-Cluster 27 | n_pcs: 250 # Number of principal components before clustering for Weighted-by-Cluster 28 | chunks_to_run: [] # Chunks to run ENACT on specific patches 29 | stardist: 30 | block_size: 4096 # the size of image blocks the model processes at a time 31 | prob_thresh: 0.005 # value between 0 and 1, higher values lead to fewer segmented objects, but will likely avoid false positives 32 | overlap_thresh: 0.001 # value between 0 and 1, higher values allow segmented objects to overlap substantially 33 | min_overlap: 128 # overlap between blocks, should it be larger than the size of a cell 34 | context: 128 # context pixels around the blocks to be included during prediction 35 | n_tiles: (4,4,1) # the input image is broken up into (overlapping) tiles that are processed independently and re-assembled. This parameter denotes a tuple of the number of tiles for every image axis 36 | stardist_modelname: "2D_versatile_he" # Specify one of the available Stardist models: 2D_versatile_fluo (for IF images) or 2D_versatile_he (for H&E images) 37 | channel_to_segment: 2 # Only applicable for IF images. 
This is the image channel to segment (usually the DAPI channel) 38 | cell_markers: 39 | # Human Colon 40 | Epithelial: ["CDH1","EPCAM","CLDN1","CD2"] 41 | Enterocytes: ["CD55", "ELF3", "PLIN2", "GSTM3", "KLF5", "CBR1", "APOA1", "CA1", "PDHA1", "EHF"] 42 | Goblet cells: ["MANF", "KRT7", "AQP3", "AGR2", "BACE2", "TFF3", "PHGR1", "MUC4", "MUC13", "GUCA2A"] 43 | Enteroendocrine cells: ["NUCB2", "FABP5", "CPE", "ALCAM", "GCG", "SST", "CHGB", "IAPP", "CHGA", "ENPP2"] 44 | Crypt cells: ["HOPX", "SLC12A2", "MSI1", "SMOC2", "OLFM4", "ASCL2", "PROM1", "BMI1", "EPHB2", "LRIG1"] 45 | Endothelial: ["PECAM1","CD34","KDR","CDH5","PROM1","PDPN","TEK","FLT1","VCAM1","PTPRC","VWF","ENG","MCAM","ICAM1","FLT4"] 46 | Fibroblast: ["COL1A1","COL3A1","COL5A2","PDGFRA","ACTA2","TCF21","FN"] 47 | Smooth muscle cell: ["BGN","MYL9","MYLK","FHL2","ITGA1","ACTA2","EHD2","OGN","SNCG","FABP4"] 48 | B cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"] 49 | T cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"] 50 | NK cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"] 51 | 52 | # # Human Pancreas 53 | # Acinar_cell: ["PRSS1", "KLK1","CTRC", "PNLIP"] 54 | # Alpha_cell: ["GCG", "ARX", "CLIM1", "CRYBA2", "FEV", "GBA", "HMGB3"] 55 | # Beta_cell: ["INS", "BMP-5", "CDKN1C", "CRTR1", "DLK1", "NPTX2", "PACAP"] 56 | # Delta_cell: ["SST", "CHE1", "ESE3B", "ETV1", "GABRG2", "HER4", "ISL1"] 57 | # Ductal_cell: ["PROM1"] 58 | # Epsilon cell: ["GHRL", "TM4SF5"] 59 | # Mesenchymal_cell: ["THY1"] 60 | # Pancreatic_polypeptide_cell: [ 61 | # "AQP3", "ARHGAP3", "ARX", "BHLHB26", "BHLHB27", 62 | # "CARTPT", "EGR3", "ENTPD2", "ETV1", "MEIS1", 63 | # "MEIS2", "PAX6", "PTGFR", "RBTN3", "SERTM1", 64 | # "SLITRK6", "THSD7A", "ZNF506" 65 | # ] 66 | # PP_cell: ["PPY"] 67 | 68 | # # Human breast cancer 69 | # Cancer stem cell: ["CD133", "ALDH1", "SOX2", "OCT4", "CD44"] 70 | # Epithelial cell: 
["EPCAM", "KRT8", "KRT18", "CDH1", "CLDN1", "MUC1"] 71 | # Immune cell: ["CD45", "CD3", "CD19", "CD14", "CD56"] 72 | # Natural killer cell: ["CD56", "CD16", "NKp46", "NKG2D", "CD94"] 73 | # Progenitor cell: ["Nestin", "CD34", "Sox2", "GATA2", "LGR5"] 74 | # Stem cell: ["OCT4", "SOX2", "NANOG", "KLF4", "CD34"] 75 | 76 | 77 | # # Mouse intestine 78 | # Enterocytes: ["Cbr1", "Plin2", "Gls", "Plin3", "Dab1", "Pmepa1", "Acsl5", "Hmox1", "Abcg2", "Cd36"] 79 | # Goblet cells: ["Manf", "Krt7", "Ccl9", "Muc13", "Phgr1", "Cdx2", "Aqp3", "Creb3L1", "Guca2A", "Klk1"] 80 | # Enteroendocrine cells: ["Fabp5", "Cpe", "Enpp2", "Chgb", "Alcam", "Chga", "Pax6", "Neurod1", "Cck", "Isl1"] 81 | # Paneth cells: ["Gpx2", "Fabp4", "Lyz1", "Kcnn4", "Lgals2", "Guca2B", "Lgr4", "Defa24", "Il4Ra", "Guca2A"] 82 | # Crypt cells: ["Prom1", "Hopx", "Msi1", "Olfm4", "Kcne3", "Bmi1", "Axin2", "Kcnq1", "Ascl2", "Lrig1"] 83 | # Smooth muscle cells: ["Bgn", "Myl9", "Pcp4L1", "Itga1", "Nrp2", "Mylk", "Ehd2", "Fabp4", "Acta2", "Ogn"] 84 | # B cells: ["Cd52", "Bcl11A", "Ebf1", "Cd74", "Ptprc", "Pold4", "Ighm", "Cd14", "Creld2", "Fli1"] 85 | # T cells: ["Cd81", "Junb", "Cd52", "Ptprcap", "H2-Q7", "Ccl6", "Bcl2", "Maff", "Ccl4", "Ccl3"] 86 | # NK cells: ["Ctla2A", "Ccl4", "Cd3G", "Ccl3", "Nkg7", "Lat", "Dusp2", "Itgam", "Fhl2", "Ccl5"] 87 | 88 | # # Mouse embryo 89 | # 1-cell stage cell (Blastomere): ['Accsl', 'Acvr1b', 'Asf1b', 'Bcl2l10', 'Blcap', 'Cdk2ap2', 'Ciapin1', 'Dclk2', 'Dusp7', 'H1foo'] 90 | # Blood progenitor cell: ['Flk1', 'Runx1', 'Tal1', 'Runx1'] 91 | # Cardiomyocyte: ['Bmp4', 'Emcn', 'Fbn1', 'Gata4', 'Hand1', 'Hand2', 'Mef2c', 'Myl4', 'Neb', 'Nid1'] 92 | # Fibroblast: ['Col5a2', 'Thy1'] 93 | # Oocyte: ['Abi3bp', 'Ampd3', 'Ankra2', 'Cep78', 'Cnn3', 'Dclre1a', 'Dcun1d5', 'Depdc7', 'Dnajc3', 'Dpy30'] 94 | # Pharyngeal mesoderm cell: ['Prdm1', 'Tbx1'] 95 | # Pre-haematopoietic stem cell: ['2410004N09Rik', '9030617O03Rik', '9030619P08Rik', 'Ablim1', 'Acot11', 'Akr1c14', 'Angpt1', 'Ank', 'Anpep', 
'Art4'] 96 | # Primitive erythroid cell: ['Gata1', 'Hbb-bh1', 'Klf1'] 97 | # Primitive streak cell: ['Nanog', 'Pou5f1'] 98 | # Venous cell: ['Apj', 'Coup-tf2', 'Dab2', 'EphB4', 'Nrp2', 'Tie-2'] 99 | 100 | # # Human Tonsil 101 | # Epithelial: ["EPCAM"] 102 | # Endothelial: ["PECAM1", "CD34", "KDR", "CDH5", "PROM1", "PDPN", "TEK", "FLT1", "VCAM1", "PTPRC", "VWF", "ENG", "MCAM", "ICAM1", "FLT4"] 103 | # Fibroblast: ["COL1A1", "COL3A1", "COL5A2", "PDGFRA", "ACTA2", "TCF21", "FN"] 104 | # B_cells: ["CD74", "HMGA1", "CD52", "PTPRC", "HLA-DRA", "CD24", "CXCR4", "SPCS3", "LTB", "IGKC"] 105 | # T_cells: ["JUNB", "S100A4", "CD52", "PFN1P1", "CD81", "EEF1B2P3", "CXCR4", "CREM", "IL32", "TGIF1"] 106 | # NK_cells: ["S100A4", "IL32", "CXCR4", "FHL2", "IL2RG", "CD69", "CD7", "NKG7", "CD2", "HOPX"] -------------------------------------------------------------------------------- /figs/pipelineflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/figs/pipelineflow.png -------------------------------------------------------------------------------- /figs/tissuumaps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/figs/tissuumaps.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "enact-SO" 7 | version = "0.2.3" 8 | description = "ENACT is a self-contained pipeline designed to streamline Visium HD analysis from cell segmentation to annotation, enabling integration with advanced spatial analysis tools." 
9 | license ={ file = "LICENSE.md" } 10 | readme = "README.md" 11 | requires-python = ">=3.9" 12 | keywords = ["spatial", "omics", "bioinformatics", "transcriptomics", "VisiumHD", ] 13 | authors = [ 14 | { name = "Mena Kamel", email = "mena.kamel@sanofi.com" }, 15 | { name = "Yiwen Song", email = "yiwen.song@sanofi.com" }, 16 | ] 17 | classifiers = [ 18 | 19 | "Programming Language :: Python", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | ] 24 | 25 | # Core dependencies required for running the ENACT pipeline 26 | dependencies = [ 27 | "anndata==0.10.8", 28 | "fastparquet==2024.5.0", 29 | "shapely==2.0.5", 30 | "stardist==0.9.1", 31 | "tifffile==2024.7.24", 32 | "scvi-tools==1.1.6.post2", 33 | "scanpy==1.10.2", 34 | "geopandas==1.0.1", 35 | "tensorflow==2.17.0", 36 | "plotly==5.24.0", 37 | "imagecodecs==2024.9.22", 38 | "pyyaml==6.0", 39 | "pandas", 40 | "numpy", 41 | "tqdm", 42 | "Pillow", 43 | "scipy", 44 | "celltypist-SO==1.6.5", 45 | "python-multipart==0.0.19" 46 | ] 47 | 48 | # Documentation and other URLs related to the project 49 | [project.urls] 50 | Documentation = "https://github.com/Sanofi-Public/enact-pipeline#readme" 51 | Source = "https://github.com/Sanofi-Public/enact-pipeline" 52 | 53 | # Scripts and linting tools 54 | [tool.hatch.scripts] 55 | check = "mypy --install-types --non-interactive {args:src/enact tests}" 56 | 57 | [tool.hatch.build.targets.wheel] 58 | packages = ["src/enact"] 59 | 60 | [tool.setuptools.packages.find] 61 | where = ["src"] 62 | include = ["enact*"] 63 | 64 | [tool.coverage.report] 65 | exclude_lines = [ 66 | "no cov", 67 | "if TYPE_CHECKING:", 68 | ] 69 | 70 | [tool.hatch.publish.test] 71 | disable = true 72 | 73 | # Include important files like README and LICENSE 74 | [tool.setuptools] 75 | include-package-data = true 76 | -------------------------------------------------------------------------------- 
/reproduce_paper_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(conda shell.bash hook)" 3 | 4 | set -e 5 | 6 | PY_ENV_PATH=$1 7 | 8 | conda activate $PY_ENV_PATH 9 | 10 | FILE_URL="https://zenodo.org/records/14748859/files/ENACT_supporting_files.zip" 11 | OUTPUT_FILE="ENACT_supporting_files.zip" 12 | 13 | # Download ENACT supporting files if they are not present 14 | if [ -f "$OUTPUT_FILE" ]; then 15 | echo "$OUTPUT_FILE already exists. Skipping download." 16 | else 17 | echo "$OUTPUT_FILE is downloading." 18 | wget -O $OUTPUT_FILE $FILE_URL 19 | unzip $OUTPUT_FILE 20 | fi 21 | 22 | 23 | # Need to add step to download files from Zenodo to ENACT_supporting_files (in repo home directory) 24 | # Run ENACT pipeline to test all combinations of Bin-to-cell assignment and cell annotation methods - Order of experiments matters, don't change! 25 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/naive-celltypist.yaml 26 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/naive-cellassign.yaml 27 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_area-celltypist.yaml 28 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_area-cellassign.yaml 29 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_transcript-celltypist.yaml 30 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_transcript-cellassign.yaml 31 | python -m src.enact.pipeline --configs_path ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_cluster-celltypist.yaml 32 | python -m src.enact.pipeline --configs_path 
ENACT_supporting_files/public_data/human_colorectal/config_files/weighted_by_cluster-cellassign.yaml 33 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anndata==0.10.8 2 | fastparquet==2024.5.0 3 | shapely==2.0.5 4 | stardist==0.9.1 5 | tifffile==2024.7.24 6 | scvi-tools==1.1.6.post2 7 | celltypist-SO==1.6.5 8 | scanpy==1.10.2 9 | geopandas==1.0.1 10 | tensorflow==2.17.0 11 | plotly==5.24.0 12 | imagecodecs==2024.9.22 13 | 14 | pytest==7.3.2 15 | pytest-cov==4.1.0 16 | python-multipart==0.0.19 17 | -------------------------------------------------------------------------------- /run_cell_ann_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(conda shell.bash hook)" 3 | 4 | set -e 5 | 6 | PY_ENV_PATH=$1 7 | 8 | # Run ENACT pipeline 9 | conda activate $PY_ENV_PATH 10 | python -m src.eval.cell_annotation_eval 11 | 12 | -------------------------------------------------------------------------------- /run_enact.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(conda shell.bash hook)" 3 | 4 | set -e 5 | 6 | PY_ENV_PATH=$1 7 | CONFIG_PATH=$2 8 | 9 | # Run ENACT pipeline 10 | conda activate $PY_ENV_PATH 11 | python -m src.enact.pipeline --configs_path "$CONFIG_PATH" -------------------------------------------------------------------------------- /setup_py_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(conda shell.bash hook)" 3 | 4 | set -e 5 | 6 | PY_ENV_PATH=$1 7 | 8 | # Create Python environment 9 | if ! conda info --envs | grep -q "$PY_ENV_PATH"; then 10 | echo "Environment $PY_ENV_PATH does not exist. Creating..." 
11 | conda create --prefix $PY_ENV_PATH python=3.10 12 | conda activate $PY_ENV_PATH 13 | pip install -r requirements.txt 14 | fi 15 | -------------------------------------------------------------------------------- /src/enact/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/src/enact/__init__.py -------------------------------------------------------------------------------- /src/enact/assignment_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sanofi-Public/enact-pipeline/3c4e6094b4df35c1e2c61a65f8ceace3e3ac281b/src/enact/assignment_methods/__init__.py -------------------------------------------------------------------------------- /src/enact/assignment_methods/naive.py: -------------------------------------------------------------------------------- 1 | # Naive method. Only using the bins unique to each cell (overlapping bins omitted) 2 | 3 | 4 | def naive_assignment(result_spatial_join): 5 | # Naive method. 
Only using the bins unique to each cell (overlapping bins omitted) 6 | result_spatial_join = result_spatial_join[result_spatial_join["unique_bin"]] 7 | result_spatial_join["weight"] = 1 8 | return result_spatial_join 9 | -------------------------------------------------------------------------------- /src/enact/assignment_methods/weight_by_area.py: -------------------------------------------------------------------------------- 1 | # Weighted by area method 2 | import anndata 3 | import numpy as np 4 | from scipy import sparse 5 | 6 | 7 | def apply_weights_to_adata_counts(adata): 8 | """Applies the weights to the counts matrix 9 | 10 | Args: 11 | adata (AnnData): Counts AnnData 12 | 13 | Returns: 14 | AnnData: Weighted-adjusted AnnData 15 | """ 16 | weight = adata.obs["weight"] 17 | # Reshape weights to (130000, 1) for broadcasting 18 | weight = np.array(weight) 19 | weight = weight[:, np.newaxis] 20 | 21 | # OPTIMIZATION 22 | # Perform element-wise multiplication 23 | weighted_counts = adata.X.multiply(weight) 24 | 25 | # convert back to sparse 26 | adata.X = sparse.csr_matrix(weighted_counts) 27 | return adata 28 | 29 | 30 | def weight_by_area_assignment(result_spatial_join, expanded_adata, cell_gdf_chunk): 31 | # Calculate overlapping area between cell and bin 32 | result_spatial_join["area"] = result_spatial_join.apply( 33 | lambda row: row["geometry"] 34 | .intersection(cell_gdf_chunk.loc[row["index_right"], "geometry"]) 35 | .area, 36 | axis=1, 37 | ) 38 | bin_area = result_spatial_join.iloc[0]["geometry"].area 39 | result_spatial_join["weight"] = result_spatial_join["area"] / bin_area 40 | result_spatial_join.loc[ 41 | result_spatial_join["unique_bin"], 42 | "weight", 43 | ] = 1 44 | expanded_adata.obs["weight"] = result_spatial_join["weight"].tolist() 45 | expanded_adata = apply_weights_to_adata_counts(expanded_adata) 46 | return result_spatial_join, expanded_adata 47 | -------------------------------------------------------------------------------- 
# Weighted-by-gene method: splits the counts of bins shared by several cells in
# proportion to each cell's unique gene signature (or its cluster's signature).
import numpy as np
import pandas as pd
from scipy import sparse


def apply_weights_to_adata_counts(expanded_adata, weights_df):
    """Multiplies the counts matrix by per-(bin, cell) gene weights.

    Args:
        expanded_adata (AnnData): counts AnnData, one row per (bin, cell) pair;
            obs must carry "index" (bin id) and "id" (cell id) columns.
        weights_df (pd.DataFrame): gene weights indexed by bin id with an "id"
            (cell id) column; empty when there were no overlapping bins.

    Returns:
        AnnData: AnnData with weight-adjusted sparse counts.
    """
    if weights_df.empty:
        # No overlapping bins -> nothing to re-weight
        return expanded_adata
    # Local import keeps the module importable when tqdm is absent
    from tqdm import tqdm

    # Rows whose bin AND cell both appear in the weights table
    mask = (expanded_adata.obs_names.isin(weights_df.index)) & (
        expanded_adata.obs["id"].isin(weights_df["id"])
    )
    indices = np.where(mask)[0]
    # Dense weight matrix defaulting to 1 (rows without overlap keep their counts)
    weights_matrix = np.ones(expanded_adata.shape)
    for idx in tqdm(indices, total=len(indices)):
        bin_id = expanded_adata.obs.iloc[idx]["index"]
        cell_id = expanded_adata.obs.iloc[idx]["id"]
        bin_rows = weights_df.loc[bin_id]
        weights = bin_rows[bin_rows["id"] == cell_id][expanded_adata.var_names]
        weights_matrix[idx] = weights.iloc[0].tolist()
    weighted_counts = expanded_adata.X.multiply(weights_matrix)
    # Convert back to sparse for memory-efficient storage
    expanded_adata.X = sparse.csr_matrix(weighted_counts)
    return expanded_adata


def weight_by_gene_assignment(
    result_spatial_join, expanded_adata, unique_cell_by_gene_adata
):
    """Splits shared-bin counts between cells proportionally to each cell's
    unique gene signature (built from its non-overlapping bins).

    Args:
        result_spatial_join (pd.DataFrame): bin-to-cell join with "index" (bin id),
            "id" (cell id) and boolean "unique_bin" columns.
        expanded_adata (AnnData): counts AnnData, one row per (bin, cell) pair.
        unique_cell_by_gene_adata (AnnData): counts restricted to bins unique to
            one cell; obs carries the owning cell's "id".

    Returns:
        tuple: (unchanged spatial join, weight-adjusted AnnData)
    """
    from tqdm import tqdm  # local import: optional heavy dependency

    # Per-cell gene signature from the bins unique to each cell
    gene_counts_non_overlap = (
        pd.DataFrame(
            unique_cell_by_gene_adata.X.toarray(),
            index=unique_cell_by_gene_adata.obs_names,
            columns=unique_cell_by_gene_adata.var_names,
        )
        .groupby(unique_cell_by_gene_adata.obs["id"])
        .sum()
        .reset_index()
    )

    # Bins that overlap with multiple cells
    overlapping_bins = result_spatial_join[~result_spatial_join["unique_bin"]]

    # One row per (bin, candidate cell) with the cell's signature attached
    overlap_merge = pd.merge(
        overlapping_bins[["index", "id"]], gene_counts_non_overlap, on="id", how="left"
    )
    overlap_merge.set_index("index", inplace=True)

    grouped_overlap = overlap_merge.groupby("index")
    pbar = tqdm(grouped_overlap, desc="Processing overlapping bins", unit="bin")
    gene_columns = overlap_merge.columns.drop(["id"]).tolist()
    weights_list = []
    for bin_index, group_rows in pbar:
        # Total signature of all cells sharing this bin
        gene_total = group_rows[gene_columns].sum(axis=0)
        # Cell weight per gene = cell signature / total (0 where the total is 0)
        gene_weights = group_rows[gene_columns].div(gene_total, axis=1).fillna(0)
        gene_weights["id"] = group_rows["id"]
        weights_list.append(gene_weights)
    if weights_list:
        weights_df = pd.concat(weights_list, axis=0)
    else:
        weights_df = pd.DataFrame()
    pbar.close()
    expanded_adata = apply_weights_to_adata_counts(expanded_adata, weights_df)
    return result_spatial_join, expanded_adata


def weight_by_cluster_assignment(
    result_spatial_join, expanded_adata, unique_cell_by_gene_adata, n_clusters=4, n_pcs=250
):
    """Like weight_by_gene_assignment, but weights come from the mean signature
    of each cell's KMeans cluster (PCA-reduced), which is more robust for cells
    with sparse unique signatures.

    Args:
        result_spatial_join (pd.DataFrame): bin-to-cell join (see weight_by_gene_assignment).
        expanded_adata (AnnData): counts AnnData, one row per (bin, cell) pair.
        unique_cell_by_gene_adata (AnnData): counts from bins unique to one cell.
        n_clusters (int): number of KMeans clusters (capped by n_pcs).
        n_pcs (int): number of principal components (capped by the data shape).

    Returns:
        tuple: (unchanged spatial join, weight-adjusted AnnData)
    """
    # Local imports: optional heavy dependencies
    from tqdm import tqdm
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # Per-cell gene signature from the bins unique to each cell
    gene_counts_non_overlap = (
        pd.DataFrame(
            unique_cell_by_gene_adata.X.toarray(),
            index=unique_cell_by_gene_adata.obs_names,
            columns=unique_cell_by_gene_adata.var_names,
        )
        .groupby(unique_cell_by_gene_adata.obs["id"])
        .sum()
        .reset_index()
    )

    # Bins that overlap with multiple cells
    overlapping_bins = result_spatial_join[~result_spatial_join["unique_bin"]]

    gene_columns = gene_counts_non_overlap.columns.drop(["id"]).tolist()

    # Standardize, then PCA for dimensionality reduction
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(gene_counts_non_overlap[gene_columns])
    n_pcs = np.min([data_scaled.shape[0], data_scaled.shape[1], n_pcs])
    pca = PCA(n_components=n_pcs)
    data_pca = pca.fit_transform(data_scaled)

    # Cluster the per-cell signatures; each cluster's mean signature is used in
    # place of the (noisier) individual cell signature
    n_clusters = np.min([n_clusters, n_pcs])
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(data_pca)
    gene_counts_non_overlap["cluster"] = clusters
    cluster_means = gene_counts_non_overlap.groupby("cluster")[gene_columns].mean()

    # Bins with the cells that share them ("index" = bin id, "id" = cell id)
    overlap_merge = pd.merge(
        overlapping_bins[["index", "id"]], gene_counts_non_overlap, on="id", how="left"
    )
    # Replace each cell's signature with its cluster's mean signature
    overlap_merge = pd.merge(
        overlap_merge[["index", "id", "cluster"]],
        cluster_means,
        left_on="cluster",
        right_index=True,
        how="left",
    )
    overlap_merge.set_index("index", inplace=True)

    grouped_overlap = overlap_merge.groupby("index")
    pbar = tqdm(grouped_overlap, desc="Processing overlapping bins", unit="bin")
    weights_list = []
    for bin_index, group_rows in pbar:
        # Total cluster signature of all cells sharing this bin
        gene_total = group_rows[gene_columns].sum(axis=0)
        gene_weights = group_rows[gene_columns].div(gene_total, axis=1)
        # Genes with zero total are split evenly between the sharing cells
        num_cells = len(group_rows)
        gene_weights = gene_weights.fillna(1 / num_cells)
        gene_weights = gene_weights.copy()
        gene_weights["id"] = group_rows["id"]
        weights_list.append(gene_weights)
    if weights_list:
        weights_df = pd.concat(weights_list, axis=0)
    else:
        weights_df = pd.DataFrame()
    pbar.close()
    expanded_adata = apply_weights_to_adata_counts(expanded_adata, weights_df)
    return result_spatial_join, expanded_adata
cell_type, gene_markers in markers_dict.items(): 34 | markers_df.loc[gene_markers, cell_type] = 1 35 | self.markers_df = markers_df 36 | 37 | def run_cell_assign(self): 38 | """Runs CellAssign""" 39 | bin_assign_results = self.merge_files_sparse(self.bin_assign_dir) 40 | cell_lookup_df = self.merge_files(self.cell_ix_lookup_dir, save=False) 41 | 42 | spatial_cols = ["cell_x", "cell_y"] 43 | stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"] 44 | cell_lookup_df.loc[:, "id"] = cell_lookup_df["id"].astype(str) 45 | cell_lookup_df = cell_lookup_df.set_index("id") 46 | cell_lookup_df["num_transcripts"] = cell_lookup_df["num_transcripts"].fillna(0) 47 | 48 | bin_assign_result_sparse, gene_columns = bin_assign_results 49 | adata = anndata.AnnData(X=bin_assign_result_sparse, obs=cell_lookup_df.copy()) 50 | adata.var_names = gene_columns 51 | 52 | adata.obsm["spatial"] = cell_lookup_df[spatial_cols].astype(int) 53 | adata.obsm["stats"] = cell_lookup_df[stat_columns].astype(int) 54 | 55 | lib_size = adata.X.sum(1) 56 | adata.obs["size_factor"] = lib_size / np.mean(lib_size) 57 | adata.obs["lib_size"] = lib_size 58 | 59 | marker_gene_mat = self.markers_df.copy() 60 | marker_gene_mat = marker_gene_mat.loc[ 61 | sorted(list(set(self.markers_df.index) & set(gene_columns))) 62 | ] 63 | bdata = adata[:, marker_gene_mat.index].copy() 64 | 65 | torch.manual_seed(seed) 66 | scvi.external.CellAssign.setup_anndata(bdata, size_factor_key="size_factor") 67 | model = CellAssign(bdata, marker_gene_mat, random_b_g_0=False) 68 | model.train() 69 | predictions = model.predict() 70 | 71 | bdata.obs["cell_type"] = predictions.idxmax(axis=1).values 72 | bdata.obs[adata.obsm["spatial"].columns] = adata.obsm["spatial"] 73 | bdata.obs[adata.obsm["stats"].columns] = adata.obsm["stats"] 74 | bdata.obs["chunk_name"] = cell_lookup_df["chunk_name"] 75 | bdata.obs.to_csv( 76 | os.path.join(self.cellannotation_results_dir, "merged_results.csv") 77 | ) 78 | print( 79 | f"saved to : 
{os.path.join(self.cellannotation_results_dir, 'merged_results.csv')}" 80 | ) 81 | 82 | 83 | if __name__ == "__main__": 84 | # Creating CellAssignPipeline object 85 | cell_assign = CellAssignPipeline(configs_path="config/configs.yaml") 86 | cell_assign.format_markers_to_df() 87 | -------------------------------------------------------------------------------- /src/enact/celltypist.py: -------------------------------------------------------------------------------- 1 | """Class for defining methods to package pipeline outputs into AnnData objects 2 | """ 3 | 4 | import os 5 | import pandas as pd 6 | import anndata 7 | import scanpy as sc 8 | import seaborn as sns 9 | import numpy as np 10 | 11 | ## Attempt to import celltypist, and prompt installation if not found 12 | import celltypist 13 | from celltypist import models 14 | 15 | from .pipeline import ENACT 16 | 17 | 18 | class CellTypistPipeline(ENACT): 19 | """Class for running CellAssign algorithm""" 20 | 21 | def __init__(self, **kwargs): 22 | super().__init__(**kwargs) 23 | 24 | def run_cell_typist(self): 25 | """Runs CellTypist""" 26 | bin_assign_results = self.merge_files_sparse(self.bin_assign_dir) 27 | cell_lookup_df = self.merge_files(self.cell_ix_lookup_dir, save=False) 28 | 29 | spatial_cols = ["cell_x", "cell_y"] 30 | stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"] 31 | cell_lookup_df.loc[:, "id"] = cell_lookup_df["id"].astype(str) 32 | cell_lookup_df = cell_lookup_df.set_index("id") 33 | cell_lookup_df["num_transcripts"] = cell_lookup_df["num_transcripts"].fillna(0) 34 | 35 | bin_assign_result_sparse, gene_columns = bin_assign_results 36 | adata = anndata.AnnData(X=bin_assign_result_sparse, obs=cell_lookup_df.copy()) 37 | adata.var_names = gene_columns 38 | 39 | adata.obsm["spatial"] = cell_lookup_df[spatial_cols].astype(int) 40 | adata.obsm["stats"] = cell_lookup_df[stat_columns].astype(int) 41 | 42 | lib_size = adata.X.sum(1) 43 | adata.obs["size_factor"] = lib_size / 
np.mean(lib_size) 44 | adata.obs["lib_size"] = lib_size 45 | 46 | # normalize adata to the log1p normalised format (to 10,000 counts per cell) 47 | sc.pp.normalize_total(adata, target_sum=1e4) 48 | sc.pp.log1p(adata) 49 | 50 | # download celltypist model and predict cell type 51 | if ".pkl" not in self.cell_typist_model: 52 | self.cell_typist_model = self.cell_typist_model + ".pkl" 53 | models.download_models(model=self.cell_typist_model) 54 | predictions = celltypist.annotate(adata, model=self.cell_typist_model) 55 | adata = predictions.to_adata( 56 | insert_labels=True, insert_conf=True, insert_prob=True 57 | ) 58 | 59 | adata.obs.rename(columns={"predicted_labels": "cell_type"}, inplace=True) 60 | adata.obs[adata.obsm["spatial"].columns] = adata.obsm["spatial"] 61 | adata.obs[adata.obsm["stats"].columns] = adata.obsm["stats"] 62 | adata.obs["chunk_name"] = cell_lookup_df["chunk_name"] 63 | results_df = adata.obs.drop(columns=adata.obs["cell_type"].unique().tolist()) 64 | results_df.to_csv( 65 | os.path.join(self.cellannotation_results_dir, "merged_results.csv") 66 | ) 67 | 68 | 69 | if __name__ == "__main__": 70 | # Creating CellAssignPipeline object 71 | cell_typist = CellTypistPipeline(configs_path="config/configs.yaml") 72 | cell_typist.run_cell_typist() 73 | -------------------------------------------------------------------------------- /src/enact/package_results.py: -------------------------------------------------------------------------------- 1 | """Class for defining methods to package pipeline outputs into AnnData objects 2 | """ 3 | 4 | import os 5 | import yaml 6 | import json 7 | import shutil 8 | import anndata 9 | import pandas as pd 10 | from PIL import Image 11 | import numpy as np 12 | from scipy.sparse import csr_matrix 13 | 14 | # import squidpy as sq 15 | 16 | from .pipeline import ENACT 17 | 18 | 19 | class PackageResults(ENACT): 20 | """Class for packaging ENACT pipeline outputs""" 21 | 22 | def __init__(self, **kwargs): 23 | 
super().__init__(**kwargs) 24 | self.files_to_ignore = [ 25 | "merged_results.csv", 26 | "merged_results_old.csv", 27 | "cells_adata.h5", 28 | ".ipynb_checkpoints", 29 | ] 30 | 31 | def merge_cellassign_output_files(self): 32 | """Merges the CellAssign results with gene counts 33 | 34 | Returns: 35 | _type_: _description_ 36 | """ 37 | if self.configs["params"]["chunks_to_run"]: 38 | chunk_list = self.configs["params"]["chunks_to_run"] 39 | else: 40 | chunk_list = os.listdir(self.bin_assign_dir) 41 | cell_by_gene_list = [] 42 | for chunk_name in chunk_list: 43 | if chunk_name in self.files_to_ignore: 44 | continue 45 | index_lookup = pd.read_csv( 46 | os.path.join(self.cell_ix_lookup_dir, chunk_name) 47 | ) 48 | trancript_counts = pd.read_csv( 49 | os.path.join(self.bin_assign_dir, chunk_name) 50 | ).drop(columns=["Unnamed: 0"]) 51 | cell_by_gene_chunk = pd.concat( 52 | [index_lookup["id"], trancript_counts], axis=1 53 | ) 54 | cell_by_gene_list.append(cell_by_gene_chunk) 55 | cell_by_gene_df = pd.concat(cell_by_gene_list, axis=0) 56 | return cell_by_gene_df 57 | 58 | def merge_sargent_output_files(self): 59 | """Merges the Sargent chunk results into a single results file 60 | 61 | Returns: 62 | _type_: _description_ 63 | """ 64 | os.makedirs(self.sargent_results_dir, exist_ok=True) 65 | # Merge the sargent_results_chunks data and gene_to_cell_assignment_chunks_ix_lookup 66 | chunks = os.listdir(self.sargent_results_dir) 67 | sargent_results_list = [] 68 | cell_by_gene_list = [] 69 | for chunk_name in chunks: 70 | if chunk_name in self.files_to_ignore: 71 | continue 72 | cell_labels = pd.read_csv( 73 | os.path.join(self.sargent_results_dir, chunk_name) 74 | ) 75 | index_lookup = pd.read_csv( 76 | os.path.join(self.cell_ix_lookup_dir, chunk_name) 77 | ) 78 | trancript_counts = pd.read_csv( 79 | os.path.join(self.bin_assign_dir, chunk_name) 80 | ).drop(columns=["Unnamed: 0"]) 81 | 82 | sargent_result_chunk = pd.concat([index_lookup, cell_labels["x"]], axis=1) 83 | 
cell_by_gene_chunk = pd.concat( 84 | [index_lookup["id"], trancript_counts], axis=1 85 | ) 86 | sargent_result_chunk.drop("Unnamed: 0", axis=1, inplace=True) 87 | sargent_results_list.append(sargent_result_chunk) 88 | cell_by_gene_list.append(cell_by_gene_chunk) 89 | sargent_results_df = pd.concat(sargent_results_list, axis=0) 90 | sargent_results_df = sargent_results_df.rename(columns={"x": "cell_type"}) 91 | cell_by_gene_df = pd.concat(cell_by_gene_list, axis=0) 92 | sargent_results_df.to_csv( 93 | os.path.join(self.sargent_results_dir, "merged_results.csv"), index=False 94 | ) 95 | return sargent_results_df, cell_by_gene_df 96 | 97 | def df_to_adata(self, results_df, cell_by_gene_df): 98 | """Converts pd.DataFrame object with pipeline results to AnnData 99 | 100 | Args: 101 | results_df (_type_): _description_ 102 | 103 | Returns: 104 | anndata.AnnData: Anndata with pipeline outputs 105 | """ 106 | file_columns = results_df.columns 107 | spatial_cols = ["cell_x", "cell_y"] 108 | stat_columns = ["num_shared_bins", "num_unique_bins", "num_transcripts"] 109 | results_df.loc[:, "id"] = results_df["id"].astype(str) 110 | results_df = results_df.set_index("id") 111 | results_df["num_transcripts"] = results_df["num_transcripts"].fillna(0) 112 | results_df["cell_type"] = results_df["cell_type"].str.lower() 113 | adata = anndata.AnnData(cell_by_gene_df.set_index("id")) 114 | adata.obs = adata.obs.merge(results_df, on="id").drop_duplicates(keep='first') 115 | 116 | adata.obsm["spatial"] = adata.obs[spatial_cols].astype(int) 117 | adata.obsm["stats"] = adata.obs[stat_columns].astype(int) 118 | 119 | # This column is the output of cell type inference pipeline 120 | adata.obs["cell_type"] = adata.obs[["cell_type"]].astype("category") 121 | adata.obs["patch_id"] = adata.obs[["chunk_name"]] 122 | adata.obs = adata.obs[["cell_type", "patch_id"]] 123 | 124 | # Converting the Anndata cell transcript counts to sparse format for more efficient storage 125 | adata.X = 
csr_matrix(adata.X).astype(np.float32) 126 | return adata 127 | 128 | def create_tmap_file(self): 129 | """Creates a tmap file for the sample being run on ENACT 130 | """ 131 | # The following three files need to be in the same directory: 132 | # cells_adata.h5, wsi file, experiment_tmap.tmap 133 | tmap_template_path = "./templates/tmap_template.tmap" 134 | with open(tmap_template_path, "r") as stream: 135 | tmap_template = yaml.safe_load(stream) 136 | tmap_template["filename"] = self.configs["analysis_name"] 137 | bin_to_cell_method = self.configs["params"]["bin_to_cell_method"] 138 | cell_annotation_method = self.configs["params"]["cell_annotation_method"] 139 | wsi_src_path = self.configs["paths"]["wsi_path"] 140 | wsi_fname = "wsi.tif" 141 | run_name = f"{bin_to_cell_method}|{cell_annotation_method}" 142 | tmap_template["markerFiles"][0]["title"] = f"ENACT Results: {run_name.replace('|', ' | ')}" 143 | tmap_template["markerFiles"][0]["expectedHeader"].update( 144 | { 145 | "X": "/obsm/spatial/cell_x", 146 | "Y": "/obsm/spatial/cell_y", 147 | "gb_col": "/obs/cell_type/", 148 | } 149 | ) 150 | tmap_template["layers"][0].update( 151 | {"name": wsi_fname, "tileSource": f"{wsi_fname}.dzi"} 152 | ) 153 | tmap_template["markerFiles"][0]["path"] = f"{run_name}_cells_adata.h5" 154 | 155 | # save tmap file at a separate directory "tmap" 156 | tmap_output_dir = os.path.join(self.cache_dir, "tmap") 157 | os.makedirs(tmap_output_dir, exist_ok=True) 158 | tmap_file_path = os.path.join(tmap_output_dir, f"{run_name}_tmap.tmap") 159 | with open(tmap_file_path, "w") as outfile: 160 | outfile.write(json.dumps(tmap_template, indent=4)) 161 | 162 | # Copy the anndata file to the "tmap" directory 163 | adata_src_path = os.path.join( 164 | self.cellannotation_results_dir, "cells_adata.h5" 165 | ) 166 | adata_dst_path = os.path.join(tmap_output_dir, f"{run_name}_cells_adata.h5") 167 | shutil.copy(adata_src_path, adata_dst_path) 168 | 169 | # Copy the cells_layer.png file to the "tmap" 
directory 170 | layer_src_path = os.path.join( 171 | self.cache_dir, "cells_layer.png" 172 | ) 173 | layer_dst_path = os.path.join(tmap_output_dir, "cells_layer.png") 174 | if os.path.exists(layer_src_path): 175 | shutil.copy(layer_src_path, layer_dst_path) 176 | 177 | # Saving a cropped version (lite version) of the image file to the "tmap" directory 178 | wsi_dst_path = os.path.join(tmap_output_dir, wsi_fname) 179 | cropped_image, _ = self.load_image() 180 | cropped_image = Image.fromarray(cropped_image) 181 | cropped_image.save(wsi_dst_path) 182 | 183 | message = f""" 184 | Sample ready to visualize on TissUUmaps. To install TissUUmaps, follow the instructions at:\n 185 | https://tissuumaps.github.io/TissUUmaps-docs/docs/intro/installation.html#. 186 | 187 | To view the the sample, follow the instructions at:\n 188 | https://tissuumaps.github.io/TissUUmaps-docs/docs/starting/projects.html#loading-projects 189 | 190 | TissUUmaps project file is located here:\n 191 | {tmap_file_path} 192 | """ 193 | print (message) 194 | 195 | # def run_neighborhood_enrichment(self, adata): 196 | # """Sample function to run Squidpy operations on AnnData object 197 | 198 | # Args: 199 | # adata (_type_): _description_ 200 | 201 | # Returns: 202 | # _type_: _description_ 203 | # """ 204 | # sq.gr.spatial_neighbors(adata) 205 | # sq.gr.nhood_enrichment(adata, cluster_key="cell_type") 206 | # return adata 207 | 208 | def save_adata(self, adata): 209 | """Save the anndata object to disk 210 | 211 | Args: 212 | adata (_type_): _description_ 213 | """ 214 | adata.write( 215 | os.path.join(self.cellannotation_results_dir, "cells_adata.h5"), 216 | compression="gzip", 217 | ) 218 | 219 | 220 | if __name__ == "__main__": 221 | # Creating ENACT object 222 | so_hd = PackageResults(configs_path="config/configs.yaml") 223 | results_df, cell_by_gene_df = so_hd.merge_sargent_output_files() 224 | adata = so_hd.df_to_adata(results_df, cell_by_gene_df) 225 | # adata = 
# -------------------- src/enact/utils/logging.py --------------------
"""
Logging helper for ENACT: one logger per app, writing to <cache_dir>/<app>.log.
"""
import os
import logging


def get_logger(app_name, cache_dir):
    """Create (or fetch) a configured application logger.

    Args:
        app_name (str): logger name; also used as the log file stem.
        cache_dir (str): directory where the log file is created (must exist).

    Returns:
        Logger: logger with one INFO file handler and one DEBUG stream handler.
    """
    log_file = os.path.join(cache_dir, f"{app_name}.log")

    logger = logging.getLogger(app_name)
    # BUG FIX: hasHandlers() also reports handlers attached to ANCESTOR loggers
    # (e.g. the root logger after logging.basicConfig()), which made this
    # function skip its own handler setup and never create the log file.
    # Check this logger's OWN handlers instead.
    if logger.handlers:
        return logger

    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # File handler: persists INFO and above to the cache directory
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)

    # Stream handler: echoes everything (DEBUG and above) to the console
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)

    # Add handlers to the logger
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

    return logger
# Script runs the evaluation to compare ENACT cell annotations versus
# pathologist cell annotations

from shapely.geometry import shape
import plotly.express as px
import geopandas as gpd
import json
from shapely.geometry import Polygon, Point
from shapely import wkt
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os

# Input artifacts. Swap predictions_df_path to evaluate a different
# bin-to-cell / annotation-method combination; result files follow the pattern
#   .../chunks/<bin_to_cell_method>/<annotation_method>_results/merged_results.csv
geojson_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Human_Colon_Cancer-wsi-40598_0_65263_22706.geojson"
segmentation_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/cells_df.csv"
predictions_df_path = "/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/colon/chunks/weighted_by_cluster/cellassign_results/merged_results.csv"

results_eval_dir = os.path.join(os.path.dirname(predictions_df_path), "eval")
os.makedirs(results_eval_dir, exist_ok=True)


# Maps granular cell-type labels (both predicted and pathologist-assigned) to
# the high-level classes used for scoring. Duplicate keys present in the
# original mapping ('Goblet cells', 'NK cells') were removed — a dict literal
# silently keeps only the last occurrence.
name_map = {
    'unclassified': "no label",
    'Immune': "immune cells",
    'Crypt cells': "epithelial cells",
    'Enterocytes': "epithelial cells",
    'Epithelial': "epithelial cells",
    'Smooth muscle cell': "stromal cells",
    'Fibroblast': "stromal cells",
    'Endothelial': "stromal cells",
    'Paneth cells': "epithelial cells",
    'Enteroendocrine cells': "epithelial cells",
    'Goblet cells': "epithelial cells",
    'Neuronal': "stromal cells",
    'ephitelial cells': "epithelial cells",
    'no label': "no label",
    "Ignore*": "no label",
    "B cells": "immune cells",
    "T cells": "immune cells",
    "NK cells": "immune cells",
    "Macrophages": "immune cells",
    "Neutrophils": "immune cells",
    "Eosinophils": "immune cells",
    'CD19+CD20+ B': "immune cells",             # B cells are immune cells
    'CD4+ T cells': "immune cells",             # CD4+ T cells are immune cells
    'CD8+ T cells': "immune cells",             # CD8+ T cells are immune cells
    'CMS1': "epithelial cells",                 # CMS (Consensus Molecular Subtypes) refer to tumor/epithelial cells
    'CMS2': "epithelial cells",                 # Same as above
    'CMS3': "epithelial cells",                 # Same as above
    'CMS4': "epithelial cells",                 # Same as above
    'Enteric glial cells': "stromal cells",     # Glial cells are part of the stromal tissue
    'IgA+ Plasma': "immune cells",              # Plasma cells are immune cells (B-cell derivatives)
    'IgG+ Plasma': "immune cells",              # Same as above
    'Intermediate': "no label",                 # Ambiguous, no clear label
    'Lymphatic ECs': "stromal cells",           # Endothelial cells are considered stromal
    'Mast cells': "immune cells",               # Mast cells are immune cells
    'Mature Enterocytes type 1': "epithelial cells",  # Enterocytes are epithelial cells
    'Mature Enterocytes type 2': "epithelial cells",  # Same as above
    'Myofibroblasts': "stromal cells",          # Fibroblasts are stromal cells
    'Pericytes': "stromal cells",               # Pericytes are part of the vasculature (stromal)
    'Pro-inflammatory': "immune cells",         # Inflammation implies immune function
    'Proliferating': "no label",                # Too vague to classify, no label
    'Proliferative ECs': "stromal cells",       # Endothelial cells are stromal
    'Regulatory T cells': "immune cells",       # T cells are immune cells
    'SPP1+': "no label",                        # Ambiguous, no clear label
    'Smooth muscle cells': "stromal cells",     # Smooth muscle cells are stromal cells
    'Stalk-like ECs': "stromal cells",          # Endothelial cells are stromal
    'Stem-like/TA': "epithelial cells",         # Stem cells in this context are usually epithelial
    'Stromal 1': "stromal cells",               # Explicitly stromal
    'Stromal 2': "stromal cells",               # Same as above
    'Stromal 3': "stromal cells",               # Same as above
    'T follicular helper cells': "immune cells",  # T cells are immune cells
    'T helper 17 cells': "immune cells",        # Same as above
    'Tip-like ECs': "stromal cells",            # Endothelial cells are stromal
    'Unknown': "no label",                      # No clear label
    'cDC': "immune cells",                      # Conventional dendritic cells are immune cells
    'gamma delta T cells': "immune cells",      # T cells are immune cells
}


# Attach cell geometries to the predicted labels
segmentation_df = pd.read_csv(segmentation_df_path)
predictions_df = pd.read_csv(predictions_df_path)
predictions_df = predictions_df.merge(
    segmentation_df[["id", "geometry"]], how="left", on="id"
)
predictions_df["geometry"] = predictions_df["geometry"].apply(wkt.loads)
pred_gpd = gpd.GeoDataFrame(predictions_df, geometry="geometry")


def _pred_col(df):
    """Returns the prediction column name. Older result files store predictions
    under "cell_assign_results"; newer ones use "cell_type". (Replaces the
    bare try/except duck-typing, which also swallowed unrelated errors.)"""
    return "cell_type" if "cell_type" in df.columns else "cell_assign_results"


def load_path_annotations():
    """Loads pathologist annotations from the QuPath geojson export.

    Returns:
        gpd.GeoDataFrame: one row per annotation with "geometry", "gt_label"
        and a synthetic "ann_ix" id.
    """
    annotation_names = []
    annotation_geometries = []
    with open(geojson_path) as f:
        regions = json.load(f)
    for region in regions["features"]:
        ann_type = region["properties"]["objectType"]
        if ann_type == "annotation":
            annotation_name = region["properties"]["classification"]["name"]
            # "Region*" marks areas, not cell-level ground truth
            if annotation_name in ["Region*"]:
                continue
            annotation_geometries.append(shape(region["geometry"]))
            annotation_names.append(annotation_name)
    annotations_gpd = gpd.GeoDataFrame(
        {"geometry": annotation_geometries, "gt_label": annotation_names}
    )
    annotations_gpd["ann_ix"] = [f"ID_{i}" for i in range(len(annotations_gpd))]
    return annotations_gpd


def get_gt_annotations(annotations_gpd):
    """Joins each ground-truth annotation with the first intersecting predicted
    cell; annotations with no intersecting cell get the label "unclassified"."""
    pred_col = _pred_col(pred_gpd)
    cells_within_ann_gpd = gpd.sjoin(
        annotations_gpd,
        pred_gpd[[pred_col, "cell_x", "cell_y", "geometry", "id"]],
        how="left",
        predicate="intersects",
    )
    # Keep one predicted cell per annotation
    cells_within_ann_gpd = cells_within_ann_gpd.drop_duplicates("ann_ix")
    cells_within_ann_gpd[pred_col] = cells_within_ann_gpd[pred_col].fillna("unclassified")
    return cells_within_ann_gpd


def validate_labels(cells_within_ann_gpd):
    """Prints whether every predicted label is covered by name_map."""
    pred_col = _pred_col(cells_within_ann_gpd)
    cell_types_in_pred = set(cells_within_ann_gpd[pred_col].unique())
    print(f"Cells in pred dataset: {cell_types_in_pred}")
    print(f"All cells are in the mapping!: {cell_types_in_pred.issubset(set(name_map.keys()))}")


def relabel_cells(cells_within_ann_gpd):
    """Collapses granular labels to the high-level classes via name_map.
    Ground-truth labels are rewritten in place; predictions are written to the
    new "pred_label_clean" column."""
    pred_col = _pred_col(cells_within_ann_gpd)
    for granular_name, generic_name in name_map.items():
        cells_within_ann_gpd.loc[
            cells_within_ann_gpd.gt_label == granular_name, "gt_label"
        ] = generic_name
        cells_within_ann_gpd.loc[
            cells_within_ann_gpd[pred_col] == granular_name, "pred_label_clean"
        ] = generic_name
    return cells_within_ann_gpd


def eval_annotations(results_table):
    """Computes the confusion matrix, accuracy and precision/recall/F-score
    (micro, macro, weighted) and writes the table, metrics and confusion
    matrix plot to results_eval_dir.

    Returns:
        tuple: (results_table, metrics DataFrame)
    """
    cell_types = sorted(
        set(
            results_table.gt_label.unique().tolist()
            + results_table.pred_label_clean.unique().tolist()
        )
    )
    cm = confusion_matrix(
        results_table.gt_label,
        results_table.pred_label_clean,
        labels=cell_types,
    )
    cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cell_types)
    cm_plot.plot()

    eval_dict = {}
    for method in ["micro", "macro", "weighted"]:
        eval_dict[method] = precision_recall_fscore_support(
            results_table.gt_label, results_table.pred_label_clean, average=method
        )
    num_correct_samples = accuracy_score(
        results_table.gt_label, results_table.pred_label_clean, normalize=False
    )
    accuracy = accuracy_score(
        results_table.gt_label, results_table.pred_label_clean, normalize=True
    )
    print(f"Experiment name: {predictions_df_path}")
    print(f"Number of GT annotations: {len(results_table)}\nNumber of correct predictions: {num_correct_samples}\nAccuracy: {accuracy}")
    print("__________")
    print(pd.DataFrame(results_table[_pred_col(results_table)].value_counts()))
    print("__________")
    print(pd.DataFrame(results_table.pred_label_clean.value_counts()))
    print("__________")
    metrics_df = pd.DataFrame(eval_dict, index=["Precision", "Recall", "F-Score", "Support"])
    results_table.to_csv(os.path.join(results_eval_dir, "cell_annotation_eval.csv"), index=False)
    metrics_df.to_csv(os.path.join(results_eval_dir, "cell_annotation_eval_metrics.csv"), index=True)
    cm_plot.figure_.savefig(os.path.join(results_eval_dir, "confusion_matrix.png"), dpi=300)
    print(metrics_df)
    return results_table, metrics_df


if __name__ == "__main__":
    annotations_gpd = load_path_annotations()
    cells_within_ann_gpd = get_gt_annotations(annotations_gpd)
    validate_labels(cells_within_ann_gpd)
    cells_within_ann_gpd = relabel_cells(cells_within_ann_gpd)
    # Annotations with no usable ground-truth class are excluded from scoring
    results_table = cells_within_ann_gpd[(cells_within_ann_gpd["gt_label"] != "no label")]
    results_table, metrics_df = eval_annotations(results_table)
precision_recall_fscore_support, accuracy_score\n", 30 | "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", 31 | "import os" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 15, 37 | "id": "6f641cb0-1d6a-4cfb-bc35-2fa58302b28f", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# geojson_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Human_Colon_Cancer-wsi-40598_0_65263_22706-landmarks.geojson\"\n", 42 | "geojson_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/Visium_HD_Mouse_Small_Intestine-wsi-156_4_23459_24009_all_for_one.geojson\"\n", 43 | "\n", 44 | "segmentation_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/stardist_cells_df.csv\"\n", 45 | "results_eval_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/mouse_anatomical_landmark_eval\"\n", 46 | "os.makedirs(results_eval_dir, exist_ok=True)\n", 47 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+naive.csv\"\n", 48 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+weighted.csv\"\n", 49 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/cellassign+weighted.csv\"\n", 50 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/cellassign+naive.csv\"\n", 51 | "# predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/Sargent+weighted-full.csv\"\n", 52 | "predictions_df_path = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/predictions/sargent+weighted+mouse.csv\"\n", 53 | "\n", 54 | "\n", 55 | "method = predictions_df_path.split(\"/\")[-1].split(\".\")[0]\n", 56 | "\n", 57 | "segmentation_df = pd.read_csv(segmentation_df_path)\n", 58 | "predictions_df = pd.read_csv(predictions_df_path)\n", 59 | "predictions_df = 
predictions_df.merge(segmentation_df[[\"id\", \"geometry\"]], how=\"left\", on=\"id\")\n", 60 | "predictions_df = predictions_df[~predictions_df.geometry.isna()]\n", 61 | "try:\n", 62 | " predictions_df[\"geometry\"] = predictions_df[\"geometry\"].apply(wkt.loads)\n", 63 | "except:\n", 64 | " pass\n", 65 | "pred_gpd = gpd.GeoDataFrame(predictions_df,geometry=\"geometry\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 16, 71 | "id": "9d745a36-0221-450f-9c35-b5c099a8d189", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "annotation_names = []\n", 76 | "annotation_geometries = []\n", 77 | "with open(geojson_path) as f:\n", 78 | " regions = json.load(f)\n", 79 | "for region in regions[\"features\"]:\n", 80 | " ann_type = region[\"properties\"][\"objectType\"]\n", 81 | " if ann_type == \"annotation\":\n", 82 | " annotation_name = region[\"properties\"][\"classification\"][\"name\"]\n", 83 | " if annotation_name in [\"Region*\"]:\n", 84 | " continue\n", 85 | " annotation_geometries.append(shape(region[\"geometry\"]))\n", 86 | " annotation_names.append(annotation_name)\n", 87 | "annotations_gpd = gpd.GeoDataFrame({\"geometry\": annotation_geometries, \"label\": annotation_names})\n", 88 | "annotations_gpd[\"ann_ix\"] = [f\"ID_{i}\" for i in range(len(annotations_gpd))]\n", 89 | "cells_within_ann_gpd = gpd.sjoin(pred_gpd[[\"cell_type\", \"cell_x\", \"cell_y\", \"geometry\", \"id\"]], annotations_gpd, how='left', predicate='within')\n", 90 | "cells_within_ann_gpd = cells_within_ann_gpd.drop_duplicates(subset=[\"id\"])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "099d30dc-a404-4662-8fec-7b0275079e42", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "for annotation_name in annotation_names:\n", 101 | " df = cells_within_ann_gpd[cells_within_ann_gpd.label == annotation_name]\n", 102 | " # df = df[~(df.cell_type == \"unclassified\")]\n", 103 | " df = 
df.groupby([\"cell_type\"]).agg(\"count\").reset_index()\n", 104 | " df = df.sort_values(\"id\", ascending=False)\n", 105 | " fig = px.bar(df, x='cell_type', y='id', title=f\"Region: {annotation_name}\")\n", 106 | " fig.update_layout(\n", 107 | " xaxis_title=\"cell type\", yaxis_title=\"# cells\"\n", 108 | " )\n", 109 | " fig.show()\n", 110 | " fig.write_html(os.path.join(results_eval_dir, f\"{method}_{annotation_name}_cell_counts.html\"))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "f7a3204e-89bd-4e6b-b14a-ea98a8fe9f5d", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "results_eval_dir" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "159a4cc2-32a4-49d9-9a87-f186a0d255de", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "Python 3 (ipykernel)", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.10.14" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 5 153 | } 154 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created By : ... 3 | Created Date: DD/MM/YYYY 4 | Description : ... 5 | """ 6 | 7 | import argparse 8 | from utils.logging import get_logger 9 | 10 | 11 | APP_NAME = 'MyProject' 12 | LOGGER = get_logger(APP_NAME) 13 | 14 | 15 | def dummy(dum): 16 | """Example function 17 | 18 | :param dum: Text to log. 19 | :type number: str 20 | :return: The entry text. 
21 | :rtype: str 22 | """ 23 | LOGGER.info(f'{dum} in progress') 24 | return dum 25 | 26 | -------------------------------------------------------------------------------- /src/synthetic_data/generate_synthetic_data_Xenium.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "018887fd-e9e8-495b-872f-fefbd9cd6cb5", 6 | "metadata": {}, 7 | "source": [ 8 | "To generate synthetic VisiumHD data from Xenium, please read and run all the cells below. Thanks!" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bd0c610b-e1b5-43e6-a35d-3548588cb652", 14 | "metadata": {}, 15 | "source": [ 16 | "## Download Xenium output from 10X website\n", 17 | "Paste the URL for the binned_outputs.tar.gz for the sample you want to analyze.\n", 18 | "\n", 19 | "1. Go to Xenium public datasets page:https://www.10xgenomics.com/datasets?query=&page=1&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000&refinementList%5Bproduct.name%5D%5B0%5D=In%20Situ%20Gene%20Expression&refinementList%5Bspecies%5D%5B0%5D=Human&refinementList%5BdiseaseStates%5D%5B0%5D=colorectal%20cancer\n", 20 | "\n", 21 | "2. 
Select sample to analyze scrolling down to downloads section, click \"Batch download\"\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "5f721b2b-4314-4528-9c01-185726147728", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import zipfile\n", 32 | "xenium_outputs_url = \"https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE_outs.zip\"\n", 33 | "# Step 1: Download the raw Xenium output\n", 34 | "!curl -O {xenium_outputs_url}\n", 35 | "\n", 36 | "# Extract the ZIP file\n", 37 | "zip_file_path = xenium_outputs_url.split(\"/\")[-1]\n", 38 | "\n", 39 | "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n", 40 | " zip_ref.extractall(\"extracted_files\")\n", 41 | "\n", 42 | "print(\"Extraction completed.\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "a9fcd48a-2f55-43b4-befd-8d646ea634cf", 48 | "metadata": {}, 49 | "source": [ 50 | "### Install prerequisite libraries" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "7453e3e3-a55c-47fb-ab83-2c3743833b89", 57 | "metadata": { 58 | "scrolled": true, 59 | "tags": [] 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "!pip install --upgrade pip\n", 64 | "!pip install scipy\n", 65 | "!pip install shapely\n", 66 | "!pip install tifffile\n", 67 | "!pip install plotly\n", 68 | "!pip install tensorflow-gpu==2.10.0\n", 69 | "!pip install stardist\n", 70 | "!pip install geopandas\n", 71 | "!pip install scanpy\n", 72 | "!pip install fastparquet\n", 73 | "!pip install opencv-python\n", 74 | "!pip install geojson\n", 75 | "!pip install scikit-learn" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "1f79fb2c-0fd9-4bd4-8be9-4d1bd04d8733", 81 | "metadata": {}, 82 | "source": [ 83 | "### Import Relevant Libraries" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": 
"16e4dc02-2b8d-4e00-9cbd-8a4d151ca5af", 90 | "metadata": { 91 | "scrolled": true, 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "import geopandas as gpd # Geopandas for storing Shapely objects\n", 97 | "from matplotlib.colors import ListedColormap\n", 98 | "import matplotlib.pyplot as plt\n", 99 | "import scanpy as sc\n", 100 | "import pandas as pd\n", 101 | "from scipy import sparse\n", 102 | "import anndata\n", 103 | "import os\n", 104 | "import gzip\n", 105 | "import numpy as np\n", 106 | "import re\n", 107 | "import shapely\n", 108 | "from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point objects\n", 109 | "from shapely import wkt" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "46a8d90a-65dd-4e93-b4e2-4a257d6e1dc7", 115 | "metadata": {}, 116 | "source": [ 117 | "### Load Cell & Transcripts Info" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "feb54b91-6757-467c-81d3-7a4f6916fcda", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Load the transcript data\n", 128 | "transcripts_path = \"extracted_files/transcripts.csv.gz\"\n", 129 | "with gzip.open(transcripts_path, 'rt') as f:\n", 130 | " transcripts_df = pd.read_csv(f)\n", 131 | "\n", 132 | "# Load cell info\n", 133 | "cells_path = \"extracted_files/cells.csv.gz\"\n", 134 | "with gzip.open(cells_path, 'rt') as f:\n", 135 | " cells_df = pd.read_csv(f)\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "ccac1dea-7855-4af4-8989-c2b63deed2f1", 141 | "metadata": {}, 142 | "source": [ 143 | "### Load Cell Boundary Info" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "25d2bdf0-8871-4bb0-a38e-3f9c31c7b3ea", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import zarr\n", 154 | "\n", 155 | "zarr_file = zarr.open('extracted_files/cells.zarr.zip', mode='r')\n", 156 | 
"print(zarr_file.tree())" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "c092c013-dd0d-47f5-a6cc-3491f1f62dfe", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "file = zarr_file['polygon_sets/0/vertices'][:]\n", 167 | "# 1 is whole cell, 0 is nucleus" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "0da5ff74-6269-42b4-9a9e-604f520a7528", 173 | "metadata": { 174 | "tags": [] 175 | }, 176 | "source": [ 177 | "### Create folders to store synthetic data" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "a6839176-4a75-4f1e-b4f7-13899b946963", 183 | "metadata": {}, 184 | "source": [ 185 | "For both the `seqfish_dir` and `enact_data_dir`, change `\"/home/oneai/\"` to the directory that stores this repo." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "7ec69f53-4a93-491a-b6f0-652b27ffaaf1", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "xenium_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/synthetic_data/xenium\" # Update it to the directory where you want to save the synthetic data\n", 196 | "enact_data_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/xenium_nuclei/chunks\" # Directory that saves all the input and results of the enact pipeline, \n", 197 | "# should end with \"oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\"\n", 198 | "\n", 199 | "transcripts_df_chunks_dir = os.path.join(xenium_dir, \"transcripts_patches\") # Directory to store the files that contain the transcripts info for each chunk\n", 200 | "output_dir = os.path.join(enact_data_dir, \"bins_gdf\") # Directory to store the results of gene-to-bin assignment for each chunk\n", 201 | "cells_df_chunks_dir = os.path.join(enact_data_dir,\"cells_gdf\") \n", 202 | "ground_truth_dir = os.path.join(xenium_dir, \"ground_truth_nuclei\")\n", 203 | "\n", 204 | "# Making relevant directories\n", 205 | 
"os.makedirs(xenium_dir, exist_ok=True)\n", 206 | "os.makedirs(enact_data_dir, exist_ok=True)\n", 207 | "os.makedirs(transcripts_df_chunks_dir, exist_ok=True)\n", 208 | "os.makedirs(output_dir, exist_ok=True)\n", 209 | "os.makedirs(cells_df_chunks_dir, exist_ok=True)\n", 210 | "os.makedirs(ground_truth_dir, exist_ok=True)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "dafe70a1-ed23-4cb6-a7b6-d35e4c01f895", 216 | "metadata": {}, 217 | "source": [ 218 | "### Generate Synthetic VesiumHD Dataset" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "5bdd8461-7bcc-4101-b26b-765daf975916", 224 | "metadata": { 225 | "tags": [] 226 | }, 227 | "source": [ 228 | "#### Break transcripts df to patches (based on location)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "042b4ce0-30d1-4c23-9b2d-0622db0a4f8c", 234 | "metadata": {}, 235 | "source": [ 236 | "Break transcripts df to patches of size 1000um x 1000um (larger patch size may result in memory issue)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "60fb886a-5893-40ba-b187-650d6cfb4ed6", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# patch size: 1000 um x 1000 um\n", 247 | "\n", 248 | "patch_size = 1000\n", 249 | "\n", 250 | "# patch indices\n", 251 | "transcripts_df['x_patch'] = (transcripts_df['x_location'] // patch_size).astype(int)\n", 252 | "transcripts_df['y_patch'] = (transcripts_df['y_location'] // patch_size).astype(int)\n", 253 | "transcripts_df[\"patch_id\"] = transcripts_df[\"x_patch\"].astype(str) + \"_\" + transcripts_df[\"y_patch\"].astype(str)\n", 254 | "\n", 255 | "# Create a df for each patch\n", 256 | "grouped = transcripts_df.groupby(['x_patch', 'y_patch'])\n", 257 | "for (x_patch, y_patch), group in grouped:\n", 258 | " # Calculate the start and end locations for each patch\n", 259 | " # x_start = x_patch * patch_size\n", 260 | " # x_end = (x_patch + 1) * patch_size\n", 261 | 
"    # y_start = y_patch * patch_size\n", 262 | "    # y_end = (y_patch + 1) * patch_size\n", 263 | "    \n", 264 | "    filename = f\"patch_{x_patch}_{y_patch}.csv\"\n", 265 | "    output_loc = os.path.join(transcripts_df_chunks_dir, filename)\n", 266 | "    group.to_csv(output_loc)\n", 267 | "\n", 268 | "    print(f\"Saved {filename}\")" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "a7bbc9ec-675b-4b25-8448-334ed317798a", 274 | "metadata": { 275 | "tags": [] 276 | }, 277 | "source": [ 278 | "#### Generate synthetic VisiumHD for each patch" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "ceebc8fd-88e9-4b14-8470-a474085dee64", 284 | "metadata": {}, 285 | "source": [ 286 | "Each patch is broken into bins of size 2um x 2um. The synthetic data contains transcript counts organized by bin_id. Each row contains transcript counts for a unique bin. Bins with no transcript counts are not included. \n", 287 | "\n", 288 | "In addition to all the gene features, there are two additional columns that represent the row number and column number of the bin, and a column that contains the Shapely polygon item that represents the bin. The first column is the bin_id."
289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "d19155a0-5646-49bd-915c-94737e251bb0", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "def generate_synthetic_VesiumHD_data(transcripts_df, bin_size=2, whole_cell=True, QScore20=True):\n", 299 | "    filtered_df = transcripts_df.copy()\n", 300 | "    # only count transcripts in the nucleus\n", 301 | "    if not whole_cell:\n", 302 | "        filtered_df = transcripts_df[transcripts_df['overlaps_nucleus'] == 1].copy()\n", 303 | "    \n", 304 | "    # only count transcripts with QScore >= 20\n", 305 | "    if QScore20:\n", 306 | "        filtered_df = filtered_df[filtered_df['qv'] >= 20].copy()\n", 307 | "    \n", 308 | "    # assign bin to each transcript\n", 309 | "    filtered_df.loc[:, 'row'] =np.ceil(filtered_df['y_location'] / bin_size).astype(int)\n", 310 | "    filtered_df.loc[:, 'column'] = np.ceil(filtered_df['x_location'] / bin_size).astype(int)\n", 311 | "    filtered_df.loc[:, 'assigned_bin_id'] = filtered_df.apply(\n", 312 | "        lambda row: f\"{bin_size}um_\" + str(row['row']).zfill(5) +\"_\"+ str(row['column']).zfill(5),\n", 313 | "        axis=1)\n", 314 | "    \n", 315 | "    bin_coordinates = filtered_df[['assigned_bin_id', 'row', 'column']].drop_duplicates().set_index('assigned_bin_id')\n", 316 | "    bin_gene_matrix = filtered_df.groupby(['assigned_bin_id', 'feature_name']).size().unstack(fill_value=0)\n", 317 | "    bin_gene_matrix_with_coords = bin_gene_matrix.merge(bin_coordinates, left_index=True, right_index=True)\n", 318 | "    \n", 319 | "    return bin_gene_matrix_with_coords" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "bd804c49-dc85-4fa9-85d4-a621cf0598ae", 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# Extract row and column number from the bin_id\n", 330 | "def extract_numbers(entry):\n", 331 | "    match = re.search(r'_(\\d{5})_(\\d{5})', entry)\n", 332 | "    if match:\n", 333 | "        number1 = int(match.group(1).lstrip('0')) \n", 
334 | " number2 = int(match.group(2).lstrip('0')) \n", 335 | " return number2*2-1, number1*2-1\n", 336 | " else:\n", 337 | " return None, None" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "id": "f8d45c22-2776-4b80-a29b-37d07f6b06c5", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "from tqdm import tqdm\n", 348 | "def generate_bin_polys(bins_df, x_col, y_col, bin_size):\n", 349 | " \"\"\"Represents the bins as Shapely polygons\n", 350 | "\n", 351 | " Args:\n", 352 | " bins_df (pd.DataFrame): bins dataframe\n", 353 | " x_col (str): column with the bin centre x-coordinate\n", 354 | " y_col (str): column with the bin centre y-coordinate\n", 355 | " bin_size (int): bin size in pixels\n", 356 | "\n", 357 | " Returns:\n", 358 | " list: list of Shapely polygons\n", 359 | " \"\"\"\n", 360 | " geometry = []\n", 361 | " # Generates Shapely polygons to represent each bin\n", 362 | "\n", 363 | " if True:\n", 364 | " half_bin_size = bin_size / 2\n", 365 | " bbox_coords = pd.DataFrame(\n", 366 | " {\n", 367 | " \"min_x\": bins_df[x_col] - half_bin_size,\n", 368 | " \"min_y\": bins_df[y_col] - half_bin_size,\n", 369 | " \"max_x\": bins_df[x_col] + half_bin_size,\n", 370 | " \"max_y\": bins_df[y_col] + half_bin_size,\n", 371 | " }\n", 372 | " )\n", 373 | " geometry = [\n", 374 | " shapely.geometry.box(min_x, min_y, max_x, max_y)\n", 375 | " for min_x, min_y, max_x, max_y in tqdm(\n", 376 | " zip(\n", 377 | " bbox_coords[\"min_x\"],\n", 378 | " bbox_coords[\"min_y\"],\n", 379 | " bbox_coords[\"max_x\"],\n", 380 | " bbox_coords[\"max_y\"],\n", 381 | " ),\n", 382 | " total=len(bins_df),\n", 383 | " )\n", 384 | " ]\n", 385 | "\n", 386 | " return geometry" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "9f1c4071-ff50-4ec1-bd0d-37c8ddecaa54", 393 | "metadata": { 394 | "tags": [] 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "# Loop through all the transcripra_df chunks 
and generate gene-to-bin assignments \n", 399 | "patch_size = 1000\n", 400 | "bin_size = 2\n", 401 | "transcripts_df_chunks = os.listdir(transcripts_df_chunks_dir)\n", 402 | "for chunk_fname in transcripts_df_chunks:\n", 403 | "    output_loc = os.path.join(output_dir, chunk_fname)\n", 404 | "    # if os.path.exists(output_loc):\n", 405 | "    #     continue\n", 406 | "    if chunk_fname in [\".ipynb_checkpoints\"]:\n", 407 | "        continue\n", 408 | "    transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_chunks_dir, chunk_fname))\n", 409 | "    bin_df_chunk = generate_synthetic_VesiumHD_data(transcripts_df_chunk, bin_size, whole_cell=True, QScore20=True)\n", 410 | "    bin_df_chunk['column'] = bin_df_chunk['column']*2-1\n", 411 | "    bin_df_chunk['row'] = bin_df_chunk['row']*2-1\n", 412 | "    bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', 2)\n", 413 | "    bin_gdf_chunk = gpd.GeoDataFrame( bin_df_chunk, geometry = bin_df_chunk['geometry'])\n", 414 | "    bin_df_chunk.to_csv(output_loc)\n", 415 | "    print(f\"Successfully assigned transcripts to bins for {chunk_fname}\")\n" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "105e310d-2a9d-41b5-9450-23ab3e57e7f7", 421 | "metadata": { 422 | "tags": [] 423 | }, 424 | "source": [ 425 | "### Generate cell_gdf as enact_pipeline input" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "id": "428d33fd-45be-4dde-b4b9-acc3de13f9e0", 431 | "metadata": {}, 432 | "source": [ 433 | "This section generates the cell_df patches required to run the ENACT pipeline. The main purpose is to create Shapely polygons that represent the cell outline."
434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "id": "5a4bff77-1b7a-4921-a4c2-0b66cf800468", 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "def create_polygons(coords_array):\n", 444 | " polygons = []\n", 445 | " for row in coords_array:\n", 446 | " reshaped_coords = row.reshape(-1, 2)\n", 447 | " polygon = Polygon(reshaped_coords)\n", 448 | " polygons.append(polygon)\n", 449 | " return polygons\n", 450 | "\n", 451 | "# Create the polygons\n", 452 | "polygons = create_polygons(file)\n", 453 | "cells_df['polygons'] = polygons" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "22875d42-5489-4ed0-b370-d693f26318e9", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "cell_gdf_chunk = gpd.GeoDataFrame(cells_df, geometry = cells_df['polygons'])\n", 464 | "cell_gdf_chunk.rename(columns={'x_centroid': 'cell_x', 'y_centroid': 'cell_y'}, inplace=True)\n", 465 | "cell_gdf_chunk.drop(\"Unnamed: 0\", axis=1, inplace=True)\n", 466 | "cell_gdf_chunk[['cell_id','cell_x','cell_y','geometry']].to_csv(os.path.join(enact_data_dir, \"cells_gdf\"))" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "id": "5e38a13d-3dfa-45e2-abc3-1b40c382a1db", 472 | "metadata": { 473 | "tags": [] 474 | }, 475 | "source": [ 476 | "### Run ENACT bin-to-cell pipeline\n", 477 | "In the configs.yaml file: \n", 478 | "\n", 479 | " Set \"analysis_name\" in the configs.yaml file to \"xenium\" (or \"xenium_nuclei).\n", 480 | " Set \"run_synthetic\" to True and all other steps to False.\n", 481 | " Set \"bin_to_cell_method\" to one of these four: \"naive\", \"weighted_by_area\", \"weighted_by_gene\", or \"weighted_by_cluster\"\n", 482 | "\n", 483 | "Run `make run_enact`" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "id": "2ae8aa8e-0a17-48ae-86ed-81a04ec203dc", 489 | "metadata": { 490 | "tags": [] 491 | }, 492 | "source": [ 493 | "### Generate Ground Truth" 494 
| ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "id": "670974eb-8dae-4d67-b735-1cd53858d560", 499 | "metadata": {}, 500 | "source": [ 501 | "The following cell will generate and save the ground truth of the synthetic VisiumHD data for the use of bin-to-cell assignment methods evaluation. Ground truth dataframe consists of rows representing the transcript counts of each cell. Each column represents a gene feature (gene feature name is also the column name)." 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "id": "8224ea02-5701-450c-9efb-c38de7492764", 507 | "metadata": { 508 | "tags": [] 509 | }, 510 | "source": [ 511 | "#### Generate Cell-gene matrix for evaluation" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "id": "8f23be59-86ef-4ed0-b9fd-b22b203fa769", 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "def generate_ground_truth_table(transcripts_df, cells_df, whole_cell=True, QScore20=True, include_unassigned_transcript=False):\n", 522 | " filtered_df = transcripts_df\n", 523 | " \n", 524 | " # only count transcripts in the nucleus\n", 525 | " if not whole_cell:\n", 526 | " filtered_df = transcripts_df[transcripts_df['overlaps_nucleus'] == 1]\n", 527 | " \n", 528 | " # only count transcripts with QScore >= 20\n", 529 | " if QScore20:\n", 530 | " filtered_df = filtered_df[filtered_df['qv'] >= 20]\n", 531 | " \n", 532 | " # only count transcripts that are assigned to specific cells\n", 533 | " if not include_unassigned_transcript:\n", 534 | " filtered_df = filtered_df[filtered_df['cell_id'] != 'UNASSIGNED']\n", 535 | " \n", 536 | " pivot_df = filtered_df.pivot_table(index='cell_id', columns='feature_name', aggfunc='size', fill_value=0)\n", 537 | " \n", 538 | " merged_df = pivot_df.merge(cells_df[['cell_id']], left_index=True, right_on='cell_id', how='right')\n", 539 | " columns = ['cell_id'] + [col for col in merged_df.columns if col not in ['cell_id', 'x_centroid', 
'y_centroid','polygons']]\n", 540 | "    merged_df = merged_df[columns]\n", 541 | "    merged_df.set_index('cell_id', inplace=True)\n", 542 | "    #merged_df['total_gene_counts'] = merged_df.iloc[:, 3:].sum(axis=1)\n", 543 | "    \n", 544 | "    return merged_df" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "id": "389f2644-5496-4286-961c-fa74ea32e97f", 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "bin_size = 2\n", 555 | "cell_df_chunks = os.listdir(cells_df_chunks_dir)\n", 556 | "for chunk_fname in cell_df_chunks:\n", 557 | "    output_loc = os.path.join(ground_truth_dir,chunk_fname)\n", 558 | "    if os.path.exists(output_loc):\n", 559 | "        continue\n", 560 | "    if chunk_fname in [\".ipynb_checkpoints\"]:\n", 561 | "        continue\n", 562 | "    cell_df_chunk = pd.read_csv(os.path.join(cells_df_chunks_dir, chunk_fname))\n", 563 | "    groundtruth_chunk = generate_ground_truth_table(transcripts_df, cell_df_chunk, whole_cell=False, QScore20=False, include_unassigned_transcript=False)\n", 564 | "    groundtruth_chunk.to_csv(output_loc)\n", 565 | "    print(f\"Successfully generated ground truth for {chunk_fname}\")" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "id": "f7a648b8-c2d9-4489-951e-dc0c443b489d", 571 | "metadata": { 572 | "tags": [] 573 | }, 574 | "source": [ 575 | "### Evaluation of ENACT bin-to-cell results" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "id": "3759f86f-8498-41b1-a7ea-ca934b102d22", 581 | "metadata": { 582 | "tags": [] 583 | }, 584 | "source": [ 585 | "#### Overall precision, recall, and f1" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "id": "20f300f2-73fb-4c86-9bf0-704f053d5299", 591 | "metadata": {}, 592 | "source": [ 593 | "Run this section with all the methods you have run with ENACT; change 'method' in the cell below to the one you want to evaluate."
594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "id": "5061ee46-1591-4a96-8643-5e96d7c55a44", 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "import pandas as pd\n", 604 | "import numpy as np\n", 605 | "\n", 606 | "method = \"weighted_by_cluster\"\n", 607 | "results_dir = os.path.join(enact_data_dir, method, \"bin_to_cell_assign\")\n", 608 | "\n", 609 | "# Initialize variables to accumulate weighted precision, recall, and F1\n", 610 | "total_cells = 0\n", 611 | "precision_sum = 0\n", 612 | "recall_sum = 0\n", 613 | "missing_cells_count = 0\n", 614 | "total_cells_count = 0\n", 615 | "results_chunks = os.listdir(results_dir)\n", 616 | "\n", 617 | "for chunk_fname in results_chunks:\n", 618 | " if chunk_fname in [\".ipynb_checkpoints\"]:\n", 619 | " continue\n", 620 | "\n", 621 | " generated = pd.read_csv(os.path.join(results_dir, chunk_fname))\n", 622 | " ground_truth = pd.read_csv(os.path.join(ground_truth_dir, chunk_fname))\n", 623 | " if len(generated) ==0:\n", 624 | " print(chunk_fname)\n", 625 | " continue\n", 626 | " generated.rename(columns={'id': 'cell_id'}, inplace=True)\n", 627 | " \n", 628 | " # Align both dataframes by 'cell_id', filling missing cells in generated with 0\n", 629 | " merged = pd.merge(ground_truth, generated, on='cell_id', how='left', suffixes=('_gt', '_gen')).fillna(0)\n", 630 | " num_cells = (ground_truth.iloc[:, 1:] != 0).any(axis=1).sum()\n", 631 | " missing_cells_count += num_cells - len(generated)\n", 632 | " total_cells_count += num_cells\n", 633 | "\n", 634 | " ground_truth_aligned = merged.filter(like='_gt').values\n", 635 | " generated_aligned = merged.filter(like='_gen').values\n", 636 | " assert ground_truth_aligned.shape == generated_aligned.shape, \"Aligned matrices must have the same shape!\"\n", 637 | "\n", 638 | " num_cells = ground_truth_aligned.shape[0]\n", 639 | "\n", 640 | " # Compute precision for the current patch\n", 641 | " patch_precision = 
np.sum(np.minimum(generated_aligned, ground_truth_aligned)) / np.sum(generated_aligned)\n", 642 | "\n", 643 | " # Compute recall for the current patch\n", 644 | " patch_recall = np.sum(np.minimum(generated_aligned, ground_truth_aligned)) / np.sum(ground_truth_aligned)\n", 645 | "\n", 646 | " # F1 score for the current patch\n", 647 | " if patch_precision + patch_recall > 0:\n", 648 | " patch_f1 = 2 * (patch_precision * patch_recall) / (patch_precision + patch_recall)\n", 649 | " else:\n", 650 | " patch_f1 = 0\n", 651 | "\n", 652 | " # Accumulate the weighted precision, recall, and number of aligned cells\n", 653 | " precision_sum += patch_precision * num_cells\n", 654 | " recall_sum += patch_recall * num_cells\n", 655 | " total_cells += num_cells\n", 656 | " \n", 657 | "# Compute overall weighted precision, recall, and F1 score\n", 658 | "overall_precision = precision_sum / total_cells\n", 659 | "overall_recall = recall_sum / total_cells\n", 660 | "\n", 661 | "if overall_precision + overall_recall > 0:\n", 662 | " overall_f1_score = 2 * (overall_precision * overall_recall) / (overall_precision + overall_recall)\n", 663 | "else:\n", 664 | " overall_f1_score = 0 \n", 665 | "\n", 666 | "# Print results\n", 667 | "print(f\"Overall Precision: {overall_precision}\")\n", 668 | "print(f\"Overall Recall: {overall_recall}\")\n", 669 | "print(f\"Overall F1 Score: {overall_f1_score}\")\n", 670 | "print(f\"Total missing cells in the generated data compared to ground truth: {missing_cells_count}\")\n", 671 | "print(f\"Total cells : {total_cells_count}\")" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "id": "eef397d4-ce75-4459-869e-7141fb72ba79", 677 | "metadata": { 678 | "tags": [] 679 | }, 680 | "source": [ 681 | "#### Visualize the distribution using violin plots " 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "id": "e0b763a9-4dce-48c3-9e43-40d2fbfd7c88", 687 | "metadata": {}, 688 | "source": [ 689 | "The following cells would create violin 
plots for all four methods in order to better compare the results. You can choose to only compare the ones you have run by changing the 'methods' list below to only include those." 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "id": "b5e2326d-d85e-4075-afa5-2edf492eef0b", 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "import pandas as pd\n", 700 | "import numpy as np\n", 701 | "import os\n", 702 | "import seaborn as sns\n", 703 | "import matplotlib.pyplot as plt\n", 704 | "\n", 705 | "# Define methods and their directories\n", 706 | "methods = [\n", 707 | " {\n", 708 | " 'name': 'Naive',\n", 709 | " 'results_dir': os.path.join(enact_data_dir, \"naive\", \"bin_to_cell_assign\"), \n", 710 | " 'ground_truth_dir':ground_truth_dir\n", 711 | " },\n", 712 | " {\n", 713 | " 'name': 'Weighted_by_area',\n", 714 | " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_area\", \"bin_to_cell_assign\"), \n", 715 | " 'ground_truth_dir':ground_truth_dir\n", 716 | " },\n", 717 | " {\n", 718 | " 'name': 'Weighted_by_gene',\n", 719 | " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_gene\", \"bin_to_cell_assign\"), \n", 720 | " 'ground_truth_dir': ground_truth_dir\n", 721 | " },\n", 722 | " {\n", 723 | " 'name': 'Weighted_by_cluster',\n", 724 | " 'results_dir': os.path.join(enact_data_dir, \"weighted_by_cluster\", \"bin_to_cell_assign\"), \n", 725 | " 'ground_truth_dir': ground_truth_dir\n", 726 | " }\n", 727 | "]\n", 728 | "\n", 729 | "# Initialize a list to store per-patch metrics for all methods\n", 730 | "metrics_list = []\n", 731 | "\n", 732 | "# Loop through each method to compute per-patch metrics\n", 733 | "for method in methods:\n", 734 | " method_name = method['name']\n", 735 | " results_dir = method['results_dir']\n", 736 | " ground_truth_dir = method['ground_truth_dir']\n", 737 | " \n", 738 | " print(f\"Processing {method_name}...\")\n", 739 | " \n", 740 | " # Get list of generated and ground 
truth files\n", 741 | " generated_files = [f for f in os.listdir(results_dir) if f.endswith('.csv') and f not in [\".ipynb_checkpoints\"]]\n", 742 | " ground_truth_files = [f for f in os.listdir(ground_truth_dir) if f.endswith('.csv') and f not in [\".ipynb_checkpoints\"]]\n", 743 | " \n", 744 | " # Find common files between generated results and ground truth\n", 745 | " common_files = set(generated_files) & set(ground_truth_files)\n", 746 | " \n", 747 | " if not common_files:\n", 748 | " print(f\"No common files found for {method_name}. Skipping method.\")\n", 749 | " continue\n", 750 | " \n", 751 | " # Loop through each common file (patch)\n", 752 | " for fname in common_files:\n", 753 | " ground_truth_path = os.path.join(ground_truth_dir, fname)\n", 754 | " generated_path = os.path.join(results_dir, fname)\n", 755 | " \n", 756 | " # Load ground truth and generated data\n", 757 | " ground_truth = pd.read_csv(ground_truth_path)\n", 758 | " generated = pd.read_csv(generated_path)\n", 759 | " \n", 760 | " # Skip if generated data is empty\n", 761 | " if generated.empty:\n", 762 | " print(f\"No data in generated file {fname} for {method_name}. 
Skipping patch.\")\n", 763 | " continue\n", 764 | " \n", 765 | " # Rename columns for consistency\n", 766 | " if 'id' in generated.columns:\n", 767 | " generated.rename(columns={'id': 'cell_id'}, inplace=True)\n", 768 | " \n", 769 | " # Merge ground truth and generated data on 'cell_id', filling missing values with 0\n", 770 | " merged = pd.merge(\n", 771 | " ground_truth, generated, on='cell_id', how='outer', suffixes=('_gt', '_gen')\n", 772 | " ).fillna(0)\n", 773 | " \n", 774 | " # Extract aligned matrices for ground truth and generated data\n", 775 | " ground_truth_aligned = merged.filter(regex='_gt$').values\n", 776 | " generated_aligned = merged.filter(regex='_gen$').values\n", 777 | " \n", 778 | " # Ensure matrices are aligned\n", 779 | " if ground_truth_aligned.shape != generated_aligned.shape:\n", 780 | " print(f\"Shape mismatch in patch {fname} for {method_name}. Skipping patch.\")\n", 781 | " continue\n", 782 | " \n", 783 | " # Compute counts for this patch\n", 784 | " tp = np.sum(np.minimum(generated_aligned, ground_truth_aligned))\n", 785 | " predicted = np.sum(generated_aligned)\n", 786 | " actual = np.sum(ground_truth_aligned)\n", 787 | " \n", 788 | " # Compute metrics for this patch\n", 789 | " precision = tp / predicted if predicted > 0 else 0\n", 790 | " recall = tp / actual if actual > 0 else 0\n", 791 | " f1_score = (\n", 792 | " 2 * (precision * recall) / (precision + recall)\n", 793 | " if (precision + recall) > 0 else 0\n", 794 | " )\n", 795 | " \n", 796 | " # Store metrics for this patch\n", 797 | " metrics_list.append({\n", 798 | " 'Method': method_name,\n", 799 | " 'Patch': fname,\n", 800 | " 'Precision': precision,\n", 801 | " 'Recall': recall,\n", 802 | " 'F1 Score': f1_score\n", 803 | " })\n", 804 | "\n", 805 | "# Create a DataFrame with per-patch metrics\n", 806 | "metrics_df = pd.DataFrame(metrics_list)\n", 807 | "\n", 808 | "# Display the first few rows of the DataFrame\n", 809 | "print(\"\\nPer-Patch Metrics:\")\n", 810 | 
"print(metrics_df.head())" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "id": "6ad2f5b2-4b89-4480-85b5-6af2cc6bfb56", 817 | "metadata": {}, 818 | "outputs": [], 819 | "source": [ 820 | "# plotting\n", 821 | "sns.set(style=\"whitegrid\")\n", 822 | "\n", 823 | "# Create a figure with subplots for each metric\n", 824 | "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n", 825 | "\n", 826 | "# Precision Violin Plot\n", 827 | "sns.violinplot(x='Method', y='Precision', data=metrics_df, ax=axes[0], inner='quartile', palette='Set2')\n", 828 | "axes[0].set_title('Precision')\n", 829 | "axes[0].set_xlabel('Method')\n", 830 | "axes[0].set_ylabel('value')\n", 831 | "axes[0].set_ylim(0,1)\n", 832 | "axes[0].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", 833 | "\n", 834 | "# Recall Violin Plot\n", 835 | "sns.violinplot(x='Method', y='Recall', data=metrics_df, ax=axes[1], inner='quartile', palette='Set2')\n", 836 | "axes[1].set_title('Recall')\n", 837 | "axes[1].set_xlabel('Method')\n", 838 | "axes[1].set_ylabel('value')\n", 839 | "axes[1].set_ylim(0,1)\n", 840 | "axes[1].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", 841 | "\n", 842 | "# F1 Score Violin Plot\n", 843 | "sns.violinplot(x='Method', y='F1 Score', data=metrics_df, ax=axes[2], inner='quartile', palette='Set2')\n", 844 | "axes[2].set_title('F1 Score')\n", 845 | "axes[2].set_xlabel('Method')\n", 846 | "axes[2].set_ylabel('value')\n", 847 | "axes[2].set_ylim(0,1)\n", 848 | "axes[2].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", 849 | "\n", 850 | "plt.tight_layout()\n", 851 | "plt.show()" 852 | ] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python 3 (ipykernel)", 858 | "language": "python", 859 | "name": "python3" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 3 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": 
"text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython3", 871 | "version": "3.10.14" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 5 876 | } 877 | -------------------------------------------------------------------------------- /src/synthetic_data/generate_synthetic_data_seqFISH.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4d2cee82-d84e-4c57-af58-6ce961a3f819", 6 | "metadata": {}, 7 | "source": [ 8 | "To generate synthetic VisiumHD data from seqFISH+, please read and run all the cells below. Thanks!" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a9fcd48a-2f55-43b4-befd-8d646ea634cf", 14 | "metadata": { 15 | "tags": [] 16 | }, 17 | "source": [ 18 | "### Install prerequisite libraries" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "7453e3e3-a55c-47fb-ab83-2c3743833b89", 25 | "metadata": { 26 | "scrolled": true, 27 | "tags": [] 28 | }, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 35 | "Requirement already satisfied: pip in /opt/conda/lib/python3.10/site-packages (24.2)\n", 36 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 37 | "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (1.10.0)\n", 38 | "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /home/oneai/.local/lib/python3.10/site-packages (from scipy) (1.22.4)\n", 39 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, 
https://pypi.org/simple\n", 40 | "Requirement already satisfied: shapely in /home/oneai/.local/lib/python3.10/site-packages (2.0.0)\n", 41 | "Requirement already satisfied: numpy>=1.14 in /home/oneai/.local/lib/python3.10/site-packages (from shapely) (1.22.4)\n", 42 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 43 | "Requirement already satisfied: tifffile in /home/oneai/.local/lib/python3.10/site-packages (2022.10.10)\n", 44 | "Requirement already satisfied: numpy>=1.19.2 in /home/oneai/.local/lib/python3.10/site-packages (from tifffile) (1.22.4)\n", 45 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 46 | "Requirement already satisfied: plotly in /home/oneai/.local/lib/python3.10/site-packages (5.13.1)\n", 47 | "Requirement already satisfied: tenacity>=6.2.0 in /home/oneai/.local/lib/python3.10/site-packages (from plotly) (9.0.0)\n", 48 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 49 | "Requirement already satisfied: tensorflow-gpu==2.10.0 in /opt/conda/lib/python3.10/site-packages (2.10.0)\n", 50 | "Requirement already satisfied: absl-py>=1.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.1.0)\n", 51 | "Requirement already satisfied: astunparse>=1.6.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.6.3)\n", 52 | "Requirement already satisfied: flatbuffers>=2.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (24.3.25)\n", 53 | "Requirement already satisfied: gast<=0.4.0,>=0.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.4.0)\n", 54 | "Requirement already satisfied: 
google-pasta>=0.1.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.2.0)\n", 55 | "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.66.1)\n", 56 | "Requirement already satisfied: h5py>=2.9.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.11.0)\n", 57 | "Requirement already satisfied: keras<2.11,>=2.10.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.0)\n", 58 | "Requirement already satisfied: keras-preprocessing>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.1.2)\n", 59 | "Requirement already satisfied: libclang>=13.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (18.1.1)\n", 60 | "Requirement already satisfied: numpy>=1.20 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.22.4)\n", 61 | "Requirement already satisfied: opt-einsum>=2.3.2 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.3.0)\n", 62 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (24.0)\n", 63 | "Requirement already satisfied: protobuf<3.20,>=3.9.2 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (3.19.6)\n", 64 | "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (69.5.1)\n", 65 | "Requirement already satisfied: six>=1.12.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.16.0)\n", 66 | "Requirement already satisfied: tensorboard<2.11,>=2.10 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.1)\n", 67 | "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (0.37.1)\n", 
68 | "Requirement already satisfied: tensorflow-estimator<2.11,>=2.10.0 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.10.0)\n", 69 | "Requirement already satisfied: termcolor>=1.1.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (2.4.0)\n", 70 | "Requirement already satisfied: typing-extensions>=3.6.6 in /opt/conda/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (4.12.2)\n", 71 | "Requirement already satisfied: wrapt>=1.11.0 in /home/oneai/.local/lib/python3.10/site-packages (from tensorflow-gpu==2.10.0) (1.14.1)\n", 72 | "Requirement already satisfied: wheel<1.0,>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from astunparse>=1.6.0->tensorflow-gpu==2.10.0) (0.43.0)\n", 73 | "Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.33.0)\n", 74 | "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.4.6)\n", 75 | "Requirement already satisfied: markdown>=2.6.8 in /home/oneai/.local/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.7)\n", 76 | "Requirement already satisfied: requests<3,>=2.21.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.31.0)\n", 77 | "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.6.1)\n", 78 | "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (1.8.1)\n", 79 | "Requirement already satisfied: werkzeug>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.0.4)\n", 80 | "Requirement 
already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (5.4.0)\n", 81 | "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.4.0)\n", 82 | "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (4.9)\n", 83 | "Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.10/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.0.0)\n", 84 | "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.3.2)\n", 85 | "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.6)\n", 86 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (1.26.19)\n", 87 | "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2024.7.4)\n", 88 | "Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/conda/lib/python3.10/site-packages (from werkzeug>=1.0.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (2.1.5)\n", 89 | "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (0.6.0)\n", 90 | "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.10/site-packages 
(from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow-gpu==2.10.0) (3.2.2)\n", 91 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 92 | "Requirement already satisfied: stardist in /home/oneai/.local/lib/python3.10/site-packages (0.9.1)\n", 93 | "Requirement already satisfied: csbdeep>=0.8.0 in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.8.0)\n", 94 | "Requirement already satisfied: scikit-image in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.19.3)\n", 95 | "Requirement already satisfied: numba in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (0.55.2)\n", 96 | "Requirement already satisfied: imageio in /home/oneai/.local/lib/python3.10/site-packages (from stardist) (2.35.1)\n", 97 | "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.22.4)\n", 98 | "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.10.0)\n", 99 | "Requirement already satisfied: matplotlib in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (3.6.2)\n", 100 | "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (1.16.0)\n", 101 | "Requirement already satisfied: tifffile in /home/oneai/.local/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (2022.10.10)\n", 102 | "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (4.66.2)\n", 103 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from csbdeep>=0.8.0->stardist) (24.0)\n", 104 | "Requirement already satisfied: pillow>=8.3.2 in /home/oneai/.local/lib/python3.10/site-packages (from 
imageio->stardist) (10.4.0)\n", 105 | "Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in /home/oneai/.local/lib/python3.10/site-packages (from numba->stardist) (0.38.1)\n", 106 | "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from numba->stardist) (69.5.1)\n", 107 | "Requirement already satisfied: networkx>=2.2 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-image->stardist) (3.3)\n", 108 | "Requirement already satisfied: PyWavelets>=1.1.1 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-image->stardist) (1.6.0)\n", 109 | "Requirement already satisfied: contourpy>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (1.2.1)\n", 110 | "Requirement already satisfied: cycler>=0.10 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (0.12.1)\n", 111 | "Requirement already satisfied: fonttools>=4.22.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (4.53.1)\n", 112 | "Requirement already satisfied: kiwisolver>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (1.4.5)\n", 113 | "Requirement already satisfied: pyparsing>=2.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (3.1.4)\n", 114 | "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->csbdeep>=0.8.0->stardist) (2.9.0)\n", 115 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 116 | "Requirement already satisfied: geopandas in /home/oneai/.local/lib/python3.10/site-packages (0.12.2)\n", 117 | "Requirement already satisfied: pandas>=1.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (1.5.2)\n", 118 | 
"Requirement already satisfied: shapely>=1.7 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (2.0.0)\n", 119 | "Requirement already satisfied: fiona>=1.8 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (1.9.6)\n", 120 | "Requirement already satisfied: pyproj>=2.6.1.post1 in /home/oneai/.local/lib/python3.10/site-packages (from geopandas) (3.6.1)\n", 121 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from geopandas) (24.0)\n", 122 | "Requirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (24.2.0)\n", 123 | "Requirement already satisfied: certifi in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (2024.7.4)\n", 124 | "Requirement already satisfied: click~=8.0 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (8.1.7)\n", 125 | "Requirement already satisfied: click-plugins>=1.0 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (1.1.1)\n", 126 | "Requirement already satisfied: cligj>=0.5 in /home/oneai/.local/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (0.7.2)\n", 127 | "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from fiona>=1.8->geopandas) (1.16.0)\n", 128 | "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (2.9.0)\n", 129 | "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (2022.7.1)\n", 130 | "Requirement already satisfied: numpy>=1.21.0 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0.0->geopandas) (1.22.4)\n", 131 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 132 | "Requirement already satisfied: scanpy in 
/home/oneai/.local/lib/python3.10/site-packages (1.9.1)\n", 133 | "Requirement already satisfied: anndata>=0.7.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.8.0)\n", 134 | "Requirement already satisfied: numpy>=1.17.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.22.4)\n", 135 | "Requirement already satisfied: matplotlib>=3.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.6.2)\n", 136 | "Requirement already satisfied: pandas>=1.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.5.2)\n", 137 | "Requirement already satisfied: scipy>=1.4 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.10.0)\n", 138 | "Requirement already satisfied: seaborn in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.13.2)\n", 139 | "Requirement already satisfied: h5py>=3 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.11.0)\n", 140 | "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from scanpy) (4.66.2)\n", 141 | "Requirement already satisfied: scikit-learn>=0.22 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.2.0)\n", 142 | "Requirement already satisfied: statsmodels>=0.10.0rc2 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.14.2)\n", 143 | "Requirement already satisfied: patsy in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.5.6)\n", 144 | "Requirement already satisfied: networkx>=2.3 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (3.3)\n", 145 | "Requirement already satisfied: natsort in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (8.4.0)\n", 146 | "Requirement already satisfied: joblib in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.4.2)\n", 147 | "Requirement already satisfied: numba>=0.41.0 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.55.2)\n", 148 | "Requirement already satisfied: 
umap-learn>=0.3.10 in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (0.5.6)\n", 149 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from scanpy) (24.0)\n", 150 | "Requirement already satisfied: session-info in /home/oneai/.local/lib/python3.10/site-packages (from scanpy) (1.0.0)\n", 151 | "Requirement already satisfied: contourpy>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.2.1)\n", 152 | "Requirement already satisfied: cycler>=0.10 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (0.12.1)\n", 153 | "Requirement already satisfied: fonttools>=4.22.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (4.53.1)\n", 154 | "Requirement already satisfied: kiwisolver>=1.0.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (1.4.5)\n", 155 | "Requirement already satisfied: pillow>=6.2.0 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (10.4.0)\n", 156 | "Requirement already satisfied: pyparsing>=2.2.1 in /home/oneai/.local/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (3.1.4)\n", 157 | "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib>=3.4->scanpy) (2.9.0)\n", 158 | "Requirement already satisfied: llvmlite<0.39,>=0.38.0rc1 in /home/oneai/.local/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (0.38.1)\n", 159 | "Requirement already satisfied: setuptools in /opt/conda/lib/python3.10/site-packages (from numba>=0.41.0->scanpy) (69.5.1)\n", 160 | "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.0->scanpy) (2022.7.1)\n", 161 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/oneai/.local/lib/python3.10/site-packages (from scikit-learn>=0.22->scanpy) (3.1.0)\n", 162 | "Requirement already 
satisfied: six in /opt/conda/lib/python3.10/site-packages (from patsy->scanpy) (1.16.0)\n", 163 | "Requirement already satisfied: pynndescent>=0.5 in /home/oneai/.local/lib/python3.10/site-packages (from umap-learn>=0.3.10->scanpy) (0.5.13)\n", 164 | "Requirement already satisfied: stdlib-list in /home/oneai/.local/lib/python3.10/site-packages (from session-info->scanpy) (0.10.0)\n", 165 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 166 | "Requirement already satisfied: fastparquet in /home/oneai/.local/lib/python3.10/site-packages (2024.5.0)\n", 167 | "Requirement already satisfied: pandas>=1.5.0 in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (1.5.2)\n", 168 | "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (1.22.4)\n", 169 | "Requirement already satisfied: cramjam>=2.3 in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (2.8.3)\n", 170 | "Requirement already satisfied: fsspec in /home/oneai/.local/lib/python3.10/site-packages (from fastparquet) (2024.6.1)\n", 171 | "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from fastparquet) (24.0)\n", 172 | "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2.9.0)\n", 173 | "Requirement already satisfied: pytz>=2020.1 in /home/oneai/.local/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2022.7.1)\n", 174 | "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas>=1.5.0->fastparquet) (1.16.0)\n", 175 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 176 | "Requirement already satisfied: imagecodecs in 
/home/oneai/.local/lib/python3.10/site-packages (2024.6.1)\n", 177 | "Requirement already satisfied: numpy in /home/oneai/.local/lib/python3.10/site-packages (from imagecodecs) (1.22.4)\n", 178 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 179 | "Requirement already satisfied: zarr in /home/oneai/.local/lib/python3.10/site-packages (2.17.1)\n", 180 | "Requirement already satisfied: asciitree in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.3.3)\n", 181 | "Requirement already satisfied: numpy>=1.21.1 in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (1.22.4)\n", 182 | "Requirement already satisfied: numcodecs>=0.10.0 in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.13.0)\n", 183 | "Requirement already satisfied: fasteners in /home/oneai/.local/lib/python3.10/site-packages (from zarr) (0.19)\n", 184 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 185 | "Requirement already satisfied: scipy in /home/oneai/.local/lib/python3.10/site-packages (1.10.0)\n", 186 | "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /home/oneai/.local/lib/python3.10/site-packages (from scipy) (1.22.4)\n", 187 | "Looking in indexes: https://jfrog-proxy.services.p171649450587.aws-emea.sanofi.com/artifactory/api/pypi/pypi-one_ai-virtual/simple, https://pypi.org/simple\n", 188 | "Requirement already satisfied: h5py in /home/oneai/.local/lib/python3.10/site-packages (3.11.0)\n", 189 | "Requirement already satisfied: numpy>=1.17.3 in /home/oneai/.local/lib/python3.10/site-packages (from h5py) (1.22.4)\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "!pip install --upgrade pip\n", 195 | "!pip install scipy\n", 196 | "!pip install shapely\n", 197 | "!pip install tifffile\n", 198 | "!pip install plotly\n", 199 | 
"!pip install tensorflow-gpu==2.10.0\n", 200 | "!pip install stardist\n", 201 | "!pip install geopandas\n", 202 | "!pip install scanpy\n", 203 | "!pip install fastparquet\n", 204 | "!pip install imagecodecs\n", 205 | "!pip install zarr\n", 206 | "!pip install scipy\n", 207 | "!pip install h5py" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "1f79fb2c-0fd9-4bd4-8be9-4d1bd04d8733", 213 | "metadata": { 214 | "tags": [] 215 | }, 216 | "source": [ 217 | "### Import Relevant Libraries" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "16e4dc02-2b8d-4e00-9cbd-8a4d151ca5af", 224 | "metadata": { 225 | "scrolled": true, 226 | "tags": [] 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "import tifffile as tifi # Package to read the WSI (whole slide image)\n", 231 | "from csbdeep.utils import normalize # Image normalization\n", 232 | "from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point objects\n", 233 | "from shapely import wkt\n", 234 | "import geopandas as gpd # Geopandas for storing Shapely objects\n", 235 | "from matplotlib.colors import ListedColormap\n", 236 | "import matplotlib.pyplot as plt\n", 237 | "import scanpy as sc\n", 238 | "import pandas as pd\n", 239 | "from scipy import sparse\n", 240 | "import anndata\n", 241 | "import os\n", 242 | "import gzip\n", 243 | "import numpy as np\n", 244 | "import re\n", 245 | "import shapely\n", 246 | "import zarr\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "a91a092e-781d-4a9e-8777-d3bb9c99309c", 252 | "metadata": { 253 | "tags": [] 254 | }, 255 | "source": [ 256 | "### Create folders to store synthetic data" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "id": "37e30a8b-77f8-4d2c-97c2-8274eb0d23a3", 262 | "metadata": {}, 263 | "source": [ 264 | "For both the `seqfish_dir` and `enact_data_dir`, change `\"/home/oneai/\"` to the directory that stores this repo." 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "01f77ecd-3f9a-4a39-bbb2-e90e851ec360", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "seqfish_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/synthetic_data/seqFISH\" # Update it to the directory where you want to save the synthetic data\n", 275 | "enact_data_dir = \"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\" # Directory that saves all the input and results of the enact pipeline, \n", 276 | "# should end with \"oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/chunks\"\n", 277 | "\n", 278 | "transcripts_df_chunks_dir = os.path.join(seqfish_dir, \"transcripts_patches\") # Directory to store the files that contain the transcripts info for each chunk\n", 279 | "output_dir = os.path.join(enact_data_dir, \"bins_gdf\") # Directory to store the generated synthetic binned transcript counts\n", 280 | "cells_df_chunks_dir = os.path.join(enact_data_dir,\"cells_gdf\") # Directory to store the generated synthetic binned transcript counts\n", 281 | "\n", 282 | "# Making relevant directories\n", 283 | "os.makedirs(seqfish_dir, exist_ok=True)\n", 284 | "os.makedirs(enact_data_dir, exist_ok=True)\n", 285 | "os.makedirs(transcripts_df_chunks_dir, exist_ok=True)\n", 286 | "os.makedirs(output_dir, exist_ok=True)\n", 287 | "os.makedirs(cells_df_chunks_dir, exist_ok=True)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "0048c41f-18ee-4b92-b7ea-680956330667", 293 | "metadata": { 294 | "tags": [] 295 | }, 296 | "source": [ 297 | "### Download seqFISH+ data\n", 298 | "\n", 299 | "1. Download \"ROIs_Experiment1_NIH3T3.zip\" from https://zenodo.org/records/2669683#.Xqi1w5NKg6g to seqfish_dir. The zipfile contains cell segmentation files\n", 300 | "2. Download \"run1.csv.gz\" from https://github.com/MonashBioinformaticsPlatform/seqfish-hack. 
It contains the tidy format of \"seqFISH+_NIH3T3_point_locations.zip\" from the official seqFISH+ zenodo site" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "id": "46a8d90a-65dd-4e93-b4e2-4a257d6e1dc7", 306 | "metadata": { 307 | "tags": [] 308 | }, 309 | "source": [ 310 | "### Load Cell & Transcripts Info" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "id": "a40feb4c-1510-4222-bdec-a5e419758f32", 316 | "metadata": {}, 317 | "source": [ 318 | "This following cells first unzip \"ROIs_Experiment1_NIH3T3.zip\" to extract the cell segmentation information. Then load transcripts dataframe from \"run1.csv.gz\"" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "e7bb6152-3999-4ccf-9a08-8fad268ab972", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "import zipfile\n", 329 | "import os\n", 330 | "zip_file_path = os.path.join(seqfish_dir, \"ROIs_Experiment1_NIH3T3.zip\")\n", 331 | "\n", 332 | "# Open the ZIP file and extract all the contents\n", 333 | "with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n", 334 | " zip_ref.extractall(seqfish_dir)\n", 335 | "\n", 336 | "print(f'Files extracted to {seqfish_dir}')" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "062ee054-6e0e-4c2b-9782-9e2b328c18e0", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "file_path = os.path.join(seqfish_dir, \"run1.csv.gz\")\n", 347 | "\n", 348 | "transcripts_df = pd.read_csv(file_path, compression='gzip')\n", 349 | "print(transcripts_df)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "eb2be572-8903-4539-8306-087cf61aa82d", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "# convert from pixel to um\n", 360 | "transcripts_df.x = transcripts_df.x*0.103\n", 361 | "transcripts_df.y = transcripts_df.y*0.103\n", 362 | "# label cell to include fov and cell number\n", 363 | 
"transcripts_df['new_cell_name'] = transcripts_df.apply(lambda x: f\"{x['fov']}_Cell_{x['cell']}\", axis=1)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "id": "5c8134a2-9a2b-41b6-81ab-4e292609e2f2", 369 | "metadata": { 370 | "tags": [] 371 | }, 372 | "source": [ 373 | "### Generate Ground Truth" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "id": "9ae7d39d-f065-454a-bea6-7d31f57139fd", 379 | "metadata": {}, 380 | "source": [ 381 | "The following cell will generate and save the ground truth of the synthetic VisiumHD data for the use of bin-to-cell assignment methods evaluation. Ground truth dataframe consists of rows representing the transcript counts of each cell. Each column represents a gene feature (gene feature name is also the column name)." 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "id": "b9bc5483-2357-40aa-a5b8-1a140b08967a", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "groundtruth_df = transcripts_df.pivot_table(index=['new_cell_name'], columns='gene', aggfunc='size', fill_value=0)\n", 392 | "ground_truth_file = os.path.join(seqfish_dir, \"groundtruth.csv\")\n", 393 | "groundtruth_df.to_csv(ground_truth_file)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "id": "dafe70a1-ed23-4cb6-a7b6-d35e4c01f895", 399 | "metadata": { 400 | "tags": [] 401 | }, 402 | "source": [ 403 | "### Generate Synthetic VisiumHD Dataset" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "id": "5bdd8461-7bcc-4101-b26b-765daf975916", 409 | "metadata": { 410 | "tags": [] 411 | }, 412 | "source": [ 413 | "#### Break transcripts df to patches (based on fov)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "id": "b0d353b0-74cc-44f1-9a5b-ab5533d5d76a", 419 | "metadata": {}, 420 | "source": [ 421 | "Break transcripts df to patches based on their field of view (fov), since cell segmentation is done on each individual fov separately." 
422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "60fb886a-5893-40ba-b187-650d6cfb4ed6", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "# Create a df for each fov\n", 432 | "grouped = transcripts_df.groupby(['fov'])\n", 433 | "for fov, group in grouped:\n", 434 | " filename = f\"patch_{fov}.csv\"\n", 435 | " output_loc = os.path.join(transcripts_df_chunks_dir, filename)\n", 436 | " group.to_csv(output_loc)\n", 437 | "\n", 438 | " print(f\"Saved {filename}\")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "a7bbc9ec-675b-4b25-8448-334ed317798a", 444 | "metadata": { 445 | "tags": [] 446 | }, 447 | "source": [ 448 | "#### Generate synthetic VisiumHD for each patch" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "99052790-7e12-4851-b9a4-e9ead3a55d0f", 454 | "metadata": {}, 455 | "source": [ 456 | "Each fov is broken into bins of size 2um x 2um. The synthetic data contains transcript counts organized by bin_id. Each row contains transcript counts for a unique bin. Bins with no transcript counts are not included. \n", 457 | "\n", 458 | "In addition to all the gene features, there are two additional columns that represent the row number and column number of the bin, and a column that contains the Shapely polygon item that represents the bin. The first column is the bin_id." 
459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "id": "d19155a0-5646-49bd-915c-94737e251bb0", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "def generate_synthetic_VesiumHD_data(transcripts_df, bin_size=2):\n", 469 | " \n", 470 | " filtered_df = transcripts_df.copy()\n", 471 | " \n", 472 | " # assigne bin to each transcript\n", 473 | " filtered_df.loc[:, 'row'] =np.ceil(filtered_df['y'] / bin_size).astype(int)\n", 474 | " filtered_df.loc[:, 'column'] = np.ceil(filtered_df['x'] / bin_size).astype(int)\n", 475 | " filtered_df.loc[:, 'assigned_bin_id'] = filtered_df.apply(\n", 476 | " lambda row: f\"{bin_size}um_\" + str(row['row']).zfill(5) +\"_\"+ str(row['column']).zfill(5),\n", 477 | " axis=1)\n", 478 | " bin_coordinates = filtered_df[['assigned_bin_id', 'row', 'column']].drop_duplicates().set_index('assigned_bin_id')\n", 479 | " bin_gene_matrix = filtered_df.groupby(['assigned_bin_id', 'gene']).size().unstack(fill_value=0)\n", 480 | " bin_gene_matrix_with_coords = bin_gene_matrix.merge(bin_coordinates, left_index=True, right_index=True)\n", 481 | " return bin_gene_matrix_with_coords" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "bd804c49-dc85-4fa9-85d4-a621cf0598ae", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "# Extract row and column number from the bin_id\n", 492 | "def extract_numbers(entry):\n", 493 | " match = re.search(r'_(\\d{5})_(\\d{5})', entry)\n", 494 | " if match:\n", 495 | " number1 = int(match.group(1).lstrip('0')) \n", 496 | " number2 = int(match.group(2).lstrip('0')) \n", 497 | " return number2*2-1, number1*2-1\n", 498 | " else:\n", 499 | " return None, None" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "id": "ee921e47-70e4-4bee-92e3-6ce40a0fb50d", 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "from tqdm import tqdm\n", 510 | "def 
generate_bin_polys(bins_df, x_col, y_col, bin_size):\n", 511 | " \"\"\"Represents the bins as Shapely polygons\n", 512 | "\n", 513 | " Args:\n", 514 | " bins_df (pd.DataFrame): bins dataframe\n", 515 | " x_col (str): column with the bin centre x-coordinate\n", 516 | " y_col (str): column with the bin centre y-coordinate\n", 517 | " bin_size (int): bin size in pixels\n", 518 | "\n", 519 | " Returns:\n", 520 | " list: list of Shapely polygons\n", 521 | " \"\"\"\n", 522 | " geometry = []\n", 523 | " # Generates Shapely polygons to represent each bin\n", 524 | "\n", 525 | " if True:\n", 526 | " half_bin_size = bin_size / 2\n", 527 | " bbox_coords = pd.DataFrame(\n", 528 | " {\n", 529 | " \"min_x\": bins_df[x_col] - half_bin_size,\n", 530 | " \"min_y\": bins_df[y_col] - half_bin_size,\n", 531 | " \"max_x\": bins_df[x_col] + half_bin_size,\n", 532 | " \"max_y\": bins_df[y_col] + half_bin_size,\n", 533 | " }\n", 534 | " )\n", 535 | " geometry = [\n", 536 | " shapely.geometry.box(min_x, min_y, max_x, max_y)\n", 537 | " for min_x, min_y, max_x, max_y in tqdm(\n", 538 | " zip(\n", 539 | " bbox_coords[\"min_x\"],\n", 540 | " bbox_coords[\"min_y\"],\n", 541 | " bbox_coords[\"max_x\"],\n", 542 | " bbox_coords[\"max_y\"],\n", 543 | " ),\n", 544 | " total=len(bins_df),\n", 545 | " )\n", 546 | " ]\n", 547 | "\n", 548 | " return geometry" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "id": "9f1c4071-ff50-4ec1-bd0d-37c8ddecaa54", 555 | "metadata": { 556 | "tags": [] 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "# Loop through all the transcripra_df patches and generate gene-to-bin assignments \n", 561 | "bin_size = 2\n", 562 | "transcripts_df_chunks = os.listdir(transcripts_df_chunks_dir)\n", 563 | "for chunk_fname in transcripts_df_chunks:\n", 564 | " output_loc = os.path.join(output_dir, chunk_fname)\n", 565 | " if chunk_fname in [\".ipynb_checkpoints\"]:\n", 566 | " continue\n", 567 | " # if os.path.exists(output_loc):\n", 568 | 
" # continue\n", 569 | " transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_chunks_dir, chunk_fname))\n", 570 | " bin_df_chunk = generate_synthetic_VesiumHD_data(transcripts_df_chunk, bin_size)\n", 571 | " bin_df_chunk['column'] = bin_df_chunk['column']*2-1\n", 572 | " bin_df_chunk['row'] = bin_df_chunk['row']*2-1\n", 573 | " bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', 2)\n", 574 | " bin_gdf_chunk = gpd.GeoDataFrame( bin_df_chunk, geometry = bin_df_chunk['geometry'])\n", 575 | " bin_gdf_chunk.to_csv(output_loc)\n", 576 | " \n", 577 | " print(f\"Successfully assigned transcripts to bins for {chunk_fname}\")" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "id": "2ae8aa8e-0a17-48ae-86ed-81a04ec203dc", 583 | "metadata": { 584 | "tags": [] 585 | }, 586 | "source": [ 587 | "### Generate ENACT pipeline cell segmentation input" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "id": "a2fc8e57-23d4-4f71-971b-6e4e1d9f0267", 593 | "metadata": {}, 594 | "source": [ 595 | "This section generates the cell_df patches required to run the ENACT pipeline. The main purpose is to create Shapely polygons that represent the cell outline." 
596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "id": "57c34d0c-029c-482f-bc27-fc39e52adf4a", 601 | "metadata": { 602 | "tags": [] 603 | }, 604 | "source": [ 605 | "#### Load cell boundary data and create cell polygons" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "id": "b140bc6d-f120-4d18-b302-844bb3b79a63", 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "import read_roi\n", 616 | "def process_roi_file(key, roi_file_path):\n", 617 | " roi_data = read_roi.read_roi_file(roi_file_path)\n", 618 | " data = roi_data[key]\n", 619 | " # Apply the scaling factor to each coordinate separately\n", 620 | " scaled_x = [x * 0.103 for x in data['x']]\n", 621 | " scaled_y = [y * 0.103 for y in data['y']]\n", 622 | " # Create the list of points using zip on the scaled coordinates\n", 623 | " points = [(x, y) for x, y in zip(scaled_x, scaled_y)]\n", 624 | " # Create and return the polygon\n", 625 | " polygon = Polygon(points)\n", 626 | " return polygon" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "id": "b5295212-8548-44a1-b15f-1234bdf28b88", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "def extract_fov_from_string(s):\n", 637 | " # Search for one or more digits in the string\n", 638 | " match = re.search(r'\\d+', s)\n", 639 | " if match:\n", 640 | " return int(match.group(0))+1 # Convert the found number to an integer\n", 641 | " else:\n", 642 | " return None # Return None if no number is found" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "id": "fef3e532-d471-4635-b743-947c402dbe35", 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "base_path = os.path.join(seqfish_dir, \"ALL_Roi\") # Change this to the path where your fov folders are stored\n", 653 | "fov_data = []\n", 654 | "\n", 655 | "for fov_folder in os.listdir(base_path):\n", 656 | " fov_folder_path = 
os.path.join(base_path, fov_folder)\n", 657 | " if os.path.isdir(fov_folder_path):\n", 658 | " # Loop through each ROI file in the fov folder\n", 659 | " for roi_file in os.listdir(fov_folder_path):\n", 660 | " if roi_file.endswith('.roi'):\n", 661 | " key = roi_file.replace('.roi', '')\n", 662 | " roi_file_path = os.path.join(fov_folder_path, roi_file)\n", 663 | " polygon = process_roi_file(key, roi_file_path)\n", 664 | " fov_data.append({\n", 665 | " 'fov': extract_fov_from_string(fov_folder),\n", 666 | " 'cell': roi_file.replace('.roi', ''),\n", 667 | " 'geometry': polygon\n", 668 | " })\n", 669 | "\n", 670 | "cell_boundary_df = pd.DataFrame(fov_data)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "id": "1f3b01b4-c042-4e70-9dd0-7ef88741b833", 676 | "metadata": { 677 | "tags": [] 678 | }, 679 | "source": [ 680 | "#### relabel cell name of polygons df to the standard name" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "id": "15c59f4d-6fce-4702-861a-176516f518b3", 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "df_sorted = cell_boundary_df.sort_values(by=['fov', 'cell'])\n", 691 | "df_sorted['cell_id'] = df_sorted.groupby('fov').cumcount() + 1\n", 692 | "df_sorted['cell_id'] = df_sorted.apply(lambda x: f\"{x['fov']}_Cell_{x['cell_id']}\", axis=1)\n", 693 | "df_sorted.to_csv(\"/home/oneai/oneai-dda-spatialtr-visiumhd_analysis/cache/seqfish/cells_df.csv\")" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "id": "8c9e51e0-b001-4b31-a6cb-d9a9c8f32eb4", 699 | "metadata": { 700 | "tags": [] 701 | }, 702 | "source": [ 703 | "#### Break cell polygons df to patches (based on fov)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "13e7bc10-1903-46ef-9042-9086b35259a5", 710 | "metadata": {}, 711 | "outputs": [ 712 | { 713 | "name": "stdout", 714 | "output_type": "stream", 715 | "text": [ 716 | "Saved patch_1.csv\n", 717 | "Saved 
patch_2.csv\n", 718 | "Saved patch_3.csv\n", 719 | "Saved patch_4.csv\n", 720 | "Saved patch_5.csv\n", 721 | "Saved patch_6.csv\n", 722 | "Saved patch_7.csv\n" 723 | ] 724 | }, 725 | { 726 | "name": "stderr", 727 | "output_type": "stream", 728 | "text": [ 729 | "/tmp/ipykernel_1563651/2577681905.py:3: FutureWarning: In a future version of pandas, a length 1 tuple will be returned when iterating over a groupby with a grouper equal to a list of length 1. Don't supply a list with a single grouper to avoid this warning.\n", 730 | " for fov, group in grouped:\n" 731 | ] 732 | } 733 | ], 734 | "source": [ 735 | "\n", 736 | "# Create a df for each patch\n", 737 | "grouped = df_sorted.groupby(['fov'])\n", 738 | "for fov, group in grouped:\n", 739 | " filename = f\"patch_{fov}.csv\"\n", 740 | " output_loc = os.path.join(cells_df_chunks_dir, filename)\n", 741 | " group.to_csv(output_loc)\n", 742 | "\n", 743 | " print(f\"Saved {filename}\")\n" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "id": "eb4bebd9-bc07-44da-a02f-28d5ddc3c1ed", 749 | "metadata": { 750 | "tags": [] 751 | }, 752 | "source": [ 753 | "### Run ENACT bin-to-cell pipeline\n", 754 | "In the configs.yaml file: \n", 755 | "\n", 756 | " Set \"analysis_name\" in the configs.yaml file to \"seqfish\".\n", 757 | " Set \"run_synthetic\" to True.\n", 758 | " Set \"bin_to_cell_method\" to one of these four: \"naive\", \"weighted_by_area\", \"weighted_by_gene\", or \"weighted_by_cluster\"\n", 759 | "\n", 760 | "Run `make run_enact`" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "id": "2a8fc042-2406-4db3-9617-7e3968ce8d28", 766 | "metadata": { 767 | "tags": [] 768 | }, 769 | "source": [ 770 | "### Evaluation of ENACT bin-to-cell results" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "id": "01ff50d0-2993-42e9-98e9-fe478c32d605", 776 | "metadata": {}, 777 | "source": [ 778 | "To evaluate and compare the four bin-to-cell methods, please first complete the step above with all 
four methods. You can also only run the methods you are interested in and change the following code accordingly." 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "id": "1ef3fb7f-cc99-4f9e-b5cc-321412b08ddb", 784 | "metadata": { 785 | "tags": [] 786 | }, 787 | "source": [ 788 | "#### Calculate precision, recall, and F1 for each bin2cell method" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "id": "4d11287c-611d-49d2-a1e6-e12c14a973f5", 794 | "metadata": {}, 795 | "source": [ 796 | "Run this section for each of the methods you have run with ENACT; change 'method' in the cell below to the one you want to evaluate." 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": null, 802 | "id": "f3c684f3-5b10-4bd2-8e1c-81d4cdb68ee4", 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [ 806 | "# Concatenate all patches of ENACT results file \n", 807 | "method = \"weighted_by_gene\" # other methods: \"naive\", \"weighted_by_area\", \"weighted_by_cluster\" \n", 808 | "directory_path = os.path.join(enact_data_dir,method,\"bin_to_cell_assign\") \n", 809 | "output_file = os.path.join(enact_data_dir,method,\"bin_to_cell_assign/merged.csv\") \n", 810 | "\n", 811 | "concatenate_csv_files(directory_path, output_file)" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "id": "4580e62f-e2f3-4d1e-9a25-c483304a119e", 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "import os\n", 822 | "import pandas as pd\n", 823 | "\n", 824 | "def concatenate_csv_files(directory_path, output_file):\n", 825 | " dataframes = []\n", 826 | "\n", 827 | " for filename in os.listdir(directory_path):\n", 828 | " if filename.endswith('.csv'):\n", 829 | " file_path = os.path.join(directory_path, filename)\n", 830 | " df = pd.read_csv(file_path)\n", 831 | " dataframes.append(df)\n", 832 | " \n", 833 | " concatenated_df = pd.concat(dataframes, ignore_index=True)\n", 834 | " concatenated_df = 
concatenated_df.drop(columns = ['Unnamed: 0.1','Unnamed: 0'])\n", 835 | " sorted_df = concatenated_df.sort_values(by='id')\n", 836 | " sorted_df.to_csv(output_file, index=False)\n", 837 | " print(f\"All CSV files have been concatenated into {output_file}\")" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "id": "263c024a-821e-4d15-9abd-d3463a8e34f1", 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "import pandas as pd\n", 848 | "import numpy as np\n", 849 | "from shapely.geometry import Polygon\n", 850 | "\n", 851 | "def calculate_metrics(ground_truth_file, generated_file, eval_file):\n", 852 | " # Load ground truth and generated data\n", 853 | " ground_truth = pd.read_csv(ground_truth_file)\n", 854 | " generated = pd.read_csv(generated_file)\n", 855 | " generated.fillna(0)\n", 856 | " # Ensure 'cell_id' is properly handled\n", 857 | " if 'id' in generated.columns:\n", 858 | " generated.rename(columns={'id': 'new_cell_name'}, inplace=True)\n", 859 | "\n", 860 | " # Merge data on 'cell_id'\n", 861 | " merged = pd.merge(\n", 862 | " ground_truth, generated, on='new_cell_name', how='outer', suffixes=('_gt', '_gen')\n", 863 | " ).fillna(0)\n", 864 | " # print(merged)\n", 865 | "\n", 866 | " # Identify common gene features\n", 867 | " gt_columns = merged.filter(like='_gt').columns\n", 868 | " gen_columns = merged.filter(like='_gen').columns\n", 869 | "\n", 870 | " common_genes = set(gt_columns).intersection(gen_columns)\n", 871 | "\n", 872 | " # Reorder columns based on common genes\n", 873 | " ordered_gt_columns = sorted(gt_columns)\n", 874 | " ordered_gen_columns = sorted(gen_columns)\n", 875 | " \n", 876 | "\n", 877 | " # Extract aligned matrices for ground truth and generated data\n", 878 | " ground_truth_aligned = merged[['new_cell_name'] + [col for col in ordered_gt_columns if col in gt_columns]].values\n", 879 | " generated_aligned = merged[['new_cell_name'] + [col for col in ordered_gen_columns if col 
in gen_columns]].values\n", 880 | " \n", 881 | " print(ground_truth_aligned)\n", 882 | " print(generated_aligned)\n", 883 | " # Ensure matrices are aligned and have the same shape\n", 884 | " if ground_truth_aligned.shape[1] != generated_aligned.shape[1]:\n", 885 | " raise ValueError(\"The aligned matrices must have the same shape!\")\n", 886 | "\n", 887 | " ground_truth_aligned = ground_truth_aligned[:, 1:] # Exclude cell_ids\n", 888 | " generated_aligned = generated_aligned[:, 1:] \n", 889 | "\n", 890 | " num_cells = (ground_truth.iloc[:, 1:] != 0).any(axis=1).sum()\n", 891 | " tp = np.sum(np.minimum(generated_aligned, ground_truth_aligned), axis=1)\n", 892 | " predicted = np.sum(generated_aligned, axis=1)\n", 893 | " actual = np.sum(ground_truth_aligned, axis=1)\n", 894 | "\n", 895 | " # Calculate precision, recall, and F1 score for each row\n", 896 | " precision = tp / predicted\n", 897 | " recall = tp / actual\n", 898 | " f1_score = 2 * (precision * recall) / (precision + recall)\n", 899 | " \n", 900 | "\n", 901 | " # Add a column called 'Method' where all rows have the same entry\n", 902 | " method_column = np.full((precision.shape[0],), 'Naive') # Replace 'YourMethodName' with the actual method name\n", 903 | "\n", 904 | " df = pd.DataFrame({\n", 905 | " 'Precision': precision,\n", 906 | " 'Recall': recall,\n", 907 | " 'F1 Score': f1_score,\n", 908 | " 'Method': method_column\n", 909 | " })\n", 910 | "\n", 911 | "\n", 912 | " df.to_csv(eval_file)\n" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "id": "db707de6-8da9-495e-82a0-69c009cf1475", 919 | "metadata": {}, 920 | "outputs": [], 921 | "source": [ 922 | "ground_truth_file = os.path.join(seqfish_dir, \"groundtruth.csv\")\n", 923 | "generated_file = os.path.join(enact_data_dir,method,\"bin_to_cell_assign/merged.csv\")\n", 924 | "eval_file = os.path.join(enact_data_dir,method,\"eval.csv\") \n", 925 | "\n", 926 | "calculate_metrics(ground_truth_file, generated_file, 
eval_file)" 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "id": "5cd9470e-8410-4510-9165-cebd466ab343", 932 | "metadata": { 933 | "tags": [] 934 | }, 935 | "source": [ 936 | "#### Create violin plots comparing four bin2cell methods" 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "id": "b78b6e7d-0e57-46fd-bddb-c701750a625b", 942 | "metadata": {}, 943 | "source": [ 944 | "The following cells would create violin plots for all four methods in order to better compare the results. You can choose to only compare the ones you have run by changing the 'file_names' list to only include those." 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": null, 950 | "id": "7dc3b4e5-798b-4d0f-a243-bc619daa6f50", 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [ 954 | "file_names = [os.path.join(enact_data_dir,\"naive/eval.csv\"), \n", 955 | " os.path.join(enact_data_dir,\"weighted_by_area/eval.csv\"), \n", 956 | " os.path.join(enact_data_dir,\"weighted_by_gene/eval.csv\"),\n", 957 | " os.path.join(enact_data_dir,\"weighted_by_cluster/eval.csv\")] # Replace with actual file paths\n", 958 | "\n", 959 | "# Read and concatenate all files\n", 960 | "df_list = [pd.read_csv(file) for file in file_names]\n", 961 | "metrics_df = pd.concat(df_list, ignore_index=True)" 962 | ] 963 | }, 964 | { 965 | "cell_type": "code", 966 | "execution_count": null, 967 | "id": "5a9f977f-6530-446a-9aa4-57d0cbbca85b", 968 | "metadata": {}, 969 | "outputs": [], 970 | "source": [ 971 | "# Visualize the distributions using violin plots\n", 972 | "sns.set(style=\"whitegrid\")\n", 973 | "\n", 974 | "# Create a figure with subplots for each metric\n", 975 | "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n", 976 | "\n", 977 | "# Precision Violin Plot\n", 978 | "sns.violinplot(x='Method', y='Precision', data=metrics_df, ax=axes[0], inner='quartile', palette='Set2')\n", 979 | "axes[0].set_title('Precision')\n", 980 | "axes[0].set_xlabel('Method')\n", 981 
| "axes[0].set_ylabel('value')\n", 982 | "axes[0].set_ylim(0.8,1)\n", 983 | "axes[0].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", 984 | "\n", 985 | "# Recall Violin Plot\n", 986 | "sns.violinplot(x='Method', y='Recall', data=metrics_df, ax=axes[1], inner='quartile', palette='Set2')\n", 987 | "axes[1].set_title('Recall')\n", 988 | "axes[1].set_xlabel('Method')\n", 989 | "axes[1].set_ylabel('value')\n", 990 | "axes[1].set_ylim(0.8,1)\n", 991 | "axes[1].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", 992 | "\n", 993 | "# F1 Score Violin Plot\n", 994 | "sns.violinplot(x='Method', y='F1 Score', data=metrics_df, ax=axes[2], inner='quartile', palette='Set2')\n", 995 | "axes[2].set_title('F1 Score')\n", 996 | "axes[2].set_xlabel('Method')\n", 997 | "axes[2].set_ylabel('value')\n", 998 | "axes[2].set_ylim(0.8,1)\n", 999 | "axes[2].tick_params(axis='x', labelsize=8) # Adjust the font size here\n", 1000 | "\n", 1001 | "plt.tight_layout()\n", 1002 | "plt.show()" 1003 | ] 1004 | } 1005 | ], 1006 | "metadata": { 1007 | "kernelspec": { 1008 | "display_name": "Python 3 (ipykernel)", 1009 | "language": "python", 1010 | "name": "python3" 1011 | }, 1012 | "language_info": { 1013 | "codemirror_mode": { 1014 | "name": "ipython", 1015 | "version": 3 1016 | }, 1017 | "file_extension": ".py", 1018 | "mimetype": "text/x-python", 1019 | "name": "python", 1020 | "nbconvert_exporter": "python", 1021 | "pygments_lexer": "ipython3", 1022 | "version": "3.10.14" 1023 | } 1024 | }, 1025 | "nbformat": 4, 1026 | "nbformat_minor": 5 1027 | } 1028 | -------------------------------------------------------------------------------- /templates/tmap_template.tmap: -------------------------------------------------------------------------------- 1 | { 2 | "compositeMode": "source-over", 3 | "filename": "", 4 | "filters": [ 5 | "Saturation", 6 | "Brightness", 7 | "Contrast" 8 | ], 9 | "layerFilters": { 10 | "0": [ 11 | { 12 | "name": "Saturation", 13 | "value": "0" 14 
| }, 15 | { 16 | "name": "Brightness", 17 | "value": "0" 18 | }, 19 | { 20 | "name": "Contrast", 21 | "value": "1" 22 | } 23 | ], 24 | "1": [ 25 | { 26 | "name": "Saturation", 27 | "value": "0" 28 | }, 29 | { 30 | "name": "Brightness", 31 | "value": "0" 32 | }, 33 | { 34 | "name": "Contrast", 35 | "value": "1" 36 | } 37 | ] 38 | }, 39 | "layerOpacities": { 40 | "0": "1", 41 | "1": "0.5" 42 | }, 43 | "layerVisibilities": { 44 | "0": true, 45 | "1": true 46 | }, 47 | "layers": [ 48 | { 49 | "name": "wsi.tif", 50 | "tileSource": "wsi.tif.dzi" 51 | }, 52 | { 53 | "name": "cells_layer.png", 54 | "tileSource": "cells_layer.png.dzi" 55 | } 56 | ], 57 | "markerFiles": [ 58 | { 59 | "autoLoad": false, 60 | "comment": "Displays the cell centroids color-coded by their cell type as predicted by Sargent.", 61 | "expectedHeader": { 62 | "X": "cell_x", 63 | "Y": "cell_y", 64 | "cb_cmap": "", 65 | "cb_col": "", 66 | "cb_gr_dict": "", 67 | "collectionItem_col": "", 68 | "collectionItem_fixed": "0", 69 | "coord_factor": "1", 70 | "gb_col": "cell_type", 71 | "gb_name": "", 72 | "opacity": "0.7", 73 | "opacity_col": "", 74 | "pie_col": "", 75 | "pie_dict": "", 76 | "scale_col": "", 77 | "scale_factor": "0.2", 78 | "shape_col": "", 79 | "shape_fixed": "disc", 80 | "shape_gr_dict": "", 81 | "tooltip_fmt": "" 82 | }, 83 | "expectedRadios": { 84 | "_no_outline": true, 85 | "cb_col": false, 86 | "cb_gr": true, 87 | "cb_gr_dict": false, 88 | "cb_gr_key": false, 89 | "cb_gr_rand": true, 90 | "collectionItem_col": false, 91 | "collectionItem_fixed": false, 92 | "opacity_check": false, 93 | "pie_check": false, 94 | "scale_check": false, 95 | "shape_col": false, 96 | "shape_fixed": true, 97 | "shape_gr": false, 98 | "shape_gr_dict": false, 99 | "shape_gr_rand": true 100 | }, 101 | "fromButton": 0, 102 | "hideSettings": true, 103 | "name": "Cell centroids", 104 | "path": "", 105 | "title": "Sargent results", 106 | "uid": "U48505" 107 | } 108 | ], 109 | "plugins": [ 110 | 
"Experiment_Data_Export", 111 | "Feature_Space", 112 | "Plot_Histogram", 113 | "Live_Region_Analysis" 114 | ], 115 | "regionFiles": [ 116 | { 117 | "path": "", 118 | "title": "Load Pathologist Annotation" 119 | } 120 | ], 121 | "regions": {}, 122 | "schemaVersion": "1.3" 123 | } --------------------------------------------------------------------------------