├── .gitattributes ├── .gitignore ├── LICENSE.md ├── README.md ├── docs ├── README.md ├── assets │ └── Figure_8_Revised.png ├── envs │ └── tutorial.yaml ├── guidelines.Rmd ├── guidelines.html └── guidelines.pdf └── workflow ├── Snakefile ├── analysis ├── R │ ├── 05_Iniq_Control_Fig_2_Analysis_Plots.R │ ├── 06_Iniq_Control_Fig_2_Analysis_Stat_Tests.R │ ├── 07_Iniq_Control_Fig_3_Analysis_Plots.R │ ├── 08_Iniq_Control_Fig_3_Analysis_Stat_Tests.R │ ├── 09_Iniq_Control_Fig_3_Analysis_Stats_Multinomial_Low_Rep.R │ ├── 10B_Iniq_Real_Datasets_Fig_4_Stat_Tests.R │ ├── 10_Iniq_Real_Datasets_Fig_4_Analysis_Plots.R │ ├── 12B_Iniq_PDAC_Fig_6_Stat_Tests.R │ ├── 12_Fig_6_PDAC_Analysis_Plots.R │ ├── 13_Iniq_Balanced_Metrics_Results_Fig_7_Analysis_Plots.R │ ├── 19_Supplementary_TI_Analysis.R │ ├── 20_Fig_6_PDAC_Reanno_Analysis_Plots.R │ ├── 21_Fig_6_PDAC_Reanno_Stats_Tests.R │ ├── 21_PBMC_perturbation_umap_plots.R │ └── knn_example.R └── python │ └── 01_Fig_7_Imbal_Metric_Analysis.ipynb ├── configs ├── config.json ├── config_control.json ├── config_control_ti_only.json ├── config_custom.json ├── config_lowcap.json ├── config_lowcap_control_like.json ├── config_lowcap_modified.json ├── config_lowcap_modified_pdac_reanno.json ├── config_pdac_comp.json ├── config_pdac_comp_reanno.json ├── config_umap.json ├── test_config.json └── test_config_lite.json ├── envs ├── analysis.yaml └── integrate.yaml ├── preprocessing └── pdac │ ├── Snakefile │ └── preprocess_env.yaml ├── scripts └── python │ ├── annotation_results.py │ ├── annotation_scoring.py │ ├── celltype_imbalance_summary.py │ ├── clustering_concordance.py │ ├── clustering_stats.py │ ├── dge_concordance_full.py │ ├── dge_concordance_stats.py │ ├── dge_ranking_concordance.py │ ├── dge_ranking_marker_subset.py │ ├── dge_ranking_per_cluster.py │ ├── downsample_summary.py │ ├── imbalance_summary.py │ ├── integrate_data.py │ ├── integrate_data_paga.py │ ├── knn_classification.py │ ├── marker_get.py │ ├── reference_annotation.py │ ├── reference_control_annotation.py │ ├── relatedness_metric.py │ ├── ti_concordance.py │ └── umap_plots.py └── src ├── R ├── liger_integrate.R ├── seurat_integrate.R └── seurat_reference_map.R └── python ├── __init__.py ├── imbalanced_clustering ├── __init__.py ├── ami.py ├── ari.py ├── utils │ ├── __init__.py │ ├── _emi_cython.pyx │ ├── avg.py │ ├── checks.py │ ├── contingency.py │ └── mi.py └── vmeasure.py ├── loaders └── rna_data.py └── utils ├── __init__.py ├── cluster_concordance.py ├── clustering.py ├── diffexp.py ├── integrate.py ├── integrate_ti.py ├── kmeans.py ├── liger_integrate.py ├── mnn.py ├── relatedness.py ├── sample.py ├── seurat_integrate.py ├── seurat_reference_mapping.py └── umap.py /.gitattributes: -------------------------------------------------------------------------------- 1 | workflow/analysis/** linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.jpeg 3 | *.pdf 4 | *.html 5 | *.svg 6 | *.tsv 7 | *.csv 8 | *.log 9 | *.out 10 | *.snakemake 11 | *.slurm* 12 | *.slrm* 13 | *.ipynb_checkpoints 14 | __pycache__ 15 | *.Rhistory 16 | *.RData 17 | *.tar.gz 18 | *.h5ad 19 | cluster.json 20 | submit.slrm 21 | submit.sh 22 | workflow/tmp 23 | resources/h5ad_files 24 | resources/test_h5ad_objs 25 | resources/references 26 | results 27 | outs 28 | logs 29 | data 30 | run_scripts.sh 31 | run_rscripts.sh 32 | *scratch* 
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## Imbalanced integration guidelines 2 | 3 | ![Example Image](assets/Figure_8_Revised.png) 4 | 5 | This folder contains an rmarkdown example for using the guidelines shown in the [Imbalanced integration manuscript](https://www.biorxiv.org/content/10.1101/2022.10.06.511156v3). The tutorial is available as a rendered rmarkdown html document (guidelines.html), which can be downloaded and viewed with a web browser. 6 | 7 | The full details of the imbalanced integration guidelines can be found in the manuscript's Results section, "Guidelines for imbalanced single-cell data integration", and in the associated Supplementary Table 2. 8 | 9 | ### Viewing the rendered markdown 10 | 11 | #### Through the html 12 | 13 | 1. Download the `guidelines.html` file 14 | 15 | 2. Open the file with a web browser (Chrome, Safari, Firefox, etc.) 16 | 17 | #### Through the pdf 18 | 19 | 1. Download the `guidelines.pdf` file and view it with a PDF viewer 20 | 21 | Please note that the pdf rendering of the code may not be well formatted, as the file was originally rendered as an html vignette 22 | 23 | ### Running the tutorial 24 | 25 | 1. Create the conda environment for the guidelines: 26 | ``` 27 | conda env create -f envs/tutorial.yaml 28 | ``` 29 | 30 | 2. Activate the conda environment: 31 | ``` 32 | conda activate iniq_guidelines 33 | ``` 34 | 35 | 3. Start R and install rmarkdown and tinytex: 36 | ``` 37 | R 38 | ``` 39 | ``` 40 | install.packages("rmarkdown", dep = TRUE) 41 | install.packages("tinytex") 42 | tinytex::install_tinytex() 43 | ``` 44 | 45 | 4. Run the rmarkdown (`guidelines.Rmd`) chunk by chunk in [RStudio](https://posit.co/downloads/) 46 | *** 47 | -------------------------------------------------------------------------------- /docs/assets/Figure_8_Revised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsmaan/Iniquitate/cb20fe1240be6cb03dd63f1151816ec9d3b70a84/docs/assets/Figure_8_Revised.png -------------------------------------------------------------------------------- /docs/envs/tutorial.yaml: -------------------------------------------------------------------------------- 1 | name: iniq_guidelines 2 | channels: 3 | - pytorch 4 | - bioconda 5 | - r 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - python>=3.7,<=3.10 10 | - scipy>=1.5.0 11 | - leidenalg>=0.8.0 12 | - umap-learn>=0.5.0 13 | - scanpy=1.8.2 14 | - anndata>=0.7.5 15 | - r-base>=4.0.5 16 | - r-seurat>=4.0.5 17 | - r-data.table>=1.14.0 18 | - r-ggplot2>=3.3.0 19 | - r-tidyverse>=1.2.1 20 | - r-reshape2>=1.4.3 21 | - r-ggthemes>=4.2.0 22 | - r-ggextra>=0.8.0 23 | - r-dotwhisker>=0.7.4 24 | - r-deldir>=1.0.2 25 | - r-ggpubr>=0.4.0 26 | - r-cowplot>=1.1.1 27 | - r-ggrepel>=0.9.1 28 | - r-rcolorbrewer>=1.1 29 | - r-ggbump>=0.1.0 30 | - bioconductor-complexheatmap<=2.9.0 31 | - r-venndiagram>=1.7.1 32 | - r-multipanelfigure>=2.1.2 33 | - r-gridextra>=2.3 34 | - r-cairo>=1.5 35 | - r-lemon>=0.4.5 36 | - r-networkd3>=0.4 37 | - r-emt>=1.2 38 | - r-rmarkdown>=2.14 39 | - bioconductor-singlecellexperiment>=1.12.0 40 | - bioconductor-scater>=1.18.0 41 | - bioconductor-batchelor>=1.6.0 42 | - bioconductor-bluster=1.4.0 43 | - r-pheatmap>=1.0.12 44 | - r-devtools 45 | - r-harmony 46 | - pip 47 | - pip: 48 | - balanced-clustering==0.1.0 --------------------------------------------------------------------------------
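As an alternative to stepping through the tutorial chunks interactively, the environment above can be created and the vignette rendered from the command line. This is a minimal sketch rather than part of the repository's documented workflow; it assumes the commands are run from the `docs/` folder and that the `iniq_guidelines` environment (which already pins `r-rmarkdown`) resolves correctly:

```
# Create and activate the tutorial environment (the name comes from envs/tutorial.yaml)
conda env create -f envs/tutorial.yaml
conda activate iniq_guidelines

# Render the vignette to HTML without opening RStudio
Rscript -e 'rmarkdown::render("guidelines.Rmd", output_format = "html_document")'
```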
/docs/guidelines.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsmaan/Iniquitate/cb20fe1240be6cb03dd63f1151816ec9d3b70a84/docs/guidelines.pdf -------------------------------------------------------------------------------- /workflow/analysis/R/12B_Iniq_PDAC_Fig_6_Stat_Tests.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(tidyverse) 3 | library(reshape2) 4 | library(ggplot2) 5 | library(ggthemes) 6 | library(ggExtra) 7 | library(ggpubr) 8 | library(dotwhisker) 9 | library(Seurat) 10 | library(SeuratDisk) 11 | library(ComplexHeatmap) 12 | library(circlize) 13 | library(RColorBrewer) 14 | library(Cairo) 15 | library(networkD3) 16 | 17 | ### Note that this analysis done without LIGER ### 18 | `%ni%` <- Negate(`%in%`) 19 | kev_palette <- c( 20 | "dodgerblue2", "#E31A1C", 21 | "green4", 22 | "#6A3D9A", 23 | "#FF7F00", 24 | "black", "gold1", 25 | "skyblue2", "#FB9A99", 26 | "palegreen2", 27 | "#CAB2D6", 28 | "#FDBF6F", 29 | "gray70", "khaki2", 30 | "maroon", "orchid1", "deeppink1", "blue1", "steelblue4", 31 | "darkturquoise", "green1", "yellow4", "yellow3", 32 | "darkorange4", "brown" 33 | ) 34 | 35 | # Load in and concatenate imbalance summary files 36 | setwd("../../../results/pdac_comp/imbalance_summaries/") 37 | imba_files <- list.files() 38 | imba_loaded <- lapply(imba_files, fread) 39 | imba_concat <- Reduce(rbind, imba_loaded) 40 | gc() 41 | 42 | # Load in and concatenate celltype summary files 43 | setwd("../celltype_imbalance_summaries") 44 | cimba_files <- list.files() 45 | cimba_loaded <- lapply(cimba_files, fread) 46 | cimba_concat <- Reduce(rbind, cimba_loaded) 47 | gc() 48 | 49 | # Load in and concatenate the clustering summary results 50 | setwd("../clustering_summaries/") 51 | clus_files <- list.files() 52 | clus_loaded <- lapply(clus_files, fread) 53 | clus_concat <- Reduce(rbind, clus_loaded) 54 | clus_concat <- clus_concat[clus_concat$Method != "liger"] 55 | gc() 56 | 57 | # Load in and concatenate clustering concordance summaries 58 | setwd("../clustering_concord_summaries/") 59 | clus_concord_files <- list.files() 60 | clus_concord_loaded <- lapply(clus_concord_files, fread) 61 | clus_concord_concat <- Reduce(rbind, clus_concord_loaded) 62 | clus_concord_concat <- clus_concord_concat[ 63 | clus_concord_concat$`Method 1` != "liger" 64 | ] 65 | clus_concord_concat <- clus_concord_concat[ 66 | clus_concord_concat$`Method 2` != "liger" 67 | ] 68 | gc() 69 | 70 | # Load in and concatenate dge concordance summaries 71 | setwd("../dge_concord_stats/") 72 | dge_files <- list.files() 73 | dge_loaded <- lapply(dge_files, fread) 74 | dge_concat <- Reduce(rbind, dge_loaded) 75 | dge_concat <- dge_concat[dge_concat$`Method 1` != "liger"] 76 | dge_concat <- dge_concat[dge_concat$`Method 2` != "liger"] 77 | gc() 78 | 79 | # Load in and concatenate dge ranking summaries, subset by marker genes 80 | setwd("../dge_ranking_stats_marker_sub") 81 | dge_rank_files <- list.files() 82 | dge_rank_loaded <- lapply(dge_rank_files, fread) 83 | dge_rank_concat <- Reduce(rbind, dge_rank_loaded) 84 | dge_rank_concat <- dge_rank_concat[dge_rank_concat$Method != "liger"] 85 | gc() 86 | 87 | # Load in markers and corresponding celltypes 88 | setwd("../marker_results/") 89 | base_marker_genes <- fread( 90 | "peng_pdac_tumor_annot_8_batch_preintegration_marker_selection.tsv" 91 | ) 92 | 93 | # Load in and concatenate knn classification summaries 94 | 
setwd("../knn_classification_reports/") 95 | knn_files <- list.files() 96 | knn_loaded <- lapply(knn_files, fread) 97 | knn_concat <- Reduce(rbind, knn_loaded) 98 | knn_concat <- knn_concat[knn_concat$Method != "liger"] 99 | gc() 100 | 101 | # Change to top level dir 102 | setwd("../../..") 103 | 104 | # Make pdac comp output dir if it doesn't exist 105 | if (!dir.exists("outs/pdac_comp/figures")) { 106 | dir.create("outs/pdac_comp/figures", recursive = TRUE) 107 | } 108 | if (!dir.exists("outs/pdac_comp/results")) { 109 | dir.create("outs/pdac_comp/results", recursive = TRUE) 110 | } 111 | 112 | ### Statistical test - downsampling results on KNN classification scores 113 | ### of given subsets/compartments 114 | imba_knn_merged <- merge( 115 | imba_concat, 116 | knn_concat, 117 | by = c( 118 | "Number of batches downsampled", 119 | "Number of celltypes downsampled", 120 | "Proportion downsampled", 121 | "Replicate" 122 | ) 123 | ) 124 | imba_knn_merged <- distinct(imba_knn_merged) 125 | 126 | # Subset for only cases where the celltype/compartment downsampled is equal to 127 | # the celltype being classified 128 | imba_knn_merged_celltype <- imba_knn_merged[ 129 | imba_knn_merged$Celltype == imba_knn_merged$`Downsampled celltypes` | 130 | imba_knn_merged$`Downsampled celltypes` %in% c("None") 131 | ] 132 | 133 | # Indicate which panels are control and which ones are ablations or downsampling 134 | imba_knn_merged_celltype$type <- ifelse( 135 | imba_knn_merged_celltype$`Number of batches downsampled` == 0, 136 | "Control", 137 | ifelse( 138 | imba_knn_merged_celltype$`Proportion downsampled` == 0, 139 | "Ablated", 140 | "Downsampled" 141 | ) 142 | ) 143 | 144 | # Indicate the separate compartments 145 | compartments <- unique(imba_knn_merged_celltype$Celltype) 146 | 147 | # Create a function to do an ANOVA test for the F1 score based on each 148 | # compartment utilized 149 | anova_compart_knn <- function( 150 | compartment, 151 | dataset 152 | ){ 153 | # Subset data for the given compartment 154 | dataset_sub <- dataset[dataset$Celltype == compartment] 155 | 156 | # Format the data columns for lm 157 | colnames(dataset_sub) <- plyr::mapvalues( 158 | colnames(dataset_sub), 159 | from = c( 160 | "F1-score", 161 | "Method", 162 | "type" 163 | ), 164 | to = c( 165 | "f1_score", 166 | "method", 167 | "type" 168 | ) 169 | ) 170 | 171 | # Fit ANOVA model 172 | model_fit <- lm( 173 | as.formula( 174 | paste0( 175 | "f1_score", 176 | "~", 177 | "method+", 178 | "type" 179 | ) 180 | ), 181 | data = dataset_sub 182 | ) 183 | anova_result <- anova(model_fit, test = "F") 184 | 185 | # Format results and return 186 | anova_result_dt <- as.data.table(anova_result, keep.rownames = TRUE) 187 | colnames(anova_result_dt)[1] <- "Covariate" 188 | anova_result_dt$compartment_name <- compartment 189 | anova_result_dt$metric <- "F1 score" 190 | anova_result_dt$last_covariate <- "type" 191 | return(anova_result_dt) 192 | } 193 | 194 | # Iterate over compartments and get the significance of ds/ablation 195 | knn_anova_comp_results <- mapply( 196 | anova_compart_knn, 197 | compartment = compartments, 198 | MoreArgs = list( 199 | dataset = imba_knn_merged_celltype 200 | ), 201 | SIMPLIFY = FALSE 202 | ) 203 | knn_anova_comp_results 204 | 205 | # Save the concatenated results and plot the ANOVA F-values for 206 | # a supplementary figure 207 | knn_anova_comp_results_concat <- Reduce(rbind, knn_anova_comp_results) 208 | fwrite( 209 | knn_anova_comp_results_concat, 210 | 
"outs/pdac_comp/results/12B_comp_specific_ds_knn_f1_score_anovas.tsv", 211 | sep = "\t", 212 | quote = FALSE, 213 | row.names = FALSE, 214 | col.names = TRUE 215 | ) 216 | 217 | knn_anova_comp_results_concat_nores <- knn_anova_comp_results_concat[ 218 | knn_anova_comp_results_concat$Covariate != "Residuals" 219 | ] 220 | 221 | f_vals <- knn_anova_comp_results_concat_nores$`F value` 222 | covars <- knn_anova_comp_results_concat_nores$Covariate 223 | comps <- knn_anova_comp_results_concat_nores$compartment_name 224 | 225 | knn_aov_comp_df <- data.frame( 226 | "Covariates" = covars, 227 | "F_values" = f_vals, 228 | "Compartment" = comps 229 | ) 230 | 231 | knn_aov_comp_df_melted <- reshape2::melt( 232 | knn_aov_comp_df, 233 | id.vars = c("Compartment", "Covariates"), 234 | measure.vars = "F_values" 235 | ) 236 | knn_aov_comp_df_melted$Covariates <- plyr::mapvalues( 237 | knn_aov_comp_df_melted$Covariates, 238 | from = c( 239 | "type", 240 | "method" 241 | ), 242 | to = c( 243 | "Unperturbed vs perturbed", 244 | "Integration method" 245 | ) 246 | ) 247 | 248 | ggplot(data = knn_aov_comp_df_melted, aes(Covariates, value)) + 249 | geom_bar( 250 | stat = "identity", 251 | position = position_dodge2(), 252 | aes( 253 | fill = Compartment 254 | ) 255 | ) + 256 | scale_fill_brewer(palette = "Set1") + 257 | theme_classic() + 258 | coord_flip () + 259 | labs(x = "Covariate", y = "ANOVA F-statistic") + 260 | theme(axis.title.x = element_text(size = 16)) + 261 | theme(axis.title.y = element_text(size = 16)) + 262 | theme(strip.text.x = element_text(size = 16)) + 263 | theme(strip.text.y = element_text(size = 16)) + 264 | theme(plot.title = element_text(size = 14)) + 265 | theme(axis.text.x = element_text(size = 16)) + 266 | theme(axis.text.y = element_text(size = 16)) + 267 | theme(legend.title = element_text(size = 16)) + 268 | theme(legend.text = element_text(size = 16)) + 269 | theme(aspect.ratio = 1) 270 | ggsave( 271 | paste0( 272 | "outs/pdac_comp/figures/", 273 | "12B_pdac_knn_aov_comp_ds_f_statistic.pdf" 274 | ), 275 | width = 12, 276 | height = 12, 277 | device = cairo_pdf 278 | ) 279 | -------------------------------------------------------------------------------- /workflow/analysis/R/21_Fig_6_PDAC_Reanno_Stats_Tests.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(tidyverse) 3 | library(reshape2) 4 | library(ggplot2) 5 | library(ggthemes) 6 | library(ggExtra) 7 | library(ggpubr) 8 | library(dotwhisker) 9 | library(Seurat) 10 | library(SeuratDisk) 11 | library(ComplexHeatmap) 12 | library(circlize) 13 | library(RColorBrewer) 14 | library(Cairo) 15 | library(networkD3) 16 | 17 | ### Note that this analysis done without LIGER ### 18 | `%ni%` <- Negate(`%in%`) 19 | kev_palette <- c( 20 | "dodgerblue2", "#E31A1C", 21 | "green4", 22 | "#6A3D9A", 23 | "#FF7F00", 24 | "black", "gold1", 25 | "skyblue2", "#FB9A99", 26 | "palegreen2", 27 | "#CAB2D6", 28 | "#FDBF6F", 29 | "gray70", "khaki2", 30 | "maroon", "orchid1", "deeppink1", "blue1", "steelblue4", 31 | "darkturquoise", "green1", "yellow4", "yellow3", 32 | "darkorange4", "brown" 33 | ) 34 | 35 | # Load in and concatenate imbalance summary files 36 | setwd("../../../results/pdac_comp_reanno/imbalance_summaries/") 37 | imba_files <- list.files() 38 | imba_loaded <- lapply(imba_files, fread) 39 | imba_concat <- Reduce(rbind, imba_loaded) 40 | gc() 41 | 42 | # Load in and concatenate celltype summary files 43 | setwd("../celltype_imbalance_summaries") 44 | cimba_files <- list.files() 
45 | cimba_loaded <- lapply(cimba_files, fread) 46 | cimba_concat <- Reduce(rbind, cimba_loaded) 47 | gc() 48 | 49 | # Load in and concatenate the clustering summary results 50 | setwd("../clustering_summaries/") 51 | clus_files <- list.files() 52 | clus_loaded <- lapply(clus_files, fread) 53 | clus_concat <- Reduce(rbind, clus_loaded) 54 | clus_concat <- clus_concat[clus_concat$Method != "liger"] 55 | gc() 56 | 57 | # Load in and concatenate clustering concordance summaries 58 | setwd("../clustering_concord_summaries/") 59 | clus_concord_files <- list.files() 60 | clus_concord_loaded <- lapply(clus_concord_files, fread) 61 | clus_concord_concat <- Reduce(rbind, clus_concord_loaded) 62 | clus_concord_concat <- clus_concord_concat[ 63 | clus_concord_concat$`Method 1` != "liger" 64 | ] 65 | clus_concord_concat <- clus_concord_concat[ 66 | clus_concord_concat$`Method 2` != "liger" 67 | ] 68 | gc() 69 | 70 | # Load in and concatenate knn classification summaries 71 | setwd("../knn_classification_reports/") 72 | knn_files <- list.files() 73 | knn_loaded <- lapply(knn_files, fread) 74 | knn_concat <- Reduce(rbind, knn_loaded) 75 | knn_concat <- knn_concat[knn_concat$Method != "liger"] 76 | gc() 77 | 78 | # Change to top level dir 79 | setwd("../../..") 80 | 81 | # Make pdac comp output dir if it doesn't exist 82 | if (!dir.exists("outs/pdac_comp_reanno/figures")) { 83 | dir.create("outs/pdac_comp_reanno/figures", recursive = TRUE) 84 | } 85 | if (!dir.exists("outs/pdac_comp_reanno/results")) { 86 | dir.create("outs/pdac_comp_reanno/results", recursive = TRUE) 87 | } 88 | 89 | ### Statistical test - downsampling results on KNN classification scores 90 | ### of given subsets/compartments 91 | imba_knn_merged <- merge( 92 | imba_concat, 93 | knn_concat, 94 | by = c( 95 | "Number of batches downsampled", 96 | "Number of celltypes downsampled", 97 | "Proportion downsampled", 98 | "Replicate" 99 | ) 100 | ) 101 | imba_knn_merged <- distinct(imba_knn_merged) 102 | 103 | # Subset for only cases where the celltype/compartment downsampled is equal to 104 | # the celltype being classified 105 | imba_knn_merged_celltype <- imba_knn_merged[ 106 | imba_knn_merged$Celltype == imba_knn_merged$`Downsampled celltypes` | 107 | imba_knn_merged$`Downsampled celltypes` %in% c("None") 108 | ] 109 | 110 | # Indicate which panels are control and which ones are ablations or downsampling 111 | imba_knn_merged_celltype$type <- ifelse( 112 | imba_knn_merged_celltype$`Number of batches downsampled` == 0, 113 | "Control", 114 | ifelse( 115 | imba_knn_merged_celltype$`Proportion downsampled` == 0, 116 | "Ablated", 117 | "Downsampled" 118 | ) 119 | ) 120 | 121 | # Indicate the separate compartments 122 | compartments <- unique(imba_knn_merged_celltype$Celltype) 123 | 124 | # Create a function to do an ANOVA test for the F1 score based on each 125 | # compartment utilized 126 | anova_compart_knn <- function( 127 | compartment, 128 | dataset 129 | ){ 130 | # Subset data for the given compartment 131 | dataset_sub <- dataset[dataset$Celltype == compartment] 132 | 133 | # Format the data columns for lm 134 | colnames(dataset_sub) <- plyr::mapvalues( 135 | colnames(dataset_sub), 136 | from = c( 137 | "F1-score", 138 | "Method", 139 | "type" 140 | ), 141 | to = c( 142 | "f1_score", 143 | "method", 144 | "type" 145 | ) 146 | ) 147 | 148 | # Fit ANOVA model 149 | model_fit <- lm( 150 | as.formula( 151 | paste0( 152 | "f1_score", 153 | "~", 154 | "method+", 155 | "type" 156 | ) 157 | ), 158 | data = dataset_sub 159 | ) 160 | 
anova_result <- anova(model_fit, test = "F") 161 | 162 | # Format results and return 163 | anova_result_dt <- as.data.table(anova_result, keep.rownames = TRUE) 164 | colnames(anova_result_dt)[1] <- "Covariate" 165 | anova_result_dt$compartment_name <- compartment 166 | anova_result_dt$metric <- "F1 score" 167 | anova_result_dt$last_covariate <- "type" 168 | return(anova_result_dt) 169 | } 170 | 171 | # Iterate over compartments and get the significance of ds/ablation 172 | knn_anova_comp_results <- mapply( 173 | anova_compart_knn, 174 | compartment = compartments, 175 | MoreArgs = list( 176 | dataset = imba_knn_merged_celltype 177 | ), 178 | SIMPLIFY = FALSE 179 | ) 180 | knn_anova_comp_results 181 | 182 | # Save the concatenated results and plot the ANOVA F-values for 183 | # a supplementary figure 184 | knn_anova_comp_results_concat <- Reduce(rbind, knn_anova_comp_results) 185 | fwrite( 186 | knn_anova_comp_results_concat, 187 | "outs/pdac_comp_reanno/results/21_comp_specific_ds_knn_f1_score_anovas.tsv", 188 | sep = "\t", 189 | quote = FALSE, 190 | row.names = FALSE, 191 | col.names = TRUE 192 | ) 193 | 194 | knn_anova_comp_results_concat_nores <- knn_anova_comp_results_concat[ 195 | knn_anova_comp_results_concat$Covariate != "Residuals" 196 | ] 197 | 198 | f_vals <- knn_anova_comp_results_concat_nores$`F value` 199 | covars <- knn_anova_comp_results_concat_nores$Covariate 200 | comps <- knn_anova_comp_results_concat_nores$compartment_name 201 | 202 | knn_aov_comp_df <- data.frame( 203 | "Covariates" = covars, 204 | "F_values" = f_vals, 205 | "Compartment" = comps 206 | ) 207 | 208 | knn_aov_comp_df_melted <- reshape2::melt( 209 | knn_aov_comp_df, 210 | id.vars = c("Compartment", "Covariates"), 211 | measure.vars = "F_values" 212 | ) 213 | knn_aov_comp_df_melted$Covariates <- plyr::mapvalues( 214 | knn_aov_comp_df_melted$Covariates, 215 | from = c( 216 | "type", 217 | "method" 218 | ), 219 | to = c( 220 | "Unperturbed vs perturbed", 221 | "Integration method" 222 | ) 223 | ) 224 | 225 | ggplot(data = knn_aov_comp_df_melted, aes(Covariates, value)) + 226 | geom_bar( 227 | stat = "identity", 228 | position = position_dodge2(), 229 | aes( 230 | fill = Compartment 231 | ) 232 | ) + 233 | scale_fill_brewer(palette = "Set1") + 234 | theme_classic() + 235 | coord_flip () + 236 | labs(x = "Covariate", y = "ANOVA F-statistic") + 237 | theme(axis.title.x = element_text(size = 16)) + 238 | theme(axis.title.y = element_text(size = 16)) + 239 | theme(strip.text.x = element_text(size = 16)) + 240 | theme(strip.text.y = element_text(size = 16)) + 241 | theme(plot.title = element_text(size = 14)) + 242 | theme(axis.text.x = element_text(size = 16)) + 243 | theme(axis.text.y = element_text(size = 16)) + 244 | theme(legend.title = element_text(size = 16)) + 245 | theme(legend.text = element_text(size = 16)) + 246 | theme(aspect.ratio = 1) 247 | ggsave( 248 | paste0( 249 | "outs/pdac_comp_reanno/figures/", 250 | "21_pdac_knn_aov_comp_ds_f_statistic.pdf" 251 | ), 252 | width = 12, 253 | height = 12, 254 | device = cairo_pdf 255 | ) 256 | -------------------------------------------------------------------------------- /workflow/analysis/R/21_PBMC_perturbation_umap_plots.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(tidyverse) 3 | library(reshape2) 4 | library(ggplot2) 5 | library(ggthemes) 6 | library(ggExtra) 7 | library(ggpubr) 8 | library(dotwhisker) 9 | library(Seurat) 10 | library(SeuratDisk) 11 | library(ComplexHeatmap) 12 | 
library(circlize) 13 | library(RColorBrewer) 14 | library(Cairo) 15 | 16 | # Helper functions 17 | `%ni%` <- Negate(`%in%`) 18 | 19 | # Change to results dir for uamp results data 20 | setwd("../../../results/umap/") 21 | 22 | # Load color palette 23 | kev_palette <- c("dodgerblue2", 24 | "#E31A1C", 25 | "green4", 26 | "#6A3D9A", 27 | "#FF7F00", 28 | "black", 29 | "gold1", 30 | "skyblue2", 31 | "#FB9A99", 32 | "palegreen2", 33 | "#CAB2D6", 34 | "#FDBF6F", 35 | "gray70", 36 | "khaki2", 37 | "maroon", 38 | "orchid1", 39 | "deeppink1", 40 | "blue1", 41 | "steelblue4", 42 | "darkturquoise", 43 | "green1", 44 | "yellow4", 45 | "yellow3", 46 | "darkorange4", 47 | "brown") 48 | 49 | ##### Analysis of PBMC 2 batch balanced data - baseline ##### 50 | 51 | # Load in the umap plot results 52 | setwd("umap_plots/") 53 | umap_files <- list.files() 54 | umap_files <- grep( 55 | ".tsv", 56 | umap_files, 57 | value = TRUE 58 | ) 59 | umap_files <- grep( 60 | "pbmc_2_batch_base_balanced", 61 | umap_files, 62 | value = TRUE 63 | ) 64 | umap_loaded <- lapply(umap_files, fread) 65 | umap_names <- str_split_fixed(umap_files, fixed(".tsv"), 2)[,1] 66 | names(umap_loaded) <- umap_names 67 | 68 | setwd("../../..") 69 | 70 | # Create directory for umap results if it doesn't exist 71 | if (!dir.exists("outs/umap/results")) { 72 | dir.create("outs/umap/results", recursive = TRUE) 73 | } 74 | if (!dir.exists("outs/umap/figures")) { 75 | dir.create("outs/umap/figures") 76 | } 77 | 78 | # Create function to loop over the umap files and return the results 79 | umap_plot <- function(df, save_prefix) { 80 | # Format celltype names 81 | df$Clustering <- plyr::mapvalues( 82 | df$Clustering, 83 | from = c( 84 | "Monocyte_CD14", 85 | "Monocyte_FCGR3A", 86 | "CD4 T cell", 87 | "CD8 T cell" 88 | ), 89 | to = c( 90 | "CD14+ Monocyte", 91 | "FCGR3A+ Monocyte", 92 | "CD4+ T cell", 93 | "CD8+ T cell" 94 | ) 95 | ) 96 | 97 | # Format batch names 98 | df$Clustering <- plyr::mapvalues( 99 | df$Clustering, 100 | from = c( 101 | "batch_1", 102 | "batch_2" 103 | ), 104 | to = c( 105 | "Batch 1", 106 | "Batch 2" 107 | ) 108 | ) 109 | 110 | unique_cluster_len <- length(unique(df$Clustering)) 111 | if (unique_cluster_len > 8) { 112 | ggplot(data = df, aes(x = `UMAP 1`, y = `UMAP 2`)) + 113 | geom_point( 114 | aes( 115 | color = factor( 116 | as.numeric(Clustering), 117 | levels = sort(as.numeric(unique(df$Clustering))) 118 | ) 119 | ), 120 | size = 0.25 121 | ) + 122 | facet_wrap( 123 | .~Subset, 124 | scales = "free" 125 | ) + 126 | labs( 127 | color = "", 128 | x = "UMAP 1", 129 | y = "UMAP 2" 130 | ) + 131 | scale_color_manual( 132 | name = "", 133 | values = kev_palette[1:unique_cluster_len] 134 | ) + 135 | guides(color = guide_legend(override.aes = list(size=2))) + 136 | theme_few() + 137 | theme(axis.title.x = element_text(size = 16)) + 138 | theme(axis.title.y = element_text(size = 16)) + 139 | theme(strip.text.x = element_text(size = 16)) + 140 | theme(plot.title = element_text(size = 14)) + 141 | theme(axis.text.x = element_text(size = 16)) + 142 | theme(axis.text.y = element_text(size = 16)) + 143 | theme(legend.title = element_text(size = 16)) + 144 | theme(legend.text = element_text(size = 16)) 145 | ggsave( 146 | paste0( 147 | "outs/umap/figures/", 148 | save_prefix, 149 | ".pdf" 150 | ), 151 | width = 16, 152 | height = 8, 153 | device = cairo_pdf 154 | ) 155 | } else { 156 | if (any(grepl("Batch", df$Clustering))) { 157 | pal = "Set1" 158 | } else { 159 | pal = "Dark2" 160 | } 161 | ggplot(data = df, aes(x = `UMAP 1`, y = 
`UMAP 2`)) + 162 | geom_point( 163 | aes( 164 | color = factor(Clustering), 165 | ), 166 | size = 0.5 167 | ) + 168 | facet_wrap( 169 | .~Subset, 170 | scales = "free" 171 | ) + 172 | labs( 173 | color = "", 174 | x = "UMAP 1", 175 | y = "UMAP 2" 176 | ) + 177 | guides(color = guide_legend(override.aes = list(size=2))) + 178 | scale_color_brewer(palette = pal) + 179 | theme_few() + 180 | theme(axis.title.x = element_text(size = 16)) + 181 | theme(axis.title.y = element_text(size = 16)) + 182 | theme(strip.text.x = element_text(size = 16)) + 183 | theme(plot.title = element_text(size = 14)) + 184 | theme(axis.text.x = element_text(size = 16)) + 185 | theme(axis.text.y = element_text(size = 16)) + 186 | theme(legend.title = element_text(size = 16)) + 187 | theme(legend.text = element_text(size = 16)) 188 | ggsave( 189 | paste0( 190 | "outs/umap/figures/", 191 | save_prefix, 192 | ".pdf" 193 | ), 194 | width = 16, 195 | height = 8, 196 | device = cairo_pdf 197 | ) 198 | } 199 | } 200 | 201 | # Iterate over the umap files and names and save the results 202 | mapply( 203 | umap_plot, 204 | df = umap_loaded, 205 | save_prefix = umap_names 206 | ) 207 | -------------------------------------------------------------------------------- /workflow/analysis/R/knn_example.R: -------------------------------------------------------------------------------- 1 | # Load the necessary libraries - please note that the analysis environment 2 | # (found in envs/analysis.yaml) should be used to run this script 3 | library(data.table) 4 | library(tidyverse) 5 | library(reshape2) 6 | library(ggplot2) 7 | library(ggthemes) 8 | library(ggExtra) 9 | library(ggpubr) 10 | library(dotwhisker) 11 | library(Seurat) 12 | library(SeuratDisk) 13 | library(ComplexHeatmap) 14 | library(circlize) 15 | library(RColorBrewer) 16 | library(Cairo) 17 | 18 | # Helper functions 19 | `%ni%` <- Negate(`%in%`) 20 | 21 | # Change to results dir for the custom data - 22 | # change this directory as necessary 23 | setwd("../../../results/custom") 24 | 25 | # Load in and concatenate imbalance summary files 26 | setwd("imbalance_summaries/") 27 | imba_files <- list.files() 28 | imba_loaded <- lapply(imba_files, fread) 29 | imba_concat <- Reduce(rbind, imba_loaded) 30 | 31 | # Load in and concatenate knn classification summaries 32 | setwd("../knn_classification_reports/") 33 | knn_files <- list.files() 34 | knn_loaded <- lapply(knn_files, fread) 35 | knn_concat <- Reduce(rbind, knn_loaded) 36 | 37 | # Change to top level dir 38 | setwd("../../..") 39 | 40 | # Create directory for output of results and figures 41 | if (!dir.exists("outs/custom/figures")) { 42 | dir.create("outs/custom/figures", recursive = TRUE) 43 | } 44 | if (!dir.exists("outs/custom/results")) { 45 | dir.create("outs/custom/results", recursive = TRUE) 46 | } 47 | 48 | ### Results of celltype downsampling and ablation on 49 | ### KNN classification scores 50 | 51 | # Merge imbalance and knn classification results together - several 52 | # other analysis can be done with this data now, but we'll only 53 | # highlight the main KNN analysis that was done in the paper 54 | imba_knn_merged <- merge( 55 | imba_concat, 56 | knn_concat, 57 | by = c( 58 | "Number of batches downsampled", 59 | "Number of celltypes downsampled", 60 | "Proportion downsampled", 61 | "Replicate" 62 | ) 63 | ) 64 | imba_knn_merged <- distinct(imba_knn_merged) 65 | 66 | # Subset for only cases where the celltype downsampled is equal to the 67 | # celltype being classified 68 | imba_knn_merged_celltype <- 
imba_knn_merged[ 69 | imba_knn_merged$Celltype == imba_knn_merged$`Downsampled celltypes` | 70 | imba_knn_merged$`Downsampled celltypes` %in% c("None") 71 | ] 72 | 73 | # Indicate which panels are control and which ones are ablations or downsampling 74 | imba_knn_merged_celltype$type <- ifelse( 75 | imba_knn_merged_celltype$`Number of batches downsampled` == 0, 76 | "Control", 77 | ifelse( 78 | imba_knn_merged_celltype$`Proportion downsampled` == 0, 79 | "Ablated", 80 | "Downsampled" 81 | ) 82 | ) 83 | 84 | # Create function to format facet labels (downsampled celltypes) and plot 85 | # the results (this figure will be in the same format as the paper) 86 | ds_celltype_labelled <- function(variable,value){ 87 | return(paste0("Cell-type affected = ", value)) 88 | } 89 | 90 | ggplot(data = imba_knn_merged_celltype, aes(x = `Method`, y = `F1-score`)) + 91 | geom_boxplot( 92 | aes( 93 | fill = factor(`type`, levels = c("Control", "Downsampled", "Ablated")), 94 | ), 95 | notch = FALSE, 96 | alpha = 0.8 97 | ) + 98 | ylim(0, 1) + 99 | facet_wrap( 100 | .~Celltype, 101 | scales = "free_x", 102 | labeller = ds_celltype_labelled, 103 | ncol = 2 104 | ) + 105 | labs( 106 | fill = "Type", 107 | x = "Method", 108 | y = "Affected celltype F1-classification score post-integration" 109 | ) + 110 | scale_fill_manual( 111 | breaks = c("Control", "Downsampled", "Ablated"), 112 | values = c("forestgreen", "darkorchid3", "firebrick2") 113 | ) + 114 | theme_few() + 115 | theme(axis.title.x = element_text(size = 16)) + 116 | theme(axis.title.y = element_text(size = 16)) + 117 | theme(strip.text.x = element_text(size = 16)) + 118 | theme(plot.title = element_text(size = 14)) + 119 | theme(axis.text.x = element_text(size = 16)) + 120 | theme(axis.text.y = element_text(size = 16)) + 121 | theme(legend.title = element_text(size = 16)) + 122 | theme(legend.text = element_text(size = 16)) 123 | # Change the name and extension of file as necessary 124 | ggsave( 125 | "outs/custom/figures/ds_ablate_allmethod_knn_f1_score.pdf", 126 | width = 12, 127 | height = 14, 128 | device = cairo_pdf 129 | ) -------------------------------------------------------------------------------- /workflow/configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pbmc_2_batch" : { 3 | "data_folder": "pbmc_2_batch", 4 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 5 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 6 | "num_batches": [0, 1], 7 | "repetitions": 10 8 | }, 9 | "pbmc_4_batch" : { 10 | "data_folder": "pbmc_4_batch", 11 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 12 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 13 | "num_batches": [0, 1, 2, 3], 14 | "repetitions": 10 15 | }, 16 | "mouse_hindbrain_10_batch": { 17 | "data_folder": "mouse_hindbrain_10_batch", 18 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 19 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 20 | "num_batches": [0, 2, 4, 6, 8], 21 | "repetitions": 10 22 | }, 23 | "peng_pdac_23_batch": { 24 | "data_folder": "peng_pdac_23_batch", 25 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 26 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 27 | "num_batches": [0, 3, 6, 9, 12, 15, 18, 21], 28 | "repetitions": 10 29 | }, 30 | "steele_pdac_17_batch": { 31 | "data_folder": "steele_pdac_17_batch", 32 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 33 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 34 | "num_batches": [0, 2, 4, 6, 8, 10, 12, 14, 16], 35 | "repetitions": 10 36 | } 37 | } 
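Each top-level entry in `config.json` above describes one dataset and the grid of downsampling perturbations applied to it: `ds_celltypes` is the number of celltypes downsampled, `ds_proportions` is the downsampling proportion applied to those celltypes (0 corresponds to full ablation, per the analysis scripts), `num_batches` is the number of batches perturbed, and `repetitions` is the number of repetitions. The short Python sketch below is purely illustrative and not part of the Snakemake workflow; it simply enumerates that grid for inspection:

```python
import json
from itertools import product

# Illustrative only: inspect the perturbation grid encoded in config.json
with open("workflow/configs/config.json") as f:
    config = json.load(f)

for dataset, params in config.items():
    combos = list(product(
        params["ds_celltypes"],    # number of celltypes downsampled
        params["ds_proportions"],  # downsampling proportion (0 = full ablation)
        params["num_batches"],     # number of batches downsampled
    ))
    print(f"{dataset}: {len(combos)} combinations x {params['repetitions']} repetitions")
```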
-------------------------------------------------------------------------------- /workflow/configs/config_control.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "control", 3 | "int_datasets": { 4 | "pbmc_2_batch_base_balanced" : { 5 | "data_folder": "pbmc_2_batch_base_balanced", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 200 10 | }, 11 | "pbmc_2_batch_hierarchical_balanced": { 12 | "data_folder": "pbmc_2_batch_hierarchical_balanced", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.1, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 200 17 | } 18 | }, 19 | "int_ti_datasets": {}, 20 | "query_to_reference": "Yes", 21 | "celltype_list": "No" 22 | } -------------------------------------------------------------------------------- /workflow/configs/config_control_ti_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "control_ti_only", 3 | "int_datasets": {}, 4 | "int_ti_datasets": { 5 | "cao_organ_dev_sublin_2_batch": { 6 | "data_folder": "cao_organ_dev_sublin_balanced_2_batch", 7 | "root_celltype": "Early_mesenchyme", 8 | "ds_celltypes": [1], 9 | "ds_proportions": [0.1, 0], 10 | "num_batches": [0, 1], 11 | "repetitions": 200 12 | } 13 | }, 14 | "query_to_reference": "No", 15 | "celltype_list": "No" 16 | } -------------------------------------------------------------------------------- /workflow/configs/config_custom.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "custom", 3 | "int_datasets": { 4 | "custom_dataset" : { 5 | "data_folder": "custom_dataset", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 200 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "lowcap", 3 | "int_datasets": { 4 | "pbmc_2_batch" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.25, 0.1, 0.05, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 50 10 | }, 11 | "pbmc_4_batch" : { 12 | "data_folder": "pbmc_4_batch", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.25, 0.1, 0.05, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 50 17 | }, 18 | "mouse_hindbrain_6_batch": { 19 | "data_folder": "mouse_hindbrain_6_batch", 20 | "ds_celltypes": [1], 21 | "ds_proportions": [0.25, 0.1, 0.05, 0], 22 | "num_batches": [0, 1], 23 | "repetitions": 50 24 | }, 25 | "peng_pdac_8_batch": { 26 | "data_folder": "peng_pdac_8_batch", 27 | "ds_celltypes": [1], 28 | "ds_proportions": [0.25, 0.1, 0.05, 0], 29 | "num_batches": [0, 1], 30 | "repetitions": 50 31 | } 32 | }, 33 | "int_ti_datasets": { 34 | "cao_organ_dev_sublin_2_batch": { 35 | "data_folder": "cao_organ_dev_sublin_2_batch", 36 | "root_celltype": "Early_mesenchyme", 37 | "ds_celltypes": [1], 38 | "ds_proportions": [0.25, 0.1, 0.05, 0], 39 | "num_batches": [0, 1], 40 | "repetitions": 50 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap_control_like.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": 
"lowcap_control_like", 3 | "int_datasets": { 4 | "pbmc_2_batch" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 50 10 | }, 11 | "pbmc_4_batch" : { 12 | "data_folder": "pbmc_4_batch", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.1, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 50 17 | }, 18 | "mouse_hindbrain_6_batch": { 19 | "data_folder": "mouse_hindbrain_6_batch", 20 | "ds_celltypes": [1], 21 | "ds_proportions": [0.1, 0], 22 | "num_batches": [0, 1], 23 | "repetitions": 50 24 | }, 25 | "peng_pdac_8_batch": { 26 | "data_folder": "peng_pdac_8_batch", 27 | "ds_celltypes": [1], 28 | "ds_proportions": [0.1, 0], 29 | "num_batches": [0, 1], 30 | "repetitions": 50 31 | } 32 | }, 33 | "int_ti_datasets": { 34 | "cao_organ_dev_sublin_2_batch": { 35 | "data_folder": "cao_organ_dev_sublin_2_batch", 36 | "root_celltype": "Early_mesenchyme", 37 | "ds_celltypes": [1], 38 | "ds_proportions": [0.1, 0], 39 | "num_batches": [0, 1], 40 | "repetitions": 50 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap_modified.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "lowcap_modified", 3 | "int_datasets": { 4 | "pbmc_2_batch" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [0], 7 | "ds_proportions": [0], 8 | "num_batches": [0], 9 | "repetitions": 50 10 | }, 11 | "pbmc_4_batch" : { 12 | "data_folder": "pbmc_4_batch", 13 | "ds_celltypes": [0], 14 | "ds_proportions": [0], 15 | "num_batches": [0], 16 | "repetitions": 50 17 | }, 18 | "mouse_hindbrain_6_batch": { 19 | "data_folder": "mouse_hindbrain_6_batch", 20 | "ds_celltypes": [0], 21 | "ds_proportions": [0], 22 | "num_batches": [0], 23 | "repetitions": 50 24 | }, 25 | "peng_pdac_8_batch": { 26 | "data_folder": "peng_pdac_tumor_annot_8_batch_granular", 27 | "ds_celltypes": [0], 28 | "ds_proportions": [0], 29 | "num_batches": [0], 30 | "repetitions": 50 31 | } 32 | }, 33 | "int_ti_datasets": {}, 34 | "query_to_reference": "No", 35 | "celltype_list": "No" 36 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap_modified_pdac_reanno.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "lowcap_modified_pdac_reanno", 3 | "int_datasets": { 4 | "peng_pdac_8_batch": { 5 | "data_folder": "peng_pdac_tumor_annot_8_batch_granular", 6 | "ds_celltypes": [0], 7 | "ds_proportions": [0], 8 | "num_batches": [0], 9 | "repetitions": 50 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_pdac_comp.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "pdac_comp", 3 | "int_datasets": { 4 | "peng_pdac_tumor_annot_8_batch": { 5 | "data_folder": "peng_pdac_tumor_annot_8_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 4], 9 | "repetitions": 50 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_pdac_comp_reanno.json: -------------------------------------------------------------------------------- 
1 | { 2 | "config_name": "pdac_comp_reanno", 3 | "int_datasets": { 4 | "peng_pdac_tumor_annot_8_reanno_batch": { 5 | "data_folder": "peng_pdac_tumor_annot_8_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 4], 9 | "repetitions": 50 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_umap.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "umap", 3 | "int_datasets": { 4 | "pbmc_2_batch_base_balanced" : { 5 | "data_folder": "pbmc_2_batch_base_balanced", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 1 10 | }, 11 | "pbmc_2_batch_hierarchical_balanced": { 12 | "data_folder": "pbmc_2_batch_hierarchical_balanced", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.1, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 1 17 | }, 18 | "pbmc_2_batch" : { 19 | "data_folder": "pbmc_2_batch", 20 | "ds_celltypes": [0], 21 | "ds_proportions": [0], 22 | "num_batches": [0], 23 | "repetitions": 1 24 | }, 25 | "pbmc_4_batch" : { 26 | "data_folder": "pbmc_4_batch", 27 | "ds_celltypes": [0], 28 | "ds_proportions": [0], 29 | "num_batches": [0], 30 | "repetitions": 1 31 | }, 32 | "mouse_hindbrain_6_batch": { 33 | "data_folder": "mouse_hindbrain_6_batch", 34 | "ds_celltypes": [0], 35 | "ds_proportions": [0], 36 | "num_batches": [0], 37 | "repetitions": 1 38 | }, 39 | "peng_pdac_8_batch": { 40 | "data_folder": "peng_pdac_tumor_annot_8_batch", 41 | "ds_celltypes": [0], 42 | "ds_proportions": [0], 43 | "num_batches": [0], 44 | "repetitions": 1 45 | } 46 | }, 47 | "int_ti_datasets": {}, 48 | "query_to_reference": "No", 49 | "celltype_list": "No" 50 | } -------------------------------------------------------------------------------- /workflow/configs/test_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pbmc_2_batch" : { 3 | "data_folder": "pbmc_2_batch", 4 | "ds_celltypes": [1, 2, 3, 4, 5, 6, 7, 8, 9], 5 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05], 6 | "num_batches": [0, 1], 7 | "repetitions": 10 8 | }, 9 | "pbmc_4_batch" : { 10 | "data_folder": "pbmc_4_batch", 11 | "ds_celltypes": [1, 2, 3, 4, 5, 6, 7], 12 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05], 13 | "num_batches": [0, 1, 2, 3], 14 | "repetitions": 10 15 | }, 16 | "mouse_hindbrain_10_batch": { 17 | "data_folder": "mouse_hindbrain_10_batch", 18 | "ds_celltypes": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 19 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05], 20 | "num_batches": [0, 2, 4, 6, 8], 21 | "repetitions": 10 22 | } 23 | } -------------------------------------------------------------------------------- /workflow/configs/test_config_lite.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "test_config_lite", 3 | "int_datasets": { 4 | "pbmc_2_batch_balanced" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0.25], 8 | "num_batches": [0, 1], 9 | "repetitions": 2 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/envs/analysis.yaml: -------------------------------------------------------------------------------- 1 | name: 
iniq_analysis 2 | channels: 3 | - pytorch 4 | - bioconda 5 | - r 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - python>=3.7,<=3.10 10 | - numpy>=1.19.0,<=1.23.5 11 | - pandas>=1.2.0 12 | - scipy>=1.5.0 13 | - seaborn>=0.11.2 14 | - plotnine>=0.8.0 15 | - leidenalg>=0.8.0 16 | - umap-learn>=0.5.0 17 | - scikit-learn=1.0.1 18 | - scanpy=1.8.2 19 | - anndata>=0.7.5 20 | - ipykernel>=6.4.0 21 | - jupyterlab>=3.2.9 22 | - notebook>=6.4.2 23 | - scvi-tools=0.14.4 24 | - pytorch=1.10.1 25 | - torchmetrics<=0.6.0 # Issue with torch loading 26 | - cudatoolkit=10.2 27 | - bbknn=1.5.1 28 | - harmonypy=0.0.5 29 | - scanorama=1.7.1 30 | - r-base>=4.0.5 31 | - r-seurat>=4.0.5 32 | - r-data.table>=1.14.0 33 | - r-ggplot2>=3.3.0 34 | - r-tidyverse>=1.2.1 35 | - r-reshape2>=1.4.3 36 | - r-ggthemes>=4.2.0 37 | - r-ggextra>=0.8.0 38 | - r-dotwhisker>=0.7.4 39 | - r-seuratdisk>=0.0.9019 40 | - r-deldir>=1.0.2 41 | - r-ggpubr>=0.4.0 42 | - r-cowplot>=1.1.1 43 | - r-ggrepel>=0.9.1 44 | - r-rcolorbrewer>=1.1 45 | - r-ggbump>=0.1.0 46 | - bioconductor-complexheatmap<=2.9.0 47 | - r-venndiagram>=1.7.1 48 | - r-multipanelfigure>=2.1.2 49 | - r-gridextra>=2.3 50 | - r-cairo>=1.5 51 | - r-lemon>=0.4.5 52 | - r-networkd3>=0.4 53 | - r-emt>=1.2 54 | - cython>=0.29.25 55 | - mkl==2024.0 # Pinning due to https://github.com/pytorch/pytorch/issues/123097 -------------------------------------------------------------------------------- /workflow/envs/integrate.yaml: -------------------------------------------------------------------------------- 1 | name: iniq_integrate 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - python>=3.7,<=3.10 9 | - matplotlib<3.7 10 | - numpy>=1.19.0,<=1.23.5 11 | - pandas>=1.2.0,<1.5.0 12 | - scipy>=1.5.0 13 | - leidenalg>=0.8.0 14 | - umap-learn>=0.5.0,<=0.5.3 15 | - mnnpy>=0.1.9.0,<=0.1.9.5 16 | - scikit-learn=1.0.1 17 | - scanpy=1.8.2 18 | - anndata=0.8.0 19 | - faiss-cpu>=1.7.0,<=1.7.3 20 | - pytorch=1.10.1 21 | - torchmetrics<=0.6.0 # Issue with torch loading 22 | - cudatoolkit=10.2 23 | - scvi-tools=0.14.4 24 | - bbknn=1.5.1 25 | - harmonypy=0.0.5 26 | - scanorama=1.7.1 27 | - r-base>=4.0.0 28 | - r-liger=0.5.0 29 | - r-seurat=4.0.6 30 | - r-seuratdisk=0.0.9019 31 | - r-data.table>=1.14.0 32 | - r-reticulate=1.24 33 | - cython>=0.29.25,<=0.29.34 34 | - r-rann=2.6.1 35 | - natsort>=7.0.0 36 | - colorcet>=3.0.0 37 | - seaborn>=0.11.0 38 | - mkl==2024.0 # Pinning due to https://github.com/pytorch/pytorch/issues/123097 39 | variables: 40 | TMPDIR: "/tmp" 41 | -------------------------------------------------------------------------------- /workflow/preprocessing/pdac/Snakefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsmaan/Iniquitate/cb20fe1240be6cb03dd63f1151816ec9d3b70a84/workflow/preprocessing/pdac/Snakefile -------------------------------------------------------------------------------- /workflow/preprocessing/pdac/preprocess_env.yaml: -------------------------------------------------------------------------------- 1 | name: iniq_pdac_preprocess 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.7,<=3.10 8 | - numpy>=1.19.0 9 | - pandas>=1.2.0 10 | - scipy>=1.5.0 11 | - leidenalg>=0.8.0 12 | - umap-learn>=0.5.0 13 | - scanpy=1.8.2 14 | - anndata>=0.7.5 15 | - pip 16 | - pip: 17 | - infercnvpy -------------------------------------------------------------------------------- 
/workflow/scripts/python/annotation_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | from sklearn.metrics import accuracy_score, balanced_accuracy_score, \ 11 | f1_score, classification_report 12 | 13 | def none_or_str(value): 14 | if value == 'None': 15 | return None 16 | return value 17 | 18 | def main(h5ad_loc, save_loc, dataset_name, ds_celltypes, ds_proportions, 19 | num_batches, rep): 20 | # Load h5ad file for query to reference mapping results 21 | adata = sc.read_h5ad(h5ad_loc) 22 | 23 | # Get the classification results as a dataframe 24 | class_results = pd.DataFrame({ 25 | "Real celltype": adata.obs["celltype"], 26 | "Predicted L1": adata.obs["predicted.celltype.l1"], 27 | "Predicted L2": adata.obs["predicted.celltype.l2"], 28 | "Control predicted L1": adata.obs["baseline.knn.l1"], 29 | "Control predicted L2": adata.obs["baseline.knn.l2"] 30 | }) 31 | 32 | # Append information on dataset to results 33 | class_results["Dataset"] = dataset_name 34 | class_results["Number of batches downsampled"] = num_batches 35 | class_results["Number of celltypes downsampled"] = ds_celltypes 36 | class_results["Proportion downsampled"] = ds_proportions 37 | class_results["Replicate"] = rep 38 | 39 | # Save results to file 40 | class_results.to_csv(save_loc, index=False, sep="\t") 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser( 44 | description = "Input and output files for annotation results summary" 45 | ) 46 | parser.add_argument( 47 | "--infile", 48 | type = str, 49 | help = "Path of Seurat annotated h5ad file" 50 | ) 51 | parser.add_argument( 52 | "--outfile", 53 | type = str, 54 | help = "Filepath for saving annotation results" 55 | ) 56 | parser.add_argument( 57 | "--dataset", 58 | type = str, 59 | help = "Name of dataset" 60 | ) 61 | parser.add_argument( 62 | "--rep", 63 | type = int, 64 | help = "Repetition number" 65 | ) 66 | parser.add_argument( 67 | "--ds_celltypes", 68 | type = int, 69 | help = "Number of celltypes to randomly downsample in given batch" 70 | ) 71 | parser.add_argument( 72 | "--ds_proportions", 73 | type = float, 74 | help = "Proportion of downsampling per celltype in a given batch" 75 | ) 76 | parser.add_argument( 77 | "--num_batches", 78 | type = int, 79 | help = "Number of batches to perform downsampling on" 80 | ) 81 | args = parser.parse_args() 82 | main( 83 | h5ad_loc = args.infile, 84 | save_loc = args.outfile, 85 | dataset_name = args.dataset, 86 | rep = args.rep, 87 | ds_celltypes = args.ds_celltypes, 88 | ds_proportions = args.ds_proportions, 89 | num_batches = args.num_batches 90 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/celltype_imbalance_summary.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import scanpy as sc 7 | 8 | def main(h5ad_loc, save_loc, dataset_name, rep): 9 | # Load h5ad file 10 | adata_full = sc.read_h5ad(h5ad_loc) 11 | 12 | # Extract data from just one integration method subset 13 | int_method_select = np.random.choice( 14 | np.unique(adata_full.obs.integration_method.__array__()) 15 | ) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = 
adata_full.uns["downsampling_stats"]["num_batches"] 19 | batches_ds = adata_full.uns["downsampling_stats"]["ds_batch_names"] 20 | num_celltypes_ds = adata_full.uns["downsampling_stats"]["num_celltypes_downsampled"] 21 | prop_ds = adata_full.uns["downsampling_stats"]["proportion_downsampled"] 22 | downsampled_celltypes = adata_full.uns["downsampling_stats"]["downsampled_celltypes"] 23 | 24 | # Format downsampled celltypes and batches to correspond to a single item 25 | if isinstance(downsampled_celltypes, str): 26 | if downsampled_celltypes == "None": 27 | downsampled_celltypes = "None" 28 | else: 29 | raise ValueError("Downsampled celltypes is a str and not 'None'") 30 | elif isinstance(downsampled_celltypes, np.ndarray): 31 | if downsampled_celltypes.shape == (1,): 32 | downsampled_celltypes = downsampled_celltypes[0] 33 | else: 34 | downsampled_celltypes = np.concatenate(downsampled_celltypes).flatten() 35 | downsampled_celltypes = ", ".join(downsampled_celltypes) 36 | else: 37 | raise TypeError("Downsampled celltypes is not a str or ndarray") 38 | 39 | if isinstance(batches_ds, str): 40 | if batches_ds == "None": 41 | batches_ds = "None" 42 | elif batches_ds == "Placeholder due to h5py bug": 43 | batches_ds = "Placeholder due to h5py bug" 44 | else: 45 | raise ValueError("Downsampled batches is a str and not 'None'") 46 | elif isinstance(batches_ds, np.ndarray): 47 | if batches_ds.shape == (1,): 48 | batches_ds = batches_ds[0] 49 | else: 50 | batches_ds = np.concatenate(batches_ds).flatten() 51 | batches_ds = ", ".join(batches_ds) 52 | else: 53 | raise TypeError("Downsampled batches is not a str or ndarray") 54 | 55 | # Subset data for only one method and split datasets by batch 56 | adata_select = adata_full[adata_full.obs.integration_method == int_method_select] 57 | adata_list = [] 58 | batches = np.unique(adata_select.obs.batch.__array__()) 59 | for batch in batches: 60 | adata_batch_select = adata_select[adata_select.obs.batch == batch] 61 | adata_list.append(adata_batch_select) 62 | 63 | # Get celltype value counts for each batch 64 | val_counts_dfs = [] 65 | for idx, adata in enumerate(adata_list): 66 | val_counts_df = pd.DataFrame(adata.obs.celltype.value_counts()) 67 | val_counts_df = val_counts_df.reset_index() 68 | val_counts_df.columns = ["celltype", "celltype_count_batch_{}".format(idx)] 69 | val_counts_dfs.append(val_counts_df) 70 | 71 | # Concatenate all celltype value counts results 72 | merge = functools.partial(pd.merge, on = ["celltype"], how = "outer") 73 | val_counts_merged = functools.reduce(merge, val_counts_dfs) 74 | 75 | # Replace NAs with 0 and add downsampling information 76 | val_counts_merged.iloc[:, 1:] = val_counts_merged.iloc[:, 1:].fillna(0) 77 | val_counts_merged["Dataset"] = dataset_name 78 | val_counts_merged["Number of batches downsampled"] = num_batches_ds 79 | val_counts_merged["Batches downsampled"] = batches_ds 80 | val_counts_merged["Number of celltypes downsampled"] = num_celltypes_ds 81 | val_counts_merged["Proportion downsampled"] = prop_ds 82 | val_counts_merged["Downsampled celltypes"] = downsampled_celltypes 83 | val_counts_merged["Replicate"] = rep 84 | val_counts_merged["Total batches"] = len(batches) 85 | val_counts_merged.to_csv(save_loc, index=False, sep="\t") 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser( 89 | description = "Input and output files for celltype imbalance summary" 90 | ) 91 | parser.add_argument( 92 | "--infile", 93 | type = str, 94 | help = "Path of integrated h5ad file" 95 | ) 96 | 
parser.add_argument( 97 | "--outfile", 98 | type = str, 99 | help = "Filepath for saving celltype imbalance statistics of h5ad file" 100 | ) 101 | parser.add_argument( 102 | "--dataset", 103 | type = str, 104 | help = "Name of dataset" 105 | ) 106 | parser.add_argument( 107 | "--rep", 108 | type = int, 109 | help = "Repetition number" 110 | ) 111 | args = parser.parse_args() 112 | main( 113 | h5ad_loc = args.infile, 114 | save_loc = args.outfile, 115 | dataset_name = args.dataset, 116 | rep = args.rep 117 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/clustering_concordance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | from utils import cluster_concordance 11 | 12 | def main(h5ad_loc, save_loc, dataset_name, rep): 13 | # Load h5ad file 14 | adata = sc.read_h5ad(h5ad_loc) 15 | 16 | # Extract summary statistics from h5ad file 17 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 18 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 19 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 20 | 21 | # Get clustering concordance results 22 | cluster_concordance_df = cluster_concordance(adata = adata) 23 | 24 | # Create cluster concordance summary df 25 | cluster_concordance_summary_df = pd.DataFrame({ 26 | "Dataset": dataset_name, 27 | "Number of batches downsampled": num_batches_ds, 28 | "Number of celltypes downsampled": num_celltypes_ds, 29 | "Proportion downsampled": prop_ds, 30 | "Replicate": rep, 31 | "Method 1": cluster_concordance_df["Method 1"].__array__(), 32 | "Method 2": cluster_concordance_df["Method 2"].__array__(), 33 | "ARI": cluster_concordance_df["ARI"].__array__(), 34 | "Median ARI": cluster_concordance_df["Median ARI"].__array__() 35 | }) 36 | 37 | # Save clustering concordance dataframe to tsv 38 | cluster_concordance_summary_df.to_csv( 39 | save_loc, 40 | index=False, 41 | sep="\t" 42 | ) 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser( 46 | description = "Input and output files for downsampling summary" 47 | ) 48 | parser.add_argument( 49 | "--infile", 50 | type = str, 51 | help = "Path of integrated h5ad file" 52 | ) 53 | parser.add_argument( 54 | "--outfile", 55 | type = str, 56 | help = "Filepath for saving clustering concordance statistics tsv" 57 | ) 58 | parser.add_argument( 59 | "--dataset", 60 | type = str, 61 | help = "Name of dataset" 62 | ) 63 | parser.add_argument( 64 | "--rep", 65 | type = int, 66 | help = "Repetition number" 67 | ) 68 | args = parser.parse_args() 69 | main( 70 | h5ad_loc = args.infile, 71 | save_loc = args.outfile, 72 | dataset_name = args.dataset, 73 | rep = args.rep 74 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/clustering_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | from sklearn import metrics 11 | 12 | from imbalanced_clustering import balanced_adjusted_rand_index, \ 13 | balanced_adjusted_mutual_info, balanced_completeness, \ 14 | balanced_homogeneity 15 | 16 | def 
main(h5ad_loc, save_loc, dataset_name, rep): 17 | # Load h5ad file 18 | adata = sc.read_h5ad(h5ad_loc) 19 | 20 | # Extract summary statistics from h5ad file 21 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 22 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 23 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 24 | 25 | # Subset h5ad based on batch-correction method used 26 | adata_method_sub = [] 27 | methods = ["harmony", "scvi", "bbknn", "scanorama", "seurat", "liger"] 28 | for method in methods: 29 | adata_sub = adata[adata.obs["integration_method"] == method] 30 | adata_method_sub.append( 31 | adata_sub 32 | ) 33 | 34 | # Get ARI, NMI, Homogeneity, Completeness values for each batch-correction method 35 | # and batch and celltype subsets. Both balanced and imbalanced subsets are considered 36 | # for the celltype data 37 | celltype_aris_imbalanced = [] 38 | celltype_amis_imbalanced = [] 39 | celltype_homs_imbalanced = [] 40 | celltype_comps_imbalanced = [] 41 | celltype_aris_balanced = [] 42 | celltype_amis_balanced = [] 43 | celltype_homs_balanced = [] 44 | celltype_comps_balanced = [] 45 | batch_aris = [] 46 | batch_amis = [] 47 | batch_homs = [] 48 | batch_comps = [] 49 | for adata_sub in adata_method_sub: 50 | celltype_aris_imbalanced.append( 51 | metrics.adjusted_rand_score( 52 | adata_sub.obs["celltype"].__array__(), 53 | adata_sub.obs["leiden"].__array__() 54 | ) 55 | ) 56 | celltype_amis_imbalanced.append( 57 | metrics.adjusted_mutual_info_score( 58 | adata_sub.obs["celltype"].__array__(), 59 | adata_sub.obs["leiden"].__array__() 60 | ) 61 | ) 62 | celltype_homs_imbalanced.append( 63 | metrics.homogeneity_score( 64 | adata_sub.obs["celltype"].__array__(), 65 | adata_sub.obs["leiden"].__array__() 66 | ) 67 | ) 68 | celltype_comps_imbalanced.append( 69 | metrics.completeness_score( 70 | adata_sub.obs["celltype"].__array__(), 71 | adata_sub.obs["leiden"].__array__() 72 | ) 73 | ) 74 | celltype_aris_balanced.append( 75 | balanced_adjusted_rand_index( 76 | adata_sub.obs["celltype"].__array__(), 77 | adata_sub.obs["leiden"].__array__(), 78 | reweigh=True 79 | ) 80 | ) 81 | celltype_amis_balanced.append( 82 | balanced_adjusted_mutual_info( 83 | adata_sub.obs["celltype"].__array__(), 84 | adata_sub.obs["leiden"].__array__(), 85 | reweigh=True 86 | ) 87 | ) 88 | celltype_homs_balanced.append( 89 | balanced_homogeneity( 90 | adata_sub.obs["celltype"].__array__(), 91 | adata_sub.obs["leiden"].__array__(), 92 | reweigh=True 93 | ) 94 | ) 95 | celltype_comps_balanced.append( 96 | balanced_completeness( 97 | adata_sub.obs["celltype"].__array__(), 98 | adata_sub.obs["leiden"].__array__(), 99 | reweigh=True 100 | ) 101 | ) 102 | batch_aris.append( 103 | 1 - metrics.adjusted_rand_score( 104 | adata_sub.obs["batch"].__array__(), 105 | adata_sub.obs["leiden"].__array__() 106 | ) 107 | ) 108 | batch_amis.append( 109 | 1 - metrics.adjusted_mutual_info_score( 110 | adata_sub.obs["batch"].__array__(), 111 | adata_sub.obs["leiden"].__array__() 112 | ) 113 | ) 114 | batch_homs.append( 115 | 1 - metrics.homogeneity_score( 116 | adata_sub.obs["batch"].__array__(), 117 | adata_sub.obs["leiden"].__array__() 118 | ) 119 | ) 120 | batch_comps.append( 121 | 1 - metrics.completeness_score( 122 | adata_sub.obs["batch"].__array__(), 123 | adata_sub.obs["leiden"].__array__() 124 | ) 125 | ) 126 | 127 | # Get number of clusters per method 128 | cluster_nums = [] 129 | for adata_sub in adata_method_sub: 130 | cluster_nums.append( 131 | 
len(np.unique(adata_sub.obs["leiden"].__array__())) 132 | ) 133 | 134 | # Get number of cells per method 135 | cell_nums = [] 136 | for adata_sub in adata_method_sub: 137 | cell_nums.append( 138 | adata_sub.n_obs 139 | ) 140 | 141 | # Create summary dataframe for clustering statistics 142 | cluster_summary_df = pd.DataFrame({ 143 | "Dataset": dataset_name, 144 | "Number of batches downsampled": num_batches_ds, 145 | "Number of celltypes downsampled": num_celltypes_ds, 146 | "Proportion downsampled": prop_ds, 147 | "Replicate": rep, 148 | "Method": methods, 149 | "Cluster number": cluster_nums, 150 | "Cell number": cell_nums, 151 | "Celltype ARI Imbalanced": celltype_aris_imbalanced, 152 | "Celltype AMI Imbalanced": celltype_amis_imbalanced, 153 | "Celltype Homogeneity Imbalanced": celltype_homs_imbalanced, 154 | "Celltype Completeness Imbalanced": celltype_comps_imbalanced, 155 | "Celltype ARI Balanced": celltype_aris_balanced, 156 | "Celltype AMI Balanced": celltype_amis_balanced, 157 | "Celltype Homogeneity Balanced": celltype_homs_balanced, 158 | "Celltype Completeness Balanced": celltype_comps_balanced, 159 | "Batch ARI": batch_aris, 160 | "Batch AMI": batch_amis, 161 | "Batch Homogeneity": batch_homs, 162 | "Batch Completeness": batch_comps 163 | }) 164 | 165 | # Save clustering summary dataframe to tsv 166 | cluster_summary_df.to_csv( 167 | save_loc, 168 | index=False, 169 | sep="\t" 170 | ) 171 | 172 | if __name__ == '__main__': 173 | parser = argparse.ArgumentParser( 174 | description = "Input and output files for clustering results summary" 175 | ) 176 | parser.add_argument( 177 | "--infile", 178 | type = str, 179 | help = "Path of integrated h5ad file" 180 | ) 181 | parser.add_argument( 182 | "--outfile", 183 | type = str, 184 | help = "Filepath for saving clustering results of integrated h5ad file" 185 | ) 186 | parser.add_argument( 187 | "--dataset", 188 | type = str, 189 | help = "Name of dataset" 190 | ) 191 | parser.add_argument( 192 | "--rep", 193 | type = int, 194 | help = "Repetition number" 195 | ) 196 | args = parser.parse_args() 197 | main( 198 | h5ad_loc = args.infile, 199 | save_loc = args.outfile, 200 | dataset_name = args.dataset, 201 | rep = args.rep 202 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_concordance_full.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import dge_top_n, diffexp 12 | 13 | def main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | k_initial = adata.uns["kmeans_stats"]["kmeans_initial_k"] 22 | k_final = adata.uns["kmeans_stats"]["kmeans_final_k"] 23 | 24 | # Check if k_final is 1 and if so, skip DGE 25 | if k_final == 1: 26 | # Create and save summary dataframe for DGE results 27 | dge_summary_df = pd.DataFrame( 28 | { 29 | "Dataset": dataset_name, 30 | "Number of batches downsampled": num_batches_ds, 31 | "Number of celltypes downsampled": num_celltypes_ds, 32 | "Proportion downsampled": prop_ds, 33 | 
"Replicate": rep, 34 | "Cluster number before convergence": k_initial, 35 | "Cluster number after convergence": k_final, 36 | "Method": "NA", 37 | "Cluster": "NA", 38 | "Differentially expressed genes": "NA - k_final = 1" 39 | }, 40 | index = [0] 41 | ) 42 | dge_summary_df.to_csv(save_loc, sep = "\t", index = False) 43 | else: 44 | # Subset adatas based on method for integration and store lognorm counts in raw 45 | # attribute for diffexp testing 46 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] 47 | method_adatas = [] 48 | for method in methods: 49 | adata_copy = adata.copy() 50 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 51 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 52 | sc.pp.normalize_total( 53 | adata_subset, 54 | target_sum = 1e4 55 | ) 56 | sc.pp.log1p(adata_subset) 57 | adata_subset.raw = adata_subset # Freeze for DGE test - lognorm counts 58 | method_adatas.append(adata_subset) 59 | 60 | # Extract top 50 DGEs for each cluster in each method 61 | method_dge_dfs = [] 62 | for adata_method_subset in method_adatas: 63 | adata_method_subset = diffexp( 64 | adata_method_subset, 65 | groupby = "kmeans_faiss", 66 | use_raw = True, 67 | method = "wilcoxon" 68 | ) 69 | dge_results = dge_top_n( 70 | adata_method_subset, 71 | n = 50, 72 | obs_group = "kmeans_faiss" 73 | ) 74 | method_dge_dfs.append(dge_results) 75 | 76 | # Concatenate DGE results from each method 77 | method_dge_dfs_concat = pd.concat(method_dge_dfs, axis = 0) 78 | 79 | # Create long form array for methods 80 | methods_long = np.repeat(np.array(methods), 50*k_final) 81 | 82 | # Create and save summary dataframe for DGE results 83 | dge_summary_df = pd.DataFrame({ 84 | "Dataset": dataset_name, 85 | "Number of batches downsampled": num_batches_ds, 86 | "Number of celltypes downsampled": num_celltypes_ds, 87 | "Proportion downsampled": prop_ds, 88 | "Replicate": rep, 89 | "Cluster number before convergence": k_initial, 90 | "Cluster number after convergence": k_final, 91 | "Method": methods_long, 92 | "Cluster": method_dge_dfs_concat["Cluster"].__array__(), 93 | "Differentially expressed genes": method_dge_dfs_concat["Top 50 DGEs"].__array__() 94 | }) 95 | dge_summary_df.to_csv(save_loc, sep = "\t", index = False) 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser( 99 | description = "Input and output files for dge concordance summary" 100 | ) 101 | parser.add_argument( 102 | "--infile", 103 | type = str, 104 | help = "Path of integrated h5ad file" 105 | ) 106 | parser.add_argument( 107 | "--outfile", 108 | type = str, 109 | help = "Filepath for saving dge concordance statistics of integrated h5ad file" 110 | ) 111 | parser.add_argument( 112 | "--dataset", 113 | type = str, 114 | help = "Name of dataset" 115 | ) 116 | parser.add_argument( 117 | "--rep", 118 | type = int, 119 | help = "Repetition number" 120 | ) 121 | args = parser.parse_args() 122 | main( 123 | h5ad_loc = args.infile, 124 | save_loc = args.outfile, 125 | dataset_name = args.dataset, 126 | rep = args.rep 127 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_concordance_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import dge_top_n, diffexp 12 | 13 | def 
main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | k_initial = adata.uns["kmeans_stats"]["kmeans_initial_k"] 22 | k_final = adata.uns["kmeans_stats"]["kmeans_final_k"] 23 | 24 | # Check if k_final is 1 and if so, skip DGE 25 | if k_final == 1: 26 | # Create and save summary dataframe for DGE results 27 | dge_int_summary_df = pd.DataFrame({ 28 | "Dataset": dataset_name, 29 | "Batches downsampled": num_batches_ds, 30 | "Number of celltypes downsampled": num_celltypes_ds, 31 | "Proportion downsampled": prop_ds, 32 | "Replicate": rep, 33 | "Cluster number before convergence": k_initial, 34 | "Cluster number after convergence": k_final, 35 | "Method 1": "NA", 36 | "Method 2": "NA", 37 | "DGE Set Intersection Ratio": "NA", 38 | "Median DGE Set Intersection Ratio": "NA" 39 | }, 40 | index = [0] 41 | ) 42 | dge_int_summary_df.to_csv(save_loc, sep = "\t", index = False) 43 | else: 44 | # Subset adatas based on method for integration and store lognorm counts in raw 45 | # attribute for diffexp testing 46 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] 47 | method_adatas = [] 48 | for method in methods: 49 | adata_copy = adata.copy() 50 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 51 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 52 | sc.pp.normalize_total( 53 | adata_subset, 54 | target_sum = 1e4 55 | ) 56 | sc.pp.log1p(adata_subset) 57 | adata_subset.raw = adata_subset # Freeze for DGE test 58 | method_adatas.append(adata_subset) 59 | 60 | # Extract top 50 DGEs for each cluster in each method 61 | method_dge_dfs = [] 62 | for adata_method_subset in method_adatas: 63 | adata_method_subset = diffexp( 64 | adata_method_subset, 65 | groupby = "kmeans_faiss", 66 | use_raw = True, 67 | method = "wilcoxon" 68 | ) 69 | dge_results = dge_top_n( 70 | adata_method_subset, 71 | n = 50, 72 | obs_group = "kmeans_faiss" 73 | ) 74 | method_dge_dfs.append(dge_results) 75 | 76 | # Concatenate DGE results from each method 77 | method_dge_dfs_concat = pd.concat(method_dge_dfs, axis = 0) 78 | 79 | # Create long form array for methods 80 | methods_long = np.repeat(np.array(methods), 50*k_final) 81 | 82 | # Create summary dataframe for DGE results 83 | dge_summary_df = pd.DataFrame({ 84 | "Dataset": dataset_name, 85 | "Number of batches downsampled": num_batches_ds, 86 | "Number of celltypes downsampled": num_celltypes_ds, 87 | "Proportion downsampled": prop_ds, 88 | "Replicate": rep, 89 | "Cluster number before convergence": k_initial, 90 | "Cluster number after convergence": k_final, 91 | "Method": methods_long, 92 | "Cluster": method_dge_dfs_concat["Cluster"].__array__(), 93 | "Differentially expressed genes": method_dge_dfs_concat["Top 50 DGEs"].__array__() 94 | }) 95 | 96 | # Determine DGE concordance through set intersection in a pairwise manner 97 | method_concordance_mat = np.zeros((len(methods), len(methods))) 98 | for i, method_i in enumerate(methods): 99 | for j, method_j in enumerate(methods): 100 | dge_sub_1 = dge_summary_df[dge_summary_df["Method"] == method_i] 101 | dge_sub_2 = dge_summary_df[dge_summary_df["Method"] == method_j] 102 | dge_sub_1_genes = dge_sub_1["Differentially expressed genes"].values 
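For intuition on the pairwise loop here: the set-intersection ratio computed just below is normalized by the number of unique genes from the first method, so the resulting concordance matrix is not symmetric, and self-comparisons are dropped before the median is taken further down. A toy sketch with made-up gene symbols:

```python
import numpy as np

# Hypothetical top-DGE lists for two integration methods (gene symbols are illustrative)
genes_method_1 = np.array(["CD3D", "CD3E", "NKG7", "LYZ"])
genes_method_2 = np.array(["CD3D", "CD3E", "MS4A1"])

shared = np.intersect1d(genes_method_1, genes_method_2)  # CD3D, CD3E

# Normalized by the first method's unique genes -> asymmetric ratios
ratio_1_vs_2 = len(shared) / len(np.unique(genes_method_1))  # 2 / 4 = 0.50
ratio_2_vs_1 = len(shared) / len(np.unique(genes_method_2))  # 2 / 3 ≈ 0.67
```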
103 | dge_sub_2_genes = dge_sub_2["Differentially expressed genes"].values 104 | set_int = np.intersect1d(dge_sub_1_genes, dge_sub_2_genes) 105 | int_ratio = len(set_int)/len(np.unique(dge_sub_1_genes)) 106 | method_concordance_mat[i, j] = int_ratio 107 | 108 | 109 | # Create dataframe of values 110 | method_int_df = pd.DataFrame(method_concordance_mat) 111 | method_int_df.index = methods 112 | method_int_df.columns = methods 113 | 114 | # Convert to long format 115 | method_int_df_long = method_int_df.melt(ignore_index = False) 116 | method_int_df_long = method_int_df_long.reset_index() 117 | method_int_df_long.columns = ["Method 1", "Method 2", "DGE Set Intersection Ratio"] 118 | 119 | # Get median of DGE concordance 120 | method_int_df_no_self = method_int_df_long[method_int_df_long["Method 1"] != method_int_df_long["Method 2"]] 121 | median_set_int_ratio = np.median(method_int_df_no_self["DGE Set Intersection Ratio"]) 122 | method_int_df_long["Median DGE Set Intersection Ratio"] = median_set_int_ratio 123 | 124 | # Create and save summary dataframe for DGE intersection results 125 | dge_int_summary_df = pd.DataFrame({ 126 | "Dataset": dataset_name, 127 | "Batches downsampled": num_batches_ds, 128 | "Number of celltypes downsampled": num_celltypes_ds, 129 | "Proportion downsampled": prop_ds, 130 | "Replicate": rep, 131 | "Cluster number before convergence": k_initial, 132 | "Cluster number after convergence": k_final, 133 | "Method 1": method_int_df_long["Method 1"].__array__(), 134 | "Method 2": method_int_df_long["Method 2"].__array__(), 135 | "DGE Set Intersection Ratio": method_int_df_long["DGE Set Intersection Ratio"].__array__(), 136 | "Median DGE Set Intersection Ratio": method_int_df_long["Median DGE Set Intersection Ratio"].__array__() 137 | }) 138 | dge_int_summary_df.to_csv(save_loc, sep = "\t", index = False) 139 | 140 | 141 | if __name__ == '__main__': 142 | parser = argparse.ArgumentParser( 143 | description = "Input and output files for dge concordance summary" 144 | ) 145 | parser.add_argument( 146 | "--infile", 147 | type = str, 148 | help = "Path of integrated h5ad file" 149 | ) 150 | parser.add_argument( 151 | "--outfile", 152 | type = str, 153 | help = "Filepath for saving dge concordance statistics of integrated h5ad file" 154 | ) 155 | parser.add_argument( 156 | "--dataset", 157 | type = str, 158 | help = "Name of dataset" 159 | ) 160 | parser.add_argument( 161 | "--rep", 162 | type = int, 163 | help = "Repetition number" 164 | ) 165 | args = parser.parse_args() 166 | main( 167 | h5ad_loc = args.infile, 168 | save_loc = args.outfile, 169 | dataset_name = args.dataset, 170 | rep = args.rep 171 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_ranking_concordance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import diffexp, dge_top_n 12 | 13 | def main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | 22 | # Subset adatas based on 
method for integration and store lognorm counts in raw 23 | # attribute for diffexp testing 24 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger", "bbknn"] 25 | method_adatas = [] 26 | for method in methods: 27 | adata_copy = adata.copy() 28 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 29 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 30 | sc.pp.normalize_total( 31 | adata_subset, 32 | target_sum = 1e4 33 | ) 34 | sc.pp.log1p(adata_subset) 35 | adata_subset.raw = adata_subset # Freeze for DGE test 36 | method_adatas.append(adata_subset) 37 | 38 | # Extract list of all DGEs for all leiden clusters in each method 39 | method_dge_dfs = [] 40 | for adata_method_subset in method_adatas: 41 | adata_method_subset = diffexp( 42 | adata_method_subset, 43 | groupby = "leiden", 44 | use_raw = True, 45 | method = "wilcoxon" 46 | ) 47 | dge_results = dge_top_n( 48 | adata_method_subset, 49 | n = len(adata.var), 50 | obs_group = "leiden" 51 | ) 52 | method_dge_dfs.append(dge_results) 53 | 54 | # For each method, compute the ranking metrics for all genes in the dataset 55 | all_genes = np.sort(adata.var.index.values) 56 | all_genes_tiled = np.tile(all_genes, (len(methods))) 57 | methods_repeat = np.repeat(methods, len(all_genes)) 58 | gene_max_imp_per_method = [] 59 | gene_min_imp_per_method = [] 60 | for method_dge_df in method_dge_dfs: 61 | method_clusters = np.unique(method_dge_df["Cluster"].__array__()) 62 | cluster_ranks = [] 63 | for cluster in method_clusters: 64 | cluster_sub = method_dge_df[method_dge_df["Cluster"] == cluster] 65 | gene_ranks_sorted = np.argsort(cluster_sub.iloc[:, 1].__array__()) 66 | cluster_ranks.append(gene_ranks_sorted) 67 | cluster_ranks_stack = np.stack(cluster_ranks, axis = 0) 68 | # Min for max because lowest number for ranking corresponds to highest importance 69 | gene_max_imp_per_method.append( 70 | np.min(cluster_ranks_stack, axis = 0) 71 | ) 72 | gene_min_imp_per_method.append( 73 | np.max(cluster_ranks_stack, axis = 0) 74 | ) 75 | 76 | # Concatenate max and min ranks for each method 77 | gene_max_imp_per_method_concat = np.concatenate(gene_max_imp_per_method) 78 | gene_min_imp_per_method_concat = np.concatenate(gene_min_imp_per_method) 79 | 80 | # Create summary df of all genes and their ranking metrics 81 | dge_ranking_summary_df = pd.DataFrame({ 82 | "Dataset": dataset_name, 83 | "Number of batches downsampled": num_batches_ds, 84 | "Number of celltypes downsampled": num_celltypes_ds, 85 | "Proportion downsampled": prop_ds, 86 | "Replicate": rep, 87 | "Method": methods_repeat, 88 | "Gene": all_genes_tiled, 89 | "Max rank": gene_max_imp_per_method_concat, 90 | "Min rank": gene_min_imp_per_method_concat 91 | }) 92 | dge_ranking_summary_df.to_csv(save_loc, sep = "\t", index = False) 93 | 94 | if __name__ == '__main__': 95 | parser = argparse.ArgumentParser( 96 | description = "Input and output files for dge ranking summary" 97 | ) 98 | parser.add_argument( 99 | "--infile", 100 | type = str, 101 | help = "Path of integrated h5ad file" 102 | ) 103 | parser.add_argument( 104 | "--outfile", 105 | type = str, 106 | help = "Filepath for saving dge ranking statistics of integrated h5ad file" 107 | ) 108 | parser.add_argument( 109 | "--dataset", 110 | type = str, 111 | help = "Name of dataset" 112 | ) 113 | parser.add_argument( 114 | "--rep", 115 | type = int, 116 | help = "Repetition number" 117 | ) 118 | args = parser.parse_args() 119 | main( 120 | h5ad_loc = args.infile, 121 | save_loc = args.outfile, 
122 | dataset_name = args.dataset, 123 | rep = args.rep 124 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_ranking_marker_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import pandas as pd 4 | 5 | def main(dge_rank_file_loc, marker_file_loc, save_loc): 6 | # Read in the dge rank file and marker file 7 | dge_rank_df = pd.read_csv(dge_rank_file_loc, sep = "\t") 8 | marker_df = pd.read_csv(marker_file_loc, sep = "\t") 9 | 10 | # Subset the dge rank df by the markers in the marker df 11 | dataset_markers = marker_df["Top 10 marker genes (union across batches)"].__array__() 12 | dge_rank_df_marker_sub = dge_rank_df[dge_rank_df["Gene"].isin(dataset_markers)] 13 | 14 | # Save the marker subset dge rank df 15 | dge_rank_df_marker_sub.to_csv(save_loc, sep = "\t", index = False) 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser( 19 | description = "Input and output files for dge concordance summary" 20 | ) 21 | parser.add_argument( 22 | "--infile_dge_rank", 23 | type = str, 24 | help = "Path of dge rank file for given dataset" 25 | ) 26 | parser.add_argument( 27 | "--infile_marker", 28 | type = str, 29 | help = "Path of marker gene file for given dataset" 30 | ) 31 | parser.add_argument( 32 | "--outfile", 33 | type = str, 34 | help = "Filepath for saving marker gene subset dge rank file" 35 | ) 36 | args = parser.parse_args() 37 | main( 38 | dge_rank_file_loc=args.infile_dge_rank, 39 | marker_file_loc=args.infile_marker, 40 | save_loc=args.outfile 41 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_ranking_per_cluster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import diffexp, dge_top_n 12 | 13 | def main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | 22 | # Subset adatas based on method for integration and store lognorm counts in raw 23 | # attribute for diffexp testing 24 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger", "bbknn"] 25 | method_adatas = [] 26 | for method in methods: 27 | adata_copy = adata.copy() 28 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 29 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 30 | sc.pp.normalize_total( 31 | adata_subset, 32 | target_sum = 1e4 33 | ) 34 | sc.pp.log1p(adata_subset) 35 | adata_subset.raw = adata_subset # Freeze for DGE test 36 | method_adatas.append(adata_subset) 37 | 38 | # Extract list of all DGEs for all leiden clusters in each method 39 | method_dge_dfs = [] 40 | for adata_method_subset in method_adatas: 41 | adata_method_subset = diffexp( 42 | adata_method_subset, 43 | groupby = "leiden", 44 | use_raw = True, 45 | method = "wilcoxon" 46 | ) 47 | dge_results = dge_top_n( 48 | adata_method_subset, 49 | n = len(adata.var), 50 | obs_group = "leiden" 51 | ) 52 | 
method_dge_dfs.append(dge_results) 53 | 54 | # For each method, compute the ranking metrics for all genes in the dataset 55 | # based on each cluster - extract the top 50 dges for each cluster across subsets 56 | method_adata_result_dfs = [] 57 | for method_adata, method_dge_df in zip(method_adatas, method_dge_dfs): 58 | method_clusters = np.unique(method_dge_df["Cluster"].__array__()) 59 | method_name = np.unique(method_adata.obs["integration_method"].__array__()) 60 | cluster_ranks = [] 61 | cluster_celltype = [] 62 | cluster_number = [] 63 | for cluster in method_clusters: 64 | cluster_celltype_unique = np.unique( 65 | method_adata.obs["celltype"][method_adata.obs["leiden"] == cluster], 66 | return_counts = True 67 | ) 68 | celltype_most_prev = cluster_celltype_unique[0][ 69 | np.argmax(cluster_celltype_unique[1]) 70 | ] 71 | cluster_sub = method_dge_df[method_dge_df["Cluster"] == cluster] 72 | genes_top_50 = cluster_sub.iloc[:, 1].__array__()[0:50] 73 | cluster_ranks.append(genes_top_50) 74 | cluster_celltype.append(np.repeat(celltype_most_prev, 50)) 75 | cluster_number.append(np.repeat(cluster, 50)) 76 | cluster_ranks_full = np.concatenate(cluster_ranks) 77 | cluster_celltypes_full = np.concatenate(cluster_celltype) 78 | cluster_numbers_full = np.concatenate(cluster_number) 79 | method_adata_result = pd.DataFrame({ 80 | "Top 50 cluster markers (ordered)": cluster_ranks_full, 81 | "Cluster celltype (majority)": cluster_celltypes_full, 82 | "Cluster number": cluster_numbers_full 83 | }) 84 | method_adata_result["Method"] = method_name[0] 85 | method_adata_result_dfs.append(method_adata_result) 86 | 87 | # Concatenate all results into one dataframe 88 | method_adata_result_df = pd.concat(method_adata_result_dfs) 89 | 90 | # Add all of the summary statistics to the dataframe and save 91 | method_adata_result_df["Dataset"] = dataset_name 92 | method_adata_result_df["Replicate"] = rep 93 | method_adata_result_df["Number of batches downsampled"] = num_batches_ds 94 | method_adata_result_df["Number of celltypes downsampled"] = num_celltypes_ds 95 | method_adata_result_df["Proportion downsampled"] = prop_ds 96 | 97 | method_adata_result_df.to_csv(save_loc, sep = "\t", index = False) 98 | 99 | if __name__ == '__main__': 100 | parser = argparse.ArgumentParser( 101 | description = "Input and output files for dge ranking summary per cluster" 102 | ) 103 | parser.add_argument( 104 | "--infile", 105 | type = str, 106 | help = "Path of integrated h5ad file" 107 | ) 108 | parser.add_argument( 109 | "--outfile", 110 | type = str, 111 | help = "Filepath for saving dge ranking statistics per cluster of integrated h5ad file" 112 | ) 113 | parser.add_argument( 114 | "--dataset", 115 | type = str, 116 | help = "Name of dataset" 117 | ) 118 | parser.add_argument( 119 | "--rep", 120 | type = int, 121 | help = "Repetition number" 122 | ) 123 | args = parser.parse_args() 124 | main( 125 | h5ad_loc = args.infile, 126 | save_loc = args.outfile, 127 | dataset_name = args.dataset, 128 | rep = args.rep 129 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/downsample_summary.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import anndata as ann 8 | import scanpy as sc 9 | 10 | def main(h5ad_loc, save_loc, dataset_name, rep): 11 | # Load h5ad file 12 | adata = sc.read_h5ad(h5ad_loc) 13 | 14 | # Extract summary statistics 
from h5ad file 15 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 16 | batches_ds = adata.uns["downsampling_stats"]["ds_batch_names"] 17 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 18 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 19 | downsampled_celltypes = adata.uns["downsampling_stats"]["downsampled_celltypes"] 20 | 21 | # Format downsampled celltypes and batches to correspond to a single item 22 | if isinstance(downsampled_celltypes, str): 23 | if downsampled_celltypes == "None": 24 | downsampled_celltypes = "None" 25 | else: 26 | raise ValueError("Downsampled celltypes is a str and not 'None'") 27 | elif isinstance(downsampled_celltypes, np.ndarray): 28 | if downsampled_celltypes.shape == (1,): 29 | downsampled_celltypes = downsampled_celltypes[0] 30 | else: 31 | downsampled_celltypes = np.concatenate(downsampled_celltypes).flatten() 32 | downsampled_celltypes = ", ".join(downsampled_celltypes) 33 | else: 34 | raise TypeError("Downsampled celltypes is not a str or ndarray") 35 | 36 | if isinstance(batches_ds, str): 37 | if batches_ds == "None": 38 | batches_ds = "None" 39 | elif batches_ds == "Placeholder due to h5py bug": 40 | batches_ds = "Placeholder due to h5py bug" 41 | else: 42 | raise ValueError("Downsampled batches is a str and not 'None'") 43 | elif isinstance(batches_ds, np.ndarray): 44 | if batches_ds.shape == (1,): 45 | batches_ds = batches_ds[0] 46 | else: 47 | batches_ds = np.concatenate(batches_ds).flatten() 48 | batches_ds = ", ".join(batches_ds) 49 | else: 50 | raise TypeError("Downsampled batches is not a str or ndarray") 51 | 52 | # Extract data from just one integration method subset - for getting unique batches 53 | int_method_select = np.random.choice( 54 | np.unique(adata.obs.integration_method.__array__()) 55 | ) 56 | adata_select = adata[adata.obs.integration_method == int_method_select] 57 | 58 | # Create downsampling summary df 59 | ds_summary_df = pd.DataFrame( 60 | { 61 | "Dataset": dataset_name, 62 | "Number of batches downsampled": num_batches_ds, 63 | "Batches downsampled": batches_ds, 64 | "Number of celltypes downsampled": num_celltypes_ds, 65 | "Proportion downsampled": prop_ds, 66 | "Downsampled celltypes": downsampled_celltypes, 67 | "Replicate": rep, 68 | "Total batches": len(np.unique(adata_select.obs["batch"])) 69 | }, 70 | index = [0] 71 | ) 72 | ds_summary_df.to_csv(save_loc, index=False, sep="\t") 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser( 76 | description = "Input and output files for downsampling summary" 77 | ) 78 | parser.add_argument( 79 | "--infile", 80 | type = str, 81 | help = "Path of integrated h5ad file" 82 | ) 83 | parser.add_argument( 84 | "--outfile", 85 | type = str, 86 | help = "Filepath for saving downsampling statistics of integrated h5ad file" 87 | ) 88 | parser.add_argument( 89 | "--dataset", 90 | type = str, 91 | help = "Name of dataset" 92 | ) 93 | parser.add_argument( 94 | "--rep", 95 | type = int, 96 | help = "Repetition number" 97 | ) 98 | args = parser.parse_args() 99 | main( 100 | h5ad_loc = args.infile, 101 | save_loc = args.outfile, 102 | dataset_name = args.dataset, 103 | rep = args.rep 104 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/imbalance_summary.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import scipy.spatial as sp 6 
| import scanpy as sc 7 | 8 | def main(h5ad_loc, save_loc, dataset_name, rep): 9 | # Load h5ad file 10 | adata_full = sc.read_h5ad(h5ad_loc) 11 | 12 | # Extract summary statistics from h5ad file 13 | num_batches_ds = adata_full.uns["downsampling_stats"]["num_batches"] 14 | batches_ds = adata_full.uns["downsampling_stats"]["ds_batch_names"] 15 | num_celltypes_ds = adata_full.uns["downsampling_stats"]["num_celltypes_downsampled"] 16 | prop_ds = adata_full.uns["downsampling_stats"]["proportion_downsampled"] 17 | downsampled_celltypes = adata_full.uns["downsampling_stats"]["downsampled_celltypes"] 18 | 19 | # Format downsampled celltypes and batches to correspond to a single item 20 | if isinstance(downsampled_celltypes, str): 21 | if downsampled_celltypes == "None": 22 | downsampled_celltypes = "None" 23 | else: 24 | raise ValueError("Downsampled celltypes is a str and not 'None'") 25 | elif isinstance(downsampled_celltypes, np.ndarray): 26 | if downsampled_celltypes.shape == (1,): 27 | downsampled_celltypes = downsampled_celltypes[0] 28 | else: 29 | downsampled_celltypes = np.concatenate(downsampled_celltypes).flatten() 30 | downsampled_celltypes = ", ".join(downsampled_celltypes) 31 | else: 32 | raise TypeError("Downsampled celltypes is not a str or ndarray") 33 | 34 | if isinstance(batches_ds, str): 35 | if batches_ds == "None": 36 | batches_ds = "None" 37 | elif batches_ds == "Placeholder due to h5py bug": 38 | batches_ds = "Placeholder due to h5py bug" 39 | else: 40 | raise ValueError("Downsampled batches is a str and not 'None'") 41 | elif isinstance(batches_ds, np.ndarray): 42 | if batches_ds.shape == (1,): 43 | batches_ds = batches_ds[0] 44 | else: 45 | batches_ds = np.concatenate(batches_ds).flatten() 46 | batches_ds = ", ".join(batches_ds) 47 | else: 48 | raise TypeError("Downsampled batches is not a str or ndarray") 49 | 50 | # Extract data from just one integration method subset 51 | int_method_select = np.random.choice( 52 | np.unique(adata_full.obs.integration_method.__array__()) 53 | ) 54 | 55 | # Subset data for only one method and split datasets by batch 56 | adata_select = adata_full[adata_full.obs.integration_method == int_method_select] 57 | adata_list = [] 58 | batches = np.unique(adata_select.obs.batch.__array__()) 59 | for batch in batches: 60 | adata_batch_select = adata_select[adata_select.obs.batch == batch] 61 | adata_list.append(adata_batch_select) 62 | 63 | # Get union of cell types across all batches 64 | celltype_union = np.unique(np.concatenate([adata.obs.celltype.__array__() for adata in adata_list])) 65 | 66 | # Get intersection of cell types across all batches 67 | celltype_intersection = set.intersection(*[set(adata.obs.celltype.__array__()) for adata in adata_list]) 68 | 69 | # Get proportion vector of cells in each batch 70 | celltype_props = [] 71 | for adata in adata_list: 72 | celltype_prop = np.zeros(len(celltype_union)) 73 | for idx, celltype in enumerate(celltype_union): 74 | celltype_prop[idx] = np.sum(adata.obs.celltype.__array__() == celltype)/len(adata) 75 | celltype_props.append(celltype_prop) 76 | 77 | # Get cosine distances across celltype proportions 78 | cos_distances = [] 79 | for celltype_prop in celltype_props: 80 | for celltype_prop_other in celltype_props: 81 | cos_distances.append(sp.distance.cosine(celltype_prop, celltype_prop_other)) 82 | 83 | # Get mean cosine distance across batches 84 | cos_dist_mean = np.mean(cos_distances) 85 | 86 | # Get ratio of unique celltypes over intersection (Jaccard index) 87 | celltype_unique_ratio = 
len(celltype_intersection) / len(celltype_union) 88 | 89 | # Get adata lens stdev proportional to total cells 90 | adata_lens = [len(adata) for adata in adata_list] 91 | adata_lens_stdev = np.std(adata_lens) 92 | adata_lens_mean = np.mean(adata_lens) 93 | adata_coeff_var = adata_lens_stdev / adata_lens_mean 94 | 95 | # Return dataset imbalance summary stats 96 | imba_summary_df = pd.DataFrame( 97 | { 98 | "Dataset": dataset_name, 99 | "Number of batches downsampled": num_batches_ds, 100 | "Batches downsampled": batches_ds, 101 | "Number of celltypes downsampled": num_celltypes_ds, 102 | "Proportion downsampled": prop_ds, 103 | "Downsampled celltypes": downsampled_celltypes, 104 | "Replicate": rep, 105 | "Total batches": len(batches), 106 | "Celltype intersection ratio": celltype_unique_ratio, 107 | "Mean proportion cosine distance": cos_dist_mean, 108 | "Length coeff var": adata_coeff_var 109 | }, 110 | index=[0] 111 | ) 112 | imba_summary_df.to_csv(save_loc, index=False, sep="\t") 113 | 114 | if __name__ == '__main__': 115 | parser = argparse.ArgumentParser( 116 | description = "Input and output files for imbalance summary" 117 | ) 118 | parser.add_argument( 119 | "--infile", 120 | type = str, 121 | help = "Path of integrated h5ad file" 122 | ) 123 | parser.add_argument( 124 | "--outfile", 125 | type = str, 126 | help = "Filepath for saving imbalance statistics of h5ad file" 127 | ) 128 | parser.add_argument( 129 | "--dataset", 130 | type = str, 131 | help = "Name of dataset" 132 | ) 133 | parser.add_argument( 134 | "--rep", 135 | type = int, 136 | help = "Repetition number" 137 | ) 138 | args = parser.parse_args() 139 | main( 140 | h5ad_loc = args.infile, 141 | save_loc = args.outfile, 142 | dataset_name = args.dataset, 143 | rep = args.rep 144 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/integrate_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1" 6 | 7 | import scanpy as sc 8 | import anndata as ann 9 | import numpy as np 10 | 11 | from utils import Integration, downsample, faiss_kmeans 12 | 13 | def none_or_str(value): 14 | if value == 'None': 15 | return None 16 | return value 17 | 18 | def main(h5ad_dir, save_loc, ds_celltypes, ds_proportions, num_batches): 19 | # Load h5ad files 20 | files_list = os.listdir(h5ad_dir) 21 | adata_loaded = [] 22 | for f in files_list: 23 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 24 | adata.layers["raw"] = adata.X # Store raw counts 25 | adata.obs = adata.obs[["batch", "celltype"]] # Only store relevant columns 26 | if "gene" not in adata.var.columns: 27 | adata.var["gene"] = adata.var_names # Add gene names if not present 28 | adata.var = adata.var[["gene"]] # Only store relevant columns 29 | adata_loaded.append(adata) 30 | 31 | # Downsample loaded h5ad files based on params 32 | if num_batches == 0: 33 | selected_celltypes_downsampled = "None" # Placeholder - not used 34 | batches_ds = "None" # Placeholder - not used 35 | else: 36 | # Initialize random number generator 37 | rng = np.random.default_rng() 38 | 39 | # Select indices for downsampling 40 | selected_indices = np.random.choice( 41 | len(adata_loaded), num_batches, replace = False 42 | ) 43 | adata_selected = [adata_loaded[i] for i in selected_indices] 44 | adata_unselected = [adata_loaded[i] for i in range(len(adata_loaded)) if i 
not in selected_indices] 45 | 46 | # Downsample the same selected celltypes across all of the batches - this change will not affect 47 | # previous runs, as they all downsampled either 0 or only 1 celltype, in either 0 or 1 batches 48 | # NOTE - this setup operates on the assumption that the celltypes are the same across all batches 49 | celltypes_all = np.unique(np.concatenate([adata.obs["celltype"].__array__() for adata in adata_selected])) 50 | rng.shuffle(celltypes_all) 51 | celltypes_selected = rng.choice(celltypes_all, ds_celltypes, replace = False) 52 | selected_celltypes_downsampled = np.array(celltypes_selected) 53 | adata_downsampled = [] 54 | for adata in adata_selected: 55 | adata_ds, selected_celltypes_ds = downsample( 56 | adata = adata, 57 | num_celltypes = None, 58 | celltype_names = celltypes_selected, 59 | proportion = ds_proportions 60 | ) 61 | adata_downsampled.append(adata_ds) 62 | adata_loaded = adata_unselected + adata_downsampled 63 | batches_ds = np.unique(np.concatenate([adata.obs["batch"].__array__() for adata in adata_downsampled])) 64 | 65 | # Store batch name separately for each anndata object 66 | for adata in adata_loaded: 67 | adata.obs["batch_name"] = adata.obs["batch"] 68 | 69 | # Concatenate files (assume data is raw counts) 70 | adata_concat = ann.AnnData.concatenate(*adata_loaded) 71 | adata_concat.obs_names = range(len(adata_concat.obs_names)) 72 | adata_concat.obs_names_make_unique() 73 | adata_concat.obs["batch"] = adata_concat.obs["batch_name"] 74 | adata_concat.obs.drop("batch_name", axis = 1, inplace = True) 75 | 76 | # Create integration class instance 77 | integration = Integration(adata = adata_concat) 78 | 79 | # Integrate across subsets 80 | harmony_integrated = integration.harmony_integrate() 81 | scvi_integrated = integration.scvi_integrate() 82 | bbknn_integrated = integration.bbknn_integrate() 83 | scanorama_integrated = integration.scanorama_integrate() 84 | seurat_integrated = integration.seurat_integrate() 85 | liger_integrated = integration.liger_integrate() 86 | 87 | # Add integration type to each subset and concatenate 88 | harmony_integrated.obs["integration_method"] = "harmony" 89 | scvi_integrated.obs["integration_method"] = "scvi" 90 | bbknn_integrated.obs["integration_method"] = "bbknn" 91 | scanorama_integrated.obs["integration_method"] = "scanorama" 92 | seurat_integrated.obs["integration_method"] = "seurat" 93 | liger_integrated.obs["integration_method"] = "liger" 94 | 95 | integrated_concat = ann.concat([ 96 | harmony_integrated, 97 | scvi_integrated, 98 | bbknn_integrated, 99 | scanorama_integrated, 100 | seurat_integrated, 101 | liger_integrated 102 | ]) 103 | integrated_concat.obs_names = range(len(integrated_concat.obs_names)) 104 | integrated_concat.obs_names_make_unique() 105 | 106 | # Add placeholder in entire obs dataframe for kmeans clustering 107 | integrated_concat.obs["kmeans_faiss"] = np.zeros(len(integrated_concat.obs_names)) 108 | 109 | # Perform kmeans clustering on integrated data 110 | # Define method subsets and iterate over them until the same number of k clusters is found 111 | k = 10 112 | k_initial = k # Integers are immutable 113 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] 114 | method_kmeans_adatas = [] 115 | i = 0 116 | while i < len(methods): 117 | # Create a copy of adata to avoid overwriting the original 118 | adata_copy = integrated_concat.copy() 119 | 120 | # Define method subset 121 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == methods[i]] 122 | 123 
| # Perform HVG selection on raw (unnormalized, unlogged) data 124 | adata_subset.X = adata_subset.layers["raw"] 125 | sc.pp.normalize_total( 126 | adata_subset, 127 | target_sum = 1e4 128 | ) 129 | sc.pp.log1p(adata_subset) 130 | sc.pp.highly_variable_genes( 131 | adata_subset, 132 | n_top_genes = 2500, 133 | flavor = "seurat" 134 | ) 135 | 136 | # Perform faiss kmeans clustering 137 | adata_subset, k_method = faiss_kmeans(adata_subset, k) 138 | 139 | # Test concordance of k values and either append or reset 140 | if k_method != k: 141 | k = k_method 142 | i = 0 143 | method_kmeans_adatas.clear() 144 | continue 145 | else: 146 | i += 1 147 | method_kmeans_adatas.append(adata_subset) 148 | 149 | # Append kmeans cluster info to integrated data 150 | for method, method_kmeans_adata in zip(methods, method_kmeans_adatas): 151 | method_kmeans_clusters = method_kmeans_adata.obs["kmeans_faiss"].__array__().astype('str') 152 | integrated_concat.obs.loc[ 153 | integrated_concat.obs["integration_method"] == method, 154 | "kmeans_faiss" 155 | ] = method_kmeans_clusters 156 | 157 | # Add placeholder for bbknn kmeans clustering 158 | integrated_concat.obs.loc[ 159 | integrated_concat.obs["integration_method"] == "bbknn", 160 | "kmeans_faiss" 161 | ] = "NA" 162 | 163 | # Append information about kmeans faiss clusters to .uns of adata_concat 164 | integrated_concat.uns["kmeans_stats"] = { 165 | "kmeans_initial_k": k_initial, 166 | "kmeans_final_k": k 167 | } 168 | 169 | # If downsampled celltypes and batches are of array length greater than one, combine them 170 | if len(batches_ds) > 1: 171 | batches_ds = np.array(",".join(batches_ds)) 172 | if len(selected_celltypes_downsampled) > 1: 173 | selected_celltypes_downsampled = np.array(",".join(selected_celltypes_downsampled)) 174 | 175 | # Add data about downsampling to .uns of adata_concat 176 | if num_batches == 0: 177 | integrated_concat.uns["downsampling_stats"] = { 178 | "num_batches": 0, 179 | "num_celltypes_downsampled": ds_celltypes, 180 | "ds_batch_names": "None", 181 | "proportion_downsampled": ds_proportions, 182 | "downsampled_celltypes": "None" 183 | } 184 | else: 185 | integrated_concat.uns["downsampling_stats"] = { 186 | "num_batches": num_batches, 187 | "num_celltypes_downsampled": ds_celltypes, 188 | "ds_batch_names": "Placeholder due to h5py bug", 189 | "proportion_downsampled": ds_proportions, 190 | "downsampled_celltypes": selected_celltypes_downsampled 191 | } 192 | 193 | # Save integrated h5ad object 194 | integrated_concat.write_h5ad( 195 | filename = save_loc, 196 | compression = "gzip" 197 | ) 198 | 199 | if __name__ == "__main__": 200 | parser = argparse.ArgumentParser( 201 | description = "Input and output files for scRNA-seq integration" 202 | ) 203 | parser.add_argument( 204 | "--filedir", 205 | type = str, 206 | help = "Path of directory containing scRNA-seq h5ad files" 207 | ) 208 | parser.add_argument( 209 | "--ds_celltypes", 210 | type = int, 211 | help = "Number of celltypes to randomly downsample in given batch" 212 | ) 213 | parser.add_argument( 214 | "--ds_proportions", 215 | type = float, 216 | help = "Proportion of downsampling per celltype in a given batch" 217 | ) 218 | parser.add_argument( 219 | "--num_batches", 220 | type = int, 221 | help = "Number of batches to perform downsampling on" 222 | ) 223 | parser.add_argument( 224 | "--outfile", 225 | type = str, 226 | help = "Filepath for saving output from scRNA-seq integration" 227 | ) 228 | args = parser.parse_args() 229 | main( 230 | h5ad_dir = args.filedir, 231 | 
save_loc = args.outfile, 232 | ds_celltypes = args.ds_celltypes, 233 | ds_proportions = args.ds_proportions, 234 | num_batches = args.num_batches 235 | ) 236 | -------------------------------------------------------------------------------- /workflow/scripts/python/integrate_data_paga.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1" 6 | 7 | import scanpy as sc 8 | import anndata as ann 9 | import numpy as np 10 | 11 | from utils import IntegrationPAGA, downsample 12 | 13 | def none_or_str(value): 14 | if value == 'None': 15 | return None 16 | return value 17 | 18 | def main(h5ad_dir, root_celltype, save_loc, ds_celltypes, ds_proportions, num_batches): 19 | # Load h5ad files 20 | files_list = os.listdir(h5ad_dir) 21 | adata_loaded = [] 22 | for f in files_list: 23 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 24 | adata.layers["raw"] = adata.X # Store raw counts 25 | if "gene" not in adata.var.columns: 26 | adata.var["gene"] = adata.var_names # Add gene names if not present 27 | adata.var = adata.var[["gene"]] # Only store relevant columns 28 | adata.obs.celltype = [ 29 | c.replace(" ", "_") for c in adata.obs.celltype 30 | ] # Remove spaces from celltype names - for Snakemake wildcard matching 31 | adata_loaded.append(adata) 32 | 33 | # Downsample loaded h5ad files based on params 34 | if num_batches == 0: 35 | selected_celltypes_downsampled = "None" # Placeholder - not used 36 | batches_ds = "None" # Placeholder - not used 37 | else: 38 | # Initialize random number generator 39 | rng = np.random.default_rng() 40 | 41 | # Select indices for downsampling 42 | selected_indices = np.random.choice( 43 | len(adata_loaded), num_batches, replace = False 44 | ) 45 | adata_selected = [adata_loaded[i] for i in selected_indices] 46 | adata_unselected = [adata_loaded[i] for i in range(len(adata_loaded)) if i not in selected_indices] 47 | 48 | # Downsample the same selected celltypes across all of the batches - this change will not affect 49 | # previous runs, as they all downsampled either 0 or only 1 celltype, in either 0 or 1 batches 50 | # #NOTE - this setup operates on the assumption that the celltypes are the same across all batches 51 | celltypes_all = np.unique(np.concatenate([adata.obs["celltype"].__array__() for adata in adata_selected])) 52 | rng.shuffle(celltypes_all) 53 | celltypes_selected = rng.choice(celltypes_all, ds_celltypes, replace = False) 54 | selected_celltypes_downsampled = np.array(celltypes_selected) 55 | adata_downsampled = [] 56 | for adata in adata_selected: 57 | adata_ds, selected_celltypes_ds = downsample( 58 | adata = adata, 59 | num_celltypes = None, 60 | celltype_names = celltypes_selected, 61 | proportion = ds_proportions 62 | ) 63 | adata_downsampled.append(adata_ds) 64 | adata_loaded = adata_unselected + adata_downsampled 65 | batches_ds = np.concatenate([np.unique(adata.obs["batch"].__array__()) for adata in adata_downsampled]) 66 | 67 | # Store batch name separately for each anndata object 68 | for adata in adata_loaded: 69 | adata.obs["batch_name"] = adata.obs["batch"] 70 | 71 | # Concatenate files (assume data is raw counts) 72 | adata_concat = ann.AnnData.concatenate(*adata_loaded) 73 | adata_concat.obs_names = range(len(adata_concat.obs_names)) 74 | adata_concat.obs_names_make_unique() 75 | adata_concat.obs["batch"] = adata_concat.obs["batch_name"] 76 | 
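The `batch_name` round trip used here (and in `integrate_data.py` above) exists because `ann.AnnData.concatenate` replaces `obs["batch"]` with numeric batch indices. A minimal sketch of that pattern on toy objects (shapes and labels are arbitrary):

```python
import numpy as np
import pandas as pd
import anndata as ann

a = ann.AnnData(np.ones((3, 2)), obs=pd.DataFrame({"batch": ["lab_1"] * 3}))
b = ann.AnnData(np.ones((2, 2)), obs=pd.DataFrame({"batch": ["lab_2"] * 2}))

# Stash the original labels before concatenation overwrites obs["batch"]
for adata in (a, b):
    adata.obs["batch_name"] = adata.obs["batch"]

concat = ann.AnnData.concatenate(a, b)          # obs["batch"] is now "0"/"1"
concat.obs["batch"] = concat.obs["batch_name"]  # restore the original labels
concat.obs.drop("batch_name", axis=1, inplace=True)
```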
adata_concat.obs.drop("batch_name", axis = 1, inplace = True) 77 | 78 | # Create PAGA integration class instance 79 | integration_paga = IntegrationPAGA( 80 | adata = adata_concat, 81 | root_celltype = root_celltype 82 | ) 83 | 84 | # Integrate across subsets (including unintegrated) 85 | unintegrated = integration_paga.unintegrated() 86 | harmony_integrated = integration_paga.harmony_integrate() 87 | scvi_integrated = integration_paga.scvi_integrate() 88 | bbknn_integrated = integration_paga.bbknn_integrate() 89 | scanorama_integrated = integration_paga.scanorama_integrate() 90 | seurat_integrated = integration_paga.seurat_integrate() 91 | liger_integrated = integration_paga.liger_integrate() 92 | 93 | # Add integration type to each subset and concatenate 94 | unintegrated.obs["integration_method"] = "unintegrated" 95 | harmony_integrated.obs["integration_method"] = "harmony" 96 | scvi_integrated.obs["integration_method"] = "scvi" 97 | bbknn_integrated.obs["integration_method"] = "bbknn" 98 | scanorama_integrated.obs["integration_method"] = "scanorama" 99 | seurat_integrated.obs["integration_method"] = "seurat" 100 | liger_integrated.obs["integration_method"] = "liger" 101 | 102 | integrated_concat = ann.concat([ 103 | unintegrated, 104 | harmony_integrated, 105 | scvi_integrated, 106 | bbknn_integrated, 107 | scanorama_integrated, 108 | seurat_integrated, 109 | liger_integrated 110 | ]) 111 | integrated_concat.obs_names = range(len(integrated_concat.obs_names)) 112 | integrated_concat.obs_names_make_unique() 113 | 114 | # If downsampled celltypes and batches are of array length greater than one, combine them 115 | if len(batches_ds) > 1: 116 | batches_ds = np.array(",".join(batches_ds)) 117 | if len(selected_celltypes_downsampled) > 1: 118 | selected_celltypes_downsampled = np.array(",".join(selected_celltypes_downsampled)) 119 | 120 | # Add data about downsampling to .uns of adata_concat 121 | if num_batches == 0: 122 | integrated_concat.uns["downsampling_stats"] = { 123 | "num_batches": 0, 124 | "num_celltypes_downsampled": ds_celltypes, 125 | "ds_batch_names": "None", 126 | "proportion_downsampled": ds_proportions, 127 | "downsampled_celltypes": "None" 128 | } 129 | else: 130 | integrated_concat.uns["downsampling_stats"] = { 131 | "num_batches": num_batches, 132 | "num_celltypes_downsampled": ds_celltypes, 133 | "ds_batch_names": "Placeholder due to h5py bug", 134 | "proportion_downsampled": ds_proportions, 135 | "downsampled_celltypes": selected_celltypes_downsampled 136 | } 137 | 138 | # Save integrated h5ad object 139 | integrated_concat.write_h5ad( 140 | filename = save_loc, 141 | compression = "gzip" 142 | ) 143 | 144 | if __name__ == "__main__": 145 | parser = argparse.ArgumentParser( 146 | description = "Input and output files for scRNA-seq PAGA integration" 147 | ) 148 | parser.add_argument( 149 | "--filedir", 150 | type = str, 151 | help = "Path of directory containing scRNA-seq h5ad files" 152 | ) 153 | parser.add_argument( 154 | "--root_celltype", 155 | type = str, 156 | help = "Root celltype to utilize for diffusion pseudotime estimation" 157 | ) 158 | parser.add_argument( 159 | "--ds_celltypes", 160 | type = int, 161 | help = "Number of celltypes to randomly downsample in given batch" 162 | ) 163 | parser.add_argument( 164 | "--ds_proportions", 165 | type = float, 166 | help = "Proportion of downsampling per celltype in a given batch" 167 | ) 168 | parser.add_argument( 169 | "--num_batches", 170 | type = int, 171 | help = "Number of batches to perform downsampling on" 172 
| ) 173 | parser.add_argument( 174 | "--outfile", 175 | type = str, 176 | help = "Filepath for saving output from scRNA-seq integration and pseudotime estimation" 177 | ) 178 | args = parser.parse_args() 179 | main( 180 | h5ad_dir = args.filedir, 181 | root_celltype = args.root_celltype, 182 | save_loc = args.outfile, 183 | ds_celltypes = args.ds_celltypes, 184 | ds_proportions = args.ds_proportions, 185 | num_batches = args.num_batches 186 | ) 187 | -------------------------------------------------------------------------------- /workflow/scripts/python/knn_classification.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import scanpy as sc 6 | import anndata as ann 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.neighbors import KNeighborsClassifier 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.neighbors import KNeighborsClassifier 12 | from sklearn.metrics import classification_report 13 | 14 | def main(h5ad_loc, save_loc, dataset_name, rep): 15 | # Load h5ad file 16 | adata = sc.read_h5ad(h5ad_loc) 17 | 18 | # Extract summary statistics from h5ad file 19 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 20 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 21 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 22 | 23 | # Subset h5ad based on batch-correction method used 24 | adata_method_sub = [] 25 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] # Omitting BBKNN due to lack of embedding 26 | for method in methods: 27 | adata_sub = adata[adata.obs["integration_method"] == method] 28 | adata_method_sub.append( 29 | adata_sub 30 | ) 31 | 32 | # Determine KNN accuracy for each batch-correction method 33 | precision_scores = [] 34 | recall_scores = [] 35 | f1_scores = [] 36 | supports = [] 37 | celltypes = [] 38 | for adata_sub in adata_method_sub: 39 | # Split testing and training data in stratified manner (70/30) 40 | X = adata_sub.obsm["X_kmeans"] 41 | y = adata_sub.obs["celltype"].__array__() 42 | X_train, X_test, y_train, y_test = train_test_split( 43 | X, y, stratify=y, test_size=0.7, random_state=42 44 | ) 45 | 46 | # Train k-nearest neighbors classifier with k=15 and predict on test data 47 | knn = KNeighborsClassifier( 48 | n_neighbors=15 49 | ) 50 | knn.fit(X_train, y_train) 51 | y_pred = knn.predict(X_test) 52 | 53 | # Get classification report and subset for only relevant columns 54 | class_report_dict = classification_report( 55 | y_test, y_pred, output_dict=True 56 | ) 57 | class_report_df = pd.DataFrame(class_report_dict) 58 | class_report_df = class_report_df.iloc[:, :-3] 59 | 60 | # Append appropriate values to lists 61 | precision_scores.append(class_report_df.loc["precision"].values) 62 | recall_scores.append(class_report_df.loc["recall"].values) 63 | f1_scores.append(class_report_df.loc["f1-score"].values) 64 | supports.append(class_report_df.loc["support"].values) 65 | celltypes.append(class_report_df.columns.values) 66 | 67 | # Repeat method values to have same length as scores (one for each celltype) 68 | methods_repeat = np.repeat(methods, len(precision_scores[0])) 69 | 70 | # Concatenate scores and celltypes 71 | precision_scores_concat = np.concatenate(precision_scores) 72 | recall_scores_concat = np.concatenate(recall_scores) 73 | f1_scores_concat = np.concatenate(f1_scores) 74 | supports_concat = np.concatenate(supports) 75 | celltypes_concat = 
np.concatenate(celltypes) 76 | 77 | # Create summary dataframe for classification statistics and save 78 | classification_summary_df = pd.DataFrame({ 79 | "Dataset": dataset_name, 80 | "Number of batches downsampled": num_batches_ds, 81 | "Number of celltypes downsampled": num_celltypes_ds, 82 | "Proportion downsampled": prop_ds, 83 | "Replicate": rep, 84 | "Method": methods_repeat, 85 | "Celltype": celltypes_concat, 86 | "Precision": precision_scores_concat, 87 | "Recall": recall_scores_concat, 88 | "F1-score": f1_scores_concat, 89 | "Support": supports_concat, 90 | "Mean KNN F1-score": np.mean(f1_scores_concat) 91 | }) 92 | classification_summary_df.to_csv( 93 | save_loc, 94 | index=False, 95 | sep="\t" 96 | ) 97 | 98 | if __name__ == '__main__': 99 | parser = argparse.ArgumentParser( 100 | description = "Input and output files for KNN classification results summary" 101 | ) 102 | parser.add_argument( 103 | "--infile", 104 | type = str, 105 | help = "Path of integrated h5ad file" 106 | ) 107 | parser.add_argument( 108 | "--outfile", 109 | type = str, 110 | help = "Filepath for saving KNN classification results of integrated h5ad file" 111 | ) 112 | parser.add_argument( 113 | "--dataset", 114 | type = str, 115 | help = "Name of dataset" 116 | ) 117 | parser.add_argument( 118 | "--rep", 119 | type = int, 120 | help = "Repetition number" 121 | ) 122 | args = parser.parse_args() 123 | main( 124 | h5ad_loc = args.infile, 125 | save_loc = args.outfile, 126 | dataset_name = args.dataset, 127 | rep = args.rep 128 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/marker_get.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import dge_top_n 12 | 13 | def main(h5ad_dir, save_loc, top_n = 10): 14 | # Load h5ad files 15 | files_list = os.listdir(h5ad_dir) 16 | adata_loaded = [] 17 | for f in files_list: 18 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 19 | adata.layers["raw"] = adata.X # Store raw counts 20 | adata.obs = adata.obs[["batch", "celltype"]] # Only store relevant columns 21 | if "gene" not in adata.var.columns: 22 | adata.var["gene"] = adata.var_names # Add gene names if not present 23 | adata.var = adata.var[["gene"]] # Only store relevant columns 24 | adata_loaded.append(adata) 25 | 26 | # Get the differential gene expression results for the celltypes in each batch, top n 27 | # for each celltype and get and return the union 28 | adata_dge_top_n_dfs = [] 29 | for adata in adata_loaded: 30 | # Removing human mito and ribo genes (won't affect mouse data) 31 | adata = adata[:, adata.var.gene.str.startswith("MT-") == False] 32 | adata = adata[:, adata.var.gene.str.contains("RPS") == False] 33 | adata = adata[:, adata.var.gene.str.contains("RPL") == False] 34 | adata = adata[:, adata.var.gene.str.contains("MRPL") == False] 35 | adata = adata[:, adata.var.gene.str.contains("MRPS") == False] 36 | 37 | # Removing mouse mito and ribo genes (won't affect human data) 38 | adata = adata[:, adata.var.gene.str.startswith("Mt-") == False] 39 | adata = adata[:, adata.var.gene.str.startswith("mt-") == False] 40 | adata = adata[:, adata.var.gene.str.contains("Rpl") == False] 41 | adata = adata[:, adata.var.gene.str.contains("Rps") == False] 42 | adata = adata[:, 
adata.var.gene.str.contains("Mrpl") == False] 43 | adata = adata[:, adata.var.gene.str.contains("Mrps") == False] 44 | 45 | # Remove any celltypes with less than 5 cells 46 | celltype_vcounts = adata.obs.celltype.value_counts() 47 | celltype_vcounts_sub = celltype_vcounts[celltype_vcounts >= 5] 48 | adata = adata[adata.obs.celltype.isin(celltype_vcounts_sub.index)] 49 | 50 | # Log-normalize the data 51 | sc.pp.normalize_total( 52 | adata, 53 | target_sum = 1e4 54 | ) 55 | sc.pp.log1p(adata) 56 | 57 | # Store lognorm counts and perform DGE based on celltype 58 | adata.raw = adata # Freeze for DGE test - lognorm counts 59 | sc.tl.rank_genes_groups( 60 | adata, 61 | groupby = "celltype", 62 | use_raw = True, 63 | method = "wilcoxon" 64 | ) 65 | 66 | # Get the top n degs for each celltype and append to all results 67 | dge_results = dge_top_n( 68 | adata, 69 | n = top_n, 70 | obs_group = "celltype" 71 | ) 72 | adata_dge_top_n_dfs.append(dge_results) 73 | 74 | # Concatenate all dge dataframes and keep distinct rows 75 | adata_dge_top_n_concat = pd.concat(adata_dge_top_n_dfs) 76 | adata_dge_top_n_concat = adata_dge_top_n_concat.drop_duplicates() 77 | 78 | # Rename columns appropriately and save 79 | adata_dge_top_n_concat.columns = [ 80 | "Celltype", 81 | "Top {n} marker genes (union across batches)".format( 82 | n = top_n 83 | ) 84 | ] 85 | adata_dge_top_n_concat.to_csv(save_loc, sep = "\t", index = False) 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser( 89 | description = "Input and output files for marker gene summary" 90 | ) 91 | parser.add_argument( 92 | "--filedir", 93 | type = str, 94 | help = "Path of directory containing scRNA-seq h5ad files" 95 | ) 96 | parser.add_argument( 97 | "--outfile", 98 | type = str, 99 | help = "Filepath for saving output from marker gene selection" 100 | ) 101 | args = parser.parse_args() 102 | main( 103 | h5ad_dir = args.filedir, 104 | save_loc = args.outfile, 105 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/reference_annotation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | sys.path.append("src/python/") 4 | 5 | from utils import SeuratReferenceMap 6 | 7 | def main(h5ad_loc, ref_h5_loc, save_loc): 8 | # Create an instance of SeuratReferenceMap 9 | refmap = SeuratReferenceMap( 10 | integrated_data_h5 = h5ad_loc, 11 | reference_h5 = ref_h5_loc, 12 | mapped_h5 = save_loc 13 | ) 14 | 15 | # Run SeuratReferenceMap to save mapped query object to h5ad file 16 | refmap.refmap() 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser( 20 | description = "Input and output files for query to reference mapping" 21 | ) 22 | parser.add_argument( 23 | "--infile", 24 | type = str, 25 | help = "Path of integrated h5ad file" 26 | ) 27 | parser.add_argument( 28 | "--ref_file", 29 | type = str, 30 | help = "Path of reference h5Seurat file" 31 | ) 32 | parser.add_argument( 33 | "--outfile", 34 | type = str, 35 | help = "Filepath for saving Seurat reference mapped and annotated h5ad file" 36 | ) 37 | args = parser.parse_args() 38 | main( 39 | h5ad_loc = args.infile, 40 | ref_h5_loc = args.ref_file, 41 | save_loc = args.outfile, 42 | ) 43 | -------------------------------------------------------------------------------- /workflow/scripts/python/reference_control_annotation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 
sys.path.append("src/python/") 4 | 5 | import numpy as np 6 | import scanpy as sc 7 | import anndata as ann 8 | 9 | from utils import cross_data_knn 10 | 11 | def main(h5ad_loc, ref_h5ad_loc, save_loc): 12 | # Load the seurat reference mapped h5ad file 13 | query_h5ad_full = sc.read_h5ad(h5ad_loc) 14 | query_h5ad = query_h5ad_full.raw.to_adata() # Use sctransformed counts 15 | 16 | # Load the reference h5ad file 17 | ref_h5ad = sc.read_h5ad(ref_h5ad_loc) 18 | 19 | # Add var/gene of both information to indices of both 20 | query_h5ad.var.index = query_h5ad.var._index 21 | ref_h5ad.var.index = ref_h5ad.var._index 22 | 23 | # Get the intersection of the genes in the query and reference h5ad files 24 | query_genes = set(query_h5ad.var.index.__array__()) 25 | ref_genes = set(ref_h5ad.var.index.__array__()) 26 | common_genes_list = list(ref_genes.intersection(query_genes)) 27 | 28 | # Subset anndata objects for the common genes 29 | query_h5ad_sub = query_h5ad[:, common_genes_list] 30 | ref_h5ad_sub = ref_h5ad[:, common_genes_list] 31 | 32 | # Ensure genes are equal between query and reference 33 | if not np.array_equal( 34 | query_h5ad_sub.var.index.__array__(), 35 | ref_h5ad_sub.var.index.__array__() 36 | ): 37 | raise ValueError( 38 | "Genes not equal between query and reference h5ad files after intersection" 39 | ) 40 | 41 | # Get highly variable gene subset of the reference data and return 42 | sc.pp.highly_variable_genes(ref_h5ad_sub, flavor="seurat", n_top_genes=2500) 43 | hvg_indices = ref_h5ad_sub.var["highly_variable"].__array__() 44 | 45 | # Get the SCTransformed data subsets for both the query and reference (common gene subsets) 46 | query_sct = query_h5ad_sub.X.toarray() 47 | ref_sct = ref_h5ad_sub.X.toarray() 48 | 49 | # Subset for the indices of the hvg genes 50 | query_sct_hvg_subset = query_sct[:, hvg_indices] 51 | ref_sct_hvg_subset = ref_sct[:, hvg_indices] 52 | 53 | # Get the (1) nearest neighbors for the reference data within the query data 54 | query_1_nn = cross_data_knn(query_sct_hvg_subset, ref_sct_hvg_subset, 1) 55 | 56 | # Get the celltypes (both l1 and l2) corresponding to the nearest neighbors for the reference data 57 | ref_celltypes_l1 = ref_h5ad.obs["celltype.l1"][query_1_nn.flatten()].__array__() 58 | ref_celltypes_l2 = ref_h5ad.obs["celltype.l2"][query_1_nn.flatten()].__array__() 59 | 60 | # Append the celltypes to the query h5ad file 61 | query_h5ad.obs["baseline.knn.l1"] = ref_celltypes_l1 62 | query_h5ad.obs["baseline.knn.l2"] = ref_celltypes_l2 63 | 64 | # Change colnames of query var to not collide with h5ad writing in anndata 65 | query_h5ad.var["gene_name"] = query_h5ad.var.index 66 | query_h5ad.var = query_h5ad.var.drop(query_h5ad.var.columns[0], axis=1) 67 | 68 | # Save the query h5ad file with baseline annotations 69 | query_h5ad.write_h5ad( 70 | filename = save_loc, 71 | compression = "gzip" 72 | ) 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser( 76 | description = "Input and output files for query to reference mapping - control experiment" 77 | ) 78 | parser.add_argument( 79 | "--infile", 80 | type = str, 81 | help = "Path of seurat reference mapped h5ad file" 82 | ) 83 | parser.add_argument( 84 | "--ref_file", 85 | type = str, 86 | help = "Path of reference h5ad file" 87 | ) 88 | parser.add_argument( 89 | "--outfile", 90 | type = str, 91 | help = "Filepath for saving seurat and control reference mapped and annotated h5ad file" 92 | ) 93 | args = parser.parse_args() 94 | main( 95 | h5ad_loc = args.infile, 96 | ref_h5ad_loc 
= args.ref_file, 97 | save_loc = args.outfile, 98 | ) 99 | -------------------------------------------------------------------------------- /workflow/scripts/python/relatedness_metric.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | 10 | from utils import relatedness_score 11 | 12 | def main(h5ad_dir, save_loc, dataset_name): 13 | # Load h5ad files 14 | files_list = os.listdir(h5ad_dir) 15 | adata_loaded = [] 16 | for f in files_list: 17 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 18 | adata_loaded.append(adata) 19 | 20 | # Get relatedness metric for each celltype within each batch 21 | celltype_relatedness_dfs = [] 22 | for adata in adata_loaded: 23 | celltype_relatedness_df = relatedness_score(adata, pca_performed = False) 24 | celltype_relatedness_dfs.append(celltype_relatedness_df) 25 | 26 | # Concatenate results, add relevant metadata and save 27 | celltype_relatedness_dfs_concat = pd.concat(celltype_relatedness_dfs) 28 | celltype_relatedness_dfs_concat["Dataset"] = dataset_name 29 | celltype_relatedness_dfs_concat.to_csv(save_loc, sep = "\t", index = False) 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser( 33 | description = "Input and output files for calculating relatedness metric" 34 | ) 35 | parser.add_argument( 36 | "--filedir", 37 | type = str, 38 | help = "Path of directory containing scRNA-seq h5ad files" 39 | ) 40 | parser.add_argument( 41 | "--outfile", 42 | type = str, 43 | help = "Filepath for saving output from relatedness metric calculation" 44 | ) 45 | parser.add_argument( 46 | "--dataset", 47 | type = str, 48 | help = "Name of dataset" 49 | ) 50 | args = parser.parse_args() 51 | main( 52 | h5ad_dir = args.filedir, 53 | save_loc = args.outfile, 54 | dataset_name = args.dataset 55 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/ti_concordance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | import scipy.stats as sp 10 | 11 | def main(h5ad_loc, save_loc, dataset_name, rep): 12 | # Load h5ad file 13 | adata = sc.read_h5ad(h5ad_loc) 14 | 15 | # Extract summary statistics from h5ad file 16 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 17 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 18 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 19 | batches = np.unique(adata.obs.batch.__array__()) 20 | 21 | # Drop any samples/cells that contain NaN or Inf pseudotime estimates 22 | pt_drop_indices_1 = np.where(np.isnan(adata.obs.dpt_pseudotime.__array__()))[0] 23 | pt_drop_indices_2 = np.where(np.isinf(adata.obs.dpt_pseudotime.__array__()))[0] 24 | pt_drop_indices = np.unique(np.concatenate([pt_drop_indices_1, pt_drop_indices_2])) 25 | pt_drop_samples = np.unique(adata.obs["sample"].__array__()[pt_drop_indices]) 26 | adata = adata[~adata.obs["sample"].isin(pt_drop_samples)].copy() 27 | 28 | # Subset h5ad based on batch-correction method used 29 | adata_method_sub = [] 30 | methods = ["harmony", "scvi", "bbknn", "scanorama", "seurat", "liger"] 31 | for method in methods: 32 | adata_sub = 
adata[adata.obs["integration_method"] == method] 33 | adata_method_sub.append( 34 | adata_sub 35 | ) 36 | 37 | # Subset the data for unintegrated results and extract the dpt_pseudotime values 38 | unintegrated_adata = adata[ 39 | adata.obs["integration_method"] == "unintegrated" 40 | ] 41 | unintegrated_pt = unintegrated_adata.obs["dpt_pseudotime"].__array__() 42 | 43 | # Determine pearson, spearman, and kendall correlations between post-integration 44 | # PAGA estimated pseudotime and pre-integration pseudotime for each batch-correction 45 | # method 46 | spearman_corrs = [] 47 | pearson_corrs = [] 48 | kendall_corrs = [] 49 | for adata_sub in adata_method_sub: 50 | # Get DPT pseudotime estimates 51 | dpt_pt = adata_sub.obs["dpt_pseudotime"].__array__() 52 | 53 | # Get correlations between pre-integration/unintegrated pseudotime and DPT pseudotime 54 | spearman_corr = sp.spearmanr(unintegrated_pt, dpt_pt)[0] 55 | pearson_corr = sp.pearsonr(unintegrated_pt, dpt_pt)[0] 56 | kendall_corr = sp.kendalltau(unintegrated_pt, dpt_pt)[0] 57 | 58 | spearman_corrs.append(spearman_corr) 59 | pearson_corrs.append(pearson_corr) 60 | kendall_corrs.append(kendall_corr) 61 | 62 | # Create a dataframe with the results 63 | ti_corr_df = pd.DataFrame({ 64 | "Method" : methods, 65 | "Spearman correlations" : spearman_corrs, 66 | "Pearson correlations" : pearson_corrs, 67 | "Kendall correlations" : kendall_corrs 68 | }) 69 | ti_corr_df["Dataset"] = dataset_name 70 | ti_corr_df["Number of batches downsampled"] = num_batches_ds 71 | ti_corr_df["Number of celltypes downsampled"] = num_celltypes_ds 72 | ti_corr_df["Proportion downsampled"] = prop_ds 73 | ti_corr_df["Replicate"] = rep 74 | ti_corr_df["Total batches"] = len(batches) 75 | 76 | # Save dataframe to file 77 | ti_corr_df.to_csv( 78 | save_loc, 79 | index = False, 80 | sep = "\t" 81 | ) 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser( 85 | description = "Input and output files for trajectory inference scoring" 86 | ) 87 | parser.add_argument( 88 | "--infile", 89 | type = str, 90 | help = "Path of PAGA integrated h5ad file" 91 | ) 92 | parser.add_argument( 93 | "--outfile", 94 | type = str, 95 | help = "Filepath for saving trajectory inference scoring results" 96 | ) 97 | parser.add_argument( 98 | "--dataset", 99 | type = str, 100 | help = "Name of dataset" 101 | ) 102 | parser.add_argument( 103 | "--rep", 104 | type = int, 105 | help = "Repetition number" 106 | ) 107 | args = parser.parse_args() 108 | main( 109 | h5ad_loc = args.infile, 110 | save_loc = args.outfile, 111 | dataset_name = args.dataset, 112 | rep = args.rep 113 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/umap_plots.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import Umap 12 | 13 | def main(h5ad_loc, leiden_save_loc, celltype_save_loc, batch_save_loc): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Get the umap coordinates for all the methods and create dictionary object 18 | methods = ["bbknn", "harmony", "scanorama", "scvi", "seurat"] 19 | umap_dict = {} 20 | for method in methods: 21 | adata_sub = adata[adata.obs["integration_method"] == method] 22 | umap_dict[method] = adata_sub.obsm["X_umap"].__array__() 23 | 24 | # Get the leiden 
clustering for all the methods and create dictionary object 25 | leiden_dict = {} 26 | for method in methods: 27 | adata_sub = adata[adata.obs["integration_method"] == method] 28 | leiden_dict[method] = adata_sub.obs["leiden"].__array__() 29 | 30 | # Create a cell type dictionary object 31 | celltype_dict = {} 32 | for method in methods: 33 | adata_sub = adata[adata.obs["integration_method"] == method] 34 | celltype_dict[method] = adata_sub.obs["celltype"].__array__() 35 | 36 | # Create a batch dictionary object 37 | batch_dict = {} 38 | for method in methods: 39 | adata_sub = adata[adata.obs["integration_method"] == method] 40 | batch_dict[method] = adata_sub.obs["batch"].__array__() 41 | 42 | # Create a umap object for each subset of information 43 | umap_leiden = Umap( 44 | coords = umap_dict, 45 | clustering = leiden_dict, 46 | subset_name = "Clustering", 47 | ) 48 | umap_celltype = Umap( 49 | coords = umap_dict, 50 | clustering = celltype_dict, 51 | subset_name = "Cell-type", 52 | ) 53 | umap_batch = Umap( 54 | coords = umap_dict, 55 | clustering = batch_dict, 56 | subset_name = "Batch", 57 | ) 58 | 59 | # Plot each of the umap objects 60 | umap_leiden.umap_df() 61 | umap_leiden.umap_plot(show_plot=True) 62 | umap_leiden.save_umap( 63 | save_dir=leiden_save_loc, 64 | dpi=300 65 | ) 66 | 67 | umap_celltype.umap_df() 68 | umap_celltype.umap_plot(show_plot=True) 69 | umap_celltype.save_umap( 70 | save_dir=celltype_save_loc, 71 | dpi=300 72 | ) 73 | 74 | umap_batch.umap_df() 75 | umap_batch.umap_plot(show_plot=True) 76 | umap_batch.save_umap( 77 | save_dir=batch_save_loc, 78 | dpi=300 79 | ) 80 | 81 | # Save the umap dataframes as auxiliary files 82 | umap_leiden_df = umap_leiden.umap_concat 83 | umap_celltype_df = umap_celltype.umap_concat 84 | umap_batch_df = umap_batch.umap_concat 85 | 86 | umap_leiden_df.to_csv( 87 | os.path.splitext(leiden_save_loc)[0] + ".tsv", 88 | sep = "\t" 89 | ) 90 | umap_celltype_df.to_csv( 91 | os.path.splitext(celltype_save_loc)[0] + ".tsv", 92 | sep = "\t" 93 | ) 94 | umap_batch_df.to_csv( 95 | os.path.splitext(batch_save_loc)[0] + ".tsv", 96 | sep = "\t" 97 | ) 98 | 99 | if __name__ == '__main__': 100 | parser = argparse.ArgumentParser( 101 | description = "Input and output files for UMAP plot generation" 102 | ) 103 | parser.add_argument( 104 | "--infile", 105 | type = str, 106 | help = "Path of integrated h5ad file" 107 | ) 108 | parser.add_argument( 109 | "--leiden_plot_loc", 110 | type = str, 111 | help = "Filepath for saving leiden overlayed UMAP results" 112 | ) 113 | parser.add_argument( 114 | "--celltype_plot_loc", 115 | type = str, 116 | help = "Filepath for saving celltype overlayed UMAP results" 117 | ) 118 | parser.add_argument( 119 | "--batch_plot_loc", 120 | type = str, 121 | help = "Filepath for saving batch overlayed UMAP results" 122 | ) 123 | args = parser.parse_args() 124 | main( 125 | h5ad_loc = args.infile, 126 | leiden_save_loc = args.leiden_plot_loc, 127 | celltype_save_loc = args.celltype_plot_loc, 128 | batch_save_loc = args.batch_plot_loc 129 | ) -------------------------------------------------------------------------------- /workflow/src/R/liger_integrate.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(liger) 3 | library(Seurat) 4 | library(SeuratDisk) 5 | library(reticulate) 6 | 7 | # Get a random seed 8 | rand_seed <- sample(1:100000000, 1) 9 | 10 | # Read in matrix for full data, including last batch column 11 | args <- commandArgs(trailingOnly = TRUE) 12 
| file <- args[1] 13 | filename <- args[2] 14 | 15 | # Load anndata, scanpy, and scipy sparse through reticulate 16 | ad <- import("anndata") 17 | sc <- import("scanpy") 18 | sp_sparse <- import("scipy.sparse") 19 | 20 | # Load h5ad object through reticulate and create Seurat object 21 | temp_adata <- ad$read_h5ad(file) 22 | exprs <- t(temp_adata$X$todense()) 23 | colnames(exprs) <- temp_adata$obs_names$to_list() 24 | rownames(exprs) <- temp_adata$var_names$to_list() 25 | seur_obj <- CreateSeuratObject(exprs) 26 | seur_obj <- SetAssayData(seur_obj, "data", exprs) 27 | seur_obj <- AddMetaData(seur_obj, temp_adata$obs) 28 | 29 | # Split object by batch information 30 | seur_obj_list <- SplitObject( 31 | seur_obj, 32 | split.by = "batch" 33 | ) 34 | 35 | # Get matrices of rna data for each batch and name by batch 36 | seur_counts_list <- lapply(seur_obj_list, function(x) { 37 | return(x@assays$RNA@counts) 38 | }) 39 | seur_counts_list_names <- lapply(seur_obj_list, function(x) { 40 | return(unique(x@meta.data$batch)) 41 | }) 42 | names(seur_counts_list) <- seur_counts_list_names 43 | 44 | # Create Liger object from seurat list of matrices 45 | liger_obj <- createLiger(seur_counts_list, remove.missing = FALSE) 46 | 47 | # Normalize and select highly variable genes using LIGER's functions 48 | liger_obj <- normalize(liger_obj) 49 | liger_obj <- selectGenes(liger_obj) 50 | 51 | # Scale data, perform iNFM and quantile normalization 52 | liger_obj <- scaleNotCenter(liger_obj) 53 | liger_obj <- optimizeALS(liger_obj, k = 20, rand.seed = rand_seed) 54 | liger_obj <- quantile_norm(liger_obj) # No seeding done in version 0.5.0 55 | 56 | # Extract normalized cell loadings, save as h5seurat object, 57 | # and convert to h5ad 58 | liger_norm_h <- liger_obj@H.norm 59 | rownames(liger_norm_h) <- colnames(seur_obj) 60 | colnames(liger_norm_h) <- paste0( 61 | "h_norm_comp_", seq(1:ncol(liger_norm_h)) 62 | ) 63 | norm_cell_loadings <- CreateSeuratObject(counts = t(liger_norm_h)) 64 | SaveH5Seurat( 65 | object = norm_cell_loadings, 66 | filename = paste0("./tmp/", filename, "_liger_out.h5seurat"), 67 | overwrite = TRUE, 68 | verbose = TRUE 69 | ) 70 | 71 | # Convert tempfile to h5ad object 72 | Convert( 73 | paste0("./tmp/", filename, "_liger_out.h5seurat"), 74 | dest = "h5ad" 75 | ) -------------------------------------------------------------------------------- /workflow/src/R/seurat_integrate.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(SeuratDisk) 3 | library(reticulate) 4 | 5 | # Read in matrix for full data, including last batch column 6 | args <- commandArgs(trailingOnly = TRUE) 7 | file <- args[1] 8 | filename <- args[2] 9 | int_type <- args[3] # Integration type 10 | 11 | # Load anndata and scanpy 12 | ad <- import("anndata") 13 | sc <- import("scanpy") 14 | 15 | # Load h5ad object through reticulate and create Seurat object 16 | temp_adata <- ad$read_h5ad(file) 17 | exprs <- t(temp_adata$X$todense()) 18 | colnames(exprs) <- temp_adata$obs_names$to_list() 19 | rownames(exprs) <- temp_adata$var_names$to_list() 20 | seur_obj <- CreateSeuratObject(exprs) 21 | seur_obj <- SetAssayData(seur_obj, "data", exprs) 22 | seur_obj <- AddMetaData(seur_obj, temp_adata$obs) 23 | 24 | # Split object by batch information 25 | seur_obj_list <- SplitObject( 26 | seur_obj, 27 | split.by = "batch" 28 | ) 29 | 30 | # Iterate over batches and find highly variable genes 31 | for (i in 1:length(seur_obj_list)) { 32 | seur_obj_list[[i]] <- 
FindVariableFeatures( 33 | seur_obj_list[[i]], 34 | selection.method = "mean.var.plot", 35 | nfeatures = 2500, 36 | verbose = TRUE 37 | ) 38 | } 39 | 40 | # Determine type of integration to perform (CCA or RPCA) 41 | if (int_type == "CCA") { 42 | int_anchors <- FindIntegrationAnchors( 43 | object.list = seur_obj_list, 44 | dims = 1:20, 45 | anchor.features = 2500 46 | ) 47 | batches_integrated <- IntegrateData(anchorset = int_anchors, dims = 1:20) 48 | } else if (int_type == "RPCA") { 49 | int_features <- SelectIntegrationFeatures(object.list = seur_obj_list) 50 | for (i in 1:length(seur_obj_list)) { 51 | x <- seur_obj_list[[i]] 52 | x <- ScaleData(x, features = int_features, verbose = TRUE) 53 | x <- RunPCA(x, features = int_features, verbose = TRUE) 54 | seur_obj_list[[i]] <- x 55 | } 56 | int_anchors <- FindIntegrationAnchors( 57 | object.list = seur_obj_list, 58 | reduction = "rpca", 59 | dims = 1:20 60 | ) 61 | batches_integrated <- IntegrateData(anchorset = int_anchors, dims = 1:20) 62 | } else { 63 | stop( 64 | "Please indicate either 'CCA' or 'RPCA' for the integration type option" 65 | ) 66 | } 67 | 68 | # Return integrated adata object as hda5 file -> tempfile 69 | SaveH5Seurat( 70 | object = batches_integrated, 71 | filename = paste0("./tmp/", filename, "_seur_out.h5Seurat"), 72 | overwrite = TRUE, 73 | verbose = TRUE 74 | ) 75 | 76 | # Convert tempfile to h5ad object 77 | Convert( 78 | paste0("./tmp/", filename, "_seur_out.h5Seurat"), 79 | dest = "h5ad" 80 | ) 81 | -------------------------------------------------------------------------------- /workflow/src/R/seurat_reference_map.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(SeuratDisk) 3 | library(reticulate) 4 | 5 | # Read in arguments 6 | args <- commandArgs(trailingOnly = TRUE) 7 | ref_file <- args[1] 8 | temp_adata_file <- args[2] 9 | temp_adata_filename <- args[3] 10 | outfile_name <- args[4] 11 | 12 | # Read in h5seurat reference data 13 | ref_data <- LoadH5Seurat(ref_file) 14 | 15 | # Load anndata and scanpy 16 | ad <- import("anndata") 17 | sc <- import("scanpy") 18 | 19 | # Convert h5ad anndata temp file 20 | temp_adata <- ad$read_h5ad(temp_adata_file) 21 | 22 | # Create Seurat object and split by batch information - use anndata import 23 | exprs <- t(temp_adata$X$todense()) 24 | colnames(exprs) <- temp_adata$obs_names$to_list() 25 | rownames(exprs) <- temp_adata$var_names$to_list() 26 | query_obj <- CreateSeuratObject(exprs) 27 | query_obj <- SetAssayData(query_obj, "data", exprs) 28 | query_obj <- AddMetaData(query_obj, temp_adata$obs) 29 | query_obj_list <- SplitObject( 30 | query_obj, 31 | split.by = "batch" 32 | ) 33 | 34 | # Normalize query batches using scTransform 35 | query_obj_list <- lapply(X = query_obj_list, FUN = SCTransform, verbose = FALSE) 36 | 37 | # Get anchors between each query batches and the reference 38 | anchors <- list() 39 | for (i in 1:length(query_obj_list)) { 40 | anchors[[i]] <- FindTransferAnchors( 41 | reference = ref_data, 42 | query = query_obj_list[[i]], 43 | reference.reduction = "spca", 44 | dims = 1:50 45 | ) 46 | } 47 | 48 | # Note - this may not be ideal to simulate effects of downsampling 49 | # as each batch is being mapped individually here and not separately 50 | # Map each of the query batches individually 51 | for (i in 1:length(query_obj_list)) { 52 | query_obj_list[[i]] <- MapQuery( 53 | anchorset = anchors[[i]], 54 | query = query_obj_list[[i]], 55 | reference = ref_data, 56 | refdata = list( 57 | 
celltype.l1 = "celltype.l1", 58 | celltype.l2 = "celltype.l2", 59 | predicted_ADT = "ADT" 60 | ), 61 | reference.reduction = "spca", 62 | reduction.model = "wnn.umap" 63 | ) 64 | } 65 | 66 | # Set default assay of all to SCT (for outputting scTransformed counts) 67 | for (i in 1:length(query_obj_list)) { 68 | DefaultAssay(query_obj_list[[i]]) <- "SCT" 69 | } 70 | 71 | # Remerge the batches into one object - reset default assay as failsafe 72 | query_ref_mapped_obj <- Reduce(merge, query_obj_list) 73 | DefaultAssay(query_ref_mapped_obj) <- "SCT" 74 | 75 | # Return reference mapped adata object as hda5 file -> tempfile 76 | SaveH5Seurat( 77 | object = query_ref_mapped_obj, 78 | filename = paste0(outfile_name, ".h5Seurat"), 79 | overwrite = TRUE, 80 | verbose = TRUE 81 | ) 82 | 83 | # Convert tempfile to h5ad object 84 | Convert( 85 | paste0(outfile_name, ".h5Seurat"), 86 | dest = "h5ad" 87 | ) 88 | 89 | # Remove h5seurat file 90 | file.remove( 91 | paste0(outfile_name, ".h5Seurat") 92 | ) -------------------------------------------------------------------------------- /workflow/src/python/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import Integration, IntegrationPAGA, cluster_num, leiden_clip, \ 2 | cluster_membership, downsample, diffexp, dge_top_n, set_concordance, \ 3 | cluster_concordance, faiss_kmeans, SeuratReferenceMap, mutual_nn, \ 4 | find_mutual_nn, find_knn, cross_data_knn, relatedness_score 5 | from .imbalanced_clustering import balanced_adjusted_rand_index, \ 6 | balanced_adjusted_mutual_info, balanced_completeness, balanced_homogeneity, \ 7 | balanced_v_measure -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | from .ari import balanced_adjusted_rand_index 3 | from .ami import balanced_adjusted_mutual_info 4 | from .vmeasure import balanced_homogeneity, balanced_completeness, balanced_v_measure 5 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/ami.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | from .utils import ( 5 | check_clusterings, 6 | contingency_matrix, 7 | entropy, 8 | mutual_info_score, 9 | expected_mutual_information, 10 | generalized_average, 11 | ) 12 | 13 | 14 | def balanced_adjusted_mutual_info( 15 | labels_true, labels_pred, *, average_method="arithmetic", reweigh=True 16 | ): 17 | """Mutual Information adjusted for chance and balanced across true labels. 18 | Adjusted Mutual Information (AMI) is an adjustment of the Mutual 19 | Information (MI) score to account for chance. It accounts for the fact that 20 | the MI is generally higher for two clusterings with a larger number of 21 | clusters, regardless of whether there is actually more information shared. 22 | For two clusterings :math:`U` and :math:`V`, the AMI is given as:: 23 | AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))] 24 | This metric is independent of the absolute values of the labels: 25 | a permutation of the class or cluster label values won't change the 26 | score value in any way. 
27 | The original AMI metric is a symmetric measure: switching :math:`U` 28 | (``label_true``) with :math:`V` (``labels_pred``) will return the same score value, 29 | but this is not the case for the reweighted and balanced AMI. 30 | The balanced AMI is obtained by reweighing the contingency table 31 | for all true label marginals, such that they sum to the same nummber, 32 | while preserving the total number of samples. 33 | Be mindful that this function is an order of magnitude slower than other 34 | metrics, such as the Adjusted Rand Index. 35 | Parameters 36 | ---------- 37 | labels_true : int array, shape = [n_samples] 38 | A clustering of the data into disjoint subsets, called :math:`U` in 39 | the above formula. 40 | labels_pred : int array-like of shape (n_samples,) 41 | A clustering of the data into disjoint subsets, called :math:`V` in 42 | the above formula. 43 | average_method : str, default='arithmetic' 44 | How to compute the normalizer in the denominator. Possible options 45 | are 'min', 'geometric', 'arithmetic', and 'max'. 46 | .. versionadded:: 0.20 47 | .. versionchanged:: 0.22 48 | The default value of ``average_method`` changed from 'max' to 49 | 'arithmetic'. 50 | reweigh : bool, default=True 51 | if `True`, reweighs the contingency table based on the true labels 52 | such that they all have equal membership. The total number of samples 53 | is preserved with a round-off error. If 'False', this reverts the 54 | balanced AMI to the original AMI implementation. 55 | Returns 56 | ------- 57 | AMI: float (upperlimited by 1.0) 58 | The AMI returns a value of 1 when the two partitions are identical 59 | (ie perfectly matched). Random partitions (independent labellings) have 60 | an expected AMI around 0 on average hence can be negative. The value is 61 | in adjusted nats (based on the natural logarithm). 62 | References 63 | ---------- 64 | .. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for 65 | Clusterings Comparison: Variants, Properties, Normalization and 66 | Correction for Chance, JMLR 67 | `_ 68 | .. [2] `Wikipedia entry for the Adjusted Mutual Information 69 | `_ 70 | """ 71 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 72 | n_samples = labels_true.shape[0] 73 | classes = np.unique(labels_true) 74 | clusters = np.unique(labels_pred) 75 | # Special limit cases: no clustering since the data is not split. 76 | # This is a perfect match hence return 1.0. 77 | if ( 78 | classes.shape[0] == clusters.shape[0] == 1 79 | or classes.shape[0] == clusters.shape[0] == 0 80 | ): 81 | return 1.0 82 | contingency = contingency_matrix( 83 | labels_true, labels_pred, reweigh=reweigh, sparse=True 84 | ) 85 | # Recalculate labels_true and labels_pred if reweigh is True to 86 | # factor in the reweighting based on the true class frequencies. 
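    # As a concrete illustration of the reweighting: for
    # labels_true = [0, 0, 0, 0, 0, 0, 1, 1] and
    # labels_pred = [0, 0, 0, 1, 1, 1, 2, 2], the raw contingency table
    # (rows = true classes, columns = predicted clusters) is
    #     [[3, 3, 0],
    #      [0, 0, 2]]  with row sums (6, 2).
    # Reweighing rescales each row to the rounded mean row sum (4 here,
    # up to integer round-off in general), giving
    #     [[2, 2, 0],
    #      [0, 0, 4]]
    # so both true classes carry equal weight while the total stays at the
    # original 8 samples; the label vectors are then rebuilt from these new
    # marginals below.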
87 | # These won't preserve order but this is fine since entropy is 88 | # invariant to order 89 | if reweigh is True: 90 | true_sums = np.squeeze(np.asarray(sp.csc_matrix.sum(contingency, axis = 1))) 91 | pred_sums = np.squeeze(np.asarray(sp.csc_matrix.sum(contingency, axis = 0))) 92 | labels_true = np.repeat( 93 | np.arange(len(true_sums)), true_sums 94 | ) 95 | labels_pred = np.repeat( 96 | np.arange(len(pred_sums)), pred_sums 97 | ) 98 | contingency = contingency.astype(np.float64) 99 | # Calculate the MI for the two clusterings 100 | mi = mutual_info_score(labels_true, labels_pred, contingency=contingency) 101 | # Calculate the expected value for the mutual information 102 | emi = expected_mutual_information(contingency, n_samples) 103 | # Calculate entropy for each labeling 104 | h_true, h_pred = entropy(labels_true), entropy(labels_pred) 105 | normalizer = generalized_average(h_true, h_pred, average_method) 106 | denominator = normalizer - emi 107 | # Avoid 0.0 / 0.0 when expectation equals maximum, i.e a perfect match. 108 | # normalizer should always be >= emi, but because of floating-point 109 | # representation, sometimes emi is slightly larger. Correct this 110 | # by preserving the sign. 111 | if denominator < 0: 112 | denominator = min(denominator, -np.finfo("float64").eps) 113 | else: 114 | denominator = max(denominator, np.finfo("float64").eps) 115 | ami = (mi - emi) / denominator 116 | return ami 117 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/ari.py: -------------------------------------------------------------------------------- 1 | from .utils import pair_confusion_matrix 2 | 3 | 4 | def balanced_adjusted_rand_index(labels_true, labels_pred, reweigh=True): 5 | """Rand index adjusted for chance and balanced across true labels. 6 | The Rand Index computes a similarity measure between two clusterings 7 | by considering all pairs of samples and counting pairs that are 8 | assigned in the same or different clusters in the predicted and 9 | true clusterings. 10 | The raw RI score is then "adjusted for chance" into the ARI score 11 | using the following scheme:: 12 | ARI = (RI - Expected_RI) / (max(RI) - Expected_RI) 13 | The adjusted Rand index is thus ensured to have a value close to 14 | 0.0 for random labeling independently of the number of clusters and 15 | samples and exactly 1.0 when the clusterings are identical (up to 16 | a permutation). 17 | The original ARI is a symmetric measure: 18 | adjusted_rand_score(a, b) == adjusted_rand_score(b, a) 19 | But this does not hold due for the balanced ARI metric. 20 | The balanced ARI is obtained by reweighing the contingency table 21 | for all true label marginals, such that they sum to the same nummber, 22 | while preserving the total number of samples. 23 | Parameters 24 | ---------- 25 | labels_true : int array, shape = [n_samples] 26 | Ground truth class labels to be used as a reference 27 | labels_pred : array-like of shape (n_samples,) 28 | Cluster labels to evaluate 29 | reweigh : bool, default=True 30 | if `True`, reweighs the contingency table based on the true labels 31 | such that they all have equal membership. The total number of samples 32 | is preserved with a round-off error. If 'False', this reverts the 33 | balanced ARI to the original ARI implementation. 34 | Returns 35 | ------- 36 | ARI : float 37 | Similarity score between -1.0 and 1.0. Random labelings have an ARI 38 | close to 0.0. 1.0 stands for perfect match. 
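    A minimal usage sketch (assuming this ``imbalanced_clustering`` package
    is importable, e.g. with ``workflow/src/python`` on the Python path):
    >>> from imbalanced_clustering import balanced_adjusted_rand_index
    >>> labels_true = [0, 0, 0, 0, 0, 0, 1, 1]  # rare second class
    >>> labels_pred = [1, 1, 1, 1, 1, 1, 0, 0]  # same partition, relabelled
    >>> balanced_adjusted_rand_index(labels_true, labels_pred, reweigh=True)
    1.0
    >>> noisy_pred = [0, 0, 1, 1, 0, 0, 1, 1]  # imperfect clustering
    >>> score = balanced_adjusted_rand_index(labels_true, noisy_pred)  # < 1.0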
39 | References 40 | ---------- 41 | .. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions, 42 | Journal of Classification 1985 43 | https://link.springer.com/article/10.1007%2FBF01908075 44 | .. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie 45 | adjusted Rand index, Psychological Methods 2004 46 | .. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index 47 | """ 48 | (tn, fp), (fn, tp) = pair_confusion_matrix( 49 | labels_true, labels_pred, reweigh=reweigh 50 | ) 51 | # convert to Python integer types, to avoid overflow or underflow 52 | tn, fp, fn, tp = int(tn), int(fp), int(fn), int(tp) 53 | 54 | # Special cases: empty data or full agreement 55 | if fn == 0 and fp == 0: 56 | return 1.0 57 | 58 | return 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) 59 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import pyximport 2 | import numpy 3 | 4 | pyximport.install(setup_args={"include_dirs": numpy.get_include()}, reload_support=True) 5 | from ._emi_cython import expected_mutual_information 6 | from .contingency import pair_confusion_matrix, contingency_matrix 7 | from .checks import check_clusterings 8 | from .mi import mutual_info_score, entropy 9 | from .avg import generalized_average 10 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/_emi_cython.pyx: -------------------------------------------------------------------------------- 1 | # Authors: Robert Layton 2 | # Corey Lynch 3 | # License: BSD 3 clause 4 | 5 | from libc.math cimport exp, lgamma 6 | from scipy.special import gammaln 7 | import numpy as np 8 | cimport numpy as np 9 | cimport cython 10 | 11 | np.import_array() 12 | ctypedef np.float64_t DOUBLE 13 | 14 | def expected_mutual_information(contingency, int n_samples): 15 | """Calculate the expected mutual information for two labelings.""" 16 | cdef int R, C 17 | cdef DOUBLE N, gln_N, emi, term2, term3, gln 18 | cdef np.ndarray[DOUBLE] gln_a, gln_b, gln_Na, gln_Nb, gln_nij, log_Nnij 19 | cdef np.ndarray[DOUBLE] nijs, term1 20 | cdef np.ndarray[DOUBLE] log_a, log_b 21 | cdef np.ndarray[np.int32_t] a, b 22 | #cdef np.ndarray[int, ndim=2] start, end 23 | R, C = contingency.shape 24 | N = n_samples 25 | a = np.ravel(contingency.sum(axis=1).astype(np.int32, copy=False)) 26 | b = np.ravel(contingency.sum(axis=0).astype(np.int32, copy=False)) 27 | # There are three major terms to the EMI equation, which are multiplied to 28 | # and then summed over varying nij values. 29 | # While nijs[0] will never be used, having it simplifies the indexing. 30 | nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float') 31 | nijs[0] = 1 # Stops divide by zero warnings. As its not used, no issue. 32 | # term1 is nij / N 33 | term1 = nijs / N 34 | # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) 35 | log_a = np.log(a) 36 | log_b = np.log(b) 37 | # term2 uses log(N * nij) = log(N) + log(nij) 38 | log_Nnij = np.log(N) + np.log(nijs) 39 | # term3 is large, and involved many factorials. Calculate these in log 40 | # space to stop overflows. 
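    # Each gln_* value below is a log-factorial (gammaln(x + 1) == log(x!)).
    # Inside the triple loop they are combined into 'gln', the log of the
    # hypergeometric probability of observing exactly nij samples in cell
    # (i, j) given fixed marginals a[i], b[j] and N total samples, so
    # term3 = exp(gln) weights each term1 * term2 contribution by how likely
    # that cell count is under random labellings with fixed marginals
    # (Vinh, Epps and Bailey, 2010).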
41 | gln_a = gammaln(a + 1) 42 | gln_b = gammaln(b + 1) 43 | gln_Na = gammaln(N - a + 1) 44 | gln_Nb = gammaln(N - b + 1) 45 | gln_N = gammaln(N + 1) 46 | gln_nij = gammaln(nijs + 1) 47 | # start and end values for nij terms for each summation. 48 | start = np.array([[v - N + w for w in b] for v in a], dtype='int') 49 | start = np.maximum(start, 1) 50 | end = np.minimum(np.resize(a, (C, R)).T, np.resize(b, (R, C))) + 1 51 | # emi itself is a summation over the various values. 52 | emi = 0.0 53 | cdef Py_ssize_t i, j, nij 54 | for i in range(R): 55 | for j in range(C): 56 | for nij in range(start[i,j], end[i,j]): 57 | term2 = log_Nnij[nij] - log_a[i] - log_b[j] 58 | # Numerators are positive, denominators are negative. 59 | gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] 60 | - gln_N - gln_nij[nij] - lgamma(a[i] - nij + 1) 61 | - lgamma(b[j] - nij + 1) 62 | - lgamma(N - a[i] - b[j] + nij + 1)) 63 | term3 = exp(gln) 64 | emi += (term1[nij] * term2 * term3) 65 | return emi -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/avg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def generalized_average(U, V, average_method): 5 | """Return a particular mean of two numbers.""" 6 | if average_method == "min": 7 | return min(U, V) 8 | elif average_method == "geometric": 9 | return np.sqrt(U * V) 10 | elif average_method == "arithmetic": 11 | return np.mean([U, V]) 12 | elif average_method == "max": 13 | return max(U, V) 14 | else: 15 | raise ValueError( 16 | "'average_method' must be 'min', 'geometric', 'arithmetic', or 'max'" 17 | ) 18 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/contingency.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from sklearn.utils import sparsefuncs 4 | 5 | 6 | def contingency_matrix( 7 | labels_true, labels_pred, *, reweigh=False, eps=None, sparse=False, dtype=np.int64 8 | ): 9 | """Build a contingency matrix describing the relationship between labels. 10 | Parameters 11 | ---------- 12 | labels_true : int array, shape = [n_samples] 13 | Ground truth class labels to be used as a reference. 14 | labels_pred : array-like of shape (n_samples,) 15 | Cluster labels to evaluate. 16 | reweigh : bool, default=False 17 | if `True`, reweighs the contingency table based on the true labels 18 | such that they all have equal membership. The total number of samples 19 | is preserved with a round-off error. 20 | eps : float, default=None 21 | If a float, that value is added to all values in the contingency 22 | matrix. This helps to stop NaN propagation. 23 | If ``None``, nothing is adjusted. 24 | sparse : bool, default=False 25 | If `True`, return a sparse CSR continency matrix. If `eps` is not 26 | `None` and `sparse` is `True` will raise ValueError. 27 | .. versionadded:: 0.18 28 | dtype : numeric type, default=np.int64 29 | Output dtype. Ignored if `eps` is not `None`. 30 | .. versionadded:: 0.24 31 | Returns 32 | ------- 33 | contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred] 34 | Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in 35 | true class :math:`i` and in predicted class :math:`j`. If 36 | ``eps is None``, the dtype of this array will be integer unless set 37 | otherwise with the ``dtype`` argument. 
If ``eps`` is given, the dtype 38 | will be float. 39 | Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``. 40 | """ 41 | 42 | if eps is not None and sparse: 43 | raise ValueError("Cannot set 'eps' when sparse=True") 44 | 45 | classes, class_idx = np.unique(labels_true, return_inverse=True) 46 | clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) 47 | n_classes = classes.shape[0] 48 | n_clusters = clusters.shape[0] 49 | # Using coo_matrix to accelerate simple histogram calculation, 50 | # i.e. bins are consecutive integers 51 | # Currently, coo_matrix is faster than histogram2d for simple cases 52 | contingency = sp.coo_matrix( 53 | (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), 54 | shape=(n_classes, n_clusters), 55 | dtype=dtype, 56 | ) 57 | if sparse: 58 | contingency = contingency.tocsr() 59 | contingency.sum_duplicates() 60 | else: 61 | contingency = contingency.toarray() 62 | if eps is not None: 63 | # don't use += as contingency is integer 64 | contingency = contingency + eps 65 | # reweight contingency table if indicated 66 | if reweigh is True: 67 | contingency = contingency.astype(np.float64) 68 | counts_sum_per_class = np.ravel(contingency.sum(1)) 69 | target = round(np.mean(counts_sum_per_class)) 70 | counts_norm = counts_sum_per_class / target 71 | sparsefuncs.inplace_row_scale(contingency, 1 / counts_norm) 72 | contingency = contingency.astype(np.int64) 73 | 74 | return contingency 75 | 76 | 77 | def pair_confusion_matrix(labels_true, labels_pred, reweigh=False): 78 | """Pair confusion matrix arising from two clusterings. 79 | The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix 80 | between two clusterings by considering all pairs of samples and counting 81 | pairs that are assigned into the same or into different clusters under 82 | the true and predicted clusterings. 83 | Considering a pair of samples that is clustered together a positive pair, 84 | then as in binary classification the count of true negatives is 85 | :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is 86 | :math:`C_{11}` and false positives is :math:`C_{01}`. 87 | Read more in the :ref:`User Guide `. 88 | Parameters 89 | ---------- 90 | labels_true : array-like of shape (n_samples,), dtype=integral 91 | Ground truth class labels to be used as a reference. 92 | labels_pred : array-like of shape (n_samples,), dtype=integral 93 | Cluster labels to evaluate. 94 | reweigh : bool, default=False 95 | if `True`, reweighs the contingency table based on the true labels 96 | such that they all have equal membership. The total number of samples 97 | is preserved with a round-off error. 98 | Returns 99 | ------- 100 | C : ndarray of shape (2, 2), dtype=np.int64 101 | The contingency matrix. 102 | ------ 103 | Note that the matrix is not symmetric. 104 | ------ 105 | References 106 | ---------- 107 | .. L. Hubert and P. 
Arabie, Comparing Partitions, Journal of 108 | Classification 1985 109 | https://link.springer.com/article/10.1007%2FBF01908075 110 | """ 111 | n_samples = np.int64(labels_true.shape[0]) 112 | 113 | # Computation using the contingency data 114 | contingency = contingency_matrix( 115 | labels_true, labels_pred, reweigh=reweigh, sparse=True, dtype=np.int64 116 | ) 117 | n_c = np.ravel(contingency.sum(axis=1)) 118 | n_k = np.ravel(contingency.sum(axis=0)) 119 | sum_squares = (contingency.data**2).sum() 120 | C = np.empty((2, 2), dtype=np.int64) 121 | C[1, 1] = sum_squares - n_samples 122 | C[0, 1] = contingency.dot(n_k).sum() - sum_squares 123 | C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares 124 | C[0, 0] = n_samples**2 - C[0, 1] - C[1, 0] - sum_squares 125 | return C 126 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/mi.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | 3 | import numpy as np 4 | import scipy.sparse as sp 5 | 6 | from .checks import check_clusterings, check_array 7 | from .contingency import contingency_matrix 8 | 9 | 10 | def entropy(labels): 11 | """Calculates the entropy for a labeling. 12 | Parameters 13 | ---------- 14 | labels : int array, shape = [n_samples] 15 | The labels 16 | Notes 17 | ----- 18 | The logarithm used is the natural logarithm (base-e). 19 | """ 20 | if len(labels) == 0: 21 | return 1.0 22 | label_idx = np.unique(labels, return_inverse=True)[1] 23 | pi = np.bincount(label_idx).astype(np.float64) 24 | pi = pi[pi > 0] 25 | pi_sum = np.sum(pi) 26 | # log(a / b) should be calculated as log(a) - log(b) for 27 | # possible loss of precision 28 | return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) 29 | 30 | 31 | def mutual_info_score(labels_true, labels_pred, *, contingency=None): 32 | """Mutual Information between two clusterings. 33 | The Mutual Information is a measure of the similarity between two labels 34 | of the same data. Where :math:`|U_i|` is the number of the samples 35 | in cluster :math:`U_i` and :math:`|V_j|` is the number of the 36 | samples in cluster :math:`V_j`, the Mutual Information 37 | between clusterings :math:`U` and :math:`V` is given as: 38 | .. math:: 39 | MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N} 40 | \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|} 41 | This metric is independent of the absolute values of the labels: 42 | a permutation of the class or cluster label values won't change the 43 | score value in any way. 44 | This metric is furthermore symmetric: switching :math:`U` (i.e 45 | ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the 46 | same score value. This can be useful to measure the agreement of two 47 | independent label assignments strategies on the same dataset when the 48 | real ground truth is not known. 49 | Read more in the :ref:`User Guide `. 50 | Parameters 51 | ---------- 52 | labels_true : int array, shape = [n_samples] 53 | A clustering of the data into disjoint subsets, called :math:`U` in 54 | the above formula. 55 | labels_pred : int array-like of shape (n_samples,) 56 | A clustering of the data into disjoint subsets, called :math:`V` in 57 | the above formula. 58 | contingency : {ndarray, sparse matrix} of shape \ 59 | (n_classes_true, n_classes_pred), default=None 60 | A contingency matrix given by the :func:`contingency_matrix` function. 
61 | If value is ``None``, it will be computed, otherwise the given value is 62 | used, with ``labels_true`` and ``labels_pred`` ignored. 63 | Returns 64 | ------- 65 | mi : float 66 | Mutual information, a non-negative value, measured in nats using the 67 | natural logarithm. 68 | Notes 69 | ----- 70 | The logarithm used is the natural logarithm (base-e). 71 | See Also 72 | -------- 73 | adjusted_mutual_info_score : Adjusted against chance Mutual Information. 74 | normalized_mutual_info_score : Normalized Mutual Information. 75 | """ 76 | if contingency is None: 77 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 78 | contingency = contingency_matrix(labels_true, labels_pred, sparse=True) 79 | else: 80 | contingency = check_array( 81 | contingency, 82 | accept_sparse=["csr", "csc", "coo"], 83 | dtype=[int, np.int32, np.int64], 84 | ) 85 | 86 | if isinstance(contingency, np.ndarray): 87 | # For an array 88 | nzx, nzy = np.nonzero(contingency) 89 | nz_val = contingency[nzx, nzy] 90 | elif sp.issparse(contingency): 91 | # For a sparse matrix 92 | nzx, nzy, nz_val = sp.find(contingency) 93 | else: 94 | raise ValueError("Unsupported type for 'contingency': %s" % type(contingency)) 95 | 96 | contingency_sum = contingency.sum() 97 | pi = np.ravel(contingency.sum(axis=1)) 98 | pj = np.ravel(contingency.sum(axis=0)) 99 | log_contingency_nm = np.log(nz_val) 100 | contingency_nm = nz_val / contingency_sum 101 | # Don't need to calculate the full outer product, just for non-zeroes 102 | outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype( 103 | np.int64, copy=False 104 | ) 105 | log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) 106 | mi = ( 107 | contingency_nm * (log_contingency_nm - log(contingency_sum)) 108 | + contingency_nm * log_outer 109 | ) 110 | mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi) 111 | return np.clip(mi.sum(), 0.0, None) 112 | -------------------------------------------------------------------------------- /workflow/src/python/loaders/rna_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy 4 | import scipy.io as sio 5 | import scanpy as sc 6 | import anndata as ann 7 | 8 | def adata -------------------------------------------------------------------------------- /workflow/src/python/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .integrate import Integration 2 | from .integrate_ti import IntegrationPAGA 3 | from .seurat_integrate import SeuratIntegrate 4 | from .liger_integrate import LigerIntegrate 5 | from .clustering import cluster_num, leiden_clip, cluster_membership 6 | from .sample import downsample 7 | from .diffexp import diffexp, dge_top_n, set_concordance 8 | from .cluster_concordance import cluster_concordance 9 | from .kmeans import faiss_kmeans 10 | from .seurat_reference_mapping import SeuratReferenceMap 11 | from .mnn import mutual_nn, find_mutual_nn, find_knn, cross_data_knn 12 | from .relatedness import relatedness_score 13 | from .umap import Umap -------------------------------------------------------------------------------- /workflow/src/python/utils/cluster_concordance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics 4 | 5 | def cluster_concordance(adata): 6 | # Pull out clustering values per subset 7 | methods = 
["harmony", "scvi", "bbknn", "scanorama", "seurat", "liger"] 8 | adata_subsets = [] 9 | for method in methods: 10 | adata_subsets.append(adata[adata.obs["integration_method"] == method]) 11 | cluster_subsets = [ 12 | adata_subset.obs["leiden"].__array__() for adata_subset in adata_subsets 13 | ] 14 | 15 | # Get ARI values across subsets 16 | ari_vals_mat = np.zeros((len(cluster_subsets), len(cluster_subsets))) 17 | for i, clusters_i in enumerate(cluster_subsets): 18 | for j, clusters_j in enumerate(cluster_subsets): 19 | ari_val = metrics.adjusted_rand_score(clusters_i, clusters_j) 20 | ari_vals_mat[i, j] = ari_val 21 | 22 | # Create dataframe of values 23 | ari_val_df = pd.DataFrame(ari_vals_mat) 24 | ari_val_df.index = methods 25 | ari_val_df.columns = methods 26 | 27 | # Get global ARI value (median) 28 | ari_vals_mat_nan_diag = np.zeros((len(cluster_subsets), len(cluster_subsets))) 29 | for i, clusters_i in enumerate(cluster_subsets): 30 | for j, clusters_j in enumerate(cluster_subsets): 31 | if i == j: 32 | ari_vals_mat_nan_diag[i, j] = np.nan 33 | else: 34 | ari_val = metrics.adjusted_rand_score(clusters_i, clusters_j) 35 | ari_vals_mat_nan_diag[i, j] = ari_val 36 | 37 | ari_vals_mat_no_diag = ari_vals_mat_nan_diag[~np.isnan(ari_vals_mat_nan_diag)] 38 | median_ari = np.median(ari_vals_mat_no_diag) 39 | 40 | # Convert concordance dataframe to long format 41 | ari_val_df_long = ari_val_df.melt(ignore_index = False) 42 | ari_val_df_long = ari_val_df_long.reset_index() 43 | ari_val_df_long.columns = ["Method 1", "Method 2", "ARI"] 44 | 45 | # Append median ARI value to dataframe and return 46 | ari_val_df_long["Median ARI"] = median_ari 47 | 48 | return ari_val_df_long -------------------------------------------------------------------------------- /workflow/src/python/utils/clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scanpy as sc 3 | import anndata as ann 4 | 5 | def cluster_num(adata): 6 | clusters_unique = np.unique(adata.obs.leiden.__array__()) 7 | clusters_len = len(clusters_unique) 8 | return clusters_len 9 | 10 | def cluster_membership(adata): 11 | clusters_unique = np.unique(adata.obs.leiden.__array__()) 12 | clusters_membership = dict() 13 | for i in clusters_unique: 14 | clusters_membership[i] = len(adata.obs[adata.obs.leiden == i]) 15 | return clusters_membership 16 | 17 | def leiden_clip(adata, num_clusters, step_size = 0.05): 18 | counter = 0 19 | leiden_resolution = 1 20 | while cluster_num(adata) != num_clusters: 21 | if cluster_num(adata) < num_clusters: 22 | leiden_resolution += step_size 23 | sc.tl.leiden(adata, resolution = leiden_resolution) 24 | elif cluster_num(adata) > num_clusters: 25 | leiden_resolution -= step_size 26 | sc.tl.leiden(adata, resolution = leiden_resolution) 27 | counter += 1 28 | if counter > 100: 29 | raise Exception( 30 | "Attempted more than 100 iterations - convergence not possible, set lower step size" 31 | ) 32 | return adata 33 | -------------------------------------------------------------------------------- /workflow/src/python/utils/diffexp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scanpy as sc 4 | import anndata as ann 5 | 6 | def diffexp(adata, groupby, **kwargs): 7 | """ 8 | Perform differential expression analysis on an AnnData object. 9 | 10 | Args: 11 | adata (AnnData):Annotated data matrix object. 
12 | groupby (str): The column name of the dataframe to group by. 13 | **kwargs: Keyword arguments to be passed to scanpy.tl.rank_genes_groups. 14 | 15 | Returns: 16 | adata (AnnData): Annotated data matrix object with differential expression analysis results. 17 | """ 18 | sc.tl.rank_genes_groups(adata, groupby = groupby, **kwargs) 19 | return adata 20 | 21 | def dge_top_n(adata, n, obs_group): 22 | """ 23 | Return clusters and genes with the top n differential expression. 24 | 25 | Args: 26 | adata (AnnData): Annotated data matrix object. 27 | n (int): The number of top differentially expressed genes to return. 28 | obs_group (str): The column name in obs of adata object to group by. 29 | Returns: 30 | data (DataFrame): A dataframe with the top n differentially expressed genes in each cluster. 31 | """ 32 | unique_groups = np.sort(np.unique(adata.obs[obs_group].__array__())) 33 | unique_group_top_n_dges = [] 34 | for group in unique_groups: 35 | score_df = sc.get.rank_genes_groups_df(adata, group = group) 36 | score_df_sorted = score_df.sort_values(["pvals_adj"], ascending = True) 37 | top_n_dges = score_df_sorted[0:n]["names"].__array__() 38 | unique_group_top_n_dges.append(top_n_dges) 39 | unique_groups_long = np.repeat(unique_groups, n) 40 | 41 | group_dges_df_n = pd.DataFrame({ 42 | "Cluster": unique_groups_long, 43 | "Top {n} DGEs".format(n = n): np.concatenate(unique_group_top_n_dges) 44 | }) 45 | return group_dges_df_n 46 | 47 | def set_concordance(*args): 48 | """Determines number of overlapping elements between n sets. 49 | 50 | Args: 51 | *args: A list of n sets. 52 | 53 | Returns: 54 | concordance (int): The number of overlapping elements between n sets. 55 | """ 56 | concordance = len(set.intersection(*args)) 57 | return concordance 58 | -------------------------------------------------------------------------------- /workflow/src/python/utils/integrate.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import gc 3 | import random 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scanpy as sc 8 | import anndata as ann 9 | import scvi 10 | import bbknn 11 | import torch 12 | 13 | # Undoing scvi's random seed setting 14 | random.seed(None) 15 | np.random.seed(None) 16 | torch.manual_seed(random.randint(1, 10000000000000000000)) 17 | 18 | from utils.seurat_integrate import SeuratIntegrate 19 | from utils.liger_integrate import LigerIntegrate 20 | 21 | class Integration: 22 | """Class for integrating scRNA-seq data and returning processed data.""" 23 | 24 | def __init__(self, adata, gpu = True): 25 | """ 26 | Args: 27 | adata (AnnData): AnnData object to be utilized in integration methods. 28 | Assumes that the counts being input are unnormalized (raw counts), 29 | and that raw counts are stored in "counts" layer, and batch covariate 30 | is available. 31 | gpu (bool): Whether or not to use GPU for scVI. 32 | """ 33 | self.adata = adata 34 | # Check anndata object 35 | if not isinstance(adata, ann.AnnData): 36 | raise Exception("Please input an AnnData object.") 37 | # Check if gpu is available 38 | if gpu is True: 39 | if torch.cuda.is_available(): 40 | self.gpu = True 41 | else: 42 | raise Exception("GPU not available. Please set gpu = False.") 43 | else: 44 | self.gpu = False 45 | 46 | def scvi_integrate(self, n_neighbors = 15, n_pcs = 20): 47 | print("Performing scVI integration.." 
+ "\n") 48 | ascvi = self.adata.copy() 49 | scvi.data.setup_anndata(ascvi, batch_key = "batch") 50 | vae = scvi.model.SCVI(ascvi) 51 | vae.train(use_gpu = self.gpu) 52 | ascvi.obsm["X_scVI"] = vae.get_latent_representation() 53 | ascvi.obsm["X_kmeans"] = ascvi.obsm["X_scVI"][:, 0:n_pcs] 54 | sc.pp.neighbors( 55 | ascvi, 56 | n_neighbors = n_neighbors, 57 | n_pcs = n_pcs, 58 | use_rep = "X_scVI" 59 | ) 60 | sc.tl.leiden(ascvi) 61 | sc.tl.umap(ascvi) 62 | print("Done!" + "\n") 63 | return ascvi 64 | 65 | def harmony_integrate(self, n_neighbors = 15, n_pcs = 20, num_hvgs = 2500): 66 | print("Performing Harmony integration.." + "\n") 67 | aharmony = self.adata.copy() 68 | sc.pp.normalize_total( 69 | aharmony, 70 | target_sum = 1e4 71 | ) 72 | sc.pp.log1p(aharmony) 73 | sc.pp.highly_variable_genes( 74 | aharmony, 75 | n_top_genes = num_hvgs, 76 | flavor = "seurat" 77 | ) 78 | sc.pp.pca(aharmony, svd_solver="arpack") 79 | sc.external.pp.harmony_integrate( 80 | aharmony, 81 | key = "batch", 82 | random_state = None 83 | ) 84 | sc.pp.neighbors( 85 | aharmony, 86 | n_neighbors = n_neighbors, 87 | n_pcs = n_pcs, 88 | use_rep = "X_pca_harmony" 89 | ) 90 | aharmony.obsm["X_kmeans"] = aharmony.obsm["X_pca_harmony"][:, 0:n_pcs] 91 | sc.tl.leiden(aharmony) 92 | sc.tl.umap(aharmony) 93 | print("Done!" + "\n") 94 | return aharmony 95 | 96 | def bbknn_integrate(self, n_pcs = 20, num_hvgs = 2500, metric = "euclidean"): 97 | print("Performing BBKNN integration.." + "\n") 98 | abbknn = self.adata.copy() 99 | sc.pp.normalize_total( 100 | abbknn, 101 | target_sum = 1e4 102 | ) 103 | sc.pp.log1p(abbknn) 104 | sc.pp.highly_variable_genes( 105 | abbknn, 106 | n_top_genes = num_hvgs, 107 | flavor = "seurat" 108 | ) 109 | sc.pp.pca(abbknn, svd_solver = "arpack") 110 | if metric == "euclidean": 111 | bbknn.bbknn( 112 | abbknn, 113 | approx = False, 114 | metric = "euclidean", 115 | batch_key = "batch", 116 | n_pcs = n_pcs, 117 | pynndescent_random_state = None 118 | ) 119 | elif metric == "angular": 120 | bbknn.bbknn( 121 | abbknn, 122 | approx = True, 123 | metric = "angular", 124 | batch_key = "batch", 125 | n_pcs = n_pcs, 126 | pynndescent_random_state = None 127 | ) 128 | else: 129 | raise Exception( 130 | "Please enter either 'euclidean' or 'angular' for 'metric'" 131 | ) 132 | # Add placeholder for kmeans 133 | abbknn.obsm["X_kmeans"] = np.ones(( 134 | abbknn.obsm["X_pca"].shape[0], 135 | abbknn.obsm["X_pca"].shape[1] 136 | )) 137 | sc.tl.leiden(abbknn) 138 | sc.tl.umap(abbknn) 139 | print("Done!" + "\n") 140 | return abbknn 141 | 142 | def scanorama_integrate(self, n_neighbors = 15, n_pcs = 20, num_hvgs = 2500): 143 | print("Performing Scanorama integration.." + "\n") 144 | ascanorama = self.adata.copy() 145 | sc.pp.normalize_total( 146 | ascanorama, 147 | target_sum = 1e4 148 | ) 149 | sc.pp.log1p(ascanorama) 150 | sc.pp.highly_variable_genes( 151 | ascanorama, 152 | n_top_genes = num_hvgs, 153 | flavor = "seurat" 154 | ) 155 | sc.pp.pca(ascanorama, svd_solver="arpack") 156 | sc.external.pp.scanorama_integrate( 157 | ascanorama, 158 | key = "batch" 159 | ) 160 | sc.pp.neighbors( 161 | ascanorama, 162 | n_neighbors = n_neighbors, 163 | n_pcs = n_pcs, 164 | use_rep = "X_scanorama" 165 | ) 166 | ascanorama.obsm["X_kmeans"] = ascanorama.obsm["X_scanorama"][:, 0:n_pcs] 167 | sc.tl.leiden(ascanorama) 168 | sc.tl.umap(ascanorama) 169 | print("Done!" + "\n") 170 | return ascanorama 171 | 172 | def seurat_integrate(self,int_type = "CCA", n_neighbors = 15, n_pcs = 20): 173 | print("Performing Seurat integration.." 
+ "\n") 174 | aseurat = self.adata.copy() 175 | sc.pp.normalize_total( 176 | aseurat, 177 | target_sum = 1e4 178 | ) 179 | sc.pp.log1p(aseurat) 180 | seurat_integrate = SeuratIntegrate( 181 | adata = aseurat, 182 | int_type = int_type 183 | ) 184 | aseurat_int = seurat_integrate.integrate() # Create seurat integrated anndata object 185 | sc.pp.pca(aseurat_int, svd_solver = "arpack") 186 | sc.pp.neighbors( 187 | aseurat_int, 188 | n_neighbors = n_neighbors, 189 | n_pcs = n_pcs 190 | ) 191 | sc.tl.leiden(aseurat_int) 192 | sc.tl.umap(aseurat_int) 193 | # Append seurat integrated data to original adata object 194 | aseurat.obs["leiden"] = aseurat_int.obs["leiden"] 195 | aseurat.obsm["X_pca"] = aseurat_int.obsm["X_pca"] 196 | aseurat.obsm["X_kmeans"] = aseurat_int.obsm["X_pca"][:, 0:n_pcs] 197 | aseurat.obsm["X_umap"] = aseurat_int.obsm["X_umap"] 198 | aseurat.obsm["seurat_hvg"] = aseurat_int.X 199 | print("Done!" + "\n") 200 | return aseurat 201 | 202 | def liger_integrate(self, n_neighbors = 15, n_pcs = 20): 203 | print("Performing LIGER integration.." + "\n") 204 | aliger = self.adata.copy() 205 | # Don't normalize for LIGER (R script normalizes) 206 | # sc.pp.normalize_total( 207 | # aliger, 208 | # target_sum = 1e4 209 | # ) 210 | # Don't log-transform for LIGER 211 | # sc.pp.log1p(aliger) 212 | liger_integrate = LigerIntegrate( 213 | adata = aliger, 214 | ) 215 | aliger = liger_integrate.integrate() # Substitute liger integrated anndata object 216 | sc.pp.neighbors( 217 | aliger, 218 | n_neighbors = n_neighbors, 219 | n_pcs = n_pcs, 220 | use_rep = "X_liger" 221 | ) 222 | aliger.obsm["X_kmeans"] = aliger.obsm["X_liger"][:, 0:n_pcs] 223 | sc.tl.leiden(aliger) 224 | sc.tl.umap(aliger) 225 | print("Done!" + "\n") 226 | return aliger -------------------------------------------------------------------------------- /workflow/src/python/utils/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import faiss 3 | 4 | # Perform k-means clustering using Faiss 5 | def faiss_kmeans(adata, k, niter = 300, nredo = 10, 6 | min_points_per_centroid = 5): 7 | """Function to perform k-means clustering using FAISS on AnnData objects. 8 | 9 | Args: 10 | adata (AnnData): An object of AnnData class with highly variable gene selection 11 | performed. 12 | k (int): Number of clusters to form. 13 | niter (int): Number of iterations to run k-means. Defaults to 300. 14 | nredo (int): Number of times to run k-means - selects best result. 15 | Defaults to 10. 16 | min_points_per_centroid (int): Minimum number of points per k-means 17 | centroid. Defaults to 5. 
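Returns:
    adata (AnnData): The input AnnData object with k-means cluster labels (as strings) stored in adata.obs["kmeans_faiss"].
    k (int): The final number of clusters used - this can be lower than the requested k, since k is decremented whenever a cluster falls below min_points_per_centroid.

Example (illustrative sketch only - assumes an AnnData object whose .obsm["X_kmeans"] slot has already been populated, e.g. by one of the Integration class methods):
    adata_clustered, k_used = faiss_kmeans(adata, k = 10)
    adata_clustered.obs["kmeans_faiss"].value_counts()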
18 | """ 19 | # Subset data to kmeans reduction to utilize for clustering 20 | reduction_sub = adata.obsm["X_kmeans"] 21 | 22 | # Run k-means using faiss given options 23 | kmeans_faiss = faiss.Kmeans( 24 | d = reduction_sub.shape[1], 25 | k = k, 26 | niter = niter, 27 | nredo = nredo, 28 | min_points_per_centroid = min_points_per_centroid 29 | ) 30 | kmeans_faiss.train(np.ascontiguousarray(reduction_sub, dtype = np.float32)) 31 | kmeans_faiss_labels = np.concatenate( 32 | kmeans_faiss.index.search( 33 | np.ascontiguousarray(reduction_sub, dtype = np.float32), 1 34 | )[1] 35 | ) 36 | 37 | # Check if any clusters have less than the min required members and redo clustering with less 38 | unique_labels, counts = np.unique(kmeans_faiss_labels, return_counts = True) 39 | while any(counts < min_points_per_centroid): 40 | k -= 1 41 | kmeans_faiss = faiss.Kmeans( 42 | d = reduction_sub.shape[1], 43 | k = k, 44 | niter = niter, 45 | nredo = nredo, 46 | min_points_per_centroid = min_points_per_centroid 47 | ) 48 | kmeans_faiss.train(np.ascontiguousarray(reduction_sub, dtype = np.float32)) 49 | kmeans_faiss_labels = np.concatenate( 50 | kmeans_faiss.index.search( 51 | np.ascontiguousarray(reduction_sub, dtype = np.float32), 1 52 | )[1] 53 | ) 54 | unique_labels, counts = np.unique(kmeans_faiss_labels, return_counts = True) 55 | 56 | # Append kmeans labels to AnnData object 57 | kmeans_faiss_labels_str = kmeans_faiss_labels.astype("str") 58 | adata.obs['kmeans_faiss'] = kmeans_faiss_labels_str 59 | 60 | # Return AnnData object and kmeans number 61 | return adata, k -------------------------------------------------------------------------------- /workflow/src/python/utils/liger_integrate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import uuid 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy as sp 9 | import anndata as ann 10 | import scanpy as sc 11 | 12 | class LigerIntegrate: 13 | """ 14 | Class for interpolating between the Integration class and R-script 15 | based integration of RNA-seq batches using the LIGER R package. 16 | Integration is done on data output to a temporary file from the 17 | Integration class through and RScript, which then outputs a temporary 18 | file reread into python code and used to substitute the unintegrated data. 19 | Integration is performed using 20 latent components in the NMF factorization, 20 | or 20 "metagenes". 21 | """ 22 | def __init__(self, adata): 23 | """ 24 | Args: 25 | adata (object): An instance of an anndata class corresponding to the liger 26 | subset from the Integration class. 
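Example (illustrative sketch only - assumes the code is run from the workflow directory so that src/R/liger_integrate.R is on the relative path, and that the LIGER R dependencies are installed):
    liger = LigerIntegrate(adata = adata_subset)
    adata_integrated = liger.integrate()
    adata_integrated.obsm["X_liger"]  # normalized per-cell factor loadings from LIGER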
27 | """ 28 | self.adata = adata.copy() 29 | self.adata_copy = adata.copy() # Keep copy for later referencing 30 | 31 | def _format(self): 32 | # Append a column on gene names 33 | self.adata.var["gene"] = self.adata.var_names 34 | # Remove layers and raw from AnnData object (avoid conflicts with h5seurat) 35 | self.adata.layers = None 36 | self.adata.raw = None 37 | 38 | def _output_temp_h5ad(self): 39 | # Check if temp exists, if not, make dir 40 | if not os.path.exists("tmp"): 41 | os.makedirs("tmp") 42 | 43 | # Output temporary file with data 44 | self.filename = ''.join(str(uuid.uuid4()).split("-")) 45 | self.file = "{filename}.h5ad".format(filename = self.filename) 46 | self.adata.write_h5ad(os.path.join("tmp", self.file)) 47 | 48 | def _liger_integrate(self): 49 | # Call subprocess and call R script 50 | tempfile_script = \ 51 | "Rscript src/R/liger_integrate.R tmp/{tempfile} {tempfile_name} --verbose".format( 52 | tempfile = self.file, 53 | tempfile_name = self.filename 54 | ) 55 | 56 | self.sp_integrate = subprocess.run(tempfile_script, shell = True, text = True, capture_output = True) 57 | if self.sp_integrate.returncode != 0: 58 | raise Exception( 59 | "Subprocess call returned nonzero exit code - call: {call} \n Output: {output}".format( 60 | call = self.sp_integrate.stderr, 61 | output = self.sp_integrate.stdout 62 | ) 63 | ) 64 | 65 | def _return_integrated(self): 66 | # Get liger output file and read it into memory as anndata object 67 | self.liger_outfile = "{filename}_liger_out.h5ad".format(filename = self.filename) 68 | adata_liger = sc.read_h5ad( 69 | os.path.join("tmp", self.liger_outfile) 70 | ) 71 | 72 | # Read in cell-specific loadings from anndata object and convert to array 73 | norm_loadings_arr = adata_liger.X.toarray() 74 | 75 | # Add normalized loadings to anndata object (original copy) 76 | self.adata_copy.obsm["X_liger"] = norm_loadings_arr 77 | 78 | # Return integrated AnnData object 79 | return self.adata_copy 80 | 81 | def _clean_files(self): 82 | # Remove temporary python and liger files 83 | tmp_files = os.listdir("tmp") 84 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 85 | for f in tmp_files_instance: 86 | os.remove(os.path.join("tmp", f)) 87 | 88 | # Check if all files related to the filename are removed from folder 89 | tmp_files = os.listdir("tmp") 90 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 91 | if len(tmp_files_instance) > 0: 92 | raise Exception( 93 | "Temporary file cleanup incomplete - files remain in folder" 94 | ) 95 | 96 | def integrate(self): 97 | # Perform workflow and return integrated anndata object 98 | self._format() 99 | self._output_temp_h5ad() 100 | self._liger_integrate() 101 | integrated_adata = self._return_integrated() 102 | self._clean_files() 103 | 104 | return integrated_adata -------------------------------------------------------------------------------- /workflow/src/python/utils/mnn.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | 3 | import faiss 4 | import numpy as np 5 | 6 | def find_mutual_nn(data_list, k = 15): 7 | """Gets mutual nearest neighbors pairs across all datasets. 8 | 9 | Using each data subset in data_list, gets mutual nearest neighbors by 10 | considering the intersection of MNNs across all datasets. 11 | 12 | Args: 13 | data_list (list): List of numpy arrays corresponding to data subsets. Datasets 14 | must have an equivalent number of features. 
15 | k (integer): Positive integer value indicating how many neighbors to consider 16 | in the mutual-nearest-neighbors algorithm. Default value is 15. 17 | 18 | Returns: 19 | mnn_1_concat (array): Array of values corresponding to query MNN indices (MNN_1) that can be in 20 | any dataset, and are indexed based on the concatenated representation of all datasets in 21 | dataset_list. 22 | mnn_2_concat (array): Array of values corresponding to query-value MNN indices (MNN_2) that can 23 | be in any dataset, and are indexed based on the concatenated representation of all datasets in 24 | dataset_list. 25 | """ 26 | # Get lengths of all datasets for reindexing 27 | data_lens = [len(dataset) for dataset in data_list] 28 | 29 | # Get all indices 30 | indices = [i for i in range(len(data_list))] 31 | 32 | # Create combinations for all indices 33 | index_combo_iter = combinations(indices, 2) 34 | index_combos = [combo for combo in index_combo_iter] 35 | 36 | # Iterate over combinations and record mnn pairs - append to both lists 37 | mnn_1_list = [] 38 | mnn_2_list = [] 39 | for combo in index_combos: 40 | idx_1 = combo[0] 41 | idx_2 = combo[1] 42 | data_1 = data_list[idx_1] 43 | data_2 = data_list[idx_2] 44 | mnn_1, mnn_2 = mutual_nn(data_1, data_2, k1=k, k2=k) 45 | if idx_1 == 0: 46 | pass 47 | else: 48 | len_addition = sum(data_lens[0:idx_1]) 49 | mnn_1 = mnn_1 + len_addition 50 | if idx_2 == 0: 51 | pass 52 | else: 53 | len_addition = sum(data_lens[0:idx_2]) 54 | mnn_2 = mnn_2 + len_addition 55 | mnn_concat_1 = np.concatenate((mnn_1, mnn_2)) 56 | mnn_concat_2 = np.concatenate((mnn_2, mnn_1)) 57 | mnn_1_list.append(mnn_concat_1) 58 | mnn_2_list.append(mnn_concat_2) 59 | 60 | # Concatenate MNNs in mnn_1 and mnn_2 into one array and return 61 | mnn_1_concat = np.concatenate(mnn_1_list) 62 | mnn_2_concat = np.concatenate(mnn_2_list) 63 | return mnn_1_concat, mnn_2_concat 64 | 65 | 66 | def mutual_nn(data_1, data_2, k1, k2): 67 | """Given two datasets, gets and returns mutual nearest neighbors. 68 | 69 | Args: 70 | data_1 (array): Data array 1 that is used to create the graph representing 71 | dataset 1. Dataset 1 and 2 must have the same number of features. 72 | data_2 (array): Data array 2 that is used to create the graph representing 73 | dataset 2. Dataset 1 and 2 must have the same number of features. 74 | k1 (integer): Positive integer value indicating how many neighbors to consider 75 | in the mutual-nearest-neighbors algorithm for dataset 1. 76 | k2 (integer): Positive integer value indicating how many neighbors to consider 77 | in the mutual-nearest-neighbors algorithm for dataset 2. 78 | 79 | Returns: 80 | mutual_1_arr (array): Array of mutual-nearest neighbors corresponding to indices in dataset 1. 81 | mutual_2_arr (array): Array of mutual-nearest neighbors corresponding to indices in dataset 2.
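Example (illustrative sketch with random placeholder data):
    import numpy as np
    x = np.random.rand(200, 20)
    y = np.random.rand(150, 20)
    mnn_x, mnn_y = mutual_nn(x, y, k1 = 15, k2 = 15)
    # mnn_x[i] (an index into x) and mnn_y[i] (an index into y) form one MNN pair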
82 | """ 83 | 84 | data_1 = np.ascontiguousarray(data_1, dtype = np.float32) 85 | data_2 = np.ascontiguousarray(data_2, dtype = np.float32) 86 | 87 | index_1 = faiss.IndexFlatL2(data_1.shape[1]) 88 | index_2 = faiss.IndexFlatL2(data_2.shape[1]) 89 | 90 | index_1.add(data_1) 91 | index_2.add(data_2) 92 | 93 | d_index_1, k_index_1 = index_1.search(data_2, k1) 94 | d_index_2, k_index_2 = index_2.search(data_1, k2) 95 | 96 | mutual_1 = [] 97 | mutual_2 = [] 98 | for index_2 in range(data_2.shape[0]): 99 | for index_1 in k_index_1[index_2]: 100 | if index_2 in k_index_2[index_1]: 101 | mutual_1.append(index_1) 102 | mutual_2.append(index_2) 103 | mutual_1_arr = np.asarray(mutual_1) 104 | mutual_2_arr = np.asarray(mutual_2) 105 | return mutual_1_arr, mutual_2_arr 106 | 107 | def cross_data_knn(data_1, data_2, k): 108 | """Given two datasets, gets and returns KNN of dataset 1 in dataset 2. 109 | 110 | data_1 (array): Data array 1 that is used to create the graph representing 111 | dataset 1. Dataset 1 and 2 must have the same number of features. 112 | data_2 (array): Data array 2 that is used to create the graph representing 113 | dataset 2. Dataset 1 and 2 must have the same number of features. 114 | k (integer): Positive integer value indicating how many neighbors to consider 115 | in the cross-data knn lookup. 116 | 117 | Returns: 118 | knn_arr (array): Array of k-nearest neighbors corresponding to indices in dataset 1, 119 | with indices in dataset 2. 120 | """ 121 | data_1 = np.ascontiguousarray(data_1, dtype = np.float32) 122 | data_2 = np.ascontiguousarray(data_2, dtype = np.float32) 123 | 124 | index_2 = faiss.IndexFlatL2(data_2.shape[1]) 125 | index_2.add(data_2) 126 | d_index_2, k_index_2 = index_2.search(data_1, k) 127 | 128 | return k_index_2 129 | 130 | 131 | def find_knn(data_list, k = 15): 132 | """Gets k nearest-neighbors for all datasets. 133 | 134 | Using each data subset in data_list, gets k-nearest-neighbors for each subset and 135 | returns concatenated k-nearest-neighbors for each index across all datasets. 136 | 137 | Args: 138 | data_list (list): List of numpy arrays corresponding to data subsets. Datasets 139 | must have an equivalent number of features. 140 | k (integer): Positive integer value indicating how many neighbors to consider 141 | in the k-nearest-neighbors algorithm. Default value is 15. 142 | 143 | Returns: 144 | knn_concat (array): Array of values corresponding to query KNN indices for all 145 | datasets in data_list, indexed based on their concatenation. Will return 146 | k nearest-neighbors at query position for each index from input data.
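Example (illustrative sketch with random placeholder batches):
    import numpy as np
    batches = [np.random.rand(100, 20), np.random.rand(50, 20)]
    knn_concat = find_knn(batches, k = 15)
    knn_concat.shape  # (150, 15) - within-batch neighbors for each cell, indexed on the concatenation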
147 | """ 148 | # Get lengths of all datasets for reindexing 149 | data_lens = [len(dataset) for dataset in data_list] 150 | 151 | # Get all indices 152 | indices = [i for i in range(len(data_list))] 153 | 154 | # Get knn pairs for each dataset and reindex as necessary 155 | knn_list = [] 156 | for idx in indices: 157 | dataset = data_list[idx] 158 | dataset = np.ascontiguousarray(dataset, dtype = np.float32) 159 | index = faiss.IndexFlatL2(dataset.shape[1]) 160 | index.add(dataset) 161 | knn_vals, knn = index.search(dataset, k) 162 | if idx == 0: 163 | knn_corrected = knn 164 | knn_list.append(knn_corrected) 165 | else: 166 | knn_corrected = [] 167 | len_addition = sum(data_lens[0:idx]) 168 | for i in range(len(knn)): 169 | knn_corrected.append(knn[i] + len_addition) 170 | knn_corrected_arr = np.asarray(knn_corrected) 171 | knn_list.append(knn_corrected_arr) 172 | 173 | # Concatenate all KNNs corresponding to all dataset indices and return 174 | knn_concat = np.concatenate(knn_list) 175 | return knn_concat -------------------------------------------------------------------------------- /workflow/src/python/utils/relatedness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.spatial as sp 4 | import scanpy as sc 5 | 6 | def relatedness_score(adata, pca_performed = True): 7 | """Computes the relatedeness between celltypes using cosine distance in PCA space 8 | 9 | Args: 10 | adata (AnnData): AnnData object containing relevant count information with celltype 11 | and batch observations. 12 | pca_performed (bool): True or False value indicating whether PCA decomposition steps 13 | have been performed already for AnnData object. Default is True. 14 | """ 15 | # Perform PCA if not already performed 16 | if pca_performed is False: 17 | sc.pp.normalize_total(adata, target_sum=1e4) 18 | sc.pp.log1p(adata) 19 | sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500) 20 | sc.pp.pca(adata) 21 | 22 | # Get the batch and celltype information from AnnData object 23 | batch_vals = np.unique(adata.obs["batch"].__array__()) 24 | if len(batch_vals) > 1: 25 | raise ValueError("More than one batch found in AnnData object") 26 | batch = batch_vals[0] 27 | celltypes = np.unique(adata.obs["celltype"].__array__()) 28 | 29 | # Utilize the cosine distance between the average PCA embedding for celltype i and 30 | # average PCA embedding for celltype j 31 | pca_top_20 = adata.obsm["X_pca"][:, 0:20] 32 | top_20_pc_weights = adata.uns["pca"]["variance_ratio"][0:20] 33 | celltype_is = [] 34 | celltype_js = [] 35 | pca_cosine_dists = [] 36 | for celltype_i in celltypes: 37 | for celltype_j in celltypes: 38 | celltype_is.append(celltype_i) 39 | celltype_js.append(celltype_j) 40 | pca_celltype_i = pca_top_20[ 41 | adata.obs.celltype == celltype_i 42 | ] 43 | pca_celltype_j = pca_top_20[ 44 | adata.obs.celltype == celltype_j 45 | ] 46 | pca_celltype_i_avg = np.sum(pca_celltype_i, axis = 0)/len(pca_celltype_i) 47 | pca_celltype_j_avg = np.sum(pca_celltype_j, axis = 0)/len(pca_celltype_j) 48 | pca_cosine_dist = sp.distance.cosine( 49 | pca_celltype_i_avg, 50 | pca_celltype_j_avg, 51 | w = top_20_pc_weights 52 | ) 53 | pca_cosine_dists.append(pca_cosine_dist) 54 | 55 | # Gather the cosine distance results in a dataframe and return 56 | dist_results_df = pd.DataFrame({ 57 | "Celltype 1": celltype_is, 58 | "Celltype 2": celltype_js, 59 | "PCA cosine dist": pca_cosine_dists, 60 | "Batch": batch 61 | }) 62 | return 
dist_results_df -------------------------------------------------------------------------------- /workflow/src/python/utils/sample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scanpy as sc 3 | import anndata as ann 4 | 5 | def downsample(adata, num_celltypes = None, celltype_names = None, proportion = 0.5): 6 | # Initialize random number generator 7 | rng = np.random.default_rng() 8 | 9 | # For the given number of celltypes, select num_celltypes 10 | # randomly, unless non random indicated by celltype_names 11 | if celltype_names is not None: 12 | celltypes_sample = celltype_names 13 | else: 14 | if num_celltypes is None: 15 | raise ValueError( 16 | "num_celltypes and celltype_names cannot both be None" 17 | ) 18 | if num_celltypes == 0: 19 | celltypes_sample = "None" 20 | return adata, celltypes_sample 21 | unique_celltypes = np.unique(adata.obs["celltype"].__array__()) 22 | rng.shuffle(unique_celltypes) 23 | celltypes_sample = rng.choice(unique_celltypes, num_celltypes, replace = False) 24 | 25 | # Save the original batch label for later 26 | adata.obs["batch_orig"] = adata.obs["batch"] 27 | 28 | # Downsample selected celltypes by given proportion 29 | for celltype in celltypes_sample: 30 | adata_celltype = adata[adata.obs["celltype"] == celltype] 31 | adata_noncelltype = adata[adata.obs["celltype"] != celltype] 32 | if proportion == 0: 33 | adata = adata_noncelltype 34 | continue 35 | adata_celltype_ds = sc.pp.subsample( 36 | adata_celltype, 37 | fraction = proportion, 38 | random_state = None, 39 | copy = True 40 | ) 41 | adata = ann.AnnData.concatenate(adata_noncelltype, adata_celltype_ds) 42 | 43 | # Replace batch column with batch original and drop batch_orig 44 | adata.obs["batch"] = adata.obs["batch_orig"] 45 | adata.obs.drop("batch_orig", axis = 1, inplace = True) 46 | 47 | # Return downsampled data + sampled celltypes 48 | return adata, celltypes_sample -------------------------------------------------------------------------------- /workflow/src/python/utils/seurat_integrate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import uuid 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy as sp 9 | import anndata as ann 10 | import scanpy as sc 11 | 12 | class SeuratIntegrate: 13 | """ 14 | Class for interpolating between the Integration class and R-script 15 | based integration of RNA-seq batches using the Seurat4.0 package. 16 | Integration is done on data output to a temporary file from the 17 | Integration class through and RScript, which then outputs a temporary 18 | file reread into python code and used to substitute the unintegrated data. 19 | Uses the first 20 components of CCA/RPCA space to integrate and get correction 20 | vectors for correcting whole data matrix. 21 | """ 22 | def __init__(self, adata, int_type = "CCA"): 23 | """ 24 | Args: 25 | adata (object): An instance of an anndata class corresponding to the seurat 26 | subset from the Integration class. 27 | int_type (string): Either "CCA" or "RPCA", indicating which Seurat workflow to 28 | utilize for integration - canonical correlation analysis and reciprocal PCA, 29 | respectively. RPCA should be used for larger datasets to avoid out-of-memory 30 | exceptions. 
Details on both workflows can be found at: 31 | https://satijalab.org/seurat/articles/integration_introduction.html 32 | https://satijalab.org/seurat/articles/integration_rpca.html 33 | """ 34 | self.adata = adata.copy() 35 | self.int_type = int_type 36 | 37 | def _format(self): 38 | # Append a column on gene names 39 | self.adata.var["gene"] = self.adata.var_names 40 | # Remove layers and raw from AnnData object (avoid conflicts with h5seurat) 41 | self.adata.layers = None 42 | self.adata.raw = None 43 | 44 | def _output_temp_h5ad(self): 45 | # Check if temp exists, if not, make dir 46 | if not os.path.exists("tmp"): 47 | os.makedirs("tmp") 48 | 49 | # Output temporary file with data 50 | self.filename = ''.join(str(uuid.uuid4()).split("-")) 51 | self.file = "{filename}.h5ad".format(filename = self.filename) 52 | self.adata.write_h5ad(os.path.join("tmp", self.file)) 53 | 54 | def _seurat_integrate(self): 55 | # Call subprocess and call R script 56 | tempfile_script = \ 57 | "Rscript src/R/seurat_integrate.R tmp/{tempfile} {tempfile_name} {int_type} --verbose".format( 58 | tempfile = self.file, 59 | tempfile_name = self.filename, 60 | int_type = self.int_type 61 | ) 62 | 63 | self.sp_integrate = subprocess.run(tempfile_script, shell = True, text = True, capture_output = True) 64 | if self.sp_integrate.returncode != 0: 65 | raise Exception( 66 | "Subprocess call returned nonzero exit code - call: {call} \n Output: {output}".format( 67 | call = self.sp_integrate.stderr, 68 | output = self.sp_integrate.stdout 69 | ) 70 | ) 71 | 72 | def _return_integrated(self): 73 | # Get seurat output file 74 | self.seur_outfile = "{filename}_seur_out.h5ad".format(filename = self.filename) 75 | 76 | # Read in as AnnData object 77 | integrated_adata = sc.read_h5ad(filename = os.path.join("tmp", self.seur_outfile)) 78 | 79 | # Reappend original obs columns 80 | integrated_adata.obs = self.adata.obs 81 | 82 | # Return integrated AnnData object 83 | return integrated_adata 84 | 85 | def _clean_files(self): 86 | # Remove temporary python and seurat files 87 | tmp_files = os.listdir("tmp") 88 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 89 | for f in tmp_files_instance: 90 | os.remove(os.path.join("tmp", f)) 91 | 92 | # Check if all files related to the filename are removed from folder 93 | tmp_files = os.listdir("tmp") 94 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 95 | if len(tmp_files_instance) > 0: 96 | raise Exception( 97 | "Temporary file cleanup incomplete - files remain in folder" 98 | ) 99 | 100 | def integrate(self): 101 | # Perform workflow and return integrated anndata object 102 | self._format() 103 | self._output_temp_h5ad() 104 | self._seurat_integrate() 105 | integrated_adata = self._return_integrated() 106 | self._clean_files() 107 | 108 | return integrated_adata -------------------------------------------------------------------------------- /workflow/src/python/utils/seurat_reference_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import uuid 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy as sp 9 | import anndata as ann 10 | import scanpy as sc 11 | 12 | class SeuratReferenceMap: 13 | """ 14 | Class for reference to query mapping through integration of RNA-seq batches 15 | using the Seurat4.0 package. 
Integration is done on data output to a temporary 16 | file a downsampled and integrated result through an RScript, which then outputs the 17 | reference mapped anndata (h5ad) file to be used for later downstream testing and 18 | analysis. The reference mapping workflow follows that of: 19 | https://satijalab.org/seurat/articles/multimodal_reference_mapping.html. 20 | """ 21 | def __init__(self, integrated_data_h5, reference_h5, mapped_h5): 22 | """ 23 | Args: 24 | integrated_data_h5 (str): Path to the anndata file for the integrated result after 25 | downsampling and integration. 26 | reference_h5 (str): Path to the reference h5Seurat file that contains the data 27 | to be used in query to reference mapping and annotation. 28 | mapped_h5 (str): Path to the h5ad output file from seurat that contains the 29 | mapped and annotated data. 30 | """ 31 | self.integrated_data_h5 = integrated_data_h5 32 | self.reference_h5 = reference_h5 33 | self.mapped_h5 = mapped_h5 34 | 35 | def _load(self): 36 | # Load the integrated data and subset for the seurat results 37 | self.adata = sc.read_h5ad(self.integrated_data_h5, as_sparse = "raw/X") 38 | self.adata = self.adata[self.adata.obs.integration_method == "seurat"].copy() 39 | self.adata.obs.index = range(len(self.adata.obs)) # Reset index 40 | 41 | def _format(self): 42 | # Substitute in raw counts for X, remove unecessary obs, obsm, and uns info 43 | self.adata.X = self.adata.layers["raw"] 44 | self.adata.obs = self.adata.obs.drop(columns = [ 45 | "leiden", 46 | "integration_method", 47 | "kmeans_faiss" 48 | ]) 49 | self.adata.obsm = None 50 | del self.adata.uns # Uns doesn't support None for resetting 51 | 52 | # Append a column on gene names 53 | self.adata.var["gene"] = self.adata.var_names 54 | 55 | # Remove layers and raw from AnnData object (avoid conflicts with h5seurat) 56 | if self.adata.layers["raw"] is not None: 57 | del self.adata.layers["raw"] 58 | self.adata.raw = None 59 | 60 | # Strip mapped h5 of extension - keep only name for internal seurat h5 conversions 61 | self.mapped_h5_name = os.path.splitext(self.mapped_h5)[0] 62 | 63 | def _output_temp_h5ad(self): 64 | # Check if temp exists, if not, make dir 65 | if not os.path.exists("tmp"): 66 | os.makedirs("tmp") 67 | 68 | # Output temporary file with data 69 | self.filename = ''.join(str(uuid.uuid4()).split("-")) 70 | self.file = "{filename}.h5ad".format(filename = self.filename) 71 | self.adata.write_h5ad(os.path.join("tmp", self.file)) 72 | 73 | def _seurat_refmap(self): 74 | # Call subprocess and call R script 75 | refmap_script = \ 76 | "Rscript src/R/seurat_reference_map.R {ref_h5} tmp/{tempfile} {tempname} {out_name} --verbose".format( 77 | ref_h5 = self.reference_h5, 78 | tempfile = self.file, 79 | tempname = self.filename, 80 | out_name = self.mapped_h5_name 81 | ) 82 | 83 | self.sp_refmap = subprocess.run(refmap_script, shell = True, text = True, capture_output = True) 84 | if self.sp_refmap.returncode != 0: 85 | raise Exception( 86 | "Subprocess call returned nonzero exit code - call: {call} \n Output: {output}".format( 87 | call = self.sp_refmap.stderr, 88 | output = self.sp_refmap.stdout 89 | ) 90 | ) 91 | 92 | def _clean_files(self): 93 | # Remove temporary python and seurat files 94 | tmp_files = os.listdir("tmp") 95 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 96 | for f in tmp_files_instance: 97 | os.remove(os.path.join("tmp", f)) 98 | 99 | # Check if all files related to the filename are removed from folder 100 | tmp_files = os.listdir("tmp") 101 
| tmp_files_instance = [f for f in tmp_files if self.filename in f] 102 | if len(tmp_files_instance) > 0: 103 | raise Exception( 104 | "Temporary file cleanup incomplete - files remain in folder" 105 | ) 106 | 107 | def refmap(self): 108 | # Perform workflow and return reference mapped anndata object 109 | self._load() 110 | self._format() 111 | self._output_temp_h5ad() 112 | self._seurat_refmap() 113 | self._clean_files() -------------------------------------------------------------------------------- /workflow/src/python/utils/umap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import colorcet as cc 5 | from natsort import natsorted 6 | 7 | 8 | class Umap: 9 | """Class for plotting results of integration experiments""" 10 | 11 | def __init__(self, coords, clustering, subset_name = None): 12 | """ 13 | Args: 14 | coords (dictionary): coordinates of umap in numpy format where 15 | keys correspond to following integration methods - 16 | 'bbknn', 'harmony', 'scanorama', 'seurat', and 'scvi'. 17 | clustering (dictionary): leiden or celltype clustering in numpy 18 | format of integrated where keys correspond to following 19 | integration methods - 'bbknn', 'harmony', 'scanorama', 20 | 'seurat', and 'scvi'. 21 | subset_name (string): name of subset being utilized for clustering 22 | comparisons (e.g. batch, celltype). 23 | """ 24 | self.clustering_harmony = clustering.get("harmony") 25 | self.clustering_scvi = clustering.get("scvi") 26 | self.clustering_bbknn = clustering.get("bbknn") 27 | self.clustering_scanorama = clustering.get("scanorama") 28 | self.clustering_seurat = clustering.get("seurat") 29 | 30 | self.umap_harmony = coords.get("harmony") 31 | self.umap_scvi = coords.get("scvi") 32 | self.umap_bbknn = coords.get("bbknn") 33 | self.umap_scanorama = coords.get("scanorama") 34 | self.umap_seurat = coords.get("seurat") 35 | 36 | if subset_name is not None: 37 | self.subset_name = subset_name 38 | else: 39 | self.subset_name = "Subset" 40 | 41 | sns.set_style("ticks") 42 | 43 | def df_get(self, subset, clustering, coords, category = None): 44 | df = pd.DataFrame({ 45 | "Subset" : np.repeat(subset, len(clustering)), 46 | "UMAP 1" : coords[:, 0], 47 | "UMAP 2" : coords[:, 1] 48 | }) 49 | subsets = natsorted(np.unique(subset)) 50 | df["Subset"] = pd.Categorical( 51 | np.repeat(subset, len(clustering)), categories=subsets, ordered=True 52 | ) 53 | df["Clustering"] = pd.Categorical( 54 | clustering, categories=category, ordered=True 55 | ) 56 | return df 57 | 58 | def umap_df(self): 59 | subset_list = [ 60 | "bbknn", 61 | "harmony", 62 | "scanorama", 63 | "scvi", 64 | "seurat" 65 | ] 66 | clustering_list = [ 67 | self.clustering_bbknn, 68 | self.clustering_harmony, 69 | self.clustering_scanorama, 70 | self.clustering_scvi, 71 | self.clustering_seurat 72 | ] 73 | clustering_unique = natsorted(np.unique(np.concatenate(clustering_list))) 74 | coords_list = [ 75 | self.umap_bbknn, 76 | self.umap_harmony, 77 | self.umap_scanorama, 78 | self.umap_scvi, 79 | self.umap_seurat 80 | ] 81 | umap_dfs = [ 82 | self.df_get(i, j, k, category = clustering_unique) for i, j, k in zip( 83 | subset_list, 84 | clustering_list, 85 | coords_list 86 | ) 87 | ] 88 | self.umap_concat = pd.concat(umap_dfs) 89 | 90 | def umap_plot(self, show_plot = False): 91 | self.umap_df() 92 | palette = cc.glasbey_bw[0:len(np.unique(self.umap_concat["Clustering"]))] 93 | self.umap_plt = sns.FacetGrid( 94 | 
self.umap_concat, 95 | col = "Subset", 96 | col_wrap = 3, 97 | hue = "Clustering", 98 | palette = palette 99 | ) 100 | self.umap_plt.map( 101 | sns.scatterplot, 102 | "UMAP 1", 103 | "UMAP 2", 104 | s = 5, 105 | alpha = 0.5 106 | ) 107 | self.umap_plt.add_legend(markerscale = 3, title = self.subset_name) 108 | if show_plot is True: 109 | return self.umap_plt 110 | 111 | def save_umap(self, save_dir, dpi = 300): 112 | try: 113 | self.umap_plt.savefig( 114 | save_dir, 115 | dpi = dpi 116 | ) 117 | except AttributeError: # Plot has not been created yet - build it first, then save 118 | self.umap_plot() 119 | self.umap_plt.savefig( 120 | save_dir, 121 | dpi = dpi 122 | ) --------------------------------------------------------------------------------
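A minimal usage sketch for the Umap plotting class above (illustrative only - the adatas dictionary, output path, and column choices are hypothetical placeholders for per-method integrated AnnData objects):

    methods = ["bbknn", "harmony", "scanorama", "scvi", "seurat"]
    coords = {m: adatas[m].obsm["X_umap"] for m in methods}
    clustering = {m: adatas[m].obs["leiden"].to_numpy() for m in methods}
    umap_plot = Umap(coords, clustering, subset_name = "Leiden cluster")
    umap_plot.save_umap("umap_grid.png", dpi = 300)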