├── .gitattributes ├── .gitignore ├── LICENSE.md ├── README.md ├── docs ├── README.md ├── assets │ └── Figure_8_Revised.png ├── envs │ └── tutorial.yaml ├── guidelines.Rmd ├── guidelines.html └── guidelines.pdf └── workflow ├── Snakefile ├── analysis ├── R │ ├── 05_Iniq_Control_Fig_2_Analysis_Plots.R │ ├── 06_Iniq_Control_Fig_2_Analysis_Stat_Tests.R │ ├── 07_Iniq_Control_Fig_3_Analysis_Plots.R │ ├── 08_Iniq_Control_Fig_3_Analysis_Stat_Tests.R │ ├── 09_Iniq_Control_Fig_3_Analysis_Stats_Multinomial_Low_Rep.R │ ├── 10B_Iniq_Real_Datasets_Fig_4_Stat_Tests.R │ ├── 10_Iniq_Real_Datasets_Fig_4_Analysis_Plots.R │ ├── 12B_Iniq_PDAC_Fig_6_Stat_Tests.R │ ├── 12_Fig_6_PDAC_Analysis_Plots.R │ ├── 13_Iniq_Balanced_Metrics_Results_Fig_7_Analysis_Plots.R │ ├── 19_Supplementary_TI_Analysis.R │ ├── 20_Fig_6_PDAC_Reanno_Analysis_Plots.R │ ├── 21_Fig_6_PDAC_Reanno_Stats_Tests.R │ ├── 21_PBMC_perturbation_umap_plots.R │ └── knn_example.R └── python │ └── 01_Fig_7_Imbal_Metric_Analysis.ipynb ├── configs ├── config.json ├── config_control.json ├── config_control_ti_only.json ├── config_custom.json ├── config_lowcap.json ├── config_lowcap_control_like.json ├── config_lowcap_modified.json ├── config_lowcap_modified_pdac_reanno.json ├── config_pdac_comp.json ├── config_pdac_comp_reanno.json ├── config_umap.json ├── test_config.json └── test_config_lite.json ├── envs ├── analysis.yaml └── integrate.yaml ├── preprocessing └── pdac │ ├── Snakefile │ └── preprocess_env.yaml ├── scripts └── python │ ├── annotation_results.py │ ├── annotation_scoring.py │ ├── celltype_imbalance_summary.py │ ├── clustering_concordance.py │ ├── clustering_stats.py │ ├── dge_concordance_full.py │ ├── dge_concordance_stats.py │ ├── dge_ranking_concordance.py │ ├── dge_ranking_marker_subset.py │ ├── dge_ranking_per_cluster.py │ ├── downsample_summary.py │ ├── imbalance_summary.py │ ├── integrate_data.py │ ├── integrate_data_paga.py │ ├── knn_classification.py │ ├── marker_get.py │ ├── reference_annotation.py │ ├── reference_control_annotation.py │ ├── relatedness_metric.py │ ├── ti_concordance.py │ └── umap_plots.py └── src ├── R ├── liger_integrate.R ├── seurat_integrate.R └── seurat_reference_map.R └── python ├── __init__.py ├── imbalanced_clustering ├── __init__.py ├── ami.py ├── ari.py ├── utils │ ├── __init__.py │ ├── _emi_cython.pyx │ ├── avg.py │ ├── checks.py │ ├── contingency.py │ └── mi.py └── vmeasure.py ├── loaders └── rna_data.py └── utils ├── __init__.py ├── cluster_concordance.py ├── clustering.py ├── diffexp.py ├── integrate.py ├── integrate_ti.py ├── kmeans.py ├── liger_integrate.py ├── mnn.py ├── relatedness.py ├── sample.py ├── seurat_integrate.py ├── seurat_reference_mapping.py └── umap.py /.gitattributes: -------------------------------------------------------------------------------- 1 | workflow/analysis/** linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.jpeg 3 | *.pdf 4 | *.html 5 | *.svg 6 | *.tsv 7 | *.csv 8 | *.log 9 | *.out 10 | *.snakemake 11 | *.slurm* 12 | *.slrm* 13 | *.ipynb_checkpoints 14 | __pycache__ 15 | *.Rhistory 16 | *.RData 17 | *.tar.gz 18 | *.h5ad 19 | cluster.json 20 | submit.slrm 21 | submit.sh 22 | workflow/tmp 23 | resources/h5ad_files 24 | resources/test_h5ad_objs 25 | resources/references 26 | results 27 | outs 28 | logs 29 | data 30 | run_scripts.sh 31 | run_rscripts.sh 32 | *scratch* 
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## Imbalanced integration guidelines 2 | 3 | ![Example Image](assets/Figure_8_Revised.png) 4 | 5 | This folder contains an rmarkdown example for using the guidelines shown in the [Imbalanced integration manuscript](https://www.biorxiv.org/content/10.1101/2022.10.06.511156v3). The tutorial is available as a rendered rmarkdown html document (guidelines.html), which can be downloaded and viewed with a web browser. 6 | 7 | The full details of the imbalanced integration guidelines can be found in the manuscript's Results section, "Guidelines for imbalanced single-cell data integration", and in the associated Supplementary Table 2. 8 | 9 | ### Viewing the rendered markdown 10 | 11 | #### Through the html 12 | 13 | 1. Download the `guidelines.html` file 14 | 15 | 2. Open the file with a web browser (Chrome, Safari, Firefox, etc.) 16 | 17 | #### Through the pdf 18 | 19 | 1. Download the `guidelines.pdf` file and view it with a PDF viewer 20 | 21 | Please note that the pdf rendering of the code may not be well formatted, as the file was originally rendered as an html vignette 22 | 23 | ### Running the tutorial 24 | 25 | 1. Create the conda environment for the guidelines: 26 | ``` 27 | conda env create -f envs/tutorial.yaml 28 | ``` 29 | 30 | 2. Activate the conda environment: 31 | ``` 32 | conda activate iniq_guidelines 33 | ``` 34 | 35 | 3. Start R and install rmarkdown and tinytex: 36 | ``` 37 | R 38 | ``` 39 | ``` 40 | install.packages("rmarkdown", dep = TRUE) 41 | install.packages("tinytex") 42 | tinytex::install_tinytex() 43 | ``` 44 | 45 | 4. Run the rmarkdown (`guidelines.Rmd`) chunk by chunk in [RStudio](https://posit.co/downloads/) 46 | *** 47 | -------------------------------------------------------------------------------- /docs/assets/Figure_8_Revised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsmaan/Iniquitate/cb20fe1240be6cb03dd63f1151816ec9d3b70a84/docs/assets/Figure_8_Revised.png -------------------------------------------------------------------------------- /docs/envs/tutorial.yaml: -------------------------------------------------------------------------------- 1 | name: iniq_guidelines 2 | channels: 3 | - pytorch 4 | - bioconda 5 | - r 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - python>=3.7,<=3.10 10 | - scipy>=1.5.0 11 | - leidenalg>=0.8.0 12 | - umap-learn>=0.5.0 13 | - scanpy=1.8.2 14 | - anndata>=0.7.5 15 | - r-base>=4.0.5 16 | - r-seurat>=4.0.5 17 | - r-data.table>=1.14.0 18 | - r-ggplot2>=3.3.0 19 | - r-tidyverse>=1.2.1 20 | - r-reshape2>=1.4.3 21 | - r-ggthemes>=4.2.0 22 | - r-ggextra>=0.8.0 23 | - r-dotwhisker>=0.7.4 24 | - r-deldir>=1.0.2 25 | - r-ggpubr>=0.4.0 26 | - r-cowplot>=1.1.1 27 | - r-ggrepel>=0.9.1 28 | - r-rcolorbrewer>=1.1 29 | - r-ggbump>=0.1.0 30 | - bioconductor-complexheatmap<=2.9.0 31 | - r-venndiagram>=1.7.1 32 | - r-multipanelfigure>=2.1.2 33 | - r-gridextra>=2.3 34 | - r-cairo>=1.5 35 | - r-lemon>=0.4.5 36 | - r-networkd3>=0.4 37 | - r-emt>=1.2 38 | - r-rmarkdown>=2.14 39 | - bioconductor-singlecellexperiment>=1.12.0 40 | - bioconductor-scater>=1.18.0 41 | - bioconductor-batchelor>=1.6.0 42 | - bioconductor-bluster=1.4.0 43 | - r-pheatmap>=1.0.12 44 | - r-devtools 45 | - r-harmony 46 | - pip 47 | - pip: 48 | - balanced-clustering==0.1.0 --------------------------------------------------------------------------------
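As an alternative to stepping through the tutorial chunks interactively, the environment above can be created and the vignette rendered from the command line. This is a minimal sketch rather than part of the repository's documented workflow; it assumes the commands are run from the `docs/` folder and that the `iniq_guidelines` environment (which already pins `r-rmarkdown`) resolves correctly:

```
# Create and activate the tutorial environment (the name comes from envs/tutorial.yaml)
conda env create -f envs/tutorial.yaml
conda activate iniq_guidelines

# Render the vignette to HTML without opening RStudio
Rscript -e 'rmarkdown::render("guidelines.Rmd", output_format = "html_document")'
```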
/docs/guidelines.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsmaan/Iniquitate/cb20fe1240be6cb03dd63f1151816ec9d3b70a84/docs/guidelines.pdf -------------------------------------------------------------------------------- /workflow/analysis/R/12B_Iniq_PDAC_Fig_6_Stat_Tests.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(tidyverse) 3 | library(reshape2) 4 | library(ggplot2) 5 | library(ggthemes) 6 | library(ggExtra) 7 | library(ggpubr) 8 | library(dotwhisker) 9 | library(Seurat) 10 | library(SeuratDisk) 11 | library(ComplexHeatmap) 12 | library(circlize) 13 | library(RColorBrewer) 14 | library(Cairo) 15 | library(networkD3) 16 | 17 | ### Note that this analysis done without LIGER ### 18 | `%ni%` <- Negate(`%in%`) 19 | kev_palette <- c( 20 | "dodgerblue2", "#E31A1C", 21 | "green4", 22 | "#6A3D9A", 23 | "#FF7F00", 24 | "black", "gold1", 25 | "skyblue2", "#FB9A99", 26 | "palegreen2", 27 | "#CAB2D6", 28 | "#FDBF6F", 29 | "gray70", "khaki2", 30 | "maroon", "orchid1", "deeppink1", "blue1", "steelblue4", 31 | "darkturquoise", "green1", "yellow4", "yellow3", 32 | "darkorange4", "brown" 33 | ) 34 | 35 | # Load in and concatenate imbalance summary files 36 | setwd("../../../results/pdac_comp/imbalance_summaries/") 37 | imba_files <- list.files() 38 | imba_loaded <- lapply(imba_files, fread) 39 | imba_concat <- Reduce(rbind, imba_loaded) 40 | gc() 41 | 42 | # Load in and concatenate celltype summary files 43 | setwd("../celltype_imbalance_summaries") 44 | cimba_files <- list.files() 45 | cimba_loaded <- lapply(cimba_files, fread) 46 | cimba_concat <- Reduce(rbind, cimba_loaded) 47 | gc() 48 | 49 | # Load in and concatenate the clustering summary results 50 | setwd("../clustering_summaries/") 51 | clus_files <- list.files() 52 | clus_loaded <- lapply(clus_files, fread) 53 | clus_concat <- Reduce(rbind, clus_loaded) 54 | clus_concat <- clus_concat[clus_concat$Method != "liger"] 55 | gc() 56 | 57 | # Load in and concatenate clustering concordance summaries 58 | setwd("../clustering_concord_summaries/") 59 | clus_concord_files <- list.files() 60 | clus_concord_loaded <- lapply(clus_concord_files, fread) 61 | clus_concord_concat <- Reduce(rbind, clus_concord_loaded) 62 | clus_concord_concat <- clus_concord_concat[ 63 | clus_concord_concat$`Method 1` != "liger" 64 | ] 65 | clus_concord_concat <- clus_concord_concat[ 66 | clus_concord_concat$`Method 2` != "liger" 67 | ] 68 | gc() 69 | 70 | # Load in and concatenate dge concordance summaries 71 | setwd("../dge_concord_stats/") 72 | dge_files <- list.files() 73 | dge_loaded <- lapply(dge_files, fread) 74 | dge_concat <- Reduce(rbind, dge_loaded) 75 | dge_concat <- dge_concat[dge_concat$`Method 1` != "liger"] 76 | dge_concat <- dge_concat[dge_concat$`Method 2` != "liger"] 77 | gc() 78 | 79 | # Load in and concatenate dge ranking summaries, subset by marker genes 80 | setwd("../dge_ranking_stats_marker_sub") 81 | dge_rank_files <- list.files() 82 | dge_rank_loaded <- lapply(dge_rank_files, fread) 83 | dge_rank_concat <- Reduce(rbind, dge_rank_loaded) 84 | dge_rank_concat <- dge_rank_concat[dge_rank_concat$Method != "liger"] 85 | gc() 86 | 87 | # Load in markers and corresponding celltypes 88 | setwd("../marker_results/") 89 | base_marker_genes <- fread( 90 | "peng_pdac_tumor_annot_8_batch_preintegration_marker_selection.tsv" 91 | ) 92 | 93 | # Load in and concatenate knn classification summaries 94 | 
setwd("../knn_classification_reports/") 95 | knn_files <- list.files() 96 | knn_loaded <- lapply(knn_files, fread) 97 | knn_concat <- Reduce(rbind, knn_loaded) 98 | knn_concat <- knn_concat[knn_concat$Method != "liger"] 99 | gc() 100 | 101 | # Change to top level dir 102 | setwd("../../..") 103 | 104 | # Make pdac comp output dir if it doesn't exist 105 | if (!dir.exists("outs/pdac_comp/figures")) { 106 | dir.create("outs/pdac_comp/figures", recursive = TRUE) 107 | } 108 | if (!dir.exists("outs/pdac_comp/results")) { 109 | dir.create("outs/pdac_comp/results", recursive = TRUE) 110 | } 111 | 112 | ### Statistical test - downsampling results on KNN classification scores 113 | ### of given subsets/compartments 114 | imba_knn_merged <- merge( 115 | imba_concat, 116 | knn_concat, 117 | by = c( 118 | "Number of batches downsampled", 119 | "Number of celltypes downsampled", 120 | "Proportion downsampled", 121 | "Replicate" 122 | ) 123 | ) 124 | imba_knn_merged <- distinct(imba_knn_merged) 125 | 126 | # Subset for only cases where the celltype/compartment downsampled is equal to 127 | # the celltype being classified 128 | imba_knn_merged_celltype <- imba_knn_merged[ 129 | imba_knn_merged$Celltype == imba_knn_merged$`Downsampled celltypes` | 130 | imba_knn_merged$`Downsampled celltypes` %in% c("None") 131 | ] 132 | 133 | # Indicate which panels are control and which ones are ablations or downsampling 134 | imba_knn_merged_celltype$type <- ifelse( 135 | imba_knn_merged_celltype$`Number of batches downsampled` == 0, 136 | "Control", 137 | ifelse( 138 | imba_knn_merged_celltype$`Proportion downsampled` == 0, 139 | "Ablated", 140 | "Downsampled" 141 | ) 142 | ) 143 | 144 | # Indicate the separate compartments 145 | compartments <- unique(imba_knn_merged_celltype$Celltype) 146 | 147 | # Create a function to do an ANOVA test for the F1 score based on each 148 | # compartment utilized 149 | anova_compart_knn <- function( 150 | compartment, 151 | dataset 152 | ){ 153 | # Subset data for the given compartment 154 | dataset_sub <- dataset[dataset$Celltype == compartment] 155 | 156 | # Format the data columns for lm 157 | colnames(dataset_sub) <- plyr::mapvalues( 158 | colnames(dataset_sub), 159 | from = c( 160 | "F1-score", 161 | "Method", 162 | "type" 163 | ), 164 | to = c( 165 | "f1_score", 166 | "method", 167 | "type" 168 | ) 169 | ) 170 | 171 | # Fit ANOVA model 172 | model_fit <- lm( 173 | as.formula( 174 | paste0( 175 | "f1_score", 176 | "~", 177 | "method+", 178 | "type" 179 | ) 180 | ), 181 | data = dataset_sub 182 | ) 183 | anova_result <- anova(model_fit, test = "F") 184 | 185 | # Format results and return 186 | anova_result_dt <- as.data.table(anova_result, keep.rownames = TRUE) 187 | colnames(anova_result_dt)[1] <- "Covariate" 188 | anova_result_dt$compartment_name <- compartment 189 | anova_result_dt$metric <- "F1 score" 190 | anova_result_dt$last_covariate <- "type" 191 | return(anova_result_dt) 192 | } 193 | 194 | # Iterate over compartments and get the significance of ds/ablation 195 | knn_anova_comp_results <- mapply( 196 | anova_compart_knn, 197 | compartment = compartments, 198 | MoreArgs = list( 199 | dataset = imba_knn_merged_celltype 200 | ), 201 | SIMPLIFY = FALSE 202 | ) 203 | knn_anova_comp_results 204 | 205 | # Save the concatenated results and plot the ANOVA F-values for 206 | # a supplementary figure 207 | knn_anova_comp_results_concat <- Reduce(rbind, knn_anova_comp_results) 208 | fwrite( 209 | knn_anova_comp_results_concat, 210 | 
"outs/pdac_comp/results/12B_comp_specific_ds_knn_f1_score_anovas.tsv", 211 | sep = "\t", 212 | quote = FALSE, 213 | row.names = FALSE, 214 | col.names = TRUE 215 | ) 216 | 217 | knn_anova_comp_results_concat_nores <- knn_anova_comp_results_concat[ 218 | knn_anova_comp_results_concat$Covariate != "Residuals" 219 | ] 220 | 221 | f_vals <- knn_anova_comp_results_concat_nores$`F value` 222 | covars <- knn_anova_comp_results_concat_nores$Covariate 223 | comps <- knn_anova_comp_results_concat_nores$compartment_name 224 | 225 | knn_aov_comp_df <- data.frame( 226 | "Covariates" = covars, 227 | "F_values" = f_vals, 228 | "Compartment" = comps 229 | ) 230 | 231 | knn_aov_comp_df_melted <- reshape2::melt( 232 | knn_aov_comp_df, 233 | id.vars = c("Compartment", "Covariates"), 234 | measure.vars = "F_values" 235 | ) 236 | knn_aov_comp_df_melted$Covariates <- plyr::mapvalues( 237 | knn_aov_comp_df_melted$Covariates, 238 | from = c( 239 | "type", 240 | "method" 241 | ), 242 | to = c( 243 | "Unperturbed vs perturbed", 244 | "Integration method" 245 | ) 246 | ) 247 | 248 | ggplot(data = knn_aov_comp_df_melted, aes(Covariates, value)) + 249 | geom_bar( 250 | stat = "identity", 251 | position = position_dodge2(), 252 | aes( 253 | fill = Compartment 254 | ) 255 | ) + 256 | scale_fill_brewer(palette = "Set1") + 257 | theme_classic() + 258 | coord_flip () + 259 | labs(x = "Covariate", y = "ANOVA F-statistic") + 260 | theme(axis.title.x = element_text(size = 16)) + 261 | theme(axis.title.y = element_text(size = 16)) + 262 | theme(strip.text.x = element_text(size = 16)) + 263 | theme(strip.text.y = element_text(size = 16)) + 264 | theme(plot.title = element_text(size = 14)) + 265 | theme(axis.text.x = element_text(size = 16)) + 266 | theme(axis.text.y = element_text(size = 16)) + 267 | theme(legend.title = element_text(size = 16)) + 268 | theme(legend.text = element_text(size = 16)) + 269 | theme(aspect.ratio = 1) 270 | ggsave( 271 | paste0( 272 | "outs/pdac_comp/figures/", 273 | "12B_pdac_knn_aov_comp_ds_f_statistic.pdf" 274 | ), 275 | width = 12, 276 | height = 12, 277 | device = cairo_pdf 278 | ) 279 | -------------------------------------------------------------------------------- /workflow/analysis/R/21_Fig_6_PDAC_Reanno_Stats_Tests.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(tidyverse) 3 | library(reshape2) 4 | library(ggplot2) 5 | library(ggthemes) 6 | library(ggExtra) 7 | library(ggpubr) 8 | library(dotwhisker) 9 | library(Seurat) 10 | library(SeuratDisk) 11 | library(ComplexHeatmap) 12 | library(circlize) 13 | library(RColorBrewer) 14 | library(Cairo) 15 | library(networkD3) 16 | 17 | ### Note that this analysis done without LIGER ### 18 | `%ni%` <- Negate(`%in%`) 19 | kev_palette <- c( 20 | "dodgerblue2", "#E31A1C", 21 | "green4", 22 | "#6A3D9A", 23 | "#FF7F00", 24 | "black", "gold1", 25 | "skyblue2", "#FB9A99", 26 | "palegreen2", 27 | "#CAB2D6", 28 | "#FDBF6F", 29 | "gray70", "khaki2", 30 | "maroon", "orchid1", "deeppink1", "blue1", "steelblue4", 31 | "darkturquoise", "green1", "yellow4", "yellow3", 32 | "darkorange4", "brown" 33 | ) 34 | 35 | # Load in and concatenate imbalance summary files 36 | setwd("../../../results/pdac_comp_reanno/imbalance_summaries/") 37 | imba_files <- list.files() 38 | imba_loaded <- lapply(imba_files, fread) 39 | imba_concat <- Reduce(rbind, imba_loaded) 40 | gc() 41 | 42 | # Load in and concatenate celltype summary files 43 | setwd("../celltype_imbalance_summaries") 44 | cimba_files <- list.files() 
45 | cimba_loaded <- lapply(cimba_files, fread) 46 | cimba_concat <- Reduce(rbind, cimba_loaded) 47 | gc() 48 | 49 | # Load in and concatenate the clustering summary results 50 | setwd("../clustering_summaries/") 51 | clus_files <- list.files() 52 | clus_loaded <- lapply(clus_files, fread) 53 | clus_concat <- Reduce(rbind, clus_loaded) 54 | clus_concat <- clus_concat[clus_concat$Method != "liger"] 55 | gc() 56 | 57 | # Load in and concatenate clustering concordance summaries 58 | setwd("../clustering_concord_summaries/") 59 | clus_concord_files <- list.files() 60 | clus_concord_loaded <- lapply(clus_concord_files, fread) 61 | clus_concord_concat <- Reduce(rbind, clus_concord_loaded) 62 | clus_concord_concat <- clus_concord_concat[ 63 | clus_concord_concat$`Method 1` != "liger" 64 | ] 65 | clus_concord_concat <- clus_concord_concat[ 66 | clus_concord_concat$`Method 2` != "liger" 67 | ] 68 | gc() 69 | 70 | # Load in and concatenate knn classification summaries 71 | setwd("../knn_classification_reports/") 72 | knn_files <- list.files() 73 | knn_loaded <- lapply(knn_files, fread) 74 | knn_concat <- Reduce(rbind, knn_loaded) 75 | knn_concat <- knn_concat[knn_concat$Method != "liger"] 76 | gc() 77 | 78 | # Change to top level dir 79 | setwd("../../..") 80 | 81 | # Make pdac comp output dir if it doesn't exist 82 | if (!dir.exists("outs/pdac_comp_reanno/figures")) { 83 | dir.create("outs/pdac_comp_reanno/figures", recursive = TRUE) 84 | } 85 | if (!dir.exists("outs/pdac_comp_reanno/results")) { 86 | dir.create("outs/pdac_comp_reanno/results", recursive = TRUE) 87 | } 88 | 89 | ### Statistical test - downsampling results on KNN classification scores 90 | ### of given subsets/compartments 91 | imba_knn_merged <- merge( 92 | imba_concat, 93 | knn_concat, 94 | by = c( 95 | "Number of batches downsampled", 96 | "Number of celltypes downsampled", 97 | "Proportion downsampled", 98 | "Replicate" 99 | ) 100 | ) 101 | imba_knn_merged <- distinct(imba_knn_merged) 102 | 103 | # Subset for only cases where the celltype/compartment downsampled is equal to 104 | # the celltype being classified 105 | imba_knn_merged_celltype <- imba_knn_merged[ 106 | imba_knn_merged$Celltype == imba_knn_merged$`Downsampled celltypes` | 107 | imba_knn_merged$`Downsampled celltypes` %in% c("None") 108 | ] 109 | 110 | # Indicate which panels are control and which ones are ablations or downsampling 111 | imba_knn_merged_celltype$type <- ifelse( 112 | imba_knn_merged_celltype$`Number of batches downsampled` == 0, 113 | "Control", 114 | ifelse( 115 | imba_knn_merged_celltype$`Proportion downsampled` == 0, 116 | "Ablated", 117 | "Downsampled" 118 | ) 119 | ) 120 | 121 | # Indicate the separate compartments 122 | compartments <- unique(imba_knn_merged_celltype$Celltype) 123 | 124 | # Create a function to do an ANOVA test for the F1 score based on each 125 | # compartment utilized 126 | anova_compart_knn <- function( 127 | compartment, 128 | dataset 129 | ){ 130 | # Subset data for the given compartment 131 | dataset_sub <- dataset[dataset$Celltype == compartment] 132 | 133 | # Format the data columns for lm 134 | colnames(dataset_sub) <- plyr::mapvalues( 135 | colnames(dataset_sub), 136 | from = c( 137 | "F1-score", 138 | "Method", 139 | "type" 140 | ), 141 | to = c( 142 | "f1_score", 143 | "method", 144 | "type" 145 | ) 146 | ) 147 | 148 | # Fit ANOVA model 149 | model_fit <- lm( 150 | as.formula( 151 | paste0( 152 | "f1_score", 153 | "~", 154 | "method+", 155 | "type" 156 | ) 157 | ), 158 | data = dataset_sub 159 | ) 160 | 
anova_result <- anova(model_fit, test = "F") 161 | 162 | # Format results and return 163 | anova_result_dt <- as.data.table(anova_result, keep.rownames = TRUE) 164 | colnames(anova_result_dt)[1] <- "Covariate" 165 | anova_result_dt$compartment_name <- compartment 166 | anova_result_dt$metric <- "F1 score" 167 | anova_result_dt$last_covariate <- "type" 168 | return(anova_result_dt) 169 | } 170 | 171 | # Iterate over compartments and get the significance of ds/ablation 172 | knn_anova_comp_results <- mapply( 173 | anova_compart_knn, 174 | compartment = compartments, 175 | MoreArgs = list( 176 | dataset = imba_knn_merged_celltype 177 | ), 178 | SIMPLIFY = FALSE 179 | ) 180 | knn_anova_comp_results 181 | 182 | # Save the concatenated results and plot the ANOVA F-values for 183 | # a supplementary figure 184 | knn_anova_comp_results_concat <- Reduce(rbind, knn_anova_comp_results) 185 | fwrite( 186 | knn_anova_comp_results_concat, 187 | "outs/pdac_comp_reanno/results/21_comp_specific_ds_knn_f1_score_anovas.tsv", 188 | sep = "\t", 189 | quote = FALSE, 190 | row.names = FALSE, 191 | col.names = TRUE 192 | ) 193 | 194 | knn_anova_comp_results_concat_nores <- knn_anova_comp_results_concat[ 195 | knn_anova_comp_results_concat$Covariate != "Residuals" 196 | ] 197 | 198 | f_vals <- knn_anova_comp_results_concat_nores$`F value` 199 | covars <- knn_anova_comp_results_concat_nores$Covariate 200 | comps <- knn_anova_comp_results_concat_nores$compartment_name 201 | 202 | knn_aov_comp_df <- data.frame( 203 | "Covariates" = covars, 204 | "F_values" = f_vals, 205 | "Compartment" = comps 206 | ) 207 | 208 | knn_aov_comp_df_melted <- reshape2::melt( 209 | knn_aov_comp_df, 210 | id.vars = c("Compartment", "Covariates"), 211 | measure.vars = "F_values" 212 | ) 213 | knn_aov_comp_df_melted$Covariates <- plyr::mapvalues( 214 | knn_aov_comp_df_melted$Covariates, 215 | from = c( 216 | "type", 217 | "method" 218 | ), 219 | to = c( 220 | "Unperturbed vs perturbed", 221 | "Integration method" 222 | ) 223 | ) 224 | 225 | ggplot(data = knn_aov_comp_df_melted, aes(Covariates, value)) + 226 | geom_bar( 227 | stat = "identity", 228 | position = position_dodge2(), 229 | aes( 230 | fill = Compartment 231 | ) 232 | ) + 233 | scale_fill_brewer(palette = "Set1") + 234 | theme_classic() + 235 | coord_flip () + 236 | labs(x = "Covariate", y = "ANOVA F-statistic") + 237 | theme(axis.title.x = element_text(size = 16)) + 238 | theme(axis.title.y = element_text(size = 16)) + 239 | theme(strip.text.x = element_text(size = 16)) + 240 | theme(strip.text.y = element_text(size = 16)) + 241 | theme(plot.title = element_text(size = 14)) + 242 | theme(axis.text.x = element_text(size = 16)) + 243 | theme(axis.text.y = element_text(size = 16)) + 244 | theme(legend.title = element_text(size = 16)) + 245 | theme(legend.text = element_text(size = 16)) + 246 | theme(aspect.ratio = 1) 247 | ggsave( 248 | paste0( 249 | "outs/pdac_comp_reanno/figures/", 250 | "21_pdac_knn_aov_comp_ds_f_statistic.pdf" 251 | ), 252 | width = 12, 253 | height = 12, 254 | device = cairo_pdf 255 | ) 256 | -------------------------------------------------------------------------------- /workflow/analysis/R/21_PBMC_perturbation_umap_plots.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(tidyverse) 3 | library(reshape2) 4 | library(ggplot2) 5 | library(ggthemes) 6 | library(ggExtra) 7 | library(ggpubr) 8 | library(dotwhisker) 9 | library(Seurat) 10 | library(SeuratDisk) 11 | library(ComplexHeatmap) 12 | 
library(circlize) 13 | library(RColorBrewer) 14 | library(Cairo) 15 | 16 | # Helper functions 17 | `%ni%` <- Negate(`%in%`) 18 | 19 | # Change to results dir for uamp results data 20 | setwd("../../../results/umap/") 21 | 22 | # Load color palette 23 | kev_palette <- c("dodgerblue2", 24 | "#E31A1C", 25 | "green4", 26 | "#6A3D9A", 27 | "#FF7F00", 28 | "black", 29 | "gold1", 30 | "skyblue2", 31 | "#FB9A99", 32 | "palegreen2", 33 | "#CAB2D6", 34 | "#FDBF6F", 35 | "gray70", 36 | "khaki2", 37 | "maroon", 38 | "orchid1", 39 | "deeppink1", 40 | "blue1", 41 | "steelblue4", 42 | "darkturquoise", 43 | "green1", 44 | "yellow4", 45 | "yellow3", 46 | "darkorange4", 47 | "brown") 48 | 49 | ##### Analysis of PBMC 2 batch balanced data - baseline ##### 50 | 51 | # Load in the umap plot results 52 | setwd("umap_plots/") 53 | umap_files <- list.files() 54 | umap_files <- grep( 55 | ".tsv", 56 | umap_files, 57 | value = TRUE 58 | ) 59 | umap_files <- grep( 60 | "pbmc_2_batch_base_balanced", 61 | umap_files, 62 | value = TRUE 63 | ) 64 | umap_loaded <- lapply(umap_files, fread) 65 | umap_names <- str_split_fixed(umap_files, fixed(".tsv"), 2)[,1] 66 | names(umap_loaded) <- umap_names 67 | 68 | setwd("../../..") 69 | 70 | # Create directory for umap results if it doesn't exist 71 | if (!dir.exists("outs/umap/results")) { 72 | dir.create("outs/umap/results", recursive = TRUE) 73 | } 74 | if (!dir.exists("outs/umap/figures")) { 75 | dir.create("outs/umap/figures") 76 | } 77 | 78 | # Create function to loop over the umap files and return the results 79 | umap_plot <- function(df, save_prefix) { 80 | # Format celltype names 81 | df$Clustering <- plyr::mapvalues( 82 | df$Clustering, 83 | from = c( 84 | "Monocyte_CD14", 85 | "Monocyte_FCGR3A", 86 | "CD4 T cell", 87 | "CD8 T cell" 88 | ), 89 | to = c( 90 | "CD14+ Monocyte", 91 | "FCGR3A+ Monocyte", 92 | "CD4+ T cell", 93 | "CD8+ T cell" 94 | ) 95 | ) 96 | 97 | # Format batch names 98 | df$Clustering <- plyr::mapvalues( 99 | df$Clustering, 100 | from = c( 101 | "batch_1", 102 | "batch_2" 103 | ), 104 | to = c( 105 | "Batch 1", 106 | "Batch 2" 107 | ) 108 | ) 109 | 110 | unique_cluster_len <- length(unique(df$Clustering)) 111 | if (unique_cluster_len > 8) { 112 | ggplot(data = df, aes(x = `UMAP 1`, y = `UMAP 2`)) + 113 | geom_point( 114 | aes( 115 | color = factor( 116 | as.numeric(Clustering), 117 | levels = sort(as.numeric(unique(df$Clustering))) 118 | ) 119 | ), 120 | size = 0.25 121 | ) + 122 | facet_wrap( 123 | .~Subset, 124 | scales = "free" 125 | ) + 126 | labs( 127 | color = "", 128 | x = "UMAP 1", 129 | y = "UMAP 2" 130 | ) + 131 | scale_color_manual( 132 | name = "", 133 | values = kev_palette[1:unique_cluster_len] 134 | ) + 135 | guides(color = guide_legend(override.aes = list(size=2))) + 136 | theme_few() + 137 | theme(axis.title.x = element_text(size = 16)) + 138 | theme(axis.title.y = element_text(size = 16)) + 139 | theme(strip.text.x = element_text(size = 16)) + 140 | theme(plot.title = element_text(size = 14)) + 141 | theme(axis.text.x = element_text(size = 16)) + 142 | theme(axis.text.y = element_text(size = 16)) + 143 | theme(legend.title = element_text(size = 16)) + 144 | theme(legend.text = element_text(size = 16)) 145 | ggsave( 146 | paste0( 147 | "outs/umap/figures/", 148 | save_prefix, 149 | ".pdf" 150 | ), 151 | width = 16, 152 | height = 8, 153 | device = cairo_pdf 154 | ) 155 | } else { 156 | if (any(grepl("Batch", df$Clustering))) { 157 | pal = "Set1" 158 | } else { 159 | pal = "Dark2" 160 | } 161 | ggplot(data = df, aes(x = `UMAP 1`, y = 
`UMAP 2`)) + 162 | geom_point( 163 | aes( 164 | color = factor(Clustering), 165 | ), 166 | size = 0.5 167 | ) + 168 | facet_wrap( 169 | .~Subset, 170 | scales = "free" 171 | ) + 172 | labs( 173 | color = "", 174 | x = "UMAP 1", 175 | y = "UMAP 2" 176 | ) + 177 | guides(color = guide_legend(override.aes = list(size=2))) + 178 | scale_color_brewer(palette = pal) + 179 | theme_few() + 180 | theme(axis.title.x = element_text(size = 16)) + 181 | theme(axis.title.y = element_text(size = 16)) + 182 | theme(strip.text.x = element_text(size = 16)) + 183 | theme(plot.title = element_text(size = 14)) + 184 | theme(axis.text.x = element_text(size = 16)) + 185 | theme(axis.text.y = element_text(size = 16)) + 186 | theme(legend.title = element_text(size = 16)) + 187 | theme(legend.text = element_text(size = 16)) 188 | ggsave( 189 | paste0( 190 | "outs/umap/figures/", 191 | save_prefix, 192 | ".pdf" 193 | ), 194 | width = 16, 195 | height = 8, 196 | device = cairo_pdf 197 | ) 198 | } 199 | } 200 | 201 | # Iterate over the umap files and names and save the results 202 | mapply( 203 | umap_plot, 204 | df = umap_loaded, 205 | save_prefix = umap_names 206 | ) 207 | -------------------------------------------------------------------------------- /workflow/analysis/R/knn_example.R: -------------------------------------------------------------------------------- 1 | # Load the necessary libraries - please note that the analysis environment 2 | # (found in envs/analysis.yaml) should be used to run this script 3 | library(data.table) 4 | library(tidyverse) 5 | library(reshape2) 6 | library(ggplot2) 7 | library(ggthemes) 8 | library(ggExtra) 9 | library(ggpubr) 10 | library(dotwhisker) 11 | library(Seurat) 12 | library(SeuratDisk) 13 | library(ComplexHeatmap) 14 | library(circlize) 15 | library(RColorBrewer) 16 | library(Cairo) 17 | 18 | # Helper functions 19 | `%ni%` <- Negate(`%in%`) 20 | 21 | # Change to results dir for the custom data - 22 | # change this directory as necessary 23 | setwd("../../../results/custom") 24 | 25 | # Load in and concatenate imbalance summary files 26 | setwd("imbalance_summaries/") 27 | imba_files <- list.files() 28 | imba_loaded <- lapply(imba_files, fread) 29 | imba_concat <- Reduce(rbind, imba_loaded) 30 | 31 | # Load in and concatenate knn classification summaries 32 | setwd("../knn_classification_reports/") 33 | knn_files <- list.files() 34 | knn_loaded <- lapply(knn_files, fread) 35 | knn_concat <- Reduce(rbind, knn_loaded) 36 | 37 | # Change to top level dir 38 | setwd("../../..") 39 | 40 | # Create directory for output of results and figures 41 | if (!dir.exists("outs/custom/figures")) { 42 | dir.create("outs/custom/figures", recursive = TRUE) 43 | } 44 | if (!dir.exists("outs/custom/results")) { 45 | dir.create("outs/custom/results", recursive = TRUE) 46 | } 47 | 48 | ### Results of celltype downsampling and ablation on 49 | ### KNN classification scores 50 | 51 | # Merge imbalance and knn classification results together - several 52 | # other analysis can be done with this data now, but we'll only 53 | # highlight the main KNN analysis that was done in the paper 54 | imba_knn_merged <- merge( 55 | imba_concat, 56 | knn_concat, 57 | by = c( 58 | "Number of batches downsampled", 59 | "Number of celltypes downsampled", 60 | "Proportion downsampled", 61 | "Replicate" 62 | ) 63 | ) 64 | imba_knn_merged <- distinct(imba_knn_merged) 65 | 66 | # Subset for only cases where the celltype downsampled is equal to the 67 | # celltype being classified 68 | imba_knn_merged_celltype <- 
imba_knn_merged[ 69 | imba_knn_merged$Celltype == imba_knn_merged$`Downsampled celltypes` | 70 | imba_knn_merged$`Downsampled celltypes` %in% c("None") 71 | ] 72 | 73 | # Indicate which panels are control and which ones are ablations or downsampling 74 | imba_knn_merged_celltype$type <- ifelse( 75 | imba_knn_merged_celltype$`Number of batches downsampled` == 0, 76 | "Control", 77 | ifelse( 78 | imba_knn_merged_celltype$`Proportion downsampled` == 0, 79 | "Ablated", 80 | "Downsampled" 81 | ) 82 | ) 83 | 84 | # Create function to format facet labels (downsampled celltypes) and plot 85 | # the results (this figure will be in the same format as the paper) 86 | ds_celltype_labelled <- function(variable,value){ 87 | return(paste0("Cell-type affected = ", value)) 88 | } 89 | 90 | ggplot(data = imba_knn_merged_celltype, aes(x = `Method`, y = `F1-score`)) + 91 | geom_boxplot( 92 | aes( 93 | fill = factor(`type`, levels = c("Control", "Downsampled", "Ablated")), 94 | ), 95 | notch = FALSE, 96 | alpha = 0.8 97 | ) + 98 | ylim(0, 1) + 99 | facet_wrap( 100 | .~Celltype, 101 | scales = "free_x", 102 | labeller = ds_celltype_labelled, 103 | ncol = 2 104 | ) + 105 | labs( 106 | fill = "Type", 107 | x = "Method", 108 | y = "Affected celltype F1-classification score post-integration" 109 | ) + 110 | scale_fill_manual( 111 | breaks = c("Control", "Downsampled", "Ablated"), 112 | values = c("forestgreen", "darkorchid3", "firebrick2") 113 | ) + 114 | theme_few() + 115 | theme(axis.title.x = element_text(size = 16)) + 116 | theme(axis.title.y = element_text(size = 16)) + 117 | theme(strip.text.x = element_text(size = 16)) + 118 | theme(plot.title = element_text(size = 14)) + 119 | theme(axis.text.x = element_text(size = 16)) + 120 | theme(axis.text.y = element_text(size = 16)) + 121 | theme(legend.title = element_text(size = 16)) + 122 | theme(legend.text = element_text(size = 16)) 123 | # Change the name and extension of file as necessary 124 | ggsave( 125 | "outs/custom/figures/ds_ablate_allmethod_knn_f1_score.pdf", 126 | width = 12, 127 | height = 14, 128 | device = cairo_pdf 129 | ) -------------------------------------------------------------------------------- /workflow/configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pbmc_2_batch" : { 3 | "data_folder": "pbmc_2_batch", 4 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 5 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 6 | "num_batches": [0, 1], 7 | "repetitions": 10 8 | }, 9 | "pbmc_4_batch" : { 10 | "data_folder": "pbmc_4_batch", 11 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 12 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 13 | "num_batches": [0, 1, 2, 3], 14 | "repetitions": 10 15 | }, 16 | "mouse_hindbrain_10_batch": { 17 | "data_folder": "mouse_hindbrain_10_batch", 18 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 19 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 20 | "num_batches": [0, 2, 4, 6, 8], 21 | "repetitions": 10 22 | }, 23 | "peng_pdac_23_batch": { 24 | "data_folder": "peng_pdac_23_batch", 25 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 26 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 27 | "num_batches": [0, 3, 6, 9, 12, 15, 18, 21], 28 | "repetitions": 10 29 | }, 30 | "steele_pdac_17_batch": { 31 | "data_folder": "steele_pdac_17_batch", 32 | "ds_celltypes": [1, 2, 3, 4, 5, 6], 33 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05, 0], 34 | "num_batches": [0, 2, 4, 6, 8, 10, 12, 14, 16], 35 | "repetitions": 10 36 | } 37 | } 
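Each top-level entry in `config.json` above describes one dataset and the grid of downsampling perturbations applied to it: `ds_celltypes` is the number of celltypes downsampled, `ds_proportions` is the downsampling proportion applied to those celltypes (0 corresponds to full ablation, per the analysis scripts), `num_batches` is the number of batches perturbed, and `repetitions` is the number of repetitions. The short Python sketch below is purely illustrative and not part of the Snakemake workflow; it simply enumerates that grid for inspection:

```python
import json
from itertools import product

# Illustrative only: inspect the perturbation grid encoded in config.json
with open("workflow/configs/config.json") as f:
    config = json.load(f)

for dataset, params in config.items():
    combos = list(product(
        params["ds_celltypes"],    # number of celltypes downsampled
        params["ds_proportions"],  # downsampling proportion (0 = full ablation)
        params["num_batches"],     # number of batches downsampled
    ))
    print(f"{dataset}: {len(combos)} combinations x {params['repetitions']} repetitions")
```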
-------------------------------------------------------------------------------- /workflow/configs/config_control.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "control", 3 | "int_datasets": { 4 | "pbmc_2_batch_base_balanced" : { 5 | "data_folder": "pbmc_2_batch_base_balanced", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 200 10 | }, 11 | "pbmc_2_batch_hierarchical_balanced": { 12 | "data_folder": "pbmc_2_batch_hierarchical_balanced", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.1, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 200 17 | } 18 | }, 19 | "int_ti_datasets": {}, 20 | "query_to_reference": "Yes", 21 | "celltype_list": "No" 22 | } -------------------------------------------------------------------------------- /workflow/configs/config_control_ti_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "control_ti_only", 3 | "int_datasets": {}, 4 | "int_ti_datasets": { 5 | "cao_organ_dev_sublin_2_batch": { 6 | "data_folder": "cao_organ_dev_sublin_balanced_2_batch", 7 | "root_celltype": "Early_mesenchyme", 8 | "ds_celltypes": [1], 9 | "ds_proportions": [0.1, 0], 10 | "num_batches": [0, 1], 11 | "repetitions": 200 12 | } 13 | }, 14 | "query_to_reference": "No", 15 | "celltype_list": "No" 16 | } -------------------------------------------------------------------------------- /workflow/configs/config_custom.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "custom", 3 | "int_datasets": { 4 | "custom_dataset" : { 5 | "data_folder": "custom_dataset", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 200 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "lowcap", 3 | "int_datasets": { 4 | "pbmc_2_batch" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.25, 0.1, 0.05, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 50 10 | }, 11 | "pbmc_4_batch" : { 12 | "data_folder": "pbmc_4_batch", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.25, 0.1, 0.05, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 50 17 | }, 18 | "mouse_hindbrain_6_batch": { 19 | "data_folder": "mouse_hindbrain_6_batch", 20 | "ds_celltypes": [1], 21 | "ds_proportions": [0.25, 0.1, 0.05, 0], 22 | "num_batches": [0, 1], 23 | "repetitions": 50 24 | }, 25 | "peng_pdac_8_batch": { 26 | "data_folder": "peng_pdac_8_batch", 27 | "ds_celltypes": [1], 28 | "ds_proportions": [0.25, 0.1, 0.05, 0], 29 | "num_batches": [0, 1], 30 | "repetitions": 50 31 | } 32 | }, 33 | "int_ti_datasets": { 34 | "cao_organ_dev_sublin_2_batch": { 35 | "data_folder": "cao_organ_dev_sublin_2_batch", 36 | "root_celltype": "Early_mesenchyme", 37 | "ds_celltypes": [1], 38 | "ds_proportions": [0.25, 0.1, 0.05, 0], 39 | "num_batches": [0, 1], 40 | "repetitions": 50 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap_control_like.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": 
"lowcap_control_like", 3 | "int_datasets": { 4 | "pbmc_2_batch" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 50 10 | }, 11 | "pbmc_4_batch" : { 12 | "data_folder": "pbmc_4_batch", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.1, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 50 17 | }, 18 | "mouse_hindbrain_6_batch": { 19 | "data_folder": "mouse_hindbrain_6_batch", 20 | "ds_celltypes": [1], 21 | "ds_proportions": [0.1, 0], 22 | "num_batches": [0, 1], 23 | "repetitions": 50 24 | }, 25 | "peng_pdac_8_batch": { 26 | "data_folder": "peng_pdac_8_batch", 27 | "ds_celltypes": [1], 28 | "ds_proportions": [0.1, 0], 29 | "num_batches": [0, 1], 30 | "repetitions": 50 31 | } 32 | }, 33 | "int_ti_datasets": { 34 | "cao_organ_dev_sublin_2_batch": { 35 | "data_folder": "cao_organ_dev_sublin_2_batch", 36 | "root_celltype": "Early_mesenchyme", 37 | "ds_celltypes": [1], 38 | "ds_proportions": [0.1, 0], 39 | "num_batches": [0, 1], 40 | "repetitions": 50 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap_modified.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "lowcap_modified", 3 | "int_datasets": { 4 | "pbmc_2_batch" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [0], 7 | "ds_proportions": [0], 8 | "num_batches": [0], 9 | "repetitions": 50 10 | }, 11 | "pbmc_4_batch" : { 12 | "data_folder": "pbmc_4_batch", 13 | "ds_celltypes": [0], 14 | "ds_proportions": [0], 15 | "num_batches": [0], 16 | "repetitions": 50 17 | }, 18 | "mouse_hindbrain_6_batch": { 19 | "data_folder": "mouse_hindbrain_6_batch", 20 | "ds_celltypes": [0], 21 | "ds_proportions": [0], 22 | "num_batches": [0], 23 | "repetitions": 50 24 | }, 25 | "peng_pdac_8_batch": { 26 | "data_folder": "peng_pdac_tumor_annot_8_batch_granular", 27 | "ds_celltypes": [0], 28 | "ds_proportions": [0], 29 | "num_batches": [0], 30 | "repetitions": 50 31 | } 32 | }, 33 | "int_ti_datasets": {}, 34 | "query_to_reference": "No", 35 | "celltype_list": "No" 36 | } -------------------------------------------------------------------------------- /workflow/configs/config_lowcap_modified_pdac_reanno.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "lowcap_modified_pdac_reanno", 3 | "int_datasets": { 4 | "peng_pdac_8_batch": { 5 | "data_folder": "peng_pdac_tumor_annot_8_batch_granular", 6 | "ds_celltypes": [0], 7 | "ds_proportions": [0], 8 | "num_batches": [0], 9 | "repetitions": 50 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_pdac_comp.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "pdac_comp", 3 | "int_datasets": { 4 | "peng_pdac_tumor_annot_8_batch": { 5 | "data_folder": "peng_pdac_tumor_annot_8_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 4], 9 | "repetitions": 50 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_pdac_comp_reanno.json: -------------------------------------------------------------------------------- 
1 | { 2 | "config_name": "pdac_comp_reanno", 3 | "int_datasets": { 4 | "peng_pdac_tumor_annot_8_reanno_batch": { 5 | "data_folder": "peng_pdac_tumor_annot_8_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 4], 9 | "repetitions": 50 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/configs/config_umap.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "umap", 3 | "int_datasets": { 4 | "pbmc_2_batch_base_balanced" : { 5 | "data_folder": "pbmc_2_batch_base_balanced", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0], 8 | "num_batches": [0, 1], 9 | "repetitions": 1 10 | }, 11 | "pbmc_2_batch_hierarchical_balanced": { 12 | "data_folder": "pbmc_2_batch_hierarchical_balanced", 13 | "ds_celltypes": [1], 14 | "ds_proportions": [0.1, 0], 15 | "num_batches": [0, 1], 16 | "repetitions": 1 17 | }, 18 | "pbmc_2_batch" : { 19 | "data_folder": "pbmc_2_batch", 20 | "ds_celltypes": [0], 21 | "ds_proportions": [0], 22 | "num_batches": [0], 23 | "repetitions": 1 24 | }, 25 | "pbmc_4_batch" : { 26 | "data_folder": "pbmc_4_batch", 27 | "ds_celltypes": [0], 28 | "ds_proportions": [0], 29 | "num_batches": [0], 30 | "repetitions": 1 31 | }, 32 | "mouse_hindbrain_6_batch": { 33 | "data_folder": "mouse_hindbrain_6_batch", 34 | "ds_celltypes": [0], 35 | "ds_proportions": [0], 36 | "num_batches": [0], 37 | "repetitions": 1 38 | }, 39 | "peng_pdac_8_batch": { 40 | "data_folder": "peng_pdac_tumor_annot_8_batch", 41 | "ds_celltypes": [0], 42 | "ds_proportions": [0], 43 | "num_batches": [0], 44 | "repetitions": 1 45 | } 46 | }, 47 | "int_ti_datasets": {}, 48 | "query_to_reference": "No", 49 | "celltype_list": "No" 50 | } -------------------------------------------------------------------------------- /workflow/configs/test_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "pbmc_2_batch" : { 3 | "data_folder": "pbmc_2_batch", 4 | "ds_celltypes": [1, 2, 3, 4, 5, 6, 7, 8, 9], 5 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05], 6 | "num_batches": [0, 1], 7 | "repetitions": 10 8 | }, 9 | "pbmc_4_batch" : { 10 | "data_folder": "pbmc_4_batch", 11 | "ds_celltypes": [1, 2, 3, 4, 5, 6, 7], 12 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05], 13 | "num_batches": [0, 1, 2, 3], 14 | "repetitions": 10 15 | }, 16 | "mouse_hindbrain_10_batch": { 17 | "data_folder": "mouse_hindbrain_10_batch", 18 | "ds_celltypes": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 19 | "ds_proportions": [0.75, 0.5, 0.25, 0.1, 0.05], 20 | "num_batches": [0, 2, 4, 6, 8], 21 | "repetitions": 10 22 | } 23 | } -------------------------------------------------------------------------------- /workflow/configs/test_config_lite.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name": "test_config_lite", 3 | "int_datasets": { 4 | "pbmc_2_batch_balanced" : { 5 | "data_folder": "pbmc_2_batch", 6 | "ds_celltypes": [1], 7 | "ds_proportions": [0.1, 0.25], 8 | "num_batches": [0, 1], 9 | "repetitions": 2 10 | } 11 | }, 12 | "int_ti_datasets": {}, 13 | "query_to_reference": "No", 14 | "celltype_list": "No" 15 | } -------------------------------------------------------------------------------- /workflow/envs/analysis.yaml: -------------------------------------------------------------------------------- 1 | name: 
iniq_analysis 2 | channels: 3 | - pytorch 4 | - bioconda 5 | - r 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - python>=3.7,<=3.10 10 | - numpy>=1.19.0,<=1.23.5 11 | - pandas>=1.2.0 12 | - scipy>=1.5.0 13 | - seaborn>=0.11.2 14 | - plotnine>=0.8.0 15 | - leidenalg>=0.8.0 16 | - umap-learn>=0.5.0 17 | - scikit-learn=1.0.1 18 | - scanpy=1.8.2 19 | - anndata>=0.7.5 20 | - ipykernel>=6.4.0 21 | - jupyterlab>=3.2.9 22 | - notebook>=6.4.2 23 | - scvi-tools=0.14.4 24 | - pytorch=1.10.1 25 | - torchmetrics<=0.6.0 # Issue with torch loading 26 | - cudatoolkit=10.2 27 | - bbknn=1.5.1 28 | - harmonypy=0.0.5 29 | - scanorama=1.7.1 30 | - r-base>=4.0.5 31 | - r-seurat>=4.0.5 32 | - r-data.table>=1.14.0 33 | - r-ggplot2>=3.3.0 34 | - r-tidyverse>=1.2.1 35 | - r-reshape2>=1.4.3 36 | - r-ggthemes>=4.2.0 37 | - r-ggextra>=0.8.0 38 | - r-dotwhisker>=0.7.4 39 | - r-seuratdisk>=0.0.9019 40 | - r-deldir>=1.0.2 41 | - r-ggpubr>=0.4.0 42 | - r-cowplot>=1.1.1 43 | - r-ggrepel>=0.9.1 44 | - r-rcolorbrewer>=1.1 45 | - r-ggbump>=0.1.0 46 | - bioconductor-complexheatmap<=2.9.0 47 | - r-venndiagram>=1.7.1 48 | - r-multipanelfigure>=2.1.2 49 | - r-gridextra>=2.3 50 | - r-cairo>=1.5 51 | - r-lemon>=0.4.5 52 | - r-networkd3>=0.4 53 | - r-emt>=1.2 54 | - cython>=0.29.25 55 | - mkl==2024.0 # Pinning due to https://github.com/pytorch/pytorch/issues/123097 -------------------------------------------------------------------------------- /workflow/envs/integrate.yaml: -------------------------------------------------------------------------------- 1 | name: iniq_integrate 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - python>=3.7,<=3.10 9 | - matplotlib<3.7 10 | - numpy>=1.19.0,<=1.23.5 11 | - pandas>=1.2.0,<1.5.0 12 | - scipy>=1.5.0 13 | - leidenalg>=0.8.0 14 | - umap-learn>=0.5.0,<=0.5.3 15 | - mnnpy>=0.1.9.0,<=0.1.9.5 16 | - scikit-learn=1.0.1 17 | - scanpy=1.8.2 18 | - anndata=0.8.0 19 | - faiss-cpu>=1.7.0,<=1.7.3 20 | - pytorch=1.10.1 21 | - torchmetrics<=0.6.0 # Issue with torch loading 22 | - cudatoolkit=10.2 23 | - scvi-tools=0.14.4 24 | - bbknn=1.5.1 25 | - harmonypy=0.0.5 26 | - scanorama=1.7.1 27 | - r-base>=4.0.0 28 | - r-liger=0.5.0 29 | - r-seurat=4.0.6 30 | - r-seuratdisk=0.0.9019 31 | - r-data.table>=1.14.0 32 | - r-reticulate=1.24 33 | - cython>=0.29.25,<=0.29.34 34 | - r-rann=2.6.1 35 | - natsort>=7.0.0 36 | - colorcet>=3.0.0 37 | - seaborn>=0.11.0 38 | - mkl==2024.0 # Pinning due to https://github.com/pytorch/pytorch/issues/123097 39 | variables: 40 | TMPDIR: "/tmp" 41 | -------------------------------------------------------------------------------- /workflow/preprocessing/pdac/Snakefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hsmaan/Iniquitate/cb20fe1240be6cb03dd63f1151816ec9d3b70a84/workflow/preprocessing/pdac/Snakefile -------------------------------------------------------------------------------- /workflow/preprocessing/pdac/preprocess_env.yaml: -------------------------------------------------------------------------------- 1 | name: iniq_pdac_preprocess 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.7,<=3.10 8 | - numpy>=1.19.0 9 | - pandas>=1.2.0 10 | - scipy>=1.5.0 11 | - leidenalg>=0.8.0 12 | - umap-learn>=0.5.0 13 | - scanpy=1.8.2 14 | - anndata>=0.7.5 15 | - pip 16 | - pip: 17 | - infercnvpy -------------------------------------------------------------------------------- 
/workflow/scripts/python/annotation_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | from sklearn.metrics import accuracy_score, balanced_accuracy_score, \ 11 | f1_score, classification_report 12 | 13 | def none_or_str(value): 14 | if value == 'None': 15 | return None 16 | return value 17 | 18 | def main(h5ad_loc, save_loc, dataset_name, ds_celltypes, ds_proportions, 19 | num_batches, rep): 20 | # Load h5ad file for query to reference mapping results 21 | adata = sc.read_h5ad(h5ad_loc) 22 | 23 | # Get the classification results as a dataframe 24 | class_results = pd.DataFrame({ 25 | "Real celltype": adata.obs["celltype"], 26 | "Predicted L1": adata.obs["predicted.celltype.l1"], 27 | "Predicted L2": adata.obs["predicted.celltype.l2"], 28 | "Control predicted L1": adata.obs["baseline.knn.l1"], 29 | "Control predicted L2": adata.obs["baseline.knn.l2"] 30 | }) 31 | 32 | # Append information on dataset to results 33 | class_results["Dataset"] = dataset_name 34 | class_results["Number of batches downsampled"] = num_batches 35 | class_results["Number of celltypes downsampled"] = ds_celltypes 36 | class_results["Proportion downsampled"] = ds_proportions 37 | class_results["Replicate"] = rep 38 | 39 | # Save results to file 40 | class_results.to_csv(save_loc, index=False, sep="\t") 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser( 44 | description = "Input and output files for annotation results summary" 45 | ) 46 | parser.add_argument( 47 | "--infile", 48 | type = str, 49 | help = "Path of Seurat annotated h5ad file" 50 | ) 51 | parser.add_argument( 52 | "--outfile", 53 | type = str, 54 | help = "Filepath for saving annotation results" 55 | ) 56 | parser.add_argument( 57 | "--dataset", 58 | type = str, 59 | help = "Name of dataset" 60 | ) 61 | parser.add_argument( 62 | "--rep", 63 | type = int, 64 | help = "Repetition number" 65 | ) 66 | parser.add_argument( 67 | "--ds_celltypes", 68 | type = int, 69 | help = "Number of celltypes to randomly downsample in given batch" 70 | ) 71 | parser.add_argument( 72 | "--ds_proportions", 73 | type = float, 74 | help = "Proportion of downsampling per celltype in a given batch" 75 | ) 76 | parser.add_argument( 77 | "--num_batches", 78 | type = int, 79 | help = "Number of batches to perform downsampling on" 80 | ) 81 | args = parser.parse_args() 82 | main( 83 | h5ad_loc = args.infile, 84 | save_loc = args.outfile, 85 | dataset_name = args.dataset, 86 | rep = args.rep, 87 | ds_celltypes = args.ds_celltypes, 88 | ds_proportions = args.ds_proportions, 89 | num_batches = args.num_batches 90 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/celltype_imbalance_summary.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import scanpy as sc 7 | 8 | def main(h5ad_loc, save_loc, dataset_name, rep): 9 | # Load h5ad file 10 | adata_full = sc.read_h5ad(h5ad_loc) 11 | 12 | # Extract data from just one integration method subset 13 | int_method_select = np.random.choice( 14 | np.unique(adata_full.obs.integration_method.__array__()) 15 | ) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = 
adata_full.uns["downsampling_stats"]["num_batches"] 19 | batches_ds = adata_full.uns["downsampling_stats"]["ds_batch_names"] 20 | num_celltypes_ds = adata_full.uns["downsampling_stats"]["num_celltypes_downsampled"] 21 | prop_ds = adata_full.uns["downsampling_stats"]["proportion_downsampled"] 22 | downsampled_celltypes = adata_full.uns["downsampling_stats"]["downsampled_celltypes"] 23 | 24 | # Format downsampled celltypes and batches to correspond to a single item 25 | if isinstance(downsampled_celltypes, str): 26 | if downsampled_celltypes == "None": 27 | downsampled_celltypes = "None" 28 | else: 29 | raise ValueError("Downsampled celltypes is a str and not 'None'") 30 | elif isinstance(downsampled_celltypes, np.ndarray): 31 | if downsampled_celltypes.shape == (1,): 32 | downsampled_celltypes = downsampled_celltypes[0] 33 | else: 34 | downsampled_celltypes = np.concatenate(downsampled_celltypes).flatten() 35 | downsampled_celltypes = ", ".join(downsampled_celltypes) 36 | else: 37 | raise TypeError("Downsampled celltypes is not a str or ndarray") 38 | 39 | if isinstance(batches_ds, str): 40 | if batches_ds == "None": 41 | batches_ds = "None" 42 | elif batches_ds == "Placeholder due to h5py bug": 43 | batches_ds = "Placeholder due to h5py bug" 44 | else: 45 | raise ValueError("Downsampled batches is a str and not 'None'") 46 | elif isinstance(batches_ds, np.ndarray): 47 | if batches_ds.shape == (1,): 48 | batches_ds = batches_ds[0] 49 | else: 50 | batches_ds = np.concatenate(batches_ds).flatten() 51 | batches_ds = ", ".join(batches_ds) 52 | else: 53 | raise TypeError("Downsampled batches is not a str or ndarray") 54 | 55 | # Subset data for only one method and split datasets by batch 56 | adata_select = adata_full[adata_full.obs.integration_method == int_method_select] 57 | adata_list = [] 58 | batches = np.unique(adata_select.obs.batch.__array__()) 59 | for batch in batches: 60 | adata_batch_select = adata_select[adata_select.obs.batch == batch] 61 | adata_list.append(adata_batch_select) 62 | 63 | # Get celltype value counts for each batch 64 | val_counts_dfs = [] 65 | for idx, adata in enumerate(adata_list): 66 | val_counts_df = pd.DataFrame(adata.obs.celltype.value_counts()) 67 | val_counts_df = val_counts_df.reset_index() 68 | val_counts_df.columns = ["celltype", "celltype_count_batch_{}".format(idx)] 69 | val_counts_dfs.append(val_counts_df) 70 | 71 | # Concatenate all celltype value counts results 72 | merge = functools.partial(pd.merge, on = ["celltype"], how = "outer") 73 | val_counts_merged = functools.reduce(merge, val_counts_dfs) 74 | 75 | # Replace NAs with 0 and add downsampling information 76 | val_counts_merged.iloc[:, 1:] = val_counts_merged.iloc[:, 1:].fillna(0) 77 | val_counts_merged["Dataset"] = dataset_name 78 | val_counts_merged["Number of batches downsampled"] = num_batches_ds 79 | val_counts_merged["Batches downsampled"] = batches_ds 80 | val_counts_merged["Number of celltypes downsampled"] = num_celltypes_ds 81 | val_counts_merged["Proportion downsampled"] = prop_ds 82 | val_counts_merged["Downsampled celltypes"] = downsampled_celltypes 83 | val_counts_merged["Replicate"] = rep 84 | val_counts_merged["Total batches"] = len(batches) 85 | val_counts_merged.to_csv(save_loc, index=False, sep="\t") 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser( 89 | description = "Input and output files for celltype imbalance summary" 90 | ) 91 | parser.add_argument( 92 | "--infile", 93 | type = str, 94 | help = "Path of integrated h5ad file" 95 | ) 96 | 
parser.add_argument( 97 | "--outfile", 98 | type = str, 99 | help = "Filepath for saving celltype imbalance statistics of h5ad file" 100 | ) 101 | parser.add_argument( 102 | "--dataset", 103 | type = str, 104 | help = "Name of dataset" 105 | ) 106 | parser.add_argument( 107 | "--rep", 108 | type = int, 109 | help = "Repetition number" 110 | ) 111 | args = parser.parse_args() 112 | main( 113 | h5ad_loc = args.infile, 114 | save_loc = args.outfile, 115 | dataset_name = args.dataset, 116 | rep = args.rep 117 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/clustering_concordance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | from utils import cluster_concordance 11 | 12 | def main(h5ad_loc, save_loc, dataset_name, rep): 13 | # Load h5ad file 14 | adata = sc.read_h5ad(h5ad_loc) 15 | 16 | # Extract summary statistics from h5ad file 17 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 18 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 19 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 20 | 21 | # Get clustering concordance results 22 | cluster_concordance_df = cluster_concordance(adata = adata) 23 | 24 | # Create cluster concordance summary df 25 | cluster_concordance_summary_df = pd.DataFrame({ 26 | "Dataset": dataset_name, 27 | "Number of batches downsampled": num_batches_ds, 28 | "Number of celltypes downsampled": num_celltypes_ds, 29 | "Proportion downsampled": prop_ds, 30 | "Replicate": rep, 31 | "Method 1": cluster_concordance_df["Method 1"].__array__(), 32 | "Method 2": cluster_concordance_df["Method 2"].__array__(), 33 | "ARI": cluster_concordance_df["ARI"].__array__(), 34 | "Median ARI": cluster_concordance_df["Median ARI"].__array__() 35 | }) 36 | 37 | # Save clustering concordance dataframe to tsv 38 | cluster_concordance_summary_df.to_csv( 39 | save_loc, 40 | index=False, 41 | sep="\t" 42 | ) 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser( 46 | description = "Input and output files for downsampling summary" 47 | ) 48 | parser.add_argument( 49 | "--infile", 50 | type = str, 51 | help = "Path of integrated h5ad file" 52 | ) 53 | parser.add_argument( 54 | "--outfile", 55 | type = str, 56 | help = "Filepath for saving clustering concordance statistics tsv" 57 | ) 58 | parser.add_argument( 59 | "--dataset", 60 | type = str, 61 | help = "Name of dataset" 62 | ) 63 | parser.add_argument( 64 | "--rep", 65 | type = int, 66 | help = "Repetition number" 67 | ) 68 | args = parser.parse_args() 69 | main( 70 | h5ad_loc = args.infile, 71 | save_loc = args.outfile, 72 | dataset_name = args.dataset, 73 | rep = args.rep 74 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/clustering_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | from sklearn import metrics 11 | 12 | from imbalanced_clustering import balanced_adjusted_rand_index, \ 13 | balanced_adjusted_mutual_info, balanced_completeness, \ 14 | balanced_homogeneity 15 | 16 | def 
main(h5ad_loc, save_loc, dataset_name, rep): 17 | # Load h5ad file 18 | adata = sc.read_h5ad(h5ad_loc) 19 | 20 | # Extract summary statistics from h5ad file 21 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 22 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 23 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 24 | 25 | # Subset h5ad based on batch-correction method used 26 | adata_method_sub = [] 27 | methods = ["harmony", "scvi", "bbknn", "scanorama", "seurat", "liger"] 28 | for method in methods: 29 | adata_sub = adata[adata.obs["integration_method"] == method] 30 | adata_method_sub.append( 31 | adata_sub 32 | ) 33 | 34 | # Get ARI, NMI, Homogeneity, Completeness values for each batch-correction method 35 | # and batch and celltype subsets. Both balanced and imbalanced subsets are considered 36 | # for the celltype data 37 | celltype_aris_imbalanced = [] 38 | celltype_amis_imbalanced = [] 39 | celltype_homs_imbalanced = [] 40 | celltype_comps_imbalanced = [] 41 | celltype_aris_balanced = [] 42 | celltype_amis_balanced = [] 43 | celltype_homs_balanced = [] 44 | celltype_comps_balanced = [] 45 | batch_aris = [] 46 | batch_amis = [] 47 | batch_homs = [] 48 | batch_comps = [] 49 | for adata_sub in adata_method_sub: 50 | celltype_aris_imbalanced.append( 51 | metrics.adjusted_rand_score( 52 | adata_sub.obs["celltype"].__array__(), 53 | adata_sub.obs["leiden"].__array__() 54 | ) 55 | ) 56 | celltype_amis_imbalanced.append( 57 | metrics.adjusted_mutual_info_score( 58 | adata_sub.obs["celltype"].__array__(), 59 | adata_sub.obs["leiden"].__array__() 60 | ) 61 | ) 62 | celltype_homs_imbalanced.append( 63 | metrics.homogeneity_score( 64 | adata_sub.obs["celltype"].__array__(), 65 | adata_sub.obs["leiden"].__array__() 66 | ) 67 | ) 68 | celltype_comps_imbalanced.append( 69 | metrics.completeness_score( 70 | adata_sub.obs["celltype"].__array__(), 71 | adata_sub.obs["leiden"].__array__() 72 | ) 73 | ) 74 | celltype_aris_balanced.append( 75 | balanced_adjusted_rand_index( 76 | adata_sub.obs["celltype"].__array__(), 77 | adata_sub.obs["leiden"].__array__(), 78 | reweigh=True 79 | ) 80 | ) 81 | celltype_amis_balanced.append( 82 | balanced_adjusted_mutual_info( 83 | adata_sub.obs["celltype"].__array__(), 84 | adata_sub.obs["leiden"].__array__(), 85 | reweigh=True 86 | ) 87 | ) 88 | celltype_homs_balanced.append( 89 | balanced_homogeneity( 90 | adata_sub.obs["celltype"].__array__(), 91 | adata_sub.obs["leiden"].__array__(), 92 | reweigh=True 93 | ) 94 | ) 95 | celltype_comps_balanced.append( 96 | balanced_completeness( 97 | adata_sub.obs["celltype"].__array__(), 98 | adata_sub.obs["leiden"].__array__(), 99 | reweigh=True 100 | ) 101 | ) 102 | batch_aris.append( 103 | 1 - metrics.adjusted_rand_score( 104 | adata_sub.obs["batch"].__array__(), 105 | adata_sub.obs["leiden"].__array__() 106 | ) 107 | ) 108 | batch_amis.append( 109 | 1 - metrics.adjusted_mutual_info_score( 110 | adata_sub.obs["batch"].__array__(), 111 | adata_sub.obs["leiden"].__array__() 112 | ) 113 | ) 114 | batch_homs.append( 115 | 1 - metrics.homogeneity_score( 116 | adata_sub.obs["batch"].__array__(), 117 | adata_sub.obs["leiden"].__array__() 118 | ) 119 | ) 120 | batch_comps.append( 121 | 1 - metrics.completeness_score( 122 | adata_sub.obs["batch"].__array__(), 123 | adata_sub.obs["leiden"].__array__() 124 | ) 125 | ) 126 | 127 | # Get number of clusters per method 128 | cluster_nums = [] 129 | for adata_sub in adata_method_sub: 130 | cluster_nums.append( 131 | 
len(np.unique(adata_sub.obs["leiden"].__array__())) 132 | ) 133 | 134 | # Get number of cells per method 135 | cell_nums = [] 136 | for adata_sub in adata_method_sub: 137 | cell_nums.append( 138 | adata_sub.n_obs 139 | ) 140 | 141 | # Create summary dataframe for clustering statistics 142 | cluster_summary_df = pd.DataFrame({ 143 | "Dataset": dataset_name, 144 | "Number of batches downsampled": num_batches_ds, 145 | "Number of celltypes downsampled": num_celltypes_ds, 146 | "Proportion downsampled": prop_ds, 147 | "Replicate": rep, 148 | "Method": methods, 149 | "Cluster number": cluster_nums, 150 | "Cell number": cell_nums, 151 | "Celltype ARI Imbalanced": celltype_aris_imbalanced, 152 | "Celltype AMI Imbalanced": celltype_amis_imbalanced, 153 | "Celltype Homogeneity Imbalanced": celltype_homs_imbalanced, 154 | "Celltype Completeness Imbalanced": celltype_comps_imbalanced, 155 | "Celltype ARI Balanced": celltype_aris_balanced, 156 | "Celltype AMI Balanced": celltype_amis_balanced, 157 | "Celltype Homogeneity Balanced": celltype_homs_balanced, 158 | "Celltype Completeness Balanced": celltype_comps_balanced, 159 | "Batch ARI": batch_aris, 160 | "Batch AMI": batch_amis, 161 | "Batch Homogeneity": batch_homs, 162 | "Batch Completeness": batch_comps 163 | }) 164 | 165 | # Save clustering summary dataframe to tsv 166 | cluster_summary_df.to_csv( 167 | save_loc, 168 | index=False, 169 | sep="\t" 170 | ) 171 | 172 | if __name__ == '__main__': 173 | parser = argparse.ArgumentParser( 174 | description = "Input and output files for clustering results summary" 175 | ) 176 | parser.add_argument( 177 | "--infile", 178 | type = str, 179 | help = "Path of integrated h5ad file" 180 | ) 181 | parser.add_argument( 182 | "--outfile", 183 | type = str, 184 | help = "Filepath for saving clustering results of integrated h5ad file" 185 | ) 186 | parser.add_argument( 187 | "--dataset", 188 | type = str, 189 | help = "Name of dataset" 190 | ) 191 | parser.add_argument( 192 | "--rep", 193 | type = int, 194 | help = "Repetition number" 195 | ) 196 | args = parser.parse_args() 197 | main( 198 | h5ad_loc = args.infile, 199 | save_loc = args.outfile, 200 | dataset_name = args.dataset, 201 | rep = args.rep 202 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_concordance_full.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import dge_top_n, diffexp 12 | 13 | def main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | k_initial = adata.uns["kmeans_stats"]["kmeans_initial_k"] 22 | k_final = adata.uns["kmeans_stats"]["kmeans_final_k"] 23 | 24 | # Check if k_final is 1 and if so, skip DGE 25 | if k_final == 1: 26 | # Create and save summary dataframe for DGE results 27 | dge_summary_df = pd.DataFrame( 28 | { 29 | "Dataset": dataset_name, 30 | "Number of batches downsampled": num_batches_ds, 31 | "Number of celltypes downsampled": num_celltypes_ds, 32 | "Proportion downsampled": prop_ds, 33 | 
"Replicate": rep, 34 | "Cluster number before convergence": k_initial, 35 | "Cluster number after convergence": k_final, 36 | "Method": "NA", 37 | "Cluster": "NA", 38 | "Differentially expressed genes": "NA - k_final = 1" 39 | }, 40 | index = [0] 41 | ) 42 | dge_summary_df.to_csv(save_loc, sep = "\t", index = False) 43 | else: 44 | # Subset adatas based on method for integration and store lognorm counts in raw 45 | # attribute for diffexp testing 46 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] 47 | method_adatas = [] 48 | for method in methods: 49 | adata_copy = adata.copy() 50 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 51 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 52 | sc.pp.normalize_total( 53 | adata_subset, 54 | target_sum = 1e4 55 | ) 56 | sc.pp.log1p(adata_subset) 57 | adata_subset.raw = adata_subset # Freeze for DGE test - lognorm counts 58 | method_adatas.append(adata_subset) 59 | 60 | # Extract top 50 DGEs for each cluster in each method 61 | method_dge_dfs = [] 62 | for adata_method_subset in method_adatas: 63 | adata_method_subset = diffexp( 64 | adata_method_subset, 65 | groupby = "kmeans_faiss", 66 | use_raw = True, 67 | method = "wilcoxon" 68 | ) 69 | dge_results = dge_top_n( 70 | adata_method_subset, 71 | n = 50, 72 | obs_group = "kmeans_faiss" 73 | ) 74 | method_dge_dfs.append(dge_results) 75 | 76 | # Concatenate DGE results from each method 77 | method_dge_dfs_concat = pd.concat(method_dge_dfs, axis = 0) 78 | 79 | # Create long form array for methods 80 | methods_long = np.repeat(np.array(methods), 50*k_final) 81 | 82 | # Create and save summary dataframe for DGE results 83 | dge_summary_df = pd.DataFrame({ 84 | "Dataset": dataset_name, 85 | "Number of batches downsampled": num_batches_ds, 86 | "Number of celltypes downsampled": num_celltypes_ds, 87 | "Proportion downsampled": prop_ds, 88 | "Replicate": rep, 89 | "Cluster number before convergence": k_initial, 90 | "Cluster number after convergence": k_final, 91 | "Method": methods_long, 92 | "Cluster": method_dge_dfs_concat["Cluster"].__array__(), 93 | "Differentially expressed genes": method_dge_dfs_concat["Top 50 DGEs"].__array__() 94 | }) 95 | dge_summary_df.to_csv(save_loc, sep = "\t", index = False) 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser( 99 | description = "Input and output files for dge concordance summary" 100 | ) 101 | parser.add_argument( 102 | "--infile", 103 | type = str, 104 | help = "Path of integrated h5ad file" 105 | ) 106 | parser.add_argument( 107 | "--outfile", 108 | type = str, 109 | help = "Filepath for saving dge concordance statistics of integrated h5ad file" 110 | ) 111 | parser.add_argument( 112 | "--dataset", 113 | type = str, 114 | help = "Name of dataset" 115 | ) 116 | parser.add_argument( 117 | "--rep", 118 | type = int, 119 | help = "Repetition number" 120 | ) 121 | args = parser.parse_args() 122 | main( 123 | h5ad_loc = args.infile, 124 | save_loc = args.outfile, 125 | dataset_name = args.dataset, 126 | rep = args.rep 127 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_concordance_stats.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import dge_top_n, diffexp 12 | 13 | def 
main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | k_initial = adata.uns["kmeans_stats"]["kmeans_initial_k"] 22 | k_final = adata.uns["kmeans_stats"]["kmeans_final_k"] 23 | 24 | # Check if k_final is 1 and if so, skip DGE 25 | if k_final == 1: 26 | # Create and save summary dataframe for DGE results 27 | dge_int_summary_df = pd.DataFrame({ 28 | "Dataset": dataset_name, 29 | "Batches downsampled": num_batches_ds, 30 | "Number of celltypes downsampled": num_celltypes_ds, 31 | "Proportion downsampled": prop_ds, 32 | "Replicate": rep, 33 | "Cluster number before convergence": k_initial, 34 | "Cluster number after convergence": k_final, 35 | "Method 1": "NA", 36 | "Method 2": "NA", 37 | "DGE Set Intersection Ratio": "NA", 38 | "Median DGE Set Intersection Ratio": "NA" 39 | }, 40 | index = [0] 41 | ) 42 | dge_int_summary_df.to_csv(save_loc, sep = "\t", index = False) 43 | else: 44 | # Subset adatas based on method for integration and store lognorm counts in raw 45 | # attribute for diffexp testing 46 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] 47 | method_adatas = [] 48 | for method in methods: 49 | adata_copy = adata.copy() 50 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 51 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 52 | sc.pp.normalize_total( 53 | adata_subset, 54 | target_sum = 1e4 55 | ) 56 | sc.pp.log1p(adata_subset) 57 | adata_subset.raw = adata_subset # Freeze for DGE test 58 | method_adatas.append(adata_subset) 59 | 60 | # Extract top 50 DGEs for each cluster in each method 61 | method_dge_dfs = [] 62 | for adata_method_subset in method_adatas: 63 | adata_method_subset = diffexp( 64 | adata_method_subset, 65 | groupby = "kmeans_faiss", 66 | use_raw = True, 67 | method = "wilcoxon" 68 | ) 69 | dge_results = dge_top_n( 70 | adata_method_subset, 71 | n = 50, 72 | obs_group = "kmeans_faiss" 73 | ) 74 | method_dge_dfs.append(dge_results) 75 | 76 | # Concatenate DGE results from each method 77 | method_dge_dfs_concat = pd.concat(method_dge_dfs, axis = 0) 78 | 79 | # Create long form array for methods 80 | methods_long = np.repeat(np.array(methods), 50*k_final) 81 | 82 | # Create summary dataframe for DGE results 83 | dge_summary_df = pd.DataFrame({ 84 | "Dataset": dataset_name, 85 | "Number of batches downsampled": num_batches_ds, 86 | "Number of celltypes downsampled": num_celltypes_ds, 87 | "Proportion downsampled": prop_ds, 88 | "Replicate": rep, 89 | "Cluster number before convergence": k_initial, 90 | "Cluster number after convergence": k_final, 91 | "Method": methods_long, 92 | "Cluster": method_dge_dfs_concat["Cluster"].__array__(), 93 | "Differentially expressed genes": method_dge_dfs_concat["Top 50 DGEs"].__array__() 94 | }) 95 | 96 | # Determine DGE concordance through set intersection in a pairwise manner 97 | method_concordance_mat = np.zeros((len(methods), len(methods))) 98 | for i, method_i in enumerate(methods): 99 | for j, method_j in enumerate(methods): 100 | dge_sub_1 = dge_summary_df[dge_summary_df["Method"] == method_i] 101 | dge_sub_2 = dge_summary_df[dge_summary_df["Method"] == method_j] 102 | dge_sub_1_genes = dge_sub_1["Differentially expressed genes"].values 
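For intuition on the pairwise loop here: the set-intersection ratio computed just below is normalized by the number of unique genes from the first method, so the resulting concordance matrix is not symmetric, and self-comparisons are dropped before the median is taken further down. A toy sketch with made-up gene symbols:

```python
import numpy as np

# Hypothetical top-DGE lists for two integration methods (gene symbols are illustrative)
genes_method_1 = np.array(["CD3D", "CD3E", "NKG7", "LYZ"])
genes_method_2 = np.array(["CD3D", "CD3E", "MS4A1"])

shared = np.intersect1d(genes_method_1, genes_method_2)  # CD3D, CD3E

# Normalized by the first method's unique genes -> asymmetric ratios
ratio_1_vs_2 = len(shared) / len(np.unique(genes_method_1))  # 2 / 4 = 0.50
ratio_2_vs_1 = len(shared) / len(np.unique(genes_method_2))  # 2 / 3 ≈ 0.67
```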
103 | dge_sub_2_genes = dge_sub_2["Differentially expressed genes"].values 104 | set_int = np.intersect1d(dge_sub_1_genes, dge_sub_2_genes) 105 | int_ratio = len(set_int)/len(np.unique(dge_sub_1_genes)) 106 | method_concordance_mat[i, j] = int_ratio 107 | 108 | 109 | # Create dataframe of values 110 | method_int_df = pd.DataFrame(method_concordance_mat) 111 | method_int_df.index = methods 112 | method_int_df.columns = methods 113 | 114 | # Convert to long format 115 | method_int_df_long = method_int_df.melt(ignore_index = False) 116 | method_int_df_long = method_int_df_long.reset_index() 117 | method_int_df_long.columns = ["Method 1", "Method 2", "DGE Set Intersection Ratio"] 118 | 119 | # Get median of DGE concordance 120 | method_int_df_no_self = method_int_df_long[method_int_df_long["Method 1"] != method_int_df_long["Method 2"]] 121 | median_set_int_ratio = np.median(method_int_df_no_self["DGE Set Intersection Ratio"]) 122 | method_int_df_long["Median DGE Set Intersection Ratio"] = median_set_int_ratio 123 | 124 | # Create and save summary dataframe for DGE intersection results 125 | dge_int_summary_df = pd.DataFrame({ 126 | "Dataset": dataset_name, 127 | "Batches downsampled": num_batches_ds, 128 | "Number of celltypes downsampled": num_celltypes_ds, 129 | "Proportion downsampled": prop_ds, 130 | "Replicate": rep, 131 | "Cluster number before convergence": k_initial, 132 | "Cluster number after convergence": k_final, 133 | "Method 1": method_int_df_long["Method 1"].__array__(), 134 | "Method 2": method_int_df_long["Method 2"].__array__(), 135 | "DGE Set Intersection Ratio": method_int_df_long["DGE Set Intersection Ratio"].__array__(), 136 | "Median DGE Set Intersection Ratio": method_int_df_long["Median DGE Set Intersection Ratio"].__array__() 137 | }) 138 | dge_int_summary_df.to_csv(save_loc, sep = "\t", index = False) 139 | 140 | 141 | if __name__ == '__main__': 142 | parser = argparse.ArgumentParser( 143 | description = "Input and output files for dge concordance summary" 144 | ) 145 | parser.add_argument( 146 | "--infile", 147 | type = str, 148 | help = "Path of integrated h5ad file" 149 | ) 150 | parser.add_argument( 151 | "--outfile", 152 | type = str, 153 | help = "Filepath for saving dge concordance statistics of integrated h5ad file" 154 | ) 155 | parser.add_argument( 156 | "--dataset", 157 | type = str, 158 | help = "Name of dataset" 159 | ) 160 | parser.add_argument( 161 | "--rep", 162 | type = int, 163 | help = "Repetition number" 164 | ) 165 | args = parser.parse_args() 166 | main( 167 | h5ad_loc = args.infile, 168 | save_loc = args.outfile, 169 | dataset_name = args.dataset, 170 | rep = args.rep 171 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_ranking_concordance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import diffexp, dge_top_n 12 | 13 | def main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | 22 | # Subset adatas based on 
method for integration and store lognorm counts in raw 23 | # attribute for diffexp testing 24 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger", "bbknn"] 25 | method_adatas = [] 26 | for method in methods: 27 | adata_copy = adata.copy() 28 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 29 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 30 | sc.pp.normalize_total( 31 | adata_subset, 32 | target_sum = 1e4 33 | ) 34 | sc.pp.log1p(adata_subset) 35 | adata_subset.raw = adata_subset # Freeze for DGE test 36 | method_adatas.append(adata_subset) 37 | 38 | # Extract list of all DGEs for all leiden clusters in each method 39 | method_dge_dfs = [] 40 | for adata_method_subset in method_adatas: 41 | adata_method_subset = diffexp( 42 | adata_method_subset, 43 | groupby = "leiden", 44 | use_raw = True, 45 | method = "wilcoxon" 46 | ) 47 | dge_results = dge_top_n( 48 | adata_method_subset, 49 | n = len(adata.var), 50 | obs_group = "leiden" 51 | ) 52 | method_dge_dfs.append(dge_results) 53 | 54 | # For each method, compute the ranking metrics for all genes in the dataset 55 | all_genes = np.sort(adata.var.index.values) 56 | all_genes_tiled = np.tile(all_genes, (len(methods))) 57 | methods_repeat = np.repeat(methods, len(all_genes)) 58 | gene_max_imp_per_method = [] 59 | gene_min_imp_per_method = [] 60 | for method_dge_df in method_dge_dfs: 61 | method_clusters = np.unique(method_dge_df["Cluster"].__array__()) 62 | cluster_ranks = [] 63 | for cluster in method_clusters: 64 | cluster_sub = method_dge_df[method_dge_df["Cluster"] == cluster] 65 | gene_ranks_sorted = np.argsort(cluster_sub.iloc[:, 1].__array__()) 66 | cluster_ranks.append(gene_ranks_sorted) 67 | cluster_ranks_stack = np.stack(cluster_ranks, axis = 0) 68 | # Min for max because lowest number for ranking corresponds to highest importance 69 | gene_max_imp_per_method.append( 70 | np.min(cluster_ranks_stack, axis = 0) 71 | ) 72 | gene_min_imp_per_method.append( 73 | np.max(cluster_ranks_stack, axis = 0) 74 | ) 75 | 76 | # Concatenate max and min ranks for each method 77 | gene_max_imp_per_method_concat = np.concatenate(gene_max_imp_per_method) 78 | gene_min_imp_per_method_concat = np.concatenate(gene_min_imp_per_method) 79 | 80 | # Create summary df of all genes and their ranking metrics 81 | dge_ranking_summary_df = pd.DataFrame({ 82 | "Dataset": dataset_name, 83 | "Number of batches downsampled": num_batches_ds, 84 | "Number of celltypes downsampled": num_celltypes_ds, 85 | "Proportion downsampled": prop_ds, 86 | "Replicate": rep, 87 | "Method": methods_repeat, 88 | "Gene": all_genes_tiled, 89 | "Max rank": gene_max_imp_per_method_concat, 90 | "Min rank": gene_min_imp_per_method_concat 91 | }) 92 | dge_ranking_summary_df.to_csv(save_loc, sep = "\t", index = False) 93 | 94 | if __name__ == '__main__': 95 | parser = argparse.ArgumentParser( 96 | description = "Input and output files for dge ranking summary" 97 | ) 98 | parser.add_argument( 99 | "--infile", 100 | type = str, 101 | help = "Path of integrated h5ad file" 102 | ) 103 | parser.add_argument( 104 | "--outfile", 105 | type = str, 106 | help = "Filepath for saving dge ranking statistics of integrated h5ad file" 107 | ) 108 | parser.add_argument( 109 | "--dataset", 110 | type = str, 111 | help = "Name of dataset" 112 | ) 113 | parser.add_argument( 114 | "--rep", 115 | type = int, 116 | help = "Repetition number" 117 | ) 118 | args = parser.parse_args() 119 | main( 120 | h5ad_loc = args.infile, 121 | save_loc = args.outfile, 
122 | dataset_name = args.dataset, 123 | rep = args.rep 124 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_ranking_marker_subset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import pandas as pd 4 | 5 | def main(dge_rank_file_loc, marker_file_loc, save_loc): 6 | # Read in the dge rank file and marker file 7 | dge_rank_df = pd.read_csv(dge_rank_file_loc, sep = "\t") 8 | marker_df = pd.read_csv(marker_file_loc, sep = "\t") 9 | 10 | # Subset the dge rank df by the markers in the marker df 11 | dataset_markers = marker_df["Top 10 marker genes (union across batches)"].__array__() 12 | dge_rank_df_marker_sub = dge_rank_df[dge_rank_df["Gene"].isin(dataset_markers)] 13 | 14 | # Save the marker subset dge rank df 15 | dge_rank_df_marker_sub.to_csv(save_loc, sep = "\t", index = False) 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser( 19 | description = "Input and output files for dge concordance summary" 20 | ) 21 | parser.add_argument( 22 | "--infile_dge_rank", 23 | type = str, 24 | help = "Path of dge rank file for given dataset" 25 | ) 26 | parser.add_argument( 27 | "--infile_marker", 28 | type = str, 29 | help = "Path of marker gene file for given dataset" 30 | ) 31 | parser.add_argument( 32 | "--outfile", 33 | type = str, 34 | help = "Filepath for saving marker gene subset dge rank file" 35 | ) 36 | args = parser.parse_args() 37 | main( 38 | dge_rank_file_loc=args.infile_dge_rank, 39 | marker_file_loc=args.infile_marker, 40 | save_loc=args.outfile 41 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/dge_ranking_per_cluster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import diffexp, dge_top_n 12 | 13 | def main(h5ad_loc, save_loc, dataset_name, rep): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Extract summary statistics from h5ad file 18 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 19 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 20 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 21 | 22 | # Subset adatas based on method for integration and store lognorm counts in raw 23 | # attribute for diffexp testing 24 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger", "bbknn"] 25 | method_adatas = [] 26 | for method in methods: 27 | adata_copy = adata.copy() 28 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == method] 29 | adata_subset.X = adata_subset.layers["raw"] # Unlogged, unnorm counts 30 | sc.pp.normalize_total( 31 | adata_subset, 32 | target_sum = 1e4 33 | ) 34 | sc.pp.log1p(adata_subset) 35 | adata_subset.raw = adata_subset # Freeze for DGE test 36 | method_adatas.append(adata_subset) 37 | 38 | # Extract list of all DGEs for all leiden clusters in each method 39 | method_dge_dfs = [] 40 | for adata_method_subset in method_adatas: 41 | adata_method_subset = diffexp( 42 | adata_method_subset, 43 | groupby = "leiden", 44 | use_raw = True, 45 | method = "wilcoxon" 46 | ) 47 | dge_results = dge_top_n( 48 | adata_method_subset, 49 | n = len(adata.var), 50 | obs_group = "leiden" 51 | ) 52 | 
method_dge_dfs.append(dge_results) 53 | 54 | # For each method, compute the ranking metrics for all genes in the dataset 55 | # based on each cluster - extract the top 50 dges for each cluster across subsets 56 | method_adata_result_dfs = [] 57 | for method_adata, method_dge_df in zip(method_adatas, method_dge_dfs): 58 | method_clusters = np.unique(method_dge_df["Cluster"].__array__()) 59 | method_name = np.unique(method_adata.obs["integration_method"].__array__()) 60 | cluster_ranks = [] 61 | cluster_celltype = [] 62 | cluster_number = [] 63 | for cluster in method_clusters: 64 | cluster_celltype_unique = np.unique( 65 | method_adata.obs["celltype"][method_adata.obs["leiden"] == cluster], 66 | return_counts = True 67 | ) 68 | celltype_most_prev = cluster_celltype_unique[0][ 69 | np.argmax(cluster_celltype_unique[1]) 70 | ] 71 | cluster_sub = method_dge_df[method_dge_df["Cluster"] == cluster] 72 | genes_top_50 = cluster_sub.iloc[:, 1].__array__()[0:50] 73 | cluster_ranks.append(genes_top_50) 74 | cluster_celltype.append(np.repeat(celltype_most_prev, 50)) 75 | cluster_number.append(np.repeat(cluster, 50)) 76 | cluster_ranks_full = np.concatenate(cluster_ranks) 77 | cluster_celltypes_full = np.concatenate(cluster_celltype) 78 | cluster_numbers_full = np.concatenate(cluster_number) 79 | method_adata_result = pd.DataFrame({ 80 | "Top 50 cluster markers (ordered)": cluster_ranks_full, 81 | "Cluster celltype (majority)": cluster_celltypes_full, 82 | "Cluster number": cluster_numbers_full 83 | }) 84 | method_adata_result["Method"] = method_name[0] 85 | method_adata_result_dfs.append(method_adata_result) 86 | 87 | # Concatenate all results into one dataframe 88 | method_adata_result_df = pd.concat(method_adata_result_dfs) 89 | 90 | # Add all of the summary statistics to the dataframe and save 91 | method_adata_result_df["Dataset"] = dataset_name 92 | method_adata_result_df["Replicate"] = rep 93 | method_adata_result_df["Number of batches downsampled"] = num_batches_ds 94 | method_adata_result_df["Number of celltypes downsampled"] = num_celltypes_ds 95 | method_adata_result_df["Proportion downsampled"] = prop_ds 96 | 97 | method_adata_result_df.to_csv(save_loc, sep = "\t", index = False) 98 | 99 | if __name__ == '__main__': 100 | parser = argparse.ArgumentParser( 101 | description = "Input and output files for dge ranking summary per cluster" 102 | ) 103 | parser.add_argument( 104 | "--infile", 105 | type = str, 106 | help = "Path of integrated h5ad file" 107 | ) 108 | parser.add_argument( 109 | "--outfile", 110 | type = str, 111 | help = "Filepath for saving dge ranking statistics per cluster of integrated h5ad file" 112 | ) 113 | parser.add_argument( 114 | "--dataset", 115 | type = str, 116 | help = "Name of dataset" 117 | ) 118 | parser.add_argument( 119 | "--rep", 120 | type = int, 121 | help = "Repetition number" 122 | ) 123 | args = parser.parse_args() 124 | main( 125 | h5ad_loc = args.infile, 126 | save_loc = args.outfile, 127 | dataset_name = args.dataset, 128 | rep = args.rep 129 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/downsample_summary.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import anndata as ann 8 | import scanpy as sc 9 | 10 | def main(h5ad_loc, save_loc, dataset_name, rep): 11 | # Load h5ad file 12 | adata = sc.read_h5ad(h5ad_loc) 13 | 14 | # Extract summary statistics 
from h5ad file 15 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 16 | batches_ds = adata.uns["downsampling_stats"]["ds_batch_names"] 17 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 18 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 19 | downsampled_celltypes = adata.uns["downsampling_stats"]["downsampled_celltypes"] 20 | 21 | # Format downsampled celltypes and batches to correspond to a single item 22 | if isinstance(downsampled_celltypes, str): 23 | if downsampled_celltypes == "None": 24 | downsampled_celltypes = "None" 25 | else: 26 | raise ValueError("Downsampled celltypes is a str and not 'None'") 27 | elif isinstance(downsampled_celltypes, np.ndarray): 28 | if downsampled_celltypes.shape == (1,): 29 | downsampled_celltypes = downsampled_celltypes[0] 30 | else: 31 | downsampled_celltypes = np.concatenate(downsampled_celltypes).flatten() 32 | downsampled_celltypes = ", ".join(downsampled_celltypes) 33 | else: 34 | raise TypeError("Downsampled celltypes is not a str or ndarray") 35 | 36 | if isinstance(batches_ds, str): 37 | if batches_ds == "None": 38 | batches_ds = "None" 39 | elif batches_ds == "Placeholder due to h5py bug": 40 | batches_ds = "Placeholder due to h5py bug" 41 | else: 42 | raise ValueError("Downsampled batches is a str and not 'None'") 43 | elif isinstance(batches_ds, np.ndarray): 44 | if batches_ds.shape == (1,): 45 | batches_ds = batches_ds[0] 46 | else: 47 | batches_ds = np.concatenate(batches_ds).flatten() 48 | batches_ds = ", ".join(batches_ds) 49 | else: 50 | raise TypeError("Downsampled batches is not a str or ndarray") 51 | 52 | # Extract data from just one integration method subset - for getting unique batches 53 | int_method_select = np.random.choice( 54 | np.unique(adata.obs.integration_method.__array__()) 55 | ) 56 | adata_select = adata[adata.obs.integration_method == int_method_select] 57 | 58 | # Create downsampling summary df 59 | ds_summary_df = pd.DataFrame( 60 | { 61 | "Dataset": dataset_name, 62 | "Number of batches downsampled": num_batches_ds, 63 | "Batches downsampled": batches_ds, 64 | "Number of celltypes downsampled": num_celltypes_ds, 65 | "Proportion downsampled": prop_ds, 66 | "Downsampled celltypes": downsampled_celltypes, 67 | "Replicate": rep, 68 | "Total batches": len(np.unique(adata_select.obs["batch"])) 69 | }, 70 | index = [0] 71 | ) 72 | ds_summary_df.to_csv(save_loc, index=False, sep="\t") 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser( 76 | description = "Input and output files for downsampling summary" 77 | ) 78 | parser.add_argument( 79 | "--infile", 80 | type = str, 81 | help = "Path of integrated h5ad file" 82 | ) 83 | parser.add_argument( 84 | "--outfile", 85 | type = str, 86 | help = "Filepath for saving downsampling statistics of integrated h5ad file" 87 | ) 88 | parser.add_argument( 89 | "--dataset", 90 | type = str, 91 | help = "Name of dataset" 92 | ) 93 | parser.add_argument( 94 | "--rep", 95 | type = int, 96 | help = "Repetition number" 97 | ) 98 | args = parser.parse_args() 99 | main( 100 | h5ad_loc = args.infile, 101 | save_loc = args.outfile, 102 | dataset_name = args.dataset, 103 | rep = args.rep 104 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/imbalance_summary.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import scipy.spatial as sp 6 
| import scanpy as sc 7 | 8 | def main(h5ad_loc, save_loc, dataset_name, rep): 9 | # Load h5ad file 10 | adata_full = sc.read_h5ad(h5ad_loc) 11 | 12 | # Extract summary statistics from h5ad file 13 | num_batches_ds = adata_full.uns["downsampling_stats"]["num_batches"] 14 | batches_ds = adata_full.uns["downsampling_stats"]["ds_batch_names"] 15 | num_celltypes_ds = adata_full.uns["downsampling_stats"]["num_celltypes_downsampled"] 16 | prop_ds = adata_full.uns["downsampling_stats"]["proportion_downsampled"] 17 | downsampled_celltypes = adata_full.uns["downsampling_stats"]["downsampled_celltypes"] 18 | 19 | # Format downsampled celltypes and batches to correspond to a single item 20 | if isinstance(downsampled_celltypes, str): 21 | if downsampled_celltypes == "None": 22 | downsampled_celltypes = "None" 23 | else: 24 | raise ValueError("Downsampled celltypes is a str and not 'None'") 25 | elif isinstance(downsampled_celltypes, np.ndarray): 26 | if downsampled_celltypes.shape == (1,): 27 | downsampled_celltypes = downsampled_celltypes[0] 28 | else: 29 | downsampled_celltypes = np.concatenate(downsampled_celltypes).flatten() 30 | downsampled_celltypes = ", ".join(downsampled_celltypes) 31 | else: 32 | raise TypeError("Downsampled celltypes is not a str or ndarray") 33 | 34 | if isinstance(batches_ds, str): 35 | if batches_ds == "None": 36 | batches_ds = "None" 37 | elif batches_ds == "Placeholder due to h5py bug": 38 | batches_ds = "Placeholder due to h5py bug" 39 | else: 40 | raise ValueError("Downsampled batches is a str and not 'None'") 41 | elif isinstance(batches_ds, np.ndarray): 42 | if batches_ds.shape == (1,): 43 | batches_ds = batches_ds[0] 44 | else: 45 | batches_ds = np.concatenate(batches_ds).flatten() 46 | batches_ds = ", ".join(batches_ds) 47 | else: 48 | raise TypeError("Downsampled batches is not a str or ndarray") 49 | 50 | # Extract data from just one integration method subset 51 | int_method_select = np.random.choice( 52 | np.unique(adata_full.obs.integration_method.__array__()) 53 | ) 54 | 55 | # Subset data for only one method and split datasets by batch 56 | adata_select = adata_full[adata_full.obs.integration_method == int_method_select] 57 | adata_list = [] 58 | batches = np.unique(adata_select.obs.batch.__array__()) 59 | for batch in batches: 60 | adata_batch_select = adata_select[adata_select.obs.batch == batch] 61 | adata_list.append(adata_batch_select) 62 | 63 | # Get union of cell types across all batches 64 | celltype_union = np.unique(np.concatenate([adata.obs.celltype.__array__() for adata in adata_list])) 65 | 66 | # Get intersection of cell types across all batches 67 | celltype_intersection = set.intersection(*[set(adata.obs.celltype.__array__()) for adata in adata_list]) 68 | 69 | # Get proportion vector of cells in each batch 70 | celltype_props = [] 71 | for adata in adata_list: 72 | celltype_prop = np.zeros(len(celltype_union)) 73 | for idx, celltype in enumerate(celltype_union): 74 | celltype_prop[idx] = np.sum(adata.obs.celltype.__array__() == celltype)/len(adata) 75 | celltype_props.append(celltype_prop) 76 | 77 | # Get cosine distances across celltype proportions 78 | cos_distances = [] 79 | for celltype_prop in celltype_props: 80 | for celltype_prop_other in celltype_props: 81 | cos_distances.append(sp.distance.cosine(celltype_prop, celltype_prop_other)) 82 | 83 | # Get mean cosine distance across batches 84 | cos_dist_mean = np.mean(cos_distances) 85 | 86 | # Get ratio of unique celltypes over intersection (Jaccard index) 87 | celltype_unique_ratio = 
len(celltype_intersection) / len(celltype_union) 88 | 89 | # Get adata lens stdev proportional to total cells 90 | adata_lens = [len(adata) for adata in adata_list] 91 | adata_lens_stdev = np.std(adata_lens) 92 | adata_lens_mean = np.mean(adata_lens) 93 | adata_coeff_var = adata_lens_stdev / adata_lens_mean 94 | 95 | # Return dataset imbalance summary stats 96 | imba_summary_df = pd.DataFrame( 97 | { 98 | "Dataset": dataset_name, 99 | "Number of batches downsampled": num_batches_ds, 100 | "Batches downsampled": batches_ds, 101 | "Number of celltypes downsampled": num_celltypes_ds, 102 | "Proportion downsampled": prop_ds, 103 | "Downsampled celltypes": downsampled_celltypes, 104 | "Replicate": rep, 105 | "Total batches": len(batches), 106 | "Celltype intersection ratio": celltype_unique_ratio, 107 | "Mean proportion cosine distance": cos_dist_mean, 108 | "Length coeff var": adata_coeff_var 109 | }, 110 | index=[0] 111 | ) 112 | imba_summary_df.to_csv(save_loc, index=False, sep="\t") 113 | 114 | if __name__ == '__main__': 115 | parser = argparse.ArgumentParser( 116 | description = "Input and output files for imbalance summary" 117 | ) 118 | parser.add_argument( 119 | "--infile", 120 | type = str, 121 | help = "Path of integrated h5ad file" 122 | ) 123 | parser.add_argument( 124 | "--outfile", 125 | type = str, 126 | help = "Filepath for saving imbalance statistics of h5ad file" 127 | ) 128 | parser.add_argument( 129 | "--dataset", 130 | type = str, 131 | help = "Name of dataset" 132 | ) 133 | parser.add_argument( 134 | "--rep", 135 | type = int, 136 | help = "Repetition number" 137 | ) 138 | args = parser.parse_args() 139 | main( 140 | h5ad_loc = args.infile, 141 | save_loc = args.outfile, 142 | dataset_name = args.dataset, 143 | rep = args.rep 144 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/integrate_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1" 6 | 7 | import scanpy as sc 8 | import anndata as ann 9 | import numpy as np 10 | 11 | from utils import Integration, downsample, faiss_kmeans 12 | 13 | def none_or_str(value): 14 | if value == 'None': 15 | return None 16 | return value 17 | 18 | def main(h5ad_dir, save_loc, ds_celltypes, ds_proportions, num_batches): 19 | # Load h5ad files 20 | files_list = os.listdir(h5ad_dir) 21 | adata_loaded = [] 22 | for f in files_list: 23 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 24 | adata.layers["raw"] = adata.X # Store raw counts 25 | adata.obs = adata.obs[["batch", "celltype"]] # Only store relevant columns 26 | if "gene" not in adata.var.columns: 27 | adata.var["gene"] = adata.var_names # Add gene names if not present 28 | adata.var = adata.var[["gene"]] # Only store relevant columns 29 | adata_loaded.append(adata) 30 | 31 | # Downsample loaded h5ad files based on params 32 | if num_batches == 0: 33 | selected_celltypes_downsampled = "None" # Placeholder - not used 34 | batches_ds = "None" # Placeholder - not used 35 | else: 36 | # Initialize random number generator 37 | rng = np.random.default_rng() 38 | 39 | # Select indices for downsampling 40 | selected_indices = np.random.choice( 41 | len(adata_loaded), num_batches, replace = False 42 | ) 43 | adata_selected = [adata_loaded[i] for i in selected_indices] 44 | adata_unselected = [adata_loaded[i] for i in range(len(adata_loaded)) if i 
not in selected_indices] 45 | 46 | # Downsample the same selected celltypes across all of the batches - this change will not affect 47 | # previous runs, as they all downsampled either 0 or only 1 celltype, in either 0 or 1 batches 48 | # NOTE - this setup operates on the assumption that the celltypes are the same across all batches 49 | celltypes_all = np.unique(np.concatenate([adata.obs["celltype"].__array__() for adata in adata_selected])) 50 | rng.shuffle(celltypes_all) 51 | celltypes_selected = rng.choice(celltypes_all, ds_celltypes, replace = False) 52 | selected_celltypes_downsampled = np.array(celltypes_selected) 53 | adata_downsampled = [] 54 | for adata in adata_selected: 55 | adata_ds, selected_celltypes_ds = downsample( 56 | adata = adata, 57 | num_celltypes = None, 58 | celltype_names = celltypes_selected, 59 | proportion = ds_proportions 60 | ) 61 | adata_downsampled.append(adata_ds) 62 | adata_loaded = adata_unselected + adata_downsampled 63 | batches_ds = np.unique(np.concatenate([adata.obs["batch"].__array__() for adata in adata_downsampled])) 64 | 65 | # Store batch name separately for each anndata object 66 | for adata in adata_loaded: 67 | adata.obs["batch_name"] = adata.obs["batch"] 68 | 69 | # Concatenate files (assume data is raw counts) 70 | adata_concat = ann.AnnData.concatenate(*adata_loaded) 71 | adata_concat.obs_names = range(len(adata_concat.obs_names)) 72 | adata_concat.obs_names_make_unique() 73 | adata_concat.obs["batch"] = adata_concat.obs["batch_name"] 74 | adata_concat.obs.drop("batch_name", axis = 1, inplace = True) 75 | 76 | # Create integration class instance 77 | integration = Integration(adata = adata_concat) 78 | 79 | # Integrate across subsets 80 | harmony_integrated = integration.harmony_integrate() 81 | scvi_integrated = integration.scvi_integrate() 82 | bbknn_integrated = integration.bbknn_integrate() 83 | scanorama_integrated = integration.scanorama_integrate() 84 | seurat_integrated = integration.seurat_integrate() 85 | liger_integrated = integration.liger_integrate() 86 | 87 | # Add integration type to each subset and concatenate 88 | harmony_integrated.obs["integration_method"] = "harmony" 89 | scvi_integrated.obs["integration_method"] = "scvi" 90 | bbknn_integrated.obs["integration_method"] = "bbknn" 91 | scanorama_integrated.obs["integration_method"] = "scanorama" 92 | seurat_integrated.obs["integration_method"] = "seurat" 93 | liger_integrated.obs["integration_method"] = "liger" 94 | 95 | integrated_concat = ann.concat([ 96 | harmony_integrated, 97 | scvi_integrated, 98 | bbknn_integrated, 99 | scanorama_integrated, 100 | seurat_integrated, 101 | liger_integrated 102 | ]) 103 | integrated_concat.obs_names = range(len(integrated_concat.obs_names)) 104 | integrated_concat.obs_names_make_unique() 105 | 106 | # Add placeholder in entire obs dataframe for kmeans clustering 107 | integrated_concat.obs["kmeans_faiss"] = np.zeros(len(integrated_concat.obs_names)) 108 | 109 | # Perform kmeans clustering on integrated data 110 | # Define method subsets and iterate over them until the same number of k clusters is found 111 | k = 10 112 | k_initial = k # Integers are immutable 113 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] 114 | method_kmeans_adatas = [] 115 | i = 0 116 | while i < len(methods): 117 | # Create a copy of adata to avoid overwriting the original 118 | adata_copy = integrated_concat.copy() 119 | 120 | # Define method subset 121 | adata_subset = adata_copy[adata_copy.obs["integration_method"] == methods[i]] 122 | 123 
| # Perform HVG selection on raw (unnormalized, unlogged) data 124 | adata_subset.X = adata_subset.layers["raw"] 125 | sc.pp.normalize_total( 126 | adata_subset, 127 | target_sum = 1e4 128 | ) 129 | sc.pp.log1p(adata_subset) 130 | sc.pp.highly_variable_genes( 131 | adata_subset, 132 | n_top_genes = 2500, 133 | flavor = "seurat" 134 | ) 135 | 136 | # Perform faiss kmeans clustering 137 | adata_subset, k_method = faiss_kmeans(adata_subset, k) 138 | 139 | # Test concordance of k values and either append or reset 140 | if k_method != k: 141 | k = k_method 142 | i = 0 143 | method_kmeans_adatas.clear() 144 | continue 145 | else: 146 | i += 1 147 | method_kmeans_adatas.append(adata_subset) 148 | 149 | # Append kmeans cluster info to integrated data 150 | for method, method_kmeans_adata in zip(methods, method_kmeans_adatas): 151 | method_kmeans_clusters = method_kmeans_adata.obs["kmeans_faiss"].__array__().astype('str') 152 | integrated_concat.obs.loc[ 153 | integrated_concat.obs["integration_method"] == method, 154 | "kmeans_faiss" 155 | ] = method_kmeans_clusters 156 | 157 | # Add placeholder for bbknn kmeans clustering 158 | integrated_concat.obs.loc[ 159 | integrated_concat.obs["integration_method"] == "bbknn", 160 | "kmeans_faiss" 161 | ] = "NA" 162 | 163 | # Append information about kmeans faiss clusters to .uns of adata_concat 164 | integrated_concat.uns["kmeans_stats"] = { 165 | "kmeans_initial_k": k_initial, 166 | "kmeans_final_k": k 167 | } 168 | 169 | # If downsampled celltypes and batches are of array length greater than one, combine them 170 | if len(batches_ds) > 1: 171 | batches_ds = np.array(",".join(batches_ds)) 172 | if len(selected_celltypes_downsampled) > 1: 173 | selected_celltypes_downsampled = np.array(",".join(selected_celltypes_downsampled)) 174 | 175 | # Add data about downsampling to .uns of adata_concat 176 | if num_batches == 0: 177 | integrated_concat.uns["downsampling_stats"] = { 178 | "num_batches": 0, 179 | "num_celltypes_downsampled": ds_celltypes, 180 | "ds_batch_names": "None", 181 | "proportion_downsampled": ds_proportions, 182 | "downsampled_celltypes": "None" 183 | } 184 | else: 185 | integrated_concat.uns["downsampling_stats"] = { 186 | "num_batches": num_batches, 187 | "num_celltypes_downsampled": ds_celltypes, 188 | "ds_batch_names": "Placeholder due to h5py bug", 189 | "proportion_downsampled": ds_proportions, 190 | "downsampled_celltypes": selected_celltypes_downsampled 191 | } 192 | 193 | # Save integrated h5ad object 194 | integrated_concat.write_h5ad( 195 | filename = save_loc, 196 | compression = "gzip" 197 | ) 198 | 199 | if __name__ == "__main__": 200 | parser = argparse.ArgumentParser( 201 | description = "Input and output files for scRNA-seq integration" 202 | ) 203 | parser.add_argument( 204 | "--filedir", 205 | type = str, 206 | help = "Path of directory containing scRNA-seq h5ad files" 207 | ) 208 | parser.add_argument( 209 | "--ds_celltypes", 210 | type = int, 211 | help = "Number of celltypes to randomly downsample in given batch" 212 | ) 213 | parser.add_argument( 214 | "--ds_proportions", 215 | type = float, 216 | help = "Proportion of downsampling per celltype in a given batch" 217 | ) 218 | parser.add_argument( 219 | "--num_batches", 220 | type = int, 221 | help = "Number of batches to perform downsampling on" 222 | ) 223 | parser.add_argument( 224 | "--outfile", 225 | type = str, 226 | help = "Filepath for saving output from scRNA-seq integration" 227 | ) 228 | args = parser.parse_args() 229 | main( 230 | h5ad_dir = args.filedir, 231 | 
save_loc = args.outfile, 232 | ds_celltypes = args.ds_celltypes, 233 | ds_proportions = args.ds_proportions, 234 | num_batches = args.num_batches 235 | ) 236 | -------------------------------------------------------------------------------- /workflow/scripts/python/integrate_data_paga.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1" 6 | 7 | import scanpy as sc 8 | import anndata as ann 9 | import numpy as np 10 | 11 | from utils import IntegrationPAGA, downsample 12 | 13 | def none_or_str(value): 14 | if value == 'None': 15 | return None 16 | return value 17 | 18 | def main(h5ad_dir, root_celltype, save_loc, ds_celltypes, ds_proportions, num_batches): 19 | # Load h5ad files 20 | files_list = os.listdir(h5ad_dir) 21 | adata_loaded = [] 22 | for f in files_list: 23 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 24 | adata.layers["raw"] = adata.X # Store raw counts 25 | if "gene" not in adata.var.columns: 26 | adata.var["gene"] = adata.var_names # Add gene names if not present 27 | adata.var = adata.var[["gene"]] # Only store relevant columns 28 | adata.obs.celltype = [ 29 | c.replace(" ", "_") for c in adata.obs.celltype 30 | ] # Remove spaces from celltype names - for Snakemake wildcard matching 31 | adata_loaded.append(adata) 32 | 33 | # Downsample loaded h5ad files based on params 34 | if num_batches == 0: 35 | selected_celltypes_downsampled = "None" # Placeholder - not used 36 | batches_ds = "None" # Placeholder - not used 37 | else: 38 | # Initialize random number generator 39 | rng = np.random.default_rng() 40 | 41 | # Select indices for downsampling 42 | selected_indices = np.random.choice( 43 | len(adata_loaded), num_batches, replace = False 44 | ) 45 | adata_selected = [adata_loaded[i] for i in selected_indices] 46 | adata_unselected = [adata_loaded[i] for i in range(len(adata_loaded)) if i not in selected_indices] 47 | 48 | # Downsample the same selected celltypes across all of the batches - this change will not affect 49 | # previous runs, as they all downsampled either 0 or only 1 celltype, in either 0 or 1 batches 50 | # #NOTE - this setup operates on the assumption that the celltypes are the same across all batches 51 | celltypes_all = np.unique(np.concatenate([adata.obs["celltype"].__array__() for adata in adata_selected])) 52 | rng.shuffle(celltypes_all) 53 | celltypes_selected = rng.choice(celltypes_all, ds_celltypes, replace = False) 54 | selected_celltypes_downsampled = np.array(celltypes_selected) 55 | adata_downsampled = [] 56 | for adata in adata_selected: 57 | adata_ds, selected_celltypes_ds = downsample( 58 | adata = adata, 59 | num_celltypes = None, 60 | celltype_names = celltypes_selected, 61 | proportion = ds_proportions 62 | ) 63 | adata_downsampled.append(adata_ds) 64 | adata_loaded = adata_unselected + adata_downsampled 65 | batches_ds = np.concatenate([np.unique(adata.obs["batch"].__array__()) for adata in adata_downsampled]) 66 | 67 | # Store batch name separately for each anndata object 68 | for adata in adata_loaded: 69 | adata.obs["batch_name"] = adata.obs["batch"] 70 | 71 | # Concatenate files (assume data is raw counts) 72 | adata_concat = ann.AnnData.concatenate(*adata_loaded) 73 | adata_concat.obs_names = range(len(adata_concat.obs_names)) 74 | adata_concat.obs_names_make_unique() 75 | adata_concat.obs["batch"] = adata_concat.obs["batch_name"] 76 | 
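The `batch_name` round trip used here (and in `integrate_data.py` above) exists because `ann.AnnData.concatenate` replaces `obs["batch"]` with numeric batch indices. A minimal sketch of that pattern on toy objects (shapes and labels are arbitrary):

```python
import numpy as np
import pandas as pd
import anndata as ann

a = ann.AnnData(np.ones((3, 2)), obs=pd.DataFrame({"batch": ["lab_1"] * 3}))
b = ann.AnnData(np.ones((2, 2)), obs=pd.DataFrame({"batch": ["lab_2"] * 2}))

# Stash the original labels before concatenation overwrites obs["batch"]
for adata in (a, b):
    adata.obs["batch_name"] = adata.obs["batch"]

concat = ann.AnnData.concatenate(a, b)          # obs["batch"] is now "0"/"1"
concat.obs["batch"] = concat.obs["batch_name"]  # restore the original labels
concat.obs.drop("batch_name", axis=1, inplace=True)
```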
adata_concat.obs.drop("batch_name", axis = 1, inplace = True) 77 | 78 | # Create PAGA integration class instance 79 | integration_paga = IntegrationPAGA( 80 | adata = adata_concat, 81 | root_celltype = root_celltype 82 | ) 83 | 84 | # Integrate across subsets (including unintegrated) 85 | unintegrated = integration_paga.unintegrated() 86 | harmony_integrated = integration_paga.harmony_integrate() 87 | scvi_integrated = integration_paga.scvi_integrate() 88 | bbknn_integrated = integration_paga.bbknn_integrate() 89 | scanorama_integrated = integration_paga.scanorama_integrate() 90 | seurat_integrated = integration_paga.seurat_integrate() 91 | liger_integrated = integration_paga.liger_integrate() 92 | 93 | # Add integration type to each subset and concatenate 94 | unintegrated.obs["integration_method"] = "unintegrated" 95 | harmony_integrated.obs["integration_method"] = "harmony" 96 | scvi_integrated.obs["integration_method"] = "scvi" 97 | bbknn_integrated.obs["integration_method"] = "bbknn" 98 | scanorama_integrated.obs["integration_method"] = "scanorama" 99 | seurat_integrated.obs["integration_method"] = "seurat" 100 | liger_integrated.obs["integration_method"] = "liger" 101 | 102 | integrated_concat = ann.concat([ 103 | unintegrated, 104 | harmony_integrated, 105 | scvi_integrated, 106 | bbknn_integrated, 107 | scanorama_integrated, 108 | seurat_integrated, 109 | liger_integrated 110 | ]) 111 | integrated_concat.obs_names = range(len(integrated_concat.obs_names)) 112 | integrated_concat.obs_names_make_unique() 113 | 114 | # If downsampled celltypes and batches are of array length greater than one, combine them 115 | if len(batches_ds) > 1: 116 | batches_ds = np.array(",".join(batches_ds)) 117 | if len(selected_celltypes_downsampled) > 1: 118 | selected_celltypes_downsampled = np.array(",".join(selected_celltypes_downsampled)) 119 | 120 | # Add data about downsampling to .uns of adata_concat 121 | if num_batches == 0: 122 | integrated_concat.uns["downsampling_stats"] = { 123 | "num_batches": 0, 124 | "num_celltypes_downsampled": ds_celltypes, 125 | "ds_batch_names": "None", 126 | "proportion_downsampled": ds_proportions, 127 | "downsampled_celltypes": "None" 128 | } 129 | else: 130 | integrated_concat.uns["downsampling_stats"] = { 131 | "num_batches": num_batches, 132 | "num_celltypes_downsampled": ds_celltypes, 133 | "ds_batch_names": "Placeholder due to h5py bug", 134 | "proportion_downsampled": ds_proportions, 135 | "downsampled_celltypes": selected_celltypes_downsampled 136 | } 137 | 138 | # Save integrated h5ad object 139 | integrated_concat.write_h5ad( 140 | filename = save_loc, 141 | compression = "gzip" 142 | ) 143 | 144 | if __name__ == "__main__": 145 | parser = argparse.ArgumentParser( 146 | description = "Input and output files for scRNA-seq PAGA integration" 147 | ) 148 | parser.add_argument( 149 | "--filedir", 150 | type = str, 151 | help = "Path of directory containing scRNA-seq h5ad files" 152 | ) 153 | parser.add_argument( 154 | "--root_celltype", 155 | type = str, 156 | help = "Root celltype to utilize for diffusion pseudotime estimation" 157 | ) 158 | parser.add_argument( 159 | "--ds_celltypes", 160 | type = int, 161 | help = "Number of celltypes to randomly downsample in given batch" 162 | ) 163 | parser.add_argument( 164 | "--ds_proportions", 165 | type = float, 166 | help = "Proportion of downsampling per celltype in a given batch" 167 | ) 168 | parser.add_argument( 169 | "--num_batches", 170 | type = int, 171 | help = "Number of batches to perform downsampling on" 172 
| ) 173 | parser.add_argument( 174 | "--outfile", 175 | type = str, 176 | help = "Filepath for saving output from scRNA-seq integration and pseudotime estimation" 177 | ) 178 | args = parser.parse_args() 179 | main( 180 | h5ad_dir = args.filedir, 181 | root_celltype = args.root_celltype, 182 | save_loc = args.outfile, 183 | ds_celltypes = args.ds_celltypes, 184 | ds_proportions = args.ds_proportions, 185 | num_batches = args.num_batches 186 | ) 187 | -------------------------------------------------------------------------------- /workflow/scripts/python/knn_classification.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import scanpy as sc 6 | import anndata as ann 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.neighbors import KNeighborsClassifier 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.neighbors import KNeighborsClassifier 12 | from sklearn.metrics import classification_report 13 | 14 | def main(h5ad_loc, save_loc, dataset_name, rep): 15 | # Load h5ad file 16 | adata = sc.read_h5ad(h5ad_loc) 17 | 18 | # Extract summary statistics from h5ad file 19 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 20 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 21 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 22 | 23 | # Subset h5ad based on batch-correction method used 24 | adata_method_sub = [] 25 | methods = ["harmony", "scvi", "scanorama", "seurat", "liger"] # Omitting BBKNN due to lack of embedding 26 | for method in methods: 27 | adata_sub = adata[adata.obs["integration_method"] == method] 28 | adata_method_sub.append( 29 | adata_sub 30 | ) 31 | 32 | # Determine KNN accuracy for each batch-correction method 33 | precision_scores = [] 34 | recall_scores = [] 35 | f1_scores = [] 36 | supports = [] 37 | celltypes = [] 38 | for adata_sub in adata_method_sub: 39 | # Split testing and training data in stratified manner (70/30) 40 | X = adata_sub.obsm["X_kmeans"] 41 | y = adata_sub.obs["celltype"].__array__() 42 | X_train, X_test, y_train, y_test = train_test_split( 43 | X, y, stratify=y, test_size=0.7, random_state=42 44 | ) 45 | 46 | # Train k-nearest neighbors classifier with k=15 and predict on test data 47 | knn = KNeighborsClassifier( 48 | n_neighbors=15 49 | ) 50 | knn.fit(X_train, y_train) 51 | y_pred = knn.predict(X_test) 52 | 53 | # Get classification report and subset for only relevant columns 54 | class_report_dict = classification_report( 55 | y_test, y_pred, output_dict=True 56 | ) 57 | class_report_df = pd.DataFrame(class_report_dict) 58 | class_report_df = class_report_df.iloc[:, :-3] 59 | 60 | # Append appropriate values to lists 61 | precision_scores.append(class_report_df.loc["precision"].values) 62 | recall_scores.append(class_report_df.loc["recall"].values) 63 | f1_scores.append(class_report_df.loc["f1-score"].values) 64 | supports.append(class_report_df.loc["support"].values) 65 | celltypes.append(class_report_df.columns.values) 66 | 67 | # Repeat method values to have same length as scores (one for each celltype) 68 | methods_repeat = np.repeat(methods, len(precision_scores[0])) 69 | 70 | # Concatenate scores and celltypes 71 | precision_scores_concat = np.concatenate(precision_scores) 72 | recall_scores_concat = np.concatenate(recall_scores) 73 | f1_scores_concat = np.concatenate(f1_scores) 74 | supports_concat = np.concatenate(supports) 75 | celltypes_concat = 
np.concatenate(celltypes) 76 | 77 | # Create summary dataframe for classification statistics and save 78 | classification_summary_df = pd.DataFrame({ 79 | "Dataset": dataset_name, 80 | "Number of batches downsampled": num_batches_ds, 81 | "Number of celltypes downsampled": num_celltypes_ds, 82 | "Proportion downsampled": prop_ds, 83 | "Replicate": rep, 84 | "Method": methods_repeat, 85 | "Celltype": celltypes_concat, 86 | "Precision": precision_scores_concat, 87 | "Recall": recall_scores_concat, 88 | "F1-score": f1_scores_concat, 89 | "Support": supports_concat, 90 | "Mean KNN F1-score": np.mean(f1_scores_concat) 91 | }) 92 | classification_summary_df.to_csv( 93 | save_loc, 94 | index=False, 95 | sep="\t" 96 | ) 97 | 98 | if __name__ == '__main__': 99 | parser = argparse.ArgumentParser( 100 | description = "Input and output files for KNN classification results summary" 101 | ) 102 | parser.add_argument( 103 | "--infile", 104 | type = str, 105 | help = "Path of integrated h5ad file" 106 | ) 107 | parser.add_argument( 108 | "--outfile", 109 | type = str, 110 | help = "Filepath for saving KNN classification results of integrated h5ad file" 111 | ) 112 | parser.add_argument( 113 | "--dataset", 114 | type = str, 115 | help = "Name of dataset" 116 | ) 117 | parser.add_argument( 118 | "--rep", 119 | type = int, 120 | help = "Repetition number" 121 | ) 122 | args = parser.parse_args() 123 | main( 124 | h5ad_loc = args.infile, 125 | save_loc = args.outfile, 126 | dataset_name = args.dataset, 127 | rep = args.rep 128 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/marker_get.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import dge_top_n 12 | 13 | def main(h5ad_dir, save_loc, top_n = 10): 14 | # Load h5ad files 15 | files_list = os.listdir(h5ad_dir) 16 | adata_loaded = [] 17 | for f in files_list: 18 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 19 | adata.layers["raw"] = adata.X # Store raw counts 20 | adata.obs = adata.obs[["batch", "celltype"]] # Only store relevant columns 21 | if "gene" not in adata.var.columns: 22 | adata.var["gene"] = adata.var_names # Add gene names if not present 23 | adata.var = adata.var[["gene"]] # Only store relevant columns 24 | adata_loaded.append(adata) 25 | 26 | # Get the differential gene expression results for the celltypes in each batch, top n 27 | # for each celltype and get and return the union 28 | adata_dge_top_n_dfs = [] 29 | for adata in adata_loaded: 30 | # Removing human mito and ribo genes (won't affect mouse data) 31 | adata = adata[:, adata.var.gene.str.startswith("MT-") == False] 32 | adata = adata[:, adata.var.gene.str.contains("RPS") == False] 33 | adata = adata[:, adata.var.gene.str.contains("RPL") == False] 34 | adata = adata[:, adata.var.gene.str.contains("MRPL") == False] 35 | adata = adata[:, adata.var.gene.str.contains("MRPS") == False] 36 | 37 | # Removing mouse mito and ribo genes (won't affect human data) 38 | adata = adata[:, adata.var.gene.str.startswith("Mt-") == False] 39 | adata = adata[:, adata.var.gene.str.startswith("mt-") == False] 40 | adata = adata[:, adata.var.gene.str.contains("Rpl") == False] 41 | adata = adata[:, adata.var.gene.str.contains("Rps") == False] 42 | adata = adata[:, 
adata.var.gene.str.contains("Mrpl") == False] 43 | adata = adata[:, adata.var.gene.str.contains("Mrps") == False] 44 | 45 | # Remove any celltypes with less than 5 cells 46 | celltype_vcounts = adata.obs.celltype.value_counts() 47 | celltype_vcounts_sub = celltype_vcounts[celltype_vcounts >= 5] 48 | adata = adata[adata.obs.celltype.isin(celltype_vcounts_sub.index)] 49 | 50 | # Log-normalize the data 51 | sc.pp.normalize_total( 52 | adata, 53 | target_sum = 1e4 54 | ) 55 | sc.pp.log1p(adata) 56 | 57 | # Store lognorm counts and perform DGE based on celltype 58 | adata.raw = adata # Freeze for DGE test - lognorm counts 59 | sc.tl.rank_genes_groups( 60 | adata, 61 | groupby = "celltype", 62 | use_raw = True, 63 | method = "wilcoxon" 64 | ) 65 | 66 | # Get the top n degs for each celltype and append to all results 67 | dge_results = dge_top_n( 68 | adata, 69 | n = top_n, 70 | obs_group = "celltype" 71 | ) 72 | adata_dge_top_n_dfs.append(dge_results) 73 | 74 | # Concatenate all dge dataframes and keep distinct rows 75 | adata_dge_top_n_concat = pd.concat(adata_dge_top_n_dfs) 76 | adata_dge_top_n_concat = adata_dge_top_n_concat.drop_duplicates() 77 | 78 | # Rename columns appropriately and save 79 | adata_dge_top_n_concat.columns = [ 80 | "Celltype", 81 | "Top {n} marker genes (union across batches)".format( 82 | n = top_n 83 | ) 84 | ] 85 | adata_dge_top_n_concat.to_csv(save_loc, sep = "\t", index = False) 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser( 89 | description = "Input and output files for marker gene summary" 90 | ) 91 | parser.add_argument( 92 | "--filedir", 93 | type = str, 94 | help = "Path of directory containing scRNA-seq h5ad files" 95 | ) 96 | parser.add_argument( 97 | "--outfile", 98 | type = str, 99 | help = "Filepath for saving output from marker gene selection" 100 | ) 101 | args = parser.parse_args() 102 | main( 103 | h5ad_dir = args.filedir, 104 | save_loc = args.outfile, 105 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/reference_annotation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | sys.path.append("src/python/") 4 | 5 | from utils import SeuratReferenceMap 6 | 7 | def main(h5ad_loc, ref_h5_loc, save_loc): 8 | # Create an instance of SeuratReferenceMap 9 | refmap = SeuratReferenceMap( 10 | integrated_data_h5 = h5ad_loc, 11 | reference_h5 = ref_h5_loc, 12 | mapped_h5 = save_loc 13 | ) 14 | 15 | # Run SeuratReferenceMap to save mapped query object to h5ad file 16 | refmap.refmap() 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser( 20 | description = "Input and output files for query to reference mapping" 21 | ) 22 | parser.add_argument( 23 | "--infile", 24 | type = str, 25 | help = "Path of integrated h5ad file" 26 | ) 27 | parser.add_argument( 28 | "--ref_file", 29 | type = str, 30 | help = "Path of reference h5Seurat file" 31 | ) 32 | parser.add_argument( 33 | "--outfile", 34 | type = str, 35 | help = "Filepath for saving Seurat reference mapped and annotated h5ad file" 36 | ) 37 | args = parser.parse_args() 38 | main( 39 | h5ad_loc = args.infile, 40 | ref_h5_loc = args.ref_file, 41 | save_loc = args.outfile, 42 | ) 43 | -------------------------------------------------------------------------------- /workflow/scripts/python/reference_control_annotation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 
sys.path.append("src/python/") 4 | 5 | import numpy as np 6 | import scanpy as sc 7 | import anndata as ann 8 | 9 | from utils import cross_data_knn 10 | 11 | def main(h5ad_loc, ref_h5ad_loc, save_loc): 12 | # Load the seurat reference mapped h5ad file 13 | query_h5ad_full = sc.read_h5ad(h5ad_loc) 14 | query_h5ad = query_h5ad_full.raw.to_adata() # Use sctransformed counts 15 | 16 | # Load the reference h5ad file 17 | ref_h5ad = sc.read_h5ad(ref_h5ad_loc) 18 | 19 | # Add var/gene of both information to indices of both 20 | query_h5ad.var.index = query_h5ad.var._index 21 | ref_h5ad.var.index = ref_h5ad.var._index 22 | 23 | # Get the intersection of the genes in the query and reference h5ad files 24 | query_genes = set(query_h5ad.var.index.__array__()) 25 | ref_genes = set(ref_h5ad.var.index.__array__()) 26 | common_genes_list = list(ref_genes.intersection(query_genes)) 27 | 28 | # Subset anndata objects for the common genes 29 | query_h5ad_sub = query_h5ad[:, common_genes_list] 30 | ref_h5ad_sub = ref_h5ad[:, common_genes_list] 31 | 32 | # Ensure genes are equal between query and reference 33 | if not np.array_equal( 34 | query_h5ad_sub.var.index.__array__(), 35 | ref_h5ad_sub.var.index.__array__() 36 | ): 37 | raise ValueError( 38 | "Genes not equal between query and reference h5ad files after intersection" 39 | ) 40 | 41 | # Get highly variable gene subset of the reference data and return 42 | sc.pp.highly_variable_genes(ref_h5ad_sub, flavor="seurat", n_top_genes=2500) 43 | hvg_indices = ref_h5ad_sub.var["highly_variable"].__array__() 44 | 45 | # Get the SCTransformed data subsets for both the query and reference (common gene subsets) 46 | query_sct = query_h5ad_sub.X.toarray() 47 | ref_sct = ref_h5ad_sub.X.toarray() 48 | 49 | # Subset for the indices of the hvg genes 50 | query_sct_hvg_subset = query_sct[:, hvg_indices] 51 | ref_sct_hvg_subset = ref_sct[:, hvg_indices] 52 | 53 | # Get the (1) nearest neighbors for the reference data within the query data 54 | query_1_nn = cross_data_knn(query_sct_hvg_subset, ref_sct_hvg_subset, 1) 55 | 56 | # Get the celltypes (both l1 and l2) corresponding to the nearest neighbors for the reference data 57 | ref_celltypes_l1 = ref_h5ad.obs["celltype.l1"][query_1_nn.flatten()].__array__() 58 | ref_celltypes_l2 = ref_h5ad.obs["celltype.l2"][query_1_nn.flatten()].__array__() 59 | 60 | # Append the celltypes to the query h5ad file 61 | query_h5ad.obs["baseline.knn.l1"] = ref_celltypes_l1 62 | query_h5ad.obs["baseline.knn.l2"] = ref_celltypes_l2 63 | 64 | # Change colnames of query var to not collide with h5ad writing in anndata 65 | query_h5ad.var["gene_name"] = query_h5ad.var.index 66 | query_h5ad.var = query_h5ad.var.drop(query_h5ad.var.columns[0], axis=1) 67 | 68 | # Save the query h5ad file with baseline annotations 69 | query_h5ad.write_h5ad( 70 | filename = save_loc, 71 | compression = "gzip" 72 | ) 73 | 74 | if __name__ == '__main__': 75 | parser = argparse.ArgumentParser( 76 | description = "Input and output files for query to reference mapping - control experiment" 77 | ) 78 | parser.add_argument( 79 | "--infile", 80 | type = str, 81 | help = "Path of seurat reference mapped h5ad file" 82 | ) 83 | parser.add_argument( 84 | "--ref_file", 85 | type = str, 86 | help = "Path of reference h5ad file" 87 | ) 88 | parser.add_argument( 89 | "--outfile", 90 | type = str, 91 | help = "Filepath for saving seurat and control reference mapped and annotated h5ad file" 92 | ) 93 | args = parser.parse_args() 94 | main( 95 | h5ad_loc = args.infile, 96 | ref_h5ad_loc 
= args.ref_file, 97 | save_loc = args.outfile, 98 | ) 99 | -------------------------------------------------------------------------------- /workflow/scripts/python/relatedness_metric.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | 10 | from utils import relatedness_score 11 | 12 | def main(h5ad_dir, save_loc, dataset_name): 13 | # Load h5ad files 14 | files_list = os.listdir(h5ad_dir) 15 | adata_loaded = [] 16 | for f in files_list: 17 | adata = sc.read_h5ad(os.path.join(h5ad_dir, f), as_sparse = "raw/X") 18 | adata_loaded.append(adata) 19 | 20 | # Get relatedness metric for each celltype within each batch 21 | celltype_relatedness_dfs = [] 22 | for adata in adata_loaded: 23 | celltype_relatedness_df = relatedness_score(adata, pca_performed = False) 24 | celltype_relatedness_dfs.append(celltype_relatedness_df) 25 | 26 | # Concatenate results, add relevant metadata and save 27 | celltype_relatedness_dfs_concat = pd.concat(celltype_relatedness_dfs) 28 | celltype_relatedness_dfs_concat["Dataset"] = dataset_name 29 | celltype_relatedness_dfs_concat.to_csv(save_loc, sep = "\t", index = False) 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser( 33 | description = "Input and output files for calculating relatedness metric" 34 | ) 35 | parser.add_argument( 36 | "--filedir", 37 | type = str, 38 | help = "Path of directory containing scRNA-seq h5ad files" 39 | ) 40 | parser.add_argument( 41 | "--outfile", 42 | type = str, 43 | help = "Filepath for saving output from relatedness metric calculation" 44 | ) 45 | parser.add_argument( 46 | "--dataset", 47 | type = str, 48 | help = "Name of dataset" 49 | ) 50 | args = parser.parse_args() 51 | main( 52 | h5ad_dir = args.filedir, 53 | save_loc = args.outfile, 54 | dataset_name = args.dataset 55 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/ti_concordance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | import scipy.stats as sp 10 | 11 | def main(h5ad_loc, save_loc, dataset_name, rep): 12 | # Load h5ad file 13 | adata = sc.read_h5ad(h5ad_loc) 14 | 15 | # Extract summary statistics from h5ad file 16 | num_batches_ds = adata.uns["downsampling_stats"]["num_batches"] 17 | num_celltypes_ds = adata.uns["downsampling_stats"]["num_celltypes_downsampled"] 18 | prop_ds = adata.uns["downsampling_stats"]["proportion_downsampled"] 19 | batches = np.unique(adata.obs.batch.__array__()) 20 | 21 | # Drop any samples/cells that contain NaN or Inf pseudotime estimates 22 | pt_drop_indices_1 = np.where(np.isnan(adata.obs.dpt_pseudotime.__array__()))[0] 23 | pt_drop_indices_2 = np.where(np.isinf(adata.obs.dpt_pseudotime.__array__()))[0] 24 | pt_drop_indices = np.unique(np.concatenate([pt_drop_indices_1, pt_drop_indices_2])) 25 | pt_drop_samples = np.unique(adata.obs["sample"].__array__()[pt_drop_indices]) 26 | adata = adata[~adata.obs["sample"].isin(pt_drop_samples)].copy() 27 | 28 | # Subset h5ad based on batch-correction method used 29 | adata_method_sub = [] 30 | methods = ["harmony", "scvi", "bbknn", "scanorama", "seurat", "liger"] 31 | for method in methods: 32 | adata_sub = 
adata[adata.obs["integration_method"] == method] 33 | adata_method_sub.append( 34 | adata_sub 35 | ) 36 | 37 | # Subset the data for unintegrated results and extract the dpt_pseudotime values 38 | unintegrated_adata = adata[ 39 | adata.obs["integration_method"] == "unintegrated" 40 | ] 41 | unintegrated_pt = unintegrated_adata.obs["dpt_pseudotime"].__array__() 42 | 43 | # Determine pearson, spearman, and kendall correlations between post-integration 44 | # PAGA estimated pseudotime and pre-integration pseudotime for each batch-correction 45 | # method 46 | spearman_corrs = [] 47 | pearson_corrs = [] 48 | kendall_corrs = [] 49 | for adata_sub in adata_method_sub: 50 | # Get DPT pseudotime estimates 51 | dpt_pt = adata_sub.obs["dpt_pseudotime"].__array__() 52 | 53 | # Get correlations between pre-integration/unintegrated pseudotime and DPT pseudotime 54 | spearman_corr = sp.spearmanr(unintegrated_pt, dpt_pt)[0] 55 | pearson_corr = sp.pearsonr(unintegrated_pt, dpt_pt)[0] 56 | kendall_corr = sp.kendalltau(unintegrated_pt, dpt_pt)[0] 57 | 58 | spearman_corrs.append(spearman_corr) 59 | pearson_corrs.append(pearson_corr) 60 | kendall_corrs.append(kendall_corr) 61 | 62 | # Create a dataframe with the results 63 | ti_corr_df = pd.DataFrame({ 64 | "Method" : methods, 65 | "Spearman correlations" : spearman_corrs, 66 | "Pearson correlations" : pearson_corrs, 67 | "Kendall correlations" : kendall_corrs 68 | }) 69 | ti_corr_df["Dataset"] = dataset_name 70 | ti_corr_df["Number of batches downsampled"] = num_batches_ds 71 | ti_corr_df["Number of celltypes downsampled"] = num_celltypes_ds 72 | ti_corr_df["Proportion downsampled"] = prop_ds 73 | ti_corr_df["Replicate"] = rep 74 | ti_corr_df["Total batches"] = len(batches) 75 | 76 | # Save dataframe to file 77 | ti_corr_df.to_csv( 78 | save_loc, 79 | index = False, 80 | sep = "\t" 81 | ) 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser( 85 | description = "Input and output files for trajectory inference scoring" 86 | ) 87 | parser.add_argument( 88 | "--infile", 89 | type = str, 90 | help = "Path of PAGA integrated h5ad file" 91 | ) 92 | parser.add_argument( 93 | "--outfile", 94 | type = str, 95 | help = "Filepath for saving trajectory inference scoring results" 96 | ) 97 | parser.add_argument( 98 | "--dataset", 99 | type = str, 100 | help = "Name of dataset" 101 | ) 102 | parser.add_argument( 103 | "--rep", 104 | type = int, 105 | help = "Repetition number" 106 | ) 107 | args = parser.parse_args() 108 | main( 109 | h5ad_loc = args.infile, 110 | save_loc = args.outfile, 111 | dataset_name = args.dataset, 112 | rep = args.rep 113 | ) -------------------------------------------------------------------------------- /workflow/scripts/python/umap_plots.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | sys.path.append("src/python/") 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import anndata as ann 9 | import scanpy as sc 10 | 11 | from utils import Umap 12 | 13 | def main(h5ad_loc, leiden_save_loc, celltype_save_loc, batch_save_loc): 14 | # Load h5ad file 15 | adata = sc.read_h5ad(h5ad_loc) 16 | 17 | # Get the umap coordinates for all the methods and create dictionary object 18 | methods = ["bbknn", "harmony", "scanorama", "scvi", "seurat"] 19 | umap_dict = {} 20 | for method in methods: 21 | adata_sub = adata[adata.obs["integration_method"] == method] 22 | umap_dict[method] = adata_sub.obsm["X_umap"].__array__() 23 | 24 | # Get the leiden 
clustering for all the methods and create dictionary object 25 | leiden_dict = {} 26 | for method in methods: 27 | adata_sub = adata[adata.obs["integration_method"] == method] 28 | leiden_dict[method] = adata_sub.obs["leiden"].__array__() 29 | 30 | # Create a cell type dictionary object 31 | celltype_dict = {} 32 | for method in methods: 33 | adata_sub = adata[adata.obs["integration_method"] == method] 34 | celltype_dict[method] = adata_sub.obs["celltype"].__array__() 35 | 36 | # Create a batch dictionary object 37 | batch_dict = {} 38 | for method in methods: 39 | adata_sub = adata[adata.obs["integration_method"] == method] 40 | batch_dict[method] = adata_sub.obs["batch"].__array__() 41 | 42 | # Create a umap object for each subset of information 43 | umap_leiden = Umap( 44 | coords = umap_dict, 45 | clustering = leiden_dict, 46 | subset_name = "Clustering", 47 | ) 48 | umap_celltype = Umap( 49 | coords = umap_dict, 50 | clustering = celltype_dict, 51 | subset_name = "Cell-type", 52 | ) 53 | umap_batch = Umap( 54 | coords = umap_dict, 55 | clustering = batch_dict, 56 | subset_name = "Batch", 57 | ) 58 | 59 | # Plot each of the umap objects 60 | umap_leiden.umap_df() 61 | umap_leiden.umap_plot(show_plot=True) 62 | umap_leiden.save_umap( 63 | save_dir=leiden_save_loc, 64 | dpi=300 65 | ) 66 | 67 | umap_celltype.umap_df() 68 | umap_celltype.umap_plot(show_plot=True) 69 | umap_celltype.save_umap( 70 | save_dir=celltype_save_loc, 71 | dpi=300 72 | ) 73 | 74 | umap_batch.umap_df() 75 | umap_batch.umap_plot(show_plot=True) 76 | umap_batch.save_umap( 77 | save_dir=batch_save_loc, 78 | dpi=300 79 | ) 80 | 81 | # Save the umap dataframes as auxiliary files 82 | umap_leiden_df = umap_leiden.umap_concat 83 | umap_celltype_df = umap_celltype.umap_concat 84 | umap_batch_df = umap_batch.umap_concat 85 | 86 | umap_leiden_df.to_csv( 87 | os.path.splitext(leiden_save_loc)[0] + ".tsv", 88 | sep = "\t" 89 | ) 90 | umap_celltype_df.to_csv( 91 | os.path.splitext(celltype_save_loc)[0] + ".tsv", 92 | sep = "\t" 93 | ) 94 | umap_batch_df.to_csv( 95 | os.path.splitext(batch_save_loc)[0] + ".tsv", 96 | sep = "\t" 97 | ) 98 | 99 | if __name__ == '__main__': 100 | parser = argparse.ArgumentParser( 101 | description = "Input and output files for UMAP plot generation" 102 | ) 103 | parser.add_argument( 104 | "--infile", 105 | type = str, 106 | help = "Path of integrated h5ad file" 107 | ) 108 | parser.add_argument( 109 | "--leiden_plot_loc", 110 | type = str, 111 | help = "Filepath for saving leiden overlayed UMAP results" 112 | ) 113 | parser.add_argument( 114 | "--celltype_plot_loc", 115 | type = str, 116 | help = "Filepath for saving celltype overlayed UMAP results" 117 | ) 118 | parser.add_argument( 119 | "--batch_plot_loc", 120 | type = str, 121 | help = "Filepath for saving batch overlayed UMAP results" 122 | ) 123 | args = parser.parse_args() 124 | main( 125 | h5ad_loc = args.infile, 126 | leiden_save_loc = args.leiden_plot_loc, 127 | celltype_save_loc = args.celltype_plot_loc, 128 | batch_save_loc = args.batch_plot_loc 129 | ) -------------------------------------------------------------------------------- /workflow/src/R/liger_integrate.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(liger) 3 | library(Seurat) 4 | library(SeuratDisk) 5 | library(reticulate) 6 | 7 | # Get a random seed 8 | rand_seed <- sample(1:100000000, 1) 9 | 10 | # Read in matrix for full data, including last batch column 11 | args <- commandArgs(trailingOnly = TRUE) 12 
| file <- args[1] 13 | filename <- args[2] 14 | 15 | # Load anndata, scanpy, and scipy sparse through reticulate 16 | ad <- import("anndata") 17 | sc <- import("scanpy") 18 | sp_sparse <- import("scipy.sparse") 19 | 20 | # Load h5ad object through reticulate and create Seurat object 21 | temp_adata <- ad$read_h5ad(file) 22 | exprs <- t(temp_adata$X$todense()) 23 | colnames(exprs) <- temp_adata$obs_names$to_list() 24 | rownames(exprs) <- temp_adata$var_names$to_list() 25 | seur_obj <- CreateSeuratObject(exprs) 26 | seur_obj <- SetAssayData(seur_obj, "data", exprs) 27 | seur_obj <- AddMetaData(seur_obj, temp_adata$obs) 28 | 29 | # Split object by batch information 30 | seur_obj_list <- SplitObject( 31 | seur_obj, 32 | split.by = "batch" 33 | ) 34 | 35 | # Get matrices of rna data for each batch and name by batch 36 | seur_counts_list <- lapply(seur_obj_list, function(x) { 37 | return(x@assays$RNA@counts) 38 | }) 39 | seur_counts_list_names <- lapply(seur_obj_list, function(x) { 40 | return(unique(x@meta.data$batch)) 41 | }) 42 | names(seur_counts_list) <- seur_counts_list_names 43 | 44 | # Create Liger object from seurat list of matrices 45 | liger_obj <- createLiger(seur_counts_list, remove.missing = FALSE) 46 | 47 | # Normalize and select highly variable genes using LIGER's functions 48 | liger_obj <- normalize(liger_obj) 49 | liger_obj <- selectGenes(liger_obj) 50 | 51 | # Scale data, perform iNFM and quantile normalization 52 | liger_obj <- scaleNotCenter(liger_obj) 53 | liger_obj <- optimizeALS(liger_obj, k = 20, rand.seed = rand_seed) 54 | liger_obj <- quantile_norm(liger_obj) # No seeding done in version 0.5.0 55 | 56 | # Extract normalized cell loadings, save as h5seurat object, 57 | # and convert to h5ad 58 | liger_norm_h <- liger_obj@H.norm 59 | rownames(liger_norm_h) <- colnames(seur_obj) 60 | colnames(liger_norm_h) <- paste0( 61 | "h_norm_comp_", seq(1:ncol(liger_norm_h)) 62 | ) 63 | norm_cell_loadings <- CreateSeuratObject(counts = t(liger_norm_h)) 64 | SaveH5Seurat( 65 | object = norm_cell_loadings, 66 | filename = paste0("./tmp/", filename, "_liger_out.h5seurat"), 67 | overwrite = TRUE, 68 | verbose = TRUE 69 | ) 70 | 71 | # Convert tempfile to h5ad object 72 | Convert( 73 | paste0("./tmp/", filename, "_liger_out.h5seurat"), 74 | dest = "h5ad" 75 | ) -------------------------------------------------------------------------------- /workflow/src/R/seurat_integrate.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(SeuratDisk) 3 | library(reticulate) 4 | 5 | # Read in matrix for full data, including last batch column 6 | args <- commandArgs(trailingOnly = TRUE) 7 | file <- args[1] 8 | filename <- args[2] 9 | int_type <- args[3] # Integration type 10 | 11 | # Load anndata and scanpy 12 | ad <- import("anndata") 13 | sc <- import("scanpy") 14 | 15 | # Load h5ad object through reticulate and create Seurat object 16 | temp_adata <- ad$read_h5ad(file) 17 | exprs <- t(temp_adata$X$todense()) 18 | colnames(exprs) <- temp_adata$obs_names$to_list() 19 | rownames(exprs) <- temp_adata$var_names$to_list() 20 | seur_obj <- CreateSeuratObject(exprs) 21 | seur_obj <- SetAssayData(seur_obj, "data", exprs) 22 | seur_obj <- AddMetaData(seur_obj, temp_adata$obs) 23 | 24 | # Split object by batch information 25 | seur_obj_list <- SplitObject( 26 | seur_obj, 27 | split.by = "batch" 28 | ) 29 | 30 | # Iterate over batches and find highly variable genes 31 | for (i in 1:length(seur_obj_list)) { 32 | seur_obj_list[[i]] <- 
FindVariableFeatures( 33 | seur_obj_list[[i]], 34 | selection.method = "mean.var.plot", 35 | nfeatures = 2500, 36 | verbose = TRUE 37 | ) 38 | } 39 | 40 | # Determine type of integration to perform (CCA or RPCA) 41 | if (int_type == "CCA") { 42 | int_anchors <- FindIntegrationAnchors( 43 | object.list = seur_obj_list, 44 | dims = 1:20, 45 | anchor.features = 2500 46 | ) 47 | batches_integrated <- IntegrateData(anchorset = int_anchors, dims = 1:20) 48 | } else if (int_type == "RPCA") { 49 | int_features <- SelectIntegrationFeatures(object.list = seur_obj_list) 50 | for (i in 1:length(seur_obj_list)) { 51 | x <- seur_obj_list[[i]] 52 | x <- ScaleData(x, features = int_features, verbose = TRUE) 53 | x <- RunPCA(x, features = int_features, verbose = TRUE) 54 | seur_obj_list[[i]] <- x 55 | } 56 | int_anchors <- FindIntegrationAnchors( 57 | object.list = seur_obj_list, 58 | reduction = "rpca", 59 | dims = 1:20 60 | ) 61 | batches_integrated <- IntegrateData(anchorset = int_anchors, dims = 1:20) 62 | } else { 63 | stop( 64 | "Please indicate either 'CCA' or 'RPCA' for the integration type option" 65 | ) 66 | } 67 | 68 | # Return integrated adata object as hda5 file -> tempfile 69 | SaveH5Seurat( 70 | object = batches_integrated, 71 | filename = paste0("./tmp/", filename, "_seur_out.h5Seurat"), 72 | overwrite = TRUE, 73 | verbose = TRUE 74 | ) 75 | 76 | # Convert tempfile to h5ad object 77 | Convert( 78 | paste0("./tmp/", filename, "_seur_out.h5Seurat"), 79 | dest = "h5ad" 80 | ) 81 | -------------------------------------------------------------------------------- /workflow/src/R/seurat_reference_map.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(SeuratDisk) 3 | library(reticulate) 4 | 5 | # Read in arguments 6 | args <- commandArgs(trailingOnly = TRUE) 7 | ref_file <- args[1] 8 | temp_adata_file <- args[2] 9 | temp_adata_filename <- args[3] 10 | outfile_name <- args[4] 11 | 12 | # Read in h5seurat reference data 13 | ref_data <- LoadH5Seurat(ref_file) 14 | 15 | # Load anndata and scanpy 16 | ad <- import("anndata") 17 | sc <- import("scanpy") 18 | 19 | # Convert h5ad anndata temp file 20 | temp_adata <- ad$read_h5ad(temp_adata_file) 21 | 22 | # Create Seurat object and split by batch information - use anndata import 23 | exprs <- t(temp_adata$X$todense()) 24 | colnames(exprs) <- temp_adata$obs_names$to_list() 25 | rownames(exprs) <- temp_adata$var_names$to_list() 26 | query_obj <- CreateSeuratObject(exprs) 27 | query_obj <- SetAssayData(query_obj, "data", exprs) 28 | query_obj <- AddMetaData(query_obj, temp_adata$obs) 29 | query_obj_list <- SplitObject( 30 | query_obj, 31 | split.by = "batch" 32 | ) 33 | 34 | # Normalize query batches using scTransform 35 | query_obj_list <- lapply(X = query_obj_list, FUN = SCTransform, verbose = FALSE) 36 | 37 | # Get anchors between each query batches and the reference 38 | anchors <- list() 39 | for (i in 1:length(query_obj_list)) { 40 | anchors[[i]] <- FindTransferAnchors( 41 | reference = ref_data, 42 | query = query_obj_list[[i]], 43 | reference.reduction = "spca", 44 | dims = 1:50 45 | ) 46 | } 47 | 48 | # Note - this may not be ideal to simulate effects of downsampling 49 | # as each batch is being mapped individually here and not separately 50 | # Map each of the query batches individually 51 | for (i in 1:length(query_obj_list)) { 52 | query_obj_list[[i]] <- MapQuery( 53 | anchorset = anchors[[i]], 54 | query = query_obj_list[[i]], 55 | reference = ref_data, 56 | refdata = list( 57 | 
celltype.l1 = "celltype.l1", 58 | celltype.l2 = "celltype.l2", 59 | predicted_ADT = "ADT" 60 | ), 61 | reference.reduction = "spca", 62 | reduction.model = "wnn.umap" 63 | ) 64 | } 65 | 66 | # Set default assay of all to SCT (for outputting scTransformed counts) 67 | for (i in 1:length(query_obj_list)) { 68 | DefaultAssay(query_obj_list[[i]]) <- "SCT" 69 | } 70 | 71 | # Remerge the batches into one object - reset default assay as failsafe 72 | query_ref_mapped_obj <- Reduce(merge, query_obj_list) 73 | DefaultAssay(query_ref_mapped_obj) <- "SCT" 74 | 75 | # Return reference mapped adata object as hda5 file -> tempfile 76 | SaveH5Seurat( 77 | object = query_ref_mapped_obj, 78 | filename = paste0(outfile_name, ".h5Seurat"), 79 | overwrite = TRUE, 80 | verbose = TRUE 81 | ) 82 | 83 | # Convert tempfile to h5ad object 84 | Convert( 85 | paste0(outfile_name, ".h5Seurat"), 86 | dest = "h5ad" 87 | ) 88 | 89 | # Remove h5seurat file 90 | file.remove( 91 | paste0(outfile_name, ".h5Seurat") 92 | ) -------------------------------------------------------------------------------- /workflow/src/python/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import Integration, IntegrationPAGA, cluster_num, leiden_clip, \ 2 | cluster_membership, downsample, diffexp, dge_top_n, set_concordance, \ 3 | cluster_concordance, faiss_kmeans, SeuratReferenceMap, mutual_nn, \ 4 | find_mutual_nn, find_knn, cross_data_knn, relatedness_score 5 | from .imbalanced_clustering import balanced_adjusted_rand_index, \ 6 | balanced_adjusted_mutual_info, balanced_completeness, balanced_homogeneity, \ 7 | balanced_v_measure -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | from .ari import balanced_adjusted_rand_index 3 | from .ami import balanced_adjusted_mutual_info 4 | from .vmeasure import balanced_homogeneity, balanced_completeness, balanced_v_measure 5 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/ami.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | from .utils import ( 5 | check_clusterings, 6 | contingency_matrix, 7 | entropy, 8 | mutual_info_score, 9 | expected_mutual_information, 10 | generalized_average, 11 | ) 12 | 13 | 14 | def balanced_adjusted_mutual_info( 15 | labels_true, labels_pred, *, average_method="arithmetic", reweigh=True 16 | ): 17 | """Mutual Information adjusted for chance and balanced across true labels. 18 | Adjusted Mutual Information (AMI) is an adjustment of the Mutual 19 | Information (MI) score to account for chance. It accounts for the fact that 20 | the MI is generally higher for two clusterings with a larger number of 21 | clusters, regardless of whether there is actually more information shared. 22 | For two clusterings :math:`U` and :math:`V`, the AMI is given as:: 23 | AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))] 24 | This metric is independent of the absolute values of the labels: 25 | a permutation of the class or cluster label values won't change the 26 | score value in any way. 
27 | The original AMI metric is a symmetric measure: switching :math:`U` 28 | (``label_true``) with :math:`V` (``labels_pred``) will return the same score value, 29 | but this is not the case for the reweighted and balanced AMI. 30 | The balanced AMI is obtained by reweighing the contingency table 31 | for all true label marginals, such that they sum to the same nummber, 32 | while preserving the total number of samples. 33 | Be mindful that this function is an order of magnitude slower than other 34 | metrics, such as the Adjusted Rand Index. 35 | Parameters 36 | ---------- 37 | labels_true : int array, shape = [n_samples] 38 | A clustering of the data into disjoint subsets, called :math:`U` in 39 | the above formula. 40 | labels_pred : int array-like of shape (n_samples,) 41 | A clustering of the data into disjoint subsets, called :math:`V` in 42 | the above formula. 43 | average_method : str, default='arithmetic' 44 | How to compute the normalizer in the denominator. Possible options 45 | are 'min', 'geometric', 'arithmetic', and 'max'. 46 | .. versionadded:: 0.20 47 | .. versionchanged:: 0.22 48 | The default value of ``average_method`` changed from 'max' to 49 | 'arithmetic'. 50 | reweigh : bool, default=True 51 | if `True`, reweighs the contingency table based on the true labels 52 | such that they all have equal membership. The total number of samples 53 | is preserved with a round-off error. If 'False', this reverts the 54 | balanced AMI to the original AMI implementation. 55 | Returns 56 | ------- 57 | AMI: float (upperlimited by 1.0) 58 | The AMI returns a value of 1 when the two partitions are identical 59 | (ie perfectly matched). Random partitions (independent labellings) have 60 | an expected AMI around 0 on average hence can be negative. The value is 61 | in adjusted nats (based on the natural logarithm). 62 | References 63 | ---------- 64 | .. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for 65 | Clusterings Comparison: Variants, Properties, Normalization and 66 | Correction for Chance, JMLR 67 | `_ 68 | .. [2] `Wikipedia entry for the Adjusted Mutual Information 69 | `_ 70 | """ 71 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 72 | n_samples = labels_true.shape[0] 73 | classes = np.unique(labels_true) 74 | clusters = np.unique(labels_pred) 75 | # Special limit cases: no clustering since the data is not split. 76 | # This is a perfect match hence return 1.0. 77 | if ( 78 | classes.shape[0] == clusters.shape[0] == 1 79 | or classes.shape[0] == clusters.shape[0] == 0 80 | ): 81 | return 1.0 82 | contingency = contingency_matrix( 83 | labels_true, labels_pred, reweigh=reweigh, sparse=True 84 | ) 85 | # Recalculate labels_true and labels_pred if reweigh is True to 86 | # factor in the reweighting based on the true class frequencies. 
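    # As a concrete illustration of the reweighting: for
    # labels_true = [0, 0, 0, 0, 0, 0, 1, 1] and
    # labels_pred = [0, 0, 0, 1, 1, 1, 2, 2], the raw contingency table
    # (rows = true classes, columns = predicted clusters) is
    #     [[3, 3, 0],
    #      [0, 0, 2]]  with row sums (6, 2).
    # Reweighing rescales each row to the rounded mean row sum (4 here,
    # up to integer round-off in general), giving
    #     [[2, 2, 0],
    #      [0, 0, 4]]
    # so both true classes carry equal weight while the total stays at the
    # original 8 samples; the label vectors are then rebuilt from these new
    # marginals below.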
87 | # These won't preserve order but this is fine since entropy is 88 | # invariant to order 89 | if reweigh is True: 90 | true_sums = np.squeeze(np.asarray(sp.csc_matrix.sum(contingency, axis = 1))) 91 | pred_sums = np.squeeze(np.asarray(sp.csc_matrix.sum(contingency, axis = 0))) 92 | labels_true = np.repeat( 93 | np.arange(len(true_sums)), true_sums 94 | ) 95 | labels_pred = np.repeat( 96 | np.arange(len(pred_sums)), pred_sums 97 | ) 98 | contingency = contingency.astype(np.float64) 99 | # Calculate the MI for the two clusterings 100 | mi = mutual_info_score(labels_true, labels_pred, contingency=contingency) 101 | # Calculate the expected value for the mutual information 102 | emi = expected_mutual_information(contingency, n_samples) 103 | # Calculate entropy for each labeling 104 | h_true, h_pred = entropy(labels_true), entropy(labels_pred) 105 | normalizer = generalized_average(h_true, h_pred, average_method) 106 | denominator = normalizer - emi 107 | # Avoid 0.0 / 0.0 when expectation equals maximum, i.e a perfect match. 108 | # normalizer should always be >= emi, but because of floating-point 109 | # representation, sometimes emi is slightly larger. Correct this 110 | # by preserving the sign. 111 | if denominator < 0: 112 | denominator = min(denominator, -np.finfo("float64").eps) 113 | else: 114 | denominator = max(denominator, np.finfo("float64").eps) 115 | ami = (mi - emi) / denominator 116 | return ami 117 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/ari.py: -------------------------------------------------------------------------------- 1 | from .utils import pair_confusion_matrix 2 | 3 | 4 | def balanced_adjusted_rand_index(labels_true, labels_pred, reweigh=True): 5 | """Rand index adjusted for chance and balanced across true labels. 6 | The Rand Index computes a similarity measure between two clusterings 7 | by considering all pairs of samples and counting pairs that are 8 | assigned in the same or different clusters in the predicted and 9 | true clusterings. 10 | The raw RI score is then "adjusted for chance" into the ARI score 11 | using the following scheme:: 12 | ARI = (RI - Expected_RI) / (max(RI) - Expected_RI) 13 | The adjusted Rand index is thus ensured to have a value close to 14 | 0.0 for random labeling independently of the number of clusters and 15 | samples and exactly 1.0 when the clusterings are identical (up to 16 | a permutation). 17 | The original ARI is a symmetric measure: 18 | adjusted_rand_score(a, b) == adjusted_rand_score(b, a) 19 | But this does not hold due for the balanced ARI metric. 20 | The balanced ARI is obtained by reweighing the contingency table 21 | for all true label marginals, such that they sum to the same nummber, 22 | while preserving the total number of samples. 23 | Parameters 24 | ---------- 25 | labels_true : int array, shape = [n_samples] 26 | Ground truth class labels to be used as a reference 27 | labels_pred : array-like of shape (n_samples,) 28 | Cluster labels to evaluate 29 | reweigh : bool, default=True 30 | if `True`, reweighs the contingency table based on the true labels 31 | such that they all have equal membership. The total number of samples 32 | is preserved with a round-off error. If 'False', this reverts the 33 | balanced ARI to the original ARI implementation. 34 | Returns 35 | ------- 36 | ARI : float 37 | Similarity score between -1.0 and 1.0. Random labelings have an ARI 38 | close to 0.0. 1.0 stands for perfect match. 
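    A minimal usage sketch (assuming this ``imbalanced_clustering`` package
    is importable, e.g. with ``workflow/src/python`` on the Python path):
    >>> from imbalanced_clustering import balanced_adjusted_rand_index
    >>> labels_true = [0, 0, 0, 0, 0, 0, 1, 1]  # rare second class
    >>> labels_pred = [1, 1, 1, 1, 1, 1, 0, 0]  # same partition, relabelled
    >>> balanced_adjusted_rand_index(labels_true, labels_pred, reweigh=True)
    1.0
    >>> noisy_pred = [0, 0, 1, 1, 0, 0, 1, 1]  # imperfect clustering
    >>> score = balanced_adjusted_rand_index(labels_true, noisy_pred)  # < 1.0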
39 | References 40 | ---------- 41 | .. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions, 42 | Journal of Classification 1985 43 | https://link.springer.com/article/10.1007%2FBF01908075 44 | .. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie 45 | adjusted Rand index, Psychological Methods 2004 46 | .. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index 47 | """ 48 | (tn, fp), (fn, tp) = pair_confusion_matrix( 49 | labels_true, labels_pred, reweigh=reweigh 50 | ) 51 | # convert to Python integer types, to avoid overflow or underflow 52 | tn, fp, fn, tp = int(tn), int(fp), int(fn), int(tp) 53 | 54 | # Special cases: empty data or full agreement 55 | if fn == 0 and fp == 0: 56 | return 1.0 57 | 58 | return 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) 59 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import pyximport 2 | import numpy 3 | 4 | pyximport.install(setup_args={"include_dirs": numpy.get_include()}, reload_support=True) 5 | from ._emi_cython import expected_mutual_information 6 | from .contingency import pair_confusion_matrix, contingency_matrix 7 | from .checks import check_clusterings 8 | from .mi import mutual_info_score, entropy 9 | from .avg import generalized_average 10 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/_emi_cython.pyx: -------------------------------------------------------------------------------- 1 | # Authors: Robert Layton 2 | # Corey Lynch 3 | # License: BSD 3 clause 4 | 5 | from libc.math cimport exp, lgamma 6 | from scipy.special import gammaln 7 | import numpy as np 8 | cimport numpy as np 9 | cimport cython 10 | 11 | np.import_array() 12 | ctypedef np.float64_t DOUBLE 13 | 14 | def expected_mutual_information(contingency, int n_samples): 15 | """Calculate the expected mutual information for two labelings.""" 16 | cdef int R, C 17 | cdef DOUBLE N, gln_N, emi, term2, term3, gln 18 | cdef np.ndarray[DOUBLE] gln_a, gln_b, gln_Na, gln_Nb, gln_nij, log_Nnij 19 | cdef np.ndarray[DOUBLE] nijs, term1 20 | cdef np.ndarray[DOUBLE] log_a, log_b 21 | cdef np.ndarray[np.int32_t] a, b 22 | #cdef np.ndarray[int, ndim=2] start, end 23 | R, C = contingency.shape 24 | N = n_samples 25 | a = np.ravel(contingency.sum(axis=1).astype(np.int32, copy=False)) 26 | b = np.ravel(contingency.sum(axis=0).astype(np.int32, copy=False)) 27 | # There are three major terms to the EMI equation, which are multiplied to 28 | # and then summed over varying nij values. 29 | # While nijs[0] will never be used, having it simplifies the indexing. 30 | nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float') 31 | nijs[0] = 1 # Stops divide by zero warnings. As its not used, no issue. 32 | # term1 is nij / N 33 | term1 = nijs / N 34 | # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) 35 | log_a = np.log(a) 36 | log_b = np.log(b) 37 | # term2 uses log(N * nij) = log(N) + log(nij) 38 | log_Nnij = np.log(N) + np.log(nijs) 39 | # term3 is large, and involved many factorials. Calculate these in log 40 | # space to stop overflows. 
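    # Each gln_* value below is a log-factorial (gammaln(x + 1) == log(x!)).
    # Inside the triple loop they are combined into 'gln', the log of the
    # hypergeometric probability of observing exactly nij samples in cell
    # (i, j) given fixed marginals a[i], b[j] and N total samples, so
    # term3 = exp(gln) weights each term1 * term2 contribution by how likely
    # that cell count is under random labellings with fixed marginals
    # (Vinh, Epps and Bailey, 2010).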
41 | gln_a = gammaln(a + 1) 42 | gln_b = gammaln(b + 1) 43 | gln_Na = gammaln(N - a + 1) 44 | gln_Nb = gammaln(N - b + 1) 45 | gln_N = gammaln(N + 1) 46 | gln_nij = gammaln(nijs + 1) 47 | # start and end values for nij terms for each summation. 48 | start = np.array([[v - N + w for w in b] for v in a], dtype='int') 49 | start = np.maximum(start, 1) 50 | end = np.minimum(np.resize(a, (C, R)).T, np.resize(b, (R, C))) + 1 51 | # emi itself is a summation over the various values. 52 | emi = 0.0 53 | cdef Py_ssize_t i, j, nij 54 | for i in range(R): 55 | for j in range(C): 56 | for nij in range(start[i,j], end[i,j]): 57 | term2 = log_Nnij[nij] - log_a[i] - log_b[j] 58 | # Numerators are positive, denominators are negative. 59 | gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] 60 | - gln_N - gln_nij[nij] - lgamma(a[i] - nij + 1) 61 | - lgamma(b[j] - nij + 1) 62 | - lgamma(N - a[i] - b[j] + nij + 1)) 63 | term3 = exp(gln) 64 | emi += (term1[nij] * term2 * term3) 65 | return emi -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/avg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def generalized_average(U, V, average_method): 5 | """Return a particular mean of two numbers.""" 6 | if average_method == "min": 7 | return min(U, V) 8 | elif average_method == "geometric": 9 | return np.sqrt(U * V) 10 | elif average_method == "arithmetic": 11 | return np.mean([U, V]) 12 | elif average_method == "max": 13 | return max(U, V) 14 | else: 15 | raise ValueError( 16 | "'average_method' must be 'min', 'geometric', 'arithmetic', or 'max'" 17 | ) 18 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/contingency.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | from sklearn.utils import sparsefuncs 4 | 5 | 6 | def contingency_matrix( 7 | labels_true, labels_pred, *, reweigh=False, eps=None, sparse=False, dtype=np.int64 8 | ): 9 | """Build a contingency matrix describing the relationship between labels. 10 | Parameters 11 | ---------- 12 | labels_true : int array, shape = [n_samples] 13 | Ground truth class labels to be used as a reference. 14 | labels_pred : array-like of shape (n_samples,) 15 | Cluster labels to evaluate. 16 | reweigh : bool, default=False 17 | if `True`, reweighs the contingency table based on the true labels 18 | such that they all have equal membership. The total number of samples 19 | is preserved with a round-off error. 20 | eps : float, default=None 21 | If a float, that value is added to all values in the contingency 22 | matrix. This helps to stop NaN propagation. 23 | If ``None``, nothing is adjusted. 24 | sparse : bool, default=False 25 | If `True`, return a sparse CSR continency matrix. If `eps` is not 26 | `None` and `sparse` is `True` will raise ValueError. 27 | .. versionadded:: 0.18 28 | dtype : numeric type, default=np.int64 29 | Output dtype. Ignored if `eps` is not `None`. 30 | .. versionadded:: 0.24 31 | Returns 32 | ------- 33 | contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred] 34 | Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in 35 | true class :math:`i` and in predicted class :math:`j`. If 36 | ``eps is None``, the dtype of this array will be integer unless set 37 | otherwise with the ``dtype`` argument. 
If ``eps`` is given, the dtype 38 | will be float. 39 | Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``. 40 | """ 41 | 42 | if eps is not None and sparse: 43 | raise ValueError("Cannot set 'eps' when sparse=True") 44 | 45 | classes, class_idx = np.unique(labels_true, return_inverse=True) 46 | clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) 47 | n_classes = classes.shape[0] 48 | n_clusters = clusters.shape[0] 49 | # Using coo_matrix to accelerate simple histogram calculation, 50 | # i.e. bins are consecutive integers 51 | # Currently, coo_matrix is faster than histogram2d for simple cases 52 | contingency = sp.coo_matrix( 53 | (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), 54 | shape=(n_classes, n_clusters), 55 | dtype=dtype, 56 | ) 57 | if sparse: 58 | contingency = contingency.tocsr() 59 | contingency.sum_duplicates() 60 | else: 61 | contingency = contingency.toarray() 62 | if eps is not None: 63 | # don't use += as contingency is integer 64 | contingency = contingency + eps 65 | # reweight contingency table if indicated 66 | if reweigh is True: 67 | contingency = contingency.astype(np.float64) 68 | counts_sum_per_class = np.ravel(contingency.sum(1)) 69 | target = round(np.mean(counts_sum_per_class)) 70 | counts_norm = counts_sum_per_class / target 71 | sparsefuncs.inplace_row_scale(contingency, 1 / counts_norm) 72 | contingency = contingency.astype(np.int64) 73 | 74 | return contingency 75 | 76 | 77 | def pair_confusion_matrix(labels_true, labels_pred, reweigh=False): 78 | """Pair confusion matrix arising from two clusterings. 79 | The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix 80 | between two clusterings by considering all pairs of samples and counting 81 | pairs that are assigned into the same or into different clusters under 82 | the true and predicted clusterings. 83 | Considering a pair of samples that is clustered together a positive pair, 84 | then as in binary classification the count of true negatives is 85 | :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is 86 | :math:`C_{11}` and false positives is :math:`C_{01}`. 87 | Read more in the :ref:`User Guide `. 88 | Parameters 89 | ---------- 90 | labels_true : array-like of shape (n_samples,), dtype=integral 91 | Ground truth class labels to be used as a reference. 92 | labels_pred : array-like of shape (n_samples,), dtype=integral 93 | Cluster labels to evaluate. 94 | reweigh : bool, default=False 95 | if `True`, reweighs the contingency table based on the true labels 96 | such that they all have equal membership. The total number of samples 97 | is preserved with a round-off error. 98 | Returns 99 | ------- 100 | C : ndarray of shape (2, 2), dtype=np.int64 101 | The contingency matrix. 102 | ------ 103 | Note that the matrix is not symmetric. 104 | ------ 105 | References 106 | ---------- 107 | .. L. Hubert and P. 
Arabie, Comparing Partitions, Journal of 108 | Classification 1985 109 | https://link.springer.com/article/10.1007%2FBF01908075 110 | """ 111 | n_samples = np.int64(labels_true.shape[0]) 112 | 113 | # Computation using the contingency data 114 | contingency = contingency_matrix( 115 | labels_true, labels_pred, reweigh=reweigh, sparse=True, dtype=np.int64 116 | ) 117 | n_c = np.ravel(contingency.sum(axis=1)) 118 | n_k = np.ravel(contingency.sum(axis=0)) 119 | sum_squares = (contingency.data**2).sum() 120 | C = np.empty((2, 2), dtype=np.int64) 121 | C[1, 1] = sum_squares - n_samples 122 | C[0, 1] = contingency.dot(n_k).sum() - sum_squares 123 | C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares 124 | C[0, 0] = n_samples**2 - C[0, 1] - C[1, 0] - sum_squares 125 | return C 126 | -------------------------------------------------------------------------------- /workflow/src/python/imbalanced_clustering/utils/mi.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | 3 | import numpy as np 4 | import scipy.sparse as sp 5 | 6 | from .checks import check_clusterings, check_array 7 | from .contingency import contingency_matrix 8 | 9 | 10 | def entropy(labels): 11 | """Calculates the entropy for a labeling. 12 | Parameters 13 | ---------- 14 | labels : int array, shape = [n_samples] 15 | The labels 16 | Notes 17 | ----- 18 | The logarithm used is the natural logarithm (base-e). 19 | """ 20 | if len(labels) == 0: 21 | return 1.0 22 | label_idx = np.unique(labels, return_inverse=True)[1] 23 | pi = np.bincount(label_idx).astype(np.float64) 24 | pi = pi[pi > 0] 25 | pi_sum = np.sum(pi) 26 | # log(a / b) should be calculated as log(a) - log(b) for 27 | # possible loss of precision 28 | return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum))) 29 | 30 | 31 | def mutual_info_score(labels_true, labels_pred, *, contingency=None): 32 | """Mutual Information between two clusterings. 33 | The Mutual Information is a measure of the similarity between two labels 34 | of the same data. Where :math:`|U_i|` is the number of the samples 35 | in cluster :math:`U_i` and :math:`|V_j|` is the number of the 36 | samples in cluster :math:`V_j`, the Mutual Information 37 | between clusterings :math:`U` and :math:`V` is given as: 38 | .. math:: 39 | MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N} 40 | \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|} 41 | This metric is independent of the absolute values of the labels: 42 | a permutation of the class or cluster label values won't change the 43 | score value in any way. 44 | This metric is furthermore symmetric: switching :math:`U` (i.e 45 | ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the 46 | same score value. This can be useful to measure the agreement of two 47 | independent label assignments strategies on the same dataset when the 48 | real ground truth is not known. 49 | Read more in the :ref:`User Guide `. 50 | Parameters 51 | ---------- 52 | labels_true : int array, shape = [n_samples] 53 | A clustering of the data into disjoint subsets, called :math:`U` in 54 | the above formula. 55 | labels_pred : int array-like of shape (n_samples,) 56 | A clustering of the data into disjoint subsets, called :math:`V` in 57 | the above formula. 58 | contingency : {ndarray, sparse matrix} of shape \ 59 | (n_classes_true, n_classes_pred), default=None 60 | A contingency matrix given by the :func:`contingency_matrix` function. 
61 | If value is ``None``, it will be computed, otherwise the given value is 62 | used, with ``labels_true`` and ``labels_pred`` ignored. 63 | Returns 64 | ------- 65 | mi : float 66 | Mutual information, a non-negative value, measured in nats using the 67 | natural logarithm. 68 | Notes 69 | ----- 70 | The logarithm used is the natural logarithm (base-e). 71 | See Also 72 | -------- 73 | adjusted_mutual_info_score : Adjusted against chance Mutual Information. 74 | normalized_mutual_info_score : Normalized Mutual Information. 75 | """ 76 | if contingency is None: 77 | labels_true, labels_pred = check_clusterings(labels_true, labels_pred) 78 | contingency = contingency_matrix(labels_true, labels_pred, sparse=True) 79 | else: 80 | contingency = check_array( 81 | contingency, 82 | accept_sparse=["csr", "csc", "coo"], 83 | dtype=[int, np.int32, np.int64], 84 | ) 85 | 86 | if isinstance(contingency, np.ndarray): 87 | # For an array 88 | nzx, nzy = np.nonzero(contingency) 89 | nz_val = contingency[nzx, nzy] 90 | elif sp.issparse(contingency): 91 | # For a sparse matrix 92 | nzx, nzy, nz_val = sp.find(contingency) 93 | else: 94 | raise ValueError("Unsupported type for 'contingency': %s" % type(contingency)) 95 | 96 | contingency_sum = contingency.sum() 97 | pi = np.ravel(contingency.sum(axis=1)) 98 | pj = np.ravel(contingency.sum(axis=0)) 99 | log_contingency_nm = np.log(nz_val) 100 | contingency_nm = nz_val / contingency_sum 101 | # Don't need to calculate the full outer product, just for non-zeroes 102 | outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype( 103 | np.int64, copy=False 104 | ) 105 | log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) 106 | mi = ( 107 | contingency_nm * (log_contingency_nm - log(contingency_sum)) 108 | + contingency_nm * log_outer 109 | ) 110 | mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi) 111 | return np.clip(mi.sum(), 0.0, None) 112 | -------------------------------------------------------------------------------- /workflow/src/python/loaders/rna_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy 4 | import scipy.io as sio 5 | import scanpy as sc 6 | import anndata as ann 7 | 8 | def adata -------------------------------------------------------------------------------- /workflow/src/python/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .integrate import Integration 2 | from .integrate_ti import IntegrationPAGA 3 | from .seurat_integrate import SeuratIntegrate 4 | from .liger_integrate import LigerIntegrate 5 | from .clustering import cluster_num, leiden_clip, cluster_membership 6 | from .sample import downsample 7 | from .diffexp import diffexp, dge_top_n, set_concordance 8 | from .cluster_concordance import cluster_concordance 9 | from .kmeans import faiss_kmeans 10 | from .seurat_reference_mapping import SeuratReferenceMap 11 | from .mnn import mutual_nn, find_mutual_nn, find_knn, cross_data_knn 12 | from .relatedness import relatedness_score 13 | from .umap import Umap -------------------------------------------------------------------------------- /workflow/src/python/utils/cluster_concordance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics 4 | 5 | def cluster_concordance(adata): 6 | # Pull out clustering values per subset 7 | methods = 
["harmony", "scvi", "bbknn", "scanorama", "seurat", "liger"] 8 | adata_subsets = [] 9 | for method in methods: 10 | adata_subsets.append(adata[adata.obs["integration_method"] == method]) 11 | cluster_subsets = [ 12 | adata_subset.obs["leiden"].__array__() for adata_subset in adata_subsets 13 | ] 14 | 15 | # Get ARI values across subsets 16 | ari_vals_mat = np.zeros((len(cluster_subsets), len(cluster_subsets))) 17 | for i, clusters_i in enumerate(cluster_subsets): 18 | for j, clusters_j in enumerate(cluster_subsets): 19 | ari_val = metrics.adjusted_rand_score(clusters_i, clusters_j) 20 | ari_vals_mat[i, j] = ari_val 21 | 22 | # Create dataframe of values 23 | ari_val_df = pd.DataFrame(ari_vals_mat) 24 | ari_val_df.index = methods 25 | ari_val_df.columns = methods 26 | 27 | # Get global ARI value (median) 28 | ari_vals_mat_nan_diag = np.zeros((len(cluster_subsets), len(cluster_subsets))) 29 | for i, clusters_i in enumerate(cluster_subsets): 30 | for j, clusters_j in enumerate(cluster_subsets): 31 | if i == j: 32 | ari_vals_mat_nan_diag[i, j] = np.nan 33 | else: 34 | ari_val = metrics.adjusted_rand_score(clusters_i, clusters_j) 35 | ari_vals_mat_nan_diag[i, j] = ari_val 36 | 37 | ari_vals_mat_no_diag = ari_vals_mat_nan_diag[~np.isnan(ari_vals_mat_nan_diag)] 38 | median_ari = np.median(ari_vals_mat_no_diag) 39 | 40 | # Convert concordance dataframe to long format 41 | ari_val_df_long = ari_val_df.melt(ignore_index = False) 42 | ari_val_df_long = ari_val_df_long.reset_index() 43 | ari_val_df_long.columns = ["Method 1", "Method 2", "ARI"] 44 | 45 | # Append median ARI value to dataframe and return 46 | ari_val_df_long["Median ARI"] = median_ari 47 | 48 | return ari_val_df_long -------------------------------------------------------------------------------- /workflow/src/python/utils/clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scanpy as sc 3 | import anndata as ann 4 | 5 | def cluster_num(adata): 6 | clusters_unique = np.unique(adata.obs.leiden.__array__()) 7 | clusters_len = len(clusters_unique) 8 | return clusters_len 9 | 10 | def cluster_membership(adata): 11 | clusters_unique = np.unique(adata.obs.leiden.__array__()) 12 | clusters_membership = dict() 13 | for i in clusters_unique: 14 | clusters_membership[i] = len(adata.obs[adata.obs.leiden == i]) 15 | return clusters_membership 16 | 17 | def leiden_clip(adata, num_clusters, step_size = 0.05): 18 | counter = 0 19 | leiden_resolution = 1 20 | while cluster_num(adata) != num_clusters: 21 | if cluster_num(adata) < num_clusters: 22 | leiden_resolution += step_size 23 | sc.tl.leiden(adata, resolution = leiden_resolution) 24 | elif cluster_num(adata) > num_clusters: 25 | leiden_resolution -= step_size 26 | sc.tl.leiden(adata, resolution = leiden_resolution) 27 | counter += 1 28 | if counter > 100: 29 | raise Exception( 30 | "Attempted more than 100 iterations - convergence not possible, set lower step size" 31 | ) 32 | return adata 33 | -------------------------------------------------------------------------------- /workflow/src/python/utils/diffexp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scanpy as sc 4 | import anndata as ann 5 | 6 | def diffexp(adata, groupby, **kwargs): 7 | """ 8 | Perform differential expression analysis on an AnnData object. 9 | 10 | Args: 11 | adata (AnnData):Annotated data matrix object. 
12 | groupby (str): The column name of the dataframe to group by. 13 | **kwargs: Keyword arguments to be passed to scanpy.tl.rank_genes_groups. 14 | 15 | Returns: 16 | adata (AnnData): Annotated data matrix object with differential expression analysis results. 17 | """ 18 | sc.tl.rank_genes_groups(adata, groupby = groupby, **kwargs) 19 | return adata 20 | 21 | def dge_top_n(adata, n, obs_group): 22 | """ 23 | Return clusters and genes with the top n differential expression. 24 | 25 | Args: 26 | adata (AnnData): Annotated data matrix object. 27 | n (int): The number of top differentially expressed genes to return. 28 | obs_group (str): The column name in obs of adata object to group by. 29 | Returns: 30 | data (DataFrame): A dataframe with the top n differentially expressed genes in each cluster. 31 | """ 32 | unique_groups = np.sort(np.unique(adata.obs[obs_group].__array__())) 33 | unique_group_top_n_dges = [] 34 | for group in unique_groups: 35 | score_df = sc.get.rank_genes_groups_df(adata, group = group) 36 | score_df_sorted = score_df.sort_values(["pvals_adj"], ascending = True) 37 | top_n_dges = score_df_sorted[0:n]["names"].__array__() 38 | unique_group_top_n_dges.append(top_n_dges) 39 | unique_groups_long = np.repeat(unique_groups, n) 40 | 41 | group_dges_df_n = pd.DataFrame({ 42 | "Cluster": unique_groups_long, 43 | "Top {n} DGEs".format(n = n): np.concatenate(unique_group_top_n_dges) 44 | }) 45 | return group_dges_df_n 46 | 47 | def set_concordance(*args): 48 | """Determines number of overlapping elements between n sets. 49 | 50 | Args: 51 | *args: A list of n sets. 52 | 53 | Returns: 54 | concordance (int): The number of overlapping elements between n sets. 55 | """ 56 | concordance = len(set.intersection(*args)) 57 | return concordance 58 | -------------------------------------------------------------------------------- /workflow/src/python/utils/integrate.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import gc 3 | import random 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scanpy as sc 8 | import anndata as ann 9 | import scvi 10 | import bbknn 11 | import torch 12 | 13 | # Undoing scvi's random seed setting 14 | random.seed(None) 15 | np.random.seed(None) 16 | torch.manual_seed(random.randint(1, 10000000000000000000)) 17 | 18 | from utils.seurat_integrate import SeuratIntegrate 19 | from utils.liger_integrate import LigerIntegrate 20 | 21 | class Integration: 22 | """Class for integrating scRNA-seq data and returning processed data.""" 23 | 24 | def __init__(self, adata, gpu = True): 25 | """ 26 | Args: 27 | adata (AnnData): AnnData object to be utilized in integration methods. 28 | Assumes that the counts being input are unnormalized (raw counts), 29 | and that raw counts are stored in "counts" layer, and batch covariate 30 | is available. 31 | gpu (bool): Whether or not to use GPU for scVI. 32 | """ 33 | self.adata = adata 34 | # Check anndata object 35 | if not isinstance(adata, ann.AnnData): 36 | raise Exception("Please input an AnnData object.") 37 | # Check if gpu is available 38 | if gpu is True: 39 | if torch.cuda.is_available(): 40 | self.gpu = True 41 | else: 42 | raise Exception("GPU not available. Please set gpu = False.") 43 | else: 44 | self.gpu = False 45 | 46 | def scvi_integrate(self, n_neighbors = 15, n_pcs = 20): 47 | print("Performing scVI integration.." 
+ "\n") 48 | ascvi = self.adata.copy() 49 | scvi.data.setup_anndata(ascvi, batch_key = "batch") 50 | vae = scvi.model.SCVI(ascvi) 51 | vae.train(use_gpu = self.gpu) 52 | ascvi.obsm["X_scVI"] = vae.get_latent_representation() 53 | ascvi.obsm["X_kmeans"] = ascvi.obsm["X_scVI"][:, 0:n_pcs] 54 | sc.pp.neighbors( 55 | ascvi, 56 | n_neighbors = n_neighbors, 57 | n_pcs = n_pcs, 58 | use_rep = "X_scVI" 59 | ) 60 | sc.tl.leiden(ascvi) 61 | sc.tl.umap(ascvi) 62 | print("Done!" + "\n") 63 | return ascvi 64 | 65 | def harmony_integrate(self, n_neighbors = 15, n_pcs = 20, num_hvgs = 2500): 66 | print("Performing Harmony integration.." + "\n") 67 | aharmony = self.adata.copy() 68 | sc.pp.normalize_total( 69 | aharmony, 70 | target_sum = 1e4 71 | ) 72 | sc.pp.log1p(aharmony) 73 | sc.pp.highly_variable_genes( 74 | aharmony, 75 | n_top_genes = num_hvgs, 76 | flavor = "seurat" 77 | ) 78 | sc.pp.pca(aharmony, svd_solver="arpack") 79 | sc.external.pp.harmony_integrate( 80 | aharmony, 81 | key = "batch", 82 | random_state = None 83 | ) 84 | sc.pp.neighbors( 85 | aharmony, 86 | n_neighbors = n_neighbors, 87 | n_pcs = n_pcs, 88 | use_rep = "X_pca_harmony" 89 | ) 90 | aharmony.obsm["X_kmeans"] = aharmony.obsm["X_pca_harmony"][:, 0:n_pcs] 91 | sc.tl.leiden(aharmony) 92 | sc.tl.umap(aharmony) 93 | print("Done!" + "\n") 94 | return aharmony 95 | 96 | def bbknn_integrate(self, n_pcs = 20, num_hvgs = 2500, metric = "euclidean"): 97 | print("Performing BBKNN integration.." + "\n") 98 | abbknn = self.adata.copy() 99 | sc.pp.normalize_total( 100 | abbknn, 101 | target_sum = 1e4 102 | ) 103 | sc.pp.log1p(abbknn) 104 | sc.pp.highly_variable_genes( 105 | abbknn, 106 | n_top_genes = num_hvgs, 107 | flavor = "seurat" 108 | ) 109 | sc.pp.pca(abbknn, svd_solver = "arpack") 110 | if metric == "euclidean": 111 | bbknn.bbknn( 112 | abbknn, 113 | approx = False, 114 | metric = "euclidean", 115 | batch_key = "batch", 116 | n_pcs = n_pcs, 117 | pynndescent_random_state = None 118 | ) 119 | elif metric == "angular": 120 | bbknn.bbknn( 121 | abbknn, 122 | approx = True, 123 | metric = "angular", 124 | batch_key = "batch", 125 | n_pcs = n_pcs, 126 | pynndescent_random_state = None 127 | ) 128 | else: 129 | raise Exception( 130 | "Please enter either 'euclidean' or 'angular' for 'metric'" 131 | ) 132 | # Add placeholder for kmeans 133 | abbknn.obsm["X_kmeans"] = np.ones(( 134 | abbknn.obsm["X_pca"].shape[0], 135 | abbknn.obsm["X_pca"].shape[1] 136 | )) 137 | sc.tl.leiden(abbknn) 138 | sc.tl.umap(abbknn) 139 | print("Done!" + "\n") 140 | return abbknn 141 | 142 | def scanorama_integrate(self, n_neighbors = 15, n_pcs = 20, num_hvgs = 2500): 143 | print("Performing Scanorama integration.." + "\n") 144 | ascanorama = self.adata.copy() 145 | sc.pp.normalize_total( 146 | ascanorama, 147 | target_sum = 1e4 148 | ) 149 | sc.pp.log1p(ascanorama) 150 | sc.pp.highly_variable_genes( 151 | ascanorama, 152 | n_top_genes = num_hvgs, 153 | flavor = "seurat" 154 | ) 155 | sc.pp.pca(ascanorama, svd_solver="arpack") 156 | sc.external.pp.scanorama_integrate( 157 | ascanorama, 158 | key = "batch" 159 | ) 160 | sc.pp.neighbors( 161 | ascanorama, 162 | n_neighbors = n_neighbors, 163 | n_pcs = n_pcs, 164 | use_rep = "X_scanorama" 165 | ) 166 | ascanorama.obsm["X_kmeans"] = ascanorama.obsm["X_scanorama"][:, 0:n_pcs] 167 | sc.tl.leiden(ascanorama) 168 | sc.tl.umap(ascanorama) 169 | print("Done!" + "\n") 170 | return ascanorama 171 | 172 | def seurat_integrate(self,int_type = "CCA", n_neighbors = 15, n_pcs = 20): 173 | print("Performing Seurat integration.." 
+ "\n") 174 | aseurat = self.adata.copy() 175 | sc.pp.normalize_total( 176 | aseurat, 177 | target_sum = 1e4 178 | ) 179 | sc.pp.log1p(aseurat) 180 | seurat_integrate = SeuratIntegrate( 181 | adata = aseurat, 182 | int_type = int_type 183 | ) 184 | aseurat_int = seurat_integrate.integrate() # Create seurat integrated anndata object 185 | sc.pp.pca(aseurat_int, svd_solver = "arpack") 186 | sc.pp.neighbors( 187 | aseurat_int, 188 | n_neighbors = n_neighbors, 189 | n_pcs = n_pcs 190 | ) 191 | sc.tl.leiden(aseurat_int) 192 | sc.tl.umap(aseurat_int) 193 | # Append seurat integrated data to original adata object 194 | aseurat.obs["leiden"] = aseurat_int.obs["leiden"] 195 | aseurat.obsm["X_pca"] = aseurat_int.obsm["X_pca"] 196 | aseurat.obsm["X_kmeans"] = aseurat_int.obsm["X_pca"][:, 0:n_pcs] 197 | aseurat.obsm["X_umap"] = aseurat_int.obsm["X_umap"] 198 | aseurat.obsm["seurat_hvg"] = aseurat_int.X 199 | print("Done!" + "\n") 200 | return aseurat 201 | 202 | def liger_integrate(self, n_neighbors = 15, n_pcs = 20): 203 | print("Performing LIGER integration.." + "\n") 204 | aliger = self.adata.copy() 205 | # Don't normalize for LIGER (R script normalizes) 206 | # sc.pp.normalize_total( 207 | # aliger, 208 | # target_sum = 1e4 209 | # ) 210 | # Don't log-transform for LIGER 211 | # sc.pp.log1p(aliger) 212 | liger_integrate = LigerIntegrate( 213 | adata = aliger, 214 | ) 215 | aliger = liger_integrate.integrate() # Substitute liger integrated anndata object 216 | sc.pp.neighbors( 217 | aliger, 218 | n_neighbors = n_neighbors, 219 | n_pcs = n_pcs, 220 | use_rep = "X_liger" 221 | ) 222 | aliger.obsm["X_kmeans"] = aliger.obsm["X_liger"][:, 0:n_pcs] 223 | sc.tl.leiden(aliger) 224 | sc.tl.umap(aliger) 225 | print("Done!" + "\n") 226 | return aliger -------------------------------------------------------------------------------- /workflow/src/python/utils/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import faiss 3 | 4 | # Perform k-means clustering using Faiss 5 | def faiss_kmeans(adata, k, niter = 300, nredo = 10, 6 | min_points_per_centroid = 5): 7 | """Function to perform k-means clustering using FAISS on AnnData objects. 8 | 9 | Args: 10 | adata (AnnData): An object of AnnData class with highly variable gene selection 11 | performed. 12 | k (int): Number of clusters to form. 13 | niter (int): Number of iterations to run k-means. Defaults to 300. 14 | nredo (int): Number of times to run k-means - selects best result. 15 | Defaults to 10. 16 | min_points_per_centroid (int): Minimum number of points per k-means 17 | centroid. Defaults to 5. 
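Returns:
    adata (AnnData): The input AnnData object with k-means cluster labels (as strings) stored in adata.obs["kmeans_faiss"].
    k (int): The final number of clusters used - this can be lower than the requested k, since k is decremented whenever a cluster falls below min_points_per_centroid.

Example (illustrative sketch only - assumes an AnnData object whose .obsm["X_kmeans"] slot has already been populated, e.g. by one of the Integration class methods):
    adata_clustered, k_used = faiss_kmeans(adata, k = 10)
    adata_clustered.obs["kmeans_faiss"].value_counts()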
18 | """ 19 | # Subset data to kmeans reduction to utilize for clustering 20 | reduction_sub = adata.obsm["X_kmeans"] 21 | 22 | # Run k-means using faiss given options 23 | kmeans_faiss = faiss.Kmeans( 24 | d = reduction_sub.shape[1], 25 | k = k, 26 | niter = niter, 27 | nredo = nredo, 28 | min_points_per_centroid = min_points_per_centroid 29 | ) 30 | kmeans_faiss.train(np.ascontiguousarray(reduction_sub, dtype = np.float32)) 31 | kmeans_faiss_labels = np.concatenate( 32 | kmeans_faiss.index.search( 33 | np.ascontiguousarray(reduction_sub, dtype = np.float32), 1 34 | )[1] 35 | ) 36 | 37 | # Check if any clusters have less than the min required members and redo clustering with less 38 | unique_labels, counts = np.unique(kmeans_faiss_labels, return_counts = True) 39 | while any(counts < min_points_per_centroid): 40 | k -= 1 41 | kmeans_faiss = faiss.Kmeans( 42 | d = reduction_sub.shape[1], 43 | k = k, 44 | niter = niter, 45 | nredo = nredo, 46 | min_points_per_centroid = min_points_per_centroid 47 | ) 48 | kmeans_faiss.train(np.ascontiguousarray(reduction_sub, dtype = np.float32)) 49 | kmeans_faiss_labels = np.concatenate( 50 | kmeans_faiss.index.search( 51 | np.ascontiguousarray(reduction_sub, dtype = np.float32), 1 52 | )[1] 53 | ) 54 | unique_labels, counts = np.unique(kmeans_faiss_labels, return_counts = True) 55 | 56 | # Append kmeans labels to AnnData object 57 | kmeans_faiss_labels_str = kmeans_faiss_labels.astype("str") 58 | adata.obs['kmeans_faiss'] = kmeans_faiss_labels_str 59 | 60 | # Return AnnData object and kmeans number 61 | return adata, k -------------------------------------------------------------------------------- /workflow/src/python/utils/liger_integrate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import uuid 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy as sp 9 | import anndata as ann 10 | import scanpy as sc 11 | 12 | class LigerIntegrate: 13 | """ 14 | Class for interpolating between the Integration class and R-script 15 | based integration of RNA-seq batches using the LIGER R package. 16 | Integration is done on data output to a temporary file from the 17 | Integration class through and RScript, which then outputs a temporary 18 | file reread into python code and used to substitute the unintegrated data. 19 | Integration is performed using 20 latent components in the NMF factorization, 20 | or 20 "metagenes". 21 | """ 22 | def __init__(self, adata): 23 | """ 24 | Args: 25 | adata (object): An instance of an anndata class corresponding to the liger 26 | subset from the Integration class. 
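Example (illustrative sketch only - assumes the code is run from the workflow directory so that src/R/liger_integrate.R is on the relative path, and that the LIGER R dependencies are installed):
    liger = LigerIntegrate(adata = adata_subset)
    adata_integrated = liger.integrate()
    adata_integrated.obsm["X_liger"]  # normalized per-cell factor loadings from LIGER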
27 | """ 28 | self.adata = adata.copy() 29 | self.adata_copy = adata.copy() # Keep copy for later referencing 30 | 31 | def _format(self): 32 | # Append a column on gene names 33 | self.adata.var["gene"] = self.adata.var_names 34 | # Remove layers and raw from AnnData object (avoid conflicts with h5seurat) 35 | self.adata.layers = None 36 | self.adata.raw = None 37 | 38 | def _output_temp_h5ad(self): 39 | # Check if temp exists, if not, make dir 40 | if not os.path.exists("tmp"): 41 | os.makedirs("tmp") 42 | 43 | # Output temporary file with data 44 | self.filename = ''.join(str(uuid.uuid4()).split("-")) 45 | self.file = "{filename}.h5ad".format(filename = self.filename) 46 | self.adata.write_h5ad(os.path.join("tmp", self.file)) 47 | 48 | def _liger_integrate(self): 49 | # Call subprocess and call R script 50 | tempfile_script = \ 51 | "Rscript src/R/liger_integrate.R tmp/{tempfile} {tempfile_name} --verbose".format( 52 | tempfile = self.file, 53 | tempfile_name = self.filename 54 | ) 55 | 56 | self.sp_integrate = subprocess.run(tempfile_script, shell = True, text = True, capture_output = True) 57 | if self.sp_integrate.returncode != 0: 58 | raise Exception( 59 | "Subprocess call returned nonzero exit code - call: {call} \n Output: {output}".format( 60 | call = self.sp_integrate.stderr, 61 | output = self.sp_integrate.stdout 62 | ) 63 | ) 64 | 65 | def _return_integrated(self): 66 | # Get liger output file and read it into memory as anndata object 67 | self.liger_outfile = "{filename}_liger_out.h5ad".format(filename = self.filename) 68 | adata_liger = sc.read_h5ad( 69 | os.path.join("tmp", self.liger_outfile) 70 | ) 71 | 72 | # Read in cell-specific loadings from anndata object and convert to array 73 | norm_loadings_arr = adata_liger.X.toarray() 74 | 75 | # Add normalized loadings to anndata object (original copy) 76 | self.adata_copy.obsm["X_liger"] = norm_loadings_arr 77 | 78 | # Return integrated AnnData object 79 | return self.adata_copy 80 | 81 | def _clean_files(self): 82 | # Remove temporary python and liger files 83 | tmp_files = os.listdir("tmp") 84 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 85 | for f in tmp_files_instance: 86 | os.remove(os.path.join("tmp", f)) 87 | 88 | # Check if all files related to the filename are removed from folder 89 | tmp_files = os.listdir("tmp") 90 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 91 | if len(tmp_files_instance) > 0: 92 | raise Exception( 93 | "Temporary file cleanup incomplete - files remain in folder" 94 | ) 95 | 96 | def integrate(self): 97 | # Perform workflow and return integrated anndata object 98 | self._format() 99 | self._output_temp_h5ad() 100 | self._liger_integrate() 101 | integrated_adata = self._return_integrated() 102 | self._clean_files() 103 | 104 | return integrated_adata -------------------------------------------------------------------------------- /workflow/src/python/utils/mnn.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | 3 | import faiss 4 | import numpy as np 5 | 6 | def find_mutual_nn(data_list, k = 15): 7 | """Gets mutual nearest neighbors pairs across all datasets. 8 | 9 | Using each data subset in data_list, gets mutual nearest neighbors by 10 | considering the intersection of MNNs across all datasets. 11 | 12 | Args: 13 | data_list (list): List of numpy arrays corresponding to data subsets. Datasets 14 | must have an equivalent number of features. 
15 | k (integer): Positive integer value indicating how many neighbors to consider 16 | in the mutual-nearest-neighbors algorithm. Default value is 15. 17 | 18 | Returns: 19 | mnn_1_concat (array): Array of values corresponding to query MNN indices (MNN_1) that can be in 20 | any dataset, and are indexed based on the concatenated representation of all datasets in 21 | dataset_list. 22 | mnn_2_concat (array): Array of values corresponding to query-value MNN indices (MNN_2) that can 23 | be in any dataset, and are indexed based on the concatenated representation of all datasets in 24 | dataset_list. 25 | """ 26 | # Get lengths of all datasets for reindexing 27 | data_lens = [len(dataset) for dataset in data_list] 28 | 29 | # Get all indices 30 | indices = [i for i in range(len(data_list))] 31 | 32 | # Create combinations for all indices 33 | index_combo_iter = combinations(indices, 2) 34 | index_combos = [combo for combo in index_combo_iter] 35 | 36 | # Iterate over combinations and record mnn pairs - append to both lists 37 | mnn_1_list = [] 38 | mnn_2_list = [] 39 | for combo in index_combos: 40 | idx_1 = combo[0] 41 | idx_2 = combo[1] 42 | data_1 = data_list[idx_1] 43 | data_2 = data_list[idx_2] 44 | mnn_1, mnn_2 = mutual_nn(data_1, data_2, k1=k, k2=k) 45 | if idx_1 == 0: 46 | pass 47 | else: 48 | len_addition = sum(data_lens[0:idx_1]) 49 | mnn_1 = mnn_1 + len_addition 50 | if idx_2 == 0: 51 | pass 52 | else: 53 | len_addition = sum(data_lens[0:idx_2]) 54 | mnn_2 = mnn_2 + len_addition 55 | mnn_concat_1 = np.concatenate((mnn_1, mnn_2)) 56 | mnn_concat_2 = np.concatenate((mnn_2, mnn_1)) 57 | mnn_1_list.append(mnn_concat_1) 58 | mnn_2_list.append(mnn_concat_2) 59 | 60 | # Concatenate MNNs in mnn_1 and mnn_2 into one array and return 61 | mnn_1_concat = np.concatenate(mnn_1_list) 62 | mnn_2_concat = np.concatenate(mnn_2_list) 63 | return mnn_1_concat, mnn_2_concat 64 | 65 | 66 | def mutual_nn(data_1, data_2, k1, k2): 67 | """Given two datasets, gets and returns mutual nearest neighbors. 68 | 69 | Args: 70 | data_1 (array): Data array 1 that is used to create the graph representing 71 | dataset 1. Dataset 1 and 2 must have the same number of features. 72 | data_2 (array): Data array 2 that is used to create the graph representing 73 | dataset 2. Dataset 1 and 2 must have the same number of features. 74 | k1 (integer): Positive integer value indicating how many neighbors to consider 75 | in the mutual-nearest-neighbors algorithm for dataset 1. 76 | k2 (integer): Positive integer value indicating how many neighbors to consider 77 | in the mutual-nearest-neighbors algorithm for dataset 2. 78 | 79 | Returns: 80 | mutual_1_arr (array): Array of mutual-nearest neighbors corresponding to indices in dataset 1. 81 | mutual_2_arr (array): Array of mutual-nearest neighbors corresponding to indices in dataset 2.
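Example (illustrative sketch with random placeholder data):
    import numpy as np
    x = np.random.rand(200, 20)
    y = np.random.rand(150, 20)
    mnn_x, mnn_y = mutual_nn(x, y, k1 = 15, k2 = 15)
    # mnn_x[i] (an index into x) and mnn_y[i] (an index into y) form one MNN pair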
82 | """ 83 | 84 | data_1 = np.ascontiguousarray(data_1, dtype = np.float32) 85 | data_2 = np.ascontiguousarray(data_2, dtype = np.float32) 86 | 87 | index_1 = faiss.IndexFlatL2(data_1.shape[1]) 88 | index_2 = faiss.IndexFlatL2(data_2.shape[1]) 89 | 90 | index_1.add(data_1) 91 | index_2.add(data_2) 92 | 93 | d_index_1, k_index_1 = index_1.search(data_2, k1) 94 | d_index_2, k_index_2 = index_2.search(data_1, k2) 95 | 96 | mutual_1 = [] 97 | mutual_2 = [] 98 | for index_2 in range(data_2.shape[0]): 99 | for index_1 in k_index_1[index_2]: 100 | if index_2 in k_index_2[index_1]: 101 | mutual_1.append(index_1) 102 | mutual_2.append(index_2) 103 | mutual_1_arr = np.asarray(mutual_1) 104 | mutual_2_arr = np.asarray(mutual_2) 105 | return mutual_1_arr, mutual_2_arr 106 | 107 | def cross_data_knn(data_1, data_2, k): 108 | """Given two datasets, gets and returns KNN of dataset 1 in dataset 2. 109 | 110 | data_1 (array): Data array 1 that is used to create the graph representing 111 | dataset 1. Dataset 1 and 2 must have the same number of features. 112 | data_2 (array): Data array 2 that is used to create the graph representing 113 | dataset 2. Dataset 1 and 2 must have the same number of features. 114 | k (integer): Positive integer value indicating how many neighbors to consider 115 | in the cross-data knn lookup. 116 | 117 | Returns: 118 | knn_arr (array): Array of k-nearest neighbors corresponding to indices in dataset 1, 119 | with indices in dataset 2. 120 | """ 121 | data_1 = np.ascontiguousarray(data_1, dtype = np.float32) 122 | data_2 = np.ascontiguousarray(data_2, dtype = np.float32) 123 | 124 | index_2 = faiss.IndexFlatL2(data_2.shape[1]) 125 | index_2.add(data_2) 126 | d_index_2, k_index_2 = index_2.search(data_1, k) 127 | 128 | return k_index_2 129 | 130 | 131 | def find_knn(data_list, k = 15): 132 | """Gets k nearest-neighbors for all datasets. 133 | 134 | Using each data subset in data_list, gets k-nearest-neighbors for each subset and 135 | returns concatenated k-nearest-neighbors for each index across all datasets. 136 | 137 | Args: 138 | data_list (list): List of numpy arrays corresponding to data subsets. Datasets 139 | must have an equivalent number of features. 140 | k (integer): Positive integer value indicating how many neighbors to consider 141 | in the k-nearest-neighbors algorithm. Default value is 15. 142 | 143 | Returns: 144 | knn_concat (array): Array of values corresponding to query KNN indices for all 145 | datasets in data_list, indexed based on their concatenation. Will return 146 | k nearest-neighbors at query position for each index from input data.
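Example (illustrative sketch with random placeholder batches):
    import numpy as np
    batches = [np.random.rand(100, 20), np.random.rand(50, 20)]
    knn_concat = find_knn(batches, k = 15)
    knn_concat.shape  # (150, 15) - within-batch neighbors for each cell, indexed on the concatenation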
147 | """ 148 | # Get lengths of all datasets for reindexing 149 | data_lens = [len(dataset) for dataset in data_list] 150 | 151 | # Get all indices 152 | indices = [i for i in range(len(data_list))] 153 | 154 | # Get knn pairs for each dataset and reindex as necessary 155 | knn_list = [] 156 | for idx in indices: 157 | dataset = data_list[idx] 158 | dataset = np.ascontiguousarray(dataset, dtype = np.float32) 159 | index = faiss.IndexFlatL2(dataset.shape[1]) 160 | index.add(dataset) 161 | knn_vals, knn = index.search(dataset, k) 162 | if idx == 0: 163 | knn_corrected = knn 164 | knn_list.append(knn_corrected) 165 | else: 166 | knn_corrected = [] 167 | len_addition = sum(data_lens[0:idx]) 168 | for i in range(len(knn)): 169 | knn_corrected.append(knn[i] + len_addition) 170 | knn_corrected_arr = np.asarray(knn_corrected) 171 | knn_list.append(knn_corrected_arr) 172 | 173 | # Concatenate all KNNs corresponding to all dataset indices and return 174 | knn_concat = np.concatenate(knn_list) 175 | return knn_concat -------------------------------------------------------------------------------- /workflow/src/python/utils/relatedness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.spatial as sp 4 | import scanpy as sc 5 | 6 | def relatedness_score(adata, pca_performed = True): 7 | """Computes the relatedeness between celltypes using cosine distance in PCA space 8 | 9 | Args: 10 | adata (AnnData): AnnData object containing relevant count information with celltype 11 | and batch observations. 12 | pca_performed (bool): True or False value indicating whether PCA decomposition steps 13 | have been performed already for AnnData object. Default is True. 14 | """ 15 | # Perform PCA if not already performed 16 | if pca_performed is False: 17 | sc.pp.normalize_total(adata, target_sum=1e4) 18 | sc.pp.log1p(adata) 19 | sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500) 20 | sc.pp.pca(adata) 21 | 22 | # Get the batch and celltype information from AnnData object 23 | batch_vals = np.unique(adata.obs["batch"].__array__()) 24 | if len(batch_vals) > 1: 25 | raise ValueError("More than one batch found in AnnData object") 26 | batch = batch_vals[0] 27 | celltypes = np.unique(adata.obs["celltype"].__array__()) 28 | 29 | # Utilize the cosine distance between the average PCA embedding for celltype i and 30 | # average PCA embedding for celltype j 31 | pca_top_20 = adata.obsm["X_pca"][:, 0:20] 32 | top_20_pc_weights = adata.uns["pca"]["variance_ratio"][0:20] 33 | celltype_is = [] 34 | celltype_js = [] 35 | pca_cosine_dists = [] 36 | for celltype_i in celltypes: 37 | for celltype_j in celltypes: 38 | celltype_is.append(celltype_i) 39 | celltype_js.append(celltype_j) 40 | pca_celltype_i = pca_top_20[ 41 | adata.obs.celltype == celltype_i 42 | ] 43 | pca_celltype_j = pca_top_20[ 44 | adata.obs.celltype == celltype_j 45 | ] 46 | pca_celltype_i_avg = np.sum(pca_celltype_i, axis = 0)/len(pca_celltype_i) 47 | pca_celltype_j_avg = np.sum(pca_celltype_j, axis = 0)/len(pca_celltype_j) 48 | pca_cosine_dist = sp.distance.cosine( 49 | pca_celltype_i_avg, 50 | pca_celltype_j_avg, 51 | w = top_20_pc_weights 52 | ) 53 | pca_cosine_dists.append(pca_cosine_dist) 54 | 55 | # Gather the cosine distance results in a dataframe and return 56 | dist_results_df = pd.DataFrame({ 57 | "Celltype 1": celltype_is, 58 | "Celltype 2": celltype_js, 59 | "PCA cosine dist": pca_cosine_dists, 60 | "Batch": batch 61 | }) 62 | return 
dist_results_df -------------------------------------------------------------------------------- /workflow/src/python/utils/sample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scanpy as sc 3 | import anndata as ann 4 | 5 | def downsample(adata, num_celltypes = None, celltype_names = None, proportion = 0.5): 6 | # Initialize random number generator 7 | rng = np.random.default_rng() 8 | 9 | # For the given number of celltypes, select num_celltypes 10 | # randomly, unless non random indicated by celltype_names 11 | if celltype_names is not None: 12 | celltypes_sample = celltype_names 13 | else: 14 | if num_celltypes is None: 15 | raise ValueError( 16 | "num_celltypes and celltype_names cannot both be None" 17 | ) 18 | if num_celltypes == 0: 19 | celltypes_sample = "None" 20 | return adata, celltypes_sample 21 | unique_celltypes = np.unique(adata.obs["celltype"].__array__()) 22 | rng.shuffle(unique_celltypes) 23 | celltypes_sample = rng.choice(unique_celltypes, num_celltypes, replace = False) 24 | 25 | # Save the original batch label for later 26 | adata.obs["batch_orig"] = adata.obs["batch"] 27 | 28 | # Downsample selected celltypes by given proportion 29 | for celltype in celltypes_sample: 30 | adata_celltype = adata[adata.obs["celltype"] == celltype] 31 | adata_noncelltype = adata[adata.obs["celltype"] != celltype] 32 | if proportion == 0: 33 | adata = adata_noncelltype 34 | continue 35 | adata_celltype_ds = sc.pp.subsample( 36 | adata_celltype, 37 | fraction = proportion, 38 | random_state = None, 39 | copy = True 40 | ) 41 | adata = ann.AnnData.concatenate(adata_noncelltype, adata_celltype_ds) 42 | 43 | # Replace batch column with batch original and drop batch_orig 44 | adata.obs["batch"] = adata.obs["batch_orig"] 45 | adata.obs.drop("batch_orig", axis = 1, inplace = True) 46 | 47 | # Return downsampled data + sampled celltypes 48 | return adata, celltypes_sample -------------------------------------------------------------------------------- /workflow/src/python/utils/seurat_integrate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import uuid 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy as sp 9 | import anndata as ann 10 | import scanpy as sc 11 | 12 | class SeuratIntegrate: 13 | """ 14 | Class for interpolating between the Integration class and R-script 15 | based integration of RNA-seq batches using the Seurat4.0 package. 16 | Integration is done on data output to a temporary file from the 17 | Integration class through and RScript, which then outputs a temporary 18 | file reread into python code and used to substitute the unintegrated data. 19 | Uses the first 20 components of CCA/RPCA space to integrate and get correction 20 | vectors for correcting whole data matrix. 21 | """ 22 | def __init__(self, adata, int_type = "CCA"): 23 | """ 24 | Args: 25 | adata (object): An instance of an anndata class corresponding to the seurat 26 | subset from the Integration class. 27 | int_type (string): Either "CCA" or "RPCA", indicating which Seurat workflow to 28 | utilize for integration - canonical correlation analysis and reciprocal PCA, 29 | respectively. RPCA should be used for larger datasets to avoid out-of-memory 30 | exceptions. 
Details on both workflows can be found at: 31 | https://satijalab.org/seurat/articles/integration_introduction.html 32 | https://satijalab.org/seurat/articles/integration_rpca.html 33 | """ 34 | self.adata = adata.copy() 35 | self.int_type = int_type 36 | 37 | def _format(self): 38 | # Append a column on gene names 39 | self.adata.var["gene"] = self.adata.var_names 40 | # Remove layers and raw from AnnData object (avoid conflicts with h5seurat) 41 | self.adata.layers = None 42 | self.adata.raw = None 43 | 44 | def _output_temp_h5ad(self): 45 | # Check if temp exists, if not, make dir 46 | if not os.path.exists("tmp"): 47 | os.makedirs("tmp") 48 | 49 | # Output temporary file with data 50 | self.filename = ''.join(str(uuid.uuid4()).split("-")) 51 | self.file = "{filename}.h5ad".format(filename = self.filename) 52 | self.adata.write_h5ad(os.path.join("tmp", self.file)) 53 | 54 | def _seurat_integrate(self): 55 | # Call subprocess and call R script 56 | tempfile_script = \ 57 | "Rscript src/R/seurat_integrate.R tmp/{tempfile} {tempfile_name} {int_type} --verbose".format( 58 | tempfile = self.file, 59 | tempfile_name = self.filename, 60 | int_type = self.int_type 61 | ) 62 | 63 | self.sp_integrate = subprocess.run(tempfile_script, shell = True, text = True, capture_output = True) 64 | if self.sp_integrate.returncode != 0: 65 | raise Exception( 66 | "Subprocess call returned nonzero exit code - call: {call} \n Output: {output}".format( 67 | call = self.sp_integrate.stderr, 68 | output = self.sp_integrate.stdout 69 | ) 70 | ) 71 | 72 | def _return_integrated(self): 73 | # Get seurat output file 74 | self.seur_outfile = "{filename}_seur_out.h5ad".format(filename = self.filename) 75 | 76 | # Read in as AnnData object 77 | integrated_adata = sc.read_h5ad(filename = os.path.join("tmp", self.seur_outfile)) 78 | 79 | # Reappend original obs columns 80 | integrated_adata.obs = self.adata.obs 81 | 82 | # Return integrated AnnData object 83 | return integrated_adata 84 | 85 | def _clean_files(self): 86 | # Remove temporary python and seurat files 87 | tmp_files = os.listdir("tmp") 88 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 89 | for f in tmp_files_instance: 90 | os.remove(os.path.join("tmp", f)) 91 | 92 | # Check if all files related to the filename are removed from folder 93 | tmp_files = os.listdir("tmp") 94 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 95 | if len(tmp_files_instance) > 0: 96 | raise Exception( 97 | "Temporary file cleanup incomplete - files remain in folder" 98 | ) 99 | 100 | def integrate(self): 101 | # Perform workflow and return integrated anndata object 102 | self._format() 103 | self._output_temp_h5ad() 104 | self._seurat_integrate() 105 | integrated_adata = self._return_integrated() 106 | self._clean_files() 107 | 108 | return integrated_adata -------------------------------------------------------------------------------- /workflow/src/python/utils/seurat_reference_mapping.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import uuid 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scipy as sp 9 | import anndata as ann 10 | import scanpy as sc 11 | 12 | class SeuratReferenceMap: 13 | """ 14 | Class for reference to query mapping through integration of RNA-seq batches 15 | using the Seurat4.0 package. 
Integration is done on data output to a temporary 16 | file a downsampled and integrated result through an RScript, which then outputs the 17 | reference mapped anndata (h5ad) file to be used for later downstream testing and 18 | analysis. The reference mapping workflow follows that of: 19 | https://satijalab.org/seurat/articles/multimodal_reference_mapping.html. 20 | """ 21 | def __init__(self, integrated_data_h5, reference_h5, mapped_h5): 22 | """ 23 | Args: 24 | integrated_data_h5 (str): Path to the anndata file for the integrated result after 25 | downsampling and integration. 26 | reference_h5 (str): Path to the reference h5Seurat file that contains the data 27 | to be used in query to reference mapping and annotation. 28 | mapped_h5 (str): Path to the h5ad output file from seurat that contains the 29 | mapped and annotated data. 30 | """ 31 | self.integrated_data_h5 = integrated_data_h5 32 | self.reference_h5 = reference_h5 33 | self.mapped_h5 = mapped_h5 34 | 35 | def _load(self): 36 | # Load the integrated data and subset for the seurat results 37 | self.adata = sc.read_h5ad(self.integrated_data_h5, as_sparse = "raw/X") 38 | self.adata = self.adata[self.adata.obs.integration_method == "seurat"].copy() 39 | self.adata.obs.index = range(len(self.adata.obs)) # Reset index 40 | 41 | def _format(self): 42 | # Substitute in raw counts for X, remove unecessary obs, obsm, and uns info 43 | self.adata.X = self.adata.layers["raw"] 44 | self.adata.obs = self.adata.obs.drop(columns = [ 45 | "leiden", 46 | "integration_method", 47 | "kmeans_faiss" 48 | ]) 49 | self.adata.obsm = None 50 | del self.adata.uns # Uns doesn't support None for resetting 51 | 52 | # Append a column on gene names 53 | self.adata.var["gene"] = self.adata.var_names 54 | 55 | # Remove layers and raw from AnnData object (avoid conflicts with h5seurat) 56 | if self.adata.layers["raw"] is not None: 57 | del self.adata.layers["raw"] 58 | self.adata.raw = None 59 | 60 | # Strip mapped h5 of extension - keep only name for internal seurat h5 conversions 61 | self.mapped_h5_name = os.path.splitext(self.mapped_h5)[0] 62 | 63 | def _output_temp_h5ad(self): 64 | # Check if temp exists, if not, make dir 65 | if not os.path.exists("tmp"): 66 | os.makedirs("tmp") 67 | 68 | # Output temporary file with data 69 | self.filename = ''.join(str(uuid.uuid4()).split("-")) 70 | self.file = "{filename}.h5ad".format(filename = self.filename) 71 | self.adata.write_h5ad(os.path.join("tmp", self.file)) 72 | 73 | def _seurat_refmap(self): 74 | # Call subprocess and call R script 75 | refmap_script = \ 76 | "Rscript src/R/seurat_reference_map.R {ref_h5} tmp/{tempfile} {tempname} {out_name} --verbose".format( 77 | ref_h5 = self.reference_h5, 78 | tempfile = self.file, 79 | tempname = self.filename, 80 | out_name = self.mapped_h5_name 81 | ) 82 | 83 | self.sp_refmap = subprocess.run(refmap_script, shell = True, text = True, capture_output = True) 84 | if self.sp_refmap.returncode != 0: 85 | raise Exception( 86 | "Subprocess call returned nonzero exit code - call: {call} \n Output: {output}".format( 87 | call = self.sp_refmap.stderr, 88 | output = self.sp_refmap.stdout 89 | ) 90 | ) 91 | 92 | def _clean_files(self): 93 | # Remove temporary python and seurat files 94 | tmp_files = os.listdir("tmp") 95 | tmp_files_instance = [f for f in tmp_files if self.filename in f] 96 | for f in tmp_files_instance: 97 | os.remove(os.path.join("tmp", f)) 98 | 99 | # Check if all files related to the filename are removed from folder 100 | tmp_files = os.listdir("tmp") 101 
| tmp_files_instance = [f for f in tmp_files if self.filename in f] 102 | if len(tmp_files_instance) > 0: 103 | raise Exception( 104 | "Temporary file cleanup incomplete - files remain in folder" 105 | ) 106 | 107 | def refmap(self): 108 | # Perform workflow and return reference mapped anndata object 109 | self._load() 110 | self._format() 111 | self._output_temp_h5ad() 112 | self._seurat_refmap() 113 | self._clean_files() -------------------------------------------------------------------------------- /workflow/src/python/utils/umap.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import colorcet as cc 5 | from natsort import natsorted 6 | 7 | 8 | class Umap: 9 | """Class for plotting results of integration experiments""" 10 | 11 | def __init__(self, coords, clustering, subset_name = None): 12 | """ 13 | Args: 14 | coords (dictionary): coordinates of umap in numpy format where 15 | keys correspond to following integration methods - 16 | 'bbknn', 'harmony', 'scanorama', 'seurat', and 'scvi'. 17 | clustering (dictionary): leiden or celltype clustering in numpy 18 | format of integrated where keys correspond to following 19 | integration methods - 'bbknn', 'harmony', 'scanorama', 20 | 'seurat', and 'scvi'. 21 | subset_name (string): name of subset being utilized for clustering 22 | comparisons (e.g. batch, celltype). 23 | """ 24 | self.clustering_harmony = clustering.get("harmony") 25 | self.clustering_scvi = clustering.get("scvi") 26 | self.clustering_bbknn = clustering.get("bbknn") 27 | self.clustering_scanorama = clustering.get("scanorama") 28 | self.clustering_seurat = clustering.get("seurat") 29 | 30 | self.umap_harmony = coords.get("harmony") 31 | self.umap_scvi = coords.get("scvi") 32 | self.umap_bbknn = coords.get("bbknn") 33 | self.umap_scanorama = coords.get("scanorama") 34 | self.umap_seurat = coords.get("seurat") 35 | 36 | if subset_name is not None: 37 | self.subset_name = subset_name 38 | else: 39 | self.subset_name = "Subset" 40 | 41 | sns.set_style("ticks") 42 | 43 | def df_get(self, subset, clustering, coords, category = None): 44 | df = pd.DataFrame({ 45 | "Subset" : np.repeat(subset, len(clustering)), 46 | "UMAP 1" : coords[:, 0], 47 | "UMAP 2" : coords[:, 1] 48 | }) 49 | subsets = natsorted(np.unique(subset)) 50 | df["Subset"] = pd.Categorical( 51 | np.repeat(subset, len(clustering)), categories=subsets, ordered=True 52 | ) 53 | df["Clustering"] = pd.Categorical( 54 | clustering, categories=category, ordered=True 55 | ) 56 | return df 57 | 58 | def umap_df(self): 59 | subset_list = [ 60 | "bbknn", 61 | "harmony", 62 | "scanorama", 63 | "scvi", 64 | "seurat" 65 | ] 66 | clustering_list = [ 67 | self.clustering_bbknn, 68 | self.clustering_harmony, 69 | self.clustering_scanorama, 70 | self.clustering_scvi, 71 | self.clustering_seurat 72 | ] 73 | clustering_unique = natsorted(np.unique(np.concatenate(clustering_list))) 74 | coords_list = [ 75 | self.umap_bbknn, 76 | self.umap_harmony, 77 | self.umap_scanorama, 78 | self.umap_scvi, 79 | self.umap_seurat 80 | ] 81 | umap_dfs = [ 82 | self.df_get(i, j, k, category = clustering_unique) for i, j, k in zip( 83 | subset_list, 84 | clustering_list, 85 | coords_list 86 | ) 87 | ] 88 | self.umap_concat = pd.concat(umap_dfs) 89 | 90 | def umap_plot(self, show_plot = False): 91 | self.umap_df() 92 | palette = cc.glasbey_bw[0:len(np.unique(self.umap_concat["Clustering"]))] 93 | self.umap_plt = sns.FacetGrid( 94 | 
self.umap_concat, 95 | col = "Subset", 96 | col_wrap = 3, 97 | hue = "Clustering", 98 | palette = palette 99 | ) 100 | self.umap_plt.map( 101 | sns.scatterplot, 102 | "UMAP 1", 103 | "UMAP 2", 104 | s = 5, 105 | alpha = 0.5 106 | ) 107 | self.umap_plt.add_legend(markerscale = 3, title = self.subset_name) 108 | if show_plot is True: 109 | return self.umap_plt 110 | 111 | def save_umap(self, save_dir, dpi = 300): 112 | try: 113 | self.umap_plt.savefig( 114 | save_dir, 115 | dpi = dpi 116 | ) 117 | except AttributeError: # Plot has not been created yet - build it first, then save 118 | self.umap_plot() 119 | self.umap_plt.savefig( 120 | save_dir, 121 | dpi = dpi 122 | ) --------------------------------------------------------------------------------
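A minimal usage sketch for the Umap plotting class above (illustrative only - the adatas dictionary, output path, and column choices are hypothetical placeholders for per-method integrated AnnData objects):

    methods = ["bbknn", "harmony", "scanorama", "scvi", "seurat"]
    coords = {m: adatas[m].obsm["X_umap"] for m in methods}
    clustering = {m: adatas[m].obs["leiden"].to_numpy() for m in methods}
    umap_plot = Umap(coords, clustering, subset_name = "Leiden cluster")
    umap_plot.save_umap("umap_grid.png", dpi = 300)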