├── .gitignore
├── LICENSE
├── README.md
├── evaluate_vary_situations_public.ipynb
├── hpap_result_plot_public.ipynb
├── methods
    ├── bindsc
    │   ├── bindsc.yml
    │   ├── bindsc_env.R
    │   ├── run_rbindsc.R
    │   ├── run_rbindsc_batch_sequential.R
    │   ├── run_rbindsc_noPred.R
    │   └── run_rbindsc_single.R
    ├── cobolt
    │   ├── cobolt.yml
    │   ├── cobolt_env.txt
    │   └── run_cobolt.py
    ├── eval_scib2
    │   ├── scib2.yml
    │   └── scib2_env.txt
    ├── figr
    │   ├── figr.yml
    │   ├── figr_env.R
    │   ├── run_rfigr_2.R
    │   ├── run_rfigr_2_hpap_sequential.R
    │   └── run_rfigr_single.R
    ├── glue
    │   ├── glue_env.txt
    │   ├── run_glue_hg38.py
    │   ├── run_glue_hg38_batch.py
    │   ├── run_glue_hg38_noPred.py
    │   ├── run_glue_hg38_single.py
    │   ├── run_glue_hg38_single_noPred.py
    │   ├── run_glue_mm10.py
    │   ├── run_glue_mm10_noPred.py
    │   ├── run_glue_mm10_single.py
    │   └── run_glue_mm10_single_noPred.py
    ├── liger
    │   ├── liger.yml
    │   ├── rliger_env.R
    │   ├── run_rliger.R
    │   ├── run_rliger_batch_sequential.R
    │   └── run_rliger_single.R
    ├── multivi
    │   ├── multivi.yml
    │   ├── multivi_env.R
    │   ├── run_multivi_2.py
    │   ├── run_multivi_2_noPred.py
    │   └── run_multivi_batch.py
    ├── scmomat
    │   ├── prep_GxR_for_scmomat.ipynb
    │   ├── run_scmomat_batch_bmmc_2.py
    │   ├── run_scmomat_batch_hpap.py
    │   ├── run_scmomat_bmmc.py
    │   ├── run_scmomat_mouse_skin.py
    │   ├── run_scmomat_pbmc.py
    │   ├── scmomat.yml
    │   └── scmomat_env.txt
    ├── seuratv3
    │   ├── run_seurat3.R
    │   ├── run_seurat3_batch_sequential.R
    │   ├── run_seurat3_noPred.R
    │   ├── run_seurat3_single.R
    │   ├── run_seurat3_single_noPred.R
    │   ├── seurat.yml
    │   └── seurat_env.txt
    └── seuratv4
    │   ├── run_seurat4_3.R
    │   ├── run_seurat4_3_noPred.R
    │   ├── run_seurat4_4.R
    │   ├── run_seurat4_4_sequential.R
    │   ├── seurat.yml
    │   └── seurat_env.txt
├── scenario_parameters.txt
├── scenario_parameters.xlsx
├── scripts
    ├── data_simulation.py
    ├── eval_missing_modality_prediction_single.R
    ├── eval_missing_modality_prediction_single_mm10.R
    ├── r_utils.R
    ├── run_metric_eval_batch.py
    ├── run_metric_eval_batch2.py
    ├── run_metric_eval_batch2_hpap.py
    ├── run_metric_eval_fair.py
    ├── run_metric_eval_fair_missing_ct_perMod.py
    ├── run_metric_eval_single.py
    ├── seurat_wnn_project.R
    ├── submit_job_per_condition_n_eval2.sh
    ├── submit_job_per_condition_n_missing_ct_eval.sh
    ├── submit_job_per_missing_mod_eval.sh
    └── utils_eval.py
├── summary_metric_plot_public.ipynb
└── umap_plot_generation_public.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Michelle YY Lee
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![DOI](https://zenodo.org/badge/580110879.svg)](https://zenodo.org/badge/latestdoi/580110879)
 2 | 
 3 | # Benchmarking algorithms for joint integration of unpaired and paired single-cell RNA-seq and ATAC-seq data
 4 | **Michelle Y. Y. Lee, Klaus H. Kaestner, Mingyao Li**
 5 | 
 6 | This repository contains the code for data simulation and method evaluation described in this study. We evaluated nine methods in five types of simulated scenarios using three publicly available datasets, plus one real data situation. 
 7 | 
 8 | ## Benchmarking framework 
 9 | 
10 | Code for data simulation and method evaluation is in evaluate_vary_situations_public.ipynb. 
11 | 
12 | Code for generating summary plots is in summary_metric_plot_public.ipynb. 
13 | 
14 | An example of generating UMAP plots is in umap_plot_generation_public.ipynb.
15 | 
16 | Plots related to the HPAP integration are generated using the code in hpap_result_plot_public.ipynb. 
17 | 
18 | Each file is separated into five major scenarios plus one real data situation, for a total of 17 challenges. For each challenge, the first two steps are in evaluate_vary_situations_public.ipynb: first, the data is simulated; second, a list of methods is run and their performance is calculated. To generate a summary plot for each challenge, follow the steps in summary_metric_plot_public.ipynb, which is again separated into the same six situations and 17 challenges. 
19 | 
20 | ## Setup 
21 | To run the methods as specified in the step above, one needs to create a conda environment for each method and install all the necessary packages. 
22 | 
23 | ### Method-specific environment setup and execution
24 | The list of methods evaluated are under the **methods** folder. Each subfolder contains files related to one method. 
25 | 
26 | Installation (one conda environment is created for each method) 
27 | - Option 1: run the '*_env.txt' file (for some methods it is named '*_env.R') line-by-line in linux and R to install the method and its dependencies. 
28 | - Option 2: install using the .yml file to create the conda environment. E.g. run the code below to install the conda environment to run Seurat v4 or Seurat v3. 
29 |     ```
30 |     conda env create -f seurat.yml
31 |     ```
32 | ### Evaluation environment setup 
33 | Data simulation, evaluation, and plot generation are run after activating the scib2 environment. Details about environment setup can be found in the **methods/eval_scib2** folder. 
34 | 
35 | ## Data 
36 | Processed source data and reference files used during data simulation and evaluation, as well as scripts to generate these files from the raw datasets, can be downloaded [here](https://upenn.box.com/s/jtua3rmmvzempjq55z4kj9xiij9dqoez).
37 | 
38 | 


--------------------------------------------------------------------------------
/methods/bindsc/bindsc_env.R:
--------------------------------------------------------------------------------
 1 | # Setup notes for the "bindsc" conda environment. Not a runnable script: the first commands are shell, the rest is typed into R.
 2 | # --- shell: create and activate the environment ---
 3 | conda create -n bindsc r-essentials r-base
 4 | 
 5 | conda activate bindsc
 6 | conda install -y igraph hdf5
 7 | # start an interactive R session; every command after this line is R code
 8 | R
 9 | # --- R: Seurat plus a Jupyter kernel for this environment ---
10 | install.packages('Seurat')
11 | install.packages('IRkernel')
12 | IRkernel::installspec(name = 'bindsc', displayname = 'rbindsc')
13 | 
14 | # devtools is installed through conda from inside R (system call), then used to install bindSC from GitHub
15 | system("conda install -y -c conda-forge r-devtools")
16 | install.packages("BiocManager")
17 | BiocManager::install("ComplexHeatmap")
18 | library(devtools)
19 | install_github('KChen-lab/bindSC')
20 | 
21 | # Bioconductor packages installed ahead of Signac (presumably its dependencies); RSQLite is pinned to 2.2.5, reason not recorded here
22 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
23 | remotes::install_version("RSQLite", version = "2.2.5")
24 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
25 | install.packages("Signac") 
26 | remotes::install_github("mojaveazure/seurat-disk")
27 | 
28 | # load the packages (presumably as a check that the installation succeeded)
29 | library(bindSC)
30 | library(Seurat)
31 | library(Signac)
32 | library(stringr)
33 | 


--------------------------------------------------------------------------------
/methods/bindsc/run_rbindsc.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(stringr)
  5 | require(Seurat)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(bindSC)
  9 | source("r_utils.R")
 10 | require(future)
 11 | 
 12 | run_rbindsc_fn <- function(in_dir, out_dir){
 13 |     # Integrate scRNA, snATAC and Multiome cells with bindSC (BiCCA), save the latent
 14 |     # space, then impute with impuZ. in_dir goes to load_datasets(); out_dir gets all outputs.
 15 |     t1 <- Sys.time()  # start of the integration timer
 16 |     plan("multisession")
 17 |     options(future.rng.onMisuse = "ignore")  # the option is spelled "onMisuse"
 18 |     print(paste0("workers used:",nbrOfWorkers()))
 19 |     
 20 |     datasets = load_datasets(in_dir)
 21 |     paired_rna=datasets$paired_rna
 22 |     paired_atac=datasets$paired_atac
 23 |     unpaired_rna=datasets$unpaired_rna
 24 |     unpaired_atac=datasets$unpaired_atac
 25 |     # label every cell with its source; paired cells are keyed by their prefixed names
 26 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 27 |                    c(ncol(unpaired_rna),
 28 |                      ncol(unpaired_atac),
 29 |                      ncol(paired_rna),
 30 |                     ncol(paired_atac)))
 31 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 32 |                             paste0(colnames(unpaired_atac)),
 33 |                             paste0("prna_",colnames(paired_rna)),
 34 |                             paste0("patac_",colnames(paired_atac)))
 35 |     print(table(dataset_vec))
 36 |     # prefix the paired cells ("prna_" / "patac_") so they match the names in dataset_vec
 37 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 38 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 39 | 
 40 |     # merging: from here on unpaired_rna / unpaired_atac hold all cells of that modality
 41 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 42 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 43 | 
 44 |     DefaultAssay(unpaired_rna) <- "RNA"
 45 |     unpaired_rna <- NormalizeData(unpaired_rna)
 46 |     unpaired_rna <- FindVariableFeatures(unpaired_rna, nfeatures = 5000)
 47 |     unpaired_rna <- ScaleData(unpaired_rna)
 48 |     unpaired_rna <- RunPCA(unpaired_rna)
 49 |     unpaired_rna <- FindNeighbors(unpaired_rna, dims = 1:20, reduction = "pca")
 50 |     unpaired_rna <- FindClusters(unpaired_rna, resolution = 0.5)
 51 |     unpaired_rna <- RunUMAP(unpaired_rna, reduction = "pca", dims = 1:15)
 52 | 
 53 |     # quantify gene activity
 54 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 55 |     # add gene activities as a new assay
 56 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 57 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 58 |     unpaired_atac <- FindVariableFeatures(unpaired_atac, nfeatures = 5000)
 59 |     
 60 |     DefaultAssay(unpaired_atac) <- "ATAC"
 61 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 62 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = 50)
 63 |     unpaired_atac <- RunSVD(unpaired_atac)
 64 |     unpaired_atac <- FindNeighbors(unpaired_atac, dims = 1:20, reduction = "lsi")
 65 |     unpaired_atac <- FindClusters(unpaired_atac, resolution = 0.5)
 66 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 1:15)
 67 |     # genes that are variable in both the RNA assay and the gene-activity assay
 68 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 69 |     gene.use <- intersect(VariableFeatures(unpaired_rna), 
 70 |                           VariableFeatures(unpaired_atac))
 71 |     
 72 |     # X (RNA) and Z0 (gene activity) are both restricted to the shared genes
 73 |     X <- unpaired_rna[["RNA"]][gene.use,]
 74 |     Z0 <- unpaired_atac[["ACTIVITY"]][gene.use]
 75 |     
 76 |     # The raw peak matrix is not extracted: BiCCA's Y input below is the LSI
 77 |     # embedding (y), so no dense peak-count matrix has to be built.
 78 |     
 79 |     # reduce X and Z0 with dimReduce (K = 30)
 80 |     out <- dimReduce(dt1 =  X, dt2 = Z0,  K = 30)
 81 |     x <- out$dt1
 82 |     z0 <- out$dt2
 83 |     y  <- unpaired_atac@reductions$lsi@cell.embeddings
 84 |     # NOTE(review): temp.path = "out" is relative to the working directory - confirm this is safe for concurrent runs
 85 |     res <- BiCCA( X = t(x) ,
 86 |                  Y = t(y), 
 87 |                  Z0 =t(z0), 
 88 |                  X.clst = unpaired_rna$seurat_clusters,
 89 |                  Y.clst = unpaired_atac$seurat_clusters,
 90 |                  alpha = 0.5, 
 91 |                  lambda = 0.5,
 92 |                  K = 15,
 93 |                  temp.path  = "out",
 94 |                  num.iteration = 50,
 95 |                  tolerance = 0.01,
 96 |                  save = TRUE,
 97 |                  parameter.optimize = FALSE,
 98 |                  block.size = 0)
 99 |     # stack the two embedding matrices returned by BiCCA (res$u, res$r) into one table
100 |     df_umap <- as.data.frame(rbind(res$u, res$r))
101 |     colnames(df_umap) <- paste0("latent_",seq_len(ncol(df_umap)))
102 |     df_umap$dataset = "scRNA"
103 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
104 |     print("------ Saving integration result ------")
105 |     dir.create(file.path(out_dir,"rbindsc"),recursive=TRUE,showWarnings=FALSE)
106 |     write.csv(df_umap,file.path(out_dir,"rbindsc","rbindsc_result.csv"))
107 |     t2 <- Sys.time()
108 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
109 |     # integration wall time in seconds, written as a single bare number
110 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
111 |                 file = file.path(out_dir,"runtime","rbindsc_runtime.txt"), 
112 |                 sep = "\t",
113 |                 row.names = FALSE,
114 |                 col.names = FALSE)
115 |     print("------ Done ------")
116 |     
117 |     print("------ Prediction ------")
118 |     # restart the timer: prediction time is reported separately
119 |     t1 <- Sys.time()
120 |     # prediction
121 |     Z_impu <- impuZ(X=unpaired_rna[["RNA"]][gene.use,], bicca = res)
122 |     # whole range normalization, ran in plot_geneScoreChange 
123 |     Z_impu_norm<- (Z_impu-min(Z_impu))/(max(Z_impu)-min(Z_impu))
124 |     unpaired_atac[['RNA_impute']] <- CreateAssayObject(counts=Z_impu_norm)
125 | 
126 |     # saved cell name will be "barcodes" column in object@meta.data
127 |     write_mtx_folder(file.path(out_dir,"rbindsc","predicted","ATAC"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
128 |     write_mtx_folder(file.path(out_dir,"rbindsc","predicted","RNA"),unpaired_atac,assay_key="RNA_impute",slot_key="counts","gene")
129 |     
130 |     t2 <- Sys.time()
131 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
132 |                 file = file.path(out_dir,"runtime","rbindsc_prediction_time.txt"), 
133 |                 sep = "\t",
134 |                 row.names = FALSE,
135 |                 col.names = FALSE)
136 |     print("------ Prediction Done ------")
137 | 
138 | }
139 | 
140 | 
141 | if (length(args) < 2) {
142 |   stop("Insufficient number of arguments supplied (expected: input directory, output directory).\n", call.=FALSE)
143 | }
144 | # echo the two positional arguments: input directory, then output directory
145 | print(paste0("argument 1: ",args[1]))
146 | print(paste0("argument 2: ",args[2]))
147 | 
148 | run_rbindsc_fn(args[1], args[2])
149 | 


--------------------------------------------------------------------------------
/methods/bindsc/run_rbindsc_batch_sequential.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(stringr)
  5 | require(Seurat)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(bindSC)
  9 | source("r_utils.R")
 10 | require(future)
 11 | 
 12 | run_rbindsc_fn <- function(in_dir, out_dir){
 13 |     # Batch-aware bindSC integration: RNA features are selected across, and scaled
 14 |     # within, batch x technology groups before BiCCA. No imputation is run here.
 15 |     # in_dir  : passed to load_datasets()
 16 |     # out_dir : root folder for the latent-space CSV and the runtime file
 17 |     # No future plan is set in this variant.
 18 |     t1 <- Sys.time()  # start of the integration timer
 19 |     
 20 |     # load with the "barcodes" and "batch" obs columns (batch is used for grouping below)
 21 |     datasets = load_datasets(in_dir,obs=c("barcodes","batch"))
 22 |     paired_rna=datasets$paired_rna
 23 |     paired_atac=datasets$paired_atac
 24 |     unpaired_rna=datasets$unpaired_rna
 25 |     unpaired_atac=datasets$unpaired_atac
 26 |     # label every cell with its source; paired cells are keyed by their prefixed names
 27 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 28 |                    c(ncol(unpaired_rna),
 29 |                      ncol(unpaired_atac),
 30 |                      ncol(paired_rna),
 31 |                     ncol(paired_atac)))
 32 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 33 |                             paste0(colnames(unpaired_atac)),
 34 |                             paste0("prna_",colnames(paired_rna)),
 35 |                             paste0("patac_",colnames(paired_atac)))
 36 |     print(table(dataset_vec))
 37 |     # prefix the paired cells ("prna_" / "patac_") so they match the names in dataset_vec
 38 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 39 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 40 | 
 41 |     # tag each object with its technology, then merge per modality
 42 |     unpaired_rna@meta.data$technology = "scRNA"
 43 |     unpaired_atac@meta.data$technology = "snATAC"
 44 |     paired_rna@meta.data$technology = "Multiome-RNA"
 45 |     paired_atac@meta.data$technology = "Multiome-ATAC"
 46 |     
 47 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 48 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 49 |     # one group per batch x technology combination
 50 |     unpaired_rna@meta.data$group <- paste0(unpaired_rna$batch,"_",unpaired_rna$technology)
 51 |     
 52 |     DefaultAssay(unpaired_rna) <- "RNA"
 53 |     unpaired_rna <- NormalizeData(unpaired_rna)
 54 |     rna.list <- SplitObject(unpaired_rna, split.by = "group")
 55 |     
 56 |     #select high variable features across samples (assume cells have been normalized by library size, does not need to be sample-specific)
 57 |     features = SelectIntegrationFeatures(
 58 |       rna.list,
 59 |       nfeatures = 5000,
 60 |       verbose = TRUE,
 61 |       fvf.nfeatures = 10000
 62 |     )
 63 |     VariableFeatures(unpaired_rna) <- features  # accessor instead of writing the assay slot directly
 64 |     unpaired_rna <- ScaleData(unpaired_rna,features=features,split.by='group')
 65 |     unpaired_rna <- RunPCA(unpaired_rna,features=features)
 66 |     unpaired_rna <- FindNeighbors(unpaired_rna, dims = 1:20, reduction = "pca")
 67 |     unpaired_rna <- FindClusters(unpaired_rna, resolution = 0.5)
 68 |     unpaired_rna <- RunUMAP(unpaired_rna, reduction = "pca", dims = 1:15)
 69 | 
 70 |     
 71 |     # quantify gene activity
 72 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 73 |     # add gene activities as a new assay
 74 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 75 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 76 |     unpaired_atac <- FindVariableFeatures(unpaired_atac, nfeatures = 5000)
 77 |     
 78 |     DefaultAssay(unpaired_atac) <- "ATAC"
 79 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 80 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = 50)
 81 |     unpaired_atac <- RunSVD(unpaired_atac)
 82 |     unpaired_atac <- FindNeighbors(unpaired_atac, dims = 1:20, reduction = "lsi")
 83 |     unpaired_atac <- FindClusters(unpaired_atac, resolution = 0.5)
 84 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 1:15)
 85 |     # genes that are variable in both the RNA assay and the gene-activity assay
 86 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 87 |     gene.use <- intersect(VariableFeatures(unpaired_rna), 
 88 |                           VariableFeatures(unpaired_atac))
 89 |     # getting normalized counts (@data field)
 90 |     X <- unpaired_rna[["RNA"]][gene.use,]
 91 |     Z0 <- unpaired_atac[["ACTIVITY"]][gene.use]
 92 |     
 93 |     # The raw peak matrix is not extracted: BiCCA's Y input below is the LSI
 94 |     # embedding (y), so no peak-count row sums have to be computed.
 95 |     
 96 |     # reduce X and Z0 with dimReduce (K = 30)
 97 |     
 98 |     out <- dimReduce(dt1 =  X, dt2 = Z0,  K = 30)
 99 |     x <- out$dt1
100 |     z0 <- out$dt2
101 |     y  <- unpaired_atac@reductions$lsi@cell.embeddings
102 |     # NOTE(review): temp.path = "out" is relative to the working directory - confirm this is safe for concurrent runs
103 |     res <- BiCCA( X = t(x) ,
104 |                  Y = t(y), 
105 |                  Z0 =t(z0), 
106 |                  X.clst = unpaired_rna$seurat_clusters,
107 |                  Y.clst = unpaired_atac$seurat_clusters,
108 |                  alpha = 0.5, 
109 |                  lambda = 0.5,
110 |                  K = 15,
111 |                  temp.path  = "out",
112 |                  num.iteration = 50,
113 |                  tolerance = 0.01,
114 |                  save = TRUE,
115 |                  parameter.optimize = FALSE,
116 |                  block.size = 0)
117 |     # stack the two embedding matrices returned by BiCCA (res$u, res$r) into one table
118 |     df_umap <- as.data.frame(rbind(res$u, res$r))
119 |     colnames(df_umap) <- paste0("latent_",seq_len(ncol(df_umap)))
120 |     df_umap$dataset = "scRNA"
121 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
122 |     print("------ Saving integration result ------")
123 |     dir.create(file.path(out_dir,"rbindsc"),recursive=TRUE,showWarnings=FALSE)
124 |     write.csv(df_umap,file.path(out_dir,"rbindsc","rbindsc_result.csv"))
125 |     t2 <- Sys.time()
126 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
127 |     # integration wall time in seconds, written as a single bare number
128 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
129 |                 file = file.path(out_dir,"runtime","rbindsc_runtime.txt"), 
130 |                 sep = "\t",
131 |                 row.names = FALSE,
132 |                 col.names = FALSE)
133 |     print("------ Done ------")
134 |     
135 |     print("------ No prediction ------")
136 | #     # starting time
137 | #     t1 <- Sys.time()
138 | #     # prediction
139 | #     Z_impu <- impuZ(X=unpaired_rna[["RNA"]][gene.use,], bicca = res)
140 | #     # whole range normalization, ran in plot_geneScoreChange 
141 | #     Z_impu_norm<- (Z_impu-min(Z_impu))/(max(Z_impu)-min(Z_impu))
142 | #     unpaired_atac[['RNA_impute']] <- CreateAssayObject(counts=Z_impu_norm)
143 | 
144 | #     # saved cell name will be "barcodes" column in object@meta.data
145 | #     write_mtx_folder(file.path(out_dir,"rbindsc","predicted","ATAC"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
146 | #     write_mtx_folder(file.path(out_dir,"rbindsc","predicted","RNA"),unpaired_atac,assay_key="RNA_impute",slot_key="counts","gene")
147 |     
148 | #     t2 <- Sys.time()
149 | #     write.table(difftime(t2, t1, units = "secs")[[1]], 
150 | #                 file = file.path(out_dir,"runtime","rbindsc_prediction_time.txt"), 
151 | #                 sep = "\t",
152 | #                 row.names = FALSE,
153 | #                 col.names = FALSE)
154 | #     print("------ Prediction Done ------")
155 | 
156 | }
157 | 
158 | 
159 | if (length(args) < 2) {
160 |   stop("Insufficient number of arguments supplied (expected: input directory, output directory).\n", call.=FALSE)
161 | }
162 | # echo the two positional arguments: input directory, then output directory
163 | print(paste0("argument 1: ",args[1]))
164 | print(paste0("argument 2: ",args[2]))
165 | 
166 | run_rbindsc_fn(args[1], args[2])
167 | 


--------------------------------------------------------------------------------
/methods/bindsc/run_rbindsc_noPred.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(stringr)
  5 | require(Seurat)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(bindSC)
  9 | source("r_utils.R")
 10 | require(future)
 11 | 
 12 | run_rbindsc_fn <- function(in_dir, out_dir){
 13 |     # Integrate scRNA, snATAC and Multiome cells with bindSC (BiCCA) and save the latent
 14 |     # space only (no imputation). in_dir goes to load_datasets(); out_dir gets all outputs.
 15 |     t1 <- Sys.time()  # start of the integration timer
 16 |     plan("multisession")
 17 |     options(future.rng.onMisuse = "ignore")  # the option is spelled "onMisuse"
 18 |     print(paste0("workers used:",nbrOfWorkers()))
 19 |     
 20 |     datasets = load_datasets(in_dir)
 21 |     paired_rna=datasets$paired_rna
 22 |     paired_atac=datasets$paired_atac
 23 |     unpaired_rna=datasets$unpaired_rna
 24 |     unpaired_atac=datasets$unpaired_atac
 25 |     # label every cell with its source; paired cells are keyed by their prefixed names
 26 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 27 |                    c(ncol(unpaired_rna),
 28 |                      ncol(unpaired_atac),
 29 |                      ncol(paired_rna),
 30 |                     ncol(paired_atac)))
 31 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 32 |                             paste0(colnames(unpaired_atac)),
 33 |                             paste0("prna_",colnames(paired_rna)),
 34 |                             paste0("patac_",colnames(paired_atac)))
 35 |     print(table(dataset_vec))
 36 |     # prefix the paired cells ("prna_" / "patac_") so they match the names in dataset_vec
 37 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 38 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 39 | 
 40 |     # merging: from here on unpaired_rna / unpaired_atac hold all cells of that modality
 41 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 42 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 43 | 
 44 |     DefaultAssay(unpaired_rna) <- "RNA"
 45 |     unpaired_rna <- NormalizeData(unpaired_rna)
 46 |     unpaired_rna <- FindVariableFeatures(unpaired_rna, nfeatures = 5000)
 47 |     unpaired_rna <- ScaleData(unpaired_rna)
 48 |     unpaired_rna <- RunPCA(unpaired_rna)
 49 |     unpaired_rna <- FindNeighbors(unpaired_rna, dims = 1:20, reduction = "pca")
 50 |     unpaired_rna <- FindClusters(unpaired_rna, resolution = 0.5)
 51 |     unpaired_rna <- RunUMAP(unpaired_rna, reduction = "pca", dims = 1:15)
 52 | 
 53 |     # quantify gene activity
 54 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 55 |     # add gene activities as a new assay
 56 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 57 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 58 |     unpaired_atac <- FindVariableFeatures(unpaired_atac, nfeatures = 5000)
 59 |     
 60 |     DefaultAssay(unpaired_atac) <- "ATAC"
 61 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 62 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = 50)
 63 |     unpaired_atac <- RunSVD(unpaired_atac)
 64 |     unpaired_atac <- FindNeighbors(unpaired_atac, dims = 1:20, reduction = "lsi")
 65 |     unpaired_atac <- FindClusters(unpaired_atac, resolution = 0.5)
 66 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 1:15)
 67 |     # genes that are variable in both the RNA assay and the gene-activity assay
 68 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 69 |     gene.use <- intersect(VariableFeatures(unpaired_rna), 
 70 |                           VariableFeatures(unpaired_atac))
 71 |     
 72 |     # X (RNA) and Z0 (gene activity) are both restricted to the shared genes
 73 |     X <- unpaired_rna[["RNA"]][gene.use,]
 74 |     Z0 <- unpaired_atac[["ACTIVITY"]][gene.use]
 75 |     
 76 |     # The raw peak matrix is not extracted: BiCCA's Y input below is the LSI
 77 |     # embedding (y), so no peak-count row sums have to be computed.
 78 |     
 79 |     # reduce X and Z0 with dimReduce (K = 30)
 80 |     out <- dimReduce(dt1 =  X, dt2 = Z0,  K = 30)
 81 |     x <- out$dt1
 82 |     z0 <- out$dt2
 83 |     y  <- unpaired_atac@reductions$lsi@cell.embeddings
 84 |     # NOTE(review): temp.path = "out" is relative to the working directory - confirm this is safe for concurrent runs
 85 |     res <- BiCCA( X = t(x) ,
 86 |                  Y = t(y), 
 87 |                  Z0 =t(z0), 
 88 |                  X.clst = unpaired_rna$seurat_clusters,
 89 |                  Y.clst = unpaired_atac$seurat_clusters,
 90 |                  alpha = 0.5, 
 91 |                  lambda = 0.5,
 92 |                  K = 15,
 93 |                  temp.path  = "out",
 94 |                  num.iteration = 50,
 95 |                  tolerance = 0.01,
 96 |                  save = TRUE,
 97 |                  parameter.optimize = FALSE,
 98 |                  block.size = 0)
 99 |     # stack the two embedding matrices returned by BiCCA (res$u, res$r) into one table
100 |     df_umap <- as.data.frame(rbind(res$u, res$r))
101 |     colnames(df_umap) <- paste0("latent_",seq_len(ncol(df_umap)))
102 |     df_umap$dataset = "scRNA"
103 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
104 |     print("------ Saving integration result ------")
105 |     dir.create(file.path(out_dir,"rbindsc"),recursive=TRUE,showWarnings=FALSE)
106 |     write.csv(df_umap,file.path(out_dir,"rbindsc","rbindsc_result.csv"))
107 |     t2 <- Sys.time()
108 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
109 |     # integration wall time in seconds, written as a single bare number
110 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
111 |                 file = file.path(out_dir,"runtime","rbindsc_runtime.txt"), 
112 |                 sep = "\t",
113 |                 row.names = FALSE,
114 |                 col.names = FALSE)
115 |     print("------ Done ------")
116 |     
117 | }
118 | 
119 | 
120 | if (length(args) < 2) {
121 |   stop("Insufficient number of arguments supplied (expected: input directory, output directory).\n", call.=FALSE)
122 | }
123 | # echo the two positional arguments: input directory, then output directory
124 | print(paste0("argument 1: ",args[1]))
125 | print(paste0("argument 2: ",args[2]))
126 | 
127 | run_rbindsc_fn(args[1], args[2])
128 | 


--------------------------------------------------------------------------------
/methods/bindsc/run_rbindsc_single.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(stringr)
  5 | require(Seurat)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(bindSC)
  9 | source("r_utils.R")
 10 | require(future)
 11 | 
 12 | 
 13 | run_rbindsc_fn <- function(in_dir, out_dir){
 14 |     # starting time
 15 |     t1 <- Sys.time()
 16 |     n_lat = 30
 17 |     plan("multisession")
 18 |     options(future.rng.onMisue = "ignore")
 19 |     print(paste0("workers used:",nbrOfWorkers()))
 20 |     
 21 |     datasets = load_datasets(in_dir)
 22 |     print("loading single modality datasets only, ignoring paired RNA and paired ATAC folder")
 23 |     unpaired_rna=datasets$unpaired_rna
 24 |     unpaired_atac=datasets$unpaired_atac
 25 | 
 26 |     # print number of cells per data type
 27 |     dataset_vec <- rep(c("scRNA","snATAC"),
 28 |                        c(ncol(unpaired_rna),
 29 |                          ncol(unpaired_atac)))
 30 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 31 |                             paste0(colnames(unpaired_atac)))
 32 |     print(table(dataset_vec))
 33 |     
 34 |     unpaired_atac@meta.data$dataset <- "snATAC"
 35 |     unpaired_rna@meta.data$dataset <- "scRNA"
 36 |     
 37 |     # merging
 38 |     #unpaired_rna <- merge(unpaired_rna,paired_rna, add.cell.ids = c("urna", "prna"))
 39 |     #unpaired_atac <- merge(unpaired_atac,paired_atac, add.cell.ids = c("uatac", "patac"))
 40 | 
 41 |     DefaultAssay(unpaired_rna) <- "RNA"
 42 |     unpaired_rna <- NormalizeData(unpaired_rna)
 43 |     unpaired_rna <- FindVariableFeatures(unpaired_rna, nfeatures = 5000)
 44 |     unpaired_rna <- ScaleData(unpaired_rna)
 45 |     unpaired_rna <- RunPCA(unpaired_rna)
 46 |     unpaired_rna <- FindNeighbors(unpaired_rna, dims = 1:20, reduction = "pca")
 47 |     unpaired_rna <- FindClusters(unpaired_rna, resolution = 0.5)
 48 |     unpaired_rna <- RunUMAP(unpaired_rna, reduction = "pca", dims = 1:15)
 49 | 
 50 |     # quantify gene activity
 51 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 52 |     # add gene activities as a new assay
 53 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 54 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 55 |     unpaired_atac <- FindVariableFeatures(unpaired_atac, nfeatures = 5000)
 56 |     
 57 |     DefaultAssay(unpaired_atac) <- "ATAC"
 58 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 59 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = 50)
 60 |     unpaired_atac <- RunSVD(unpaired_atac)
 61 |     unpaired_atac <- FindNeighbors(unpaired_atac, dims = 1:20, reduction = "lsi")
 62 |     unpaired_atac <- FindClusters(unpaired_atac, resolution = 0.5)
 63 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 1:15)
 64 | 
 65 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 66 |     gene.use <- intersect(VariableFeatures(unpaired_rna), 
 67 |                           VariableFeatures(unpaired_atac))
 68 |     
 69 |     X <- unpaired_rna[["RNA"]][gene.use,]
 70 |     Y <- unpaired_atac[["ATAC"]][]
 71 |     Z0 <- unpaired_atac[["ACTIVITY"]][gene.use]
 72 |     type <- c(rep("RNA", ncol(X)), rep("ATAC", ncol(X)))
 73 | 
 74 |     a <- rowSums(as.matrix(Y))
 75 |     Y <- Y[a>50,]
 76 |     
 77 |     out <- dimReduce(dt1 =  X, dt2 = Z0,  K = 30)
 78 |     x <- out$dt1
 79 |     z0 <- out$dt2
 80 |     y  <- unpaired_atac@reductions$lsi@cell.embeddings
 81 | 
 82 |     res <- BiCCA( X = t(x) ,
 83 |                  Y = t(y), 
 84 |                  Z0 =t(z0), 
 85 |                  X.clst = unpaired_rna$seurat_clusters,
 86 |                  Y.clst = unpaired_atac$seurat_clusters,
 87 |                  alpha = 0.5, 
 88 |                  lambda = 0.5,
 89 |                  K = 15,
 90 |                  temp.path  = "out",
 91 |                  num.iteration = 50,
 92 |                  tolerance = 0.01,
 93 |                  save = TRUE,
 94 |                  parameter.optimize = FALSE,
 95 |                  block.size = 0)
 96 | 
 97 |     df_umap <- as.data.frame(rbind(res$u, res$r))
 98 |     colnames(df_umap) <- paste0("latent_",1:dim(df_umap)[2])
 99 |     df_umap$dataset = "scRNA"
100 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
101 |     print("------ Saving integration result ------")
102 |     dir.create(file.path(out_dir,"rbindsc"),recursive=TRUE)
103 |     write.csv(df_umap,file.path(out_dir,"rbindsc","rbindsc_result.csv"))
104 |     t2 <- Sys.time()
105 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE)
106 |     
107 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
108 |                 file = file.path(out_dir,"runtime","rbindsc_runtime.txt"), 
109 |                 sep = "\t",
110 |                 row.names = FALSE,
111 |                 col.names = FALSE)
112 |     print("------ Done ------")
113 |     
114 |     print("------ Prediction ------")
115 |     # starting time
116 |     t1 <- Sys.time()
117 |     # prediction
118 |     Z_impu <- impuZ(X=unpaired_rna[["RNA"]][gene.use,], bicca = res)
119 |     # whole range normalization, ran in plot_geneScoreChange 
120 |     Z_impu_norm<- (Z_impu-min(Z_impu))/(max(Z_impu)-min(Z_impu))
121 |     unpaired_atac[['RNA_impute']] <- CreateAssayObject(counts=Z_impu_norm)
122 | 
123 |     write_mtx_folder(file.path(out_dir,"rbindsc","predicted/ATAC/"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
124 |     write_mtx_folder(file.path(out_dir,"rbindsc","predicted/RNA/"),unpaired_atac,assay_key="RNA_impute",slot_key="counts","gene")
125 |     
126 |     t2 <- Sys.time()
127 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
128 |                 file = file.path(out_dir,"runtime","rbindsc_prediction_time.txt"), 
129 |                 sep = "\t",
130 |                 row.names = FALSE,
131 |                 col.names = FALSE)
132 |     print("------ Prediction Done ------")
133 | 
134 | }
135 | 
136 | 
# Script entry point. `args` is defined earlier in this script (presumably the
# trailing command-line arguments); two values are required: <in_dir> <out_dir>.
if (length(args) < 2) {
    # The previous message ended in a stray ".n" (a lost "\n") and mentioned only
    # an input file even though two arguments are needed.
    stop("Insufficient number of arguments are supplied: expected <in_dir> <out_dir>.", call.=FALSE)
}

# Echo the arguments so they show up in the job log.
print(paste0("argument 1: ",args[1]))
print(paste0("argument 2: ",args[2]))

run_rbindsc_fn(args[1], args[2])
145 | 


--------------------------------------------------------------------------------
/methods/cobolt/cobolt.yml:
--------------------------------------------------------------------------------
  1 | name: cobolt
  2 | channels:
  3 |   - anaconda
  4 |   - conda-forge
  5 |   - bioconda
  6 |   - defaults
  7 | dependencies:
  8 |   - _libgcc_mutex=0.1=conda_forge
  9 |   - _openmp_mutex=4.5=1_gnu
 10 |   - bzip2=1.0.8=h7b6447c_0
 11 |   - c-ares=1.18.1=h7f98852_0
 12 |   - ca-certificates=2020.10.14=0
 13 |   - curl=7.81.0=h494985f_0
 14 |   - expat=2.4.6=h27087fc_0
 15 |   - gettext=0.19.8.1=h9b4dc7a_1
 16 |   - git=2.35.0=pl5321hf874766_0
 17 |   - krb5=1.19.2=h48eae69_3
 18 |   - ld_impl_linux-64=2.36.1=hea4e1c9_2
 19 |   - libcurl=7.81.0=h494985f_0
 20 |   - libedit=3.1.20191231=h14c3975_1
 21 |   - libev=4.33=h7b6447c_0
 22 |   - libffi=3.4.2=h7f98852_5
 23 |   - libgcc-ng=11.2.0=h1d223b6_12
 24 |   - libgomp=11.2.0=h1d223b6_12
 25 |   - libiconv=1.16=h516909a_0
 26 |   - libnghttp2=1.47.0=he49606f_0
 27 |   - libnsl=2.0.0=h7f98852_0
 28 |   - libssh2=1.10.0=ha35d2d1_2
 29 |   - libstdcxx-ng=11.2.0=he4da1e4_12
 30 |   - libzlib=1.2.11=h36c2ea0_1013
 31 |   - ncurses=6.3=h9c3ff4c_0
 32 |   - openssl=3.0.0=h7f98852_2
 33 |   - pcre2=10.37=h032f7d1_0
 34 |   - perl=5.26.2=h14c3975_0
 35 |   - pip=22.0.3=pyhd8ed1ab_0
 36 |   - python=3.7.12=hf930737_100_cpython
 37 |   - python_abi=3.7=2_cp37m
 38 |   - readline=8.1=h46c0cb4_0
 39 |   - setuptools=60.9.3=py37h89c1867_0
 40 |   - sqlite=3.37.0=h9cd32fc_0
 41 |   - tk=8.6.12=h27826a3_0
 42 |   - wheel=0.37.1=pyhd8ed1ab_0
 43 |   - xz=5.2.5=h516909a_1
 44 |   - zlib=1.2.11=h36c2ea0_1013
 45 |   - pip:
 46 |     - anndata==0.7.8
 47 |     - backcall==0.2.0
 48 |     - cached-property==1.5.2
 49 |     - cobolt==0.0.1
 50 |     - cycler==0.11.0
 51 |     - debugpy==1.5.1
 52 |     - decorator==5.1.1
 53 |     - entrypoints==0.4
 54 |     - fonttools==4.29.1
 55 |     - h5py==3.6.0
 56 |     - igraph==0.9.9
 57 |     - importlib-metadata==4.11.1
 58 |     - ipykernel==6.9.1
 59 |     - ipython==7.32.0
 60 |     - jedi==0.18.1
 61 |     - joblib==1.1.0
 62 |     - jupyter-client==7.1.2
 63 |     - jupyter-core==4.9.2
 64 |     - kiwisolver==1.3.2
 65 |     - leidenalg==0.8.9
 66 |     - llvmlite==0.38.0
 67 |     - matplotlib==3.5.1
 68 |     - matplotlib-inline==0.1.3
 69 |     - natsort==8.1.0
 70 |     - nest-asyncio==1.5.4
 71 |     - networkx==2.6.3
 72 |     - numba==0.55.1
 73 |     - numexpr==2.8.1
 74 |     - numpy==1.21.5
 75 |     - packaging==21.3
 76 |     - pandas==1.3.5
 77 |     - parso==0.8.3
 78 |     - patsy==0.5.2
 79 |     - pexpect==4.8.0
 80 |     - pickleshare==0.7.5
 81 |     - pillow==9.0.1
 82 |     - prompt-toolkit==3.0.28
 83 |     - ptyprocess==0.7.0
 84 |     - pygments==2.11.2
 85 |     - pynndescent==0.5.6
 86 |     - pyparsing==3.0.7
 87 |     - python-dateutil==2.8.2
 88 |     - python-igraph==0.9.9
 89 |     - pytz==2021.3
 90 |     - pyzmq==22.3.0
 91 |     - scanpy==1.8.2
 92 |     - scikit-learn==1.0.2
 93 |     - scipy==1.7.3
 94 |     - seaborn==0.11.2
 95 |     - sinfo==0.3.4
 96 |     - six==1.16.0
 97 |     - sklearn==0.0
 98 |     - statsmodels==0.13.2
 99 |     - stdlib-list==0.8.0
100 |     - tables==3.7.0
101 |     - texttable==1.6.4
102 |     - threadpoolctl==3.1.0
103 |     - torch==1.10.2
104 |     - tornado==6.1
105 |     - tqdm==4.62.3
106 |     - traitlets==5.1.1
107 |     - typing-extensions==4.1.1
108 |     - umap-learn==0.5.2
109 |     - wcwidth==0.2.5
110 |     - xgboost==1.5.2
111 |     - xlrd==1.2.0
112 |     - zipp==3.7.0
113 | prefix: /home/myylee/anaconda3/envs/cobolt
114 | 


--------------------------------------------------------------------------------
/methods/cobolt/cobolt_env.txt:
--------------------------------------------------------------------------------
 1 | # command line
 2 | 
 3 | conda create -n cobolt python=3.7
 4 | 
 5 | conda activate cobolt
 6 | 
 7 | # Install git first: cobolt (v1.0.0) is installed by pip directly from its GitHub repository
 8 | conda install -c anaconda git
 9 | pip install git+https://github.com/boyinggong/cobolt.git#egg=cobolt
10 | pip install scanpy leidenalg ipykernel
11 | 
12 | python -m ipykernel install --user --name cobolt --display-name "cobolt"
13 | 


--------------------------------------------------------------------------------
/methods/cobolt/run_cobolt.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import sys 
 3 | print('Number of arguments:', len(sys.argv), 'arguments.')
 4 | print('Argument List:'+ str(sys.argv))
 5 | 
 6 | from cobolt.utils import SingleData, MultiomicDataset
 7 | from cobolt.model import Cobolt
 8 | import os
 9 | import pandas as pd
10 | import numpy as np
11 | import pickle
12 | import timeit
13 | 
def run_cobolt_fn(in_dir,out_dir):
    """Train a Cobolt model on paired and unpaired RNA/ATAC data and save the outputs.

    Parameters
    ----------
    in_dir : str
        Directory containing the sub-folders ``paired_RNA``, ``paired_ATAC``,
        ``unpaired_RNA`` and ``unpaired_ATAC``. The RNA folders are read with
        ``RNA_counts.mtx`` / ``gene.tsv`` and the ATAC folders with
        ``ATAC_counts.mtx`` / ``peak.tsv``.
    out_dir : str
        Output root. The function writes
        ``cobolt/cobolt_model.pickle`` (pickled model),
        ``cobolt/cobolt_result.csv`` (``latent_*`` columns plus ``dataset``) and
        ``runtime/cobolt_runtime.txt`` (elapsed seconds).

    Returns
    -------
    Cobolt
        The trained model.
    """
    start = timeit.default_timer()
    # Paired gene expression; the modality pair is labelled "Multiome".
    paired_rna = SingleData.from_file(path=os.path.join(in_dir, "paired_RNA"),
                                      dataset_name="Multiome",
                                      feature_name="GeneExpr",
                                      count_file="RNA_counts.mtx",
                                      feature_file="gene.tsv")

    # Paired chromatin accessibility, same "Multiome" dataset label as above.
    paired_atac = SingleData.from_file(path=os.path.join(in_dir, "paired_ATAC"),
                                      dataset_name="Multiome",
                                      feature_name="ChromAccess",
                                      count_file="ATAC_counts.mtx",
                                      feature_file="peak.tsv")

    # Single-modality datasets.
    unpaired_rna = SingleData.from_file(path=os.path.join(in_dir, "unpaired_RNA"),
                                    dataset_name="scRNA",
                                    feature_name="GeneExpr",
                                    feature_file="gene.tsv",
                                    count_file="RNA_counts.mtx")
    unpaired_atac = SingleData.from_file(path=os.path.join(in_dir, "unpaired_ATAC"),
                                dataset_name="snATAC",
                                feature_name="ChromAccess",
                                feature_file="peak.tsv",
                                count_file="ATAC_counts.mtx")


    # Quality filtering on features.
    paired_rna.filter_features(upper_quantile=0.99, lower_quantile=0.7)
    paired_atac.filter_features(upper_quantile=0.99, lower_quantile=0.7)
    unpaired_rna.filter_features(upper_quantile=0.99, lower_quantile=0.7)
    unpaired_atac.filter_features(upper_quantile=0.99, lower_quantile=0.7)

    multi_dt = MultiomicDataset.from_singledata(unpaired_rna, unpaired_atac, paired_rna, paired_atac)

    model = Cobolt(dataset=multi_dt, lr=0.001, n_latent=16)
    model.train(num_epochs=20)
    os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
    os.makedirs(os.path.join(out_dir,"cobolt"), exist_ok=True)
    model_out = os.path.join(out_dir,"cobolt","cobolt_model.pickle")
    # Context manager so the pickle is flushed and the handle closed deterministically.
    with open(model_out, 'wb') as model_fh:
        pickle.dump(model, model_fh)

    # save latent embedding as csv
    # latent[0] supplies the values and latent[1] the row index of the frame.
    latent = model.get_all_latent()
    res_df = pd.DataFrame(latent[0],index=latent[1])
    res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
    # Look up the dataset label of every row through the model's dataset mapping.
    res_df['dataset'] = np.array([model.dataset.dataset[b] for b in res_df.index])
    csv_out = os.path.join(out_dir, "cobolt","cobolt_result.csv")
    res_df.to_csv(csv_out)

    print("------ Done ------")
    # The recorded runtime covers loading, filtering, training and saving.
    stop = timeit.default_timer()
    print('Time(s): ', stop - start)
    runtime_out = os.path.join(out_dir, "runtime","cobolt_runtime.txt")
    with open(runtime_out, 'w') as runtime_fh:
        print(stop - start,  file=runtime_fh)

    return(model)
72 | 
73 | 
# Script entry point: requires <in_dir> <out_dir>. Validate first so a missing
# argument produces a clear message instead of a bare IndexError traceback.
if len(sys.argv) < 3:
    sys.exit("Insufficient number of arguments are supplied: expected <in_dir> <out_dir>.")
print("argument 1:",sys.argv[1])
print("argument 2:",sys.argv[2])
mvi = run_cobolt_fn(sys.argv[1],sys.argv[2])
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/methods/eval_scib2/scib2_env.txt:
--------------------------------------------------------------------------------
 1 | # command line
 2 | 
 3 | conda create -n scib2 python=3.7
 4 | 
 5 | conda activate scib2
 6 | 
 7 | pip install scanpy leidenalg ipykernel
 8 | 
 9 | conda install r-essentials r-base
10 | # devtools:
11 | conda install -y -c conda-forge r-devtools
12 | conda install -y igraph hdf5
13 | 
14 | # `pip install scib` did not work (iLISI calculation error),
15 | # so install scib from its GitHub repository instead
16 | conda install -c anaconda git
17 | pip install git+https://github.com/theislab/scib.git
18 | # scib functions fail with a missing knn_graph.o file, so compile it manually
19 | conda install -c conda-forge gxx
20 | cd /home/myylee/anaconda3/envs/scib2/lib/python3.7/site-packages/scib/
21 | g++ -std=c++11 -O3 knn_graph.cpp -o knn_graph.o
22 | 
23 | cd ~
24 | python -m ipykernel install --user --name scib2 --display-name "scib2"
25 | conda install -c conda-forge r-rgeos
26 | conda install -c bioconda tabix
27 | pip install bgzip
28 | 
29 | R
30 | install.packages('IRkernel')
31 | IRkernel::installspec(name = 'scib2', displayname = 'rscib2')
32 | 
33 | devtools::install_github('theislab/kBET')
34 | install.packages("BiocManager")
35 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
36 | remotes::install_version("RSQLite", version = "2.2.5")
37 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
38 | BiocManager::install(c("BSgenome.Hsapiens.UCSC.hg38"))
39 | system("conda install -c conda-forge r-rgeos")
40 | 
41 | install.packages("Seurat") 
42 | devtools::install_github('satijalab/seurat-data')
43 | remotes::install_github("mojaveazure/seurat-disk")
44 | 
45 | install.packages("Signac") 
46 | install.packages('qlcMatrix')
47 | 
48 | 


--------------------------------------------------------------------------------
/methods/figr/figr_env.R:
--------------------------------------------------------------------------------
 1 | # command line
 2 | 
 3 | conda create -n figr r-essentials r-base
 4 | 
 5 | conda activate figr
 6 | conda install -y igraph hdf5
 7 | 
 8 | R
 9 | # R
10 | install.packages('Seurat')
11 | install.packages('IRkernel')
12 | IRkernel::installspec(name = 'figr', displayname = 'rfigr')
13 | 
14 | 
15 | system("conda install -y -c conda-forge r-devtools")
16 | install.packages("BiocManager")
17 | library(devtools)
18 | 
19 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
20 | remotes::install_version("RSQLite", version = "2.2.5")
21 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
22 | install.packages("Signac") 
23 | remotes::install_github("mojaveazure/seurat-disk")
24 | 
25 | install.packages("optmatch")
26 | BiocManager::install("chromVAR")
27 | install.packages("pbmcapply")
28 | 
29 | system("conda install -c conda-forge r-ggrastr")
30 | devtools::install_github("caleblareau/BuenColors")
31 | BiocManager::install("ComplexHeatmap")
32 | install.packages(c("networkD3","network","GGally","network"))
33 | 
34 | library(Seurat)
35 | library(Signac)
36 | library(stringr)
37 | 
38 | # FigR itself is obtained by cloning its GitHub repo (the run scripts source the files in its R/ folder):
39 | # git clone https://github.com/buenrostrolab/stimATAC_analyses_code.git
40 | 
41 | 


--------------------------------------------------------------------------------
/methods/figr/run_rfigr_2.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(stringr)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(future)
  9 | source("r_utils.R")
 10 | 
 11 | orig_wd = getwd()
 12 | figr_path = "/home/myylee/scmint/methods_eval/stimATAC_analyses_code/R/"
 13 | setwd(figr_path)
 14 | source("optMatching_functions.R")
 15 | source("DORC_functions.R")
 16 | source("FigR_functions.R")
 17 | 
 18 | setwd(orig_wd)
 19 | 
 20 | options(future.globals.maxSize = 3000*1024^3)
 21 | 
 22 | run_rfigr_fn <- function(in_dir, out_dir){
 23 |     # starting time
 24 |     t1 <- Sys.time()
 25 |     #plan("multisession")
 26 |     #options(future.rng.onMisue = "ignore")
 27 |     print(paste0("number of cores available:",availableCores()))
 28 |     # load dataset 
 29 |     datasets = load_datasets(in_dir)
 30 |     paired_rna = datasets$paired_rna
 31 |     paired_atac = datasets$paired_atac
 32 |     unpaired_rna = datasets$unpaired_rna
 33 |     unpaired_atac = datasets$unpaired_atac
 34 |     
 35 |     # verify number of cells in each condition 
 36 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 37 |                    c(ncol(unpaired_rna),
 38 |                      ncol(unpaired_atac),
 39 |                      ncol(paired_rna),
 40 |                     ncol(paired_atac)))
 41 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 42 |                             paste0(colnames(unpaired_atac)),
 43 |                             paste0("prna_",colnames(paired_rna)),
 44 |                             paste0("patac_",colnames(paired_atac)))
 45 |     print(table(dataset_vec))
 46 |     
 47 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 48 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 49 |     
 50 |     unpaired_atac@meta.data$dataset <- "snATAC"
 51 |     unpaired_rna@meta.data$dataset <- "scRNA"
 52 |     paired_atac@meta.data$dataset <- "Multiome-ATAC"
 53 |     paired_rna@meta.data$dataset <- "Multiome-RNA"
 54 | 
 55 |     # merging
 56 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 57 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 58 | 
 59 |     unpaired_rna <- NormalizeData(unpaired_rna)
 60 |     unpaired_rna <- FindVariableFeatures(unpaired_rna,nfeatures = 5000)
 61 |     # We exclude the first dimension as this is typically correlated with sequencing depth
 62 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 63 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = "q0")
 64 |     unpaired_atac <- RunSVD(unpaired_atac)
 65 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:30, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 66 |     # quantify gene activity
 67 |     gene.activities <- GeneActivity(unpaired_atac)
 68 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 69 |     # normalize gene activities
 70 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 71 |     unpaired_atac <- NormalizeData(unpaired_atac)
 72 |     unpaired_atac <- FindVariableFeatures(unpaired_atac, nfeatures = 5000)
 73 |     unpaired_atac <- ScaleData(unpaired_atac)
 74 | 
 75 |     gene_sel <- intersect(unpaired_rna@assays$RNA@var.features,unpaired_atac@assays$ACTIVITY@var.features)
 76 | 
 77 |     cca_res <- RunCCA(object1 = unpaired_rna, 
 78 |                       object2 = unpaired_atac,
 79 |                       assay1 = "RNA",
 80 |                       assay2 = "ACTIVITY",
 81 |                       num.cc = 30,
 82 |                       features = gene_sel,
 83 |                       renormalize = TRUE,
 84 |                       rescale = TRUE)
 85 |     cca_res <- RunUMAP(cca_res, dims = 1:30,reduction="cca")
 86 |     cell_loading <- cca_res@reductions$cca@cell.embeddings
 87 |     cell_pair <- cell_pairing(cell_loading[colnames(unpaired_rna),],
 88 |                           cell_loading[colnames(unpaired_atac),])
 89 |     # RNA column is the list of RNA cell barcodes, while the ATAC column stores the ATAC profiles. The two lists are in the same order
 90 |     colnames(cell_pair) <- c("RNA","ATAC")
 91 |     print("------ Saving integration result ------")
 92 |     # cca loadings
 93 |     df_umap = as.data.frame(cell_loading)
 94 |     # ===== added =====
 95 |     colnames(df_umap) = paste0("latent_",1:ncol(df_umap))
 96 |     df = cbind(df_umap,dataset=cca_res$dataset)
 97 |     print(table(df$dataset))
 98 |     dir.create(file.path(out_dir,"rfigr"),recursive=TRUE)
 99 |     print("------ Saving integration and cell pairing result ------")
100 |     write.csv(df,file.path(out_dir,"rfigr","rfigr_result.csv"))
101 |     # ASSUMING ONE-TO-ONE cell pairing 
102 |     cell_pair = cell_pair[which(!duplicated(cell_pair$RNA)),]
103 |     cell_pair = cell_pair[which(!duplicated(cell_pair$ATAC)),]
104 |     # saving cell pairs 
105 |     write.csv(cell_pair,file.path(out_dir,"rfigr","cell_pair.csv"))
106 |     t2 <- Sys.time()
107 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE)
108 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
109 |                 file = file.path(out_dir,"runtime","rfigr_runtime.txt"), 
110 |                 sep = "\t",
111 |                 row.names = FALSE,
112 |                 col.names = FALSE)
113 |     print("------ Integration Done ------")
114 |     print("------ Prediction ------")
115 |     # starting time
116 |     t1 <- Sys.time()
117 |     # save prediction result 
118 |     paired_rna <- subset(unpaired_rna, cells=cell_pair$RNA)
119 |     paired_atac <- subset(unpaired_atac, cells=cell_pair$ATAC)
120 | 
121 |     # keep genes expressed by at least 3 cells. Set RNA cells to have ATAC cell barcodes
122 |     rna_counts_mtx <- GetAssayData(object = paired_rna, slot = "counts",assay="RNA")
123 |     colnames(rna_counts_mtx) <- colnames(paired_atac)
124 |     md<-paired_atac@meta.data
125 |     
126 |     paired_comb <- CreateSeuratObject(counts = rna_counts_mtx,min.cells = 3,meta.data = md, assay = "RNA")
127 |     
128 |     atac_counts_mtx <- GetAssayData(object = paired_atac, slot = "counts",assay="ATAC")
129 |     
130 |     chrom_assay <- CreateChromatinAssay(
131 |       counts = atac_counts_mtx,
132 |       sep = c("-", "-"),
133 |       min.cells = 1,
134 |       min.features = 0
135 |     )
136 | 
137 |     paired_comb[["ATAC"]] <- chrom_assay
138 | 
139 |     DefaultAssay(paired_comb)<-'RNA'
140 |     paired_comb <- NormalizeData(paired_comb)
141 |     
142 |     write_mtx_folder(file.path(out_dir,"rfigr","predicted","ATAC"),paired_comb,assay_key="ATAC",slot_key="counts","peak")
143 |     write_mtx_folder(file.path(out_dir,"rfigr","predicted","RNA"),paired_comb,assay_key="RNA",slot_key="data","gene")
144 |     
145 |     t2 <- Sys.time()
146 |     ## use '[[1]]' for clean output
147 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
148 |                 file = file.path(out_dir,"runtime","rfigr_prediction_time.txt"), 
149 |                 sep = "\t",
150 |                 row.names = FALSE,
151 |                 col.names = FALSE)
152 |     print("------ Prediction Done ------")
153 | }
154 | 
# Script entry point: requires <in_dir> <out_dir>. Validate before echoing so a
# missing argument is reported directly instead of first printing "NA".
if (length(args)<2) {
    stop("Insufficient number of arguments are supplied: expected <in_dir> <out_dir>.", call.=FALSE)
}

print(paste0("argument 1: ",args[1]))
print(paste0("argument 2: ",args[2]))

# Extra arguments are tolerated and ignored.
if (length(args)>2) {
    print("More arguments than function needed are supplied, running function with the first two arguments")
}
run_rfigr_fn(args[1], args[2])
166 | 
167 | 


--------------------------------------------------------------------------------
/methods/figr/run_rfigr_2_hpap_sequential.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(stringr)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(future)
  9 | source("r_utils.R")
 10 | 
 11 | orig_wd = getwd()
 12 | figr_path = "/home/myylee/scmint/methods_eval/stimATAC_analyses_code/R/"
 13 | setwd(figr_path)
 14 | source("optMatching_functions.R")
 15 | source("DORC_functions.R")
 16 | source("FigR_functions.R")
 17 | 
 18 | setwd(orig_wd)
 19 | 
 20 | 
 21 | run_rfigr_fn <- function(in_dir, out_dir){
 22 |     # starting time
 23 |     t1 <- Sys.time()
 24 | #     plan("multisession")
 25 | #     options(future.rng.onMisue = "ignore")
 26 | #     print(paste0("workers used:",nbrOfWorkers()))
 27 | #     options(future.globals.maxSize = 8000 * 1024^2)
 28 |     
 29 |     # load dataset 
 30 |     datasets = load_datasets(in_dir,obs=c("barcodes","batch"))
 31 |     paired_rna = datasets$paired_rna
 32 |     paired_atac = datasets$paired_atac
 33 |     unpaired_rna = datasets$unpaired_rna
 34 |     unpaired_atac = datasets$unpaired_atac
 35 |     
 36 |     # verify number of cells in each condition 
 37 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 38 |                    c(ncol(unpaired_rna),
 39 |                      ncol(unpaired_atac),
 40 |                      ncol(paired_rna),
 41 |                     ncol(paired_atac)))
 42 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 43 |                             paste0(colnames(unpaired_atac)),
 44 |                             paste0("prna_",colnames(paired_rna)),
 45 |                             paste0("patac_",colnames(paired_atac)))
 46 |     print(table(dataset_vec))
 47 |     
 48 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 49 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 50 |     
 51 |     unpaired_atac@meta.data$technology <- "snATAC"
 52 |     unpaired_rna@meta.data$technology <- "scRNA"
 53 |     paired_atac@meta.data$technology <- "Multiome-ATAC"
 54 |     paired_rna@meta.data$technology <- "Multiome-RNA"
 55 | 
 56 |     # merging
 57 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 58 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 59 |     
 60 |     unpaired_rna@meta.data$group <- paste0(unpaired_rna$batch,"_",unpaired_rna$technology)
 61 |     unpaired_atac@meta.data$group <- paste0(unpaired_atac$batch,"_",unpaired_atac$technology)
 62 | 
 63 |     # Normalize gene expression and obtain highly variable genes 
 64 |     DefaultAssay(unpaired_rna) <- "RNA"
 65 |     unpaired_rna <- NormalizeData(unpaired_rna)
 66 |     rna.list <- SplitObject(unpaired_rna, split.by = "group")
 67 |         #select high variable features across samples 
 68 |         features = SelectIntegrationFeatures(
 69 |           rna.list,
 70 |           nfeatures = 5000,
 71 |           verbose = TRUE,
 72 |           fvf.nfeatures = 10000,
 73 |         )
 74 |     unpaired_rna@assays$RNA@var.features = features
 75 |     unpaired_rna <- ScaleData(unpaired_rna,features=features,split.by='group')
 76 |     
 77 |     # quantify gene activity
 78 |     gene.activities <- GeneActivity(unpaired_atac)
 79 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 80 |     # normalize gene activities
 81 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 82 |     unpaired_atac <- NormalizeData(unpaired_atac)
 83 |     gene.activity.list <- SplitObject(unpaired_atac, split.by = "group")
 84 |         #select high variable features across samples 
 85 |     features2 = SelectIntegrationFeatures(
 86 |           gene.activity.list,
 87 |           nfeatures = 5000,
 88 |           verbose = TRUE,
 89 |           fvf.nfeatures = 10000,
 90 |     )
 91 |     unpaired_atac@assays$ACTIVITY@var.features = features2
 92 |     unpaired_atac <- ScaleData(unpaired_atac,split.by='group')
 93 | 
 94 |     gene_sel <- intersect(unpaired_rna@assays$RNA@var.features,unpaired_atac@assays$ACTIVITY@var.features)
 95 | 
 96 |     cca_res <- RunCCA(object1 = unpaired_rna, 
 97 |                       object2 = unpaired_atac,
 98 |                       assay1 = "RNA",
 99 |                       assay2 = "ACTIVITY",
100 |                       num.cc = 30,
101 |                       features = gene_sel,
102 |                       renormalize = FALSE,
103 |                       rescale = FALSE)
104 |     cca_res <- RunUMAP(cca_res, dims = 1:30,reduction="cca")
105 |     cell_loading <- cca_res@reductions$cca@cell.embeddings
106 |     # cell_pair <- cell_pairing(cell_loading[colnames(unpaired_rna),],
107 |     #                       cell_loading[colnames(unpaired_atac),])
108 |     # RNA column is the list of RNA cell barcodes, while the ATAC column stores the ATAC profiles. The two lists are in the same order
109 |     #colnames(cell_pair) <- c("RNA","ATAC")
110 |     print("------ Saving integration result ------")
111 |     # cca loadings
112 |     df_umap = as.data.frame(cell_loading)
113 |     # ===== added =====
114 |     colnames(df_umap) = paste0("latent_",1:ncol(df_umap))
115 |     df = cbind(df_umap,dataset=cca_res$technology)
116 |     print(table(df$dataset))
117 |     dir.create(file.path(out_dir,"rfigr"),recursive=TRUE)
118 |     print("------ Saving integration and cell pairing result ------")
119 |     write.csv(df,file.path(out_dir,"rfigr","rfigr_result.csv"))
120 |     # # ASSUMING ONE-TO-ONE cell pairing 
121 |     # cell_pair = cell_pair[which(!duplicated(cell_pair$RNA)),]
122 |     # cell_pair = cell_pair[which(!duplicated(cell_pair$ATAC)),]
123 |     # # saving cell pairs 
124 |     # write.csv(cell_pair,file.path(out_dir,"rfigr","cell_pair.csv"))
125 |     t2 <- Sys.time()
126 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE)
127 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
128 |                 file = file.path(out_dir,"runtime","rfigr_runtime.txt"), 
129 |                 sep = "\t",
130 |                 row.names = FALSE,
131 |                 col.names = FALSE)
132 |     print("------ Integration Done ------")
133 |     print("------ No prediction ------")
134 | 
135 |     
136 | }
137 | 
# Script entry point: requires <in_dir> <out_dir>. Validate before echoing so a
# missing argument is reported directly instead of first printing "NA".
if (length(args)<2) {
    stop("Insufficient number of arguments are supplied: expected <in_dir> <out_dir>.", call.=FALSE)
}

print(paste0("argument 1: ",args[1]))
print(paste0("argument 2: ",args[2]))

# Extra arguments are tolerated and ignored.
if (length(args)>2) {
    print("More arguments than function needed are supplied, running function with the first two arguments")
}
run_rfigr_fn(args[1], args[2])
149 | 
150 | 


--------------------------------------------------------------------------------
/methods/figr/run_rfigr_single.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(stringr)
  6 | require(Signac)
  7 | require(Matrix)
  8 | require(future)
  9 | source("r_utils.R")
 10 | 
 11 | orig_wd = getwd()
 12 | figr_path = "/home/myylee/scmint/methods_eval/stimATAC_analyses_code/R/"
 13 | setwd(figr_path)
 14 | source("optMatching_functions.R")
 15 | source("DORC_functions.R")
 16 | source("FigR_functions.R")
 17 | 
 18 | setwd(orig_wd)
 19 | 
 20 | options(future.globals.maxSize = 3000*1024^3)
 21 | 
 22 | run_rfigr_fn <- function(in_dir, out_dir){
 23 |     # Integrate unpaired scRNA and snATAC cells with CCA, pair them via cell_pairing(), and write results plus timings under out_dir.
 24 | #     # try system.path 
 25 | #     dir.create(system.file(out_dir,"rfigr", package="RcppBDT"),recursive=TRUE)
 26 | #     print("------ Saving integration and cell pairing result ------")
 27 | #     write.csv(c(1,2,3,4),system.file(out_dir,"rfigr","rfigr_result.csv", package="RcppBDT"))
 28 |     # in_dir is handed to load_datasets(); out_dir receives the "rfigr" and "runtime" subfolders
 29 |     # start the integration timer
 30 |     t1 <- Sys.time()
 31 |     #plan("multisession")
 32 |     #options(future.rng.onMisue = "ignore")
 33 |     print(paste0("workers used:",nbrOfWorkers()))
 34 |     
 35 |     datasets = load_datasets(in_dir)
 36 |     print("loading single modality datasets only, ignoring paired RNA and paired ATAC folder")
 37 |     unpaired_rna=datasets$unpaired_rna
 38 |     unpaired_atac=datasets$unpaired_atac
 39 | 
 40 |     # print number of cells per data type
 41 |     dataset_vec <- rep(c("scRNA","snATAC"),
 42 |                        c(ncol(unpaired_rna),
 43 |                          ncol(unpaired_atac)))
 44 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 45 |                             paste0(colnames(unpaired_atac)))
 46 |     print(table(dataset_vec))
 47 |     
 48 |     unpaired_atac@meta.data$dataset <- "snATAC"
 49 |     unpaired_rna@meta.data$dataset <- "scRNA"
 50 |     
 51 |     # merging with the paired data is disabled in this single-modality variant
 52 |     #unpaired_rna <- merge(unpaired_rna,paired_rna, add.cell.ids = c("urna", "prna"))
 53 |     #unpaired_atac <- merge(unpaired_atac,paired_atac, add.cell.ids = c("uatac", "patac"))
 54 |     
 55 |     unpaired_rna <- NormalizeData(unpaired_rna)
 56 |     unpaired_rna <- FindVariableFeatures(unpaired_rna,nfeatures = 5000)
 57 |     # ATAC: TF-IDF then SVD; the UMAP below uses dims 2:30, excluding the first dimension as it is typically correlated with sequencing depth
 58 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 59 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = "q0")
 60 |     unpaired_atac <- RunSVD(unpaired_atac)
 61 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:30, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 62 |     # quantify gene activity and store it as a separate "ACTIVITY" assay
 63 |     gene.activities <- GeneActivity(unpaired_atac)
 64 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 65 |     # normalize gene activities
 66 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 67 |     unpaired_atac <- NormalizeData(unpaired_atac)
 68 |     unpaired_atac <- FindVariableFeatures(unpaired_atac, nfeatures = 5000)
 69 |     unpaired_atac <- ScaleData(unpaired_atac)
 70 |     # features shared by the variable-feature sets of the RNA assay and the ACTIVITY assay
 71 |     gene_sel <- intersect(unpaired_rna@assays$RNA@var.features,unpaired_atac@assays$ACTIVITY@var.features)
 72 | 
 73 |     cca_res <- RunCCA(object1 = unpaired_rna, 
 74 |                       object2 = unpaired_atac,
 75 |                       assay1 = "RNA",
 76 |                       assay2 = "ACTIVITY",
 77 |                       num.cc = 30,
 78 |                       features = gene_sel,
 79 |                       renormalize = TRUE,
 80 |                       rescale = TRUE)
 81 |     cca_res <- RunUMAP(cca_res, dims = 1:30,reduction="cca")
 82 |     cell_loading <- cca_res@reductions$cca@cell.embeddings
 83 |     cell_pair <- cell_pairing(cell_loading[colnames(unpaired_rna),],
 84 |                           cell_loading[colnames(unpaired_atac),])
 85 |     # The RNA column lists RNA cell barcodes and the ATAC column lists the matched ATAC cell barcodes; the two lists are in the same order
 86 |     colnames(cell_pair) <- c("RNA","ATAC")
 87 |     print("------ Saving integration result ------")
 88 |     # CCA cell embeddings, one row per cell
 89 |     df_umap = as.data.frame(cell_loading)
 90 |     # ===== added: name the embedding columns latent_1 ... latent_k =====
 91 |     colnames(df_umap) = paste0("latent_",1:ncol(df_umap))
 92 |     df = cbind(df_umap,dataset=cca_res$dataset)
 93 |     print(table(df$dataset))
 94 |     dir.create(file.path(out_dir,"rfigr"),recursive=TRUE)
 95 |     #dir.create(file.path(paste0(out_dir,"rfigr/"),recursive=TRUE)
 96 |     print("------ Saving integration and cell pairing result ------")
 97 |     write.csv(df,file.path(out_dir,"rfigr","rfigr_result.csv"))
 98 |     # ASSUMING ONE-TO-ONE cell pairing: drop rows that repeat an RNA barcode, then rows that repeat an ATAC barcode
 99 |     cell_pair = cell_pair[which(!duplicated(cell_pair$RNA)),]
100 |     cell_pair = cell_pair[which(!duplicated(cell_pair$ATAC)),]
101 |     # saving cell pairs 
102 |     write.csv(cell_pair,file.path(out_dir,"rfigr","cell_pair.csv"))
103 |     t2 <- Sys.time()
104 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE)
105 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
106 |                 file = file.path(out_dir,"runtime","rfigr_runtime.txt"), 
107 |                 sep = "\t",
108 |                 row.names = FALSE,
109 |                 col.names = FALSE)
110 |     print("------ Integration Done ------")
111 |     print("------ Prediction ------")
112 |     # restart the timer so the prediction step is timed separately
113 |     t1 <- Sys.time()
114 |     # build a pseudo-paired object from the matched cells and save it as the prediction result
115 |     paired_rna <- subset(unpaired_rna, cells=cell_pair$RNA)
116 |     paired_atac <- subset(unpaired_atac, cells=cell_pair$ATAC)
117 |     # NOTE(review): the renaming below assumes subset() returns cells in the row order of cell_pair - confirm
118 |     # keep genes expressed by at least 3 cells. Set RNA cells to have ATAC cell barcodes
119 |     rna_counts_mtx <- GetAssayData(object = paired_rna, slot = "counts",assay="RNA")
120 |     colnames(rna_counts_mtx) <- colnames(paired_atac)
121 |     md<-paired_atac@meta.data
122 |     
123 |     paired_comb <- CreateSeuratObject(counts = rna_counts_mtx,min.cells = 3,meta.data = md, assay = "RNA")
124 |     
125 |     atac_counts_mtx <- GetAssayData(object = paired_atac, slot = "counts",assay="ATAC")
126 |     
127 |     chrom_assay <- CreateChromatinAssay(
128 |       counts = atac_counts_mtx,
129 |       sep = c("-", "-"),
130 |       min.cells = 1,
131 |       min.features = 0
132 |     )
133 | 
134 |     paired_comb[["ATAC"]] <- chrom_assay
135 | 
136 |     DefaultAssay(paired_comb)<-'RNA'
137 |     paired_comb <- NormalizeData(paired_comb)
138 |     # ATAC is written from the "counts" slot, RNA from the normalized "data" slot
139 |     write_mtx_folder(file.path(out_dir,"rfigr","predicted","ATAC"),paired_comb,assay_key="ATAC",slot_key="counts","peak")
140 |     write_mtx_folder(file.path(out_dir,"rfigr","predicted","RNA"),paired_comb,assay_key="RNA",slot_key="data","gene")
141 |     
142 |     t2 <- Sys.time()
143 |     ## use '[[1]]' for clean output
144 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
145 |                 file = file.path(out_dir,"runtime","rfigr_prediction_time.txt"), 
146 |                 sep = "\t",
147 |                 row.names = FALSE,
148 |                 col.names = FALSE)
149 |     print("------ Prediction Done ------")
150 | }
151 | 
152 | print(paste0("argument 1: ",args[1]))
153 | print(paste0("argument 2: ",args[2]))
154 | # Two positional arguments are required: the input directory and the output directory.
155 | if (length(args)<2) {
156 |   stop("Insufficient number of arguments are supplied (expected: in_dir out_dir).\n", call.=FALSE)
157 | }else if(length(args)==2) {
158 |     run_rfigr_fn(args[1], args[2])
159 | }else{
160 |     print("More arguments than function needed are supplied, running function with the first two arguments")
161 |     run_rfigr_fn(args[1], args[2])
162 | }
163 | 
164 | 


--------------------------------------------------------------------------------
/methods/glue/glue_env.txt:
--------------------------------------------------------------------------------
 1 | # Command-line steps used to set up the conda environments for scGLUE
 2 | 
 3 | # glue2 environment: try installing glue again
 4 | conda create -n glue2 python=3.8
 5 | 
 6 | conda activate glue2
 7 | 
 8 | # install using pip instead of conda
 9 | pip install scglue
10 | 
11 | pip install leidenalg ipykernel
12 | 
13 | python -m ipykernel install --user --name glue2 --display-name "glue2"
14 | 
15 | conda install -c pytorch faiss-cpu
16 | 
17 | pip install numpy==1.23.5
18 | 
19 | # glue_gpu environment: clone of glue2 with pytorch-gpu added
20 | 
21 | conda create --name glue_gpu --clone glue2
22 | conda activate glue_gpu
23 | conda install -c conda-forge -c bioconda pytorch-gpu
24 | python -m ipykernel install --user --name glue_gpu --display-name "glue_gpu"
25 | 
26 | # glueGPU environment: try installing glue_gpu again, this time with scglue and pytorch-gpu from conda
27 | conda create -n glueGPU python=3.8
28 | 
29 | conda activate glueGPU
30 | 
31 | conda install -c conda-forge -c bioconda scglue pytorch-gpu  # With GPU support
32 | 
33 | pip install leidenalg ipykernel
34 | 
35 | python -m ipykernel install --user --name glueGPU --display-name "glueGPU"
36 | 
37 | conda install -c pytorch faiss-gpu
38 | 
39 | # pip install numpy==1.23.5
40 | 
41 | 


--------------------------------------------------------------------------------
/methods/glue/run_glue_hg38.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys 
  4 | print('Number of arguments:', len(sys.argv), 'arguments.')
  5 | print('Argument List:'+ str(sys.argv))
  6 | 
  7 | #==== method specific ==== 
  8 | import networkx as nx
  9 | import scglue
 10 | from itertools import chain
 11 | import seaborn as sns
 12 | from matplotlib import rcParams
 13 | 
 14 | #from matplotlib import rcParams
 15 | from anndata import AnnData
 16 | import anndata as ad
 17 | import scipy
 18 | import numpy as np
 19 | import pandas as pd
 20 | import scipy.io as sio
 21 | import os
 22 | import scanpy as sc
 23 | from copy import deepcopy
 24 | from utils_eval import read_mtx_folder, write_adata
 25 | import timeit
 26 | 
 27 | def run_glue_fn(in_dir,out_dir):
 28 |     start = timeit.default_timer()
 29 |     
 30 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 31 |                                        "Gene Expression",
 32 |                                        ["gene"],
 33 |                                        ["barcodes"])
 34 | 
 35 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 36 |                                        "Peaks",
 37 |                                        ["peak"],
 38 |                                        ["barcodes"])
 39 | 
 40 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 41 |                                        "Gene Expression",
 42 |                                        ["gene"],
 43 |                                        ["barcodes"])
 44 | 
 45 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 46 |                                        "Peaks",
 47 |                                        ["peak"],
 48 |                                        ["barcodes"])
 49 |     
 50 |     adata_prna.obs['dataset'] = 'multiomeRNA'
 51 |     adata_patac.obs['dataset'] = 'multiomeATAC'
 52 |     adata_urna.obs['dataset'] = 'scRNA'
 53 |     adata_uatac.obs['dataset'] = 'snATAC'
 54 |     
 55 |     rna = ad.concat([adata_prna, adata_urna])
 56 |     atac = ad.concat([adata_patac, adata_uatac])
 57 | 
 58 |     os.makedirs(out_dir, exist_ok=True)
 59 |     os.makedirs(os.path.join(out_dir,"glue"), exist_ok=True)
 60 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 61 |     
 62 |     # preprocessing of scRNA
 63 |     rna.layers["counts"] = rna.X.copy()
 64 |     sc.pp.filter_genes(rna, min_cells=3)
 65 |     sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 66 |     sc.pp.normalize_total(rna)
 67 |     sc.pp.log1p(rna)
 68 |     sc.pp.scale(rna)
 69 |     sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 70 |     
 71 |     # preprocessing of snATAC
 72 |     sc.pp.filter_genes(atac,min_counts=1)
 73 |     scglue.data.lsi(atac, n_components=100, n_iter=15)
 74 | 
 75 |     # build graph
 76 |     scglue.data.get_gene_annotation(
 77 |         # this works for human hg38 genome-build 
 78 |         rna, gtf="/home/myylee/scmint/methods_eval/GRCg38_genes.gtf.gz",
 79 |         gtf_by="gene_name"
 80 |     )
 81 |     rna.var.loc[:, ["chrom", "chromStart", "chromEnd"]].head()
 82 |     
 83 |     split = atac.var_names.str.split(r"[--]")
 84 |     atac.var["chrom"] = split.map(lambda x: x[0])
 85 |     atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
 86 |     atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)
 87 |     atac_chrs = atac.var['chrom'].value_counts().index.tolist()
 88 |     row_keep = rna.var_names[rna.var['chrom'].isin(atac_chrs).tolist()]
 89 |     rna = rna[:,row_keep].copy()
 90 |     guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 91 |     scglue.graph.check_graph(guidance, [rna, atac])
 92 |     
 93 |     # prepare for training 
 94 |     scglue.models.configure_dataset(
 95 |         rna, "NB", use_highly_variable=True,
 96 |         use_layer="counts", use_rep="X_pca"
 97 |     )
 98 |     scglue.models.configure_dataset(
 99 |         atac, "NB", use_highly_variable=True,
100 |         use_rep="X_lsi"
101 |     )
102 | 
103 |     guidance_hvf = guidance.subgraph(chain(
104 |         rna.var.query("highly_variable").index,
105 |         atac.var.query("highly_variable").index
106 |     )).copy()
107 |     
108 |     # GLUE training 
109 |     glue = scglue.models.fit_SCGLUE(
110 |         {"rna": rna, "atac": atac}, guidance_hvf,
111 |         fit_kws={"directory": os.path.join(out_dir,"glue")}
112 |     )
113 |     
114 |     dx = scglue.models.integration_consistency(
115 |         glue, {"rna": rna, "atac": atac}, guidance_hvf
116 |     )
117 |     print(dx)
118 |     rna.obsm["X_glue"] = glue.encode_data("rna", rna)
119 |     atac.obsm["X_glue"] = glue.encode_data("atac", atac)
120 |     combined = ad.concat([rna, atac])
121 | 
122 |     # extract latent representation
123 |     res_df = pd.DataFrame(combined.obsm['X_glue'],index=combined.obs.index)
124 |     # set column names as latent_x 
125 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
126 |     res_df['dataset'] = combined.obs['dataset']
127 |     res_df['dataset'] = res_df['dataset'].astype("string")
128 |     
129 |     # save latent representation and model
130 |     
131 |     csv_out = os.path.join(out_dir, "glue","glue_result.csv")
132 |     res_df.to_csv(csv_out)
133 |     model_out = os.path.join(out_dir,"glue","glue.dill")
134 |     glue.save(model_out)
135 |     stop = timeit.default_timer()
136 |     
137 |     print('Time(s): ', stop - start)  
138 |     # record time 
139 |     runtime_out = os.path.join(out_dir,"runtime","glue_runtime.txt")
140 |     print(stop - start,  file=open(runtime_out, 'w'))
141 |     print("------ Done ------")
142 |     print("------ Prediction ------")
143 |     start = timeit.default_timer()
144 |     # get imputated gene expression 
145 |     feature_embeddings = glue.encode_graph(guidance_hvf)
146 |     feature_embeddings = pd.DataFrame(feature_embeddings, index=glue.vertices)
147 |     feature_embeddings.iloc[:5, :5]
148 |     rna.varm["X_glue"] = feature_embeddings.reindex(rna.var_names).to_numpy()
149 |     atac.varm["X_glue"] = feature_embeddings.reindex(atac.var_names).to_numpy()
150 |     rna.var["name"] = rna.var_names
151 |     atac.var["name"] = atac.var_names
152 |     genes = rna.var.query("highly_variable").index
153 |     peaks = atac.var.query("highly_variable").index
154 |     features = pd.Index(np.concatenate([rna.var_names, atac.var_names]))
155 |     feature_embeddings = np.concatenate([rna.varm["X_glue"], atac.varm["X_glue"]])
156 |     skeleton = guidance_hvf.edge_subgraph(
157 |         e for e, attr in dict(guidance_hvf.edges).items()
158 |         if attr["type"] == "fwd"
159 |     ).copy()
160 |     reginf = scglue.genomics.regulatory_inference(
161 |         features, feature_embeddings,
162 |         skeleton=skeleton, random_state=0
163 |     )
164 |     # gene2peak = reginf.edge_subgraph(
165 |     #     e for e, attr in dict(reginf.edges).items()
166 |     #     if attr["qval"] < 0.05
167 |     # )
168 |     gene2peak_out = os.path.join(out_dir, "glue","gene2peak_all.csv")
169 |     nx.to_pandas_edgelist(reginf).to_csv(gene2peak_out)
170 |     stop = timeit.default_timer()
171 |     print('Time(s): ', stop - start)  
172 |     # save prediction time 
173 |     # this is the inference time for peak-to-gene pair. Much shorter than the other ones that need to run the SHARE-seq pairing using imputed gene expression. 
174 |     prediction_time_out = os.path.join(out_dir, "runtime","glue_prediction_time.txt")
175 |     print(stop - start,  file=open(prediction_time_out, 'w'))
176 |     print("------ Prediction Done ------")
177 | 
178 | print("argument 1:",sys.argv[1])
179 | print("argument 2:",sys.argv[2])
180 | # the first positional argument is passed as in_dir, the second as out_dir
181 | run_glue_fn(sys.argv[1],sys.argv[2])
182 | 
183 | 


--------------------------------------------------------------------------------
/methods/glue/run_glue_hg38_noPred.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys 
  4 | print('Number of arguments:', len(sys.argv), 'arguments.')
  5 | print('Argument List:'+ str(sys.argv))
  6 | 
  7 | #==== method specific ==== 
  8 | import networkx as nx
  9 | import scglue
 10 | from itertools import chain
 11 | import seaborn as sns
 12 | from matplotlib import rcParams
 13 | 
 14 | #from matplotlib import rcParams
 15 | from anndata import AnnData
 16 | import anndata as ad
 17 | import scipy
 18 | import numpy as np
 19 | import pandas as pd
 20 | import scipy.io as sio
 21 | import os
 22 | import scanpy as sc
 23 | from copy import deepcopy
 24 | from utils_eval import read_mtx_folder, write_adata
 25 | import timeit
 26 | 
 27 | def run_glue_fn(in_dir,out_dir):
 28 |     start = timeit.default_timer()
 29 |     
 30 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 31 |                                        "Gene Expression",
 32 |                                        ["gene"],
 33 |                                        ["barcodes"])
 34 | 
 35 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 36 |                                        "Peaks",
 37 |                                        ["peak"],
 38 |                                        ["barcodes"])
 39 | 
 40 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 41 |                                        "Gene Expression",
 42 |                                        ["gene"],
 43 |                                        ["barcodes"])
 44 | 
 45 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 46 |                                        "Peaks",
 47 |                                        ["peak"],
 48 |                                        ["barcodes"])
 49 |     
 50 |     adata_prna.obs['dataset'] = 'multiomeRNA'
 51 |     adata_patac.obs['dataset'] = 'multiomeATAC'
 52 |     adata_urna.obs['dataset'] = 'scRNA'
 53 |     adata_uatac.obs['dataset'] = 'snATAC'
 54 |     
 55 |     rna = ad.concat([adata_prna, adata_urna])
 56 |     atac = ad.concat([adata_patac, adata_uatac])
 57 | 
 58 |     os.makedirs(out_dir, exist_ok=True)
 59 |     os.makedirs(os.path.join(out_dir,"glue"), exist_ok=True)
 60 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 61 |     
 62 |     # preprocessing of scRNA
 63 |     rna.layers["counts"] = rna.X.copy()
 64 |     sc.pp.filter_genes(rna, min_cells=3)
 65 |     sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 66 |     sc.pp.normalize_total(rna)
 67 |     sc.pp.log1p(rna)
 68 |     sc.pp.scale(rna)
 69 |     sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 70 |     
 71 |     # preprocessing of snATAC
 72 |     sc.pp.filter_genes(atac,min_counts=1)
 73 |     scglue.data.lsi(atac, n_components=100, n_iter=15)
 74 | 
 75 |     # build graph
 76 |     scglue.data.get_gene_annotation(
 77 |         # this works for human hg38 genome-build 
 78 |         rna, gtf="/home/myylee/scmint/methods_eval/GRCg38_genes.gtf.gz",
 79 |         gtf_by="gene_name"
 80 |     )
 81 |     rna.var.loc[:, ["chrom", "chromStart", "chromEnd"]].head()
 82 |     
 83 |     split = atac.var_names.str.split(r"[--]")
 84 |     atac.var["chrom"] = split.map(lambda x: x[0])
 85 |     atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
 86 |     atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)
 87 |     atac_chrs = atac.var['chrom'].value_counts().index.tolist()
 88 |     row_keep = rna.var_names[rna.var['chrom'].isin(atac_chrs).tolist()]
 89 |     rna = rna[:,row_keep].copy()
 90 |     guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 91 |     scglue.graph.check_graph(guidance, [rna, atac])
 92 |     
 93 |     # prepare for training 
 94 |     scglue.models.configure_dataset(
 95 |         rna, "NB", use_highly_variable=True,
 96 |         use_layer="counts", use_rep="X_pca"
 97 |     )
 98 |     scglue.models.configure_dataset(
 99 |         atac, "NB", use_highly_variable=True,
100 |         use_rep="X_lsi"
101 |     )
102 | 
103 |     guidance_hvf = guidance.subgraph(chain(
104 |         rna.var.query("highly_variable").index,
105 |         atac.var.query("highly_variable").index
106 |     )).copy()
107 |     
108 |     # GLUE training 
109 |     glue = scglue.models.fit_SCGLUE(
110 |         {"rna": rna, "atac": atac}, guidance_hvf,
111 |         fit_kws={"directory": os.path.join(out_dir,"glue")}
112 |     )
113 |     
114 |     dx = scglue.models.integration_consistency(
115 |         glue, {"rna": rna, "atac": atac}, guidance_hvf
116 |     )
117 |     print(dx)
118 |     rna.obsm["X_glue"] = glue.encode_data("rna", rna)
119 |     atac.obsm["X_glue"] = glue.encode_data("atac", atac)
120 |     combined = ad.concat([rna, atac])
121 | 
122 |     # extract latent representation
123 |     res_df = pd.DataFrame(combined.obsm['X_glue'],index=combined.obs.index)
124 |     # set column names as latent_x 
125 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
126 |     res_df['dataset'] = combined.obs['dataset']
127 |     res_df['dataset'] = res_df['dataset'].astype("string")
128 |     
129 |     # save latent representation and model
130 |     
131 |     csv_out = os.path.join(out_dir, "glue","glue_result.csv")
132 |     res_df.to_csv(csv_out)
133 |     model_out = os.path.join(out_dir,"glue","glue.dill")
134 |     glue.save(model_out)
135 |     stop = timeit.default_timer()
136 |     
137 |     print('Time(s): ', stop - start)  
138 |     # record time 
139 |     runtime_out = os.path.join(out_dir,"runtime","glue_runtime.txt")
140 |     print(stop - start,  file=open(runtime_out, 'w'))
141 |     print("------ Done ------")
142 |     print("------ No prediction ------")
143 | 
144 | print("argument 1:",sys.argv[1])
145 | print("argument 2:",sys.argv[2])
146 | # the first positional argument is passed as in_dir, the second as out_dir
147 | run_glue_fn(sys.argv[1],sys.argv[2])
148 | 
149 | 


--------------------------------------------------------------------------------
/methods/glue/run_glue_hg38_single.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys 
  4 | print('Number of arguments:', len(sys.argv), 'arguments.')
  5 | print('Argument List:'+ str(sys.argv))
  6 | 
  7 | #==== method specific ==== 
  8 | import networkx as nx
  9 | import scglue
 10 | from itertools import chain
 11 | import seaborn as sns
 12 | from matplotlib import rcParams
 13 | 
 14 | #from matplotlib import rcParams
 15 | from anndata import AnnData
 16 | import anndata as ad
 17 | import scipy
 18 | import numpy as np
 19 | import pandas as pd
 20 | import scipy.io as sio
 21 | import os
 22 | import scanpy as sc
 23 | from copy import deepcopy
 24 | from utils_eval import read_mtx_folder, write_adata
 25 | import timeit
 26 | 
 27 | def run_glue_fn(in_dir,out_dir):
 28 |     start = timeit.default_timer()
 29 |     
 30 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 31 |                                        "Gene Expression",
 32 |                                        ["gene"],
 33 |                                        ["barcodes"])
 34 | 
 35 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 36 |                                        "Peaks",
 37 |                                        ["peak"],
 38 |                                        ["barcodes"])
 39 | 
 40 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 41 |                                        "Gene Expression",
 42 |                                        ["gene"],
 43 |                                        ["barcodes"])
 44 | 
 45 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 46 |                                        "Peaks",
 47 |                                        ["peak"],
 48 |                                        ["barcodes"])
 49 |     
 50 |     adata_prna.obs['dataset'] = 'multiomeRNA'
 51 |     adata_patac.obs['dataset'] = 'multiomeATAC'
 52 |     adata_urna.obs['dataset'] = 'scRNA'
 53 |     adata_uatac.obs['dataset'] = 'snATAC'
 54 |     
 55 |     rna = adata_urna
 56 |     atac = adata_uatac
 57 |     os.makedirs(out_dir, exist_ok=True)
 58 |     os.makedirs(os.path.join(out_dir,"glue"), exist_ok=True)
 59 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 60 |     
 61 |     # preprocessing of scRNA
 62 |     rna.layers["counts"] = rna.X.copy()
 63 |     sc.pp.filter_genes(rna, min_cells=3)
 64 |     sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 65 |     sc.pp.normalize_total(rna)
 66 |     sc.pp.log1p(rna)
 67 |     sc.pp.scale(rna)
 68 |     sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 69 |     
 70 |     # preprocessing of snATAC
 71 |     sc.pp.filter_genes(atac,min_counts=1)
 72 |     scglue.data.lsi(atac, n_components=100, n_iter=15)
 73 | 
 74 |     # build graph
 75 |     scglue.data.get_gene_annotation(
 76 |         # this works for human hg38 genome-build 
 77 |         rna, gtf="/home/myylee/scmint/methods_eval/GRCg38_genes.gtf.gz",
 78 |         gtf_by="gene_name"
 79 |     )
 80 |     rna.var.loc[:, ["chrom", "chromStart", "chromEnd"]].head()
 81 |     
 82 |     split = atac.var_names.str.split(r"[--]")
 83 |     atac.var["chrom"] = split.map(lambda x: x[0])
 84 |     atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
 85 |     atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)
 86 |     atac_chrs = atac.var['chrom'].value_counts().index.tolist()
 87 |     row_keep = rna.var_names[rna.var['chrom'].isin(atac_chrs).tolist()]
 88 |     rna = rna[:,row_keep].copy()
 89 |     guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 90 |     scglue.graph.check_graph(guidance, [rna, atac])
 91 |     
 92 |     # prepare for training 
 93 |     scglue.models.configure_dataset(
 94 |         rna, "NB", use_highly_variable=True,
 95 |         use_layer="counts", use_rep="X_pca"
 96 |     )
 97 |     scglue.models.configure_dataset(
 98 |         atac, "NB", use_highly_variable=True,
 99 |         use_rep="X_lsi"
100 |     )
101 | 
102 |     guidance_hvf = guidance.subgraph(chain(
103 |         rna.var.query("highly_variable").index,
104 |         atac.var.query("highly_variable").index
105 |     )).copy()
106 |     
107 |     # GLUE training 
108 |     glue = scglue.models.fit_SCGLUE(
109 |         {"rna": rna, "atac": atac}, guidance_hvf,
110 |         fit_kws={"directory": os.path.join(out_dir,"glue")}
111 |     )
112 |     
113 |     dx = scglue.models.integration_consistency(
114 |         glue, {"rna": rna, "atac": atac}, guidance_hvf
115 |     )
116 |     print(dx)
117 |     rna.obsm["X_glue"] = glue.encode_data("rna", rna)
118 |     atac.obsm["X_glue"] = glue.encode_data("atac", atac)
119 |     combined = ad.concat([rna, atac])
120 | 
121 |     # extract latent representation
122 |     res_df = pd.DataFrame(combined.obsm['X_glue'],index=combined.obs.index)
123 |     # set column names as latent_x 
124 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
125 |     res_df['dataset'] = combined.obs['dataset']
126 |     res_df['dataset'] = res_df['dataset'].astype("string")
127 |     
128 |     # save latent representation and model
129 |     
130 |     csv_out = os.path.join(out_dir, "glue","glue_result.csv")
131 |     res_df.to_csv(csv_out)
132 |     model_out = os.path.join(out_dir,"glue","glue.dill")
133 |     glue.save(model_out)
134 |     stop = timeit.default_timer()
135 |     
136 |     print('Time(s): ', stop - start)  
137 |     # record time 
138 |     runtime_out = os.path.join(out_dir,"runtime","glue_runtime.txt")
139 |     print(stop - start,  file=open(runtime_out, 'w'))
140 |     print("------ Done ------")
141 |     print("------ Prediction ------")
142 |     start = timeit.default_timer()
143 |     # get imputated gene expression 
144 |     feature_embeddings = glue.encode_graph(guidance_hvf)
145 |     feature_embeddings = pd.DataFrame(feature_embeddings, index=glue.vertices)
146 |     feature_embeddings.iloc[:5, :5]
147 |     rna.varm["X_glue"] = feature_embeddings.reindex(rna.var_names).to_numpy()
148 |     atac.varm["X_glue"] = feature_embeddings.reindex(atac.var_names).to_numpy()
149 |     rna.var["name"] = rna.var_names
150 |     atac.var["name"] = atac.var_names
151 |     genes = rna.var.query("highly_variable").index
152 |     peaks = atac.var.query("highly_variable").index
153 |     features = pd.Index(np.concatenate([rna.var_names, atac.var_names]))
154 |     feature_embeddings = np.concatenate([rna.varm["X_glue"], atac.varm["X_glue"]])
155 |     skeleton = guidance_hvf.edge_subgraph(
156 |         e for e, attr in dict(guidance_hvf.edges).items()
157 |         if attr["type"] == "fwd"
158 |     ).copy()
159 |     reginf = scglue.genomics.regulatory_inference(
160 |         features, feature_embeddings,
161 |         skeleton=skeleton, random_state=0
162 |     )
163 |     # gene2peak = reginf.edge_subgraph(
164 |     #     e for e, attr in dict(reginf.edges).items()
165 |     #     if attr["qval"] < 0.05
166 |     # )
167 |     gene2peak_out = os.path.join(out_dir, "glue","gene2peak_all.csv")
168 |     nx.to_pandas_edgelist(reginf).to_csv(gene2peak_out)
169 |     stop = timeit.default_timer()
170 |     print('Time(s): ', stop - start)  
171 |     # save prediction time 
172 |     # this is the inference time for peak-to-gene pair. Much shorter than the other ones that need to run the SHARE-seq pairing using imputed gene expression. 
173 |     prediction_time_out = os.path.join(out_dir, "runtime","glue_prediction_time.txt")
174 |     print(stop - start,  file=open(prediction_time_out, 'w'))
175 |     print("------ Prediction Done ------")
176 | 
177 | print("argument 1:",sys.argv[1])
178 | print("argument 2:",sys.argv[2])
179 | # the first positional argument is passed as in_dir, the second as out_dir
180 | run_glue_fn(sys.argv[1],sys.argv[2])
181 | 
182 | 


--------------------------------------------------------------------------------
/methods/glue/run_glue_hg38_single_noPred.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys 
  4 | print('Number of arguments:', len(sys.argv), 'arguments.')
  5 | print('Argument List:'+ str(sys.argv))
  6 | 
  7 | #==== method specific ==== 
  8 | import networkx as nx
  9 | import scglue
 10 | from itertools import chain
 11 | import seaborn as sns
 12 | from matplotlib import rcParams
 13 | 
 14 | #from matplotlib import rcParams
 15 | from anndata import AnnData
 16 | import anndata as ad
 17 | import scipy
 18 | import numpy as np
 19 | import pandas as pd
 20 | import scipy.io as sio
 21 | import os
 22 | import scanpy as sc
 23 | from copy import deepcopy
 24 | from utils_eval import read_mtx_folder, write_adata
 25 | import timeit
 26 | 
 27 | def run_glue_fn(in_dir,out_dir):
 28 |     start = timeit.default_timer()
 29 |     
 30 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 31 |                                        "Gene Expression",
 32 |                                        ["gene"],
 33 |                                        ["barcodes"])
 34 | 
 35 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 36 |                                        "Peaks",
 37 |                                        ["peak"],
 38 |                                        ["barcodes"])
 39 | 
 40 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 41 |                                        "Gene Expression",
 42 |                                        ["gene"],
 43 |                                        ["barcodes"])
 44 | 
 45 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 46 |                                        "Peaks",
 47 |                                        ["peak"],
 48 |                                        ["barcodes"])
 49 |     
 50 |     adata_prna.obs['dataset'] = 'multiomeRNA'
 51 |     adata_patac.obs['dataset'] = 'multiomeATAC'
 52 |     adata_urna.obs['dataset'] = 'scRNA'
 53 |     adata_uatac.obs['dataset'] = 'snATAC'
 54 |     
 55 |     rna = adata_urna
 56 |     atac = adata_uatac
 57 |     os.makedirs(out_dir, exist_ok=True)
 58 |     os.makedirs(os.path.join(out_dir,"glue"), exist_ok=True)
 59 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 60 |     
 61 |     # preprocessing of scRNA
 62 |     rna.layers["counts"] = rna.X.copy()
 63 |     sc.pp.filter_genes(rna, min_cells=3)
 64 |     sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 65 |     sc.pp.normalize_total(rna)
 66 |     sc.pp.log1p(rna)
 67 |     sc.pp.scale(rna)
 68 |     sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 69 |     
 70 |     # preprocessing of snATAC
 71 |     sc.pp.filter_genes(atac,min_counts=1)
 72 |     scglue.data.lsi(atac, n_components=100, n_iter=15)
 73 | 
 74 |     # build graph
 75 |     scglue.data.get_gene_annotation(
 76 |         # this works for human hg38 genome-build 
 77 |         rna, gtf="/home/myylee/scmint/methods_eval/GRCg38_genes.gtf.gz",
 78 |         gtf_by="gene_name"
 79 |     )
 80 |     rna.var.loc[:, ["chrom", "chromStart", "chromEnd"]].head()
 81 |     
 82 |     split = atac.var_names.str.split(r"[--]")
 83 |     atac.var["chrom"] = split.map(lambda x: x[0])
 84 |     atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
 85 |     atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)
 86 |     atac_chrs = atac.var['chrom'].value_counts().index.tolist()
 87 |     row_keep = rna.var_names[rna.var['chrom'].isin(atac_chrs).tolist()]
 88 |     rna = rna[:,row_keep].copy()
 89 |     guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 90 |     scglue.graph.check_graph(guidance, [rna, atac])
 91 |     
 92 |     # prepare for training 
 93 |     scglue.models.configure_dataset(
 94 |         rna, "NB", use_highly_variable=True,
 95 |         use_layer="counts", use_rep="X_pca"
 96 |     )
 97 |     scglue.models.configure_dataset(
 98 |         atac, "NB", use_highly_variable=True,
 99 |         use_rep="X_lsi"
100 |     )
101 | 
102 |     guidance_hvf = guidance.subgraph(chain(
103 |         rna.var.query("highly_variable").index,
104 |         atac.var.query("highly_variable").index
105 |     )).copy()
106 |     
107 |     # GLUE training 
108 |     glue = scglue.models.fit_SCGLUE(
109 |         {"rna": rna, "atac": atac}, guidance_hvf,
110 |         fit_kws={"directory": os.path.join(out_dir,"glue")}
111 |     )
112 |     
113 |     dx = scglue.models.integration_consistency(
114 |         glue, {"rna": rna, "atac": atac}, guidance_hvf
115 |     )
116 |     print(dx)
117 |     rna.obsm["X_glue"] = glue.encode_data("rna", rna)
118 |     atac.obsm["X_glue"] = glue.encode_data("atac", atac)
119 |     combined = ad.concat([rna, atac])
120 | 
121 |     # extract latent representation
122 |     res_df = pd.DataFrame(combined.obsm['X_glue'],index=combined.obs.index)
123 |     # set column names as latent_x 
124 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
125 |     res_df['dataset'] = combined.obs['dataset']
126 |     res_df['dataset'] = res_df['dataset'].astype("string")
127 |     
128 |     # save latent representation and model
129 |     
130 |     csv_out = os.path.join(out_dir, "glue","glue_result.csv")
131 |     res_df.to_csv(csv_out)
132 |     model_out = os.path.join(out_dir,"glue","glue.dill")
133 |     glue.save(model_out)
134 |     stop = timeit.default_timer()
135 |     
136 |     print('Time(s): ', stop - start)  
137 |     # record time 
138 |     runtime_out = os.path.join(out_dir,"runtime","glue_runtime.txt")
139 |     print(stop - start,  file=open(runtime_out, 'w'))
140 |     print("------ Done ------")
141 |     print("------ No prediction ------")
142 |     
143 | print("argument 1:",sys.argv[1])
144 | print("argument 2:",sys.argv[2])
145 | 
146 | run_glue_fn(sys.argv[1],sys.argv[2])
147 | 
148 | 


--------------------------------------------------------------------------------
/methods/glue/run_glue_mm10_noPred.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys 
  4 | print('Number of arguments:', len(sys.argv), 'arguments.')
  5 | print('Argument List:'+ str(sys.argv))
  6 | 
  7 | #==== method specific ==== 
  8 | import networkx as nx
  9 | import scglue
 10 | from itertools import chain
 11 | import seaborn as sns
 12 | from matplotlib import rcParams
 13 | 
 14 | #from matplotlib import rcParams
 15 | from anndata import AnnData
 16 | import anndata as ad
 17 | import scipy
 18 | import numpy as np
 19 | import pandas as pd
 20 | import scipy.io as sio
 21 | import os
 22 | import scanpy as sc
 23 | from copy import deepcopy
 24 | from utils_eval import read_mtx_folder, write_adata
 25 | import timeit
 26 | 
 27 | def run_glue_fn(in_dir,out_dir):
 28 |     start = timeit.default_timer()
 29 |     
 30 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 31 |                                        "Gene Expression",
 32 |                                        ["gene"],
 33 |                                        ["barcodes"])
 34 | 
 35 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 36 |                                        "Peaks",
 37 |                                        ["peak"],
 38 |                                        ["barcodes"])
 39 | 
 40 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 41 |                                        "Gene Expression",
 42 |                                        ["gene"],
 43 |                                        ["barcodes"])
 44 | 
 45 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 46 |                                        "Peaks",
 47 |                                        ["peak"],
 48 |                                        ["barcodes"])
 49 |     
 50 |     adata_prna.obs['dataset'] = 'multiomeRNA'
 51 |     adata_patac.obs['dataset'] = 'multiomeATAC'
 52 |     adata_urna.obs['dataset'] = 'scRNA'
 53 |     adata_uatac.obs['dataset'] = 'snATAC'
 54 |     
 55 |     rna = ad.concat([adata_prna, adata_urna])
 56 |     atac = ad.concat([adata_patac, adata_uatac])
 57 | 
 58 |     os.makedirs(out_dir, exist_ok=True)
 59 |     os.makedirs(os.path.join(out_dir,"glue"), exist_ok=True)
 60 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 61 |     
 62 |     # preprocessing of scRNA
 63 |     rna.layers["counts"] = rna.X.copy()
 64 |     sc.pp.filter_genes(rna, min_cells=3)
 65 |     sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 66 |     sc.pp.normalize_total(rna)
 67 |     sc.pp.log1p(rna)
 68 |     sc.pp.scale(rna)
 69 |     sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 70 |     
 71 |     # preprocessing of snATAC
 72 |     sc.pp.filter_genes(atac,min_counts=1)
 73 |     scglue.data.lsi(atac, n_components=100, n_iter=15)
 74 | 
 75 |     # build graph
 76 |     scglue.data.get_gene_annotation(
 77 |         # this works for mouse mm10 genome-build (downloaded from https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz)
 78 |         rna, gtf="/home/myylee/scmint/methods_eval/mm10_genes.gtf.gz",
 79 |         gtf_by="gene_name"
 80 |     )
 81 |     rna.var.loc[:, ["chrom", "chromStart", "chromEnd"]].head()
 82 |     
 83 |     split = atac.var_names.str.split(r"[--]")
 84 |     atac.var["chrom"] = split.map(lambda x: x[0])
 85 |     atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
 86 |     atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)
 87 |     atac_chrs = atac.var['chrom'].value_counts().index.tolist()
 88 |     row_keep = rna.var_names[rna.var['chrom'].isin(atac_chrs).tolist()]
 89 |     rna = rna[:,row_keep].copy()
 90 |     guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 91 |     scglue.graph.check_graph(guidance, [rna, atac])
 92 |     
 93 |     # prepare for training 
 94 |     scglue.models.configure_dataset(
 95 |         rna, "NB", use_highly_variable=True,
 96 |         use_layer="counts", use_rep="X_pca"
 97 |     )
 98 |     scglue.models.configure_dataset(
 99 |         atac, "NB", use_highly_variable=True,
100 |         use_rep="X_lsi"
101 |     )
102 | 
103 |     guidance_hvf = guidance.subgraph(chain(
104 |         rna.var.query("highly_variable").index,
105 |         atac.var.query("highly_variable").index
106 |     )).copy()
107 |     
108 |     # GLUE training 
109 |     glue = scglue.models.fit_SCGLUE(
110 |         {"rna": rna, "atac": atac}, guidance_hvf,
111 |         fit_kws={"directory": os.path.join(out_dir,"glue")}
112 |     )
113 |     
114 |     dx = scglue.models.integration_consistency(
115 |         glue, {"rna": rna, "atac": atac}, guidance_hvf
116 |     )
117 |     print(dx)
118 |     rna.obsm["X_glue"] = glue.encode_data("rna", rna)
119 |     atac.obsm["X_glue"] = glue.encode_data("atac", atac)
120 |     combined = ad.concat([rna, atac])
121 | 
122 |     # extract latent representation
123 |     res_df = pd.DataFrame(combined.obsm['X_glue'],index=combined.obs.index)
124 |     # set column names as latent_x 
125 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
126 |     res_df['dataset'] = combined.obs['dataset']
127 |     res_df['dataset'] = res_df['dataset'].astype("string")
128 |     
129 |     # save latent representation and model
130 |     
131 |     csv_out = os.path.join(out_dir, "glue","glue_result.csv")
132 |     res_df.to_csv(csv_out)
133 |     model_out = os.path.join(out_dir,"glue","glue.dill")
134 |     glue.save(model_out)
135 |     stop = timeit.default_timer()
136 |     
137 |     print('Time(s): ', stop - start)  
138 |     # record time 
139 |     runtime_out = os.path.join(out_dir,"runtime","glue_runtime.txt")
140 |     print(stop - start,  file=open(runtime_out, 'w'))
141 |     print("------ Done ------")
142 |     print("------ No prediction ------")
143 | 
144 | print("argument 1:",sys.argv[1])
145 | print("argument 2:",sys.argv[2])
146 | 
147 | run_glue_fn(sys.argv[1],sys.argv[2])
148 | 
149 | 


--------------------------------------------------------------------------------
/methods/glue/run_glue_mm10_single_noPred.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys 
  4 | print('Number of arguments:', len(sys.argv), 'arguments.')
  5 | print('Argument List:'+ str(sys.argv))
  6 | 
  7 | #==== method specific ==== 
  8 | import networkx as nx
  9 | import scglue
 10 | from itertools import chain
 11 | import seaborn as sns
 12 | from matplotlib import rcParams
 13 | 
 14 | #from matplotlib import rcParams
 15 | from anndata import AnnData
 16 | import anndata as ad
 17 | import scipy
 18 | import numpy as np
 19 | import pandas as pd
 20 | import scipy.io as sio
 21 | import os
 22 | import scanpy as sc
 23 | from copy import deepcopy
 24 | from utils_eval import read_mtx_folder, write_adata
 25 | import timeit
 26 | 
 27 | def run_glue_fn(in_dir,out_dir):
 28 |     start = timeit.default_timer()
 29 |     
 30 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 31 |                                        "Gene Expression",
 32 |                                        ["gene"],
 33 |                                        ["barcodes"])
 34 | 
 35 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 36 |                                        "Peaks",
 37 |                                        ["peak"],
 38 |                                        ["barcodes"])
 39 | 
 40 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 41 |                                        "Gene Expression",
 42 |                                        ["gene"],
 43 |                                        ["barcodes"])
 44 | 
 45 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 46 |                                        "Peaks",
 47 |                                        ["peak"],
 48 |                                        ["barcodes"])
 49 |     
 50 |     adata_prna.obs['dataset'] = 'multiomeRNA'
 51 |     adata_patac.obs['dataset'] = 'multiomeATAC'
 52 |     adata_urna.obs['dataset'] = 'scRNA'
 53 |     adata_uatac.obs['dataset'] = 'snATAC'
 54 |     
 55 |     rna = adata_urna
 56 |     atac = adata_uatac
 57 |     os.makedirs(out_dir, exist_ok=True)
 58 |     os.makedirs(os.path.join(out_dir,"glue"), exist_ok=True)
 59 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 60 |     
 61 |     # preprocessing of scRNA
 62 |     rna.layers["counts"] = rna.X.copy()
 63 |     sc.pp.filter_genes(rna, min_cells=3)
 64 |     sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
 65 |     sc.pp.normalize_total(rna)
 66 |     sc.pp.log1p(rna)
 67 |     sc.pp.scale(rna)
 68 |     sc.tl.pca(rna, n_comps=100, svd_solver="auto")
 69 |     
 70 |     # preprocessing of snATAC
 71 |     sc.pp.filter_genes(atac,min_counts=1)
 72 |     scglue.data.lsi(atac, n_components=100, n_iter=15)
 73 | 
 74 |     # build graph
 75 |     scglue.data.get_gene_annotation(
 76 |         # this works for mouse mm10 genome-build (downloaded from https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz)
 77 |         rna, gtf="/home/myylee/scmint/methods_eval/mm10_genes.gtf.gz",
 78 |         gtf_by="gene_name"
 79 |     )
 80 |     rna.var.loc[:, ["chrom", "chromStart", "chromEnd"]].head()
 81 |     
 82 |     split = atac.var_names.str.split(r"[--]")
 83 |     atac.var["chrom"] = split.map(lambda x: x[0])
 84 |     atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int)
 85 |     atac.var["chromEnd"] = split.map(lambda x: x[2]).astype(int)
 86 |     atac_chrs = atac.var['chrom'].value_counts().index.tolist()
 87 |     row_keep = rna.var_names[rna.var['chrom'].isin(atac_chrs).tolist()]
 88 |     rna = rna[:,row_keep].copy()
 89 |     guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
 90 |     scglue.graph.check_graph(guidance, [rna, atac])
 91 |     
 92 |     # prepare for training 
 93 |     scglue.models.configure_dataset(
 94 |         rna, "NB", use_highly_variable=True,
 95 |         use_layer="counts", use_rep="X_pca"
 96 |     )
 97 |     scglue.models.configure_dataset(
 98 |         atac, "NB", use_highly_variable=True,
 99 |         use_rep="X_lsi"
100 |     )
101 | 
102 |     guidance_hvf = guidance.subgraph(chain(
103 |         rna.var.query("highly_variable").index,
104 |         atac.var.query("highly_variable").index
105 |     )).copy()
106 |     
107 |     # GLUE training 
108 |     glue = scglue.models.fit_SCGLUE(
109 |         {"rna": rna, "atac": atac}, guidance_hvf,
110 |         fit_kws={"directory": os.path.join(out_dir,"glue")}
111 |     )
112 |     
113 |     dx = scglue.models.integration_consistency(
114 |         glue, {"rna": rna, "atac": atac}, guidance_hvf
115 |     )
116 |     print(dx)
117 |     rna.obsm["X_glue"] = glue.encode_data("rna", rna)
118 |     atac.obsm["X_glue"] = glue.encode_data("atac", atac)
119 |     combined = ad.concat([rna, atac])
120 | 
121 |     # extract latent representation
122 |     res_df = pd.DataFrame(combined.obsm['X_glue'],index=combined.obs.index)
123 |     # set column names as latent_x 
124 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
125 |     res_df['dataset'] = combined.obs['dataset']
126 |     res_df['dataset'] = res_df['dataset'].astype("string")
127 |     
128 |     # save latent representation and model
129 |     
130 |     csv_out = os.path.join(out_dir, "glue","glue_result.csv")
131 |     res_df.to_csv(csv_out)
132 |     model_out = os.path.join(out_dir,"glue","glue.dill")
133 |     glue.save(model_out)
134 |     stop = timeit.default_timer()
135 |     
136 |     print('Time(s): ', stop - start)  
137 |     # record time 
138 |     runtime_out = os.path.join(out_dir,"runtime","glue_runtime.txt")
139 |     print(stop - start,  file=open(runtime_out, 'w'))
140 |     print("------ Done ------")
141 |     print("------ No prediction ------")
142 | 
143 | print("argument 1:",sys.argv[1])
144 | print("argument 2:",sys.argv[2])
145 | 
146 | run_glue_fn(sys.argv[1],sys.argv[2])
147 | 
148 | 


--------------------------------------------------------------------------------
/methods/liger/rliger_env.R:
--------------------------------------------------------------------------------
 1 | # Setup notes for the "liger" conda environment: shell commands first, then R.
 2 | 
 3 | conda create -n liger r-essentials r-base
 4 | 
 5 | conda activate liger
 6 | conda install -y igraph hdf5
 7 | # start an R session; the remaining lines are entered at the R prompt
 8 | R
 9 | # R
10 | install.packages('Seurat')
11 | install.packages('IRkernel')
12 | IRkernel::installspec(name = 'liger', displayname = 'rliger')
13 | # devtools: installed from CRAN and also through conda-forge
14 | install.packages('devtools')
15 | system("conda install -c conda-forge r-devtools")
16 | #system("conda install -c conda-forge r-gert")
17 | library(devtools)
18 | install_github('welch-lab/liger')
19 | # Bioconductor packages and Signac; RSQLite is pinned to version 2.2.5
20 | install.packages("BiocManager")
21 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
22 | remotes::install_version("RSQLite", version = "2.2.5")
23 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
24 | install.packages("Signac") 
25 | 
26 | # load the packages used by the run_rliger*.R scripts
27 | library(rliger)
28 | library(Seurat)
29 | library(Signac)
30 | library(stringr)
31 | 


--------------------------------------------------------------------------------
/methods/liger/run_rliger.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(rliger)
  5 | require(Seurat)
  6 | require(stringr)
  7 | require(Signac)
  8 | require(Matrix)
  9 | source("r_utils.R")
 10 | #require(future)
 11 | 
 12 | # must have a fragment file named fragments.tsv.gz 
 13 | # run_rliger_fn: integrate scRNA and snATAC cells (plus multiome halves) with LIGER.
 14 | #   in_dir  - input directory handed to load_datasets()
 15 | #   out_dir - output root; rliger/rliger_result.csv and runtime/rliger_runtime.txt
 16 | #             are written beneath it
 17 | #   nclust  - target cluster number passed to find_resolution_liger()
 18 | run_rliger_fn <- function(in_dir, out_dir, nclust=7){
 19 |     # starting time
 20 |     t1 <- Sys.time()
 21 |     #plan("multisession")
 22 |     #options(future.rng.onMisue = "ignore")
 23 |     #print(paste0("workers used:",nbrOfWorkers()))
 24 |     # run_rliger: Liger object construction, processing, and integration 
 25 |     run_rliger <- function(unpaired_rna,unpaired_atac,nclust,k=20){
 26 |         # ATAC enters LIGER as the gene activity matrix returned by GeneActivity()
 27 |         gene.activities <- GeneActivity(unpaired_atac)
 28 |         unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 29 | 
 30 |         data_comb <- list(atac = unpaired_atac[["ACTIVITY"]]@counts, rna = unpaired_rna[["RNA"]]@counts)
 31 |         data_comb <- createLiger(data_comb)
 32 | 
 33 |         data_comb <- rliger::normalize(data_comb)
 34 |         # datasets.use = 2 selects genes from the second list entry (rna)
 35 |         data_comb <- selectGenes(data_comb, datasets.use = 2)
 36 |         data_comb <- scaleNotCenter(data_comb)
 37 | 
 38 |         data_comb <- optimizeALS(data_comb, k = k)
 39 | 
 40 |         data_comb <- quantile_norm(data_comb)
 41 |         res <- find_resolution_liger(data_comb, nclust)
 42 |         data_comb <- louvainCluster(data_comb, resolution = res)
 43 | 
 44 |         return(data_comb)
 45 |     }
 46 | 
 47 |     datasets = load_datasets(in_dir)
 48 |     paired_rna=datasets$paired_rna
 49 |     paired_atac=datasets$paired_atac
 50 |     unpaired_rna=datasets$unpaired_rna
 51 |     unpaired_atac=datasets$unpaired_atac
 52 | 
 53 |     # verify number of cells in each condition 
 54 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 55 |                    c(ncol(unpaired_rna),
 56 |                      ncol(unpaired_atac),
 57 |                      ncol(paired_rna),
 58 |                     ncol(paired_atac)))
 59 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 60 |                             paste0(colnames(unpaired_atac)),
 61 |                             paste0("prna_",colnames(paired_rna)),
 62 |                             paste0("patac_",colnames(paired_atac)))
 63 |     print(table(dataset_vec))
 64 | 
 65 |     # give multiome cells the same prna_/patac_ prefix used for names(dataset_vec)
 66 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 67 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 68 | 
 69 |     # merging
 70 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 71 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 72 | 
 73 |     # run LIGER
 74 |     data_integrated <- run_rliger(unpaired_rna,unpaired_atac,nclust)
 75 |     # run UMAP
 76 |     data_integrated <- runUMAP(data_integrated, distance = 'cosine', n_neighbors = 30, min_dist = 0.3)
 77 | 
 78 |     # one latent_<i> column per factor, counted from H.norm instead of hard-coding k
 79 |     n_factors <- ncol(data_integrated@H.norm)
 80 |     df_umap <- data.frame(cbind(data_integrated@H.norm,predicted_ct=data_integrated@clusters))
 81 |     colnames(df_umap)[seq_len(n_factors)] <- paste0("latent_",seq_len(n_factors))
 82 | 
 83 |     df_umap$dataset = "scRNA"
 84 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
 85 |     #df_umap = cbind(df_umap,data_integrated@tsne.coords)
 86 |     #colnames(df_umap)[c(ncol(df_umap)-1):ncol(df_umap)] = paste0("umap_",1:2)
 87 | 
 88 |     # showWarnings = FALSE: writing into an already existing folder is expected on re-runs
 89 |     dir.create(file.path(out_dir,"rliger/"),recursive=TRUE,showWarnings=FALSE)
 90 |     print("------ Saving integration result ------")
 91 |     write.csv(df_umap,file.path(out_dir,"rliger","rliger_result.csv"))
 92 |     t2 <- Sys.time()
 93 |     dir.create(file.path(out_dir,"runtime/"),recursive=TRUE,showWarnings=FALSE)
 94 |     ## use '[[1]]' for clean output
 95 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
 96 |                 file = file.path(out_dir,"runtime","rliger_runtime.txt"), 
 97 |                 sep = "\t",
 98 |                 row.names = FALSE,
 99 |                 col.names = FALSE)
100 |     print("------ Done ------")
101 | }
102 | 
103 | 
104 | # Command-line entry point: <in_dir> <out_dir> [nclust]
105 | if (length(args)<2) {
106 |   stop("Insufficient number of arguments are supplied (input file).\n", call.=FALSE)
107 | }else if(length(args)==2) {
108 |     print(paste0("Input directory: ",args[1]))
109 |     print(paste0("Output directory: ",args[2]))
110 |     run_rliger_fn(args[1], args[2])
111 | }else{
112 |     print(paste0("Number of arguments: ",length(args)))
113 |     print(paste0("Input directory: ",args[1]))
114 |     print(paste0("Output directory: ",args[2]))
115 |     print(paste0("Number of clusters: ",args[3]))
116 |     run_rliger_fn(args[1], args[2],as.integer(args[3]))
117 | }
118 | 
119 |     


--------------------------------------------------------------------------------
/methods/liger/run_rliger_batch_sequential.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(rliger)
  5 | require(Seurat)
  6 | require(stringr)
  7 | require(Signac)
  8 | require(Matrix)
  9 | source("r_utils.R")
 10 | #require(future)
 11 | 
 12 | # must have a fragment file named fragments.tsv.gz 
 13 | # run_rliger_fn: LIGER integration in which every batch/technology group is passed
 14 | #                to LIGER as a separate dataset.
 15 | #   in_dir  - input directory handed to load_datasets() (loaded with a "batch" column)
 16 | #   out_dir - output root; rliger/rliger_result.csv and runtime/rliger_runtime.txt
 17 | #             are written beneath it
 18 | #   nclust  - target cluster number passed to find_resolution_liger()
 19 | run_rliger_fn <- function(in_dir, out_dir, nclust=7){
 20 |     # starting time
 21 |     t1 <- Sys.time()
 22 |     #plan("multisession")
 23 |     #options(future.rng.onMisue = "ignore")
 24 |     #print(paste0("workers used:",nbrOfWorkers()))
 25 |     # run_rliger: Liger object construction, processing, and integration 
 26 |     run_rliger <- function(data_comb,nclust,k=20,datasets.use=2){
 27 | 
 28 |         data_comb <- createLiger(data_comb)
 29 | 
 30 |         data_comb <- rliger::normalize(data_comb)
 31 |         # change this to use the first n urna datasets
 32 |         data_comb <- selectGenes(data_comb, datasets.use = datasets.use)
 33 |         data_comb <- scaleNotCenter(data_comb)
 34 | 
 35 |         data_comb <- optimizeALS(data_comb, k = k)
 36 | 
 37 |         data_comb <- quantile_norm(data_comb)
 38 |         res <- find_resolution_liger(data_comb, nclust)
 39 |         data_comb <- louvainCluster(data_comb, resolution = res)
 40 | 
 41 |         return(data_comb)
 42 |     }
 43 |     datasets = load_datasets(in_dir,obs=c("barcodes","batch"))
 44 |     paired_rna=datasets$paired_rna
 45 |     paired_atac=datasets$paired_atac
 46 |     unpaired_rna=datasets$unpaired_rna
 47 |     unpaired_atac=datasets$unpaired_atac
 48 | 
 49 |     # verify number of cells in each condition 
 50 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 51 |                    c(ncol(unpaired_rna),
 52 |                      ncol(unpaired_atac),
 53 |                      ncol(paired_rna),
 54 |                     ncol(paired_atac)))
 55 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 56 |                             paste0(colnames(unpaired_atac)),
 57 |                             paste0("prna_",colnames(paired_rna)),
 58 |                             paste0("patac_",colnames(paired_atac)))
 59 |     print(table(dataset_vec))
 60 | 
 61 |     # give multiome cells the same prna_/patac_ prefix used for names(dataset_vec)
 62 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 63 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 64 | 
 65 |     unpaired_rna@meta.data$technology = "scRNA"
 66 |     unpaired_atac@meta.data$technology = "snATAC"
 67 |     paired_rna@meta.data$technology = "Multiome-RNA"
 68 |     paired_atac@meta.data$technology = "Multiome-ATAC"
 69 | 
 70 |     # group = <batch>_<technology>; the objects are split on this column below
 71 |     unpaired_rna@meta.data$group <- paste0(unpaired_rna$batch,"_",unpaired_rna$technology)
 72 |     paired_rna@meta.data$group <- paste0(paired_rna$batch,"_",paired_rna$technology)
 73 | 
 74 |     unpaired_atac@meta.data$group <- paste0(unpaired_atac$batch,"_",unpaired_atac$technology)
 75 |     paired_atac@meta.data$group <- paste0(paired_atac$batch,"_",paired_atac$technology)
 76 | 
 77 |     # merging
 78 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 79 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 80 | 
 81 |     urna_list = SplitObject(unpaired_rna, split.by = "group")
 82 | 
 83 |     # gene activity is computed once on the merged ATAC object, then split by group
 84 |     gene.activities <- GeneActivity(unpaired_atac)
 85 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 86 |     uatac_list = SplitObject(unpaired_atac, split.by = "group")
 87 | 
 88 |     urna_counts_list = lapply(urna_list,function(x){x[["RNA"]]@counts})
 89 |     uatac_counts_list = lapply(uatac_list,function(x){x[["ACTIVITY"]]@counts})
 90 |     # RNA groups come first, so they occupy positions 1..length(urna_counts_list)
 91 |     data_comb<-c(urna_counts_list,uatac_counts_list)
 92 | 
 93 |     # gene selection uses the RNA groups only; seq_along() stays empty for an empty list
 94 |     datasets.use = seq_along(urna_counts_list)
 95 |     data_integrated <- run_rliger(data_comb,nclust,datasets.use=datasets.use)
 96 |     # run UMAP
 97 |     data_integrated <- runUMAP(data_integrated, distance = 'cosine', n_neighbors = 30, min_dist = 0.3)
 98 | 
 99 |     # one latent_<i> column per factor, counted from H.norm instead of hard-coding k
100 |     n_factors <- ncol(data_integrated@H.norm)
101 |     df_umap <- data.frame(cbind(data_integrated@H.norm,predicted_ct=data_integrated@clusters))
102 |     colnames(df_umap)[seq_len(n_factors)] <- paste0("latent_",seq_len(n_factors))
103 | 
104 |     df_umap$dataset = "scRNA"
105 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
106 |     #df_umap = cbind(df_umap,data_integrated@tsne.coords)
107 |     #colnames(df_umap)[c(ncol(df_umap)-1):ncol(df_umap)] = paste0("umap_",1:2)
108 | 
109 |     # showWarnings = FALSE: writing into an already existing folder is expected on re-runs
110 |     dir.create(file.path(out_dir,"rliger/"),recursive=TRUE,showWarnings=FALSE)
111 |     print("------ Saving integration result ------")
112 |     write.csv(df_umap,file.path(out_dir,"rliger","rliger_result.csv"))
113 |     t2 <- Sys.time()
114 |     dir.create(file.path(out_dir,"runtime/"),recursive=TRUE,showWarnings=FALSE)
115 |     ## use '[[1]]' for clean output
116 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
117 |                 file = file.path(out_dir,"runtime","rliger_runtime.txt"), 
118 |                 sep = "\t",
119 |                 row.names = FALSE,
120 |                 col.names = FALSE)
121 |     print("------ Done ------")
122 | }
123 | 
124 | 
125 | # Command-line entry point: <in_dir> <out_dir> [nclust]
126 | if (length(args)<2) {
127 |   stop("Insufficient number of arguments are supplied (input file).\n", call.=FALSE)
128 | }else if(length(args)==2) {
129 |     print(paste0("Input directory: ",args[1]))
130 |     print(paste0("Output directory: ",args[2]))
131 |     run_rliger_fn(args[1], args[2])
132 | }else{
133 |     print(paste0("Number of arguments: ",length(args)))
134 |     print(paste0("Input directory: ",args[1]))
135 |     print(paste0("Output directory: ",args[2]))
136 |     print(paste0("Number of clusters: ",args[3]))
137 |     run_rliger_fn(args[1], args[2],as.integer(args[3]))
138 | }
139 | 
140 |     


--------------------------------------------------------------------------------
/methods/liger/run_rliger_single.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(rliger)
  5 | require(Seurat)
  6 | require(stringr)
  7 | require(Signac)
  8 | require(Matrix)
  9 | source("r_utils.R")
 10 | #require(future)
 11 | 
 12 | # must have a fragment file named fragments.tsv.gz 
 13 | # run_rliger_fn: integrate the unpaired scRNA and snATAC cells with LIGER.
 14 | #   in_dir  - input directory handed to load_datasets()
 15 | #   out_dir - output root; rliger/rliger_result.csv and runtime/rliger_runtime.txt
 16 | #             are written beneath it
 17 | #   nclust  - target cluster number passed to find_resolution_liger()
 18 | run_rliger_fn <- function(in_dir, out_dir, nclust=7){
 19 |     # starting time
 20 |     t1 <- Sys.time()
 21 |     #plan("multisession")
 22 |     #options(future.rng.onMisue = "ignore")
 23 |     #print(paste0("workers used:",nbrOfWorkers()))
 24 |     # run_rliger: Liger object construction, processing, and integration 
 25 |     run_rliger <- function(unpaired_rna,unpaired_atac,nclust,k=20){
 26 |         # ATAC enters LIGER as the gene activity matrix returned by GeneActivity()
 27 |         gene.activities <- GeneActivity(unpaired_atac)
 28 |         unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 29 | 
 30 |         data_comb <- list(atac = unpaired_atac[["ACTIVITY"]]@counts, rna = unpaired_rna[["RNA"]]@counts)
 31 |         data_comb <- createLiger(data_comb)
 32 | 
 33 |         data_comb <- rliger::normalize(data_comb)
 34 |         # datasets.use = 2 selects genes from the second list entry (rna)
 35 |         data_comb <- selectGenes(data_comb, datasets.use = 2)
 36 |         data_comb <- scaleNotCenter(data_comb)
 37 | 
 38 |         data_comb <- optimizeALS(data_comb, k = k)
 39 | 
 40 |         data_comb <- quantile_norm(data_comb)
 41 |         res <- find_resolution_liger(data_comb, nclust,resolution_start=10)
 42 |         # the stray empty argument (", ,") of the original call is removed; seed unchanged
 43 |         data_comb <- louvainCluster(data_comb, resolution = res,random.seed=1234)
 44 | 
 45 |         return(data_comb)
 46 |     }
 47 | 
 48 |     datasets = load_datasets(in_dir)
 49 |     print("loading single modality datasets only, ignoring paired RNA and paired ATAC folder")
 50 |     unpaired_rna=datasets$unpaired_rna
 51 |     unpaired_atac=datasets$unpaired_atac
 52 | 
 53 |     # print number of cells per data type
 54 |     dataset_vec <- rep(c("scRNA","snATAC"),
 55 |                        c(ncol(unpaired_rna),
 56 |                          ncol(unpaired_atac)))
 57 |     names(dataset_vec) <- c(paste0(colnames(unpaired_rna)),
 58 |                             paste0(colnames(unpaired_atac)))
 59 |     print(table(dataset_vec))
 60 | 
 61 |     unpaired_atac@meta.data$dataset <- "snATAC"
 62 |     unpaired_rna@meta.data$dataset <- "scRNA"
 63 | #     # merging
 64 | #     unpaired_rna <- merge(unpaired_rna,paired_rna, add.cell.ids = c("urna", "prna"))
 65 | #     unpaired_atac <- merge(unpaired_atac,paired_atac, add.cell.ids = c("uatac", "patac"))
 66 | 
 67 |     # run LIGER
 68 |     data_integrated <- run_rliger(unpaired_rna,unpaired_atac,nclust)
 69 |     # run UMAP
 70 |     data_integrated <- runUMAP(data_integrated, distance = 'cosine', n_neighbors = 30, min_dist = 0.3)
 71 | 
 72 |     # one latent_<i> column per factor, counted from H.norm instead of hard-coding k
 73 |     n_factors <- ncol(data_integrated@H.norm)
 74 |     df_umap <- data.frame(cbind(data_integrated@H.norm,predicted_ct=data_integrated@clusters))
 75 |     colnames(df_umap)[seq_len(n_factors)] <- paste0("latent_",seq_len(n_factors))
 76 | 
 77 |     df_umap$dataset = "scRNA"
 78 |     df_umap[names(dataset_vec),"dataset"] = dataset_vec
 79 |     #df_umap = cbind(df_umap,data_integrated@tsne.coords)
 80 |     #colnames(df_umap)[c(ncol(df_umap)-1):ncol(df_umap)] = paste0("umap_",1:2)
 81 | 
 82 |     # showWarnings = FALSE: writing into an already existing folder is expected on re-runs
 83 |     dir.create(file.path(out_dir,"rliger"),recursive=TRUE,showWarnings=FALSE)
 84 |     print("------ Saving integration result ------")
 85 |     write.csv(df_umap,file.path(out_dir,"rliger","rliger_result.csv"))
 86 |     t2 <- Sys.time()
 87 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
 88 |     ## use '[[1]]' for clean output
 89 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
 90 |                 file = file.path(out_dir,"runtime","rliger_runtime.txt"), 
 91 |                 sep = "\t",
 92 |                 row.names = FALSE,
 93 |                 col.names = FALSE)
 94 |     print("------ Done ------")
 95 | }
 96 | 
 97 | 
 98 | # Command-line entry point: <in_dir> <out_dir> [nclust]
 99 | if (length(args)<2) {
100 |   stop("Insufficient number of arguments are supplied (input file).\n", call.=FALSE)
101 | }else if(length(args)==2) {
102 |     print(paste0("Input directory: ",args[1]))
103 |     print(paste0("Output directory: ",args[2]))
104 |     run_rliger_fn(args[1], args[2])
105 | }else{
106 |     print(paste0("Number of arguments: ",length(args)))
107 |     print(paste0("Input directory: ",args[1]))
108 |     print(paste0("Output directory: ",args[2]))
109 |     print(paste0("Number of clusters: ",args[3]))
110 |     run_rliger_fn(args[1], args[2],as.integer(args[3]))
111 | }
112 | 
113 |     


--------------------------------------------------------------------------------
/methods/multivi/multivi.yml:
--------------------------------------------------------------------------------
  1 | name: multivi
  2 | channels:
  3 |   - conda-forge
  4 |   - bioconda
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=1_gnu
  9 |   - ca-certificates=2021.10.8=ha878542_0
 10 |   - ld_impl_linux-64=2.36.1=hea4e1c9_2
 11 |   - libffi=3.4.2=h7f98852_5
 12 |   - libgcc-ng=11.2.0=h1d223b6_12
 13 |   - libgomp=11.2.0=h1d223b6_12
 14 |   - libnsl=2.0.0=h7f98852_0
 15 |   - libstdcxx-ng=11.2.0=he4da1e4_12
 16 |   - libzlib=1.2.11=h36c2ea0_1013
 17 |   - ncurses=6.3=h9c3ff4c_0
 18 |   - openssl=3.0.0=h7f98852_2
 19 |   - pip=22.0.3=pyhd8ed1ab_0
 20 |   - python=3.7.12=hf930737_100_cpython
 21 |   - python_abi=3.7=2_cp37m
 22 |   - readline=8.1=h46c0cb4_0
 23 |   - sqlite=3.37.0=h9cd32fc_0
 24 |   - tk=8.6.12=h27826a3_0
 25 |   - wheel=0.37.1=pyhd8ed1ab_0
 26 |   - xz=5.2.5=h516909a_1
 27 |   - zlib=1.2.11=h36c2ea0_1013
 28 |   - pip:
 29 |     - absl-py==1.0.0
 30 |     - aiohttp==3.8.1
 31 |     - aiosignal==1.2.0
 32 |     - anndata==0.7.8
 33 |     - argon2-cffi==21.3.0
 34 |     - argon2-cffi-bindings==21.2.0
 35 |     - async-timeout==4.0.2
 36 |     - asynctest==0.13.0
 37 |     - attrs==21.4.0
 38 |     - backcall==0.2.0
 39 |     - bleach==4.1.0
 40 |     - cached-property==1.5.2
 41 |     - cachetools==5.0.0
 42 |     - certifi==2021.10.8
 43 |     - cffi==1.15.0
 44 |     - charset-normalizer==2.0.12
 45 |     - chex==0.1.3
 46 |     - colorama==0.4.4
 47 |     - commonmark==0.9.1
 48 |     - cycler==0.11.0
 49 |     - debugpy==1.5.1
 50 |     - decorator==5.1.1
 51 |     - defusedxml==0.7.1
 52 |     - dm-tree==0.1.7
 53 |     - docrep==0.3.2
 54 |     - entrypoints==0.4
 55 |     - et-xmlfile==1.1.0
 56 |     - flatbuffers==2.0
 57 |     - flax==0.5.1
 58 |     - fonttools==4.29.1
 59 |     - frozenlist==1.3.0
 60 |     - fsspec==2022.2.0
 61 |     - future==0.18.2
 62 |     - google-auth==2.6.0
 63 |     - google-auth-oauthlib==0.4.6
 64 |     - grpcio==1.44.0
 65 |     - h5py==3.6.0
 66 |     - hyperopt==0.1.2
 67 |     - idna==3.3
 68 |     - igraph==0.9.9
 69 |     - importlib-metadata==1.7.0
 70 |     - importlib-resources==5.4.0
 71 |     - ipykernel==6.9.1
 72 |     - ipython==7.32.0
 73 |     - ipython-genutils==0.2.0
 74 |     - ipywidgets==7.6.5
 75 |     - jax==0.3.13
 76 |     - jaxlib==0.3.10
 77 |     - jedi==0.18.1
 78 |     - jinja2==3.0.3
 79 |     - joblib==1.1.0
 80 |     - jsonschema==4.4.0
 81 |     - jupyter-client==7.1.2
 82 |     - jupyter-core==4.9.2
 83 |     - jupyterlab-pygments==0.1.2
 84 |     - jupyterlab-widgets==1.0.2
 85 |     - kiwisolver==1.3.2
 86 |     - leidenalg==0.8.9
 87 |     - llvmlite==0.38.0
 88 |     - markdown==3.3.4
 89 |     - markupsafe==2.1.0
 90 |     - matplotlib==3.5.1
 91 |     - matplotlib-inline==0.1.3
 92 |     - mistune==0.8.4
 93 |     - msgpack==1.0.4
 94 |     - multidict==6.0.2
 95 |     - multipledispatch==0.6.0
 96 |     - natsort==8.1.0
 97 |     - nbclient==0.5.11
 98 |     - nbconvert==6.4.2
 99 |     - nbformat==5.1.3
100 |     - nest-asyncio==1.5.4
101 |     - networkx==2.6.3
102 |     - notebook==6.4.8
103 |     - numba==0.55.1
104 |     - numexpr==2.8.1
105 |     - numpy==1.21.5
106 |     - numpyro==0.9.2
107 |     - oauthlib==3.2.0
108 |     - openpyxl==3.0.9
109 |     - opt-einsum==3.3.0
110 |     - optax==0.1.2
111 |     - packaging==21.3
112 |     - pandas==1.3.5
113 |     - pandocfilters==1.5.0
114 |     - parso==0.8.3
115 |     - patsy==0.5.2
116 |     - pexpect==4.8.0
117 |     - pickleshare==0.7.5
118 |     - pillow==9.0.1
119 |     - prometheus-client==0.13.1
120 |     - prompt-toolkit==3.0.28
121 |     - protobuf==3.19.4
122 |     - psutil==5.9.0
123 |     - ptyprocess==0.7.0
124 |     - pyasn1==0.4.8
125 |     - pyasn1-modules==0.2.8
126 |     - pycparser==2.21
127 |     - pydeprecate==0.3.1
128 |     - pygments==2.11.2
129 |     - pymongo==4.1.1
130 |     - pynndescent==0.5.6
131 |     - pyparsing==3.0.7
132 |     - pyro-api==0.1.2
133 |     - pyro-ppl==1.8.0
134 |     - pyrsistent==0.18.1
135 |     - python-dateutil==2.8.2
136 |     - pytorch-lightning==1.5.10
137 |     - pytz==2021.3
138 |     - pyyaml==5.4.1
139 |     - pyzmq==22.3.0
140 |     - requests==2.27.1
141 |     - requests-oauthlib==1.3.1
142 |     - rich==11.1.0
143 |     - rsa==4.8
144 |     - scanpy==1.8.2
145 |     - scikit-learn==1.0.2
146 |     - scikit-misc==0.1.4
147 |     - scipy==1.7.3
148 |     - scvi==0.6.8
149 |     - scvi-tools==0.16.4
150 |     - seaborn==0.11.2
151 |     - send2trash==1.8.0
152 |     - setuptools==59.5.0
153 |     - sinfo==0.3.4
154 |     - six==1.16.0
155 |     - statsmodels==0.13.2
156 |     - stdlib-list==0.8.0
157 |     - tables==3.7.0
158 |     - tensorboard==2.8.0
159 |     - tensorboard-data-server==0.6.1
160 |     - tensorboard-plugin-wit==1.8.1
161 |     - terminado==0.13.1
162 |     - testpath==0.6.0
163 |     - texttable==1.6.4
164 |     - threadpoolctl==3.1.0
165 |     - toolz==0.11.2
166 |     - torch==1.10.2
167 |     - torchmetrics==0.7.2
168 |     - tornado==6.1
169 |     - tqdm==4.62.3
170 |     - traitlets==5.1.1
171 |     - typing-extensions==4.1.1
172 |     - umap-learn==0.5.2
173 |     - urllib3==1.26.8
174 |     - wcwidth==0.2.5
175 |     - webencodings==0.5.1
176 |     - werkzeug==2.0.3
177 |     - widgetsnbextension==3.5.2
178 |     - xlrd==1.2.0
179 |     - yarl==1.7.2
180 |     - zipp==3.7.0
181 | prefix: /home/myylee/anaconda3/envs/multivi
182 | 


--------------------------------------------------------------------------------
/methods/multivi/multivi_env.R:
--------------------------------------------------------------------------------
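# command line (shell)
# NOTE(review): the steps below create and populate the `figr` conda environment,
# although this file is methods/multivi/multivi_env.R - confirm whether it is a copy of figr_env.R.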
 2 | 
 3 | conda create -n figr r-essentials r-base
 4 | 
 5 | conda activate figr
 6 | conda install -y igraph hdf5
 7 | 
 8 | R
 9 | # R
10 | install.packages('Seurat')
11 | install.packages('IRkernel')
12 | IRkernel::installspec(name = 'figr', displayname = 'rfigr')
13 | 
14 | 
15 | system("conda install -y -c conda-forge r-devtools")
16 | install.packages("BiocManager")
17 | library(devtools)
18 | 
19 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
20 | remotes::install_version("RSQLite", version = "2.2.5")
21 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
22 | install.packages("Signac") 
23 | remotes::install_github("mojaveazure/seurat-disk")
24 | 
25 | install.packages("optmatch")
26 | BiocManager::install("chromVAR")
27 | install.packages("pbmcapply")
28 | 
29 | system("conda install -c conda-forge r-ggrastr")
30 | devtools::install_github("caleblareau/BuenColors")
31 | BiocManager::install("ComplexHeatmap")
32 | install.packages(c("networkD3","network","GGally","network"))
33 | 
34 | library(Seurat)
35 | library(Signac)
36 | library(stringr)
37 | 
38 | # figr download from github repo 
39 | # git clone https://github.com/buenrostrolab/stimATAC_analyses_code.git
40 | 
41 | 


--------------------------------------------------------------------------------
/methods/multivi/run_multivi_2.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import sys 
  3 | print('Number of arguments:', len(sys.argv), 'arguments.')
  4 | print('Argument List:'+ str(sys.argv))
  5 | 
  6 | 
  7 | from anndata import AnnData
  8 | import anndata as ad
  9 | import scipy
 10 | import numpy as np
 11 | import pandas as pd
 12 | import scipy.io as sio
 13 | import os
 14 | import scanpy as sc
 15 | from copy import deepcopy
 16 | from utils_eval import read_mtx_folder, write_adata
 17 | import timeit
 18 | 
 19 | #==== method specific ==== 
 20 | import scvi
 21 | 
 22 | scvi.settings.seed = 420
 23 | def run_multivi_fn(in_dir,out_dir):
 24 |     ##Test
 25 |     #os.makedirs(os.path.join(out_dir,"multivi"), exist_ok=True)
 26 |     #os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 27 |     #csv_out = os.path.join(out_dir, "multivi","multivi_result.csv")
 28 |     #pd.DataFrame([1,2,3,4]).to_csv(csv_out)
 29 |     
 30 |     # save latent representation and model 
 31 |     start = timeit.default_timer()
 32 |     scvi.settings.seed = 420
 33 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 34 |                                        "Gene Expression",
 35 |                                        ["gene"],
 36 |                                        ["barcodes"])
 37 | 
 38 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 39 |                                        "Peaks",
 40 |                                        ["peak"],
 41 |                                        ["barcodes"])
 42 | 
 43 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 44 |                                        "Gene Expression",
 45 |                                        ["gene"],
 46 |                                        ["barcodes"])
 47 | 
 48 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 49 |                                        "Peaks",
 50 |                                        ["peak"],
 51 |                                        ["barcodes"])
 52 |     # Horizontally stack two modalities of paired dataset 
 53 |     adata_paired = AnnData(scipy.sparse.hstack((deepcopy(adata_prna.X), 
 54 |                                             deepcopy(adata_patac.X)), 
 55 |                                            format='csr'),
 56 |                            obs = deepcopy(adata_prna.obs),
 57 |                            var = pd.concat([deepcopy(adata_prna.var[["modality"]]),deepcopy(adata_patac.var[["modality"]])]))
 58 |     # organize_mulitome_anndatats: concatenate paired and two unpaired anndata
 59 |     adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired, adata_urna, adata_uatac)
 60 |     # gene features need to be before chromatin peaks (algorithm assumption)
 61 |     adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()
 62 |     sc.pp.filter_genes(adata_mvi, min_cells=int(adata_mvi.shape[0] * 0.01))
 63 |     # setup batch annotation
 64 |     scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality')
 65 |     # setup model 
 66 |     mvi = scvi.model.MULTIVI(
 67 |         adata_mvi,
 68 |         n_genes=(adata_mvi.var['modality']=='Gene Expression').sum(),
 69 |         n_regions=(adata_mvi.var['modality']=='Peaks').sum(),
 70 |     )
 71 |     # train 
 72 |     mvi.train()
 73 |     os.makedirs(out_dir, exist_ok=True)
 74 |     # get latent representation 
 75 |     adata_mvi.obsm["MultiVI_latent"] = mvi.get_latent_representation()
 76 |    
 77 |     adata_mvi.obs = adata_mvi.obs.set_axis([s. split("_", 1)[0] for s in adata_mvi.obs.index], axis='index')
 78 | 
 79 |     # extract latent representation
 80 |     res_df = pd.DataFrame(adata_mvi.obsm['MultiVI_latent'],index=adata_mvi.obs.index)
 81 |     # set column names as latent_x 
 82 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
 83 |     # save modality information as dataset 
 84 |     res_df['dataset'] = adata_mvi.obs['modality']
 85 |     # convert categories to the ["snATAC","scRNA","Multiome"]
 86 |     res_df['dataset'] = res_df['dataset'].astype("category")
 87 |     res_df['dataset'].cat.categories = ["snATAC","scRNA","Multiome"]
 88 |     res_df['dataset'] = res_df['dataset'].astype("string")
 89 |     
 90 |     # save latent representation and model 
 91 |     os.makedirs(os.path.join(out_dir,"multivi"), exist_ok=True)
 92 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 93 |     
 94 |     csv_out = os.path.join(out_dir, "multivi","multivi_result.csv")
 95 |     res_df.to_csv(csv_out)
 96 |     model_out = os.path.join(out_dir,"multivi","trained_multivi")
 97 |     mvi.save(model_out, overwrite=True)
 98 |     stop = timeit.default_timer()
 99 |     print('Time(s): ', stop - start)  
100 |     # record time 
101 |     runtime_out = os.path.join(out_dir,"runtime","multivi_runtime.txt")
102 |     print(stop - start,  file=open(runtime_out, 'w'))
103 |     print("------ Done ------")
104 |     print("------ Prediction ------")
105 |     start = timeit.default_timer()
106 |     # get imputated gene expression 
107 |     imputed_expression = mvi.get_normalized_expression()
108 |     rna_pred = AnnData(imputed_expression,
109 |                        obs=deepcopy(adata_mvi.obs),
110 |                        var=adata_mvi.var[adata_mvi.var['modality']=="Gene Expression"])
111 |     rna_pred.var['feature'] = list(rna_pred.var.index)
112 |     # select for cells in multiome and snATAC datasets 
113 |     idx_keep = list(rna_pred.obs['modality'].isin(["paired","accessibility"]))
114 |     # subset the imputed gene expression for the selected cells 
115 |     rna_pred_save = rna_pred[idx_keep,]
116 |     
117 |     # select for cells in multiome and snATAC dataset
118 |     idx_keep = list(adata_mvi.obs['modality'].isin(["paired","accessibility"]))
119 |     # subset the peak matrix 
120 |     adata_atac_save = adata_mvi[idx_keep,list(adata_mvi.var['modality']  == "Peaks")]
121 |     adata_atac_save.var['feature'] = list(adata_atac_save.var.index)
122 | 
123 |     # save the original peak matrix and imputed gene expression matrix in "predicted" folder 
124 |     write_adata(rna_pred_save,  os.path.join(out_dir,"multivi","predicted","RNA"),
125 |                            "RNA","gene",bc="barcodes",feature_name='feature',transpose=True)
126 |     write_adata(adata_atac_save, os.path.join(out_dir,"multivi","predicted","ATAC"),
127 |                            "ATAC","peak",bc="barcodes",feature_name='feature',transpose=True)
128 | 
129 |     stop = timeit.default_timer()
130 |     print('Time(s): ', stop - start)  
131 |     # save prediction time 
132 |     prediction_time_out = os.path.join(out_dir, "runtime","multivi_prediction_time.txt")
133 |     print(stop - start,  file=open(prediction_time_out, 'w'))
134 |     print("------ Prediction Done ------")
135 |     return(mvi)
136 | 
# Command-line entry point: <in_dir> <out_dir>.
# Exit with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <in_dir> <out_dir>")

print("argument 1:",sys.argv[1])
print("argument 2:",sys.argv[2])

mvi = run_multivi_fn(sys.argv[1],sys.argv[2])
141 | 
142 | 


--------------------------------------------------------------------------------
/methods/multivi/run_multivi_2_noPred.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import sys 
  3 | print('Number of arguments:', len(sys.argv), 'arguments.')
  4 | print('Argument List:'+ str(sys.argv))
  5 | 
  6 | 
  7 | from anndata import AnnData
  8 | import anndata as ad
  9 | import scipy
 10 | import numpy as np
 11 | import pandas as pd
 12 | import scipy.io as sio
 13 | import os
 14 | import scanpy as sc
 15 | from copy import deepcopy
 16 | from utils_eval import read_mtx_folder, write_adata
 17 | import timeit
 18 | 
 19 | #==== method specific ==== 
 20 | import scvi
 21 | 
 22 | scvi.settings.seed = 420
 23 | def run_multivi_fn(in_dir,out_dir):
 24 |     ##Test
 25 |     #os.makedirs(os.path.join(out_dir,"multivi"), exist_ok=True)
 26 |     #os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 27 |     #csv_out = os.path.join(out_dir, "multivi","multivi_result.csv")
 28 |     #pd.DataFrame([1,2,3,4]).to_csv(csv_out)
 29 |     
 30 |     # save latent representation and model 
 31 |     start = timeit.default_timer()
 32 |     scvi.settings.seed = 420
 33 |     adata_prna = read_mtx_folder(os.path.join(in_dir,"paired_RNA/"),
 34 |                                        "Gene Expression",
 35 |                                        ["gene"],
 36 |                                        ["barcodes"])
 37 | 
 38 |     adata_patac = read_mtx_folder(os.path.join(in_dir,"paired_ATAC/"),
 39 |                                        "Peaks",
 40 |                                        ["peak"],
 41 |                                        ["barcodes"])
 42 | 
 43 |     adata_urna = read_mtx_folder(os.path.join(in_dir,"unpaired_RNA/"),
 44 |                                        "Gene Expression",
 45 |                                        ["gene"],
 46 |                                        ["barcodes"])
 47 | 
 48 |     adata_uatac = read_mtx_folder(os.path.join(in_dir,"unpaired_ATAC/"),
 49 |                                        "Peaks",
 50 |                                        ["peak"],
 51 |                                        ["barcodes"])
 52 |     # Horizontally stack two modalities of paired dataset 
 53 |     adata_paired = AnnData(scipy.sparse.hstack((deepcopy(adata_prna.X), 
 54 |                                             deepcopy(adata_patac.X)), 
 55 |                                            format='csr'),
 56 |                            obs = deepcopy(adata_prna.obs),
 57 |                            var = pd.concat([deepcopy(adata_prna.var[["modality"]]),deepcopy(adata_patac.var[["modality"]])]))
 58 |     # organize_mulitome_anndatats: concatenate paired and two unpaired anndata
 59 |     adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired, adata_urna, adata_uatac)
 60 |     # gene features need to be before chromatin peaks (algorithm assumption)
 61 |     adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()
 62 |     sc.pp.filter_genes(adata_mvi, min_cells=int(adata_mvi.shape[0] * 0.01))
 63 |     # setup batch annotation
 64 |     scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality')
 65 |     # setup model 
 66 |     mvi = scvi.model.MULTIVI(
 67 |         adata_mvi,
 68 |         n_genes=(adata_mvi.var['modality']=='Gene Expression').sum(),
 69 |         n_regions=(adata_mvi.var['modality']=='Peaks').sum(),
 70 |     )
 71 |     # train 
 72 |     mvi.train()
 73 |     os.makedirs(out_dir, exist_ok=True)
 74 |     # get latent representation 
 75 |     adata_mvi.obsm["MultiVI_latent"] = mvi.get_latent_representation()
 76 |    
 77 |     adata_mvi.obs = adata_mvi.obs.set_axis([s. split("_", 1)[0] for s in adata_mvi.obs.index], axis='index')
 78 | 
 79 |     # extract latent representation
 80 |     res_df = pd.DataFrame(adata_mvi.obsm['MultiVI_latent'],index=adata_mvi.obs.index)
 81 |     # set column names as latent_x 
 82 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
 83 |     # save modality information as dataset 
 84 |     res_df['dataset'] = adata_mvi.obs['modality']
 85 |     # convert categories to the ["snATAC","scRNA","Multiome"]
 86 |     res_df['dataset'] = res_df['dataset'].astype("category")
 87 |     res_df['dataset'].cat.categories = ["snATAC","scRNA","Multiome"]
 88 |     res_df['dataset'] = res_df['dataset'].astype("string")
 89 |     
 90 |     # save latent representation and model 
 91 |     os.makedirs(os.path.join(out_dir,"multivi"), exist_ok=True)
 92 |     os.makedirs(os.path.join(out_dir,"runtime"), exist_ok=True)
 93 |     
 94 |     csv_out = os.path.join(out_dir, "multivi","multivi_result.csv")
 95 |     res_df.to_csv(csv_out)
 96 |     model_out = os.path.join(out_dir,"multivi","trained_multivi")
 97 |     mvi.save(model_out, overwrite=True)
 98 |     stop = timeit.default_timer()
 99 |     print('Time(s): ', stop - start)  
100 |     # record time 
101 |     runtime_out = os.path.join(out_dir,"runtime","multivi_runtime.txt")
102 |     print(stop - start,  file=open(runtime_out, 'w'))
103 |     print("------ Done ------")
104 |     print("------ No prediction ------")
105 |     return(mvi)
106 | 
# Command-line entry point: <in_dir> <out_dir>.
# Exit with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <in_dir> <out_dir>")

print("argument 1:",sys.argv[1])
print("argument 2:",sys.argv[2])

mvi = run_multivi_fn(sys.argv[1],sys.argv[2])
111 | 


--------------------------------------------------------------------------------
/methods/multivi/run_multivi_batch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import sys 
  3 | print('Number of arguments:', len(sys.argv), 'arguments.')
  4 | print('Argument List:'+ str(sys.argv))
  5 | 
  6 | 
  7 | from anndata import AnnData
  8 | import anndata as ad
  9 | import scipy
 10 | import numpy as np
 11 | import pandas as pd
 12 | import scipy.io as sio
 13 | import os
 14 | import scanpy as sc
 15 | from copy import deepcopy
 16 | from utils_eval import read_mtx_folder, write_adata
 17 | import timeit
 18 | 
 19 | #==== method specific ==== 
 20 | import scvi
 21 | 
 22 | scvi.settings.seed = 420
 23 | def run_multivi_fn(in_dir,out_dir):
 24 |     start = timeit.default_timer()
 25 |     scvi.settings.seed = 420
 26 |     adata_prna = read_mtx_folder(in_dir+"/paired_RNA/",
 27 |                                        "Gene Expression",
 28 |                                        ["gene"],
 29 |                                        ["barcodes","batch"])
 30 | 
 31 |     adata_patac = read_mtx_folder(in_dir+"/paired_ATAC/",
 32 |                                        "Peaks",
 33 |                                        ["peak"],
 34 |                                        ["barcodes","batch"])
 35 | 
 36 |     adata_urna = read_mtx_folder(in_dir+"/unpaired_RNA/",
 37 |                                        "Gene Expression",
 38 |                                        ["gene"],
 39 |                                        ["barcodes","batch"])
 40 | 
 41 |     adata_uatac = read_mtx_folder(in_dir+"/unpaired_ATAC/",
 42 |                                        "Peaks",
 43 |                                        ["peak"],
 44 |                                        ["barcodes","batch"])
 45 |     # Horizontally stack two modalities of paired dataset 
 46 |     adata_paired = AnnData(scipy.sparse.hstack((deepcopy(adata_prna.X), 
 47 |                                             deepcopy(adata_patac.X)), 
 48 |                                            format='csr'),
 49 |                            obs = deepcopy(adata_prna.obs),
 50 |                            var = pd.concat([deepcopy(adata_prna.var[["modality"]]),deepcopy(adata_patac.var[["modality"]])]))
 51 |     # organize_mulitome_anndatats: concatenate paired and two unpaired anndata
 52 |     adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired, adata_urna, adata_uatac)
 53 |     # gene features need to be before chromatin peaks (algorithm assumption)
 54 |     adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()
 55 |     sc.pp.filter_genes(adata_mvi, min_cells=int(adata_mvi.shape[0] * 0.01))
 56 |     # setup batch annotation
 57 |     scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality',
 58 |                            categorical_covariate_keys=['batch'])
 59 |     # setup model 
 60 |     mvi = scvi.model.MULTIVI(
 61 |         adata_mvi,
 62 |         n_genes=(adata_mvi.var['modality']=='Gene Expression').sum(),
 63 |         n_regions=(adata_mvi.var['modality']=='Peaks').sum(),
 64 |     )
 65 |     # train 
 66 |     mvi.train()
 67 |     os.makedirs(out_dir, exist_ok=True)
 68 |     # get latent representation 
 69 |     adata_mvi.obsm["MultiVI_latent"] = mvi.get_latent_representation()
 70 |    
 71 |     adata_mvi.obs = adata_mvi.obs.set_axis([s. split("_", 1)[0] for s in adata_mvi.obs.index], axis='index')
 72 | 
 73 |     # extract latent representation
 74 |     res_df = pd.DataFrame(adata_mvi.obsm['MultiVI_latent'],index=adata_mvi.obs.index)
 75 |     # set column names as latent_x 
 76 |     res_df = res_df.set_axis(["latent_" + s  for s in res_df.columns.astype("str").tolist()],axis="columns")
 77 |     # save modality information as dataset 
 78 |     res_df['dataset'] = adata_mvi.obs['modality']
 79 |     # convert categories to the ["snATAC","scRNA","Multiome"]
 80 |     res_df['dataset'] = res_df['dataset'].astype("category")
 81 |     res_df['dataset'].cat.categories = ["snATAC","scRNA","Multiome"]
 82 |     res_df['dataset'] = res_df['dataset'].astype("string")
 83 |     # save latent representation and model 
 84 |     os.makedirs(out_dir+"/multivi", exist_ok=True)
 85 |     res_df.to_csv(out_dir+"/multivi/multivi_result.csv")
 86 |     mvi.save(out_dir+"/multivi/trained_multivi", overwrite=True)
 87 |     stop = timeit.default_timer()
 88 |     print('Time(s): ', stop - start)  
 89 |     os.makedirs(out_dir+"/runtime", exist_ok=True)
 90 |     # record time 
 91 |     print(stop - start,  file=open(out_dir+'/runtime/multivi_runtime.txt', 'w'))
 92 |     print("------ Done ------")
 93 |     print("------ Prediction ------")
 94 |     start = timeit.default_timer()
 95 |     # get imputated gene expression 
 96 |     imputed_expression = mvi.get_normalized_expression()
 97 |     rna_pred = AnnData(imputed_expression,
 98 |                        obs=deepcopy(adata_mvi.obs),
 99 |                        var=adata_mvi.var[adata_mvi.var['modality']=="Gene Expression"])
100 |     rna_pred.var['feature'] = list(rna_pred.var.index)
101 |     # select for cells in multiome and snATAC datasets 
102 |     idx_keep = list(rna_pred.obs['modality'].isin(["paired","accessibility"]))
103 |     # subset the imputed gene expression for the selected cells 
104 |     rna_pred_save = rna_pred[idx_keep,]
105 |     
106 |     # select for cells in multiome and snATAC dataset
107 |     idx_keep = list(adata_mvi.obs['modality'].isin(["paired","accessibility"]))
108 |     # subset the peak matrix 
109 |     adata_atac_save = adata_mvi[idx_keep,list(adata_mvi.var['modality']  == "Peaks")]
110 |     adata_atac_save.var['feature'] = list(adata_atac_save.var.index)
111 | 
112 |     # save the original peak matrix and imputed gene expression matrix in "predicted" folder 
113 |     write_adata(rna_pred_save, out_dir+"/multivi/predicted/RNA/",
114 |                            "RNA","gene",bc="barcodes",feature_name='feature',transpose=True)
115 |     write_adata(adata_atac_save, out_dir+"/multivi/predicted/ATAC/",
116 |                            "ATAC","peak",bc="barcodes",feature_name='feature',transpose=True)
117 | 
118 |     stop = timeit.default_timer()
119 |     print('Time(s): ', stop - start)  
120 |     # save prediction time 
121 |     print(stop - start,  file=open(out_dir+'/runtime/multivi_prediction_time.txt', 'w'))
122 |     print("------ Prediction Done ------")
123 |     return(mvi)
124 | 
# Command-line entry point: <in_dir> <out_dir>.
# Exit with a usage message instead of an IndexError when arguments are missing.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <in_dir> <out_dir>")

print("argument 1:",sys.argv[1])
print("argument 2:",sys.argv[2])

mvi = run_multivi_fn(sys.argv[1],sys.argv[2])
129 | 
130 | 


--------------------------------------------------------------------------------
/methods/scmomat/scmomat.yml:
--------------------------------------------------------------------------------
  1 | name: scmomat
  2 | channels:
  3 |   - conda-forge
  4 |   - bioconda
  5 |   - defaults
  6 | dependencies:
  7 |   - _libgcc_mutex=0.1=conda_forge
  8 |   - _openmp_mutex=4.5=2_gnu
  9 |   - ca-certificates=2022.12.7=ha878542_0
 10 |   - ld_impl_linux-64=2.40=h41732ed_0
 11 |   - libffi=3.4.2=h7f98852_5
 12 |   - libgcc-ng=12.2.0=h65d4601_19
 13 |   - libgomp=12.2.0=h65d4601_19
 14 |   - libsqlite=3.40.0=h753d276_0
 15 |   - libstdcxx-ng=12.2.0=h46fd767_19
 16 |   - libzlib=1.2.13=h166bdaf_4
 17 |   - ncurses=6.3=h27087fc_1
 18 |   - openssl=1.1.1t=h0b41bf4_0
 19 |   - pip=23.0.1=pyhd8ed1ab_0
 20 |   - python=3.8.10=hb7a2778_2_cpython
 21 |   - readline=8.2=h8228510_1
 22 |   - setuptools=67.6.1=pyhd8ed1ab_0
 23 |   - sqlite=3.40.0=h4ff8645_0
 24 |   - tk=8.6.12=h27826a3_0
 25 |   - wheel=0.40.0=pyhd8ed1ab_0
 26 |   - xz=5.2.6=h166bdaf_0
 27 |   - zlib=1.2.13=h166bdaf_4
 28 |   - pip:
 29 |       - anndata==0.9.1
 30 |       - asttokens==2.2.1
 31 |       - backcall==0.2.0
 32 |       - cmake==3.26.3
 33 |       - comm==0.1.3
 34 |       - contourpy==1.0.7
 35 |       - cycler==0.11.0
 36 |       - debugpy==1.6.7
 37 |       - decorator==5.1.1
 38 |       - executing==1.2.0
 39 |       - filelock==3.11.0
 40 |       - fonttools==4.39.3
 41 |       - h5py==3.8.0
 42 |       - igraph==0.10.4
 43 |       - importlib-metadata==6.3.0
 44 |       - importlib-resources==5.12.0
 45 |       - ipykernel==6.22.0
 46 |       - ipython==8.12.0
 47 |       - jedi==0.18.2
 48 |       - jinja2==3.1.2
 49 |       - joblib==1.2.0
 50 |       - jupyter-client==8.2.0
 51 |       - jupyter-core==5.3.0
 52 |       - kiwisolver==1.4.4
 53 |       - leidenalg==0.9.1
 54 |       - lit==16.0.1
 55 |       - llvmlite==0.39.1
 56 |       - louvain==0.8.0
 57 |       - markupsafe==2.1.2
 58 |       - matplotlib==3.7.1
 59 |       - matplotlib-inline==0.1.6
 60 |       - mpmath==1.3.0
 61 |       - natsort==8.3.1
 62 |       - nest-asyncio==1.5.6
 63 |       - networkx==3.1
 64 |       - numba==0.56.4
 65 |       - numpy==1.23.5
 66 |       - nvidia-cublas-cu11==11.10.3.66
 67 |       - nvidia-cuda-cupti-cu11==11.7.101
 68 |       - nvidia-cuda-nvrtc-cu11==11.7.99
 69 |       - nvidia-cuda-runtime-cu11==11.7.99
 70 |       - nvidia-cudnn-cu11==8.5.0.96
 71 |       - nvidia-cufft-cu11==10.9.0.58
 72 |       - nvidia-curand-cu11==10.2.10.91
 73 |       - nvidia-cusolver-cu11==11.4.0.1
 74 |       - nvidia-cusparse-cu11==11.7.4.91
 75 |       - nvidia-nccl-cu11==2.14.3
 76 |       - nvidia-nvtx-cu11==11.7.91
 77 |       - packaging==23.1
 78 |       - pandas==2.0.0
 79 |       - parso==0.8.3
 80 |       - patsy==0.5.3
 81 |       - pexpect==4.8.0
 82 |       - pickleshare==0.7.5
 83 |       - pillow==9.5.0
 84 |       - platformdirs==3.2.0
 85 |       - prompt-toolkit==3.0.38
 86 |       - psutil==5.9.4
 87 |       - ptyprocess==0.7.0
 88 |       - pure-eval==0.2.2
 89 |       - pygments==2.15.0
 90 |       - pynndescent==0.5.8
 91 |       - pyparsing==3.0.9
 92 |       - python-dateutil==2.8.2
 93 |       - pytz==2023.3
 94 |       - pyzmq==25.0.2
 95 |       - scanpy==1.9.3
 96 |       - scikit-learn==1.2.2
 97 |       - scipy==1.10.1
 98 |       - scmomat==0.2.0
 99 |       - seaborn==0.12.2
100 |       - session-info==1.0.0
101 |       - six==1.16.0
102 |       - stack-data==0.6.2
103 |       - statsmodels==0.13.5
104 |       - stdlib-list==0.8.0
105 |       - sympy==1.11.1
106 |       - texttable==1.6.7
107 |       - threadpoolctl==3.1.0
108 |       - torch==2.0.0
109 |       - tornado==6.2
110 |       - tqdm==4.65.0
111 |       - traitlets==5.9.0
112 |       - triton==2.0.0
113 |       - typing-extensions==4.5.0
114 |       - tzdata==2023.3
115 |       - umap-learn==0.5.3
116 |       - wcwidth==0.2.6
117 |       - zipp==3.15.0
118 | prefix: /home/myylee/anaconda3/envs/scmomat
119 | 


--------------------------------------------------------------------------------
/methods/scmomat/scmomat_env.txt:
--------------------------------------------------------------------------------
 1 | # command line
 2 | conda create -n scmomat python=3.8.10
 3 | 
 4 | conda activate scmomat
 5 | 
 6 | pip install scmomat
 7 | 
 8 | pip install scanpy leidenalg ipykernel
 9 | 
10 | python -m ipykernel install --user --name scmomat --display-name "scmomat"
11 | 


--------------------------------------------------------------------------------
/methods/seuratv3/run_seurat3.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | 
 10 | # Integrate scRNA, snATAC and multiome cells with Seurat v3 transfer anchors (CCA), write the
 11 | # joint PCA embedding and the runtime, then predict RNA expression for the ATAC cells.
 12 | #   in_dir:  input directory passed to load_datasets(); the result must provide
 13 | #            $paired_rna, $paired_atac, $unpaired_rna and $unpaired_atac
 14 | #   out_dir: output root; results go to <out_dir>/seurat3, timings to <out_dir>/runtime
 15 | # Called for its side effects (files written under out_dir).
 16 | run_seurat3_fn <- function(in_dir, out_dir){
 17 |     # starting time
 18 |     t1 <- Sys.time()
 19 |     n_lat = 30
 20 |     plan("multisession")
 21 |     # the option is spelled "future.rng.onMisuse"; the former "onMisue" spelling set nothing
 22 |     options(future.rng.onMisuse = "ignore")
 23 |     print(paste0("workers used:",nbrOfWorkers()))
 24 | 
 25 |     datasets = load_datasets(in_dir)
 26 |     paired_rna=datasets$paired_rna
 27 |     paired_atac=datasets$paired_atac
 28 |     unpaired_rna=datasets$unpaired_rna
 29 |     unpaired_atac=datasets$unpaired_atac
 30 | 
 31 |     # verify number of cells in each condition
 32 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 33 |                        c(ncol(unpaired_rna), ncol(unpaired_atac),
 34 |                          ncol(paired_rna), ncol(paired_atac)))
 35 |     print(table(dataset_vec))
 36 | 
 37 |     # prefix multiome cell names by modality (presumably both halves share barcodes - confirm)
 38 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 39 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 40 | 
 41 |     unpaired_atac@meta.data$dataset <- "snATAC"
 42 |     unpaired_rna@meta.data$dataset <- "scRNA"
 43 |     paired_atac@meta.data$dataset <- "Multiome-ATAC"
 44 |     paired_rna@meta.data$dataset <- "Multiome-RNA"
 45 | 
 46 |     # merging: multiome RNA joins the scRNA object, multiome ATAC joins the snATAC object
 47 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 48 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 49 | 
 50 |     # Perform standard analysis of each modality independently: RNA analysis
 51 |     unpaired_rna <- NormalizeData(unpaired_rna)
 52 |     unpaired_rna <- FindVariableFeatures(unpaired_rna)
 53 |     unpaired_rna <- ScaleData(unpaired_rna)
 54 |     unpaired_rna <- RunPCA(unpaired_rna)
 55 |     unpaired_rna <- RunUMAP(unpaired_rna, dims = 1:n_lat)
 56 | 
 57 |     # ATAC analysis; the first LSI dimension is excluded as it typically tracks sequencing depth
 58 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 59 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = "q0")
 60 |     unpaired_atac <- RunSVD(unpaired_atac)
 61 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:n_lat, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 62 | 
 63 |     # quantify gene activity for the RNA variable features and add it as a new assay
 64 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 65 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 66 | 
 67 |     # normalize gene activities
 68 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 69 |     unpaired_atac <- NormalizeData(unpaired_atac)
 70 |     unpaired_atac <- ScaleData(unpaired_atac, features = rownames(unpaired_atac))
 71 | 
 72 |     # Identify anchors
 73 |     transfer.anchors <- FindTransferAnchors(reference = unpaired_rna, 
 74 |                                             query = unpaired_atac, 
 75 |                                             features = VariableFeatures(object = unpaired_rna),
 76 |                                             reference.assay = "RNA", 
 77 |                                             query.assay = "ACTIVITY", 
 78 |                                             reduction = "cca")
 79 |     # note that we restrict the imputation to variable genes from scRNA-seq, but could impute the
 80 |     # full transcriptome if we wanted to
 81 |     genes.use <- VariableFeatures(unpaired_rna)
 82 |     refdata <- GetAssayData(unpaired_rna, assay = "RNA", slot = "data")[genes.use, ]
 83 | 
 84 |     # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.  imputation
 85 |     # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
 86 |     imputation <- TransferData(anchorset = transfer.anchors, 
 87 |                                refdata = refdata, 
 88 |                                weight.reduction = unpaired_atac[["lsi"]],
 89 |                                dims = 2:n_lat)
 90 |     unpaired_atac[["RNA"]] <- imputation
 91 | 
 92 |     coembed <- merge(x = unpaired_rna, y = unpaired_atac)
 93 | 
 94 |     # Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both
 95 |     # datasets
 96 |     coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
 97 |     coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
 98 |     coembed <- RunUMAP(coembed, dims = 1:n_lat)
 99 | 
100 |     print("------ Saving integration result ------")
101 |     # the saved latent space is the first n_lat principal components of the co-embedding
102 |     latent_df = as.data.frame(coembed@reductions$pca@cell.embeddings[,1:n_lat])
103 |     colnames(latent_df) = paste0("latent_",1:ncol(latent_df))
104 | 
105 |     df = cbind(latent_df,dataset=coembed@meta.data$dataset)
106 |     # tabulate from df: the embedding-only frame has no dataset column
107 |     print(table(df$dataset))
108 |     # showWarnings = FALSE: writing into an already existing output directory is expected
109 |     dir.create(file.path(out_dir,"seurat3"),recursive=TRUE,showWarnings=FALSE)
110 |     write.csv(df,file.path(out_dir,"seurat3","seurat3_result.csv"))
111 |     t2 <- Sys.time()
112 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
113 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
114 |                 file = file.path(out_dir,"runtime","seurat3_runtime.txt"), 
115 |                 sep = "\t",
116 |                 row.names = FALSE,
117 |                 col.names = FALSE)
118 |     print("------ Done ------")
119 | 
120 |     print("------ Prediction ------")
121 |     # starting time
122 |     t1 <- Sys.time()
123 | 
124 |     # predict gene expression values for all genes (not only the variable features)
125 |     rna <- TransferData(
126 |       anchorset = transfer.anchors,
127 |       refdata = GetAssayData(unpaired_rna, assay = "RNA", slot = "data"),
128 |       weight.reduction = unpaired_atac[["lsi"]],
129 |       dims = 2:n_lat
130 |     )
131 | 
132 |     # add predicted values as a new assay
133 |     unpaired_atac[["predicted"]] <- rna
134 | 
135 |     write_mtx_folder(file.path(out_dir,"seurat3","predicted","ATAC"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
136 |     write_mtx_folder(file.path(out_dir,"seurat3","predicted","RNA"),unpaired_atac,assay_key="predicted",slot_key="data","gene")
137 | 
138 |     t2 <- Sys.time()
139 |     ## use '[[1]]' for clean output
140 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
141 |                 file = file.path(out_dir,"runtime","seurat3_prediction_time.txt"), 
142 |                 sep = "\t",
143 |                 row.names = FALSE,
144 |                 col.names = FALSE)
145 |     print("------ Prediction Done ------")
146 | }
147 | 
148 | 
149 | # Command-line entry point: Rscript run_seurat3.R <input directory> <output directory>
150 | if (length(args) < 2) {
151 |   stop("Insufficient number of arguments are supplied (input directory, output directory).", call.=FALSE)
152 | }
153 | print(paste0("Input directory: ",args[1]))
154 | print(paste0("Output directory: ",args[2]))
155 | run_seurat3_fn(args[1], args[2])
156 | 


--------------------------------------------------------------------------------
/methods/seuratv3/run_seurat3_batch_sequential.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | ### NOT USING! 
  3 | args = commandArgs(trailingOnly=TRUE)
  4 | 
  5 | require(Seurat)
  6 | require(Matrix)
  7 | source("seurat_wnn_project.R")
  8 | source("r_utils.R")
  9 | require(future)
 10 | 
 11 | # Batch-aware variant of the Seurat v3 integration: cells are grouped by batch and technology
 12 | # for feature selection and scaling, then the joint PCA embedding and the runtime are
 13 | # written. No expression prediction is performed.
 14 | #   in_dir:  input directory passed to load_datasets(); the loaded objects must carry a
 15 | #            "batch" metadata column
 16 | #   out_dir: output root; results go to <out_dir>/seurat3, timings to <out_dir>/runtime
 17 | run_seurat3_fn <- function(in_dir, out_dir){
 18 |     # starting time
 19 |     t1 <- Sys.time()
 20 |     n_lat = 30
 21 | #     plan("multisession")
 22 | #     options(future.rng.onMisuse = "ignore")
 23 | #     print(paste0("workers used:",nbrOfWorkers()))
 24 | #     options(future.globals.maxSize = 8000 * 1024^2)
 25 | 
 26 |     datasets = load_datasets(in_dir,obs=c("barcodes","batch"))
 27 |     paired_rna=datasets$paired_rna
 28 |     paired_atac=datasets$paired_atac
 29 |     unpaired_rna=datasets$unpaired_rna
 30 |     unpaired_atac=datasets$unpaired_atac
 31 | 
 32 |     # verify number of cells in each condition
 33 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 34 |                        c(ncol(unpaired_rna), ncol(unpaired_atac),
 35 |                          ncol(paired_rna), ncol(paired_atac)))
 36 |     print(table(dataset_vec))
 37 | 
 38 |     # prefix multiome cell names by modality (presumably both halves share barcodes - confirm)
 39 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 40 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 41 | 
 42 |     # label each object with its technology before merging
 43 |     unpaired_rna@meta.data$technology = "scRNA"
 44 |     unpaired_atac@meta.data$technology = "snATAC"
 45 |     paired_rna@meta.data$technology = "Multiome-RNA"
 46 |     paired_atac@meta.data$technology = "Multiome-ATAC"
 47 | 
 48 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 49 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 50 | 
 51 |     # one group per batch/technology combination
 52 |     unpaired_rna@meta.data$group <- paste0(unpaired_rna$batch,"_",unpaired_rna$technology)
 53 |     unpaired_atac@meta.data$group <- paste0(unpaired_atac$batch,"_",unpaired_atac$technology)
 54 | 
 55 |     # Normalize gene expression and obtain highly variable genes 
 56 |     DefaultAssay(unpaired_rna) <- "RNA"
 57 |     unpaired_rna <- NormalizeData(unpaired_rna)
 58 |     rna.list <- SplitObject(unpaired_rna, split.by = "group")
 59 |     # select highly variable features across groups (no trailing comma: it passed an empty argument)
 60 |     features = SelectIntegrationFeatures(
 61 |       rna.list,
 62 |       nfeatures = 5000,
 63 |       verbose = TRUE,
 64 |       fvf.nfeatures = 10000
 65 |     )
 66 | 
 67 |     # For ATAC-seq 
 68 |     # We exclude the first dimension as this is typically correlated with sequencing depth
 69 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 70 |     # min.cutoff = 50 
 71 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = 50)
 72 |     unpaired_atac <- RunSVD(unpaired_atac)
 73 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:n_lat, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 74 | 
 75 |     # quantify gene activity
 76 |     gene.activities <- GeneActivity(unpaired_atac, features = features)
 77 |     # add gene activities as a new assay
 78 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 79 |     # normalize gene activities
 80 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 81 |     unpaired_atac <- NormalizeData(unpaired_atac)
 82 |     unpaired_atac <- ScaleData(unpaired_atac, 
 83 |                                features = rownames(unpaired_atac),
 84 |                                split.by='group')
 85 | 
 86 |     # register the integration features through the accessor instead of writing the slot directly
 87 |     VariableFeatures(unpaired_rna, assay = "RNA") <- features
 88 |     unpaired_rna <- ScaleData(unpaired_rna,features=features,split.by='group')
 89 |     unpaired_rna <- RunPCA(unpaired_rna,features=features)
 90 |     unpaired_rna <- FindNeighbors(unpaired_rna, dims = 1:20, reduction = "pca")
 91 |     unpaired_rna <- FindClusters(unpaired_rna, resolution = 0.5)
 92 |     unpaired_rna <- RunUMAP(unpaired_rna, reduction = "pca", dims = 1:15)
 93 | 
 94 |     # due to error with future, change to sequential from this point onward 
 95 | #     plan(sequential)
 96 |     # Identify anchors
 97 |     transfer.anchors <- FindTransferAnchors(reference = unpaired_rna, 
 98 |                                             query = unpaired_atac, 
 99 |                                             features = VariableFeatures(object = unpaired_rna),
100 |                                             reference.assay = "RNA", 
101 |                                             query.assay = "ACTIVITY", 
102 |                                             reduction = "cca")
103 |     # note that we restrict the imputation to variable genes from scRNA-seq, but could impute the
104 |     # full transcriptome if we wanted to
105 |     genes.use <- VariableFeatures(unpaired_rna)
106 |     refdata <- GetAssayData(unpaired_rna, assay = "RNA", slot = "data")[genes.use, ]
107 | 
108 |     # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.  imputation
109 |     # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
110 |     imputation <- TransferData(anchorset = transfer.anchors, 
111 |                                refdata = refdata, 
112 |                                weight.reduction = unpaired_atac[["lsi"]],
113 |                                dims = 2:n_lat)
114 |     unpaired_atac[["RNA"]] <- imputation
115 | 
116 |     coembed <- merge(x = unpaired_rna, y = unpaired_atac)
117 | 
118 |     # Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both
119 |     # datasets
120 |     coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
121 |     coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
122 |     coembed <- RunUMAP(coembed, dims = 1:n_lat)
123 | 
124 |     print("------ Saving integration result ------")
125 |     # the saved latent space is the first n_lat principal components of the co-embedding
126 |     latent_df = as.data.frame(coembed@reductions$pca@cell.embeddings[,1:n_lat])
127 |     colnames(latent_df) = paste0("latent_",1:ncol(latent_df))
128 | 
129 |     df = cbind(latent_df,dataset=coembed@meta.data$technology)
130 |     print(table(df$dataset))
131 |     # showWarnings = FALSE: writing into an already existing output directory is expected
132 |     dir.create(file.path(out_dir,"seurat3"),recursive=TRUE,showWarnings=FALSE)
133 |     write.csv(df,file.path(out_dir,"seurat3","seurat3_result.csv"))
134 |     t2 <- Sys.time()
135 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
136 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
137 |                 file = file.path(out_dir,"runtime","seurat3_runtime.txt"), 
138 |                 sep = "\t",
139 |                 row.names = FALSE,
140 |                 col.names = FALSE)
141 |     print("------ Done ------")
142 |     print("------ No prediction ------")
143 | }
144 | 
145 | 
146 | # Command-line entry point: Rscript <this script> <input directory> <output directory>
147 | if (length(args) < 2) {
148 |   stop("Insufficient number of arguments are supplied (input directory, output directory).", call.=FALSE)
149 | }
150 | print(paste0("Input directory: ",args[1]))
151 | print(paste0("Output directory: ",args[2]))
152 | run_seurat3_fn(args[1], args[2])
153 | 


--------------------------------------------------------------------------------
/methods/seuratv3/run_seurat3_noPred.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | 
 10 | # Integrate scRNA, snATAC and multiome cells with Seurat v3 transfer anchors (CCA) and write
 11 | # the joint PCA embedding and the runtime. This variant performs no expression prediction.
 12 | #   in_dir:  input directory passed to load_datasets(); the result must provide
 13 | #            $paired_rna, $paired_atac, $unpaired_rna and $unpaired_atac
 14 | #   out_dir: output root; results go to <out_dir>/seurat3, timings to <out_dir>/runtime
 15 | # Called for its side effects (files written under out_dir).
 16 | run_seurat3_fn <- function(in_dir, out_dir){
 17 |     # starting time
 18 |     t1 <- Sys.time()
 19 |     n_lat = 30
 20 |     plan("multisession")
 21 |     # the option is spelled "future.rng.onMisuse"; the former "onMisue" spelling set nothing
 22 |     options(future.rng.onMisuse = "ignore")
 23 |     print(paste0("workers used:",nbrOfWorkers()))
 24 | 
 25 |     datasets = load_datasets(in_dir)
 26 |     paired_rna=datasets$paired_rna
 27 |     paired_atac=datasets$paired_atac
 28 |     unpaired_rna=datasets$unpaired_rna
 29 |     unpaired_atac=datasets$unpaired_atac
 30 | 
 31 |     # verify number of cells in each condition
 32 |     dataset_vec <- rep(c("scRNA","snATAC","Multiome-RNA","Multiome-ATAC"),
 33 |                        c(ncol(unpaired_rna), ncol(unpaired_atac),
 34 |                          ncol(paired_rna), ncol(paired_atac)))
 35 |     print(table(dataset_vec))
 36 | 
 37 |     # prefix multiome cell names by modality (presumably both halves share barcodes - confirm)
 38 |     paired_rna <- RenameCells(paired_rna,add.cell.id = "prna",for.merge = FALSE)
 39 |     paired_atac <- RenameCells(paired_atac,add.cell.id = "patac",for.merge = FALSE)
 40 | 
 41 |     unpaired_atac@meta.data$dataset <- "snATAC"
 42 |     unpaired_rna@meta.data$dataset <- "scRNA"
 43 |     paired_atac@meta.data$dataset <- "Multiome-ATAC"
 44 |     paired_rna@meta.data$dataset <- "Multiome-RNA"
 45 | 
 46 |     # merging: multiome RNA joins the scRNA object, multiome ATAC joins the snATAC object
 47 |     unpaired_rna <- merge(unpaired_rna,paired_rna)
 48 |     unpaired_atac <- merge(unpaired_atac,paired_atac)
 49 | 
 50 |     # Perform standard analysis of each modality independently: RNA analysis
 51 |     unpaired_rna <- NormalizeData(unpaired_rna)
 52 |     unpaired_rna <- FindVariableFeatures(unpaired_rna)
 53 |     unpaired_rna <- ScaleData(unpaired_rna)
 54 |     unpaired_rna <- RunPCA(unpaired_rna)
 55 |     unpaired_rna <- RunUMAP(unpaired_rna, dims = 1:n_lat)
 56 | 
 57 |     # ATAC analysis; the first LSI dimension is excluded as it typically tracks sequencing depth
 58 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 59 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = "q0")
 60 |     unpaired_atac <- RunSVD(unpaired_atac)
 61 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:n_lat, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 62 | 
 63 |     # quantify gene activity for the RNA variable features and add it as a new assay
 64 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 65 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 66 | 
 67 |     # normalize gene activities
 68 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 69 |     unpaired_atac <- NormalizeData(unpaired_atac)
 70 |     unpaired_atac <- ScaleData(unpaired_atac, features = rownames(unpaired_atac))
 71 | 
 72 |     # Identify anchors
 73 |     transfer.anchors <- FindTransferAnchors(reference = unpaired_rna, 
 74 |                                             query = unpaired_atac, 
 75 |                                             features = VariableFeatures(object = unpaired_rna),
 76 |                                             reference.assay = "RNA", 
 77 |                                             query.assay = "ACTIVITY", 
 78 |                                             reduction = "cca")
 79 |     # note that we restrict the imputation to variable genes from scRNA-seq, but could impute the
 80 |     # full transcriptome if we wanted to
 81 |     genes.use <- VariableFeatures(unpaired_rna)
 82 |     refdata <- GetAssayData(unpaired_rna, assay = "RNA", slot = "data")[genes.use, ]
 83 | 
 84 |     # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.  imputation
 85 |     # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
 86 |     imputation <- TransferData(anchorset = transfer.anchors, 
 87 |                                refdata = refdata, 
 88 |                                weight.reduction = unpaired_atac[["lsi"]],
 89 |                                dims = 2:n_lat)
 90 |     unpaired_atac[["RNA"]] <- imputation
 91 | 
 92 |     coembed <- merge(x = unpaired_rna, y = unpaired_atac)
 93 | 
 94 |     # Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both
 95 |     # datasets
 96 |     coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
 97 |     coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
 98 |     coembed <- RunUMAP(coembed, dims = 1:n_lat)
 99 | 
100 |     print("------ Saving integration result ------")
101 |     # the saved latent space is the first n_lat principal components of the co-embedding
102 |     latent_df = as.data.frame(coembed@reductions$pca@cell.embeddings[,1:n_lat])
103 |     colnames(latent_df) = paste0("latent_",1:ncol(latent_df))
104 | 
105 |     df = cbind(latent_df,dataset=coembed@meta.data$dataset)
106 |     # tabulate from df: the embedding-only frame has no dataset column
107 |     print(table(df$dataset))
108 |     # showWarnings = FALSE: writing into an already existing output directory is expected
109 |     dir.create(file.path(out_dir,"seurat3"),recursive=TRUE,showWarnings=FALSE)
110 |     write.csv(df,file.path(out_dir,"seurat3","seurat3_result.csv"))
111 |     t2 <- Sys.time()
112 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
113 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
114 |                 file = file.path(out_dir,"runtime","seurat3_runtime.txt"), 
115 |                 sep = "\t",
116 |                 row.names = FALSE,
117 |                 col.names = FALSE)
118 |     print("------ Done ------")
119 | }
120 | 
121 | 
122 | # Command-line entry point: Rscript <this script> <input directory> <output directory>
123 | if (length(args) < 2) {
124 |   stop("Insufficient number of arguments are supplied (input directory, output directory).", call.=FALSE)
125 | }
126 | print(paste0("Input directory: ",args[1]))
127 | print(paste0("Output directory: ",args[2]))
128 | run_seurat3_fn(args[1], args[2])
129 | 


--------------------------------------------------------------------------------
/methods/seuratv3/run_seurat3_single.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | 
 10 | # Integrate the single-modality scRNA and snATAC cells with Seurat v3 transfer anchors (CCA),
 11 | # write the joint PCA embedding and the runtime, then predict RNA expression for ATAC cells.
 12 | # Paired (multiome) data returned by load_datasets() is ignored in this variant.
 13 | #   in_dir:  input directory passed to load_datasets(); $unpaired_rna and $unpaired_atac are used
 14 | #   out_dir: output root; results go to <out_dir>/seurat3, timings to <out_dir>/runtime
 15 | # Called for its side effects (files written under out_dir).
 16 | run_seurat3_fn <- function(in_dir, out_dir){
 17 |     # starting time
 18 |     t1 <- Sys.time()
 19 |     n_lat = 30
 20 |     plan("multisession")
 21 |     # the option is spelled "future.rng.onMisuse"; the former "onMisue" spelling set nothing
 22 |     options(future.rng.onMisuse = "ignore")
 23 |     print(paste0("workers used:",nbrOfWorkers()))
 24 | 
 25 |     datasets = load_datasets(in_dir)
 26 |     print("loading single modality datasets only, ignoring paired RNA and paired ATAC folder")
 27 |     #paired_rna=datasets$paired_rna
 28 |     #paired_atac=datasets$paired_atac
 29 |     unpaired_rna=datasets$unpaired_rna
 30 |     unpaired_atac=datasets$unpaired_atac
 31 | 
 32 |     # print number of cells per data type
 33 |     dataset_vec <- rep(c("scRNA","snATAC"),
 34 |                        c(ncol(unpaired_rna),
 35 |                          ncol(unpaired_atac)))
 36 |     print(table(dataset_vec))
 37 | 
 38 |     unpaired_atac@meta.data$dataset <- "snATAC"
 39 |     unpaired_rna@meta.data$dataset <- "scRNA"
 40 | 
 41 |     # Perform standard analysis of each modality independently: RNA analysis
 42 |     unpaired_rna <- NormalizeData(unpaired_rna)
 43 |     unpaired_rna <- FindVariableFeatures(unpaired_rna)
 44 |     unpaired_rna <- ScaleData(unpaired_rna)
 45 |     unpaired_rna <- RunPCA(unpaired_rna)
 46 |     unpaired_rna <- RunUMAP(unpaired_rna, dims = 1:n_lat)
 47 | 
 48 |     # ATAC analysis; the first LSI dimension is excluded as it typically tracks sequencing depth
 49 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 50 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = "q0")
 51 |     unpaired_atac <- RunSVD(unpaired_atac)
 52 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:n_lat, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 53 | 
 54 |     # quantify gene activity for the RNA variable features and add it as a new assay
 55 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 56 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 57 | 
 58 |     # normalize gene activities
 59 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 60 |     unpaired_atac <- NormalizeData(unpaired_atac)
 61 |     unpaired_atac <- ScaleData(unpaired_atac, features = rownames(unpaired_atac))
 62 | 
 63 |     # Identify anchors
 64 |     transfer.anchors <- FindTransferAnchors(reference = unpaired_rna, 
 65 |                                             query = unpaired_atac, 
 66 |                                             features = VariableFeatures(object = unpaired_rna),
 67 |                                             reference.assay = "RNA", 
 68 |                                             query.assay = "ACTIVITY", 
 69 |                                             reduction = "cca")
 70 |     # note that we restrict the imputation to variable genes from scRNA-seq, but could impute the
 71 |     # full transcriptome if we wanted to
 72 |     genes.use <- VariableFeatures(unpaired_rna)
 73 |     refdata <- GetAssayData(unpaired_rna, assay = "RNA", slot = "data")[genes.use, ]
 74 | 
 75 |     # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.  imputation
 76 |     # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
 77 |     imputation <- TransferData(anchorset = transfer.anchors, 
 78 |                                refdata = refdata, 
 79 |                                weight.reduction = unpaired_atac[["lsi"]],
 80 |                                dims = 2:n_lat)
 81 |     unpaired_atac[["RNA"]] <- imputation
 82 | 
 83 |     coembed <- merge(x = unpaired_rna, y = unpaired_atac)
 84 | 
 85 |     # Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both
 86 |     # datasets
 87 |     coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
 88 |     coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
 89 |     coembed <- RunUMAP(coembed, dims = 1:n_lat)
 90 | 
 91 |     print("------ Saving integration result ------")
 92 |     # the saved latent space is the first n_lat principal components of the co-embedding
 93 |     latent_df = as.data.frame(coembed@reductions$pca@cell.embeddings[,1:n_lat])
 94 |     colnames(latent_df) = paste0("latent_",1:ncol(latent_df))
 95 | 
 96 |     df = cbind(latent_df,dataset=coembed@meta.data$dataset)
 97 |     # tabulate from df: the embedding-only frame has no dataset column
 98 |     print(table(df$dataset))
 99 |     # showWarnings = FALSE: writing into an already existing output directory is expected
100 |     dir.create(file.path(out_dir,"seurat3"),recursive=TRUE,showWarnings=FALSE)
101 |     write.csv(df,file.path(out_dir,"seurat3","seurat3_result.csv"))
102 |     t2 <- Sys.time()
103 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
104 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
105 |                 file = file.path(out_dir,"runtime","seurat3_runtime.txt"), 
106 |                 sep = "\t",
107 |                 row.names = FALSE,
108 |                 col.names = FALSE)
109 |     print("------ Done ------")
110 | 
111 |     print("------ Prediction ------")
112 |     # starting time
113 |     t1 <- Sys.time()
114 | 
115 |     # predict gene expression values for all genes (not only the variable features)
116 |     rna <- TransferData(
117 |       anchorset = transfer.anchors,
118 |       refdata = GetAssayData(unpaired_rna, assay = "RNA", slot = "data"),
119 |       weight.reduction = unpaired_atac[["lsi"]],
120 |       dims = 2:n_lat
121 |     )
122 | 
123 |     # add predicted values as a new assay
124 |     unpaired_atac[["predicted"]] <- rna
125 | 
126 |     write_mtx_folder(file.path(out_dir,"seurat3","predicted/ATAC/"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
127 |     write_mtx_folder(file.path(out_dir,"seurat3","predicted/RNA/"),unpaired_atac,assay_key="predicted",slot_key="data","gene")
128 | 
129 |     t2 <- Sys.time()
130 |     ## use '[[1]]' for clean output
131 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
132 |                 file = file.path(out_dir,"runtime","seurat3_prediction_time.txt"), 
133 |                 sep = "\t",
134 |                 row.names = FALSE,
135 |                 col.names = FALSE)
136 |     print("------ Prediction Done ------")
137 | }
138 | 
139 | 
140 | # Command-line entry point: Rscript <this script> <input directory> <output directory>
141 | if (length(args) < 2) {
142 |   stop("Insufficient number of arguments are supplied (input directory, output directory).", call.=FALSE)
143 | }
144 | print(paste0("Input directory: ",args[1]))
145 | print(paste0("Output directory: ",args[2]))
146 | run_seurat3_fn(args[1], args[2])
147 | 


--------------------------------------------------------------------------------
/methods/seuratv3/run_seurat3_single_noPred.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | 
 10 | # Integrate the single-modality scRNA and snATAC cells with Seurat v3 transfer anchors (CCA)
 11 | # and write the joint PCA embedding and the runtime. No expression prediction is performed.
 12 | # Paired (multiome) data returned by load_datasets() is ignored in this variant.
 13 | #   in_dir:  input directory passed to load_datasets(); $unpaired_rna and $unpaired_atac are used
 14 | #   out_dir: output root; results go to <out_dir>/seurat3, timings to <out_dir>/runtime
 15 | # Called for its side effects (files written under out_dir).
 16 | run_seurat3_fn <- function(in_dir, out_dir){
 17 |     # starting time
 18 |     t1 <- Sys.time()
 19 |     n_lat = 30
 20 |     plan("multisession")
 21 |     # the option is spelled "future.rng.onMisuse"; the former "onMisue" spelling set nothing
 22 |     options(future.rng.onMisuse = "ignore")
 23 |     print(paste0("workers used:",nbrOfWorkers()))
 24 | 
 25 |     datasets = load_datasets(in_dir)
 26 |     print("loading single modality datasets only, ignoring paired RNA and paired ATAC folder")
 27 |     #paired_rna=datasets$paired_rna
 28 |     #paired_atac=datasets$paired_atac
 29 |     unpaired_rna=datasets$unpaired_rna
 30 |     unpaired_atac=datasets$unpaired_atac
 31 | 
 32 |     # print number of cells per data type
 33 |     dataset_vec <- rep(c("scRNA","snATAC"),
 34 |                        c(ncol(unpaired_rna),
 35 |                          ncol(unpaired_atac)))
 36 |     print(table(dataset_vec))
 37 | 
 38 |     unpaired_atac@meta.data$dataset <- "snATAC"
 39 |     unpaired_rna@meta.data$dataset <- "scRNA"
 40 | 
 41 |     # Perform standard analysis of each modality independently: RNA analysis
 42 |     unpaired_rna <- NormalizeData(unpaired_rna)
 43 |     unpaired_rna <- FindVariableFeatures(unpaired_rna)
 44 |     unpaired_rna <- ScaleData(unpaired_rna)
 45 |     unpaired_rna <- RunPCA(unpaired_rna)
 46 |     unpaired_rna <- RunUMAP(unpaired_rna, dims = 1:n_lat)
 47 | 
 48 |     # ATAC analysis; the first LSI dimension is excluded as it typically tracks sequencing depth
 49 |     unpaired_atac <- RunTFIDF(unpaired_atac)
 50 |     unpaired_atac <- FindTopFeatures(unpaired_atac, min.cutoff = "q0")
 51 |     unpaired_atac <- RunSVD(unpaired_atac)
 52 |     unpaired_atac <- RunUMAP(unpaired_atac, reduction = "lsi", dims = 2:n_lat, reduction.name = "umap.atac", reduction.key = "atacUMAP_")
 53 | 
 54 |     # quantify gene activity for the RNA variable features and add it as a new assay
 55 |     gene.activities <- GeneActivity(unpaired_atac, features = VariableFeatures(unpaired_rna))
 56 |     unpaired_atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities)
 57 | 
 58 |     # normalize gene activities
 59 |     DefaultAssay(unpaired_atac) <- "ACTIVITY"
 60 |     unpaired_atac <- NormalizeData(unpaired_atac)
 61 |     unpaired_atac <- ScaleData(unpaired_atac, features = rownames(unpaired_atac))
 62 | 
 63 |     # Identify anchors
 64 |     transfer.anchors <- FindTransferAnchors(reference = unpaired_rna, 
 65 |                                             query = unpaired_atac, 
 66 |                                             features = VariableFeatures(object = unpaired_rna),
 67 |                                             reference.assay = "RNA", 
 68 |                                             query.assay = "ACTIVITY", 
 69 |                                             reduction = "cca")
 70 |     # note that we restrict the imputation to variable genes from scRNA-seq, but could impute the
 71 |     # full transcriptome if we wanted to
 72 |     genes.use <- VariableFeatures(unpaired_rna)
 73 |     refdata <- GetAssayData(unpaired_rna, assay = "RNA", slot = "data")[genes.use, ]
 74 | 
 75 |     # refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells.  imputation
 76 |     # (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
 77 |     imputation <- TransferData(anchorset = transfer.anchors, 
 78 |                                refdata = refdata, 
 79 |                                weight.reduction = unpaired_atac[["lsi"]],
 80 |                                dims = 2:n_lat)
 81 |     unpaired_atac[["RNA"]] <- imputation
 82 | 
 83 |     coembed <- merge(x = unpaired_rna, y = unpaired_atac)
 84 | 
 85 |     # Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both
 86 |     # datasets
 87 |     coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
 88 |     coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
 89 |     coembed <- RunUMAP(coembed, dims = 1:n_lat)
 90 | 
 91 |     print("------ Saving integration result ------")
 92 |     # the saved latent space is the first n_lat principal components of the co-embedding
 93 |     latent_df = as.data.frame(coembed@reductions$pca@cell.embeddings[,1:n_lat])
 94 |     colnames(latent_df) = paste0("latent_",1:ncol(latent_df))
 95 | 
 96 |     df = cbind(latent_df,dataset=coembed@meta.data$dataset)
 97 |     # tabulate from df: the embedding-only frame has no dataset column
 98 |     print(table(df$dataset))
 99 |     # showWarnings = FALSE: writing into an already existing output directory is expected
100 |     dir.create(file.path(out_dir,"seurat3"),recursive=TRUE,showWarnings=FALSE)
101 |     write.csv(df,file.path(out_dir,"seurat3","seurat3_result.csv"))
102 |     t2 <- Sys.time()
103 |     dir.create(file.path(out_dir,"runtime"),recursive=TRUE,showWarnings=FALSE)
104 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
105 |                 file = file.path(out_dir,"runtime","seurat3_runtime.txt"), 
106 |                 sep = "\t",
107 |                 row.names = FALSE,
108 |                 col.names = FALSE)
109 |     print("------ Done ------")
110 | 
111 | }
112 | 
113 | # Command-line entry point: args[1] = input directory, args[2] = output directory
114 | if (length(args)<2) {
115 |   stop("Insufficient number of arguments are supplied: expected <input directory> <output directory>.", call.=FALSE)
116 | }else{ 
117 |     print(paste0("Input directory: ",args[1]))
118 |     print(paste0("Output directory: ",args[2]))
119 |     run_seurat3_fn(args[1], args[2])
120 | }
121 | 


--------------------------------------------------------------------------------
/methods/seuratv3/seurat_env.txt:
--------------------------------------------------------------------------------
 1 | # command line
 2 | 
 3 | conda create -n seurat r-essentials r-base
 4 | 
 5 | conda activate seurat
 6 | 
 7 | # devtools:
 8 | conda install -y -c conda-forge r-devtools
 9 | conda install -y igraph hdf5
10 | 
11 | conda install -c conda-forge r-rgeos
12 | 
13 | R
14 | install.packages('IRkernel')
15 | IRkernel::installspec(name = 'seurat', displayname = 'rseurat')
16 | 
17 | install.packages("BiocManager")
18 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
19 | remotes::install_version("RSQLite", version = "2.2.5")
20 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
21 | BiocManager::install(c("BSgenome.Hsapiens.UCSC.hg38"))
22 | install.packages("Seurat") 
23 | devtools::install_github('satijalab/seurat-data')
24 | remotes::install_github("mojaveazure/seurat-disk")
25 | 
26 | install.packages("Signac") 
27 | install.packages('qlcMatrix')
28 | 
29 | 


--------------------------------------------------------------------------------
/methods/seuratv4/run_seurat4_3.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | 
 10 | 
 11 | run_seurat4_fn <- function(in_dir, out_dir,nclust = 7){
 12 |     # Integrates the paired multiome data with the unpaired scRNA and snATAC data, then predicts RNA for the snATAC cells.
 13 |     t1 <- Sys.time()  # start of the integration timer
 14 |     plan("multisession")
 15 |     options(future.rng.onMisuse = "ignore")
 16 |     print(paste0("workers used:",nbrOfWorkers()))
 17 |     # in_dir is passed to load_datasets(); nclust is forwarded to process_paired(); all outputs go under out_dir
 18 |     # load dataset 
 19 |     datasets = load_datasets(in_dir)
 20 |     pdata = datasets$paired_rna
 21 |     pdata[["ATAC"]] = datasets$paired_atac[["ATAC"]]  # attach the paired ATAC assay to the paired RNA object
 22 |     DefaultAssay(pdata) <- "RNA"
 23 |     
 24 |     unpaired_rna = datasets$unpaired_rna
 25 |     unpaired_atac = datasets$unpaired_atac
 26 |     
 27 |     dir.create(file.path(out_dir,"seurat4","figures"),recursive=TRUE)
 28 |     print("------ Processing Multiome ------")
 29 |     pdata <- process_paired(pdata,
 30 |                             file_path = file.path(out_dir,"seurat4","figures","paired_data_umap_seurat.pdf"),
 31 |                             nclust=nclust)
 32 |     print("------ Projecting scRNA ------")
 33 |     unpaired_rna <- project_rna(pdata,unpaired_rna, 
 34 |                                 file_path = file.path(out_dir,"seurat4","figures","rna_data_umap_seurat.pdf"))
 35 |     
 36 |     print("------ Projecting snATAC ------")
 37 |     res <- project_atac_slsi(pdata,unpaired_atac, 
 38 |                                        file_path = file.path(out_dir,"seurat4","figures","atac_data_umap_seurat.pdf"),
 39 |                                        return_anchor=TRUE)
 40 |     unpaired_atac <- res[[1]]
 41 |     anchors_u2 <- res[[2]]  # anchors reused by TransferData() in the prediction step
 42 | 
 43 |     dir.create(file.path(out_dir,"runtime/"),recursive=TRUE)
 44 |     print("------ Saving integration result ------")
 45 |     df_umap = do.call(rbind,list(pdata@reductions$wnn.umap@cell.embeddings,
 46 |                     unpaired_rna@reductions$ref.umap@cell.embeddings,
 47 |                     unpaired_atac@reductions$ref.umap@cell.embeddings))
 48 | 
 49 |     # ===== added ===== rename the stacked embedding columns to latent_1..latent_k
 50 |     colnames(df_umap) = paste0("latent_",seq_len(ncol(df_umap)))
 51 | 
 52 |     pdata@meta.data$predicted.ct = pdata$ct  # expose pdata's ct labels under the column read from all three objects
 53 |     col_sel = "predicted.ct"
 54 |     df = data.frame(predicted_ct=c(pdata@meta.data[,col_sel],
 55 |            unpaired_rna@meta.data[,col_sel],
 56 |            unpaired_atac@meta.data[,col_sel]))
 57 |     # first df_umap is latent embedding, second df_umap is umap embedding, in this case, they are the same. 
 58 |     # During evaluation, if umap embedding is detected, no umap projection was run again
 59 |     df = cbind(df_umap,df_umap,df,
 60 |                dataset=c(rep("Multiome",ncol(pdata)),
 61 |                  rep("scRNA",ncol(unpaired_rna)),
 62 |                  rep("snATAC",ncol(unpaired_atac))))
 63 | 
 64 |     umap_col_range = ncol(df_umap) + seq_len(ncol(df_umap))  # positions of the second copy of df_umap
 65 |     colnames(df)[umap_col_range] = paste0("umap_",seq_along(umap_col_range))
 66 |     rownames(df) <- gsub("\\.","-",rownames(df))
 67 |     write.csv(df,file.path(out_dir,"seurat4","seurat4_result.csv"))
 68 |     t2 <- Sys.time()
 69 |     ## use '[[1]]' for clean output
 70 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
 71 |                 file = file.path(out_dir,"runtime","seurat4_runtime.txt"), 
 72 |                 sep = "\t",
 73 |                 row.names = FALSE,
 74 |                 col.names = FALSE)
 75 |     print("------ Done ------")
 76 |     
 77 |     print("------ Prediction ------")
 78 |     # starting time (t1 is reset so the prediction step is timed separately)
 79 |     t1 <- Sys.time()
 80 | 
 81 |     # predict gene expression values
 82 |     rna <- TransferData(
 83 |       anchorset = anchors_u2,
 84 |       refdata = GetAssayData(pdata, assay = "SCT", slot = "data"),
 85 |       weight.reduction = "lsiproject"
 86 |     )
 87 | 
 88 |     # add predicted values as a new assay
 89 |     unpaired_atac[["SCT"]] <- rna
 90 |     
 91 |     #data_combined <- merge(pdata,unpaired_atac)
 92 |     
 93 |     write_mtx_folder(file.path(out_dir,"seurat4","predicted","ATAC"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
 94 |     write_mtx_folder(file.path(out_dir,"seurat4","predicted","RNA"),unpaired_atac,assay_key="SCT",slot_key="data","gene")
 95 |     
 96 |     t2 <- Sys.time()
 97 |     ## use '[[1]]' for clean output
 98 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
 99 |                 file = file.path(out_dir,"runtime","seurat4_prediction_time.txt"), 
100 |                 sep = "\t",
101 |                 row.names = FALSE,
102 |                 col.names = FALSE)
103 |     print("------ Prediction Done ------")
104 |     
105 | }
106 | 
107 | 
108 | # Command-line entry point: <input directory> <output directory> [number of clusters]
109 | if (length(args)<2) {
110 |   stop("Insufficient number of arguments are supplied: expected <input directory> <output directory> [number of clusters].", call.=FALSE)
111 | }else if(length(args)==2) {
112 |     print(paste0("Input directory: ",args[1]))
113 |     print(paste0("Output directory: ",args[2]))
114 |     run_seurat4_fn(args[1], args[2])
115 | }else if(length(args) ==3){
116 |     print(paste0("Input directory: ",args[1]))
117 |     print(paste0("Output directory: ",args[2]))
118 |     print(paste0("Number of clusters: ",args[3]))
119 |     run_seurat4_fn(args[1], args[2],as.integer(args[3]))
120 | }else{
121 |     stop(paste0(length(args)," arguments are supplied, please double check!"), call.=FALSE)
122 | }
123 | 
124 |     
125 |     


--------------------------------------------------------------------------------
/methods/seuratv4/run_seurat4_3_noPred.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args = commandArgs(trailingOnly=TRUE)
 3 | 
 4 | require(Seurat)
 5 | require(Matrix)
 6 | source("seurat_wnn_project.R")
 7 | source("r_utils.R")
 8 | require(future)
 9 | 
10 | 
11 | run_seurat4_fn <- function(in_dir, out_dir,nclust = 7){
12 |     # Integrates the paired multiome data with the unpaired scRNA and snATAC data (no prediction step in this variant).
13 |     t1 <- Sys.time()  # start of the integration timer
14 |     plan("multisession")
15 |     options(future.rng.onMisuse = "ignore")
16 |     print(paste0("workers used:",nbrOfWorkers()))
17 |     # in_dir is passed to load_datasets(); nclust is forwarded to process_paired(); all outputs go under out_dir
18 |     # load dataset 
19 |     datasets = load_datasets(in_dir)
20 |     pdata = datasets$paired_rna
21 |     pdata[["ATAC"]] = datasets$paired_atac[["ATAC"]]  # attach the paired ATAC assay to the paired RNA object
22 |     DefaultAssay(pdata) <- "RNA"
23 |     
24 |     unpaired_rna = datasets$unpaired_rna
25 |     unpaired_atac = datasets$unpaired_atac
26 |     
27 |     dir.create(file.path(out_dir,"seurat4","figures"),recursive=TRUE)
28 |     print("------ Processing Multiome ------")
29 |     pdata <- process_paired(pdata,
30 |                             file_path = file.path(out_dir,"seurat4","figures","paired_data_umap_seurat.pdf"),
31 |                             nclust=nclust)
32 |     print("------ Projecting scRNA ------")
33 |     unpaired_rna <- project_rna(pdata,unpaired_rna, 
34 |                                 file_path = file.path(out_dir,"seurat4","figures","rna_data_umap_seurat.pdf"))
35 |     
36 |     print("------ Projecting snATAC ------")
37 |     res <- project_atac_slsi(pdata,unpaired_atac, 
38 |                                        file_path = file.path(out_dir,"seurat4","figures","atac_data_umap_seurat.pdf"),
39 |                                        return_anchor=TRUE)
40 |     unpaired_atac <- res[[1]]
41 |     anchors_u2 <- res[[2]]  # kept for parity with the prediction variant; not used further in this function
42 | 
43 |     dir.create(file.path(out_dir,"runtime/"),recursive=TRUE)
44 |     print("------ Saving integration result ------")
45 |     df_umap = do.call(rbind,list(pdata@reductions$wnn.umap@cell.embeddings,
46 |                     unpaired_rna@reductions$ref.umap@cell.embeddings,
47 |                     unpaired_atac@reductions$ref.umap@cell.embeddings))
48 | 
49 |     # ===== added ===== rename the stacked embedding columns to latent_1..latent_k
50 |     colnames(df_umap) = paste0("latent_",seq_len(ncol(df_umap)))
51 | 
52 |     pdata@meta.data$predicted.ct = pdata$ct  # expose pdata's ct labels under the column read from all three objects
53 |     col_sel = "predicted.ct"
54 |     df = data.frame(predicted_ct=c(pdata@meta.data[,col_sel],
55 |            unpaired_rna@meta.data[,col_sel],
56 |            unpaired_atac@meta.data[,col_sel]))
57 |     # first df_umap is latent embedding, second df_umap is umap embedding, in this case, they are the same. 
58 |     # During evaluation, if umap embedding is detected, no umap projection was run again
59 |     df = cbind(df_umap,df_umap,df,
60 |                dataset=c(rep("Multiome",ncol(pdata)),
61 |                  rep("scRNA",ncol(unpaired_rna)),
62 |                  rep("snATAC",ncol(unpaired_atac))))
63 | 
64 |     umap_col_range = ncol(df_umap) + seq_len(ncol(df_umap))  # positions of the second copy of df_umap
65 |     colnames(df)[umap_col_range] = paste0("umap_",seq_along(umap_col_range))
66 |     rownames(df) <- gsub("\\.","-",rownames(df))
67 |     write.csv(df,file.path(out_dir,"seurat4","seurat4_result.csv"))
68 |     t2 <- Sys.time()
69 |     ## use '[[1]]' for clean output
70 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
71 |                 file = file.path(out_dir,"runtime","seurat4_runtime.txt"), 
72 |                 sep = "\t",
73 |                 row.names = FALSE,
74 |                 col.names = FALSE)
75 |     print("------ Done ------")
76 |  
77 | }
78 | 
79 | 
80 | # Command-line entry point: <input directory> <output directory> [number of clusters]
81 | if (length(args)<2) {
82 |   stop("Insufficient number of arguments are supplied: expected <input directory> <output directory> [number of clusters].", call.=FALSE)
83 | }else if(length(args)==2) {
84 |     print(paste0("Input directory: ",args[1]))
85 |     print(paste0("Output directory: ",args[2]))
86 |     run_seurat4_fn(args[1], args[2])
87 | }else if(length(args) ==3){
88 |     print(paste0("Input directory: ",args[1]))
89 |     print(paste0("Output directory: ",args[2]))
90 |     print(paste0("Number of clusters: ",args[3]))
91 |     run_seurat4_fn(args[1], args[2],as.integer(args[3]))
92 | }else{
93 |     stop(paste0(length(args)," arguments are supplied, please double check!"), call.=FALSE)
94 | }
95 | 
96 |     
97 |     


--------------------------------------------------------------------------------
/methods/seuratv4/run_seurat4_4.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | require(patchwork)
 10 | 
 11 | 
 12 | # run_seurat4_4, perform sctransform per individual/donor, assuming 8GB of RAM; writes results under <out_dir>/seurat4int and <out_dir>/runtime
 13 | run_seurat4_fn <- function(in_dir, out_dir,nclust = 7){
 14 |     # starting time
 15 |     t1 <- Sys.time()
 16 |     # in_dir is passed to load_datasets(); nclust is forwarded to process_paired_multidonor()
 17 |     plan("multisession")
 18 |     options(future.rng.onMisuse = "ignore")
 19 |     print(paste0("workers used:",nbrOfWorkers()))
 20 |     options(future.globals.maxSize = 8000 * 1024^2)  # 8000 MiB
 21 |     # load dataset 
 22 |     datasets = load_datasets(in_dir,obs=c("barcodes","batch"))
 23 |     pdata = datasets$paired_rna
 24 |     pdata[["ATAC"]] = datasets$paired_atac[["ATAC"]]  # attach the paired ATAC assay to the paired RNA object
 25 |     DefaultAssay(pdata) <- "RNA"
 26 | 
 27 |     unpaired_rna = datasets$unpaired_rna
 28 |     unpaired_atac = datasets$unpaired_atac
 29 | 
 30 |     dir.create(file.path(out_dir,"seurat4int","figures"),recursive=TRUE)
 31 |     print("------ Processing Multiome ------")
 32 |     pdata <- process_paired_multidonor(pdata,
 33 |                             file_path = file.path(out_dir,"seurat4int","figures","paired_data_umap_seurat.pdf"),
 34 |                             nclust=nclust)
 35 |     unpaired_rna <- project_rna(pdata,unpaired_rna, 
 36 |                                 file_path = file.path(out_dir,"seurat4int","figures","rna_data_umap_seurat.pdf"),
 37 |                                 reference_assay = "rnaintegrated")
 38 | 
 39 |     print("------ Projecting snATAC ------")
 40 |     res <- project_atac_slsi(pdata,unpaired_atac, 
 41 |                              file_path = file.path(out_dir,"seurat4int","figures","atac_data_umap_seurat.pdf"),
 42 |                              return_anchor=TRUE)
 43 |     # testing - set transfer of atac to based on their tutorial
 44 | #     res <- project_atac_ref_lsi(pdata,unpaired_atac, 
 45 | #                              file_path = file.path(out_dir,"seurat4int","figures","atac_data_umap_seurat.pdf"),
 46 | #                              return_anchor=T,reference_reduction = "integrated_lsi")
 47 |     unpaired_atac <- res[[1]]
 48 |     anchors_u2 <- res[[2]]  # anchors reused by TransferData() in the prediction step
 49 | 
 50 |     dir.create(file.path(out_dir,"runtime/"),recursive=TRUE)
 51 |     print("------ Saving integration result ------")
 52 |     df_umap = do.call(rbind,list(pdata@reductions$wnn.umap@cell.embeddings,
 53 |                     unpaired_rna@reductions$ref.umap@cell.embeddings,
 54 |                     unpaired_atac@reductions$ref.umap@cell.embeddings))
 55 | 
 56 |     # ===== added ===== rename the stacked embedding columns to latent_1..latent_k
 57 |     colnames(df_umap) = paste0("latent_",seq_len(ncol(df_umap)))
 58 | 
 59 |     pdata@meta.data$predicted.ct = pdata$ct  # expose pdata's ct labels under the column read from all three objects
 60 |     col_sel = "predicted.ct"
 61 |     df = data.frame(predicted_ct=c(pdata@meta.data[,col_sel],
 62 |            unpaired_rna@meta.data[,col_sel],
 63 |            unpaired_atac@meta.data[,col_sel]))
 64 |     # first df_umap is latent embedding, second df_umap is umap embedding, in this case, they are the same. 
 65 |     # During evaluation, if umap embedding is detected, no umap projection was run again
 66 |     df = cbind(df_umap,df_umap,df,
 67 |                dataset=c(rep("Multiome",ncol(pdata)),
 68 |                  rep("scRNA",ncol(unpaired_rna)),
 69 |                  rep("snATAC",ncol(unpaired_atac))))
 70 | 
 71 |     umap_col_range = ncol(df_umap) + seq_len(ncol(df_umap))  # positions of the second copy of df_umap
 72 |     colnames(df)[umap_col_range] = paste0("umap_",seq_along(umap_col_range))
 73 |     rownames(df) <- gsub("\\.","-",rownames(df))
 74 |     write.csv(df,file.path(out_dir,"seurat4int","seurat4int_result.csv"))
 75 |     t2 <- Sys.time()
 76 |     ## use '[[1]]' for clean output
 77 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
 78 |                 file = file.path(out_dir,"runtime","seurat4int_runtime.txt"), 
 79 |                 sep = "\t",
 80 |                 row.names = FALSE,
 81 |                 col.names = FALSE)
 82 |     print("------ Done ------")
 83 |     
 84 |     print("------ Prediction ------")
 85 |     # starting time (t1 is reset so the prediction step is timed separately)
 86 |     t1 <- Sys.time()
 87 | 
 88 |     # predict gene expression values
 89 |     rna <- TransferData(
 90 |       anchorset = anchors_u2,
 91 |       refdata = GetAssayData(pdata, assay = "SCT", slot = "data"),
 92 |       weight.reduction = "lsiproject"
 93 |     )
 94 | 
 95 |     # add predicted values as a new assay
 96 |     unpaired_atac[["SCT"]] <- rna
 97 |     
 98 |     #data_combined <- merge(pdata,unpaired_atac)
 99 |     
100 |     write_mtx_folder(file.path(out_dir,"seurat4int","predicted","ATAC"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
101 |     write_mtx_folder(file.path(out_dir,"seurat4int","predicted","RNA"),unpaired_atac,assay_key="SCT",slot_key="data","gene")
102 |     
103 |     t2 <- Sys.time()
104 |     ## use '[[1]]' for clean output
105 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
106 |                 file = file.path(out_dir,"runtime","seurat4int_prediction_time.txt"), 
107 |                 sep = "\t",
108 |                 row.names = FALSE,
109 |                 col.names = FALSE)
110 |     print("------ Prediction Done ------")
111 |     
112 | }
113 | 
114 | 
115 | # Command-line entry point: <input directory> <output directory> [number of clusters]
116 | if (length(args)<2) {
117 |   stop("Insufficient number of arguments are supplied: expected <input directory> <output directory> [number of clusters].", call.=FALSE)
118 | }else if(length(args)==2) {
119 |     print(paste0("Input directory: ",args[1]))
120 |     print(paste0("Output directory: ",args[2]))
121 |     run_seurat4_fn(args[1], args[2])
122 | }else if(length(args) ==3){
123 |     print(paste0("Input directory: ",args[1]))
124 |     print(paste0("Output directory: ",args[2]))
125 |     print(paste0("Number of clusters: ",args[3]))
126 |     run_seurat4_fn(args[1], args[2],as.integer(args[3]))
127 | }else{
128 |     stop(paste0(length(args)," arguments are supplied, please double check!"), call.=FALSE)
129 | }
130 | 
131 |     
132 |     


--------------------------------------------------------------------------------
/methods/seuratv4/run_seurat4_4_sequential.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(Matrix)
  6 | source("seurat_wnn_project.R")
  7 | source("r_utils.R")
  8 | require(future)
  9 | require(patchwork)
 10 | 
 11 | 
 12 | # run_seurat4_4 (sequential variant), perform sctransform per individual/donor; writes results under <out_dir>/seurat4int and <out_dir>/runtime
 13 | run_seurat4_fn <- function(in_dir, out_dir,nclust = 7){
 14 |     # starting time
 15 |     t1 <- Sys.time()
 16 |     # the future plan/options below are left commented out in this sequential variant
 17 |     #plan("multisession")
 18 |     #options(future.rng.onMisuse = "ignore")
 19 |     #print(paste0("workers used:",nbrOfWorkers()))
 20 |     #options(future.globals.maxSize = 8000 * 1024^2)
 21 |     # load dataset 
 22 |     datasets = load_datasets(in_dir,obs=c("barcodes","batch"))
 23 |     pdata = datasets$paired_rna
 24 |     pdata[["ATAC"]] = datasets$paired_atac[["ATAC"]]  # attach the paired ATAC assay to the paired RNA object
 25 |     DefaultAssay(pdata) <- "RNA"
 26 | 
 27 |     unpaired_rna = datasets$unpaired_rna
 28 |     unpaired_atac = datasets$unpaired_atac
 29 | 
 30 |     dir.create(file.path(out_dir,"seurat4int","figures"),recursive=TRUE)
 31 |     print("------ Processing Multiome ------")
 32 |     pdata <- process_paired_multidonor(pdata,
 33 |                             file_path = file.path(out_dir,"seurat4int","figures","paired_data_umap_seurat.pdf"),
 34 |                             nclust=nclust)
 35 |     unpaired_rna <- project_rna(pdata,unpaired_rna, 
 36 |                                 file_path = file.path(out_dir,"seurat4int","figures","rna_data_umap_seurat.pdf"),
 37 |                                 reference_assay = "rnaintegrated")
 38 | 
 39 |     print("------ Projecting snATAC ------")
 40 |     res <- project_atac_slsi(pdata,unpaired_atac, 
 41 |                              file_path = file.path(out_dir,"seurat4int","figures","atac_data_umap_seurat.pdf"),
 42 |                              return_anchor=TRUE)
 43 |     # testing - set transfer of atac to based on their tutorial
 44 | #     res <- project_atac_ref_lsi(pdata,unpaired_atac, 
 45 | #                              file_path = file.path(out_dir,"seurat4int","figures","atac_data_umap_seurat.pdf"),
 46 | #                              return_anchor=T,reference_reduction = "integrated_lsi")
 47 |     unpaired_atac <- res[[1]]
 48 |     anchors_u2 <- res[[2]]  # anchors reused by TransferData() in the prediction step
 49 | 
 50 |     dir.create(file.path(out_dir,"runtime/"),recursive=TRUE)
 51 |     print("------ Saving integration result ------")
 52 |     df_umap = do.call(rbind,list(pdata@reductions$wnn.umap@cell.embeddings,
 53 |                     unpaired_rna@reductions$ref.umap@cell.embeddings,
 54 |                     unpaired_atac@reductions$ref.umap@cell.embeddings))
 55 | 
 56 |     # ===== added ===== rename the stacked embedding columns to latent_1..latent_k
 57 |     colnames(df_umap) = paste0("latent_",seq_len(ncol(df_umap)))
 58 | 
 59 |     pdata@meta.data$predicted.ct = pdata$ct  # expose pdata's ct labels under the column read from all three objects
 60 |     col_sel = "predicted.ct"
 61 |     df = data.frame(predicted_ct=c(pdata@meta.data[,col_sel],
 62 |            unpaired_rna@meta.data[,col_sel],
 63 |            unpaired_atac@meta.data[,col_sel]))
 64 |     # first df_umap is latent embedding, second df_umap is umap embedding, in this case, they are the same. 
 65 |     # During evaluation, if umap embedding is detected, no umap projection was run again
 66 |     df = cbind(df_umap,df_umap,df,
 67 |                dataset=c(rep("Multiome",ncol(pdata)),
 68 |                  rep("scRNA",ncol(unpaired_rna)),
 69 |                  rep("snATAC",ncol(unpaired_atac))))
 70 | 
 71 |     umap_col_range = ncol(df_umap) + seq_len(ncol(df_umap))  # positions of the second copy of df_umap
 72 |     colnames(df)[umap_col_range] = paste0("umap_",seq_along(umap_col_range))
 73 |     rownames(df) <- gsub("\\.","-",rownames(df))
 74 |     write.csv(df,file.path(out_dir,"seurat4int","seurat4int_result.csv"))
 75 |     t2 <- Sys.time()
 76 |     ## use '[[1]]' for clean output
 77 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
 78 |                 file = file.path(out_dir,"runtime","seurat4int_runtime.txt"), 
 79 |                 sep = "\t",
 80 |                 row.names = FALSE,
 81 |                 col.names = FALSE)
 82 |     print("------ Done ------")
 83 |     
 84 |     print("------ Prediction ------")
 85 |     # starting time (t1 is reset so the prediction step is timed separately)
 86 |     t1 <- Sys.time()
 87 | 
 88 |     # predict gene expression values
 89 |     rna <- TransferData(
 90 |       anchorset = anchors_u2,
 91 |       refdata = GetAssayData(pdata, assay = "SCT", slot = "data"),
 92 |       weight.reduction = "lsiproject"
 93 |     )
 94 | 
 95 |     # add predicted values as a new assay
 96 |     unpaired_atac[["SCT"]] <- rna
 97 |     
 98 |     #data_combined <- merge(pdata,unpaired_atac)
 99 |     
100 |     write_mtx_folder(file.path(out_dir,"seurat4int","predicted","ATAC"),unpaired_atac,assay_key="ATAC",slot_key="counts","peak")
101 |     write_mtx_folder(file.path(out_dir,"seurat4int","predicted","RNA"),unpaired_atac,assay_key="SCT",slot_key="data","gene")
102 |     
103 |     t2 <- Sys.time()
104 |     ## use '[[1]]' for clean output
105 |     write.table(difftime(t2, t1, units = "secs")[[1]], 
106 |                 file = file.path(out_dir,"runtime","seurat4int_prediction_time.txt"), 
107 |                 sep = "\t",
108 |                 row.names = FALSE,
109 |                 col.names = FALSE)
110 |     print("------ Prediction Done ------")
111 |     
112 | }
113 | 
114 | 
115 | # Command-line entry point: <input directory> <output directory> [number of clusters]
116 | if (length(args)<2) {
117 |   stop("Insufficient number of arguments are supplied: expected <input directory> <output directory> [number of clusters].", call.=FALSE)
118 | }else if(length(args)==2) {
119 |     print(paste0("Input directory: ",args[1]))
120 |     print(paste0("Output directory: ",args[2]))
121 |     run_seurat4_fn(args[1], args[2])
122 | }else if(length(args) ==3){
123 |     print(paste0("Input directory: ",args[1]))
124 |     print(paste0("Output directory: ",args[2]))
125 |     print(paste0("Number of clusters: ",args[3]))
126 |     run_seurat4_fn(args[1], args[2],as.integer(args[3]))
127 | }else{
128 |     stop(paste0(length(args)," arguments are supplied, please double check!"), call.=FALSE)
129 | }
130 | 
131 |     
132 |     


--------------------------------------------------------------------------------
/methods/seuratv4/seurat_env.txt:
--------------------------------------------------------------------------------
 1 | # command line
 2 | 
 3 | conda create -n azimuth r-essentials r-base
 4 | 
 5 | conda activate azimuth
 6 | 
 7 | # devtools:
 8 | conda install -y -c conda-forge r-devtools
 9 | conda install -y igraph hdf5
10 | 
11 | conda install -c conda-forge r-rgeos
12 | 
13 | R
14 | install.packages('IRkernel')
15 | IRkernel::installspec(name = 'seurat', displayname = 'rseurat')
16 | 
17 | install.packages("BiocManager")
18 | BiocManager::install(c("GenomeInfoDb","IRanges", "Rsamtools", "S4Vectors", "BiocGenerics"))
19 | remotes::install_version("RSQLite", version = "2.2.5")
20 | BiocManager::install(c("EnsDb.Hsapiens.v86","biovizBase"))
21 | BiocManager::install(c("BSgenome.Hsapiens.UCSC.hg38"))
22 | install.packages("Seurat") 
23 | devtools::install_github('satijalab/seurat-data')
24 | remotes::install_github("mojaveazure/seurat-disk")
25 | 
26 | install.packages("Signac") 
27 | install.packages('qlcMatrix')
28 | 
29 | 


--------------------------------------------------------------------------------
/scenario_parameters.txt:
--------------------------------------------------------------------------------
 1 | sceanrio	challenge	submission 	eval	ct_ref	nclust	gp_eval	gp_truth	dir path	cond_key 	var 	repeats
 2 | 1	PBMC vary cells 	submit_job_per_condition_n_eval2.sh	run_metric_eval_single.py	dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv	7	eval_missing_modality_prediction_single.R	dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv	dataset/multiome_pbmc_10k/pbmc_vary_cell_test/	nmulti	"[1000,3000,8000]"	5
 3 | 1	BMMC vary cells	submit_job_per_condition_n_eval2.sh	run_metric_eval_single.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	eval_missing_modality_prediction_single.R	dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv	dataset/bmmc/bmmc_vary_cell_test/	nmulti	"[1000,2000,4000]"	5
 4 | 1	SHARE-seq vary cells	submit_job_per_condition_n_eval2.sh	run_metric_eval_single.py	dataset/mouse_skin/mouse_skin_shareseq_bc_ct3.csv	22	eval_missing_modality_prediction_single_mm10.R	dataset/mouse_skin/mouse_skin_shareseq_pmat_all_ct_sig_links_50kb_unique.csv	dataset/mouse_skin/multiome_ncells_pmat/	nmulti	"[5000,10000,15000]"	5
 5 | 2	PBMC 2000 vary depth 	submit_job_per_condition_n_eval2.sh	run_metric_eval_fair.py	dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv	7	eval_missing_modality_prediction_single.R	dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv	dataset/multiome_pbmc_10k/nmulti2000_7ct_vdepth_test/	depthmulti	"[25,50,75,100]"	5
 6 | 2	BMMC 2000 vary depth	submit_job_per_condition_n_eval2.sh	run_metric_eval_fair.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	eval_missing_modality_prediction_single.R	dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv	dataset/bmmc/nmulti2000_21ct_vdepth_test/	depthmulti	"[25,50,75,100]"	5
 7 | 2	BMMC 4000 vary depth	submit_job_per_condition_n_eval2.sh	run_metric_eval_fair.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	eval_missing_modality_prediction_single.R	dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv	dataset/bmmc/nmulti4000_21ct_vdepth_test/	depthmulti	"[25,50,75,100]"	5
 8 | 2	BMMC increasing cells intervals	submit_job_per_condition_n_eval2.sh	run_metric_eval_fair.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	eval_missing_modality_prediction_single.R	dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv	dataset/bmmc/bmmc_vcells_intervals/	depthmulti	"[10,20,30,40,50,60,70,80,90,100]"	5
 9 | 2	BMMC increasing depth intervals	submit_job_per_condition_n_eval2.sh	run_metric_eval_fair.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	eval_missing_modality_prediction_single.R	dataset/bmmc/bmmc_s1d2_pmat_all_ct_sig_links_50kb_unique.csv	dataset/bmmc/bmmc_vdepth_intervals/	nmulti	"[1000, 1400, 1800, 2200, 2600, 3000, 3400, 3800, 4200, 4600, 5000]"	5
10 | 3	BMMC technical batch	submit_job_per_condition_n_eval2.sh	run_metric_eval_batch.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	false	false	dataset/bmmc/bmmc_technical_batch_test/	nmulti	"[1000,3000,5000]"	5
11 | 3	BMMC biological batch	submit_job_per_condition_n_eval2.sh	run_metric_eval_batch.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	false	false	dataset/bmmc/bmmc_biological_batch_test/	nmulti	"[1000,3000,5000]"	5
12 | 3	BMMC complex test 1	submit_job_per_condition_n_eval2.sh	run_metric_eval_batch2.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	false	false	dataset/bmmc/bmmc_complex1_test/	nmulti	"[1000,3000,5000]"	5
13 | 3	BMMC complex test 2	submit_job_per_condition_n_eval2.sh	run_metric_eval_batch2.py	dataset/bmmc/bmmc_all_bc_ct3.csv	21	false	false	dataset/bmmc/bmmc_complex2_test/	nmulti	[10000]	5
14 | 4	PBMC single modality missing ct	submit_job_per_condition_n_missing_ct_eval.sh	run_metric_eval_fair_missing_ct_perMod.py	dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv	7	false	false	dataset/pbmc/single_modality_fixed_missing_ct/	"[""noMiss_nmulti"",""rnaMissNK_nmulti"",""atacMissNK_nmulti""]"	"[1000,3000,6000]"	5
15 | 4	SHARE-seq single modality missing ct	submit_job_per_condition_n_missing_ct_eval.sh	run_metric_eval_fair_missing_ct_perMod.py	dataset/mouse_skin/mouse_skin_shareseq_bc_ct3_10k.csv	12	false	false	dataset/mouse_skin/single_modality_fixed_missing_ct/	"[""noMiss_nmulti"",""rnaMissHS_nmulti"",""atacMissHS_nmulti"",""rnaMissEndo_nmulti"",""atacMissEndo_nmulti"",""rnaMissTwo_nmulti"",""atacMissTwo_nmulti"",""eachMissOne_nmulti"",""eachMissOneAlt_nmulti""]"	"[1000,3000,6000]"	5
16 | 5	PBMC multiome missing ct	submit_job_per_condition_n_missing_ct_eval.sh	run_metric_eval_fair_missing_ct_perMod.py	dataset/multiome_pbmc_10k/pbmc_10x_bc_ct3.csv	7	false	false	dataset/pbmc/multiome_fixed_missing_ct/	"[""noMiss_nmulti"",""multiMissNK_nmulti"",""rnaOnlyNK_nmulti"",""atacOnlyNK_nmulti""]"	[3000]	5
17 | 5	SHARE-seq multiome missing ct	submit_job_per_condition_n_missing_ct_eval.sh	run_metric_eval_fair_missing_ct_perMod.py	dataset/mouse_skin/mouse_skin_shareseq_bc_ct3_10k.csv	12	false	false	dataset/mouse_skin/multiome_fixed_missing_ct/	"[""noMiss_nmulti"",""multiMissHS_nmulti"",""rnaOnlyHS_nmulti"",""atacOnlyHS_nmulti"",""multiMissEndo_nmulti"",""rnaOnlyEndo_nmulti"",""atacOnlyEndo_nmulti""]"	[3000]	5
18 | 6	HPAP integration	submit_job_per_condition_n_eval2.sh	run_metric_eval_batch2_hpap.py	dataset/hpap/hpap_all_bc_ct3.csv	10	FALSE	FALSE	dataset/hpap/real_data/	nmulti	[70000]	1


--------------------------------------------------------------------------------
/scenario_parameters.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/myylee/benchmark_sc_multiomic_integration/73fc426fecdf4e11d17263cbace145f84941eab5/scenario_parameters.xlsx


--------------------------------------------------------------------------------
/scripts/eval_missing_modality_prediction_single.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # Usage: Rscript eval_missing_modality_prediction_single.R <in_dir> <out_dir> <method_key> <links_path>
  3 | args = commandArgs(trailingOnly=TRUE)
  4 | 
  5 | require(Seurat)
  6 | require(BSgenome.Hsapiens.UCSC.hg38)
  7 | require(stringr)
  8 | require(Signac)
  9 | require(Matrix)
 10 | require(future)
 11 | require(dplyr)
 12 | source("r_utils.R")
 13 | # use background R sessions for future-based steps and skip the RNG-misuse check in them
 14 | plan(multisession)
 15 | options(future.rng.onMisuse = "ignore")
 16 | # allow up to 3000 GiB (3000 * 1024^3 bytes) of globals to be exported to the workers
 17 | options(future.globals.maxSize = 3000*1024^3)
 18 | # loading saved result
 19 | # predict_link.R
 20 | 
 21 | # Score gene-peak links recovered from imputed RNA/ATAC profiles against a reference list.
 22 | #
 23 | # Reads out_dir/method_key/predict_folder/{ATAC,RNA} with read_mtx_folder() (sourced from
 24 | # r_utils.R), keeps the cells listed in in_dir/unpaired_ATAC/barcodes.tsv, runs
 25 | # LinkPeaks (50 kb distance, all RNA genes), keeps one link per peak and compares the
 26 | # resulting gene_peak pairs with those in links_path.
 27 | #
 28 | # Arguments:
 29 | #   in_dir          dataset folder containing unpaired_ATAC/barcodes.tsv
 30 | #   out_dir         result folder; per-method files live in out_dir/method_key
 31 | #   method_key      method sub-folder name, also the prefix of the output file
 32 | #   links_path      CSV with `gene` and `peak` columns holding the reference links
 33 | #   expression_slot "counts" (input RNA is assumed to be normalized already) or "data"
 34 | #                   (NormalizeData() is run first); any other value stops with an error
 35 | #   predict_folder  sub-folder of out_dir/method_key that holds the imputed matrices
 36 | #
 37 | # Output: percent_recovered_50kb and f1, written tab-separated to
 38 | #   out_dir/method_key/<method_key>_prediction_eval.txt
 39 | # Returns NULL without writing anything when either imputed folder is missing.
 40 | predict_links <- function(in_dir,out_dir,method_key,links_path,expression_slot="counts",predict_folder="predicted"){
 41 |     require(BSgenome.Hsapiens.UCSC.hg38)
 42 |     require(future)
 43 |     plan(multisession)
 44 |     options(future.rng.onMisuse = "ignore")
 45 |     nbrOfWorkers()
 46 | 
 47 |     print("reading ATAC")
 48 |     print(paste0("ATAC path: ",file.path(out_dir,method_key,predict_folder,"ATAC")))
 49 |     # both imputed modalities are required; otherwise there is nothing to evaluate
 50 |     if(! file.exists(file.path(out_dir,method_key,predict_folder,"ATAC")) || ! file.exists(file.path(out_dir,method_key,predict_folder,"/RNA/"))){
 51 |         print("no imputation found, skip gene-peak association evaluation")
 52 |         return()
 53 |     }
 54 |     atac <- read_mtx_folder(file.path(out_dir,method_key,predict_folder,"/ATAC/"),
 55 |                         "ATAC",c("peak"),c("barcodes"),atac=TRUE,frag_path="")
 56 |     print("reading RNA")
 57 |     paired <- read_mtx_folder(file.path(out_dir,method_key,predict_folder,"/RNA/"),
 58 |                             "RNA",c("gene"),c("barcodes"),atac=FALSE)
 59 |     print("reading RNA done")
 60 |     paired[["ATAC"]] <- atac@assays$ATAC
 61 | 
 62 |     ## subset for cells from single-modality only ("snATAC")
 63 |     unpaired_atac_barcode <- read.csv(file.path(in_dir,"unpaired_ATAC","barcodes.tsv"),header = F)[,1]
 64 |     barcodes_predicted <- intersect(unpaired_atac_barcode,colnames(paired))
 65 |     paired <-paired[,barcodes_predicted]
 66 |     print(paste0("Number of cells with RNA predicted: ",ncol(paired)))
 67 | 
 68 |     links_truth <- read.csv(links_path)
 69 |     print("loaded links_truth")
 70 |     DefaultAssay(paired)<-"RNA"
 71 |     if(expression_slot=="data"){
 72 |         paired <- NormalizeData(paired)
 73 |     }else if (expression_slot != "counts"){
 74 |         stop("wrong expression.slot inputted")
 75 |     }
 76 | 
 77 |     # restrict peaks to the standard chromosomes before computing region statistics
 78 |     DefaultAssay(paired)<-"ATAC"
 79 |     main.chroms <- standardChromosomes(BSgenome.Hsapiens.UCSC.hg38)
 80 |     keep.peaks <- which(as.character(seqnames(granges(paired))) %in% main.chroms)
 81 |     paired[["ATAC"]] <- subset(paired[["ATAC"]], features = rownames(paired[["ATAC"]])[keep.peaks])
 82 | 
 83 |     paired <- RegionStats(paired, genome = BSgenome.Hsapiens.UCSC.hg38)
 84 | 
 85 |     # 50kb list, with all genes 
 86 |     system.time({
 87 |         paired_links <- LinkPeaks(
 88 |             object = paired,
 89 |             peak.assay = "ATAC",
 90 |             expression.assay = "RNA",
 91 |             peak.slot = "counts",
 92 |             expression.slot = expression_slot,
 93 |             genes.use = rownames(paired@assays$RNA),
 94 |             distance = 50000,
 95 |         )
 96 |     })
 97 |     gene_link_50kb<-as.data.frame(paired_links@assays$ATAC@links)
 98 | 
 99 |     # keep a single link per peak (first row of each peak after sorting)
100 |     # NOTE(review): arrange(peak, -pvalue) sorts p-values in DESCENDING order, so the link kept
101 |     # per peak is the one with the largest p-value - confirm this matches how links_path was built.
102 |     gene_link_50kb_unique<- gene_link_50kb %>%
103 |         arrange(peak, -pvalue) %>%
104 |         dplyr::filter(duplicated(peak) == FALSE) 
105 | 
106 |     truth = paste0(links_truth$gene,"_",links_truth$peak)
107 |     pred = paste0(gene_link_50kb_unique$gene,"_",gene_link_50kb_unique$peak)
108 | 
109 |     tp = length(intersect(truth,pred))
110 |     fp = length(pred) - tp
111 |     fn = length(truth) - tp
112 |     precision = tp/(tp+fp) 
113 |     recall = tp/(tp+fn)
114 |     # without any true positive the harmonic mean is 0/0 (NaN); report 0 in that case
115 |     f1 = if (tp > 0) 2 * (precision * recall) / (precision + recall) else 0
116 |     percent_recovered_50kb = tp/dim(links_truth)[1]
117 |     print(paste0("percent_recovered_50kb: ",percent_recovered_50kb))
118 |     print(paste0("f1: ",f1))
119 |     write.table(c("percent_recovered_50kb" = percent_recovered_50kb,"f1"=f1),
120 |             file = file.path(out_dir,method_key,paste0(method_key,"_prediction_eval.txt")), 
121 |             sep = "\t",
122 |             col.names = FALSE)
123 | 
124 | }
125 | 
126 | # Command-line driver: the four positional arguments map to in_dir, out_dir, method_key, links_path.
127 | if (length(args) < 4) {
128 |   stop("Insufficient number of arguments supplied; expected: in_dir out_dir method_key links_path", call.=FALSE)
129 | }
130 | 
131 | print(paste0("argument 1: ",args[1]))
132 | print(paste0("argument 2: ",args[2]))
133 | print(paste0("argument 3: ",args[3]))
134 | print(paste0("argument 4: ",args[4]))
135 | 
136 | #links_path="dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"
137 | 
138 | predict_links(in_dir = args[1],
139 |               out_dir = args[2],
140 |               method_key = args[3],
141 |               links_path = args[4])
142 | 
143 | print("----eval_missing_modality_prediction.R Done----")


--------------------------------------------------------------------------------
/scripts/eval_missing_modality_prediction_single_mm10.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | require(Seurat)
  5 | require(BSgenome.Mmusculus.UCSC.mm10)
  6 | require(stringr)
  7 | require(Signac)
  8 | require(Matrix)
  9 | require(future)
 10 | require(dplyr)
 11 | source("r_utils.R")
 12 | # use background R sessions for future-based steps and skip the RNG-misuse check in them
 13 | plan(multisession)
 14 | options(future.rng.onMisuse = "ignore")
 15 | # allow up to 3000 GiB (3000 * 1024^3 bytes) of globals to be exported to the workers
 16 | options(future.globals.maxSize = 3000*1024^3)
 17 | # loading saved result
 18 | # predict_link.R
 19 | 
 20 | # Score gene-peak links recovered from imputed RNA/ATAC profiles against a reference list
 21 | # (mouse / mm10 variant).
 22 | #
 23 | # Reads out_dir/method_key/predict_folder/{ATAC,RNA} with read_mtx_folder() (sourced from
 24 | # r_utils.R), keeps the cells listed in in_dir/unpaired_ATAC/barcodes.tsv, runs
 25 | # LinkPeaks (50 kb distance, all RNA genes), keeps one link per peak and compares the
 26 | # resulting gene_peak pairs with those in links_path.
 27 | #
 28 | # Arguments:
 29 | #   in_dir          dataset folder containing unpaired_ATAC/barcodes.tsv
 30 | #   out_dir         result folder; per-method files live in out_dir/method_key
 31 | #   method_key      method sub-folder name, also the prefix of the output file
 32 | #   links_path      CSV with `gene` and `peak` columns holding the reference links
 33 | #   expression_slot "counts" (input RNA is assumed to be normalized already) or "data"
 34 | #                   (NormalizeData() is run first); any other value stops with an error
 35 | #   predict_folder  sub-folder of out_dir/method_key that holds the imputed matrices
 36 | #
 37 | # Output: percent_recovered_50kb and f1, written tab-separated to
 38 | #   out_dir/method_key/<method_key>_prediction_eval.txt
 39 | # Returns NULL without writing anything when either imputed folder is missing.
 40 | predict_links <- function(in_dir,out_dir,method_key,links_path,expression_slot="counts",predict_folder="predicted"){
 41 |     require(BSgenome.Mmusculus.UCSC.mm10)
 42 |     require(EnsDb.Mmusculus.v79)
 43 |     require(future)
 44 |     plan(multisession)
 45 |     options(future.rng.onMisuse = "ignore")
 46 |     nbrOfWorkers()
 47 | 
 48 |     print("reading ATAC")
 49 |     print(paste0("ATAC path: ",file.path(out_dir,method_key,predict_folder,"ATAC")))
 50 |     # both imputed modalities are required; otherwise there is nothing to evaluate
 51 |     if(! file.exists(file.path(out_dir,method_key,predict_folder,"ATAC")) || ! file.exists(file.path(out_dir,method_key,predict_folder,"/RNA/"))){
 52 |         print("no imputation found, skip gene-peak association evaluation")
 53 |         return()
 54 |     }
 55 |     atac <- read_mtx_folder(file.path(out_dir,method_key,predict_folder,"/ATAC/"),
 56 |                         "ATAC",c("peak"),c("barcodes"),atac=TRUE,frag_path="")
 57 |     # attach the EnsDb gene annotation, renaming sequence levels to "chr"-prefixed names (MT -> chrM)
 58 |     annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
 59 |     ucsc.levels <- stringr::str_replace(string=paste("chr",seqlevels(annotation),sep=""), pattern="chrMT", replacement="chrM")
 60 |     seqlevels(annotation) <- ucsc.levels
 61 |     Annotation(atac) <- annotation
 62 | 
 63 |     print("reading RNA")
 64 |     paired <- read_mtx_folder(file.path(out_dir,method_key,predict_folder,"/RNA/"),
 65 |                             "RNA",c("gene"),c("barcodes"),atac=FALSE)
 66 |     print("reading RNA done")
 67 |     paired[["ATAC"]] <- atac@assays$ATAC
 68 | 
 69 |     ## subset for cells from single-modality only ("snATAC")
 70 |     unpaired_atac_barcode <- read.csv(file.path(in_dir,"unpaired_ATAC","barcodes.tsv"),header = F)[,1]
 71 |     barcodes_predicted <- intersect(unpaired_atac_barcode,colnames(paired))
 72 |     paired <-paired[,barcodes_predicted]
 73 |     print(paste0("Number of cells with RNA predicted: ",ncol(paired)))
 74 | 
 75 |     links_truth <- read.csv(links_path)
 76 |     print("loaded links_truth")
 77 |     DefaultAssay(paired)<-"RNA"
 78 |     if(expression_slot=="data"){
 79 |         paired <- NormalizeData(paired)
 80 |     }else if (expression_slot != "counts"){
 81 |         stop("wrong expression.slot inputted")
 82 |     }
 83 | 
 84 |     # restrict peaks to the standard chromosomes before computing region statistics
 85 |     DefaultAssay(paired)<-"ATAC"
 86 |     main.chroms <- standardChromosomes(BSgenome.Mmusculus.UCSC.mm10)
 87 |     keep.peaks <- which(as.character(seqnames(granges(paired))) %in% main.chroms)
 88 |     paired[["ATAC"]] <- subset(paired[["ATAC"]], features = rownames(paired[["ATAC"]])[keep.peaks])
 89 | 
 90 |     paired <- RegionStats(paired, genome = BSgenome.Mmusculus.UCSC.mm10)
 91 | 
 92 |     # 50kb list, with all genes 
 93 |     system.time({
 94 |         paired_links <- LinkPeaks(
 95 |             object = paired,
 96 |             peak.assay = "ATAC",
 97 |             expression.assay = "RNA",
 98 |             peak.slot = "counts",
 99 |             expression.slot = expression_slot,
100 |             genes.use = rownames(paired@assays$RNA),
101 |             distance = 50000,
102 |         )
103 |     })
104 |     gene_link_50kb<-as.data.frame(paired_links@assays$ATAC@links)
105 | 
106 |     # keep a single link per peak (first row of each peak after sorting)
107 |     # NOTE(review): arrange(peak, -pvalue) sorts p-values in DESCENDING order, so the link kept
108 |     # per peak is the one with the largest p-value - confirm this matches how links_path was built.
109 |     gene_link_50kb_unique<- gene_link_50kb %>%
110 |         arrange(peak, -pvalue) %>%
111 |         dplyr::filter(duplicated(peak) == FALSE) 
112 | 
113 |     truth = paste0(links_truth$gene,"_",links_truth$peak)
114 |     pred = paste0(gene_link_50kb_unique$gene,"_",gene_link_50kb_unique$peak)
115 | 
116 |     tp = length(intersect(truth,pred))
117 |     fp = length(pred) - tp
118 |     fn = length(truth) - tp
119 |     precision = tp/(tp+fp) 
120 |     recall = tp/(tp+fn)
121 |     # without any true positive the harmonic mean is 0/0 (NaN); report 0 in that case
122 |     f1 = if (tp > 0) 2 * (precision * recall) / (precision + recall) else 0
123 |     percent_recovered_50kb = tp/dim(links_truth)[1]
124 |     print(paste0("percent_recovered_50kb: ",percent_recovered_50kb))
125 |     print(paste0("f1: ",f1))
126 |     write.table(c("percent_recovered_50kb" = percent_recovered_50kb,"f1"=f1),
127 |             file = file.path(out_dir,method_key,paste0(method_key,"_prediction_eval.txt")), 
128 |             sep = "\t",
129 |             col.names = FALSE)
130 | 
131 | }
132 | 
133 | # Command-line driver: the four positional arguments map to in_dir, out_dir, method_key, links_path.
134 | if (length(args) < 4) {
135 |   stop("Insufficient number of arguments supplied; expected: in_dir out_dir method_key links_path", call.=FALSE)
136 | }
137 | 
138 | print(paste0("argument 1: ",args[1]))
139 | print(paste0("argument 2: ",args[2]))
140 | print(paste0("argument 3: ",args[3]))
141 | print(paste0("argument 4: ",args[4]))
142 | 
143 | #links_path="dataset/multiome_pbmc_10k/pbmc_10x_pmat_sig_links_50kb_unique.csv"
144 | 
145 | predict_links(in_dir = args[1],
146 |               out_dir = args[2],
147 |               method_key = args[3],
148 |               links_path = args[4])
149 | 
150 | print("----eval_missing_modality_prediction.R Done----")


--------------------------------------------------------------------------------
/scripts/submit_job_per_condition_n_eval2.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #BSUB -J dweisepytest #LSF Job Name
  4 | #BSUB -q mingyao_normal
  5 | #BSUB -o pytestdweise.%J.txt #Name of the job output file
  6 | ### -- Default: use 8 cores --
  7 | #BSUB -n 8
  8 | #BSUB -R "span[hosts=1]"
  9 | ### -- specify that we need 32GB of memory per core/slot --
 10 | #BSUB -R "rusage[mem=32GB]"
 11 | ### -- specify that we want the job to get killed if it exceeds 32 GB per core/slot --
 12 | #BSUB -M 32GB
 13 | ### -- send notification at completion --
 14 | #BSUB -N
 15 | 
 16 | 
 17 | 
 18 | ############################################################
 19 | # Help                                                     #
 20 | ############################################################
 21 | Help()
 22 | {
 23 |    # Print the option summary to stdout. NOTE(review): nothing in this script calls Help (getopts has no -h).
 24 |    echo "general script to run a python/R script (with two arguments) under a certain conda env"
 25 |    echo
 26 |    echo "Syntax: scriptTemplate [-i|w|c|s|p|r|e|m|f|t|l|a|b]"
 27 |    echo "options:"
 28 |    echo "i     input address."
 29 |    echo "w     output address"
 30 |    echo "c     conda environment name"
 31 |    echo "s     script path"
 32 |    echo "p     script is written in Python"
 33 |    echo "r     script is written in R"
 34 |    echo "e     eval script path"
 35 |    echo "m     method key"
 36 |    echo "f     path to result matrix (specific to each method)"
 37 |    echo "t     path to barcode-to-cell type table"
 38 |    echo "l     number of clusters"
 39 |    echo "a     path to R script that performs gene-peak association using the predicted gene/peak expression"
 40 |    echo "b     path to gene-peak association pair list, or false if no association to be evaluated"
 41 |    echo
 42 | }
 43 | 
 44 | ############################################################
 45 | ############################################################
 46 | # Main program                                             #
 47 | ############################################################
 48 | ############################################################
 49 | 
 50 | py="false"
 51 | r="false"
 52 | 
 53 | #  If a character is followed by a colon (e.g. f:), that option is expected to have an argument. thus here, p and r are not expected to have an argument.
 54 | while getopts i:w:c:s:pre:f:m:t:l:a:b: flag
 55 | do
 56 |     case "${flag}" in
 57 |         i) in_dir=${OPTARG};;
 58 |         w) out_dir=${OPTARG};;
 59 |         c) conda_env=${OPTARG};;
 60 |         s) script_path=${OPTARG};;
 61 |         p) py="true";;
 62 |         r) r="true";;
 63 |         e) eval_path=${OPTARG};;
 64 |         f) file_path=${OPTARG};;
 65 |         m) method_key=${OPTARG};;
 66 |         t) ct_ref=${OPTARG};;
 67 |         l) nclust=${OPTARG};;
 68 |         a) eval_path_gp=${OPTARG};;
 69 |         b) gp_truth=${OPTARG};;
 70 |     esac
 71 | done
 72 | 
 73 | echo "Working Directory: $PWD";
 74 | echo "Input Directory: $in_dir";
 75 | echo "Output Directory: $out_dir";
 76 | echo "Conda Environment: $conda_env";
 77 | echo "Script: $script_path";
 78 | echo "Running in Python: $py";
 79 | echo "Running in R: $r";
 80 | echo "Evaluation script: $eval_path";
 81 | echo "Result matrix path: $file_path";
 82 | echo "Method key: $method_key";
 83 | echo "Cell type reference path: $ct_ref";
 84 | echo "nclust: $nclust";
 85 | echo "gene-pair eval script: $eval_path_gp";
 86 | echo "gene-pair truth list: $gp_truth";
 87 | 
 88 | #module load R 
 89 | source ~/anaconda3/etc/profile.d/conda.sh
 90 | conda activate $conda_env
 91 | 
 92 | echo "Conda env activated";
 93 | 
 94 | #create output directory if it doesn't exist 
 95 | mkdir -p $out_dir
 96 | 
 97 | 
 98 | # white spaces in between everything! 
 99 | if [[ $py == "true" ]] && [[ $r == "false" ]]
100 | then
101 |     echo "running a Python script."
102 |     python $script_path $in_dir $out_dir
103 | elif [[ $py == "false" ]] && [[ $r == "true" ]]
104 | then
105 |     echo "running a R script."
106 |     Rscript --vanilla $script_path $in_dir $out_dir $nclust
107 | else 
108 |     echo "please specify the language for the script to be run in, can either be python or r but not both or none."
109 | fi 
110 | 
111 | echo "Running evaluation";
112 | conda deactivate
113 | conda activate scib2
114 | 
115 | if [[ "$file_path" == *"seurat4"* ]] || [[ "$file_path" == *"liger"* ]] #|| [[ "$file_path" == *"scmomat"* ]]
116 | then
117 |     echo "Not clustering" 
118 |     python $eval_path $out_dir $file_path $ct_ref 
119 | else 
120 |     echo "Will cluster using latent embedding" 
121 |     python $eval_path $out_dir $file_path $ct_ref $nclust
122 | fi 
123 | 
124 | if [[ "$eval_path_gp" != "false" ]] || [[ "$gp_truth" != "false" ]]
125 | then
126 |     echo "Evaluating predicted profiles"
127 |     Rscript --vanilla $eval_path_gp $in_dir $out_dir $method_key $gp_truth
128 | else 
129 |     echo "Note evaluating predicted profiles, END!"
130 | fi  
131 |     
132 | 
133 | 


--------------------------------------------------------------------------------
/scripts/submit_job_per_condition_n_missing_ct_eval.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #BSUB -J dweisepytest #LSF Job Name
  4 | #BSUB -q mingyao_normal
  5 | #BSUB -o pytestdweise.%J.txt #Name of the job output file
  6 | ### -- Default: use 8 cores --
  7 | #BSUB -n 8
  8 | #BSUB -R "span[hosts=1]"
  9 | ### -- specify that we need 32GB of memory per core/slot --
 10 | #BSUB -R "rusage[mem=32GB]"
 11 | ### -- specify that we want the job to get killed if it exceeds 32 GB per core/slot --
 12 | #BSUB -M 32GB
 13 | ### -- send notification at completion --
 14 | #BSUB -N
 15 | 
 16 | 
 17 | 
 18 | ############################################################
 19 | # Help                                                     #
 20 | ############################################################
 21 | Help()
 22 | {
 23 |    # Print the option summary to stdout. NOTE(review): nothing in this script calls Help (getopts has no -h).
 24 |    echo "general script to run a python/R script (with two arguments) under a certain conda env"
 25 |    echo
 26 |    echo "Syntax: scriptTemplate [-i|w|c|s|p|r|e|m|f|t|l|a|b|g]"
 27 |    echo "options:"
 28 |    echo "i     input address."
 29 |    echo "w     output address"
 30 |    echo "c     conda environment name"
 31 |    echo "s     script path"
 32 |    echo "p     script is written in Python"
 33 |    echo "r     script is written in R"
 34 |    echo "e     eval script path"
 35 |    echo "m     method key"
 36 |    echo "f     path to result matrix (specific to each method)"
 37 |    echo "t     path to barcode-to-cell type table"
 38 |    echo "l     number of clusters"
 39 |    echo "a     path to R script that performs gene-peak association using the predicted gene/peak expression"
 40 |    echo "b     path to gene-peak association pair list, or false if no association to be evaluated"
 41 |    echo "g     path to file storing the rare cell types being evaluated in this script (must be present, no default)"
 42 |    echo
 43 | }
 44 | 
 45 | ############################################################
 46 | ############################################################
 47 | # Main program                                             #
 48 | ############################################################
 49 | ############################################################
 50 | 
 51 | py="false"
 52 | r="false"
 53 | 
 54 | #  If a character is followed by a colon (e.g. f:), that option is expected to have an argument. thus here, p and r are not expected to have an argument.
 55 | while getopts i:w:c:s:pre:f:m:t:l:a:b:g: flag
 56 | do
 57 |     case "${flag}" in
 58 |         i) in_dir=${OPTARG};;
 59 |         w) out_dir=${OPTARG};;
 60 |         c) conda_env=${OPTARG};;
 61 |         s) script_path=${OPTARG};;
 62 |         p) py="true";;
 63 |         r) r="true";;
 64 |         e) eval_path=${OPTARG};;
 65 |         f) file_path=${OPTARG};;
 66 |         m) method_key=${OPTARG};;
 67 |         t) ct_ref=${OPTARG};;
 68 |         l) nclust=${OPTARG};;
 69 |         a) eval_path_gp=${OPTARG};;
 70 |         b) gp_truth=${OPTARG};;
 71 |         g) rare_ct_path=${OPTARG};;
 72 |     esac
 73 | done
 74 | 
 75 | echo "Working Directory: $PWD";
 76 | echo "Input Directory: $in_dir";
 77 | echo "Output Directory: $out_dir";
 78 | echo "Conda Environment: $conda_env";
 79 | echo "Script: $script_path";
 80 | echo "Running in Python: $py";
 81 | echo "Running in R: $r";
 82 | echo "Evaluation script: $eval_path";
 83 | echo "Result matrix path: $file_path";
 84 | echo "Method key: $method_key";
 85 | echo "Cell type reference path: $ct_ref";
 86 | echo "nclust: $nclust";
 87 | echo "gene-pair eval script: $eval_path_gp";
 88 | echo "gene-pair truth list: $gp_truth";
 89 | echo "path to rare cell type list: $rare_ct_path";
 90 | 
 91 | #module load R 
 92 | source ~/anaconda3/etc/profile.d/conda.sh
 93 | conda activate $conda_env
 94 | 
 95 | echo "Conda env activated";
 96 | 
 97 | #create output directory if it doesn't exist 
 98 | mkdir -p $out_dir
 99 | 
100 | 
101 | # white spaces in between everything! 
102 | if [[ $py == "true" ]] && [[ $r == "false" ]]
103 | then
104 |     echo "running a Python script."
105 |     python $script_path $in_dir $out_dir
106 | elif [[ $py == "false" ]] && [[ $r == "true" ]]
107 | then
108 |     echo "running a R script."
109 |     Rscript --vanilla $script_path $in_dir $out_dir $nclust
110 | else 
111 |     echo "please specify the language for the script to be run in, can either be python or r but not both or none."
112 | fi 
113 | 
114 | echo "Running evaluation";
115 | conda deactivate
116 | conda activate scib2
117 | 
118 | if [[ "$file_path" == *"seurat4"* ]] || [[ "$file_path" == *"liger"* ]]
119 | then
120 |     echo "Not clustering" 
121 |     python $eval_path $out_dir $file_path $ct_ref $rare_ct_path
122 | else 
123 |     echo "Will cluster using latent embedding" 
124 |     python $eval_path $out_dir $file_path $ct_ref $rare_ct_path $nclust
125 | fi 
126 | 
127 | if [[ "$eval_path_gp" != "false" ]] || [[ "$gp_truth" != "false" ]]
128 | then
129 |     echo "Evaluating predicted profiles"
130 |     Rscript --vanilla $eval_path_gp $in_dir $out_dir $method_key $gp_truth
131 | else 
132 |     echo "Not evaluating predicted profiles, END!"
133 | fi  
134 |     
135 | 
136 | 


--------------------------------------------------------------------------------
/scripts/submit_job_per_missing_mod_eval.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #BSUB -J dweisepytest #LSF Job Name
  4 | #BSUB -q mingyao_normal
  5 | #BSUB -o pytestdweise.%J.txt #Name of the job output file
  6 | ###BSUB -e pytestdweise.%J.out #Name of the job error file
  7 | ### -- use 2 cores --
  8 | #BSUB -n 2
  9 | ### -- specify that we need 32GB of memory per core/slot --
 10 | #BSUB -R "rusage[mem=32GB]"
 11 | ### -- specify that we want the job to get killed if it exceeds 32 GB per core/slot --
 12 | #BSUB -M 32GB
 13 | ### -- send notification at completion --
 14 | #BSUB -N
 15 | 
 16 | 
 17 | ############################################################
 18 | # Help                                                     #
 19 | ############################################################
 20 | Help()
 21 | {
 22 |    # Print the option summary to stdout. NOTE(review): nothing in this script calls Help (getopts has no -h).
 23 |    echo "general script to run a python script (with two arguments) under a certain conda env"
 24 |    echo
 25 |    echo "Syntax: scriptTemplate [-i|w|c|s|p|r|e|m|f|t|l|a|b]"
 26 |    echo "options:"
 27 |    echo "i     input address."
 28 |    echo "w     output address"
 29 |    echo "c     conda environment name"
 30 |    echo "s     script path"
 31 |    echo "p     script is written in Python"
 32 |    echo "r     script is written in R"
 33 |    echo "e     eval script path"
 34 |    echo "m     method key"
 35 |    echo "f     path to result matrix (specific to each method)"
 36 |    echo "t     path to barcode-to-cell type table"
 37 |    echo "l     number of clusters"
 38 |    echo "a     path to R script that performs gene-peak association using the predicted gene/peak expression"
 39 |    echo "b     path to gene-peak association pair list, or false if no association to be evaluated"
 40 |    echo
 41 | }
 42 | 
 43 | ############################################################
 44 | ############################################################
 45 | # Main program                                             #
 46 | ############################################################
 47 | ############################################################
 48 | 
 49 | py="false"
 50 | r="false"
 51 | 
 52 | #  If a character is followed by a colon (e.g. f:), that option is expected to have an argument. thus here, p and r are not expected to have an argument.
 53 | while getopts i:w:c:s:pre:f:m:t:l:a:b: flag
 54 | do
 55 |     case "${flag}" in
 56 |         i) in_dir=${OPTARG};;
 57 |         w) out_dir=${OPTARG};;
 58 |         c) conda_env=${OPTARG};;
 59 |         s) script_path=${OPTARG};;
 60 |         p) py="true";;
 61 |         r) r="true";;
 62 |         e) eval_path=${OPTARG};;
 63 |         f) file_path=${OPTARG};;
 64 |         m) method_key=${OPTARG};;
 65 |         t) ct_ref=${OPTARG};;
 66 |         l) nclust=${OPTARG};;
 67 |         a) eval_path_gp=${OPTARG};;
 68 |         b) gp_truth=${OPTARG};;
 69 |     esac
 70 | done
 71 | 
 72 | echo "Working Directory: $PWD";
 73 | echo "Input Directory: $in_dir";
 74 | echo "Output Directory: $out_dir";
 75 | echo "Conda Environment: $conda_env";
 76 | echo "Script: $script_path";
 77 | echo "Running in Python: $py";
 78 | echo "Running in R: $r";
 79 | echo "Evaluation script: $eval_path";
 80 | echo "Result matrix path: $file_path";
 81 | echo "Method key: $method_key";
 82 | echo "Cell type reference path: $ct_ref";
 83 | echo "nclust: $nclust";
 84 | echo "gene-pair eval script: $eval_path_gp";
 85 | echo "gene-pair truth list: $gp_truth";
 86 | 
 87 | #module load R 
 88 | source ~/anaconda3/etc/profile.d/conda.sh
 89 | # conda activate $conda_env
 90 | 
 91 | # echo "Conda env activated";
 92 | 
 93 | # #create output directory if it doesn't exist 
 94 | # mkdir -p $out_dir
 95 | 
 96 | 
 97 | # # white spaces in between everything! 
 98 | # if [[ $py == "true" ]] && [[ $r == "false" ]]
 99 | # then
100 | #     echo "running a Python script."
101 | #     python $script_path $in_dir $out_dir
102 | # elif [[ $py == "false" ]] && [[ $r == "true" ]]
103 | # then
104 | #     echo "running a R script."
105 | #     Rscript --vanilla $script_path $in_dir $out_dir $nclust
106 | # else 
107 | #     echo "please specify the language for the script to be run in, can either be python or r but not both or none."
108 | # fi 
109 | 
110 | echo "Running evaluation";
111 | #conda deactivate
112 | conda activate scib2
113 | 
114 | # if [[ "$file_path" == *"seurat4"* ]] || [[ "$file_path" == *"liger"* ]]
115 | # then
116 | #     echo "Not clustering" 
117 | #     python $eval_path $out_dir $file_path $ct_ref 
118 | # else 
119 | #     echo "Will cluster using latent embedding" 
120 | #     python $eval_path $out_dir $file_path $ct_ref $nclust
121 | # fi 
122 | 
123 | if [[ "$eval_path_gp" != "false" ]] || [[ "$gp_truth" != "false" ]]
124 | then
125 |     echo "Evaluating predicted profiles"
126 |     Rscript --vanilla $eval_path_gp $in_dir $out_dir $method_key $gp_truth
127 | else 
128 |     echo "Note evaluating predicted profiles, END!"
129 | fi  
130 |     
131 | 
132 | 


--------------------------------------------------------------------------------