├── Cross_Validation.R
├── DEgenesMAST.R
├── LICENSE
├── README.md
├── Scripts
    ├── run_ACTINN.py
    ├── run_CHETAH.R
    ├── run_CaSTLe.R
    ├── run_Cell_BLAST.py
    ├── run_DigitalCellSorter.py
    ├── run_Garnett_CV.R
    ├── run_Garnett_Pretrained.R
    ├── run_LAmbDA.py
    ├── run_LDA.py
    ├── run_LDA_rejection.py
    ├── run_NMC.py
    ├── run_RF.py
    ├── run_SCINA.R
    ├── run_SVM.py
    ├── run_SVM_rejection.py
    ├── run_SingleR.R
    ├── run_kNN50.py
    ├── run_kNN9.py
    ├── run_moana.py
    ├── run_scID.R
    ├── run_scPred.R
    ├── run_scVI.py
    ├── run_scmap.R
    └── run_singleCellNet.R
├── Snakemake
    ├── Cross_Validation.R
    ├── DEgenesMAST.R
    ├── Dockerfiles
    │   ├── baseline
    │   │   └── Dockerfile
    │   ├── cell_blast
    │   │   └── Dockerfile
    │   ├── chetah
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    │   ├── cross_validation
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    │   ├── garnett
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    │   ├── scid
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    │   ├── scmap
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    │   ├── scvi
    │   │   └── Dockerfile
    │   ├── singlecellnet
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    │   └── singler
    │   │   ├── Dockerfile
    │   │   └── install_packages.R
    ├── LICENSE
    ├── README.md
    ├── Scripts
    │   ├── run_ACTINN.py
    │   ├── run_CHETAH.R
    │   ├── run_CaSTLe.R
    │   ├── run_Cell_BLAST.py
    │   ├── run_DigitalCellSorter.py
    │   ├── run_Garnett_CV.R
    │   ├── run_Garnett_Pretrained.R
    │   ├── run_LAmbDA.py
    │   ├── run_LDA.py
    │   ├── run_LDA_rejection.py
    │   ├── run_NMC.py
    │   ├── run_RF.py
    │   ├── run_SCINA.R
    │   ├── run_SVM.py
    │   ├── run_SVM_rejection.py
    │   ├── run_SingleR.R
    │   ├── run_kNN50.py
    │   ├── run_kNN9.py
    │   ├── run_moana.py
    │   ├── run_scID.R
    │   ├── run_scPred.R
    │   ├── run_scVI.py
    │   ├── run_scmap.R
    │   ├── run_scmapcell.R
    │   ├── run_scmapcluster.R
    │   ├── run_scmaptotal.R
    │   └── run_singleCellNet.R
    ├── Snakefile
    ├── evaluate.R
    ├── example.config.yml
    ├── rank_gene_dropouts.py
    └── rulegraph.png
├── evaluate.R
└── rank_gene_dropouts.py


/Cross_Validation.R:
--------------------------------------------------------------------------------
 1 | #' Build stratified 5-fold cross-validation indices for a labelled dataset.
 2 | #'
 3 | #' Reads the annotation file, removes every cell population that does not
 4 | #' have more than 10 cells, splits the remaining cells into 5 folds
 5 | #' stratified across the cell populations and saves the result as
 6 | #' 'CV_folds.RData' inside `OutputDir`. The saved objects (`n_folds`,
 7 | #' `Train_Idx`, `Test_Idx`, `col_Index`, `Cells_to_Keep`) are the input of
 8 | #' the classifier wrapper scripts.
 9 | #'
10 | #' @param LabelsPath Cell population annotations file path (.csv).
11 | #' @param col_Index Column index (integer) defining which level of
12 | #'   annotation to use in case of multiple cell type annotations
13 | #'   (default is 1).
14 | #' @param OutputDir Output directory in which 'CV_folds.RData' is written.
15 | #'   The working directory of the calling session is not changed.
16 | #' @return `NULL`, invisibly; the function is called for the file it writes.
17 | Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir) {
18 |   Labels <- as.matrix(read.csv(LabelsPath))
19 |   Labels <- as.vector(Labels[, col_Index])
20 | 
21 |   # A population is kept only when it has MORE than 10 cells, so one with
22 |   # exactly 10 cells is removed as well.
23 |   # NOTE(review): this filter has been described as dropping populations
24 |   # with "less than 10 cells"; confirm the intended threshold before
25 |   # touching the comparison, since it changes which cells enter the folds.
26 |   Class_Sizes <- table(Labels)
27 |   Removed_classes <- names(Class_Sizes)[!(Class_Sizes > 10)]
28 |   Cells_to_Keep <- !(Labels %in% Removed_classes)
29 |   Labels <- Labels[Cells_to_Keep]
30 | 
31 |   # Getting training and testing folds. KFold is called through its
32 |   # namespace so that the package is not attached to the caller's session.
33 |   n_folds <- 5
34 |   Folds <- rBayesianOptimization::KFold(Labels, nfolds = n_folds,
35 |                                         stratified = TRUE)
36 | 
37 |   # Fold n_folds is held out in the first split, fold 1 in the last one.
38 |   Test_Folds <- rev(seq_len(n_folds))
39 |   Train_Idx <- vector("list", length(Folds))
40 |   Test_Idx <- vector("list", length(Folds))
41 |   for (i in seq_along(Folds)) {
42 |     Held_Out <- Test_Folds[i]
43 |     Train_Idx[[i]] <- unlist(Folds[-Held_Out])
44 |     Test_Idx[[i]] <- Folds[[Held_Out]]
45 |   }
46 | 
47 |   # Write into OutputDir without calling setwd(), which would silently
48 |   # change the working directory of the calling session.
49 |   save(n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep,
50 |        file = file.path(OutputDir, "CV_folds.RData"))
51 |   invisible(NULL)
52 | }


--------------------------------------------------------------------------------
/DEgenesMAST.R:
--------------------------------------------------------------------------------
 1 | #' Select up to 20 marker genes per cell population with a MAST test.
 2 | #'
 3 | #' Applies a one-vs-all differential expression test to the data. The
 4 | #' training data should be used as input. The output can be rewritten to the
 5 | #' format of the prior-knowledge-supervised classifiers and afterwards be
 6 | #' used to classify the test set.
 7 | #'
 8 | #' @param Data Genes X cells matrix (rows = genes, columns = cells).
 9 | #' @param Labels Labels of the cells in `Data`.
10 | #' @param Normalize The input for MAST should be cpm normalized data; set
11 | #'   this to TRUE if the data is not normalized yet.
12 | #' @param LogTransform The input for MAST should be log-transformed; set
13 | #'   this to TRUE if the data is not log-transformed yet.
14 | #' @return Matrix with 20 rows and one column per cell population. Each
15 | #'   column lists the genes with a positive log fold change for that
16 | #'   population, in the order returned by `FindAllMarkers()`; rows stay NA
17 | #'   when fewer than 20 genes qualify.
18 | DEgenesMAST <- function(Data, Labels, Normalize = FALSE,
19 |                         LogTransform = FALSE) {
20 |   library(Seurat)
21 | 
22 |   if (Normalize) {
23 |     # Scale every cell (column) to counts per million.
24 |     Data <- apply(Data, 2, function(x) (x / sum(x)) * 1000000)
25 |   }
26 | 
27 |   if (LogTransform) {
28 |     Data <- log(Data + 1, base = 2)
29 |   }
30 | 
31 |   # NOTE(review): `raw.data`, `ident.use` and `avg_logFC` look like the
32 |   # Seurat 2.x names - confirm against the installed Seurat version.
33 |   SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
34 |   SeuObj <- SetIdent(SeuObj, ident.use = Labels)
35 |   DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
36 | 
37 |   # Work with character names so that a column is always addressed by its
38 |   # name, never by position (numeric labels would otherwise be used as
39 |   # column numbers).
40 |   Populations <- as.character(unique(Labels))
41 |   Markers <- matrix(nrow = 20, ncol = length(Populations))
42 |   colnames(Markers) <- Populations
43 |   for (i in Populations) {
44 |     Is_Marker <- (DEgenes$cluster == i) & (DEgenes$avg_logFC > 0)
45 |     TempList <- DEgenes$gene[Is_Marker]
46 |     # Fill the column from the top with at most 20 genes.
47 |     n_keep <- min(length(TempList), 20)
48 |     if (n_keep > 0) {
49 |       Markers[seq_len(n_keep), i] <- TempList[seq_len(n_keep)]
50 |     }
51 |   }
52 |   Markers
53 | }
54 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 tabdelaal
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Scripts/run_ACTINN.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | import rpy2.robjects as robjects
  6 | 
  7 | def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
  8 |     '''
  9 |     run ACTINN
 10 |     Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,
 11 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 12 |   
 13 |     Parameters
 14 |     ----------
 15 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 16 |     as row names and gene names as column names.
 17 |     LabelsPath : Cell population annotations file path (.csv).
 18 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 19 |     OutputDir : Output directory defining the path of the exported file.
 20 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 21 |     defining the genes order for each cross validation fold, default is NULL.
 22 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 23 |     '''
 24 |     
 25 |     # read the Rdata file
 26 |     robjects.r['load'](CV_RDataPath)
 27 | 
 28 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 29 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 30 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 31 |     col = col - 1 
 32 |     test_ind = np.array(robjects.r['Test_Idx'])
 33 |     train_ind = np.array(robjects.r['Train_Idx'])
 34 | 
 35 |     # read the data
 36 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 37 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 38 |     
 39 |     labels = labels.iloc[tokeep]
 40 |     data = data.iloc[tokeep]
 41 |     
 42 |     # read the feature file
 43 |     if (NumGenes > 0):
 44 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 45 |     
 46 |     # folder with results
 47 |     os.chdir(OutputDir)
 48 |     
 49 |     tot=[]
 50 |     truelab = []
 51 |     pred = []
 52 | 
 53 |     for i in range(np.squeeze(nfolds)):
 54 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 55 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 56 |     
 57 |         train=data.iloc[train_ind_i]
 58 |         test=data.iloc[test_ind_i]
 59 |         y_train=labels.iloc[train_ind_i]
 60 |         y_test=labels.iloc[test_ind_i]
 61 |         
 62 |         if (NumGenes > 0):
 63 |             feat_to_use = features.iloc[0:NumGenes,i]
 64 |             train = train.iloc[:,feat_to_use]
 65 |             test = test.iloc[:,feat_to_use]
 66 |         
 67 |         train = train.transpose()
 68 |         test = test.transpose()
 69 |         
 70 |         train.to_csv("train.csv")
 71 |         test.to_csv("test.csv")
 72 |         y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
 73 |         y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')
 74 |         
 75 |         tm.sleep(60)
 76 |             
 77 |         os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv")
 78 |         os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv")
 79 |         
 80 |         start = tm.time()
 81 |         os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5")    
 82 |         tot.append(tm.time()-start)
 83 |         
 84 |         tm.sleep(60)
 85 | 
 86 |         truelab.extend(y_test.values)
 87 |         predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1])            
 88 |         pred.extend(predlabels.values)
 89 |     
 90 |             
 91 |     truelab = pd.DataFrame(truelab)
 92 |     pred = pd.DataFrame(pred)
 93 |     tot_time = pd.DataFrame(tot)
 94 |     
 95 |     if (NumGenes == 0):  
 96 |         truelab.to_csv("ACTINN_True_Labels.csv", index = False)
 97 |         pred.to_csv("ACTINN_Pred_Labels.csv", index = False)
 98 |         tot_time.to_csv("ACTINN_Total_Time.csv", index = False)
 99 |     else:
100 |         truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False)
101 |         pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
102 |         tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False)
103 |         
104 |         
105 |         
106 |         
107 |         
108 |         
109 |         
110 | 


--------------------------------------------------------------------------------
/Scripts/run_CHETAH.R:
--------------------------------------------------------------------------------
 1 | #' Run CHETAH on a benchmark dataset with 5-fold cross validation.
 2 | #'
 3 | #' Wrapper that writes the true and predicted cell labels as well as the
 4 | #' computation time of every fold as csv files into `OutputDir`.
 5 | #'
 6 | #' @param DataPath Data file path (.csv), cells-genes matrix with cell unique
 7 | #'   barcodes as row names and gene names as column names.
 8 | #' @param LabelsPath Cell population annotations file path (.csv).
 9 | #' @param CV_RDataPath Cross validation RData file path (.RData), obtained
10 | #'   from the Cross_Validation.R function.
11 | #' @param OutputDir Output directory defining the path of the exported
12 | #'   files. The working directory of the calling session is not changed.
13 | #' @param GeneOrderPath Gene order file path (.csv) obtained from feature
14 | #'   selection, defining the genes order for each cross validation fold,
15 | #'   default is NULL.
16 | #' @param NumGenes Number of genes used in case of feature selection
17 | #'   (integer), default is NULL.
18 | #' @return `NULL`, invisibly; the function is called for the files it writes.
19 | run_CHETAH <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir,
20 |                        GeneOrderPath = NULL, NumGenes = NULL) {
21 |   Data <- read.csv(DataPath, row.names = 1)
22 |   Labels <- as.matrix(read.csv(LabelsPath))
23 |   # Loads n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep.
24 |   load(CV_RDataPath)
25 |   Labels <- as.vector(Labels[, col_Index])
26 |   Data <- Data[Cells_to_Keep, ]
27 |   Labels <- Labels[Cells_to_Keep]
28 | 
29 |   # Feature selection is used only when both optional arguments are given.
30 |   # `&&` is the scalar operator meant for `if` conditions.
31 |   Use_Features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
32 |   if (Use_Features) {
33 |     GenesOrder <- read.csv(GeneOrderPath)
34 |   }
35 | 
36 |   #############################################################################
37 |   #                                CHETAH                                     #
38 |   #############################################################################
39 |   library(CHETAH)
40 |   library(SingleCellExperiment)
41 |   True_Labels_CHETAH <- list()
42 |   Pred_Labels_CHETAH <- list()
43 |   Total_Time_CHETAH <- list()
44 |   Data <- t(as.matrix(Data))  # genes in rows, cells in columns from here on
45 | 
46 |   for (i in seq_len(n_folds)) {
47 |     if (Use_Features) {
48 |       # NOTE(review): the + 1 suggests the stored gene ranks are 0-based;
49 |       # confirm against the script that produces the gene order file.
50 |       Genes <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
51 |     } else {
52 |       Genes <- seq_len(nrow(Data))
53 |     }
54 | 
55 |     sce <- SingleCellExperiment(
56 |       assays = list(counts = Data[Genes, Train_Idx[[i]]]),
57 |       colData = data.frame(celltypes = Labels[Train_Idx[[i]]])
58 |     )
59 |     sce_test <- SingleCellExperiment(
60 |       assays = list(counts = Data[Genes, Test_Idx[[i]]]),
61 |       colData = data.frame(celltypes = Labels[Test_Idx[[i]]])
62 |     )
63 | 
64 |     # Only the classification itself is timed.
65 |     start_time <- Sys.time()
66 |     if (Use_Features) {
67 |       sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce,
68 |                                    n_genes = NumGenes)
69 |     } else {
70 |       sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
71 |     }
72 |     end_time <- Sys.time()
73 | 
74 |     Total_Time_CHETAH[i] <- as.numeric(difftime(end_time, start_time,
75 |                                                 units = "secs"))
76 |     True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])
77 |     Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)
78 |   }
79 |   True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))
80 |   Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))
81 |   Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))
82 | 
83 |   # Build the output paths explicitly instead of calling setwd(), which
84 |   # would change the working directory of the calling session.
85 |   Prefix <- if (Use_Features) paste0("CHETAH_", NumGenes, "_") else "CHETAH_"
86 |   Out_File <- function(Suffix) file.path(OutputDir, paste0(Prefix, Suffix))
87 |   write.csv(True_Labels_CHETAH, Out_File("True_Labels.csv"), row.names = FALSE)
88 |   write.csv(Pred_Labels_CHETAH, Out_File("Pred_Labels.csv"), row.names = FALSE)
89 |   write.csv(Total_Time_CHETAH, Out_File("Total_Time.csv"), row.names = FALSE)
90 |   invisible(NULL)
91 | }
92 | 


--------------------------------------------------------------------------------
/Scripts/run_Cell_BLAST.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time as tm
  3 | import pandas as pd
  4 | import warnings
  5 | warnings.filterwarnings("ignore")
  6 | 
  7 | import tensorflow as tf
  8 | tf.logging.set_verbosity(0)
  9 | 
 10 | import Cell_BLAST as cb
 11 | import numpy as np
 12 | from numpy import genfromtxt as gft
 13 | import rpy2.robjects as robjects
 14 | 
 15 | 
 16 | def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 17 |     '''
 18 |     run Cell_BLAST
 19 |     Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,
 20 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 21 |   
 22 |     Parameters
 23 |     ----------
 24 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 25 |     as row names and gene names as column names.
 26 |     LabelsPath : Cell population annotations file path (.csv).
 27 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 28 |     OutputDir : Output directory defining the path of the exported file.
 29 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 30 |     defining the genes order for each cross validation fold, default is NULL.
 31 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 32 |     '''
 33 |         
 34 |     # read the Rdata file
 35 |     robjects.r['load'](CV_RDataPath)
 36 | 
 37 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 38 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 39 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 40 |     col = col - 1 
 41 |     test_ind = np.array(robjects.r['Test_Idx'])
 42 |     train_ind = np.array(robjects.r['Train_Idx'])
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 | 
 48 |     # read the data and labels
 49 |     data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
 50 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 51 |     
 52 |     data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)
 53 | 
 54 |     labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)      
 55 |     labels = labels[tokeep]
 56 | 
 57 |     os.chdir(OutputDir)
 58 |     
 59 |     truelab = []
 60 |     pred = []
 61 |     tr_time = []
 62 |     ts_time = []
 63 |     
 64 |     for i in range(np.squeeze(nfolds)):
 65 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 66 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 67 | 
 68 |         train=data[train_ind_i,:]
 69 |         test=data[test_ind_i,:]
 70 |         y_train = labels[train_ind_i]
 71 |         y_test = labels[test_ind_i]
 72 |         
 73 |         if (NumGenes > 0):
 74 |             feat_to_use = features.iloc[0:NumGenes,i]
 75 |             train = train[:,feat_to_use]
 76 |             test = test[:,feat_to_use]
 77 | 
 78 |         
 79 |         train.obs['cell_type'] = y_train
 80 |                 
 81 |         start = tm.time()
 82 |                 
 83 |         # reduce dimensions
 84 |         num_epoch = 50
 85 |         models = []
 86 |     
 87 |         for j in range(4):
 88 |             models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))
 89 |     
 90 |         # train model
 91 |         blast = cb.blast.BLAST(models, train).build_empirical()
 92 |         tr_time.append(tm.time()-start)
 93 |         
 94 |         # predict labels
 95 |         start = tm.time()
 96 |         test_pred = blast.query(test).annotate('cell_type')
 97 |         ts_time.append(tm.time()-start)
 98 | 
 99 |         truelab.extend(y_test)
100 |         pred.extend(test_pred.values)
101 |     
102 |     #write results    
103 |     truelab = pd.DataFrame(truelab)
104 |     pred = pd.DataFrame(pred)
105 |             
106 |     tr_time = pd.DataFrame(tr_time)
107 |     ts_time = pd.DataFrame(ts_time)
108 |     
109 |     if (NumGenes == 0):  
110 |         truelab.to_csv("Cell_BLAST_True_Labels.csv", index = False)
111 |         pred.to_csv("Cell_BLAST_Pred_Labels.csv", index = False)
112 |         tr_time.to_csv("Cell_BLAST_Training_Time.csv", index = False)
113 |         ts_time.to_csv("Cell_BLAST_Testing_Time.csv", index = False)
114 |     else:
115 |         truelab.to_csv("Cell_BLAST_" + str(NumGenes) + "_True_Labels.csv", index = False)
116 |         pred.to_csv("Cell_BLAST_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
117 |         tr_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Training_Time.csv", index = False)
118 |         ts_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Testing_Time.csv", index = False)
119 |         
120 | 


--------------------------------------------------------------------------------
/Scripts/run_DigitalCellSorter.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scripts.DigitalCellSorter as DigitalCellSorter
 4 | import os
 5 | import time as tm
 6 | import rpy2.robjects as robjects
 7 | 
 8 | def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 9 |     '''
10 |     run DigitalCellSorter
11 |     Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
12 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.  
13 |   
14 |     Parameters
15 |     ----------
16 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
17 |     as row names and gene names as column names.
18 |     LabelsPath : Cell population annotations file path (.csv).
19 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
20 |     GeneListPath : Data file path to the genest.
21 |     OutputDir : Output directory defining the path of the exported file.
22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
23 |     defining the genes order for each cross validation fold, default is NULL.
24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
25 |     '''
26 |         
27 |     # read the Rdata file
28 |     robjects.r['load'](CV_RDataPath)
29 | 
30 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
31 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
32 |     col = col - 1
33 |     
34 |     # read the data
35 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
36 |     data = data.iloc[tokeep]
37 |     
38 |     truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
39 |     truelab = truelab.iloc[tokeep]
40 | 
41 | 
42 |     # read the feature file
43 |     if (NumGenes > 0):
44 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
45 |         feat_to_use = features.iloc[0:NumGenes,0]
46 |         data = data.iloc[:,feat_to_use]
47 |         
48 |     data = data.transpose()
49 |     
50 |     # number of different cell types in the data?
51 |     n_clusters = 8
52 |     AvailableCPUsCount = 1
53 |     N_samples_for_distribution = 10000
54 |         
55 |     start = tm.time()
56 |     pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', 
57 |                                                 saveDir = OutputDir, 
58 |                                                 geneListFileName = GeneListPath,
59 |                                                 N_samples_for_distribution = N_samples_for_distribution,
60 |                                                 AvailableCPUsCount = AvailableCPUsCount,
61 |                                                 clusterIndex=None,
62 |                                                 clusterName=None,
63 |                                                 n_clusters=n_clusters)	
64 |     runtime = tm.time() - start 
65 |     
66 |     os.chdir(OutputDir)
67 |     
68 |     results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])
69 | 
70 |     prediction = np.zeros(np.shape(pred), dtype='>U10')
71 |     
72 |     for i in range(len(results)):
73 |     	prediction[np.where(pred == i)] = results.values[i]
74 |     
75 |     prediction = pd.DataFrame(prediction)
76 |         
77 |     if (NumGenes == 0):  
78 |         truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
79 |         prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
80 |         with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
81 |             f.write("%f\n" % runtime)
82 |     else:
83 |         truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
84 |         prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
85 |         with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
86 |             f.write("%f\n" % runtime)
87 | 
88 |             
89 | 
90 |         


--------------------------------------------------------------------------------
/Scripts/run_Garnett_Pretrained.R:
--------------------------------------------------------------------------------
 1 | #' Run Garnett with a pretrained classifier on a benchmark dataset.
 2 | #'
 3 | #' Wrapper that writes the true and predicted cell labels as well as the
 4 | #' computation time into `OutputDir`.
 5 | #'
 6 | #' @param DataPath Data file path (.csv), cells-genes matrix with cell unique
 7 | #'   barcodes as row names and gene names as column names.
 8 | #' @param LabelsPath Cell population annotations file path (.csv).
 9 | #' @param GenesPath Path to the file with the gene names.
10 | #' @param CV_RDataPath Cross validation RData file path (.RData), obtained
11 | #'   from the Cross_Validation.R function.
12 | #' @param ClassifierPath Path to the pretrained classifier (read with
13 | #'   `load()`).
14 | #' @param OutputDir Output directory defining the path of the exported
15 | #'   files. The working directory of the calling session is not changed.
16 | #' @param Human Boolean indicating whether the dataset is human (TRUE) or
17 | #'   mouse (FALSE).
18 | #' @return `NULL`, invisibly; the function is called for the files it writes.
19 | run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath,
20 |                                    CV_RDataPath, ClassifierPath, OutputDir,
21 |                                    Human) {
22 |   # load needed libraries
23 |   library(garnett)
24 | 
25 |   if (Human) {
26 |     library(org.Hs.eg.db)
27 |   } else {
28 |     library(org.Mm.eg.db)
29 |   }
30 | 
31 |   # Loads col_Index and Cells_to_Keep (among others) saved by
32 |   # Cross_Validation.
33 |   load(CV_RDataPath)
34 | 
35 |   # NOTE(review): the file is expected to define `hsPBMC` (human) or
36 |   # `mmLung` (mouse), the names used in the classify_cells() calls below.
37 |   load(ClassifierPath)
38 | 
39 |   # Use the annotation column that was chosen when the folds were built,
40 |   # then apply the cell filter.
41 |   labels <- as.matrix(read.csv(LabelsPath))
42 |   labels <- as.vector(labels[, col_Index])[Cells_to_Keep]
43 | 
44 |   # Read the header as column names so that the counts stay numeric, and
45 |   # apply the cell filter to the barcodes as well as to the counts so that
46 |   # both describe the same cells.
47 |   mat <- read.csv(DataPath, row.names = 1)
48 |   barcodes <- row.names(mat)[Cells_to_Keep]
49 |   data <- as.matrix(mat[Cells_to_Keep, ])
50 |   data <- t(data) # ensure that the genes are rows, and the cells are columns
51 | 
52 |   pdata <- data.frame(barcodes)
53 |   fdata <- read.table(GenesPath)
54 |   names(fdata) <- "gene_short_name"
55 |   row.names(fdata) <- fdata$gene_short_name
56 | 
57 |   row.names(data) <- row.names(fdata)
58 |   colnames(data) <- row.names(pdata)
59 | 
60 |   pd <- new("AnnotatedDataFrame", data = pdata)
61 |   fd <- new("AnnotatedDataFrame", data = fdata)
62 |   pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
63 |                              phenoData = pd,
64 |                              featureData = fd)
65 | 
66 |   start_time <- Sys.time()
67 | 
68 |   pbmc_cds <- estimateSizeFactors(pbmc_cds)
69 | 
70 |   if (Human) {
71 |     pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db,
72 |                                cluster_extend = TRUE,
73 |                                cds_gene_id_type = "SYMBOL")
74 |   } else {
75 |     pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db,
76 |                                cluster_extend = TRUE,
77 |                                cds_gene_id_type = "SYMBOL")
78 |   }
79 | 
80 |   end_time <- Sys.time()
81 | 
82 |   # Fix the unit: a plain difference of two times picks seconds, minutes or
83 |   # hours depending on its size, which makes the saved number ambiguous.
84 |   test_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
85 | 
86 |   # NOTE(review): the prediction file is tab separated and lacks the
87 |   # "Pretrained" part of the other two file names; kept as is because
88 |   # downstream code may rely on it.
89 |   write.table(pData(pbmc_cds)$cluster_ext_type,
90 |               file = file.path(OutputDir, "Garnett_Pred_Labels.csv"),
91 |               quote = TRUE, sep = "\t", row.names = FALSE)
92 |   write.csv(labels, file.path(OutputDir, "Garnett_Pretrained_True_Labels.csv"),
93 |             row.names = FALSE)
94 |   write.csv(test_time,
95 |             file.path(OutputDir, "Garnett_Pretrained_Testing_Time.csv"),
96 |             row.names = FALSE)
97 |   invisible(NULL)
98 | }


--------------------------------------------------------------------------------
/Scripts/run_LDA.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 10 |     '''
 11 |     run baseline classifier: LDA
 12 |     Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     '''
 26 |         
 27 |     # read the Rdata file
 28 |     robjects.r['load'](CV_RDataPath)
 29 | 
 30 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 31 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 32 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 33 |     col = col - 1 
 34 |     test_ind = np.array(robjects.r['Test_Idx'])
 35 |     train_ind = np.array(robjects.r['Train_Idx'])
 36 | 
 37 |     # read the data
 38 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 39 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 40 |     
 41 |     labels = labels.iloc[tokeep]
 42 |     data = data.iloc[tokeep]
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 |     
 48 |     # folder with results
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 |         
 54 |     Classifier = LinearDiscriminantAnalysis()
 55 |             
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 |         
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 |     
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 |             
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 |             
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 |                     
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 |             
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 |                 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 |         
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 |         
 92 |     if (NumGenes == 0):  
 93 |         truelab.to_csv("LDA_True_Labels.csv", index = False)
 94 |         pred.to_csv("LDA_Pred_Labels.csv", index = False)
 95 |         tr_time.to_csv("LDA_Training_Time.csv", index = False)
 96 |         ts_time.to_csv("LDA_Testing_Time.csv", index = False)
 97 |     else:
 98 |         truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
 99 |         pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
100 |         tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
101 |         ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)
102 | 
103 |     
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/Scripts/run_LDA_rejection.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
 10 |     '''
 11 |     run baseline classifier: LDA
 12 |     Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     Threshold : Threshold used when rejecting the genes, default is 0.7.
 26 |     '''
 27 |         
 28 |     # read the Rdata file
 29 |     robjects.r['load'](CV_RDataPath)
 30 | 
 31 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 32 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 33 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 34 |     col = col - 1 
 35 |     test_ind = np.array(robjects.r['Test_Idx'])
 36 |     train_ind = np.array(robjects.r['Train_Idx'])
 37 | 
 38 |     # read the data
 39 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 40 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 41 |     
 42 |     labels = labels.iloc[tokeep]
 43 |     data = data.iloc[tokeep]
 44 |     
 45 |     # read the feature file
 46 |     if (NumGenes > 0):
 47 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 48 |     
 49 |     # folder with results
 50 |     os.chdir(OutputDir)
 51 |     
 52 |     # normalize data
 53 |     data = np.log1p(data)
 54 |         
 55 |     Classifier = LinearDiscriminantAnalysis()
 56 |             
 57 |     tr_time=[]
 58 |     ts_time=[]
 59 |     truelab = []
 60 |     pred = []
 61 |         
 62 |     for i in range(np.squeeze(nfolds)):
 63 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 64 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 65 |     
 66 |         train=data.iloc[train_ind_i]
 67 |         test=data.iloc[test_ind_i]
 68 |         y_train=labels.iloc[train_ind_i]
 69 |         y_test=labels.iloc[test_ind_i]
 70 |             
 71 |         if (NumGenes > 0):
 72 |             feat_to_use = features.iloc[0:NumGenes,i]
 73 |             train = train.iloc[:,feat_to_use]
 74 |             test = test.iloc[:,feat_to_use]
 75 |             
 76 |         start=tm.time()
 77 |         Classifier.fit(train, y_train)
 78 |         tr_time.append(tm.time()-start)
 79 |                     
 80 |         start=tm.time()
 81 |         predicted = Classifier.predict(test)
 82 |         prob = np.max(Classifier.predict_proba(test), axis = 1)
 83 |         unlabeled = np.where(prob < Threshold)
 84 |         predicted[unlabeled] = 'Unknown'
 85 |         ts_time.append(tm.time()-start)
 86 |             
 87 |         truelab.extend(y_test.values)
 88 |         pred.extend(predicted)
 89 |                 
 90 |     truelab = pd.DataFrame(truelab)
 91 |     pred = pd.DataFrame(pred)
 92 |         
 93 |     tr_time = pd.DataFrame(tr_time)
 94 |     ts_time = pd.DataFrame(ts_time)
 95 |         
 96 |     if (NumGenes == 0):  
 97 |         truelab.to_csv("LDA_True_Labels.csv", index = False)
 98 |         pred.to_csv("LDA_Pred_Labels.csv", index = False)
 99 |         tr_time.to_csv("LDA_Training_Time.csv", index = False)
100 |         ts_time.to_csv("LDA_Testing_Time.csv", index = False)
101 |     else:
102 |         truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
103 |         pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
104 |         tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
105 |         ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)
106 | 
107 |     
108 | 
109 | 
110 | 
111 | 


--------------------------------------------------------------------------------
/Scripts/run_NMC.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.neighbors import NearestCentroid
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 10 |     '''
 11 |     run baseline classifier: NMC
 12 |     Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     '''
 26 |         
 27 |     # read the Rdata file
 28 |     robjects.r['load'](CV_RDataPath)
 29 | 
 30 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 31 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 32 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 33 |     col = col - 1 
 34 |     test_ind = np.array(robjects.r['Test_Idx'])
 35 |     train_ind = np.array(robjects.r['Train_Idx'])
 36 | 
 37 |     # read the data
 38 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 39 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 40 |     
 41 |     labels = labels.iloc[tokeep]
 42 |     data = data.iloc[tokeep]
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 |     
 48 |     # folder with results
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 |         
 54 |     Classifier = NearestCentroid()
 55 |             
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 |         
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 |     
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 |             
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 |             
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 |                     
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 |             
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 |                 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 |         
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 |         
 92 |     if (NumGenes == 0):  
 93 |         truelab.to_csv("NMC_True_Labels.csv", index = False)
 94 |         pred.to_csv("NMC_Pred_Labels.csv", index = False)
 95 |         tr_time.to_csv("NMC_Training_Time.csv", index = False)
 96 |         ts_time.to_csv("NMC_Testing_Time.csv", index = False)
 97 |     else:
 98 |         truelab.to_csv("NMC_" + str(NumGenes) + "_True_Labels.csv", index = False)
 99 |         pred.to_csv("NMC_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
100 |         tr_time.to_csv("NMC_" + str(NumGenes) + "_Training_Time.csv", index = False)
101 |         ts_time.to_csv("NMC_" + str(NumGenes) + "_Testing_Time.csv", index = False)
102 | 
103 |     
104 | 
105 | 
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/Scripts/run_RF.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.ensemble import RandomForestClassifier
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 10 |     '''
 11 |     run baseline classifier: RF
 12 |     Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     '''
 26 |         
 27 |     # read the Rdata file
 28 |     robjects.r['load'](CV_RDataPath)
 29 | 
 30 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 31 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 32 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 33 |     col = col - 1 
 34 |     test_ind = np.array(robjects.r['Test_Idx'])
 35 |     train_ind = np.array(robjects.r['Train_Idx'])
 36 | 
 37 |     # read the data
 38 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 39 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 40 |     
 41 |     labels = labels.iloc[tokeep]
 42 |     data = data.iloc[tokeep]
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 |     
 48 |     # folder with results
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 |         
 54 |     Classifier = RandomForestClassifier(n_estimators = 50)
 55 |             
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 |         
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 |     
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 |             
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 |             
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 |                     
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 |             
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 |                 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 |         
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 |         
 92 |     if (NumGenes == 0):  
 93 |         truelab.to_csv("RF_True_Labels.csv", index = False)
 94 |         pred.to_csv("RF_Pred_Labels.csv", index = False)
 95 |         tr_time.to_csv("RF_Training_Time.csv", index = False)
 96 |         ts_time.to_csv("RF_Testing_Time.csv", index = False)
 97 |     else:
 98 |         truelab.to_csv("RF_" + str(NumGenes) + "_True_Labels.csv", index = False)
 99 |         pred.to_csv("RF_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
100 |         tr_time.to_csv("RF_" + str(NumGenes) + "_Training_Time.csv", index = False)
101 |         ts_time.to_csv("RF_" + str(NumGenes) + "_Testing_Time.csv", index = False)
102 | 
103 |     
104 | 
105 | 
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/Scripts/run_SCINA.R:
--------------------------------------------------------------------------------
 1 | run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){
 2 |   "
 3 |   run SCINA
 4 |   Wrapper script to run SCINA on a benchmark dataset,
 5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 6 |   
 7 |   Parameters
 8 |   ----------
 9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
10 |   as row names and gene names as column names.
11 |   LabelsPath : Cell population annotations file path (.csv).
12 |   GeneSigPath : Cell type marker genes file path (.csv)
13 |   OutputDir : Output directory defining the path of the exported file.
14 |   "
15 |   
16 |   Data <- read.csv(DataPath,row.names = 1)
17 |   Labels <- as.vector(as.matrix(read.csv(LabelsPath)))
18 |   Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),]
19 |   Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))]
20 |   Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'
21 |   Labels[Labels == 'CD19+ B'] <- 'CD19_B'
22 |   Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'
23 |   
24 |   
25 |   #############################################################################
26 |   #                                 SCINA                                     #
27 |   #############################################################################
28 |   library(SCINA)
29 |   Signature_Genes <- preprocess.signatures(GeneSigPath)
30 |   True_Labels_SCINA <- list()
31 |   Pred_Labels_SCINA <- list()
32 |   Total_Time_SCINA <- list()
33 |   
34 |   library(preprocessCore)
35 |   Data = t(as.matrix(Data))
36 |   Data=log(Data+1)
37 |   Data[]=normalize.quantiles(Data)
38 |   
39 |   start_time <- Sys.time()
40 |   results = SCINA(Data, Signature_Genes)
41 |   end_time <- Sys.time()
42 |   
43 |   True_Labels_SCINA <- Labels
44 |   Pred_Labels_SCINA <- results$cell_labels
45 |   Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs'))
46 |   
47 |   setwd(OutputDir)
48 |   
49 |   write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = FALSE)
50 |   write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE)
51 |   write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE)
52 | }
53 | 


--------------------------------------------------------------------------------
/Scripts/run_SVM.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.svm import LinearSVC
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 10 |     '''
 11 |     run baseline classifier: SVM
 12 |     Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     '''
 26 |         
 27 |     # read the Rdata file
 28 |     robjects.r['load'](CV_RDataPath)
 29 | 
 30 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 31 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 32 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 33 |     col = col - 1 
 34 |     test_ind = np.array(robjects.r['Test_Idx'])
 35 |     train_ind = np.array(robjects.r['Train_Idx'])
 36 | 
 37 |     # read the data
 38 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 39 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 40 |     
 41 |     labels = labels.iloc[tokeep]
 42 |     data = data.iloc[tokeep]
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 |     
 48 |     # folder with results
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 |         
 54 |     Classifier = LinearSVC()
 55 |             
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 |         
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 |     
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 |             
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 |             
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 |                     
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 |             
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 |                 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 |         
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 |         
 92 |     if (NumGenes == 0):  
 93 |         truelab.to_csv("SVM_True_Labels.csv", index = False)
 94 |         pred.to_csv("SVM_Pred_Labels.csv", index = False)
 95 |         tr_time.to_csv("SVM_Training_Time.csv", index = False)
 96 |         ts_time.to_csv("SVM_Testing_Time.csv", index = False)
 97 |     else:
 98 |         truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index = False)
 99 |         pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
100 |         tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index = False)
101 |         ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index = False)
102 | 
103 |     
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/Scripts/run_SVM_rejection.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.svm import LinearSVC
  6 | import rpy2.robjects as robjects
  7 | from sklearn.calibration import CalibratedClassifierCV
  8 | 
  9 | 
 10 | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
 11 |     '''
 12 |     run baseline classifier: SVM
 13 |     Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,
 14 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 15 |   
 16 |     Parameters
 17 |     ----------
 18 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 19 |     as row names and gene names as column names.
 20 |     LabelsPath : Cell population annotations file path (.csv).
 21 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 22 |     OutputDir : Output directory defining the path of the exported file.
 23 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 24 |     defining the genes order for each cross validation fold, default is NULL.
 25 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 26 |     Threshold : Threshold used when rejecting the cells, default is 0.7.
 27 | 
 28 |     '''
 29 |         
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1 
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 |     
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 |     
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 |     
 51 |     # folder with results
 52 |     os.chdir(OutputDir)
 53 |     
 54 |     # normalize data
 55 |     data = np.log1p(data)
 56 |         
 57 |     Classifier = LinearSVC()
 58 |     clf = CalibratedClassifierCV(Classifier)
 59 |             
 60 |     tr_time=[]
 61 |     ts_time=[]
 62 |     truelab = []
 63 |     pred = []
 64 |         
 65 |     for i in range(np.squeeze(nfolds)):
 66 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 67 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 68 |     
 69 |         train=data.iloc[train_ind_i]
 70 |         test=data.iloc[test_ind_i]
 71 |         y_train=labels.iloc[train_ind_i]
 72 |         y_test=labels.iloc[test_ind_i]
 73 |             
 74 |         if (NumGenes > 0):
 75 |             feat_to_use = features.iloc[0:NumGenes,i]
 76 |             train = train.iloc[:,feat_to_use]
 77 |             test = test.iloc[:,feat_to_use]
 78 |             
 79 |         start=tm.time()
 80 |         clf.fit(train, y_train)
 81 |         tr_time.append(tm.time()-start)
 82 |                     
 83 |         start=tm.time()
 84 |         predicted = clf.predict(test)
 85 |         prob = np.max(clf.predict_proba(test), axis = 1)
 86 |         unlabeled = np.where(prob < Threshold)
 87 |         predicted[unlabeled] = 'Unknown'
 88 |         ts_time.append(tm.time()-start)
 89 |             
 90 |         truelab.extend(y_test.values)
 91 |         pred.extend(predicted)
 92 |                 
 93 |     truelab = pd.DataFrame(truelab)
 94 |     pred = pd.DataFrame(pred)
 95 |         
 96 |     tr_time = pd.DataFrame(tr_time)
 97 |     ts_time = pd.DataFrame(ts_time)
 98 |         
 99 |     if (NumGenes == 0):  
100 |         truelab.to_csv("SVM_True_Labels.csv", index = False)
101 |         pred.to_csv("SVM_Pred_Labels.csv", index = False)
102 |         tr_time.to_csv("SVM_Training_Time.csv", index = False)
103 |         ts_time.to_csv("SVM_Testing_Time.csv", index = False)
104 |     else:
105 |         truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index = False)
106 |         pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
107 |         tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index = False)
108 |         ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index = False)
109 | 
110 |     
111 | 
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/Scripts/run_SingleR.R:
--------------------------------------------------------------------------------
 1 | run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 2 |   "
 3 |   run SingleR
 4 |   Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation,
 5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 6 |   
 7 |   Parameters
 8 |   ----------
 9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
10 |   as row names and gene names as column names.
11 |   LabelsPath : Cell population annotations file path (.csv).
12 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
13 |   OutputDir : Output directory defining the path of the exported file.
14 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
15 |   defining the genes order for each cross validation fold, default is NULL.
16 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
17 |   "
18 |   
19 |   Data <- read.csv(DataPath,row.names = 1)
20 |   Labels <- as.matrix(read.csv(LabelsPath))
21 |   load(CV_RDataPath)
22 |   Labels <- as.vector(Labels[,col_Index])
23 |   Data <- Data[Cells_to_Keep,]
24 |   Labels <- Labels[Cells_to_Keep]
25 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
26 |     GenesOrder = read.csv(GeneOrderPath)
27 |   }
28 |   
29 |   #############################################################################
30 |   #                               SingleR                                     #
31 |   #############################################################################
32 |   library(SingleR)
33 |   library(Seurat)
34 |   True_Labels_SingleR <- list()
35 |   Pred_Labels_SingleR <- list()
36 |   Total_Time_SingleR <- list()
37 |   Data = t(as.matrix(Data))
38 |   
39 |   for (i in c(1:n_folds)){
40 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
41 |       start_time <- Sys.time()
42 |       singler = SingleR(method = "single", Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], 
43 |                         Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], 
44 |                         Labels[Train_Idx[[i]]], numCores = 1)
45 |       end_time <- Sys.time()
46 |     }
47 |     else{
48 |       start_time <- Sys.time()
49 |       singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1)
50 |       end_time <- Sys.time()
51 |     }
52 |     Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
53 |     
54 |     True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])
55 |     Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))
56 |   }
57 |   True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))
58 |   Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))
59 |   Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))
60 |   
61 |   setwd(OutputDir)
62 |   
63 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
64 |     write.csv(True_Labels_SingleR,paste('SingleR_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
65 |     write.csv(Pred_Labels_SingleR,paste('SingleR_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
66 |     write.csv(Total_Time_SingleR,paste('SingleR_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)
67 |   }
68 |   else{
69 |     write.csv(True_Labels_SingleR,'SingleR_True_Labels.csv',row.names = FALSE)
70 |     write.csv(Pred_Labels_SingleR,'SingleR_Pred_Labels.csv',row.names = FALSE)
71 |     write.csv(Total_Time_SingleR,'SingleR_Total_Time.csv',row.names = FALSE)
72 |   }
73 | }
74 | 


--------------------------------------------------------------------------------
/Scripts/run_kNN50.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.neighbors import KNeighborsClassifier
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 10 |     '''
 11 |     run baseline classifiers: kNN
 12 |     Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     '''
 26 |         
 27 |     # read the Rdata file
 28 |     robjects.r['load'](CV_RDataPath)
 29 | 
 30 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 31 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 32 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 33 |     col = col - 1 
 34 |     test_ind = np.array(robjects.r['Test_Idx'])
 35 |     train_ind = np.array(robjects.r['Train_Idx'])
 36 | 
 37 |     # read the data
 38 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 39 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 40 |     
 41 |     labels = labels.iloc[tokeep]
 42 |     data = data.iloc[tokeep]
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 |     
 48 |     # folder with results
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 |         
 54 |     Classifier = KNeighborsClassifier(n_neighbors=50)
 55 |             
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 |         
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 |     
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 |             
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 |             
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 |                     
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 |             
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 |                 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 |         
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 |         
 92 |     if (NumGenes == 0):  
 93 |         truelab.to_csv("kNN50_True_Labels.csv", index = False)
 94 |         pred.to_csv("kNN50_Pred_Labels.csv", index = False)
 95 |         tr_time.to_csv("kNN50_Training_Time.csv", index = False)
 96 |         ts_time.to_csv("kNN50_Testing_Time.csv", index = False)
 97 |     else:
 98 |         truelab.to_csv("kNN50_" + str(NumGenes) + "_True_Labels.csv", index = False)
 99 |         pred.to_csv("kNN50_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
100 |         tr_time.to_csv("kNN50_" + str(NumGenes) + "_Training_Time.csv", index = False)
101 |         ts_time.to_csv("kNN50_" + str(NumGenes) + "_Testing_Time.csv", index = False)
102 | 
103 |     
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/Scripts/run_kNN9.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | from sklearn.neighbors import KNeighborsClassifier
  6 | import rpy2.robjects as robjects
  7 | 
  8 | 
  9 | def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 10 |     '''
 11 |     run baseline classifiers: kNN
 12 |     Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation,
 13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 14 |   
 15 |     Parameters
 16 |     ----------
 17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 18 |     as row names and gene names as column names.
 19 |     LabelsPath : Cell population annotations file path (.csv).
 20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 21 |     OutputDir : Output directory defining the path of the exported file.
 22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 23 |     defining the genes order for each cross validation fold, default is NULL.
 24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 25 |     '''
 26 |         
 27 |     # read the Rdata file
 28 |     robjects.r['load'](CV_RDataPath)
 29 | 
 30 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 31 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 32 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 33 |     col = col - 1 
 34 |     test_ind = np.array(robjects.r['Test_Idx'])
 35 |     train_ind = np.array(robjects.r['Train_Idx'])
 36 | 
 37 |     # read the data
 38 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 39 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 40 |     
 41 |     labels = labels.iloc[tokeep]
 42 |     data = data.iloc[tokeep]
 43 |     
 44 |     # read the feature file
 45 |     if (NumGenes > 0):
 46 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 47 |     
 48 |     # folder with results
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 |         
 54 |     Classifier = KNeighborsClassifier(n_neighbors=9)
 55 |             
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 |         
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 |     
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 |             
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 |             
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 |                     
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 |             
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 |                 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 |         
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 |         
 92 |     if (NumGenes == 0):  
 93 |         truelab.to_csv("kNN9_True_Labels.csv", index = False)
 94 |         pred.to_csv("kNN9_Pred_Labels.csv", index = False)
 95 |         tr_time.to_csv("kNN9_Training_Time.csv", index = False)
 96 |         ts_time.to_csv("kNN9_Testing_Time.csv", index = False)
 97 |     else:
 98 |         truelab.to_csv("kNN9_" + str(NumGenes) + "_True_Labels.csv", index = False)
 99 |         pred.to_csv("kNN9_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
100 |         tr_time.to_csv("kNN9_" + str(NumGenes) + "_Training_Time.csv", index = False)
101 |         ts_time.to_csv("kNN9_" + str(NumGenes) + "_Testing_Time.csv", index = False)
102 | 
103 |     
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/Scripts/run_moana.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | import numpy as np
 4 | from moana.core import ExpMatrix
 5 | from moana.classify import CellTypeClassifier
 6 | import time as tm
 7 | import rpy2.robjects as robjects
 8 | 
 9 | def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
10 |     '''
11 |     run moana
12 |     Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.  
14 |   
15 |     Parameters
16 |     ----------
17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
18 |     as row names and gene names as column names.
19 |     LabelsPath : Cell population annotations file path (.csv).
20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
21 |     ClassifierPath : Data file path to the pretrained classifier.
22 |     OutputDir : Output directory defining the path of the exported file.
23 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
24 |     defining the genes order for each cross validation fold, default is NULL.
25 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
26 |     '''
27 |     
28 | #    # read the Rdata file
29 | #    robjects.r['load'](CV_RDataPath)
30 | #
31 | #    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
32 | #    col = np.array(robjects.r['col_Index'], dtype = 'int')
33 | #    col = col - 1
34 |     
35 |     matrix = ExpMatrix.read_tsv(DataPath, sep = ',')    
36 | #    matrix = matrix.iloc[tokeep] 
37 |     
38 |     truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')
39 | #    truelab = truelab.iloc[tokeep]
40 |     
41 |     ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
42 |     ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']
43 |     
44 |     tokeep2 = np.isin(truelab,ct_old)
45 |     truelab = truelab[tokeep2]
46 |     print(len(truelab))
47 |     matrix = matrix.iloc[np.squeeze(tokeep2)]
48 |     
49 |     for i in range(len(ct_old)):
50 |         truelab.iloc[truelab == ct_old[i]] = ct_new[i]
51 |         
52 |     # read the feature file
53 |     if (NumGenes > 0):
54 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
55 |         feat_to_use = features.iloc[0:NumGenes,0]
56 |         matrix = matrix.iloc[:,feat_to_use]
57 | 
58 |     data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)
59 |     data.genes.name = 'Genes'
60 |     data.cells.name = 'Cells'
61 |     data.index.name = 'Genes'
62 |     data.columns.name = 'Cells'
63 |     
64 |     clf = CellTypeClassifier.read_pickle(ClassifierPath)
65 |     
66 |     start = tm.time()
67 |     predictions = clf.predict(data)
68 |     runtime = tm.time() - start
69 |     
70 |     np.asarray(predictions)
71 |     
72 |     pred = pd.DataFrame(predictions)
73 |         
74 |     os.chdir(OutputDir)
75 |             
76 |     if (NumGenes == 0):  
77 |         truelab.to_csv("moana_True_Labels.csv", index = False)
78 |         pred.to_csv("moana_Pred_Labels.csv", index = False)
79 |         with open("moana_Total_Time.csv", 'w') as f:
80 |             f.write("%f\n" % runtime)
81 |     else:
82 |         truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False)
83 |         pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
84 |         with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
85 |             f.write("%f\n" % runtime)
86 | 
87 | 
88 |         
89 |     
90 |     
91 |     
92 |     
93 |     
94 |     
95 | 


--------------------------------------------------------------------------------
/Scripts/run_scID.R:
--------------------------------------------------------------------------------
 1 | run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 2 |   "
 3 |   run scID
 4 |   Wrapper script to run scID on a benchmark dataset with 5-fold cross validation,
 5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 6 |   
 7 |   Parameters
 8 |   ----------
 9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
10 |   as row names and gene names as column names.
11 |   LabelsPath : Cell population annotations file path (.csv).
12 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
13 |   OutputDir : Output directory defining the path of the exported file.
14 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
15 |   defining the genes order for each cross validation fold, default is NULL.
16 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
17 |   "
18 |   
19 |   Data <- read.csv(DataPath,row.names = 1)
20 |   Labels <- as.matrix(read.csv(LabelsPath))
21 |   load(CV_RDataPath)
22 |   Labels <- as.vector(Labels[,col_Index])
23 |   Data <- Data[Cells_to_Keep,]
24 |   Labels <- Labels[Cells_to_Keep]
25 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
26 |     GenesOrder = read.csv(GeneOrderPath)
27 |   }
28 |   
29 |   #############################################################################
30 |   #                                 scID                                      #
31 |   #############################################################################
32 |   library(scID)
33 |   library(Seurat)
34 |   True_Labels_scID <- list()
35 |   Pred_Labels_scID <- list()
36 |   Total_Time_scID <- list()
37 |   Data = t(as.matrix(Data))
38 |   
39 |   for (i in c(1:n_folds)){
40 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
41 |       Train_Labels <- list(Labels[Train_Idx[[i]]])
42 |       names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]])
43 |       start_time <- Sys.time()
44 |       scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], 
45 |                                      Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], 
46 |                                      Train_Labels[[1]])
47 |       end_time <- Sys.time()
48 |     }
49 |     else{
50 |       Train_Labels <- list(Labels[Train_Idx[[i]]])
51 |       names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]])
52 |       start_time <- Sys.time()
53 |       scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]])
54 |       end_time <- Sys.time()
55 |     }
56 |     Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
57 |     
58 |     True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]])
59 |     Pred_Labels_scID[i] <- list(as.vector(scID_output$labels))
60 |   }
61 |   True_Labels_scID <- as.vector(unlist(True_Labels_scID))
62 |   Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))
63 |   Total_Time_scID <- as.vector(unlist(Total_Time_scID))
64 |   
65 |   setwd(OutputDir)
66 |   
67 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
68 |     write.csv(True_Labels_scID,paste('scID_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
69 |     write.csv(Pred_Labels_scID,paste('scID_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
70 |     write.csv(Total_Time_scID,paste('scID_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)
71 |   }
72 |   else{
73 |     write.csv(True_Labels_scID,'scID_True_Labels.csv',row.names = FALSE)
74 |     write.csv(Pred_Labels_scID,'scID_Pred_Labels.csv',row.names = FALSE)
75 |     write.csv(Total_Time_scID,'scID_Total_Time.csv',row.names = FALSE)
76 |   }
77 | }
78 | 


--------------------------------------------------------------------------------
/Scripts/run_scPred.R:
--------------------------------------------------------------------------------
  1 | run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  2 |   "
  3 |   run scPred
  4 |   Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation,
  5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
  6 |   
  7 |   Parameters
  8 |   ----------
  9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 10 |   as row names and gene names as column names.
 11 |   LabelsPath : Cell population annotations file path (.csv).
 12 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 13 |   OutputDir : Output directory defining the path of the exported file.
 14 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 15 |   defining the genes order for each cross validation fold, default is NULL.
 16 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
 17 |   "
 18 |   
 19 |   Data <- read.csv(DataPath,row.names = 1)
 20 |   Labels <- as.matrix(read.csv(LabelsPath))
 21 |   load(CV_RDataPath)
 22 |   Labels <- as.vector(Labels[,col_Index])
 23 |   Data <- Data[Cells_to_Keep,]
 24 |   Labels <- Labels[Cells_to_Keep]
 25 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 26 |     GenesOrder = read.csv(GeneOrderPath)
 27 |   }
 28 |   
 29 |   #############################################################################
 30 |   #                                scPred                                     #
 31 |   #############################################################################
 32 |   library(scPred)
 33 |   library(tidyverse)
 34 |   library(SingleCellExperiment)
 35 |   True_Labels_scPred <- list()
 36 |   Pred_Labels_scPred <- list()
 37 |   Training_Time_scPred <- list()
 38 |   Testing_Time_scPred <- list()
 39 |   Data = t(as.matrix(Data))
 40 |   
 41 |   for (i in c(1:n_folds)){
 42 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 43 |       sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 
 44 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 45 |       sce_counts <- normcounts(sce)
 46 |       sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)
 47 |       sce_metadata <- as.data.frame(colData(sce))
 48 |       
 49 |       sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 
 50 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 51 |       sce_counts_test <- normcounts(sce_test)
 52 |       sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)
 53 |       sce_metadata_test <- as.data.frame(colData(sce_test))
 54 |     }
 55 |     else{
 56 |       sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 
 57 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 58 |       sce_counts <- normcounts(sce)
 59 |       sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)
 60 |       sce_metadata <- as.data.frame(colData(sce))
 61 |       
 62 |       sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 
 63 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 64 |       sce_counts_test <- normcounts(sce_test)
 65 |       sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)
 66 |       sce_metadata_test <- as.data.frame(colData(sce_test))
 67 |     }
 68 |     
 69 |     
 70 |     # scPred Training    
 71 |     start_time <- Sys.time()
 72 |     set.seed(1234)
 73 |     scp <- eigenDecompose(sce_cpm)
 74 |     scPred::metadata(scp) <- sce_metadata
 75 |     scp <- getFeatureSpace(scp, pVar = 'cell_type1')
 76 |     # plotEigen(scp, group = 'cell_type1')
 77 |     scp <- trainModel(scp)
 78 |     # plotTrainProbs(scp)
 79 |     end_time <- Sys.time()
 80 |     Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 81 |     
 82 |     # scPred Prediction
 83 |     start_time <- Sys.time()
 84 |     scp <- scPredict(scp,newData = sce_cpm_test)
 85 |     end_time <- Sys.time()
 86 |     Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 87 |     
 88 |     True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]])
 89 |     Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass)
 90 |   }
 91 |   True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))
 92 |   Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))
 93 |   Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))
 94 |   Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))
 95 |   
 96 |   setwd(OutputDir)
 97 |   
 98 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 99 |     write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
100 |     write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
101 |     write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)
102 |     write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)
103 |   }
104 |   else{
105 |     write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE)
106 |     write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE)
107 |     write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE)
108 |     write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE)
109 |   }
110 | }
111 | 


--------------------------------------------------------------------------------
/Scripts/run_scVI.py:
--------------------------------------------------------------------------------
  1 | from scvi.dataset import CsvDataset
  2 | import os
  3 | import numpy as np
  4 | import pandas as pd
  5 | from scvi.models import SCANVI
  6 | from scvi.inference import SemiSupervisedTrainer
  7 | import time as tm
  8 | import rpy2.robjects as robjects
  9 | 
 10 | def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 11 |     '''
 12 |     run scVI
 13 |     Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation,
 14 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 15 |   
 16 |     Parameters
 17 |     ----------
 18 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 19 |     as row names and gene names as column names.
 20 |     LabelsPath : Cell population annotations file path (.csv).
 21 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 22 |     OutputDir : Output directory defining the path of the exported file.
 23 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 24 |     defining the genes order for each cross validation fold, default is NULL.
 25 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 26 |     '''
 27 |     
 28 |     # read the Rdata file
 29 |     robjects.r['load'](CV_RDataPath)
 30 | 
 31 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 32 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 33 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 34 |     col = col - 1 
 35 |     test_ind = np.array(robjects.r['Test_Idx'])
 36 |     train_ind = np.array(robjects.r['Train_Idx'])
 37 | 
 38 |     # read the data
 39 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 40 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 41 | 
 42 |     labels = labels.iloc[tokeep]
 43 |     data = data.iloc[tokeep] 
 44 |     
 45 |     # read the feature file
 46 |     if (NumGenes > 0):
 47 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 48 |         
 49 |     os.chdir(OutputDir)
 50 |     
 51 |     if (NumGenes == 0):
 52 |         #save labels as csv file with header and index column
 53 |         labels.to_csv('Labels_scvi.csv')
 54 |         data.to_csv('Data_scvi.csv')    
 55 |         
 56 |         train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)
 57 |         
 58 |         ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing
 59 |         scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
 60 |         trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)
 61 |     
 62 |     n_epochs = 200
 63 |     
 64 |     truelab = []
 65 |     pred = []
 66 |     tr_time = []
 67 |     ts_time = []
 68 |     
 69 |     for i in range(np.squeeze(nfolds)):
 70 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 71 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 72 |         
 73 |         if (NumGenes > 0):
 74 |             feat_to_use = features.iloc[0:NumGenes,i]
 75 |             data2 = data.iloc[:,feat_to_use]
 76 |             
 77 |             labels.to_csv('Labels_scvi.csv')
 78 |             data2.to_csv('Data_scvi.csv')    
 79 |             
 80 |             train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)
 81 |             
 82 |             ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing
 83 |             scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
 84 |             trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)
 85 | 
 86 |         trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
 87 |         trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
 88 |         trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
 89 |         trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']
 90 |     
 91 |         start = tm.time()
 92 |         trainer_scanvi.train(n_epochs)
 93 |         tr_time.append(tm.time()-start)
 94 |     
 95 |         ## labels of test set are in y_pred
 96 |         ## labels are returned in numbers, should be mapped back to the real labels
 97 |         ## indices are permutated
 98 |         start = tm.time()
 99 |         y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
100 |         ts_time.append(tm.time()-start)
101 |         
102 |         truelab.extend(y_true)
103 |         pred.extend(y_pred)
104 |     
105 |     #write results
106 |     
107 |     truelab = pd.DataFrame(truelab)
108 |     pred = pd.DataFrame(pred)
109 |     
110 |     tr_time = pd.DataFrame(tr_time)
111 |     ts_time = pd.DataFrame(ts_time)
112 | 
113 |     
114 |     if (NumGenes == 0):  
115 |         truelab.to_csv("scVI_True_Labels.csv", index = False)
116 |         pred.to_csv("scVI_Pred_Labels.csv", index = False)
117 |         tr_time.to_csv("scVI_Training_Time.csv", index = False)
118 |         ts_time.to_csv("scVI_Testing_Time.csv", index = False)
119 |     else:
120 |         truelab.to_csv("scVI_" + str(NumGenes) + "_True_Labels.csv", index = False)
121 |         pred.to_csv("scVI_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
122 |         tr_time.to_csv("scVI_" + str(NumGenes) + "_Training_Time.csv", index = False)
123 |         ts_time.to_csv("scVI_" + str(NumGenes) + "_Testing_Time.csv", index = False)
124 |         
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/Scripts/run_singleCellNet.R:
--------------------------------------------------------------------------------
 1 | run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 2 |   "
 3 |   run singleCellNet
 4 |   Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation,
 5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 6 |   
 7 |   Parameters
 8 |   ----------
 9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
10 |   as row names and gene names as column names.
11 |   LabelsPath : Cell population annotations file path (.csv).
12 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
13 |   OutputDir : Output directory defining the path of the exported file.
14 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
15 |   defining the genes order for each cross validation fold, default is NULL.
16 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
17 |   "
18 |   
19 |   Data <- read.csv(DataPath,row.names = 1)
20 |   colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE)
21 |   Labels <- as.matrix(read.csv(LabelsPath))
22 |   load(CV_RDataPath)
23 |   Labels <- as.vector(Labels[,col_Index])
24 |   Data <- Data[Cells_to_Keep,]
25 |   Labels <- Labels[Cells_to_Keep]
26 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
27 |     GenesOrder = read.csv(GeneOrderPath)
28 |   }
29 |   
30 |   #############################################################################
31 |   #                              singleCellNet                                #
32 |   #############################################################################
33 |   library(singleCellNet)
34 |   library(dplyr)
35 |   True_Labels_singleCellNet <- list()
36 |   Pred_Labels_singleCellNet <- list()
37 |   Training_Time_singleCellNet <- list()
38 |   Testing_Time_singleCellNet <- list()
39 |   Data = t(as.matrix(Data))              # deals also with sparse matrix
40 |   
41 |   for(i in c(1:n_folds)){
42 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
43 |       DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]
44 |       DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]
45 |     }
46 |     else{
47 |       DataTrain <- Data[,Train_Idx[[i]]]
48 |       DataTest <- Data[,Test_Idx[[i]]]
49 |     }
50 |     
51 |     start_time <- Sys.time()
52 |     cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation")
53 |     cgenesA<-cgenes2[['cgenes']]
54 |     grps<-cgenes2[['grps']]
55 |     DataTrain<-as.matrix(DataTrain[cgenesA,])
56 |     xpairs<-ptGetTop(DataTrain, grps, ncores = 1)
57 |     pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs)
58 |     rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps)
59 |     end_time <- Sys.time()
60 |     Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
61 |     
62 |     start_time <- Sys.time()
63 |     DataTest<-query_transform(DataTest[cgenesA,], xpairs)
64 |     classRes <-rf_classPredict(rf, DataTest)
65 |     end_time <- Sys.time()
66 |     Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
67 |     
68 |     True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]])
69 |     Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])])
70 |   }
71 |   True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))
72 |   Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))
73 |   Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))
74 |   Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))
75 |   
76 |   setwd(OutputDir)
77 |   
78 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
79 |     write.csv(True_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
80 |     write.csv(Pred_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
81 |     write.csv(Training_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)
82 |     write.csv(Testing_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)
83 |   }
84 |   else{
85 |     write.csv(True_Labels_singleCellNet,'singleCellNet_True_Labels.csv',row.names = FALSE)
86 |     write.csv(Pred_Labels_singleCellNet,'singleCellNet_Pred_Labels.csv',row.names = FALSE)
87 |     write.csv(Training_Time_singleCellNet,'singleCellNet_Training_Time.csv',row.names = FALSE)
88 |     write.csv(Testing_Time_singleCellNet,'singleCellNet_Testing_Time.csv',row.names = FALSE)
89 |   }
90 | }
91 | 


--------------------------------------------------------------------------------
/Snakemake/Cross_Validation.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
 4 |   "
 5 |   Cross_Validation
 6 |   Function returns train and test indices for 5 folds stratified across unique cell populations,
 7 |   also filter out cell populations with less than 10 cells.
 8 |   It return a 'CV_folds.RData' file which then used as input to classifiers wrappers.
 9 | 
10 |   Parameters
11 |   ----------
12 |   LabelsPath : Cell population annotations file path (.csv).
13 |   col_Index : column index (integer) defining which level of annotation to use,
14 |   in case of multiple cell type annotations (default is 1)
15 |   OutputDir : Output directory defining the path of the exported file.
16 |   "
17 | 
18 |   Labels <- as.matrix(read.csv(LabelsPath))
19 |   Labels <- as.vector(Labels[,col_Index])
20 | 
21 |   Removed_classes <- !(table(Labels) > 10)
22 |   Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes]))
23 |   Labels <- Labels[Cells_to_Keep]
24 | 
25 |   # Getting training and testing Folds
26 |   library(rBayesianOptimization)
27 |   n_folds = 5
28 |   Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE)
29 |   Test_Folds <- c(n_folds:1)
30 |   Train_Idx <- list()
31 |   Test_Idx <- list()
32 |   for (i in c(1:length(Folds))){
33 |     Temp_Folds <- Folds
34 |     Temp_Folds[Test_Folds[i]] <- NULL
35 |     Train_Idx[i] <- list(unlist(Temp_Folds))
36 |     Test_Idx[i] <- Folds[Test_Folds[i]]
37 |   }
38 |   remove(Temp_Folds,i,Folds)
39 |   save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = paste0(OutputDir, '/CV_folds.RData'))
40 | }
41 | 
42 | Cross_Validation(args[1], as.numeric(args[2]), args[3])
43 | 


--------------------------------------------------------------------------------
/Snakemake/DEgenesMAST.R:
--------------------------------------------------------------------------------
 1 | DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){
 2 |   # This functions applies a differential expression test to the data using one vs all
 3 |   # The training data should be used a an input
 4 |   # The output is a matrix with marker genes where the columns are the cell populations and the rows are the top20 marker genes
 5 |   # This output can be rewritten to the format of the prior-knowledge-supervised classifiers and afterwards be used to classify the test set.
 6 |   
 7 |   # Data: genes X cells (rows = genes, columns = cells)
 8 |   # Labels: labels of the data
 9 |   # Normalize: the input for MAST should be cpm normalized data, 
10 |   #            if the data is not normalized yet, this should be set to TRUE
11 |   # LogTransform: the input for MAST should be logtransformed,
12 |   #            if the data is not logtransformed yet, this should be set to TRUE
13 |   
14 |   
15 |   library(Seurat)
16 |   
17 |   if(Normalize)
18 |   {
19 |     Data <- apply(Data, 2, function(x) (x/sum(x))*1000000)
20 |   }
21 |   
22 |   if(LogTransform)
23 |   {
24 |     Data <- log(Data+1, base = 2)
25 |   }
26 |   SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
27 |   SeuObj <- SetIdent(SeuObj, ident.use = Labels)
28 |   DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
29 |   Markers <- matrix(nrow = 20,ncol = length(unique(Labels)))
30 |   colnames(Markers) <- unique(Labels)
31 |   for (i in unique(Labels)){
32 |     i
33 |     TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))]
34 |     MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i]
35 |     print(MarkerGenes[1:20])
36 |     if (length(TempList) >= 20){
37 |       Markers[,i] <- TempList[1:20]
38 |     }
39 |     else{
40 |       if(length(TempList) > 0){
41 |         Markers[c(1:length(TempList)),i] <- TempList
42 |       }
43 |     }
44 |   }
45 |   return(Markers)
46 | }
47 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/baseline/Dockerfile:
--------------------------------------------------------------------------------
# Image for the baseline classifier wrappers: Debian 9.9 (slim) with R from
# the CRAN stretch-cran35 repository plus Python 3 with pandas, rpy2,
# scikit-learn and statsmodels.
FROM debian:9.9-slim

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install Python 3 and pip, then the Python packages (setuptools first, in its
# own pip call, before the remaining packages).
RUN apt-get update && \
    apt-get install --no-install-recommends --yes python3 python3-pip && \
    pip3 --no-cache-dir install setuptools && \
    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \
    rm -rf /var/lib/apt/lists/*

# Copy the kNN, LDA, NMC, RF and SVM wrapper scripts and the
# rank_gene_dropouts.py script into /Scripts/.
COPY Scripts/run_kNN50.py \
     Scripts/run_kNN9.py \
     Scripts/run_LDA.py \
     Scripts/run_LDA_rejection.py \
     Scripts/run_NMC.py \
     Scripts/run_RF.py \
     Scripts/run_SVM.py \
     Scripts/run_SVM_rejection.py \
     rank_gene_dropouts.py \
     /Scripts/
32 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/cell_blast/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Cell_BLAST wrapper: Python 3.7 (slim, Debian stretch) with R
# from the CRAN stretch-cran35 repository and the Cell-BLAST Python package.
FROM python:3.7-slim-stretch

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the build toolchain and development headers, upgrade pip and
# setuptools, then install the Python packages including tensorflow and
# Cell-BLAST. The apt packages installed here are not purged afterwards.
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
    pip3 --no-cache-dir install --upgrade pip && \
    pip3 --no-cache-dir install --upgrade setuptools && \
    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow Cell-BLAST && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper script into /Scripts/.
COPY Scripts/run_Cell_BLAST.py /Scripts/
25 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/chetah/Dockerfile:
--------------------------------------------------------------------------------
# Image for the CHETAH wrapper: Debian 9.9 (slim) with R from the CRAN
# stretch-cran35 repository and the R packages listed in install_packages.R.
FROM debian:9.9-slim

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper script and the package installation script into /Scripts/.
COPY Scripts/run_CHETAH.R \
     Dockerfiles/chetah/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the same build dependencies again so they do
# not remain in the layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
27 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/chetah/install_packages.R:
--------------------------------------------------------------------------------
# Installs devtools, BiocManager, a set of Bioconductor/CRAN packages and a
# pinned commit of CHETAH from GitHub.
# Every warning raised while installing is re-raised as an error, so the
# script stops at the first warning instead of continuing.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("devtools", repos = cran_repo)
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(c(
      "bioDist", "ggplot2", "gplots", "cowplot",
      "dendextend", "corrplot", "reshape2", "plotly"
    ))
    devtools::install_github(
      "jdekanter/CHETAH",
      ref = "b777e6f671bff3c434842adb655869a52bc9e368"
    )
  },
  warning = abort_on_warning
)
9 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/cross_validation/Dockerfile:
--------------------------------------------------------------------------------
# Image for Cross_Validation.R: Debian 9.9 (slim) with R from the CRAN
# stretch-cran35 repository and the R packages listed in install_packages.R.
FROM debian:9.9-slim

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the cross validation script and the package installation script into
# /Scripts/.
COPY Cross_Validation.R \
     Dockerfiles/cross_validation/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the build dependencies again.
# NOTE(review): libc6-dev is installed here but is not in the purge list.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
27 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/cross_validation/install_packages.R:
--------------------------------------------------------------------------------
# Installs lhs and then rBayesianOptimization from CRAN.
# Every warning raised while installing is re-raised as an error, so the
# script stops at the first warning instead of continuing.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("lhs", repos = cran_repo)
    install.packages("rBayesianOptimization", repos = cran_repo)
  },
  warning = abort_on_warning
)
6 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/garnett/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Garnett wrappers: Debian 9.9 (slim) with R from the CRAN
# stretch-cran35 repository and the R packages listed in install_packages.R.
FROM debian:9.9-slim

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy both Garnett wrapper scripts and the package installation script into
# /Scripts/.
COPY Scripts/run_Garnett_CV.R \
     Scripts/run_Garnett_Pretrained.R \
     Dockerfiles/garnett/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the build dependencies again.
# NOTE(review): libxml2-dev is installed here but is not in the purge list.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
28 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/garnett/install_packages.R:
--------------------------------------------------------------------------------
# Installs BiocManager, the Bioconductor packages listed below, devtools and
# a pinned commit of garnett from GitHub.
# Every warning raised while installing is re-raised as an error, so the
# script stops at the first warning instead of continuing.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(c(
      "monocle", "DelayedArray", "DelayedMatrixStats",
      "org.Hs.eg.db", "org.Mm.eg.db"
    ))
    install.packages("devtools", repos = cran_repo)
    devtools::install_github(
      "cole-trapnell-lab/garnett",
      ref = "9804b532bbcc1714b3ed0b718cf430741f1dba6c"
    )
  },
  warning = abort_on_warning
)
9 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/scid/Dockerfile:
--------------------------------------------------------------------------------
# Image for the scID wrapper, built on the r-base 3.6.0 image with the R
# packages listed in install_packages.R.
FROM r-base:3.6.0

# Copy the wrapper script and the package installation script into /Scripts/.
COPY Scripts/run_scID.R \
     Dockerfiles/scid/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the same build dependencies again so they do
# not remain in the layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
15 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/scid/install_packages.R:
--------------------------------------------------------------------------------
 1 | withCallingHandlers({
 2 |   install.packages("BiocManager", repos="https://cloud.r-project.org/")
 3 |   BiocManager::install(ask = FALSE);
 4 |   BiocManager::install(c("scater", "MAST"))
 5 |   install.packages("devtools", repos="https://cloud.r-project.org/")
 6 |   devtools::install_github("satijalab/seurat")
 7 |   devtools::install_github("BatadaLab/scID")
 8 | },
 9 | warning = function(w) stop(w))
10 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/scmap/Dockerfile:
--------------------------------------------------------------------------------
# Image for the scmap wrappers (scmapcell and scmapcluster), built on the
# r-base 3.6.0 image with the R packages listed in install_packages.R.
FROM r-base:3.6.0

# Copy both scmap wrapper scripts and the package installation script into
# /Scripts/.
COPY Scripts/run_scmapcell.R \
     Scripts/run_scmapcluster.R \
     Dockerfiles/scmap/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the same build dependencies again so they do
# not remain in the layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
16 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/scmap/install_packages.R:
--------------------------------------------------------------------------------
# Installs BiocManager, updates the installed Bioconductor packages without
# prompting, then installs SingleCellExperiment, devtools, and scmap from
# GitHub.
# Every warning raised while installing is re-raised as an error, so the
# script stops at the first warning instead of continuing.
# NOTE(review): scmap is installed without a pinned `ref`, so the installed
# version depends on when the image is built.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(ask = FALSE)
    BiocManager::install("SingleCellExperiment")
    install.packages("devtools", repos = cran_repo)
    devtools::install_github("hemberg-lab/scmap")
  },
  warning = abort_on_warning
)
9 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/scvi/Dockerfile:
--------------------------------------------------------------------------------
# Image for the scVI wrapper: Python 3.7 (slim, Debian stretch) with R from
# the CRAN stretch-cran35 repository and the scvi Python package.
FROM python:3.7-slim-stretch

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the build toolchain and development headers, upgrade pip and
# setuptools, then install the Python packages including tensorflow and scvi.
# The apt packages installed here are not purged afterwards.
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
    pip3 --no-cache-dir install --upgrade pip && \
    pip3 --no-cache-dir install --upgrade setuptools && \
    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow scvi && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


# Copy the wrapper script into /Scripts/.
COPY Scripts/run_scVI.py /Scripts/
26 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/singlecellnet/Dockerfile:
--------------------------------------------------------------------------------
# Image for the singleCellNet wrapper: Debian 9.9 (slim) with R from the CRAN
# stretch-cran35 repository and the R packages listed in install_packages.R.
FROM debian:9.9-slim

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper script and the package installation script into /Scripts/.
COPY Scripts/run_singleCellNet.R \
     Dockerfiles/singlecellnet/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the same build dependencies again so they do
# not remain in the layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
27 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/singlecellnet/install_packages.R:
--------------------------------------------------------------------------------
# Installs devtools, BiocManager, fgsea, and pinned commits of patchwork and
# singleCellNet from GitHub.
# Every warning raised while installing is re-raised as an error, so the
# script stops at the first warning instead of continuing.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("devtools", repos = cran_repo)
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install("fgsea")
    devtools::install_github(
      "thomasp85/patchwork",
      ref = "fd7958bae3e7a1e30237c751952e412a0a1d1242"
    )
    devtools::install_github(
      "pcahan1/singleCellNet",
      ref = "4279a68112743b783cc82628421dd703261ec117"
    )
  },
  warning = abort_on_warning
)
9 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/singler/Dockerfile:
--------------------------------------------------------------------------------
# Image for the SingleR wrapper: Debian 9.9 (slim) with R from the CRAN
# stretch-cran35 repository and the R packages listed in install_packages.R.
FROM debian:9.9-slim

# Install R from the CRAN Debian repository: register the repository signing
# key, add the stretch-cran35 apt source, install r-base, then purge the
# download helpers and the apt lists in the same layer.
# NOTE(review): the signing key is fetched over plain HTTP from keys.gnupg.net.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper script and the package installation script into /Scripts/.
COPY Scripts/run_SingleR.R \
     Dockerfiles/singler/install_packages.R \
     /Scripts/

# Install R packages: add the compilers and development headers, run
# install_packages.R, then purge the build dependencies again. libxml2 is
# installed explicitly and is not in the purge list, so it stays in the image.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
26 | 


--------------------------------------------------------------------------------
/Snakemake/Dockerfiles/singler/install_packages.R:
--------------------------------------------------------------------------------
# Installs devtools and Seurat from CRAN and a pinned commit of SingleR from
# GitHub.
# Every warning raised while installing is re-raised as an error, so the
# script stops at the first warning instead of continuing.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("devtools", repos = cran_repo)
    install.packages("Seurat", repos = cran_repo)
    devtools::install_github(
      "dviraran/SingleR",
      ref = "db4823b380ba2c3142c857c8c0695200dd1736f6"
    )
  },
  warning = abort_on_warning
)
7 | 


--------------------------------------------------------------------------------
/Snakemake/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 tabdelaal
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Snakemake/README.md:
--------------------------------------------------------------------------------
  1 | # scRNAseq_Benchmark
  2 | Benchmarking classification tools for scRNA-seq data
  3 | 
  4 | ## How to use
  5 | [snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and
  6 | [singularity](https://www.sylabs.io/docs/) need to be available on your 
  7 | system. You will need to run this on a linux system, as singularity
  8 | only supports linux.
  9 | 
 10 | From the root of this repository:
 11 | ```
 12 | snakemake \
 13 |   --configfile <configfile> \
 14 |   --use-singularity
 15 | ```
 16 | 
 17 | If your data or output directory is not located under the root of this
 18 | repository, be sure to tell snakemake to mount the appropriate directories
 19 | in singularity:
 20 | ```
 21 | snakemake \
 22 |   --configfile <configfile> \
 23 |   --use-singularity \
 24 |   --singularity-args '--bind <location of inputs>:<location of inputs> --bind <output directory>:<output directory>'
 25 | ```
 26 | 
 27 | #### The config file
 28 | ```YML
 29 | output_dir: <path to outputs directory>
 30 | datafile: <path to csv file with counts per cell>
 31 | labfile: <csv with true labels per cell>
 32 | column: <The index of the column in the labels file which ought to be used, defaults to 1>
 33 | number_of_features: <number of features to be used as input for the classification methods, 0 means all, defaults to 0>
 34 | genes: <path to gene name list, only needed for Garnett_CV and Garnett_Pretrained>
 35 | human: <whether or not the data is human, true means human, false means mouse, defaults to true>
 36 | tools_to_run: # List of tools to run
 37 |   - <tool 1>
 38 |   - <tool 2>
 39 |   - <...>
 40 | ```
 41 | 
 42 | ##### Tool specific inputs
 43 | Some tools require specific inputs. Add the following to your config file when
 44 | using one of these tools:
 45 | - Garnett_CV
 46 |   ```YML
 47 |   Garnett_CV:
 48 |     markers: <path to Garnett marker gene file>
 49 |   ```
 50 | - Garnett_Pretrained
 51 |   ```YML
 52 |   Garnett_Pretrained:
 53 |     classifier: <path to Garnett classifier>
 54 |   ```
 55 | 
 56 | <!-- TODO explain these input files -->
 57 | 
 58 | ## Included tools/methods
 59 | - kNN50
 60 | - kNN9
 61 | - LDA
 62 | - LDA_rejection (LDA with rejection option)
 63 | - NMC
 64 | - RF
 65 | - SVM
 66 | - SVM_rejection (SVM with rejection option)
 67 | - [singleCellNet](https://github.com/pcahan1/singleCellNet)
 68 | - [CHETAH](https://github.com/jdekanter/CHETAH)
 69 | - [scmap](https://github.com/hemberg-lab/scmap)
 70 |   - scmapcell
 71 |   - scmapcluster
 72 | - [SingleR](https://github.com/dviraran/SingleR)
 73 | - [scID](https://github.com/BatadaLab/scID)
 74 | - [scVI](https://github.com/YosefLab/scVI)
 75 | - [Cell_BLAST](https://github.com/gao-lab/Cell_BLAST)
 76 | - [Garnett](https://cole-trapnell-lab.github.io/garnett/)
 77 |   - Garnett_CV (without pretrained classifier)
 78 |   - Garnett_Pretrained (with pretrained classifier)
 79 | 
 80 | ## Adding new tools
 81 | In order to add a tool to this benchmarking workflow, a rule for this tool
 82 | needs to be added to the `Snakefile`. This rule should produce as output:
 83 | - a table of predicted labels (`<output directory>/<tool>/<tool>_pred.csv`).
 84 | - a table of true labels (`<output directory>/<tool>/<tool>_true.csv`).
 85 | - tables of testing, training and/or total time:
 86 |   - `<output directory>/<tool>/<tool>_test_time.csv`
 87 |   - `<output directory>/<tool>/<tool>_training_time.csv`
 88 |   - `<output directory>/<tool>/<tool>_total_time.csv`
 89 | 
 90 | The input to this rule should be:
 91 | - a count table (specified as the `datafile` in the config).
 92 | - a true labels file (specified as the `labfile` in the config).
 93 | 
 94 | You will want to write a wrapper script for the tool you want to
 95 | add to facilitate this. The `"{output_dir}/CV_folds.RData"` input may be
 96 | used to provide your wrapper script with folds for cross_validation.
 97 | It is recommended to make a docker image containing all dependencies for both
 98 | the tool and any wrappers for the tool.  
 99 | This wrapper script should also make a selection of the features to be used.
100 | This selection should be based on the feature ranking, which can be accessed by
101 | providing `feature_ranking` as input to the wrapper script. The number of features
102 | to be used should be configurable and settable through the `number_of_features`
103 | field in the config.
104 | 
105 | The following can be used as a template for new rules. Replace everything
106 | surrounded by (and including the) `<>` with appropriate values.
107 | ```
108 | rule <tool name>:
109 |   input:
110 |     datafile = config["datafile"],
111 |     labfile = config["labfile"],
112 |     folds = "{output_dir}/CV_folds.RData",
113 |     ranking = feature_ranking
114 |   output:
115 |     pred = "{output_dir}/<tool name>/<tool name>_pred.csv",
116 |     true = "{output_dir}/<tool name>/<tool name>_true.csv",
117 |     test_time = "{output_dir}/<tool name>/<tool name>_test_time.csv",
118 |     training_time = "{output_dir}/<tool name>/<tool name>_training_time.csv"
119 |   log: "{output_dir}/<tool name>/<tool name>.log"
120 |   params:
121 |     n_features = config.get("number_of_features", 0)
122 |   singularity: "docker://<docker image>"
123 |   shell:
124 |     "<python or Rscript> <wrapper script> "
125 |     "{input.datafile} "
126 |     "{input.labfile} "
127 |     "{input.folds} "
128 |     "{wildcards.output_dir}/<tool name> "
129 |     "{input.ranking} "
130 |     "{params.n_features} "
131 |     "&> {log}"
132 | ```
133 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_ACTINN.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time as tm
  5 | import rpy2.robjects as robjects
  6 | 
  7 | def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
  8 |     '''
  9 |     run ACTINN
 10 |     Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,
 11 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 12 |   
 13 |     Parameters
 14 |     ----------
 15 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 16 |     as row names and gene names as column names.
 17 |     LabelsPath : Cell population annotations file path (.csv).
 18 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 19 |     OutputDir : Output directory defining the path of the exported file.
 20 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 21 |     defining the genes order for each cross validation fold, default is NULL.
 22 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 23 |     '''
 24 |     
 25 |     # read the Rdata file
 26 |     robjects.r['load'](CV_RDataPath)
 27 | 
 28 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 29 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 30 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 31 |     col = col - 1 
 32 |     test_ind = np.array(robjects.r['Test_Idx'])
 33 |     train_ind = np.array(robjects.r['Train_Idx'])
 34 | 
 35 |     # read the data
 36 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 37 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 38 |     
 39 |     labels = labels.iloc[tokeep]
 40 |     data = data.iloc[tokeep]
 41 |     
 42 |     # read the feature file
 43 |     if (NumGenes > 0):
 44 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 45 |     
 46 |     # folder with results
 47 |     os.chdir(OutputDir)
 48 |     
 49 |     tot=[]
 50 |     truelab = []
 51 |     pred = []
 52 | 
 53 |     for i in range(np.squeeze(nfolds)):
 54 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 55 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 56 |     
 57 |         train=data.iloc[train_ind_i]
 58 |         test=data.iloc[test_ind_i]
 59 |         y_train=labels.iloc[train_ind_i]
 60 |         y_test=labels.iloc[test_ind_i]
 61 |         
 62 |         if (NumGenes > 0):
 63 |             feat_to_use = features.iloc[0:NumGenes,i]
 64 |             train = train.iloc[:,feat_to_use]
 65 |             test = test.iloc[:,feat_to_use]
 66 |         
 67 |         train = train.transpose()
 68 |         test = test.transpose()
 69 |         
 70 |         train.to_csv("train.csv")
 71 |         test.to_csv("test.csv")
 72 |         y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
 73 |         y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')
 74 |         
 75 |         tm.sleep(60)
 76 |             
 77 |         os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv")
 78 |         os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv")
 79 |         
 80 |         start = tm.time()
 81 |         os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5")    
 82 |         tot.append(tm.time()-start)
 83 |         
 84 |         tm.sleep(60)
 85 | 
 86 |         truelab.extend(y_test.values)
 87 |         predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1])            
 88 |         pred.extend(predlabels.values)
 89 |     
 90 |             
 91 |     truelab = pd.DataFrame(truelab)
 92 |     pred = pd.DataFrame(pred)
 93 |     tot_time = pd.DataFrame(tot)
 94 |     
 95 |     if (NumGenes == 0):  
 96 |         truelab.to_csv("ACTINN_True_Labels.csv", index = False)
 97 |         pred.to_csv("ACTINN_Pred_Labels.csv", index = False)
 98 |         tot_time.to_csv("ACTINN_Total_Time.csv", index = False)
 99 |     else:
100 |         truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False)
101 |         pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
102 |         tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False)
103 |         
104 |         
105 |         
106 |         
107 |         
108 |         
109 |         
110 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_CHETAH.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 4 |   "
 5 |   run CHETAH
 6 |   Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation,
 7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 8 |   
 9 |   Parameters
10 |   ----------
11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
12 |   as row names and gene names as column names.
13 |   LabelsPath : Cell population annotations file path (.csv).
14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
15 |   OutputDir : Output directory defining the path of the exported file.
16 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
17 |   defining the genes order for each cross validation fold, default is NULL.
18 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
19 |   "
20 |   
21 |   Data <- read.csv(DataPath,row.names = 1)
22 |   Labels <- as.matrix(read.csv(LabelsPath))
23 |   load(CV_RDataPath)
24 |   Labels <- as.vector(Labels[,col_Index])
25 |   Data <- Data[Cells_to_Keep,]
26 |   Labels <- Labels[Cells_to_Keep]
27 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
28 |     GenesOrder = read.csv(GeneOrderPath)
29 |   }
30 |   
31 |   #############################################################################
32 |   #                                CHETAH                                     #
33 |   #############################################################################
34 |   library(CHETAH)
35 |   library(SingleCellExperiment)
36 |   True_Labels_CHETAH <- list()
37 |   Pred_Labels_CHETAH <- list()
38 |   Total_Time_CHETAH <- list()
39 |   Data = t(as.matrix(Data))
40 |   
41 |   for (i in c(1:n_folds)){
42 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
43 |       sce <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 
44 |                                   colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
45 |       
46 |       sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 
47 |                                        colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
48 |       start_time <- Sys.time()
49 |       sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes)
50 |       end_time <- Sys.time()
51 |     }
52 |     else{
53 |       sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]), 
54 |                                   colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
55 |       
56 |       sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]), 
57 |                                        colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
58 |       start_time <- Sys.time()
59 |       sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
60 |       end_time <- Sys.time()
61 |     }
62 |     
63 |     Total_Time_CHETAH[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
64 |     
65 |     True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])
66 |     Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)
67 |   }
68 |   True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))
69 |   Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))
70 |   Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))
71 |   write.csv(True_Labels_CHETAH,paste0(OutputDir,'/CHETAH_true.csv'),row.names = FALSE)
72 |   write.csv(Pred_Labels_CHETAH,paste0(OutputDir,'/CHETAH_pred.csv'),row.names = FALSE)
73 |   write.csv(Total_Time_CHETAH,paste0(OutputDir,'/CHETAH_total_time.csv'),row.names = FALSE)
74 | }
75 | 
# Script entry point. The sixth argument is the number of genes to use; "0" (or a
# missing argument) means that no feature selection is applied.
if (length(args) < 6 || is.na(args[6]) || args[6] == "0") {
  run_CHETAH(args[1], args[2], args[3], args[4])
} else {
  run_CHETAH(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}
81 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_Cell_BLAST.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | import time as tm
  5 | import pandas as pd
import warnings
# Suppress every Python warning for the remainder of the process.
warnings.filterwarnings("ignore")

import tensorflow as tf
# NOTE(review): presumably lowers TensorFlow's log output; the meaning of level 0 should be confirmed.
tf.logging.set_verbosity(0)
 11 | 
 12 | import Cell_BLAST as cb
 13 | import numpy as np
 14 | from numpy import genfromtxt as gft
 15 | import rpy2.robjects as robjects
 16 | 
 17 | 
 18 | def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 19 |     '''
 20 |     run Cell_BLAST
 21 |     Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,
 22 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 23 |   
 24 |     Parameters
 25 |     ----------
 26 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 27 |     as row names and gene names as column names.
 28 |     LabelsPath : Cell population annotations file path (.csv).
 29 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 30 |     OutputDir : Output directory defining the path of the exported file.
 31 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 32 |     defining the genes order for each cross validation fold, default is NULL.
 33 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 34 |     '''
 35 |         
 36 |     # read the Rdata file
 37 |     robjects.r['load'](CV_RDataPath)
 38 | 
 39 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 40 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 41 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 42 |     col = col - 1 
 43 |     test_ind = np.array(robjects.r['Test_Idx'])
 44 |     train_ind = np.array(robjects.r['Train_Idx'])
 45 |     
 46 |     # read the feature file
 47 |     if (NumGenes > 0):
 48 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 49 | 
 50 |     # read the data and labels
 51 |     data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
 52 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 53 |     
 54 |     data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)
 55 | 
 56 |     labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)      
 57 |     labels = labels[tokeep]
 58 |    
 59 |     truelab = []
 60 |     pred = []
 61 |     tr_time = []
 62 |     ts_time = []
 63 |     
 64 |     for i in range(np.squeeze(nfolds)):
 65 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 66 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 67 | 
 68 |         train=data[train_ind_i,:]
 69 |         test=data[test_ind_i,:]
 70 |         y_train = labels[train_ind_i]
 71 |         y_test = labels[test_ind_i]
 72 |         
 73 |         if (NumGenes > 0):
 74 |             feat_to_use = features.iloc[0:NumGenes,i]
 75 |             train = train[:,feat_to_use]
 76 |             test = test[:,feat_to_use]
 77 | 
 78 |         
 79 |         train.obs['cell_type'] = y_train
 80 |                 
 81 |         start = tm.time()
 82 |                 
 83 |         # reduce dimensions
 84 |         num_epoch = 50
 85 |         models = []
 86 |     
 87 |         for j in range(4):
 88 |             models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))
 89 |     
 90 |         # train model
 91 |         blast = cb.blast.BLAST(models, train).build_empirical()
 92 |         tr_time.append(tm.time()-start)
 93 |         
 94 |         # predict labels
 95 |         start = tm.time()
 96 |         test_pred = blast.query(test).annotate('cell_type')
 97 |         ts_time.append(tm.time()-start)
 98 | 
 99 |         truelab.extend(y_test)
100 |         pred.extend(test_pred.values)
101 |     
102 |     #write results    
103 |     truelab = pd.DataFrame(truelab)
104 |     pred = pd.DataFrame(pred)
105 |             
106 |     tr_time = pd.DataFrame(tr_time)
107 |     ts_time = pd.DataFrame(ts_time)
108 | 
109 |     truelab.to_csv(str(Path(OutputDir+"/Cell_BLAST_true.csv")),index = False)
110 |     pred.to_csv(str(Path(OutputDir+"/Cell_BLAST_pred.csv")),index = False)
111 |     tr_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_training_time.csv")), index = False)
112 |     ts_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_test_time.csv")),index = False)
113 | 
114 | 
# Script entry point: five path arguments followed by the number of genes (integer).
run_Cell_BLAST(*argv[1:6], int(argv[6]))
116 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_DigitalCellSorter.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scripts.DigitalCellSorter as DigitalCellSorter
 4 | import os
 5 | import time as tm
 6 | import rpy2.robjects as robjects
 7 | 
 8 | def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 9 |     '''
10 |     run DigitalCellSorter
11 |     Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
12 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.  
13 |   
14 |     Parameters
15 |     ----------
16 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
17 |     as row names and gene names as column names.
18 |     LabelsPath : Cell population annotations file path (.csv).
19 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
20 |     GeneListPath : Data file path to the genest.
21 |     OutputDir : Output directory defining the path of the exported file.
22 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
23 |     defining the genes order for each cross validation fold, default is NULL.
24 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
25 |     '''
26 |         
27 |     # read the Rdata file
28 |     robjects.r['load'](CV_RDataPath)
29 | 
30 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
31 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
32 |     col = col - 1
33 |     
34 |     # read the data
35 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
36 |     data = data.iloc[tokeep]
37 |     
38 |     truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
39 |     truelab = truelab.iloc[tokeep]
40 | 
41 | 
42 |     # read the feature file
43 |     if (NumGenes > 0):
44 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
45 |         feat_to_use = features.iloc[0:NumGenes,0]
46 |         data = data.iloc[:,feat_to_use]
47 |         
48 |     data = data.transpose()
49 |     
50 |     # number of different cell types in the data?
51 |     n_clusters = 8
52 |     AvailableCPUsCount = 1
53 |     N_samples_for_distribution = 10000
54 |         
55 |     start = tm.time()
56 |     pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', 
57 |                                                 saveDir = OutputDir, 
58 |                                                 geneListFileName = GeneListPath,
59 |                                                 N_samples_for_distribution = N_samples_for_distribution,
60 |                                                 AvailableCPUsCount = AvailableCPUsCount,
61 |                                                 clusterIndex=None,
62 |                                                 clusterName=None,
63 |                                                 n_clusters=n_clusters)	
64 |     runtime = tm.time() - start 
65 |     
66 |     os.chdir(OutputDir)
67 |     
68 |     results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])
69 | 
70 |     prediction = np.zeros(np.shape(pred), dtype='>U10')
71 |     
72 |     for i in range(len(results)):
73 |     	prediction[np.where(pred == i)] = results.values[i]
74 |     
75 |     prediction = pd.DataFrame(prediction)
76 |         
77 |     if (NumGenes == 0):  
78 |         truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
79 |         prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
80 |         with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
81 |             f.write("%f\n" % runtime)
82 |     else:
83 |         truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
84 |         prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
85 |         with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
86 |             f.write("%f\n" % runtime)
87 | 
88 |             
89 | 
90 |         


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_Garnett_Pretrained.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){
 4 |   "
 5 |   run Garnett
 6 |   Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,
 7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 8 |   
 9 |   Parameters
10 |   ----------
11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
12 |   as row names and gene names as column names.
13 |   LabelsPath : Cell population annotations file path (.csv).
14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
15 |   GenesPath : Path to the file with the genenames
16 |   ClassifierPath : Path to the pretrained classifier
17 |   OutputDir : Output directory defining the path of the exported file.
18 |   Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
19 |   "
20 |   # load needed libraries
21 |   library(garnett)
22 |   
23 |   if (Human) {
24 |     library(org.Hs.eg.db)
25 |   } else {
26 |     library(org.Mm.eg.db)
27 |   }
28 |   
29 |   # load data, genes, and marker file
30 |   load(CV_RDataPath)
31 |   
32 |   load(ClassifierPath)
33 |   
34 |   labels <- as.matrix(read.csv(LabelsPath))
35 |   labels <- labels[Cells_to_Keep]
36 |   
37 |   mat <- read.table(DataPath, sep = ",")
38 |   data <- mat[-1,-1]
39 |   data <- data[Cells_to_Keep,]
40 |   data <- t(data) #ensure that the genes are rows, and the cells are columns
41 |   
42 |   barcodes <- mat[-1,1]
43 |   
44 |   pdata = data.frame(barcodes)
45 |   fdata <- read.table(GenesPath)
46 |   names(fdata) <- 'gene_short_name'
47 |   row.names(fdata) <- fdata$gene_short_name
48 |   
49 |   row.names(data) <- row.names(fdata)
50 |   colnames(data) <- row.names(pdata)
51 |   
52 |   pd <- new("AnnotatedDataFrame", data = pdata)
53 |   fd <- new("AnnotatedDataFrame", data = fdata)
54 |   pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
55 |                              phenoData = pd,
56 |                              featureData = fd)
57 |   
58 |   start_time <- Sys.time()
59 |   
60 |   pbmc_cds <- estimateSizeFactors(pbmc_cds)
61 |   
62 |   if (Human){
63 |     pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
64 |   } else {
65 |     pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
66 |   }
67 |   
68 |   end_time <- Sys.time()
69 |   
70 |   test_time <- as.numeric(end_time - start_time)
71 | 
72 |   write.table(pData(pbmc_cds)$cluster_ext_type,
73 |               file = paste0(OutputDir, "/Garnett_Pretrained_pred.csv"), append = FALSE, quote = TRUE, sep = "\t",
74 |               eol = "\n", na = "NA", dec = ".", row.names = FALSE,
75 |               qmethod = c("escape", "double"),
76 |               fileEncoding = "")
77 | 
78 |   write.csv(labels,paste0(OutputDir,"/Garnett_Pretrained_true.csv"), row.names = FALSE)
79 |   write.csv(test_time,paste0(OutputDir,'/Garnett_Pretrained_test_time.csv'),row.names = FALSE)
80 | }
81 | 
# Script entry point: the seven command line arguments are forwarded positionally.
do.call(run_Garnett_Pretrained, as.list(args[1:7]))
83 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_LDA.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 13 |     '''
 14 |     run baseline classifier: LDA
 15 |     Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     '''
 29 | 
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 | 
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 | 
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 | 
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 | 
 54 |     Classifier = LinearDiscriminantAnalysis()
 55 | 
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 | 
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 | 
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 | 
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 | 
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 | 
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 | 
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 | 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 | 
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 | 
 92 |     OutputDir = Path(OutputDir)
 93 |     truelab.to_csv(str(OutputDir / Path("LDA_true.csv")),
 94 |                    index = False)
 95 |     pred.to_csv(str(OutputDir / Path("LDA_pred.csv")),
 96 |                 index = False)
 97 |     tr_time.to_csv(str(OutputDir / Path("LDA_training_time.csv")),
 98 |                    index = False)
 99 |     ts_time.to_csv(str(OutputDir / Path("LDA_test_time.csv")),
100 |                    index = False)
101 | 
# Script entry point: five path arguments followed by the number of genes (integer).
run_LDA(*argv[1:6], int(argv[6]))
103 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_LDA_rejection.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
 13 |     '''
 14 |     run baseline classifier: LDA
 15 |     Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     Threshold : Threshold used when rejecting the genes, default is 0.7.
 29 |     '''
 30 | 
 31 |     # read the Rdata file
 32 |     robjects.r['load'](CV_RDataPath)
 33 | 
 34 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 35 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 36 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 37 |     col = col - 1
 38 |     test_ind = np.array(robjects.r['Test_Idx'])
 39 |     train_ind = np.array(robjects.r['Train_Idx'])
 40 | 
 41 |     # read the data
 42 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 43 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 44 | 
 45 |     labels = labels.iloc[tokeep]
 46 |     data = data.iloc[tokeep]
 47 | 
 48 |     # read the feature file
 49 |     if (NumGenes > 0):
 50 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 51 | 
 52 |     # normalize data
 53 |     data = np.log1p(data)
 54 | 
 55 |     Classifier = LinearDiscriminantAnalysis()
 56 | 
 57 |     tr_time=[]
 58 |     ts_time=[]
 59 |     truelab = []
 60 |     pred = []
 61 | 
 62 |     for i in range(np.squeeze(nfolds)):
 63 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 64 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 65 | 
 66 |         train=data.iloc[train_ind_i]
 67 |         test=data.iloc[test_ind_i]
 68 |         y_train=labels.iloc[train_ind_i]
 69 |         y_test=labels.iloc[test_ind_i]
 70 | 
 71 |         if (NumGenes > 0):
 72 |             feat_to_use = features.iloc[0:NumGenes,i]
 73 |             train = train.iloc[:,feat_to_use]
 74 |             test = test.iloc[:,feat_to_use]
 75 | 
 76 |         start=tm.time()
 77 |         Classifier.fit(train, y_train)
 78 |         tr_time.append(tm.time()-start)
 79 | 
 80 |         start=tm.time()
 81 |         predicted = Classifier.predict(test)
 82 |         prob = np.max(Classifier.predict_proba(test), axis = 1)
 83 |         unlabeled = np.where(prob < Threshold)
 84 |         predicted[unlabeled] = 'Unknown'
 85 |         ts_time.append(tm.time()-start)
 86 | 
 87 |         truelab.extend(y_test.values)
 88 |         pred.extend(predicted)
 89 | 
 90 |     truelab = pd.DataFrame(truelab)
 91 |     pred = pd.DataFrame(pred)
 92 | 
 93 |     tr_time = pd.DataFrame(tr_time)
 94 |     ts_time = pd.DataFrame(ts_time)
 95 | 
 96 |     OutputDir = Path(OutputDir)
 97 |     truelab.to_csv(str(OutputDir / Path("LDA_rejection_true.csv")),
 98 |                    index = False)
 99 |     pred.to_csv(str(OutputDir / Path("LDA_rejection_pred.csv")),
100 | 
101 |                 index = False)
102 | 
103 |     tr_time.to_csv(str(OutputDir / Path("LDA_rejection_training_time.csv")),
104 |                    index = False)
105 |     ts_time.to_csv(str(OutputDir / Path("LDA_rejection_test_time.csv")),
106 |                    index = False)
107 | 
# Command-line entry point. Expected arguments, in order:
# DataPath LabelsPath CV_RDataPath OutputDir GeneOrderPath NumGenes
# The guard keeps the benchmark from being launched when this file is imported.
if __name__ == "__main__":
    run_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
109 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_NMC.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.neighbors import NearestCentroid
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 13 |     '''
 14 |     run baseline classifier: NMC
 15 |     Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     '''
 29 | 
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 | 
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 | 
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 | 
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 | 
 54 |     Classifier = NearestCentroid()
 55 | 
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 | 
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 | 
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 | 
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 | 
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 | 
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 | 
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 | 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 | 
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 | 
 92 |     OutputDir = Path(OutputDir)
 93 |     truelab.to_csv(str(OutputDir / Path("NMC_true.csv")),
 94 |                    index = False)
 95 |     pred.to_csv(str(OutputDir / Path("NMC_pred.csv")),
 96 |                 index = False)
 97 |     tr_time.to_csv(str(OutputDir / Path("NMC_training_time.csv")),
 98 |                    index = False)
 99 |     ts_time.to_csv(str(OutputDir / Path("NMC_test_time.csv")),
100 |                    index = False)
101 | 
# Command-line entry point. Expected arguments, in order:
# DataPath LabelsPath CV_RDataPath OutputDir GeneOrderPath NumGenes
# The guard keeps the benchmark from being launched when this file is imported.
if __name__ == "__main__":
    run_NMC(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
103 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_RF.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.ensemble import RandomForestClassifier
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 13 |     '''
 14 |     run baseline classifier: RF
 15 |     Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     '''
 29 | 
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 | 
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 | 
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 | 
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 | 
 54 |     Classifier = RandomForestClassifier(n_estimators = 50)
 55 | 
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 | 
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 | 
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 | 
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 | 
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 | 
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 | 
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 | 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 | 
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 | 
 92 |     OutputDir = Path(OutputDir)
 93 |     truelab.to_csv(str(OutputDir / Path("RF_true.csv")),
 94 |                    index = False)
 95 |     pred.to_csv(str(OutputDir / Path("RF_pred.csv")),
 96 |                 index = False)
 97 |     tr_time.to_csv(str(OutputDir / Path("RF_training_time.csv")),
 98 |                    index = False)
 99 |     ts_time.to_csv(str(OutputDir / Path("RF_test_time.csv")),
100 |                    index = False)
101 | 
# Command-line entry point. Expected arguments, in order:
# DataPath LabelsPath CV_RDataPath OutputDir GeneOrderPath NumGenes
# The guard keeps the benchmark from being launched when this file is imported.
if __name__ == "__main__":
    run_RF(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
103 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_SCINA.R:
--------------------------------------------------------------------------------
 1 | run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){
 2 |   "
 3 |   run SCINA
 4 |   Wrapper script to run SCINA on a benchmark dataset,
 5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 6 |   
 7 |   Parameters
 8 |   ----------
 9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
10 |   as row names and gene names as column names.
11 |   LabelsPath : Cell population annotations file path (.csv).
12 |   GeneSigPath : Cell type marker genes file path (.csv)
13 |   OutputDir : Output directory defining the path of the exported file.
14 |   "
15 |   
16 |   Data <- read.csv(DataPath,row.names = 1)
17 |   Labels <- as.vector(as.matrix(read.csv(LabelsPath)))
18 |   Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),]
19 |   Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))]
20 |   Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'
21 |   Labels[Labels == 'CD19+ B'] <- 'CD19_B'
22 |   Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'
23 |   
24 |   
25 |   #############################################################################
26 |   #                                 SCINA                                     #
27 |   #############################################################################
28 |   library(SCINA)
29 |   Signature_Genes <- preprocess.signatures(GeneSigPath)
30 |   True_Labels_SCINA <- list()
31 |   Pred_Labels_SCINA <- list()
32 |   Total_Time_SCINA <- list()
33 |   
34 |   library(preprocessCore)
35 |   Data = t(as.matrix(Data))
36 |   Data=log(Data+1)
37 |   Data[]=normalize.quantiles(Data)
38 |   
39 |   start_time <- Sys.time()
40 |   results = SCINA(Data, Signature_Genes)
41 |   end_time <- Sys.time()
42 |   
43 |   True_Labels_SCINA <- Labels
44 |   Pred_Labels_SCINA <- results$cell_labels
45 |   Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs'))
46 |   
47 |   setwd(OutputDir)
48 |   
49 |   write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = FALSE)
50 |   write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE)
51 |   write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE)
52 | }
53 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_SVM.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.svm import LinearSVC
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 13 |     '''
 14 |     run baseline classifier: SVM
 15 |     Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     '''
 29 | 
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 | 
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 | 
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 | 
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 | 
 54 |     Classifier = LinearSVC()
 55 | 
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 | 
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 | 
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 | 
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 | 
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 | 
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 | 
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 | 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 | 
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 | 
 92 |     OutputDir = Path(OutputDir)
 93 |     truelab.to_csv(str(OutputDir / Path("SVM_true.csv")),
 94 |                    index = False)
 95 |     pred.to_csv(str(OutputDir / Path("SVM_pred.csv")),
 96 |                 index = False)
 97 |     tr_time.to_csv(str(OutputDir / Path("SVM_training_time.csv")),
 98 |                    index = False)
 99 |     ts_time.to_csv(str(OutputDir / Path("SVM_test_time.csv")),
100 |                    index = False)
101 | 
# Command-line entry point. Expected arguments, in order:
# DataPath LabelsPath CV_RDataPath OutputDir GeneOrderPath NumGenes
# The guard keeps the benchmark from being launched when this file is imported.
if __name__ == "__main__":
    run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
103 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_SVM_rejection.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.svm import LinearSVC
  9 | import rpy2.robjects as robjects
 10 | from sklearn.calibration import CalibratedClassifierCV
 11 | 
 12 | 
 13 | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
 14 |     '''
 15 |     run baseline classifier: SVM
 16 |     Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,
 17 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 18 | 
 19 |     Parameters
 20 |     ----------
 21 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 22 |     as row names and gene names as column names.
 23 |     LabelsPath : Cell population annotations file path (.csv).
 24 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 25 |     OutputDir : Output directory defining the path of the exported file.
 26 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 27 |     defining the genes order for each cross validation fold, default is NULL.
 28 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 29 |     Threshold : Threshold used when rejecting the cells, default is 0.7.
 30 | 
 31 |     '''
 32 | 
 33 |     # read the Rdata file
 34 |     robjects.r['load'](CV_RDataPath)
 35 | 
 36 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 37 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 38 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 39 |     col = col - 1
 40 |     test_ind = np.array(robjects.r['Test_Idx'])
 41 |     train_ind = np.array(robjects.r['Train_Idx'])
 42 | 
 43 |     # read the data
 44 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 45 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 46 | 
 47 |     labels = labels.iloc[tokeep]
 48 |     data = data.iloc[tokeep]
 49 | 
 50 |     # read the feature file
 51 |     if (NumGenes > 0):
 52 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 53 | 
 54 |     # normalize data
 55 |     data = np.log1p(data)
 56 | 
 57 |     Classifier = LinearSVC()
 58 |     clf = CalibratedClassifierCV(Classifier)
 59 | 
 60 |     tr_time=[]
 61 |     ts_time=[]
 62 |     truelab = []
 63 |     pred = []
 64 | 
 65 |     for i in range(np.squeeze(nfolds)):
 66 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 67 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 68 | 
 69 |         train=data.iloc[train_ind_i]
 70 |         test=data.iloc[test_ind_i]
 71 |         y_train=labels.iloc[train_ind_i]
 72 |         y_test=labels.iloc[test_ind_i]
 73 | 
 74 |         if (NumGenes > 0):
 75 |             feat_to_use = features.iloc[0:NumGenes,i]
 76 |             train = train.iloc[:,feat_to_use]
 77 |             test = test.iloc[:,feat_to_use]
 78 | 
 79 |         start=tm.time()
 80 |         clf.fit(train, y_train)
 81 |         tr_time.append(tm.time()-start)
 82 | 
 83 |         start=tm.time()
 84 |         predicted = clf.predict(test)
 85 |         prob = np.max(clf.predict_proba(test), axis = 1)
 86 |         unlabeled = np.where(prob < Threshold)
 87 |         predicted[unlabeled] = 'Unknown'
 88 |         ts_time.append(tm.time()-start)
 89 | 
 90 |         truelab.extend(y_test.values)
 91 |         pred.extend(predicted)
 92 | 
 93 |     truelab = pd.DataFrame(truelab)
 94 |     pred = pd.DataFrame(pred)
 95 | 
 96 |     tr_time = pd.DataFrame(tr_time)
 97 |     ts_time = pd.DataFrame(ts_time)
 98 | 
 99 |     OutputDir = Path(OutputDir)
100 |     truelab.to_csv(str(OutputDir / Path("SVM_rejection_true.csv")),
101 |                    index = False)
102 |     pred.to_csv(str(OutputDir / Path("SVM_rejection_pred.csv")),
103 |                 index = False)
104 |     tr_time.to_csv(str(OutputDir / Path("SVM_rejection_training_time.csv")),
105 | 
106 |                    index = False)
107 |     ts_time.to_csv(str(OutputDir / Path("SVM_rejection_test_time.csv")),
108 |                    index = False)
109 | 
# Command-line entry point. Expected arguments, in order:
# DataPath LabelsPath CV_RDataPath OutputDir GeneOrderPath NumGenes [Threshold]
# The guard keeps the benchmark from being launched when this file is imported.
if __name__ == "__main__":
    if len(argv) > 7:
        # optional seventh argument overrides the default rejection threshold
        run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]), float(argv[7]))
    else:
        run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
111 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_SingleR.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 4 |   "
 5 |   run SingleR
 6 |   Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation,
 7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 8 | 
 9 |   Parameters
10 |   ----------
11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
12 |   as row names and gene names as column names.
13 |   LabelsPath : Cell population annotations file path (.csv).
14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
15 |   OutputDir : Output directory defining the path of the exported file.
16 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
17 |   defining the genes order for each cross validation fold, default is NULL.
18 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
19 |   "
20 | 
21 |   Data <- read.csv(DataPath,row.names = 1)
22 |   Labels <- as.matrix(read.csv(LabelsPath))
23 |   load(CV_RDataPath)
24 |   Labels <- as.vector(Labels[,col_Index])
25 |   Data <- Data[Cells_to_Keep,]
26 |   Labels <- Labels[Cells_to_Keep]
27 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
28 |     GenesOrder = read.csv(GeneOrderPath)
29 |   }
30 | 
31 |   #############################################################################
32 |   #                               SingleR                                     #
33 |   #############################################################################
34 |   library(SingleR)
35 |   library(Seurat)
36 |   True_Labels_SingleR <- list()
37 |   Pred_Labels_SingleR <- list()
38 |   Total_Time_SingleR <- list()
39 |   Data = t(as.matrix(Data))
40 | 
41 |   for (i in c(1:n_folds)){
42 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
43 |       start_time <- Sys.time()
44 |       singler = SingleR(method = "single", Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]],
45 |                         Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]],
46 |                         Labels[Train_Idx[[i]]], numCores = 1)
47 |       end_time <- Sys.time()
48 |     }
49 |     else{
50 |       start_time <- Sys.time()
51 |       singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1)
52 |       end_time <- Sys.time()
53 |     }
54 |     Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
55 | 
56 |     True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])
57 |     Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))
58 |   }
59 |   True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))
60 |   Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))
61 |   Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))
62 | 
63 |   write.csv(True_Labels_SingleR,paste0(OutputDir,'/SingleR_true.csv'),row.names = FALSE)
64 |   write.csv(Pred_Labels_SingleR,paste0(OutputDir,'/SingleR_pred.csv'),row.names = FALSE)
65 |   write.csv(Total_Time_SingleR,paste0(OutputDir,'/SingleR_total_time.csv'),row.names = FALSE)
66 | }
67 | 
# Dispatch on the sixth argument (NumGenes): "0" or a missing argument means
# no feature selection. The is.na() guard avoids the "missing value where
# TRUE/FALSE needed" error when fewer than six arguments are supplied.
if (is.na(args[6]) || args[6] == "0") {
  run_SingleR(args[1], args[2], args[3], args[4])
} else {
  run_SingleR(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}
73 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_kNN50.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.neighbors import KNeighborsClassifier
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 13 |     '''
 14 |     run baseline classifiers: kNN
 15 |     Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     '''
 29 | 
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 | 
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 | 
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 | 
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 | 
 54 |     Classifier = KNeighborsClassifier(n_neighbors=50)
 55 | 
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 | 
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 | 
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 | 
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 | 
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 | 
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 | 
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 | 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 | 
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 | 
 92 |     OutputDir = Path(OutputDir)
 93 |     truelab.to_csv(str(OutputDir / Path("kNN50_true.csv")),
 94 |                    index = False)
 95 |     pred.to_csv(str(OutputDir / Path("kNN50_pred.csv")),
 96 |                 index = False)
 97 |     tr_time.to_csv(str(OutputDir / Path("kNN50_training_time.csv")),
 98 |                    index = False)
 99 |     ts_time.to_csv(str(OutputDir / Path("kNN50_test_time.csv")),
100 |                    index = False)
101 | 
# Command-line entry point. Expected arguments, in order:
# DataPath LabelsPath CV_RDataPath OutputDir GeneOrderPath NumGenes
# The guard keeps the benchmark from being launched when this file is imported.
if __name__ == "__main__":
    run_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
103 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_kNN9.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from sys import argv
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import time as tm
  8 | from sklearn.neighbors import KNeighborsClassifier
  9 | import rpy2.robjects as robjects
 10 | 
 11 | 
 12 | def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 13 |     '''
 14 |     run baseline classifiers: kNN
 15 |     Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation,
 16 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
 21 |     as row names and gene names as column names.
 22 |     LabelsPath : Cell population annotations file path (.csv).
 23 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 24 |     OutputDir : Output directory defining the path of the exported file.
 25 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
 26 |     defining the genes order for each cross validation fold, default is NULL.
 27 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 28 |     '''
 29 | 
 30 |     # read the Rdata file
 31 |     robjects.r['load'](CV_RDataPath)
 32 | 
 33 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 34 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 35 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 36 |     col = col - 1
 37 |     test_ind = np.array(robjects.r['Test_Idx'])
 38 |     train_ind = np.array(robjects.r['Train_Idx'])
 39 | 
 40 |     # read the data
 41 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 42 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 43 | 
 44 |     labels = labels.iloc[tokeep]
 45 |     data = data.iloc[tokeep]
 46 | 
 47 |     # read the feature file
 48 |     if (NumGenes > 0):
 49 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 50 | 
 51 |     # normalize data
 52 |     data = np.log1p(data)
 53 | 
 54 |     Classifier = KNeighborsClassifier(n_neighbors=9)
 55 | 
 56 |     tr_time=[]
 57 |     ts_time=[]
 58 |     truelab = []
 59 |     pred = []
 60 | 
 61 |     for i in range(np.squeeze(nfolds)):
 62 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 63 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 64 | 
 65 |         train=data.iloc[train_ind_i]
 66 |         test=data.iloc[test_ind_i]
 67 |         y_train=labels.iloc[train_ind_i]
 68 |         y_test=labels.iloc[test_ind_i]
 69 | 
 70 |         if (NumGenes > 0):
 71 |             feat_to_use = features.iloc[0:NumGenes,i]
 72 |             train = train.iloc[:,feat_to_use]
 73 |             test = test.iloc[:,feat_to_use]
 74 | 
 75 |         start=tm.time()
 76 |         Classifier.fit(train, y_train)
 77 |         tr_time.append(tm.time()-start)
 78 | 
 79 |         start=tm.time()
 80 |         predicted = Classifier.predict(test)
 81 |         ts_time.append(tm.time()-start)
 82 | 
 83 |         truelab.extend(y_test.values)
 84 |         pred.extend(predicted)
 85 | 
 86 |     truelab = pd.DataFrame(truelab)
 87 |     pred = pd.DataFrame(pred)
 88 | 
 89 |     tr_time = pd.DataFrame(tr_time)
 90 |     ts_time = pd.DataFrame(ts_time)
 91 | 
 92 |     OutputDir = Path(OutputDir)
 93 |     truelab.to_csv(str(OutputDir / Path("kNN9_true.csv")),
 94 |                    index = False)
 95 |     pred.to_csv(str(OutputDir / Path("kNN9_pred.csv")),
 96 |                 index = False)
 97 |     tr_time.to_csv(str(OutputDir / Path("kNN9_training_time.csv")),
 98 |                    index = False)
 99 |     ts_time.to_csv(str(OutputDir / Path("kNN9_test_time.csv")),
100 |                    index = False)
101 | 
# Command-line entry point. Positional arguments: DataPath, LabelsPath,
# CV_RDataPath, OutputDir, GeneOrderPath, NumGenes (converted to int).
# This script defines run_kNN9; the previous call to run_kNN50 raised a
# NameError because that function lives in a different script.
run_kNN9(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
103 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_moana.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | import numpy as np
 4 | from moana.core import ExpMatrix
 5 | from moana.classify import CellTypeClassifier
 6 | import time as tm
 7 | import rpy2.robjects as robjects
 8 | 
 9 | def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
10 |     '''
11 |     run moana
12 |     Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
13 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.  
14 |   
15 |     Parameters
16 |     ----------
17 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
18 |     as row names and gene names as column names.
19 |     LabelsPath : Cell population annotations file path (.csv).
20 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
21 |     ClassifierPath : Data file path to the pretrained classifier.
22 |     OutputDir : Output directory defining the path of the exported file.
23 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
24 |     defining the genes order for each cross validation fold, default is NULL.
25 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
26 |     '''
27 |     
28 | #    # read the Rdata file
29 | #    robjects.r['load'](CV_RDataPath)
30 | #
31 | #    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
32 | #    col = np.array(robjects.r['col_Index'], dtype = 'int')
33 | #    col = col - 1
34 |     
35 |     matrix = ExpMatrix.read_tsv(DataPath, sep = ',')    
36 | #    matrix = matrix.iloc[tokeep] 
37 |     
38 |     truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')
39 | #    truelab = truelab.iloc[tokeep]
40 |     
41 |     ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
42 |     ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']
43 |     
44 |     tokeep2 = np.isin(truelab,ct_old)
45 |     truelab = truelab[tokeep2]
46 |     print(len(truelab))
47 |     matrix = matrix.iloc[np.squeeze(tokeep2)]
48 |     
49 |     for i in range(len(ct_old)):
50 |         truelab.iloc[truelab == ct_old[i]] = ct_new[i]
51 |         
52 |     # read the feature file
53 |     if (NumGenes > 0):
54 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
55 |         feat_to_use = features.iloc[0:NumGenes,0]
56 |         matrix = matrix.iloc[:,feat_to_use]
57 | 
58 |     data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)
59 |     data.genes.name = 'Genes'
60 |     data.cells.name = 'Cells'
61 |     data.index.name = 'Genes'
62 |     data.columns.name = 'Cells'
63 |     
64 |     clf = CellTypeClassifier.read_pickle(ClassifierPath)
65 |     
66 |     start = tm.time()
67 |     predictions = clf.predict(data)
68 |     runtime = tm.time() - start
69 |     
70 |     np.asarray(predictions)
71 |     
72 |     pred = pd.DataFrame(predictions)
73 |         
74 |     os.chdir(OutputDir)
75 |             
76 |     if (NumGenes == 0):  
77 |         truelab.to_csv("moana_True_Labels.csv", index = False)
78 |         pred.to_csv("moana_Pred_Labels.csv", index = False)
79 |         with open("moana_Total_Time.csv", 'w') as f:
80 |             f.write("%f\n" % runtime)
81 |     else:
82 |         truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False)
83 |         pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
84 |         with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
85 |             f.write("%f\n" % runtime)
86 | 
87 | 
88 |         
89 |     
90 |     
91 |     
92 |     
93 |     
94 |     
95 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_scID.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 4 |   "
 5 |   run scID
 6 |   Wrapper script to run scID on a benchmark dataset with 5-fold cross validation,
 7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 8 | 
 9 |   Parameters
10 |   ----------
11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
12 |   as row names and gene names as column names.
13 |   LabelsPath : Cell population annotations file path (.csv).
14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
15 |   OutputDir : Output directory defining the path of the exported file.
16 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
17 |   defining the genes order for each cross validation fold, default is NULL.
18 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
19 |   "
20 | 
21 |   Data <- read.csv(DataPath,row.names = 1)
22 |   Labels <- as.matrix(read.csv(LabelsPath))
23 |   load(CV_RDataPath)
24 |   Labels <- as.vector(Labels[,col_Index])
25 |   Data <- Data[Cells_to_Keep,]
26 |   Labels <- Labels[Cells_to_Keep]
27 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
28 |     GenesOrder = read.csv(GeneOrderPath)
29 |   }
30 | 
31 |   #############################################################################
32 |   #                                 scID                                      #
33 |   #############################################################################
34 |   library(scID)
35 |   library(Seurat)
36 |   True_Labels_scID <- list()
37 |   Pred_Labels_scID <- list()
38 |   Total_Time_scID <- list()
39 |   Data = t(as.matrix(Data))
40 | 
41 |   for (i in c(1:n_folds)){
42 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
43 |       Train_Labels <- list(Labels[Train_Idx[[i]]])
44 |       names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]])
45 |       start_time <- Sys.time()
46 |       scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]],
47 |                                      Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]],
48 |                                      Train_Labels[[1]])
49 |       end_time <- Sys.time()
50 |     }
51 |     else{
52 |       Train_Labels <- list(Labels[Train_Idx[[i]]])
53 |       names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]])
54 |       start_time <- Sys.time()
55 |       scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]])
56 |       end_time <- Sys.time()
57 |     }
58 |     Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
59 | 
60 |     True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]])
61 |     Pred_Labels_scID[i] <- list(as.vector(scID_output$labels))
62 |   }
63 |   True_Labels_scID <- as.vector(unlist(True_Labels_scID))
64 |   Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))
65 |   Total_Time_scID <- as.vector(unlist(Total_Time_scID))
66 | 
67 |   write.csv(Pred_Labels_scID, paste0(OutputDir,'/scID_pred.csv'),row.names = FALSE)
68 |   write.csv(True_Labels_scID, paste0(OutputDir,'/scID_true.csv'),row.names = FALSE)
69 |   write.csv(Total_Time_scID,paste0(OutputDir,'/scID_total_time.csv'),row.names = FALSE)
70 | 
71 | }
72 | 
# Entry point: a sixth argument of "0" means no feature selection, in which
# case the gene-order arguments keep their NULL defaults.
feature_selection_requested <- args[6] != "0"
if (feature_selection_requested) {
  run_scID(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
} else {
  run_scID(args[1], args[2], args[3], args[4])
}
78 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_scPred.R:
--------------------------------------------------------------------------------
  1 | run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  2 |   "
  3 |   run scPred
  4 |   Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation,
  5 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
  6 |   
  7 |   Parameters
  8 |   ----------
  9 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 10 |   as row names and gene names as column names.
 11 |   LabelsPath : Cell population annotations file path (.csv).
 12 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 13 |   OutputDir : Output directory defining the path of the exported file.
 14 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 15 |   defining the genes order for each cross validation fold, default is NULL.
 16 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
 17 |   "
 18 |   
 19 |   Data <- read.csv(DataPath,row.names = 1)
 20 |   Labels <- as.matrix(read.csv(LabelsPath))
 21 |   load(CV_RDataPath)
 22 |   Labels <- as.vector(Labels[,col_Index])
 23 |   Data <- Data[Cells_to_Keep,]
 24 |   Labels <- Labels[Cells_to_Keep]
 25 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 26 |     GenesOrder = read.csv(GeneOrderPath)
 27 |   }
 28 |   
 29 |   #############################################################################
 30 |   #                                scPred                                     #
 31 |   #############################################################################
 32 |   library(scPred)
 33 |   library(tidyverse)
 34 |   library(SingleCellExperiment)
 35 |   True_Labels_scPred <- list()
 36 |   Pred_Labels_scPred <- list()
 37 |   Training_Time_scPred <- list()
 38 |   Testing_Time_scPred <- list()
 39 |   Data = t(as.matrix(Data))
 40 |   
 41 |   for (i in c(1:n_folds)){
 42 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 43 |       sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 
 44 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 45 |       sce_counts <- normcounts(sce)
 46 |       sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)
 47 |       sce_metadata <- as.data.frame(colData(sce))
 48 |       
 49 |       sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 
 50 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 51 |       sce_counts_test <- normcounts(sce_test)
 52 |       sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)
 53 |       sce_metadata_test <- as.data.frame(colData(sce_test))
 54 |     }
 55 |     else{
 56 |       sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 
 57 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 58 |       sce_counts <- normcounts(sce)
 59 |       sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)
 60 |       sce_metadata <- as.data.frame(colData(sce))
 61 |       
 62 |       sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 
 63 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 64 |       sce_counts_test <- normcounts(sce_test)
 65 |       sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)
 66 |       sce_metadata_test <- as.data.frame(colData(sce_test))
 67 |     }
 68 |     
 69 |     
 70 |     # scPred Training    
 71 |     start_time <- Sys.time()
 72 |     set.seed(1234)
 73 |     scp <- eigenDecompose(sce_cpm)
 74 |     scPred::metadata(scp) <- sce_metadata
 75 |     scp <- getFeatureSpace(scp, pVar = 'cell_type1')
 76 |     # plotEigen(scp, group = 'cell_type1')
 77 |     scp <- trainModel(scp)
 78 |     # plotTrainProbs(scp)
 79 |     end_time <- Sys.time()
 80 |     Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 81 |     
 82 |     # scPred Prediction
 83 |     start_time <- Sys.time()
 84 |     scp <- scPredict(scp,newData = sce_cpm_test)
 85 |     end_time <- Sys.time()
 86 |     Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 87 |     
 88 |     True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]])
 89 |     Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass)
 90 |   }
 91 |   True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))
 92 |   Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))
 93 |   Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))
 94 |   Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))
 95 |   
 96 |   setwd(OutputDir)
 97 |   
 98 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 99 |     write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
100 |     write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
101 |     write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)
102 |     write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)
103 |   }
104 |   else{
105 |     write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE)
106 |     write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE)
107 |     write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE)
108 |     write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE)
109 |   }
110 | }
111 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_scVI.py:
--------------------------------------------------------------------------------
  1 | from scvi.dataset import CsvDataset
  2 | import os
  3 | from sys import argv
  4 | from pathlib import Path
  5 | from scvi.dataset import CsvDataset
  6 | import numpy as np
  7 | import pandas as pd
  8 | from scvi.models import SCANVI
  9 | from scvi.inference import SemiSupervisedTrainer
 10 | import time as tm
 11 | import rpy2.robjects as robjects
 12 | 
 13 | def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
 14 |     '''
 15 |     run scVI
 16 |     Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation,
 17 |     outputs lists of true and predicted cell labels as csv files, as well as computation time.
 18 |   
 19 |     Parameters
 20 |     ----------
 21 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 22 |     as row names and gene names as column names.
 23 |     LabelsPath : Cell population annotations file path (.csv).
 24 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 25 |     OutputDir : Output directory defining the path of the exported file.
 26 |     GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 27 |     defining the genes order for each cross validation fold, default is NULL.
 28 |     NumGenes : Number of genes used in case of feature selection (integer), default is 0.
 29 |     '''
 30 |     
 31 |     # read the Rdata file
 32 |     robjects.r['load'](CV_RDataPath)
 33 | 
 34 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
 35 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
 36 |     col = np.array(robjects.r['col_Index'], dtype = 'int')
 37 |     col = col - 1 
 38 |     test_ind = np.array(robjects.r['Test_Idx'])
 39 |     train_ind = np.array(robjects.r['Train_Idx'])
 40 | 
 41 |     # read the data
 42 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
 43 |     labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
 44 | 
 45 |     labels = labels.iloc[tokeep]
 46 |     data = data.iloc[tokeep] 
 47 |     
 48 |     # read the feature file
 49 |     if (NumGenes > 0):
 50 |         features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
 51 |     
 52 |     if (NumGenes == 0):
 53 |         #save labels as csv file with header and index column
 54 |         labels.to_csv('Labels_scvi.csv')
 55 |         data.to_csv('Data_scvi.csv')    
 56 |         
 57 |         train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)
 58 |         
 59 |         ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing
 60 |         scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
 61 |         trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)
 62 |     
 63 |     n_epochs = 200
 64 |     
 65 |     truelab = []
 66 |     pred = []
 67 |     tr_time = []
 68 |     ts_time = []
 69 |     
 70 |     for i in range(np.squeeze(nfolds)):
 71 |         test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
 72 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
 73 |         
 74 |         if (NumGenes > 0):
 75 |             feat_to_use = features.iloc[0:NumGenes,i]
 76 |             data2 = data.iloc[:,feat_to_use]
 77 | 
 78 |             labels.to_csv(OutputDir +'/Labels_scvi.csv')
 79 |             data2.to_csv(OutputDir +'/Data_scvi.csv')
 80 | 
 81 |             train = CsvDataset(OutputDir +'/Data_scvi.csv', save_path = "", sep = ",", labels_file = OutputDir +"/Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)
 82 | 
 83 |             ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing
 84 |             scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
 85 |             trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)
 86 | 
 87 |         trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
 88 |         trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
 89 |         trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
 90 |         trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']
 91 |     
 92 |         start = tm.time()
 93 |         trainer_scanvi.train(n_epochs)
 94 |         tr_time.append(tm.time()-start)
 95 |     
 96 |         ## labels of test set are in y_pred
 97 |         ## labels are returned in numbers, should be mapped back to the real labels
 98 |         ## indices are permutated
 99 |         start = tm.time()
100 |         y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
101 |         ts_time.append(tm.time()-start)
102 |         
103 |         truelab.extend(y_true)
104 |         pred.extend(y_pred)
105 |     
106 |     #write results
107 | 
108 |     truelab = pd.DataFrame(truelab)
109 |     pred = pd.DataFrame(pred)
110 |     
111 |     tr_time = pd.DataFrame(tr_time)
112 |     ts_time = pd.DataFrame(ts_time)
113 | 
114 |     truelab.to_csv(str(Path(OutputDir + "/scVI_true.csv")), index=False)
115 |     pred.to_csv(str(Path(OutputDir + "/scVI_pred.csv")), index=False)
116 |     tr_time.to_csv(str(Path(OutputDir + "/scVI_training_time.csv")), index=False)
117 |     ts_time.to_csv(str(Path(OutputDir + "/scVI_test_time.csv")), index=False)
118 | 
# Command-line entry point. Positional arguments: DataPath, LabelsPath,
# CV_RDataPath, OutputDir, GeneOrderPath, NumGenes (converted to int).
run_scVI(*argv[1:5], argv[5], int(argv[6]))
120 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_scmapcell.R:
--------------------------------------------------------------------------------
  1 | args <- commandArgs(TRUE)
  2 | 
  3 | run_scmapcell <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  4 |   "
  5 |   run scmapcell
  6 |   Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,
  7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
  8 |   
  9 |   Parameters
 10 |   ----------
 11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 12 |   as row names and gene names as column names.
 13 |   LabelsPath : Cell population annotations file path (.csv).
 14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 15 |   OutputDir : Output directory defining the path of the exported file.
 16 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 17 |   defining the genes order for each cross validation fold, default is NULL.
 18 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
 19 |   "
 20 |   
 21 |   Data <- read.csv(DataPath,row.names = 1)
 22 |   Labels <- as.matrix(read.csv(LabelsPath))
 23 |   load(CV_RDataPath)
 24 |   Labels <- as.vector(Labels[,col_Index])
 25 |   Data <- Data[Cells_to_Keep,]
 26 |   Labels <- Labels[Cells_to_Keep]
 27 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 28 |     GenesOrder = read.csv(GeneOrderPath)
 29 |   }
 30 |   
 31 |   #############################################################################
 32 |   #                                 scmap                                     #
 33 |   #############################################################################
 34 |   library(scmap)
 35 |   library(SingleCellExperiment)
 36 |   True_Labels_scmapcell <- list()
 37 |   Pred_Labels_scmapcell <- list()
 38 |   Training_Time_scmapcell <- list()
 39 |   Testing_Time_scmapcell <- list()
 40 |   Data = t(as.matrix(Data))
 41 |   
 42 |   for (i in c(1:n_folds)){
 43 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 44 |       sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 
 45 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 46 |       logcounts(sce) <- log2(normcounts(sce) + 1)
 47 |       # use gene names as feature symbols
 48 |       rowData(sce)$feature_symbol <- rownames(sce)
 49 |       sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
 50 |       
 51 |       sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 
 52 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 53 |       logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
 54 |       rowData(sce_test)$feature_symbol <- rownames(sce_test)
 55 |       sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
 56 |     }
 57 |     else{
 58 |       sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 
 59 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 60 |       logcounts(sce) <- log2(normcounts(sce) + 1)
 61 |       # use gene names as feature symbols
 62 |       rowData(sce)$feature_symbol <- rownames(sce)
 63 |       sce <- selectFeatures(sce, suppress_plot = TRUE)
 64 |       
 65 |       sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 
 66 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 67 |       logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
 68 |       rowData(sce_test)$feature_symbol <- rownames(sce_test)
 69 |       sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
 70 |     }
 71 |     
 72 |     # scmap-cell
 73 |     start_time <- Sys.time()
 74 |     set.seed(1)
 75 |     sce <- indexCell(sce)
 76 |     end_time <- Sys.time()
 77 |     Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 78 |     
 79 |     start_time <- Sys.time()
 80 |     scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))
 81 |     scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))
 82 |     end_time <- Sys.time()
 83 |     Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 84 |     
 85 |     True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])
 86 |     Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)
 87 |   }
 88 |   
 89 |   True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))
 90 |   Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))
 91 |   Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))
 92 |   Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))
 93 |   
 94 |   write.csv(True_Labels_scmapcell,paste0(OutputDir,'/scmapcell_true.csv'),row.names = FALSE)
 95 |   write.csv(Pred_Labels_scmapcell,paste0(OutputDir,'/scmapcell_pred.csv'),row.names = FALSE)
 96 |   write.csv(Training_Time_scmapcell,paste0(OutputDir,'/scmapcell_training_time.csv'),row.names = FALSE)
 97 |   write.csv(Testing_Time_scmapcell,paste0(OutputDir,'/scmapcell_test_time.csv'),row.names = FALSE)
 98 | }
 99 | if (args[6] == "0") {
100 |   run_scmapcell(args[1], args[2], args[3], args[4])
101 | } else {
102 |   run_scmapcell(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
103 | }
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_scmapcluster.R:
--------------------------------------------------------------------------------
  1 | args <- commandArgs(TRUE)
  2 | 
  3 | run_scmapcluster <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  4 |   "
  5 |   run scmapcluster
  6 |   Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,
  7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
  8 |   
  9 |   Parameters
 10 |   ----------
 11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
 12 |   as row names and gene names as column names.
 13 |   LabelsPath : Cell population annotations file path (.csv).
 14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
 15 |   OutputDir : Output directory defining the path of the exported file.
 16 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
 17 |   defining the genes order for each cross validation fold, default is NULL.
 18 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
 19 |   "
 20 |   
 21 |   Data <- read.csv(DataPath,row.names = 1)
 22 |   Labels <- as.matrix(read.csv(LabelsPath))
 23 |   load(CV_RDataPath)
 24 |   Labels <- as.vector(Labels[,col_Index])
 25 |   Data <- Data[Cells_to_Keep,]
 26 |   Labels <- Labels[Cells_to_Keep]
 27 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 28 |     GenesOrder = read.csv(GeneOrderPath)
 29 |   }
 30 |   
 31 |   #############################################################################
 32 |   #                                 scmap                                     #
 33 |   #############################################################################
 34 |   library(scmap)
 35 |   library(SingleCellExperiment)
 36 |   True_Labels_scmapcluster <- list()
 37 |   Pred_Labels_scmapcluster <- list()
 38 |   Training_Time_scmapcluster <- list()
 39 |   Testing_Time_scmapcluster <- list()
 40 |   Data = t(as.matrix(Data))
 41 |   
 42 |   for (i in c(1:n_folds)){
 43 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
 44 |       sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 
 45 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 46 |       logcounts(sce) <- log2(normcounts(sce) + 1)
 47 |       # use gene names as feature symbols
 48 |       rowData(sce)$feature_symbol <- rownames(sce)
 49 |       sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
 50 |       
 51 |       sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 
 52 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 53 |       logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
 54 |       rowData(sce_test)$feature_symbol <- rownames(sce_test)
 55 |       sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
 56 |     }
 57 |     else{
 58 |       sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 
 59 |                                   colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
 60 |       logcounts(sce) <- log2(normcounts(sce) + 1)
 61 |       # use gene names as feature symbols
 62 |       rowData(sce)$feature_symbol <- rownames(sce)
 63 |       sce <- selectFeatures(sce, suppress_plot = TRUE)
 64 |       
 65 |       sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 
 66 |                                        colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
 67 |       logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
 68 |       rowData(sce_test)$feature_symbol <- rownames(sce_test)
 69 |       sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
 70 |     }
 71 |     
 72 |     # scmap-cluster
 73 |     start_time <- Sys.time()
 74 |     sce <- indexCluster(sce)
 75 |     end_time <- Sys.time()
 76 |     Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 77 |     
 78 |     start_time <- Sys.time()
 79 |     scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))
 80 |     end_time <- Sys.time()
 81 |     Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
 82 |     
 83 |     True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])
 84 |     Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)
 85 |     
 86 |   }
 87 |   
 88 |   True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))
 89 |   Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))
 90 |   Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))
 91 |   Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))
 92 | 
 93 |   write.csv(True_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_true.csv'),row.names = FALSE)
 94 |   write.csv(Pred_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_pred.csv'),row.names = FALSE)
 95 |   write.csv(Training_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_training_time.csv'),row.names = FALSE)
 96 |   write.csv(Testing_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_test_time.csv'),row.names = FALSE)
 97 | 
 98 | 
 99 | }
# Command-line dispatch. Feature selection is used only when the 6th argument
# is a non-zero number of genes; "0" or a missing 6th argument (which would
# otherwise make the `if` condition NA and abort) runs on all genes.
if (is.na(args[6]) || args[6] == "0") {
  run_scmapcluster(args[1], args[2], args[3], args[4])
} else {
  run_scmapcluster(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}
105 | 


--------------------------------------------------------------------------------
/Snakemake/Scripts/run_singleCellNet.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
 4 |   "
 5 |   run singleCellNet
 6 |   Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation,
 7 |   outputs lists of true and predicted cell labels as csv files, as well as computation time.
 8 | 
 9 |   Parameters
10 |   ----------
11 |   DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
12 |   as row names and gene names as column names.
13 |   LabelsPath : Cell population annotations file path (.csv).
14 |   CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
15 |   OutputDir : Output directory defining the path of the exported file.
16 |   GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
17 |   defining the genes order for each cross validation fold, default is NULL.
18 |   NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
19 |   "
20 | 
21 |   Data <- read.csv(DataPath,row.names = 1)
22 |   colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE)
23 |   Labels <- as.matrix(read.csv(LabelsPath))
24 |   load(CV_RDataPath)
25 |   Labels <- as.vector(Labels[,col_Index])
26 |   Data <- Data[Cells_to_Keep,]
27 |   Labels <- Labels[Cells_to_Keep]
28 |   if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
29 |     GenesOrder = read.csv(GeneOrderPath)
30 |   }
31 | 
32 |   #############################################################################
33 |   #                              singleCellNet                                #
34 |   #############################################################################
35 |   library(singleCellNet)
36 |   library(dplyr)
37 |   True_Labels_singleCellNet <- list()
38 |   Pred_Labels_singleCellNet <- list()
39 |   Training_Time_singleCellNet <- list()
40 |   Testing_Time_singleCellNet <- list()
41 |   Data = t(as.matrix(Data))              # deals also with sparse matrix
42 | 
43 |   for(i in c(1:n_folds)){
44 |     if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
45 |       DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]
46 |       DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]
47 |     }
48 |     else{
49 |       DataTrain <- Data[,Train_Idx[[i]]]
50 |       DataTest <- Data[,Test_Idx[[i]]]
51 |     }
52 | 
53 |     start_time <- Sys.time()
54 |     cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation")
55 |     cgenesA<-cgenes2[['cgenes']]
56 |     grps<-cgenes2[['grps']]
57 |     DataTrain<-as.matrix(DataTrain[cgenesA,])
58 |     xpairs<-ptGetTop(DataTrain, grps, ncores = 1)
59 |     pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs)
60 |     rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps)
61 |     end_time <- Sys.time()
62 |     Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
63 | 
64 |     start_time <- Sys.time()
65 |     DataTest<-query_transform(DataTest[cgenesA,], xpairs)
66 |     classRes <-rf_classPredict(rf, DataTest)
67 |     end_time <- Sys.time()
68 |     Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
69 | 
70 |     True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]])
71 |     Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])])
72 |   }
73 |   True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))
74 |   Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))
75 |   Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))
76 |   Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))
77 |   write.csv(True_Labels_singleCellNet,paste0(OutputDir,'/singleCellNet_true.csv'),row.names = FALSE)
78 |   write.csv(Pred_Labels_singleCellNet,paste0(OutputDir,'/singleCellNet_pred.csv'),row.names = FALSE)
79 |   write.csv(Training_Time_singleCellNet,paste0(OutputDir,'/singleCellNet_training_time.csv'),row.names = FALSE)
80 |   write.csv(Testing_Time_singleCellNet,paste0(OutputDir,'/singleCellNet_test_time.csv'),row.names = FALSE)
81 | }
82 | 
# Command-line dispatch. Feature selection is used only when the 6th argument
# is a non-zero number of genes; "0" or a missing 6th argument (which would
# otherwise make the `if` condition NA and abort) runs on all genes.
if (is.na(args[6]) || args[6] == "0") {
  run_singleCellNet(args[1], args[2], args[3], args[4])
} else {
  run_singleCellNet(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}
88 | 


--------------------------------------------------------------------------------
/Snakemake/evaluate.R:
--------------------------------------------------------------------------------
 1 | args <- commandArgs(TRUE)
 2 | 
 3 | TrueLabelsPath <- args[1]
 4 | PredLabelsPath <- args[2]
 5 | OutputDir <- args[3]
 6 | ToolName <- args[4]
 7 | 
 8 | evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){
 9 |   "
10 |   Script to evaluate the performance of the classifier.
11 |   It returns multiple evaluation measures: the confusion matrix, median F1-score, F1-score for each class, accuracy, percentage of unlabeled, population size. 
12 |   
13 |   The percentage of unlabeled cells is find by checking for cells that are labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand', or 'ambiguous'.
14 |   
15 |   Parameters
16 |   ----------
17 |   TrueLabelsPath: csv file with the true labels (format: one column, no index)
18 |   PredLabelsPath: csv file with the predicted labels (format: one column, no index)
19 |   Indices: which part of the csv file should be read (e.g. if more datasets are tested at the same time) (format: c(begin, end))
20 |   
21 |   Returns
22 |   -------
23 |   Conf: confusion matrix
24 |   MedF1 : median F1-score
25 |   F1 : F1-score per class
26 |   Acc : accuracy
27 |   PercUnl : percentage of unlabeled cells
28 |   PopSize : number of cells per cell type
29 |   "
30 |   
31 |   true_lab <- unlist(read.csv(TrueLabelsPath))
32 |   pred_lab <- unlist(read.csv(PredLabelsPath))
33 |   
34 |   if (! is.null(Indices)){
35 |     true_lab <- true_lab[Indices]
36 |     pred_lab <- pred_lab[Indices]
37 |   }
38 |   
39 |   unique_true <- unlist(unique(true_lab))
40 |   unique_pred <- unlist(unique(pred_lab))
41 |   
42 |   unique_all <- unique(c(unique_true,unique_pred))
43 |   conf <- table(true_lab,pred_lab)
44 |   pop_size <- rowSums(conf)
45 |   
46 |   pred_lab = gsub('Node..','Node',pred_lab)
47 |   
48 |   conf_F1 <- table(true_lab,pred_lab,exclude = c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown'))
49 | 
50 |   F1 <- vector()
51 |   sum_acc <- 0
52 |   
53 |   for (i in c(1:length(unique_true))){
54 |     findLabel = colnames(conf_F1) == row.names(conf_F1)[i]
55 |     if(sum(findLabel)){
56 |       prec <- conf_F1[i,findLabel] / colSums(conf_F1)[findLabel]
57 |       rec <- conf_F1[i,findLabel] / rowSums(conf_F1)[i]
58 |       if (prec == 0 || rec == 0){
59 |         F1[i] = 0
60 |       } else{
61 |         F1[i] <- (2*prec*rec) / (prec + rec)
62 |       }
63 |       sum_acc <- sum_acc + conf_F1[i,findLabel]
64 |     } else {
65 |       F1[i] = 0
66 |     }
67 |   }
68 |   
69 |   pop_size <- pop_size[pop_size > 0]
70 |   
71 |   names(F1) <- names(pop_size)
72 |   
73 |   med_F1 <- median(F1)
74 |   
75 |   total <- length(pred_lab)
76 |   num_unlab <- sum(pred_lab == 'unassigned') + sum(pred_lab == 'Unassigned') + sum(pred_lab == 'rand') + sum(pred_lab == 'Unknown') + sum(pred_lab == 'unknown') + sum(pred_lab == 'Node') + sum(pred_lab == 'ambiguous')
77 |   per_unlab <- num_unlab / total
78 |   
79 |   acc <- sum_acc/sum(conf_F1)
80 |   
81 |   result <- list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size)
82 |   
83 |   return(result)
84 | }
85 | 
# Evaluate the predictions and export each measure to
# <OutputDir>/<measure directory>/<ToolName>.csv.
results <- evaluate(TrueLabelsPath, PredLabelsPath)
csv_name <- paste0(ToolName, ".csv")

# Sub-directory name -> element of `results` written into it.
exports <- c(Confusion = "Conf", F1 = "F1", PopSize = "PopSize")
for (subdir in names(exports)) {
  write.csv(results[[exports[[subdir]]]], file.path(OutputDir, subdir, csv_name))
}

# The scalar measures share one summary table.
df <- data.frame(results[c("MedF1", "Acc", "PercUnl")])
write.csv(df, file.path(OutputDir, "Summary", csv_name))
92 | 


--------------------------------------------------------------------------------
/Snakemake/example.config.yml:
--------------------------------------------------------------------------------
 1 | output_dir: output
 2 | datafile: input/data.csv
 3 | labfile: input/Labels.csv
 4 | column: 1
 5 | number_of_features: 0
 6 | tools_to_run:
 7 |   - Cell_BLAST
 8 |   - scVI
 9 |   - scmapcell 
10 | 


--------------------------------------------------------------------------------
/Snakemake/rank_gene_dropouts.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from sys import argv
 3 | from pathlib import Path
 4 | 
 5 | import rpy2.robjects as robjects
 6 | import numpy as np
 7 | import pandas as pd
 8 | from sklearn import linear_model
 9 | 
10 | 
def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
    '''
    Rank the genes in every cross-validation training set by dropout level.

    For each fold a linear regression of log2(dropout percentage) on the mean
    log2 expression is fitted, and genes are ordered by decreasing residual.
    Genes with zero mean or zero dropout cannot enter the regression and are
    appended at the end of the ranking. The ranking (0-based gene column
    positions, one column per fold) is written to "rank_genes_dropouts.csv"
    in OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as the first column.
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype='int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    # Convert fold by fold: folds usually differ in length, and NumPy >= 1.24
    # refuses to build a single array from such ragged input.
    train_ind = [np.array(fold, dtype='int') for fold in robjects.r['Train_Idx']]

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    data = data.iloc[tokeep]
    data = np.log2(data + 1)

    # One column of ranked gene positions per fold.
    genes = np.zeros((data.shape[1], nfolds), dtype=int)

    for i in range(nfolds):
        # R indices are 1-based.
        train = data.iloc[train_ind[i] - 1]
        # Refer to genes by column position from here on.
        train.columns = np.arange(len(train.columns))

        # rank genes training set
        dropout = (train == 0).sum(axis=0)
        dropout = (dropout / train.shape[0]) * 100
        mean = train.mean(axis=0)

        # Only genes with non-zero mean and non-zero dropout have a finite
        # log2(dropout) and can be used in the regression.
        usable = (np.array(mean) > 0) & (np.array(dropout) > 0)
        notzero = np.where(usable)[0]
        zero = np.where(~usable)[0]
        notzero_genes = train.columns[notzero]
        zero_genes = train.columns[zero]

        dropout = np.log2(np.array(dropout.iloc[notzero])).reshape(-1, 1)
        mean = np.array(mean.iloc[notzero]).reshape(-1, 1)
        reg = linear_model.LinearRegression()
        reg.fit(mean, dropout)

        residuals = dropout - reg.predict(mean)
        residuals = pd.Series(np.array(residuals).ravel(), index=notzero_genes)
        residuals = residuals.sort_values(ascending=False)
        sorted_genes = residuals.index.append(zero_genes)

        genes[:, i] = sorted_genes.values

    genes = pd.DataFrame(genes)

    genes.to_csv(str(Path(OutputDir) / "rank_genes_dropouts.csv"), index=False)


rank_gene_dropouts(argv[1], argv[2], argv[3])
75 | 


--------------------------------------------------------------------------------
/Snakemake/rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tabdelaal/scRNAseq_Benchmark/553869b632f490d6fccc0af012e9ec5d698c17d6/Snakemake/rulegraph.png


--------------------------------------------------------------------------------
/evaluate.R:
--------------------------------------------------------------------------------
 1 | evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){
 2 |   "
 3 |   Script to evaluate the performance of the classifier.
 4 |   It returns multiple evaluation measures: the confusion matrix, median F1-score, F1-score for each class, accuracy, percentage of unlabeled, population size. 
 5 |   
 6 |   The percentage of unlabeled cells is find by checking for cells that are labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand', or 'ambiguous'.
 7 |   
 8 |   Parameters
 9 |   ----------
10 |   TrueLabelsPath: csv file with the true labels (format: one column, no index)
11 |   PredLabelsPath: csv file with the predicted labels (format: one column, no index)
12 |   Indices: which part of the csv file should be read (e.g. if more datasets are tested at the same time) (format: c(begin, end))
13 |   
14 |   Returns
15 |   -------
16 |   Conf: confusion matrix
17 |   MedF1 : median F1-score
18 |   F1 : F1-score per class
19 |   Acc : accuracy
20 |   PercUnl : percentage of unlabeled cells
21 |   PopSize : number of cells per cell type
22 |   "
23 |   
24 |   true_lab <- unlist(read.csv(TrueLabelsPath))
25 |   pred_lab <- unlist(read.csv(PredLabelsPath))
26 |   
27 |   if (! is.null(Indices)){
28 |     true_lab <- true_lab[Indices]
29 |     pred_lab <- pred_lab[Indices]
30 |   }
31 |   
32 |   unique_true <- unlist(unique(true_lab))
33 |   unique_pred <- unlist(unique(pred_lab))
34 |   
35 |   unique_all <- unique(c(unique_true,unique_pred))
36 |   conf <- table(true_lab,pred_lab)
37 |   pop_size <- rowSums(conf)
38 |   
39 |   pred_lab = gsub('Node..','Node',pred_lab)
40 |   
41 |   conf_F1 <- table(true_lab,pred_lab,exclude = c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown'))
42 | 
43 |   F1 <- vector()
44 |   sum_acc <- 0
45 |   
46 |   for (i in c(1:length(unique_true))){
47 |     findLabel = colnames(conf_F1) == row.names(conf_F1)[i]
48 |     if(sum(findLabel)){
49 |       prec <- conf_F1[i,findLabel] / colSums(conf_F1)[findLabel]
50 |       rec <- conf_F1[i,findLabel] / rowSums(conf_F1)[i]
51 |       if (prec == 0 || rec == 0){
52 |         F1[i] = 0
53 |       } else{
54 |         F1[i] <- (2*prec*rec) / (prec + rec)
55 |       }
56 |       sum_acc <- sum_acc + conf_F1[i,findLabel]
57 |     } else {
58 |       F1[i] = 0
59 |     }
60 |   }
61 |   
62 |   pop_size <- pop_size[pop_size > 0]
63 |   
64 |   names(F1) <- names(pop_size)
65 |   
66 |   med_F1 <- median(F1)
67 |   
68 |   total <- length(pred_lab)
69 |   num_unlab <- sum(pred_lab == 'unassigned') + sum(pred_lab == 'Unassigned') + sum(pred_lab == 'rand') + sum(pred_lab == 'Unknown') + sum(pred_lab == 'unknown') + sum(pred_lab == 'Node') + sum(pred_lab == 'ambiguous')
70 |   per_unlab <- num_unlab / total
71 |   
72 |   acc <- sum_acc/sum(conf_F1)
73 |   
74 |   result <- list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size)
75 |   
76 |   return(result)
77 | }
78 | 


--------------------------------------------------------------------------------
/rank_gene_dropouts.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import rpy2.robjects as robjects
 3 | import numpy as np
 4 | import pandas as pd
 5 | from sklearn import linear_model
 6 | 
 7 | 
 8 | def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
 9 |     '''
10 |     Script to rank the genes in the training set of the inputfile based on their dropout level.
11 |     This rank is written to a file.
12 |     
13 |     Parameters 
14 |     ----------
15 |     DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
16 |     CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
17 |     OutputDir : Output directory defining the path of the exported file.
18 |     '''
19 |         
20 |     # read the Rdata file
21 |     robjects.r['load'](CV_RDataPath)
22 | 
23 |     nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
24 |     tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
25 |     train_ind = np.array(robjects.r['Train_Idx'])
26 |     
27 |     # read the data
28 |     data = pd.read_csv(DataPath,index_col=0,sep=',')
29 |     data = data.iloc[tokeep]
30 |     data = np.log2(data+1)
31 |     
32 |     genes = np.zeros([np.shape(data)[1],np.squeeze(nfolds)], dtype = '>U10')
33 |         
34 |     for i in range(np.squeeze(nfolds)):
35 |         train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
36 |         train=data.iloc[train_ind_i]
37 |         train.columns = np.arange(len(train.columns))
38 |             
39 |         # rank genes training set 
40 |         dropout = (train == 0).sum(axis='rows')
41 |         dropout = (dropout / train.shape[0]) * 100
42 |         mean = train.mean(axis='rows')
43 |             
44 |         notzero = np.where((np.array(mean) > 0) & (np.array(dropout) > 0))[0]
45 |         zero = np.where(~((np.array(mean) > 0) & (np.array(dropout) > 0)))[0]
46 |         train_notzero = train.iloc[:,notzero]
47 |         train_zero = train.iloc[:,zero]
48 |         zero_genes = train_zero.columns
49 |             
50 |         dropout = dropout.iloc[notzero]
51 |         mean = mean.iloc[notzero]
52 |     
53 |         dropout = np.log2(np.array(dropout)).reshape(-1,1)
54 |         mean = np.array(mean).reshape(-1,1)
55 |         reg = linear_model.LinearRegression()
56 |         reg.fit(mean,dropout)
57 |     
58 |         residuals = dropout - reg.predict(mean)
59 |         residuals = pd.Series(np.array(residuals).ravel(),index=train_notzero.columns)
60 |         residuals = residuals.sort_values(ascending=False)
61 |         sorted_genes = residuals.index
62 |         sorted_genes = sorted_genes.append(zero_genes)
63 |             
64 |         genes[:,i] = sorted_genes.values
65 |             
66 |     
67 |     genes = pd.DataFrame(genes)
68 |     
69 |     os.chdir(OutputDir)
70 |     genes.to_csv("rank_genes_dropouts.csv", index = False)
71 | 
72 |         
73 | 
74 | 


--------------------------------------------------------------------------------