Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
  # Cross_Validation
  # Builds train and test indices for 5 folds stratified across the unique
  # cell populations and saves them to 'CV_folds.RData' inside OutputDir.
  # That file is the input expected by the classifier wrapper scripts.
  #
  # Cell populations with 10 or fewer cells are filtered out first: only
  # populations with MORE than 10 cells are kept. All indices refer to the
  # filtered label vector, i.e. to Labels[Cells_to_Keep].
  #
  # Parameters
  # ----------
  # LabelsPath : Cell population annotations file path (.csv).
  # col_Index  : column index (integer) defining which level of annotation
  #              to use, in case of multiple cell type annotations
  #              (default is 1).
  # OutputDir  : Output directory in which 'CV_folds.RData' is written.
  #
  # Saved objects
  # -------------
  # n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep

  Labels <- as.matrix(read.csv(LabelsPath))
  Labels <- as.vector(Labels[, col_Index])

  # Drop every cell that belongs to a population with 10 or fewer cells.
  Class_Sizes <- table(Labels)
  Removed_classes <- names(Class_Sizes)[Class_Sizes <= 10]
  Cells_to_Keep <- !(Labels %in% Removed_classes)
  Labels <- Labels[Cells_to_Keep]

  # Getting training and testing Folds
  library(rBayesianOptimization)
  n_folds <- 5
  Folds <- KFold(Labels, nfolds = n_folds, stratified = TRUE)

  # Folds are consumed in reverse order: iteration i tests on fold
  # (n_folds - i + 1) and trains on all the remaining folds.
  Test_Folds <- n_folds:1
  Train_Idx <- list()
  Test_Idx <- list()
  for (i in seq_along(Folds)) {
    Train_Idx[i] <- list(unlist(Folds[-Test_Folds[i]]))
    Test_Idx[i] <- Folds[Test_Folds[i]]
  }

  # Write straight to the target path instead of calling setwd(), so the
  # caller's working directory is left untouched.
  save(n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep,
       file = file.path(OutputDir, 'CV_folds.RData'))
}
the rows are the top20 marker genes 5 | # This output can be rewritten to the format of the prior-knowledge-supervised classifiers and afterwards be used to classify the test set. 6 | 7 | # Data: genes X cells (rows = genes, columns = cells) 8 | # Labels: labels of the data 9 | # Normalize: the input for MAST should be cpm normalized data, 10 | # if the data is not normalized yet, this should be set to TRUE 11 | # LogTransform: the input for MAST should be logtransformed, 12 | # if the data is not logtransformed yet, this should be set to TRUE 13 | 14 | 15 | library(Seurat) 16 | 17 | if(Normalize) 18 | { 19 | Data <- apply(Data, 2, function(x) (x/sum(x))*1000000) 20 | } 21 | 22 | if(LogTransform) 23 | { 24 | Data <- log(Data+1, base = 2) 25 | } 26 | SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes") 27 | SeuObj <- SetIdent(SeuObj, ident.use = Labels) 28 | DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST") 29 | Markers <- matrix(nrow = 20,ncol = length(unique(Labels))) 30 | colnames(Markers) <- unique(Labels) 31 | for (i in unique(Labels)){ 32 | i 33 | TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))] 34 | MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i] 35 | print(MarkerGenes[1:20]) 36 | if (length(TempList) >= 20){ 37 | Markers[,i] <- TempList[1:20] 38 | } 39 | else{ 40 | if(length(TempList) > 0){ 41 | Markers[c(1:length(TempList)),i] <- TempList 42 | } 43 | } 44 | } 45 | return(Markers) 46 | } 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 tabdelaal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, 
def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0,
               ACTINNDir = "/home/nfs/lcmmichielsen/classifiers/ACTINN"):
    '''
    run ACTINN
    Wrapper script to run ACTINN on a benchmark dataset with the cross validation folds
    stored in CV_RDataPath, outputs lists of true and predicted cell labels as csv files,
    as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (not used).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    ACTINNDir : Directory containing actinn_format.py and actinn_predict.py,
    default is the location that was previously hard-coded in this wrapper.

    Raises
    ------
    subprocess.CalledProcessError : if one of the ACTINN scripts exits with a non-zero status.
    '''
    # Local import: the rest of the file does not need subprocess.
    import subprocess

    # Resolve the script paths before os.chdir() so a relative ACTINNDir still works.
    format_script = os.path.abspath(os.path.join(ACTINNDir, "actinn_format.py"))
    predict_script = os.path.abspath(os.path.join(ACTINNDir, "actinn_predict.py"))

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1   # R column index is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # folder with results (also where the intermediate ACTINN files are written)
    os.chdir(OutputDir)

    tot=[]
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]

        # the ACTINN scripts are given genes x cells matrices
        train = train.transpose()
        test = test.transpose()

        train.to_csv("train.csv")
        test.to_csv("test.csv")
        y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
        y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')

        # NOTE(review): fixed pause kept from the original; presumably it lets a
        # network filesystem settle before the files are read - confirm it is still needed.
        tm.sleep(60)

        # check=True: abort instead of silently continuing after a failed conversion
        subprocess.run(["python", format_script, "-i", "train.csv", "-o", "train", "-f", "csv"], check = True)
        subprocess.run(["python", format_script, "-i", "test.csv", "-o", "test", "-f", "csv"], check = True)

        # Remove the previous fold's output so a stale file can never be read as this fold's result.
        if os.path.exists('predicted_label.txt'):
            os.remove('predicted_label.txt')

        start = tm.time()
        subprocess.run(["python", predict_script, "-trs", "train.h5", "-trl", "train_lab.csv", "-ts", "test.h5"], check = True)
        tot.append(tm.time()-start)

        tm.sleep(60)

        truelab.extend(y_test.values)
        predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1])
        pred.extend(predlabels.values)


    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tot_time = pd.DataFrame(tot)

    if (NumGenes == 0):
        truelab.to_csv("ACTINN_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_Total_Time.csv", index = False)
    else:
        truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False)
13 | OutputDir : Output directory defining the path of the exported file. 14 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 15 | defining the genes order for each cross validation fold, default is NULL. 16 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 17 | " 18 | 19 | Data <- read.csv(DataPath,row.names = 1) 20 | Labels <- as.matrix(read.csv(LabelsPath)) 21 | load(CV_RDataPath) 22 | Labels <- as.vector(Labels[,col_Index]) 23 | Data <- Data[Cells_to_Keep,] 24 | Labels <- Labels[Cells_to_Keep] 25 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 26 | GenesOrder = read.csv(GeneOrderPath) 27 | } 28 | 29 | ############################################################################# 30 | # CHETAH # 31 | ############################################################################# 32 | library(CHETAH) 33 | library(SingleCellExperiment) 34 | True_Labels_CHETAH <- list() 35 | Pred_Labels_CHETAH <- list() 36 | Total_Time_CHETAH <- list() 37 | Data = t(as.matrix(Data)) 38 | 39 | for (i in c(1:n_folds)){ 40 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 41 | sce <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 42 | colData = data.frame(celltypes = Labels[Train_Idx[[i]]])) 43 | 44 | sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 45 | colData = data.frame(celltypes = Labels[Test_Idx[[i]]])) 46 | start_time <- Sys.time() 47 | sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes) 48 | end_time <- Sys.time() 49 | } 50 | else{ 51 | sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]), 52 | colData = data.frame(celltypes = Labels[Train_Idx[[i]]])) 53 | 54 | sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]), 55 | colData = data.frame(celltypes = Labels[Test_Idx[[i]]])) 56 | start_time <- 
def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with the cross validation folds
    stored in CV_RDataPath, outputs lists of true and predicted cell labels as csv files,
    as well as training and testing time per fold.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (not used).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1   # R column index is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # read the data and keep only the cells retained by Cross_Validation
    data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)

    # read the labels once, as a plain string array (the former extra pandas read
    # of the same file was overwritten before use and has been dropped)
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)
    labels = labels[tokeep]

    # folder with results; the DIRECTi models below are also written relative to it
    os.chdir(OutputDir)

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data[train_ind_i,:]
        test=data[test_ind_i,:]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train[:,feat_to_use]
            test = test[:,feat_to_use]


        # attach the training labels so they can be transferred by annotate() below
        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: fit four DIRECTi models that differ only in random seed
        num_epoch = 50
        models = []

        for j in range(4):
            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))

        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time()-start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time()-start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    #write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if (NumGenes == 0):
        truelab.to_csv("Cell_BLAST_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("Cell_BLAST_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Testing_Time.csv", index = False)
def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run DigitalCellSorter
    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    Only Cells_to_Keep and col_Index are read from it; no folds are used.
    GeneListPath : Data file path to the gene list.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (not used).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1   # R column index is 1-based

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    data = data.iloc[tokeep]

    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    truelab = truelab.iloc[tokeep]


    # read the feature file (only the first column of the gene order is used here)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        data = data.iloc[:,feat_to_use]

    # DigitalCellSorter is given a genes x cells matrix
    data = data.transpose()

    # NOTE(review): the number of clusters is fixed at 8 regardless of the dataset;
    # the original author flagged this with a question mark - confirm per dataset.
    n_clusters = 8
    AvailableCPUsCount = 1
    N_samples_for_distribution = 10000

    start = tm.time()
    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang',
                                                         saveDir = OutputDir,
                                                         geneListFileName = GeneListPath,
                                                         N_samples_for_distribution = N_samples_for_distribution,
                                                         AvailableCPUsCount = AvailableCPUsCount,
                                                         clusterIndex=None,
                                                         clusterName=None,
                                                         n_clusters=n_clusters)
    runtime = tm.time() - start

    os.chdir(OutputDir)

    # one row per cluster; column 11 of the voting sheet holds the name assigned to it
    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])

    # Map each cell's cluster index to the cluster's assigned name. An object array
    # is used so names are stored in full: the former fixed-width '>U10' dtype cut
    # every name off after 10 characters.
    pred = np.asarray(pred)
    prediction = np.full(pred.shape, '', dtype = object)

    for i in range(len(results)):
        prediction[pred == i] = str(results.values[i, 0])

    prediction = pd.DataFrame(prediction)

    if (NumGenes == 0):
        truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
16 | Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE) 17 | " 18 | # load needed libraries 19 | library(garnett) 20 | 21 | if (Human) { 22 | library(org.Hs.eg.db) 23 | } else { 24 | library(org.Mm.eg.db) 25 | } 26 | 27 | # load data, genes, and marker file 28 | load(CV_RDataPath) 29 | 30 | load(ClassifierPath) 31 | 32 | labels <- as.matrix(read.csv(LabelsPath)) 33 | labels <- labels[Cells_to_Keep] 34 | 35 | mat <- read.table(DataPath, sep = ",") 36 | data <- mat[-1,-1] 37 | data <- data[Cells_to_Keep,] 38 | data <- t(data) #ensure that the genes are rows, and the cells are columns 39 | 40 | barcodes <- mat[-1,1] 41 | 42 | pdata = data.frame(barcodes) 43 | fdata <- read.table(GenesPath) 44 | names(fdata) <- 'gene_short_name' 45 | row.names(fdata) <- fdata$gene_short_name 46 | 47 | row.names(data) <- row.names(fdata) 48 | colnames(data) <- row.names(pdata) 49 | 50 | pd <- new("AnnotatedDataFrame", data = pdata) 51 | fd <- new("AnnotatedDataFrame", data = fdata) 52 | pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"), 53 | phenoData = pd, 54 | featureData = fd) 55 | 56 | start_time <- Sys.time() 57 | 58 | pbmc_cds <- estimateSizeFactors(pbmc_cds) 59 | 60 | if (Human){ 61 | pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") 62 | } else { 63 | pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") 64 | } 65 | 66 | end_time <- Sys.time() 67 | 68 | test_time <- as.numeric(end_time - start_time) 69 | 70 | setwd(OutputDir) 71 | 72 | write.table(pData(pbmc_cds)$cluster_ext_type, file = "Garnett_Pred_Labels.csv", append = FALSE, quote = TRUE, sep = "\t", 73 | eol = "\n", na = "NA", dec = ".", row.names = FALSE, 74 | qmethod = c("escape", "double"), 75 | fileEncoding = "") 76 | 77 | write.csv(labels,"Garnett_Pretrained_True_Labels.csv", row.names = FALSE) 78 | 79 | 
write.csv(test_time,'Garnett_Pretrained_Testing_Time.csv',row.names = FALSE) 80 | 81 | 82 | 83 | } -------------------------------------------------------------------------------- /Scripts/run_LDA.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import time as tm 5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 6 | import rpy2.robjects as robjects 7 | 8 | 9 | def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 10 | ''' 11 | run baseline classifier: LDA 12 | Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, 13 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 14 | 15 | Parameters 16 | ---------- 17 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 18 | as row names and gene names as column names. 19 | LabelsPath : Cell population annotations file path (.csv). 20 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 21 | OutputDir : Output directory defining the path of the exported file. 22 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 23 | defining the genes order for each cross validation fold, default is NULL. 24 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 
25 | ''' 26 | 27 | # read the Rdata file 28 | robjects.r['load'](CV_RDataPath) 29 | 30 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 31 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 32 | col = np.array(robjects.r['col_Index'], dtype = 'int') 33 | col = col - 1 34 | test_ind = np.array(robjects.r['Test_Idx']) 35 | train_ind = np.array(robjects.r['Train_Idx']) 36 | 37 | # read the data 38 | data = pd.read_csv(DataPath,index_col=0,sep=',') 39 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 40 | 41 | labels = labels.iloc[tokeep] 42 | data = data.iloc[tokeep] 43 | 44 | # read the feature file 45 | if (NumGenes > 0): 46 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 47 | 48 | # folder with results 49 | os.chdir(OutputDir) 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = LinearDiscriminantAnalysis() 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = pd.DataFrame(pred) 88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | if (NumGenes == 0): 93 | truelab.to_csv("LDA_True_Labels.csv", index = False) 94 | 
pred.to_csv("LDA_Pred_Labels.csv", index = False) 95 | tr_time.to_csv("LDA_Training_Time.csv", index = False) 96 | ts_time.to_csv("LDA_Testing_Time.csv", index = False) 97 | else: 98 | truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False) 99 | pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False) 100 | tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False) 101 | ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False) 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /Scripts/run_LDA_rejection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import time as tm 5 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 6 | import rpy2.robjects as robjects 7 | 8 | 9 | def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): 10 | ''' 11 | run baseline classifier: LDA 12 | Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, 13 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 14 | 15 | Parameters 16 | ---------- 17 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 18 | as row names and gene names as column names. 19 | LabelsPath : Cell population annotations file path (.csv). 20 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 21 | OutputDir : Output directory defining the path of the exported file. 22 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 23 | defining the genes order for each cross validation fold, default is NULL. 24 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 
25 | Threshold : Threshold used when rejecting the genes, default is 0.7. 26 | ''' 27 | 28 | # read the Rdata file 29 | robjects.r['load'](CV_RDataPath) 30 | 31 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 32 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 33 | col = np.array(robjects.r['col_Index'], dtype = 'int') 34 | col = col - 1 35 | test_ind = np.array(robjects.r['Test_Idx']) 36 | train_ind = np.array(robjects.r['Train_Idx']) 37 | 38 | # read the data 39 | data = pd.read_csv(DataPath,index_col=0,sep=',') 40 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 41 | 42 | labels = labels.iloc[tokeep] 43 | data = data.iloc[tokeep] 44 | 45 | # read the feature file 46 | if (NumGenes > 0): 47 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 48 | 49 | # folder with results 50 | os.chdir(OutputDir) 51 | 52 | # normalize data 53 | data = np.log1p(data) 54 | 55 | Classifier = LinearDiscriminantAnalysis() 56 | 57 | tr_time=[] 58 | ts_time=[] 59 | truelab = [] 60 | pred = [] 61 | 62 | for i in range(np.squeeze(nfolds)): 63 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 64 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 65 | 66 | train=data.iloc[train_ind_i] 67 | test=data.iloc[test_ind_i] 68 | y_train=labels.iloc[train_ind_i] 69 | y_test=labels.iloc[test_ind_i] 70 | 71 | if (NumGenes > 0): 72 | feat_to_use = features.iloc[0:NumGenes,i] 73 | train = train.iloc[:,feat_to_use] 74 | test = test.iloc[:,feat_to_use] 75 | 76 | start=tm.time() 77 | Classifier.fit(train, y_train) 78 | tr_time.append(tm.time()-start) 79 | 80 | start=tm.time() 81 | predicted = Classifier.predict(test) 82 | prob = np.max(Classifier.predict_proba(test), axis = 1) 83 | unlabeled = np.where(prob < Threshold) 84 | predicted[unlabeled] = 'Unknown' 85 | ts_time.append(tm.time()-start) 86 | 87 | truelab.extend(y_test.values) 88 | pred.extend(predicted) 89 | 90 | truelab = pd.DataFrame(truelab) 91 | pred 
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import NearestCentroid
import rpy2.robjects as robjects


def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: NMC
    Wrapper script to run a NMC classifier (sklearn NearestCentroid) on a benchmark dataset
    with cross validation, outputs lists of true and predicted cell labels as csv files,
    as well as computation time.

    The process working directory is left untouched: all outputs are written
    directly into OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    The objects n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx are read from it.
    OutputDir : Existing output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Raises
    ------
    FileNotFoundError : if OutputDir is not an existing directory.
    '''

    # fail before any training is done if the results cannot be written
    if not os.path.isdir(OutputDir):
        raise FileNotFoundError("Output directory does not exist: " + str(OutputDir))

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R indices are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Convert the folds one by one: folds usually differ in size, and building a
    # single array from ragged sequences raises a ValueError on NumPy >= 1.24.
    test_ind = [np.array(fold, dtype = 'int') - 1 for fold in robjects.r['Test_Idx']]
    train_ind = [np.array(fold, dtype = 'int') - 1 for fold in robjects.r['Train_Idx']]

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)

    # keep the selected cells; labels become a 1-D series (first selected column)
    labels = labels.iloc[tokeep].iloc[:, 0]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # normalize data
    data = np.log1p(data)

    Classifier = NearestCentroid()

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(nfolds):
        train = data.iloc[train_ind[i]]
        test = data.iloc[test_ind[i]]
        # 1-D targets: scikit-learn expects y of shape (n_samples,)
        y_train = labels.iloc[train_ind[i]].values
        y_test = labels.iloc[test_ind[i]].values

        if (NumGenes > 0):
            # column i of the gene order file holds the gene (column) positions for fold i
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # file names carry the number of genes only when NumGenes is non-zero
    if (NumGenes == 0):
        prefix = "NMC_"
    else:
        prefix = "NMC_" + str(NumGenes) + "_"

    truelab.to_csv(os.path.join(OutputDir, prefix + "True_Labels.csv"), index = False)
    pred.to_csv(os.path.join(OutputDir, prefix + "Pred_Labels.csv"), index = False)
    tr_time.to_csv(os.path.join(OutputDir, prefix + "Training_Time.csv"), index = False)
    ts_time.to_csv(os.path.join(OutputDir, prefix + "Testing_Time.csv"), index = False)
22 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 23 | defining the genes order for each cross validation fold, default is NULL. 24 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 25 | ''' 26 | 27 | # read the Rdata file 28 | robjects.r['load'](CV_RDataPath) 29 | 30 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 31 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 32 | col = np.array(robjects.r['col_Index'], dtype = 'int') 33 | col = col - 1 34 | test_ind = np.array(robjects.r['Test_Idx']) 35 | train_ind = np.array(robjects.r['Train_Idx']) 36 | 37 | # read the data 38 | data = pd.read_csv(DataPath,index_col=0,sep=',') 39 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 40 | 41 | labels = labels.iloc[tokeep] 42 | data = data.iloc[tokeep] 43 | 44 | # read the feature file 45 | if (NumGenes > 0): 46 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 47 | 48 | # folder with results 49 | os.chdir(OutputDir) 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = RandomForestClassifier(n_estimators = 50) 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 
run_SCINA <- function(DataPath, LabelsPath, GeneSigPath, OutputDir) {
  "
  run SCINA
  Wrapper script to run SCINA on a benchmark dataset,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Only cells annotated as 'CD14+ Monocyte', 'CD19+ B' or 'CD56+ NK' are classified;
  their labels are exported as 'CD14_Monocyte', 'CD19_B' and 'CD56_NK'.
  The working directory is not changed: outputs are written directly into OutputDir.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv), one label per row of the data.
  GeneSigPath : Cell type marker genes file path (.csv)
  OutputDir : Existing output directory defining the path of the exported file.
  "

  library(SCINA)
  library(preprocessCore)

  # Fail before the (potentially long) SCINA run if the results cannot be saved.
  if (!dir.exists(OutputDir)) {
    stop("Output directory does not exist: ", OutputDir, call. = FALSE)
  }

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.vector(as.matrix(read.csv(LabelsPath)))
  # A mask of the wrong length would silently select the wrong (or NA) rows.
  if (length(Labels) != nrow(Data)) {
    stop("Number of labels (", length(Labels), ") does not match number of cells (",
         nrow(Data), ")", call. = FALSE)
  }

  # Original annotation -> name used in the exported true labels.
  renamed_populations <- c(
    "CD14+ Monocyte" = "CD14_Monocyte",
    "CD19+ B" = "CD19_B",
    "CD56+ NK" = "CD56_NK"
  )
  keep <- Labels %in% names(renamed_populations)
  Data <- Data[keep, ]
  Labels <- unname(renamed_populations[Labels[keep]])

  #############################################################################
  #                                 SCINA                                     #
  #############################################################################
  Signature_Genes <- preprocess.signatures(GeneSigPath)

  # genes x cells, log-transformed, then quantile-normalized;
  # assigning through Data[] keeps the row and column names
  Data <- t(as.matrix(Data))
  Data <- log(Data + 1)
  Data[] <- normalize.quantiles(Data)

  # only the SCINA call itself is timed
  start_time <- Sys.time()
  results <- SCINA(Data, Signature_Genes)
  end_time <- Sys.time()

  True_Labels_SCINA <- Labels
  Pred_Labels_SCINA <- results$cell_labels
  Total_Time_SCINA <- as.numeric(difftime(end_time, start_time, units = "secs"))

  write.csv(True_Labels_SCINA, file.path(OutputDir, "SCINA_True_Labels.csv"), row.names = FALSE)
  write.csv(Pred_Labels_SCINA, file.path(OutputDir, "SCINA_Pred_Labels.csv"), row.names = FALSE)
  write.csv(Total_Time_SCINA, file.path(OutputDir, "SCINA_Total_Time.csv"), row.names = FALSE)
}
classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, 13 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 14 | 15 | Parameters 16 | ---------- 17 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 18 | as row names and gene names as column names. 19 | LabelsPath : Cell population annotations file path (.csv). 20 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 21 | OutputDir : Output directory defining the path of the exported file. 22 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 23 | defining the genes order for each cross validation fold, default is NULL. 24 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 25 | ''' 26 | 27 | # read the Rdata file 28 | robjects.r['load'](CV_RDataPath) 29 | 30 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 31 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 32 | col = np.array(robjects.r['col_Index'], dtype = 'int') 33 | col = col - 1 34 | test_ind = np.array(robjects.r['Test_Idx']) 35 | train_ind = np.array(robjects.r['Train_Idx']) 36 | 37 | # read the data 38 | data = pd.read_csv(DataPath,index_col=0,sep=',') 39 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 40 | 41 | labels = labels.iloc[tokeep] 42 | data = data.iloc[tokeep] 43 | 44 | # read the feature file 45 | if (NumGenes > 0): 46 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 47 | 48 | # folder with results 49 | os.chdir(OutputDir) 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = LinearSVC() 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV


def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: SVM with rejection
    Wrapper script to run an SVM classifier with a linear kernel (LinearSVC wrapped in
    CalibratedClassifierCV) on a benchmark dataset with cross validation. Test cells whose
    highest class probability is below Threshold are labeled 'Unknown'.
    Outputs lists of true and predicted cell labels as csv files, as well as computation time.

    The process working directory is left untouched: all outputs are written
    directly into OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    The objects n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx are read from it.
    OutputDir : Existing output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Threshold used when rejecting the cells, default is 0.7.

    Raises
    ------
    FileNotFoundError : if OutputDir is not an existing directory.
    '''
    # NOTE(review): the output files use the plain "SVM_" prefix, so results written
    # by another SVM run into the same OutputDir would be overwritten -- confirm intended.

    # fail before any training is done if the results cannot be written
    if not os.path.isdir(OutputDir):
        raise FileNotFoundError("Output directory does not exist: " + str(OutputDir))

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R indices are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Convert the folds one by one: folds usually differ in size, and building a
    # single array from ragged sequences raises a ValueError on NumPy >= 1.24.
    test_ind = [np.array(fold, dtype = 'int') - 1 for fold in robjects.r['Test_Idx']]
    train_ind = [np.array(fold, dtype = 'int') - 1 for fold in robjects.r['Train_Idx']]

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)

    # keep the selected cells; labels become a 1-D series (first selected column)
    labels = labels.iloc[tokeep].iloc[:, 0]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # normalize data
    data = np.log1p(data)

    # the calibration wrapper provides predict_proba, which LinearSVC lacks
    Classifier = LinearSVC()
    clf = CalibratedClassifierCV(Classifier)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(nfolds):
        train = data.iloc[train_ind[i]]
        test = data.iloc[test_ind[i]]
        # 1-D targets: scikit-learn expects y of shape (n_samples,)
        y_train = labels.iloc[train_ind[i]].values
        y_test = labels.iloc[test_ind[i]].values

        if (NumGenes > 0):
            # column i of the gene order file holds the gene (column) positions for fold i
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        clf.fit(train, y_train)
        tr_time.append(tm.time() - start)

        start = tm.time()
        # object dtype: writing 'Unknown' must neither be truncated by a fixed-width
        # string dtype nor fail when the labels are numeric
        predicted = np.asarray(clf.predict(test), dtype = object)
        prob = np.max(clf.predict_proba(test), axis = 1)
        predicted[prob < Threshold] = 'Unknown'
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # file names carry the number of genes only when NumGenes is non-zero
    if (NumGenes == 0):
        prefix = "SVM_"
    else:
        prefix = "SVM_" + str(NumGenes) + "_"

    truelab.to_csv(os.path.join(OutputDir, prefix + "True_Labels.csv"), index = False)
    pred.to_csv(os.path.join(OutputDir, prefix + "Pred_Labels.csv"), index = False)
    tr_time.to_csv(os.path.join(OutputDir, prefix + "Training_Time.csv"), index = False)
    ts_time.to_csv(os.path.join(OutputDir, prefix + "Testing_Time.csv"), index = False)
predicted cell labels as csv files, as well as computation time. 6 | 7 | Parameters 8 | ---------- 9 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 10 | as row names and gene names as column names. 11 | LabelsPath : Cell population annotations file path (.csv). 12 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 13 | OutputDir : Output directory defining the path of the exported file. 14 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 15 | defining the genes order for each cross validation fold, default is NULL. 16 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 17 | " 18 | 19 | Data <- read.csv(DataPath,row.names = 1) 20 | Labels <- as.matrix(read.csv(LabelsPath)) 21 | load(CV_RDataPath) 22 | Labels <- as.vector(Labels[,col_Index]) 23 | Data <- Data[Cells_to_Keep,] 24 | Labels <- Labels[Cells_to_Keep] 25 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 26 | GenesOrder = read.csv(GeneOrderPath) 27 | } 28 | 29 | ############################################################################# 30 | # SingleR # 31 | ############################################################################# 32 | library(SingleR) 33 | library(Seurat) 34 | True_Labels_SingleR <- list() 35 | Pred_Labels_SingleR <- list() 36 | Total_Time_SingleR <- list() 37 | Data = t(as.matrix(Data)) 38 | 39 | for (i in c(1:n_folds)){ 40 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 41 | start_time <- Sys.time() 42 | singler = SingleR(method = "single", Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], 43 | Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], 44 | Labels[Train_Idx[[i]]], numCores = 1) 45 | end_time <- Sys.time() 46 | } 47 | else{ 48 | start_time <- Sys.time() 49 | singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1) 50 | end_time 
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects


def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifiers: kNN
    Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    The process working directory is left untouched: all outputs are written
    directly into OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    The objects n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx are read from it.
    OutputDir : Existing output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Raises
    ------
    FileNotFoundError : if OutputDir is not an existing directory.
    '''

    # fail before any training is done if the results cannot be written
    if not os.path.isdir(OutputDir):
        raise FileNotFoundError("Output directory does not exist: " + str(OutputDir))

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R indices are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Convert the folds one by one: folds usually differ in size, and building a
    # single array from ragged sequences raises a ValueError on NumPy >= 1.24.
    test_ind = [np.array(fold, dtype = 'int') - 1 for fold in robjects.r['Test_Idx']]
    train_ind = [np.array(fold, dtype = 'int') - 1 for fold in robjects.r['Train_Idx']]

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)

    # keep the selected cells; labels become a 1-D series (first selected column)
    labels = labels.iloc[tokeep].iloc[:, 0]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # normalize data
    data = np.log1p(data)

    Classifier = KNeighborsClassifier(n_neighbors=50)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(nfolds):
        train = data.iloc[train_ind[i]]
        test = data.iloc[test_ind[i]]
        # 1-D targets: scikit-learn expects y of shape (n_samples,)
        y_train = labels.iloc[train_ind[i]].values
        y_test = labels.iloc[test_ind[i]].values

        if (NumGenes > 0):
            # column i of the gene order file holds the gene (column) positions for fold i
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # file names carry the number of genes only when NumGenes is non-zero
    if (NumGenes == 0):
        prefix = "kNN50_"
    else:
        prefix = "kNN50_" + str(NumGenes) + "_"

    truelab.to_csv(os.path.join(OutputDir, prefix + "True_Labels.csv"), index = False)
    pred.to_csv(os.path.join(OutputDir, prefix + "Pred_Labels.csv"), index = False)
    tr_time.to_csv(os.path.join(OutputDir, prefix + "Training_Time.csv"), index = False)
    ts_time.to_csv(os.path.join(OutputDir, prefix + "Testing_Time.csv"), index = False)
14 | 15 | Parameters 16 | ---------- 17 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 18 | as row names and gene names as column names. 19 | LabelsPath : Cell population annotations file path (.csv). 20 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 21 | OutputDir : Output directory defining the path of the exported file. 22 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 23 | defining the genes order for each cross validation fold, default is NULL. 24 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 25 | ''' 26 | 27 | # read the Rdata file 28 | robjects.r['load'](CV_RDataPath) 29 | 30 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 31 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 32 | col = np.array(robjects.r['col_Index'], dtype = 'int') 33 | col = col - 1 34 | test_ind = np.array(robjects.r['Test_Idx']) 35 | train_ind = np.array(robjects.r['Train_Idx']) 36 | 37 | # read the data 38 | data = pd.read_csv(DataPath,index_col=0,sep=',') 39 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 40 | 41 | labels = labels.iloc[tokeep] 42 | data = data.iloc[tokeep] 43 | 44 | # read the feature file 45 | if (NumGenes > 0): 46 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 47 | 48 | # folder with results 49 | os.chdir(OutputDir) 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = KNeighborsClassifier(n_neighbors=9) 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if 
import os
import pandas as pd
import numpy as np
from moana.core import ExpMatrix
from moana.classify import CellTypeClassifier
import time as tm
import rpy2.robjects as robjects

def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run moana
    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    No cross validation is performed: the classifier is loaded from ClassifierPath and
    applied once to all cells whose label is one of the seven populations in ct_old.
    The process working directory is changed to OutputDir before the outputs are written.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    ClassifierPath : Data file path to the pretrained classifier.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection;
    only its first column is used, and only when NumGenes > 0. Default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    # Disabled code: cell/column selection from a cross validation RData file.
    # The function has no CV_RDataPath parameter, so this cannot be re-enabled as is.
#    # read the Rdata file
#    robjects.r['load'](CV_RDataPath)
#
#    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
#    col = np.array(robjects.r['col_Index'], dtype = 'int')
#    col = col - 1

    # the data file is comma separated, hence the explicit separator
    matrix = ExpMatrix.read_tsv(DataPath, sep = ',')
#    matrix = matrix.iloc[tokeep]

    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')
#    truelab = truelab.iloc[tokeep]

    # ct_old[i] is the annotation found in the labels file, ct_new[i] the name it is
    # replaced with (presumably the names the pretrained classifier predicts -- TODO confirm)
    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']

    # keep only the cells annotated with one of the populations in ct_old
    # NOTE(review): tokeep2 has the same 2-D shape as truelab; how a DataFrame is
    # indexed by a 2-D boolean array depends on the pandas version -- verify that
    # rows are dropped here rather than masked.
    tokeep2 = np.isin(truelab,ct_old)
    truelab = truelab[tokeep2]
    print(len(truelab))
    # same selection on the expression matrix, with the mask squeezed to 1-D
    matrix = matrix.iloc[np.squeeze(tokeep2)]

    # rename the kept labels from the ct_old to the ct_new naming
    for i in range(len(ct_old)):
        truelab.iloc[truelab == ct_old[i]] = ct_new[i]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        # always the first column of the gene order file (there are no folds here)
        feat_to_use = features.iloc[0:NumGenes,0]
        matrix = matrix.iloc[:,feat_to_use]

    # Rebuild the matrix transposed, swapping the gene and cell labels
    # (presumably because read_tsv treated the rows, i.e. the cells, as genes -- TODO confirm).
    data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle(ClassifierPath)

    # only the prediction call is timed
    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    # NOTE(review): the result of this call is discarded, so it has no effect.
    np.asarray(predictions)

    pred = pd.DataFrame(predictions)

    # folder with results; the relative file names below resolve against it
    os.chdir(OutputDir)

    # file names carry the number of genes only when NumGenes is non-zero
    if (NumGenes == 0):
        truelab.to_csv("moana_True_Labels.csv", index = False)
        pred.to_csv("moana_Pred_Labels.csv", index = False)
        with open("moana_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
14 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 15 | defining the genes order for each cross validation fold, default is NULL. 16 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 17 | " 18 | 19 | Data <- read.csv(DataPath,row.names = 1) 20 | Labels <- as.matrix(read.csv(LabelsPath)) 21 | load(CV_RDataPath) 22 | Labels <- as.vector(Labels[,col_Index]) 23 | Data <- Data[Cells_to_Keep,] 24 | Labels <- Labels[Cells_to_Keep] 25 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 26 | GenesOrder = read.csv(GeneOrderPath) 27 | } 28 | 29 | ############################################################################# 30 | # scID # 31 | ############################################################################# 32 | library(scID) 33 | library(Seurat) 34 | True_Labels_scID <- list() 35 | Pred_Labels_scID <- list() 36 | Total_Time_scID <- list() 37 | Data = t(as.matrix(Data)) 38 | 39 | for (i in c(1:n_folds)){ 40 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 41 | Train_Labels <- list(Labels[Train_Idx[[i]]]) 42 | names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]) 43 | start_time <- Sys.time() 44 | scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], 45 | Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], 46 | Train_Labels[[1]]) 47 | end_time <- Sys.time() 48 | } 49 | else{ 50 | Train_Labels <- list(Labels[Train_Idx[[i]]]) 51 | names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]]) 52 | start_time <- Sys.time() 53 | scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]]) 54 | end_time <- Sys.time() 55 | } 56 | Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 57 | 58 | True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]]) 59 | Pred_Labels_scID[i] <- list(as.vector(scID_output$labels)) 60 | } 61 | True_Labels_scID <- 
as.vector(unlist(True_Labels_scID)) 62 | Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID)) 63 | Total_Time_scID <- as.vector(unlist(Total_Time_scID)) 64 | 65 | setwd(OutputDir) 66 | 67 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 68 | write.csv(True_Labels_scID,paste('scID_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) 69 | write.csv(Pred_Labels_scID,paste('scID_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) 70 | write.csv(Total_Time_scID,paste('scID_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE) 71 | } 72 | else{ 73 | write.csv(True_Labels_scID,'scID_True_Labels.csv',row.names = FALSE) 74 | write.csv(Pred_Labels_scID,'scID_Pred_Labels.csv',row.names = FALSE) 75 | write.csv(Total_Time_scID,'scID_Total_Time.csv',row.names = FALSE) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /Scripts/run_scPred.R: -------------------------------------------------------------------------------- 1 | run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 2 | " 3 | run scPred 4 | Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation, 5 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 6 | 7 | Parameters 8 | ---------- 9 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 10 | as row names and gene names as column names. 11 | LabelsPath : Cell population annotations file path (.csv). 12 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 13 | OutputDir : Output directory defining the path of the exported file. 14 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 15 | defining the genes order for each cross validation fold, default is NULL. 16 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 
17 | " 18 | 19 | Data <- read.csv(DataPath,row.names = 1) 20 | Labels <- as.matrix(read.csv(LabelsPath)) 21 | load(CV_RDataPath) 22 | Labels <- as.vector(Labels[,col_Index]) 23 | Data <- Data[Cells_to_Keep,] 24 | Labels <- Labels[Cells_to_Keep] 25 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 26 | GenesOrder = read.csv(GeneOrderPath) 27 | } 28 | 29 | ############################################################################# 30 | # scPred # 31 | ############################################################################# 32 | library(scPred) 33 | library(tidyverse) 34 | library(SingleCellExperiment) 35 | True_Labels_scPred <- list() 36 | Pred_Labels_scPred <- list() 37 | Training_Time_scPred <- list() 38 | Testing_Time_scPred <- list() 39 | Data = t(as.matrix(Data)) 40 | 41 | for (i in c(1:n_folds)){ 42 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 43 | sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 44 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 45 | sce_counts <- normcounts(sce) 46 | sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) 47 | sce_metadata <- as.data.frame(colData(sce)) 48 | 49 | sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 50 | colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 51 | sce_counts_test <- normcounts(sce_test) 52 | sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) 53 | sce_metadata_test <- as.data.frame(colData(sce_test)) 54 | } 55 | else{ 56 | sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 57 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 58 | sce_counts <- normcounts(sce) 59 | sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) 60 | sce_metadata <- as.data.frame(colData(sce)) 61 | 62 | sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 63 | colData = 
data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 64 | sce_counts_test <- normcounts(sce_test) 65 | sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) 66 | sce_metadata_test <- as.data.frame(colData(sce_test)) 67 | } 68 | 69 | 70 | # scPred Training 71 | start_time <- Sys.time() 72 | set.seed(1234) 73 | scp <- eigenDecompose(sce_cpm) 74 | scPred::metadata(scp) <- sce_metadata 75 | scp <- getFeatureSpace(scp, pVar = 'cell_type1') 76 | # plotEigen(scp, group = 'cell_type1') 77 | scp <- trainModel(scp) 78 | # plotTrainProbs(scp) 79 | end_time <- Sys.time() 80 | Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 81 | 82 | # scPred Prediction 83 | start_time <- Sys.time() 84 | scp <- scPredict(scp,newData = sce_cpm_test) 85 | end_time <- Sys.time() 86 | Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 87 | 88 | True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]]) 89 | Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass) 90 | } 91 | True_Labels_scPred <- as.vector(unlist(True_Labels_scPred)) 92 | Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred)) 93 | Training_Time_scPred <- as.vector(unlist(Training_Time_scPred)) 94 | Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred)) 95 | 96 | setwd(OutputDir) 97 | 98 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 99 | write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) 100 | write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) 101 | write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) 102 | write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) 103 | } 104 | else{ 105 | write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE) 106 | write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = 
FALSE) 107 | write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE) 108 | write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /Scripts/run_scVI.py: -------------------------------------------------------------------------------- 1 | from scvi.dataset import CsvDataset 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | from scvi.models import SCANVI 6 | from scvi.inference import SemiSupervisedTrainer 7 | import time as tm 8 | import rpy2.robjects as robjects 9 | 10 | def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 11 | ''' 12 | run scVI 13 | Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation, 14 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 15 | 16 | Parameters 17 | ---------- 18 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 19 | as row names and gene names as column names. 20 | LabelsPath : Cell population annotations file path (.csv). 21 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 22 | OutputDir : Output directory defining the path of the exported file. 23 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 24 | defining the genes order for each cross validation fold, default is NULL. 25 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 
26 | ''' 27 | 28 | # read the Rdata file 29 | robjects.r['load'](CV_RDataPath) 30 | 31 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 32 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 33 | col = np.array(robjects.r['col_Index'], dtype = 'int') 34 | col = col - 1 35 | test_ind = np.array(robjects.r['Test_Idx']) 36 | train_ind = np.array(robjects.r['Train_Idx']) 37 | 38 | # read the data 39 | data = pd.read_csv(DataPath,index_col=0,sep=',') 40 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 41 | 42 | labels = labels.iloc[tokeep] 43 | data = data.iloc[tokeep] 44 | 45 | # read the feature file 46 | if (NumGenes > 0): 47 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 48 | 49 | os.chdir(OutputDir) 50 | 51 | if (NumGenes == 0): 52 | #save labels as csv file with header and index column 53 | labels.to_csv('Labels_scvi.csv') 54 | data.to_csv('Data_scvi.csv') 55 | 56 | train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False) 57 | 58 | ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing 59 | scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) 60 | trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) 61 | 62 | n_epochs = 200 63 | 64 | truelab = [] 65 | pred = [] 66 | tr_time = [] 67 | ts_time = [] 68 | 69 | for i in range(np.squeeze(nfolds)): 70 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 71 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 72 | 73 | if (NumGenes > 0): 74 | feat_to_use = features.iloc[0:NumGenes,i] 75 | data2 = data.iloc[:,feat_to_use] 76 | 77 | labels.to_csv('Labels_scvi.csv') 78 | data2.to_csv('Data_scvi.csv') 79 | 80 | train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False) 81 | 82 | ## this semisupervised trainer 
automatically uses a part of the input data for training and a part for testing 83 | scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) 84 | trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) 85 | 86 | trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False) 87 | trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy'] 88 | trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False) 89 | trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy'] 90 | 91 | start = tm.time() 92 | trainer_scanvi.train(n_epochs) 93 | tr_time.append(tm.time()-start) 94 | 95 | ## labels of test set are in y_pred 96 | ## labels are returned in numbers, should be mapped back to the real labels 97 | ## indices are permutated 98 | start = tm.time() 99 | y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions() 100 | ts_time.append(tm.time()-start) 101 | 102 | truelab.extend(y_true) 103 | pred.extend(y_pred) 104 | 105 | #write results 106 | 107 | truelab = pd.DataFrame(truelab) 108 | pred = pd.DataFrame(pred) 109 | 110 | tr_time = pd.DataFrame(tr_time) 111 | ts_time = pd.DataFrame(ts_time) 112 | 113 | 114 | if (NumGenes == 0): 115 | truelab.to_csv("scVI_True_Labels.csv", index = False) 116 | pred.to_csv("scVI_Pred_Labels.csv", index = False) 117 | tr_time.to_csv("scVI_Training_Time.csv", index = False) 118 | ts_time.to_csv("scVI_Testing_Time.csv", index = False) 119 | else: 120 | truelab.to_csv("scVI_" + str(NumGenes) + "_True_Labels.csv", index = False) 121 | pred.to_csv("scVI_" + str(NumGenes) + "_Pred_Labels.csv", index = False) 122 | tr_time.to_csv("scVI_" + str(NumGenes) + "_Training_Time.csv", index = False) 123 | ts_time.to_csv("scVI_" + str(NumGenes) + "_Testing_Time.csv", index = False) 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /Scripts/run_singleCellNet.R: 
-------------------------------------------------------------------------------- 1 | run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 2 | " 3 | run singleCellNet 4 | Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation, 5 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 6 | 7 | Parameters 8 | ---------- 9 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 10 | as row names and gene names as column names. 11 | LabelsPath : Cell population annotations file path (.csv). 12 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 13 | OutputDir : Output directory defining the path of the exported file. 14 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 15 | defining the genes order for each cross validation fold, default is NULL. 16 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 
17 | " 18 | 19 | Data <- read.csv(DataPath,row.names = 1) 20 | colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE) 21 | Labels <- as.matrix(read.csv(LabelsPath)) 22 | load(CV_RDataPath) 23 | Labels <- as.vector(Labels[,col_Index]) 24 | Data <- Data[Cells_to_Keep,] 25 | Labels <- Labels[Cells_to_Keep] 26 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 27 | GenesOrder = read.csv(GeneOrderPath) 28 | } 29 | 30 | ############################################################################# 31 | # singleCellNet # 32 | ############################################################################# 33 | library(singleCellNet) 34 | library(dplyr) 35 | True_Labels_singleCellNet <- list() 36 | Pred_Labels_singleCellNet <- list() 37 | Training_Time_singleCellNet <- list() 38 | Testing_Time_singleCellNet <- list() 39 | Data = t(as.matrix(Data)) # deals also with sparse matrix 40 | 41 | for(i in c(1:n_folds)){ 42 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 43 | DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]] 44 | DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]] 45 | } 46 | else{ 47 | DataTrain <- Data[,Train_Idx[[i]]] 48 | DataTest <- Data[,Test_Idx[[i]]] 49 | } 50 | 51 | start_time <- Sys.time() 52 | cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation") 53 | cgenesA<-cgenes2[['cgenes']] 54 | grps<-cgenes2[['grps']] 55 | DataTrain<-as.matrix(DataTrain[cgenesA,]) 56 | xpairs<-ptGetTop(DataTrain, grps, ncores = 1) 57 | pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs) 58 | rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps) 59 | end_time <- Sys.time() 60 | Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 61 | 62 | start_time <- Sys.time() 63 | DataTest<-query_transform(DataTest[cgenesA,], xpairs) 64 | classRes <-rf_classPredict(rf, DataTest) 65 | end_time <- Sys.time() 66 | Testing_Time_singleCellNet[i] 
<- as.numeric(difftime(end_time,start_time,units = 'secs')) 67 | 68 | True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]]) 69 | Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])]) 70 | } 71 | True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet)) 72 | Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet)) 73 | Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet)) 74 | Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet)) 75 | 76 | setwd(OutputDir) 77 | 78 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 79 | write.csv(True_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) 80 | write.csv(Pred_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) 81 | write.csv(Training_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) 82 | write.csv(Testing_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) 83 | } 84 | else{ 85 | write.csv(True_Labels_singleCellNet,'singleCellNet_True_Labels.csv',row.names = FALSE) 86 | write.csv(Pred_Labels_singleCellNet,'singleCellNet_Pred_Labels.csv',row.names = FALSE) 87 | write.csv(Training_Time_singleCellNet,'singleCellNet_Training_Time.csv',row.names = FALSE) 88 | write.csv(Testing_Time_singleCellNet,'singleCellNet_Testing_Time.csv',row.names = FALSE) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /Snakemake/Cross_Validation.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | 3 | Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){ 4 | " 5 | Cross_Validation 6 | Function returns train and test indices for 5 folds stratified across unique cell populations, 7 | 
also filter out cell populations with less than 10 cells. 8 | It return a 'CV_folds.RData' file which then used as input to classifiers wrappers. 9 | 10 | Parameters 11 | ---------- 12 | LabelsPath : Cell population annotations file path (.csv). 13 | col_Index : column index (integer) defining which level of annotation to use, 14 | in case of multiple cell type annotations (default is 1) 15 | OutputDir : Output directory defining the path of the exported file. 16 | " 17 | 18 | Labels <- as.matrix(read.csv(LabelsPath)) 19 | Labels <- as.vector(Labels[,col_Index]) 20 | 21 | Removed_classes <- !(table(Labels) > 10) 22 | Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes])) 23 | Labels <- Labels[Cells_to_Keep] 24 | 25 | # Getting training and testing Folds 26 | library(rBayesianOptimization) 27 | n_folds = 5 28 | Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE) 29 | Test_Folds <- c(n_folds:1) 30 | Train_Idx <- list() 31 | Test_Idx <- list() 32 | for (i in c(1:length(Folds))){ 33 | Temp_Folds <- Folds 34 | Temp_Folds[Test_Folds[i]] <- NULL 35 | Train_Idx[i] <- list(unlist(Temp_Folds)) 36 | Test_Idx[i] <- Folds[Test_Folds[i]] 37 | } 38 | remove(Temp_Folds,i,Folds) 39 | save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = paste0(OutputDir, '/CV_folds.RData')) 40 | } 41 | 42 | Cross_Validation(args[1], as.numeric(args[2]), args[3]) 43 | -------------------------------------------------------------------------------- /Snakemake/DEgenesMAST.R: -------------------------------------------------------------------------------- 1 | DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){ 2 | # This functions applies a differential expression test to the data using one vs all 3 | # The training data should be used a an input 4 | # The output is a matrix with marker genes where the columns are the cell populations and the rows are the top20 marker genes 5 | # This output can be rewritten to the format of the 
prior-knowledge-supervised classifiers and afterwards be used to classify the test set. 6 | 7 | # Data: genes X cells (rows = genes, columns = cells) 8 | # Labels: labels of the data 9 | # Normalize: the input for MAST should be cpm normalized data, 10 | # if the data is not normalized yet, this should be set to TRUE 11 | # LogTransform: the input for MAST should be logtransformed, 12 | # if the data is not logtransformed yet, this should be set to TRUE 13 | 14 | 15 | library(Seurat) 16 | 17 | if(Normalize) 18 | { 19 | Data <- apply(Data, 2, function(x) (x/sum(x))*1000000) 20 | } 21 | 22 | if(LogTransform) 23 | { 24 | Data <- log(Data+1, base = 2) 25 | } 26 | SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes") 27 | SeuObj <- SetIdent(SeuObj, ident.use = Labels) 28 | DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST") 29 | Markers <- matrix(nrow = 20,ncol = length(unique(Labels))) 30 | colnames(Markers) <- unique(Labels) 31 | for (i in unique(Labels)){ 32 | i 33 | TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))] 34 | MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i] 35 | print(MarkerGenes[1:20]) 36 | if (length(TempList) >= 20){ 37 | Markers[,i] <- TempList[1:20] 38 | } 39 | else{ 40 | if(length(TempList) > 0){ 41 | Markers[c(1:length(TempList)),i] <- TempList 42 | } 43 | } 44 | } 45 | return(Markers) 46 | } 47 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/baseline/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:9.9-slim 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | 
apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | # Install python 16 | RUN apt-get update && \ 17 | apt-get install --no-install-recommends --yes python3 python3-pip && \ 18 | pip3 --no-cache-dir install setuptools && \ 19 | pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | COPY Scripts/run_kNN50.py \ 23 | Scripts/run_kNN9.py \ 24 | Scripts/run_LDA.py \ 25 | Scripts/run_LDA_rejection.py \ 26 | Scripts/run_NMC.py \ 27 | Scripts/run_RF.py \ 28 | Scripts/run_SVM.py \ 29 | Scripts/run_SVM_rejection.py \ 30 | rank_gene_dropouts.py \ 31 | /Scripts/ 32 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/cell_blast/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim-stretch 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | # Install python and pip deps 16 | RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \ 17 | pip3 --no-cache-dir install --upgrade pip && \ 18 | pip3 --no-cache-dir install --upgrade setuptools && \ 19 | pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow 
Cell-BLAST && \ 20 | apt-get autoremove --yes && \ 21 | apt-get clean && \ 22 | rm -rf /var/lib/apt/lists/* 23 | 24 | COPY Scripts/run_Cell_BLAST.py /Scripts/ 25 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/chetah/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:9.9-slim 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | COPY Scripts/run_CHETAH.R \ 16 | Dockerfiles/chetah/install_packages.R \ 17 | /Scripts/ 18 | 19 | # Install R packages 20 | RUN apt-get update && \ 21 | apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 22 | Rscript --vanilla /Scripts/install_packages.R && \ 23 | apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 24 | apt-get autoremove --yes && \ 25 | apt-get clean && \ 26 | rm -rf /var/lib/apt/lists/* 27 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/chetah/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("devtools", repos="https://cloud.r-project.org/") 3 | install.packages("BiocManager", repos="https://cloud.r-project.org/") 4 | BiocManager::install(c("bioDist", "ggplot2", "gplots", "cowplot", 5 | "dendextend", 
"corrplot", "reshape2", "plotly")) 6 | devtools::install_github("jdekanter/CHETAH", ref="b777e6f671bff3c434842adb655869a52bc9e368") 7 | }, 8 | warning = function(w) stop(w)) 9 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/cross_validation/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:9.9-slim 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | COPY Cross_Validation.R \ 16 | Dockerfiles/cross_validation/install_packages.R \ 17 | /Scripts/ 18 | 19 | # Install R packages 20 | RUN apt-get update && \ 21 | apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \ 22 | Rscript --vanilla /Scripts/install_packages.R && \ 23 | apt-get purge --yes make gcc g++ libxml2-dev && \ 24 | apt-get autoremove --yes && \ 25 | apt-get clean && \ 26 | rm -rf /var/lib/apt/lists/* 27 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/cross_validation/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("lhs", repos="https://cloud.r-project.org/") 3 | install.packages("rBayesianOptimization", repos="https://cloud.r-project.org/") 4 | }, 5 | warning = function(w) stop(w)) 6 | 
-------------------------------------------------------------------------------- /Snakemake/Dockerfiles/garnett/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:9.9-slim 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | COPY Scripts/run_Garnett_CV.R \ 16 | Scripts/run_Garnett_Pretrained.R \ 17 | Dockerfiles/garnett/install_packages.R \ 18 | /Scripts/ 19 | 20 | # Install R packages 21 | RUN apt-get update && \ 22 | apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \ 23 | Rscript --vanilla /Scripts/install_packages.R && \ 24 | apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \ 25 | apt-get autoremove --yes && \ 26 | apt-get clean && \ 27 | rm -rf /var/lib/apt/lists/* 28 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/garnett/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("BiocManager", repos="https://cloud.r-project.org/") 3 | BiocManager::install(c("monocle", "DelayedArray", "DelayedMatrixStats", 4 | "org.Hs.eg.db", "org.Mm.eg.db")) 5 | install.packages("devtools", repos="https://cloud.r-project.org/") 6 | devtools::install_github("cole-trapnell-lab/garnett", 
ref="9804b532bbcc1714b3ed0b718cf430741f1dba6c") 7 | }, 8 | warning = function(w) stop(w)) 9 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/scid/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM r-base:3.6.0 2 | 3 | COPY Scripts/run_scID.R \ 4 | Dockerfiles/scid/install_packages.R \ 5 | /Scripts/ 6 | 7 | # Install R packages 8 | RUN apt-get update && \ 9 | apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 10 | Rscript --vanilla /Scripts/install_packages.R && \ 11 | apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 12 | apt-get autoremove --yes && \ 13 | apt-get clean && \ 14 | rm -rf /var/lib/apt/lists/* 15 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/scid/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("BiocManager", repos="https://cloud.r-project.org/") 3 | BiocManager::install(ask = FALSE); 4 | BiocManager::install(c("scater", "MAST")) 5 | install.packages("devtools", repos="https://cloud.r-project.org/") 6 | devtools::install_github("satijalab/seurat") 7 | devtools::install_github("BatadaLab/scID") 8 | }, 9 | warning = function(w) stop(w)) 10 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/scmap/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM r-base:3.6.0 2 | 3 | COPY Scripts/run_scmapcell.R \ 4 | Scripts/run_scmapcluster.R \ 5 | Dockerfiles/scmap/install_packages.R \ 6 | /Scripts/ 7 | 8 | # Install R packages 9 | RUN apt-get update && \ 10 | apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 11 | Rscript 
--vanilla /Scripts/install_packages.R && \ 12 | apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 13 | apt-get autoremove --yes && \ 14 | apt-get clean && \ 15 | rm -rf /var/lib/apt/lists/* 16 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/scmap/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("BiocManager", repos="https://cloud.r-project.org/") 3 | BiocManager::install(ask = FALSE) 4 | BiocManager::install("SingleCellExperiment") 5 | install.packages("devtools", repos="https://cloud.r-project.org/") 6 | devtools::install_github("hemberg-lab/scmap") 7 | }, 8 | warning = function(w) stop(w)) 9 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/scvi/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim-stretch 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | # Install python and pip deps 16 | RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \ 17 | pip3 --no-cache-dir install --upgrade pip && \ 18 | pip3 --no-cache-dir install --upgrade setuptools && \ 19 | pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels 
tensorflow scvi && \ 20 | apt-get autoremove --yes && \ 21 | apt-get clean && \ 22 | rm -rf /var/lib/apt/lists/* 23 | 24 | 25 | COPY Scripts/run_scVI.py /Scripts/ 26 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/singlecellnet/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:9.9-slim 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | COPY Scripts/run_singleCellNet.R \ 16 | Dockerfiles/singlecellnet/install_packages.R \ 17 | /Scripts/ 18 | 19 | # Install R packages 20 | RUN apt-get update && \ 21 | apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \ 22 | Rscript --vanilla /Scripts/install_packages.R && \ 23 | apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \ 24 | apt-get autoremove --yes && \ 25 | apt-get clean && \ 26 | rm -rf /var/lib/apt/lists/* 27 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/singlecellnet/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("devtools", repos="https://cloud.r-project.org/") 3 | install.packages("BiocManager", repos="https://cloud.r-project.org/") 4 | 
BiocManager::install("fgsea") 5 | devtools::install_github("thomasp85/patchwork", ref="fd7958bae3e7a1e30237c751952e412a0a1d1242") 6 | devtools::install_github("pcahan1/singleCellNet", ref="4279a68112743b783cc82628421dd703261ec117") 7 | }, 8 | warning = function(w) stop(w)) 9 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/singler/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:9.9-slim 2 | 3 | # Install newest R version 4 | RUN apt-get update && \ 5 | apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ 6 | wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ 7 | echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ 8 | apt-get update && \ 9 | apt-get install --no-install-recommends --yes r-base && \ 10 | apt-get purge --yes wget gnupg apt-transport-https && \ 11 | apt-get autoremove --yes && \ 12 | apt-get clean && \ 13 | rm -rf /var/lib/apt/lists/* 14 | 15 | COPY Scripts/run_SingleR.R \ 16 | Dockerfiles/singler/install_packages.R \ 17 | /Scripts/ 18 | 19 | RUN apt-get update && \ 20 | apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \ 21 | Rscript --vanilla /Scripts/install_packages.R && \ 22 | apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ 23 | apt-get autoremove --yes && \ 24 | apt-get clean && \ 25 | rm -rf /var/lib/apt/lists/* 26 | -------------------------------------------------------------------------------- /Snakemake/Dockerfiles/singler/install_packages.R: -------------------------------------------------------------------------------- 1 | withCallingHandlers({ 2 | install.packages("devtools", repos="https://cloud.r-project.org/") 3 | install.packages("Seurat", repos="https://cloud.r-project.org/") 
4 | devtools::install_github("dviraran/SingleR", ref="db4823b380ba2c3142c857c8c0695200dd1736f6") 5 | }, 6 | warning = function(w) stop(w)) 7 | -------------------------------------------------------------------------------- /Snakemake/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 tabdelaal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Snakemake/README.md: -------------------------------------------------------------------------------- 1 | # scRNAseq_Benchmark 2 | Benchmarking classification tools for scRNA-seq data 3 | 4 | ## How to use 5 | [snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and 6 | [singularity](https://www.sylabs.io/docs/) need to be available on your 7 | system. 
You will need to run this on a linux system, as singularity 8 | only supports linux. 9 | 10 | From the root of this repository: 11 | ``` 12 | snakemake \ 13 | --configfile \ 14 | --use-singularity 15 | ``` 16 | 17 | If your data or output directory is not located under the root of this 18 | repository, be sure to tell snakemake to mount the appropriate directories 19 | in singularity: 20 | ``` 21 | snakemake \ 22 | --configfile \ 23 | --use-singularity \ 24 | --singularity-args '--bind : --bind :' 25 | ``` 26 | 27 | #### The config file 28 | ```YML 29 | output_dir: 30 | datafile: 31 | labfile: 32 | column: 33 | number_of_features: 34 | genes: 35 | human: 36 | tools_to_run: # List of tools to run 37 | - 38 | - 39 | - <...> 40 | ``` 41 | 42 | ##### Tool specific inputs 43 | Some tools require specific inputs. Add the following to your config file when 44 | using one of these tools: 45 | - Garnett_CV 46 | ```YML 47 | Garnett_CV: 48 | markers: 49 | ``` 50 | - Garnett_Pretrained 51 | ```YML 52 | Garnett_Pretrained: 53 | classifier: 54 | ``` 55 | 56 | 57 | 58 | ## Included tools/methods 59 | - kNN50 60 | - kNN9 61 | - LDA 62 | - LDA_rejection (LDA with rejection option) 63 | - NMC 64 | - RF 65 | - SVM 66 | - SVM_rejection (SVM with rejection option) 67 | - [singleCellNet](https://github.com/pcahan1/singleCellNet) 68 | - [CHETAH](https://github.com/jdekanter/CHETAH) 69 | - [scmap](https://github.com/hemberg-lab/scmap) 70 | - scmapcell 71 | - scmapcluster 72 | - [SingleR](https://github.com/dviraran/SingleR) 73 | - [scID](https://github.com/BatadaLab/scID) 74 | - [scVI](https://github.com/YosefLab/scVI) 75 | - [Cell_BLAST](https://github.com/gao-lab/Cell_BLAST) 76 | - [Garnett](https://cole-trapnell-lab.github.io/garnett/) 77 | - Garnett_CV (without pretrained classifier) 78 | - Garnett_Pretrained (with pretrained classifier) 79 | 80 | ## Adding new tools 81 | In order to add a tool to this benchmarking workflow, a rule for this tool 82 | needs to be added to the `Snakefile`. 
This rule should produce as output: 83 | - a table of predicted labels (`/_pred.csv`). 84 | - a table of true labels (`/_true.csv`). 85 | - tables of testing, training and/or total time: 86 | - `//_test_time.csv` 87 | - `//_training_time.csv` 88 | - `//_total_time.csv` 89 | 90 | The input to this rule should be: 91 | - a count table (specified as the `datafile` in the config). 92 | - a true labels file (specified as the `labfile` in the config). 93 | 94 | You will want to write a wrapper script for the tool you want to 95 | add to facilitate this. The `"{output_dir}/CV_folds.RData"` input may be 96 | used to provide your wrapper script with folds for cross_validation. 97 | It is recommended to make a docker image containing all dependencies for both 98 | the tool and any wrappers for the tool. 99 | This wrapper script should also make a selection of the features to be used. 100 | This selection should be based on the ranking, which can be accessed by providing 101 | `feature_ranking` as input to the wrapper script. The number of features to be 102 | used should be configurable and settable through the 'number_of_features' field 103 | in the config. 104 | 105 | The following can be used as a template for new rules. Replace everything 106 | surrounded by (and including the) `<>` with appropriate values. 
107 | ``` 108 | rule SVM: 109 | input: 110 | datafile = config["datafile"], 111 | labfile = config["labfile"], 112 | folds = "{output_dir}/CV_folds.RData", 113 | ranking = feature_ranking 114 | output: 115 | pred = "{output_dir}//_pred.csv", 116 | true = "{output_dir}//_true.csv", 117 | test_time = "{output_dir}//_test_time.csv", 118 | training_time = "{output_dir}//_training_time.csv" 119 | log: "{output_dir}//.log" 120 | params: 121 | n_features = config.get("number_of_features", 0) 122 | singularity: "docker://" 123 | shell: 124 | " " 125 | "{input.datafile} " 126 | "{input.labfile} " 127 | "{input.folds} " 128 | "{wildcards.output_dir}/ " 129 | "{input.ranking} " 130 | "{params.n_features} " 131 | "&> {log}" 132 | ``` 133 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_ACTINN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import time as tm 5 | import rpy2.robjects as robjects 6 | 7 | def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 8 | ''' 9 | run ACTINN 10 | Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation, 11 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 12 | 13 | Parameters 14 | ---------- 15 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 16 | as row names and gene names as column names. 17 | LabelsPath : Cell population annotations file path (.csv). 18 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 19 | OutputDir : Output directory defining the path of the exported file. 20 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 21 | defining the genes order for each cross validation fold, default is NULL. 
22 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 23 | ''' 24 | 25 | # read the Rdata file 26 | robjects.r['load'](CV_RDataPath) 27 | 28 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 29 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 30 | col = np.array(robjects.r['col_Index'], dtype = 'int') 31 | col = col - 1 32 | test_ind = np.array(robjects.r['Test_Idx']) 33 | train_ind = np.array(robjects.r['Train_Idx']) 34 | 35 | # read the data 36 | data = pd.read_csv(DataPath,index_col=0,sep=',') 37 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 38 | 39 | labels = labels.iloc[tokeep] 40 | data = data.iloc[tokeep] 41 | 42 | # read the feature file 43 | if (NumGenes > 0): 44 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 45 | 46 | # folder with results 47 | os.chdir(OutputDir) 48 | 49 | tot=[] 50 | truelab = [] 51 | pred = [] 52 | 53 | for i in range(np.squeeze(nfolds)): 54 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 55 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 56 | 57 | train=data.iloc[train_ind_i] 58 | test=data.iloc[test_ind_i] 59 | y_train=labels.iloc[train_ind_i] 60 | y_test=labels.iloc[test_ind_i] 61 | 62 | if (NumGenes > 0): 63 | feat_to_use = features.iloc[0:NumGenes,i] 64 | train = train.iloc[:,feat_to_use] 65 | test = test.iloc[:,feat_to_use] 66 | 67 | train = train.transpose() 68 | test = test.transpose() 69 | 70 | train.to_csv("train.csv") 71 | test.to_csv("test.csv") 72 | y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t') 73 | y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t') 74 | 75 | tm.sleep(60) 76 | 77 | os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv") 78 | os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv") 79 | 80 | start = tm.time() 81 | 
# Command-line arguments, in order: DataPath, LabelsPath, CV_RDataPath,
# OutputDir, GeneOrderPath, NumGenes ("0" disables feature selection).
args <- commandArgs(TRUE)

#' Run CHETAH on a benchmark dataset with cross-validation
#'
#' Wrapper that trains/predicts with CHETAH on every cross-validation fold and
#' writes the true labels, the predicted labels and the per-fold run time (in
#' seconds) to `CHETAH_true.csv`, `CHETAH_pred.csv` and
#' `CHETAH_total_time.csv` inside `OutputDir`.
#'
#' @param DataPath Data file path (.csv), cells-genes matrix with unique cell
#'   barcodes as row names and gene names as column names.
#' @param LabelsPath Cell population annotations file path (.csv).
#' @param CV_RDataPath Cross-validation RData file path (.RData), obtained
#'   from the Cross_Validation.R function.
#' @param OutputDir Output directory for the exported files.
#' @param GeneOrderPath Gene order file path (.csv) obtained from feature
#'   selection, one column per cross-validation fold. Default `NULL`.
#' @param NumGenes Number of genes to use when feature selection is enabled
#'   (integer). Default `NULL`.
#' @return Called for its side effect of writing the three csv files.
run_CHETAH <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir,
                       GeneOrderPath = NULL, NumGenes = NULL) {
  library(CHETAH)
  library(SingleCellExperiment)

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))

  # Loads the cross-validation objects used below into this function's
  # environment: col_Index, Cells_to_Keep, n_folds, Train_Idx and Test_Idx.
  load(CV_RDataPath)

  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar condition, so use `&&`; evaluated once and reused in every fold.
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  # The assays below are built with genes in rows and cells in columns.
  Data <- t(as.matrix(Data))

  True_Labels_CHETAH <- vector("list", n_folds)
  Pred_Labels_CHETAH <- vector("list", n_folds)
  Total_Time_CHETAH <- numeric(n_folds)

  for (i in seq_len(n_folds)) {
    train_cells <- Train_Idx[[i]]
    test_cells <- Test_Idx[[i]]

    if (use_features) {
      # The stored ranking is shifted by one before indexing
      # (presumably it holds 0-based indices -- TODO confirm with the ranker).
      genes <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      train_counts <- Data[genes, train_cells, drop = FALSE]
      test_counts <- Data[genes, test_cells, drop = FALSE]
    } else {
      train_counts <- Data[, train_cells, drop = FALSE]
      test_counts <- Data[, test_cells, drop = FALSE]
    }

    sce <- SingleCellExperiment(
      assays = list(counts = train_counts),
      colData = data.frame(celltypes = Labels[train_cells])
    )
    sce_test <- SingleCellExperiment(
      assays = list(counts = test_counts),
      colData = data.frame(celltypes = Labels[test_cells])
    )

    # Only the classifier call itself is timed.
    start_time <- Sys.time()
    if (use_features) {
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce,
                                   n_genes = NumGenes)
    } else {
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
    }
    end_time <- Sys.time()

    Total_Time_CHETAH[i] <- as.numeric(
      difftime(end_time, start_time, units = "secs")
    )
    True_Labels_CHETAH[[i]] <- Labels[test_cells]
    Pred_Labels_CHETAH[[i]] <- sce_test$celltype_CHETAH
  }

  True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))
  Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))

  write.csv(True_Labels_CHETAH, paste0(OutputDir, "/CHETAH_true.csv"),
            row.names = FALSE)
  write.csv(Pred_Labels_CHETAH, paste0(OutputDir, "/CHETAH_pred.csv"),
            row.names = FALSE)
  write.csv(Total_Time_CHETAH, paste0(OutputDir, "/CHETAH_total_time.csv"),
            row.names = FALSE)
}

if (length(args) < 4) {
  stop(
    "Usage: run_CHETAH.R <DataPath> <LabelsPath> <CV_RDataPath> <OutputDir> ",
    "[<GeneOrderPath> <NumGenes>]",
    call. = FALSE
  )
}

# A missing sixth argument would make `args[6] == "0"` evaluate to NA and abort
# the `if`; treat it the same as "0", i.e. run without feature selection.
if (length(args) < 6 || args[6] == "0") {
  run_CHETAH(args[1], args[2], args[3], args[4])
} else {
  run_CHETAH(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}
23 | 24 | Parameters 25 | ---------- 26 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 27 | as row names and gene names as column names. 28 | LabelsPath : Cell population annotations file path (.csv). 29 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 30 | OutputDir : Output directory defining the path of the exported file. 31 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 32 | defining the genes order for each cross validation fold, default is NULL. 33 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 34 | ''' 35 | 36 | # read the Rdata file 37 | robjects.r['load'](CV_RDataPath) 38 | 39 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 40 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 41 | col = np.array(robjects.r['col_Index'], dtype = 'int') 42 | col = col - 1 43 | test_ind = np.array(robjects.r['Test_Idx']) 44 | train_ind = np.array(robjects.r['Train_Idx']) 45 | 46 | # read the feature file 47 | if (NumGenes > 0): 48 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 49 | 50 | # read the data and labels 51 | data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize() 52 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 53 | 54 | data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns) 55 | 56 | labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col) 57 | labels = labels[tokeep] 58 | 59 | truelab = [] 60 | pred = [] 61 | tr_time = [] 62 | ts_time = [] 63 | 64 | for i in range(np.squeeze(nfolds)): 65 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 66 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 67 | 68 | train=data[train_ind_i,:] 69 | 
test=data[test_ind_i,:] 70 | y_train = labels[train_ind_i] 71 | y_test = labels[test_ind_i] 72 | 73 | if (NumGenes > 0): 74 | feat_to_use = features.iloc[0:NumGenes,i] 75 | train = train[:,feat_to_use] 76 | test = test[:,feat_to_use] 77 | 78 | 79 | train.obs['cell_type'] = y_train 80 | 81 | start = tm.time() 82 | 83 | # reduce dimensions 84 | num_epoch = 50 85 | models = [] 86 | 87 | for j in range(4): 88 | models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j)) 89 | 90 | # train model 91 | blast = cb.blast.BLAST(models, train).build_empirical() 92 | tr_time.append(tm.time()-start) 93 | 94 | # predict labels 95 | start = tm.time() 96 | test_pred = blast.query(test).annotate('cell_type') 97 | ts_time.append(tm.time()-start) 98 | 99 | truelab.extend(y_test) 100 | pred.extend(test_pred.values) 101 | 102 | #write results 103 | truelab = pd.DataFrame(truelab) 104 | pred = pd.DataFrame(pred) 105 | 106 | tr_time = pd.DataFrame(tr_time) 107 | ts_time = pd.DataFrame(ts_time) 108 | 109 | truelab.to_csv(str(Path(OutputDir+"/Cell_BLAST_true.csv")),index = False) 110 | pred.to_csv(str(Path(OutputDir+"/Cell_BLAST_pred.csv")),index = False) 111 | tr_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_training_time.csv")), index = False) 112 | ts_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_test_time.csv")),index = False) 113 | 114 | 115 | run_Cell_BLAST(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 116 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_DigitalCellSorter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scripts.DigitalCellSorter as DigitalCellSorter 4 | import os 5 | import time as tm 6 | import rpy2.robjects as robjects 7 | 8 | def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 9 | ''' 10 | run 
DigitalCellSorter 11 | Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist, 12 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 13 | 14 | Parameters 15 | ---------- 16 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 17 | as row names and gene names as column names. 18 | LabelsPath : Cell population annotations file path (.csv). 19 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 20 | GeneListPath : Data file path to the genest. 21 | OutputDir : Output directory defining the path of the exported file. 22 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 23 | defining the genes order for each cross validation fold, default is NULL. 24 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 25 | ''' 26 | 27 | # read the Rdata file 28 | robjects.r['load'](CV_RDataPath) 29 | 30 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 31 | col = np.array(robjects.r['col_Index'], dtype = 'int') 32 | col = col - 1 33 | 34 | # read the data 35 | data = pd.read_csv(DataPath,index_col=0,sep=',') 36 | data = data.iloc[tokeep] 37 | 38 | truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 39 | truelab = truelab.iloc[tokeep] 40 | 41 | 42 | # read the feature file 43 | if (NumGenes > 0): 44 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 45 | feat_to_use = features.iloc[0:NumGenes,0] 46 | data = data.iloc[:,feat_to_use] 47 | 48 | data = data.transpose() 49 | 50 | # number of different cell types in the data? 
# Command-line arguments, in order: DataPath, LabelsPath, GenesPath,
# CV_RDataPath, ClassifierPath, OutputDir, Human.
args <- commandArgs(TRUE)

#' Run Garnett with a pretrained classifier on a benchmark dataset
#'
#' Classifies all retained cells with a pretrained Garnett classifier and
#' writes the predicted labels, the true labels and the classification time
#' (in seconds) to `Garnett_Pretrained_pred.csv`,
#' `Garnett_Pretrained_true.csv` and `Garnett_Pretrained_test_time.csv`
#' inside `OutputDir`.
#'
#' @param DataPath Data file path (.csv), cells-genes matrix with unique cell
#'   barcodes as row names and gene names as column names.
#' @param LabelsPath Cell population annotations file path (.csv).
#' @param GenesPath Path to the file with the gene names.
#' @param CV_RDataPath Cross-validation RData file path (.RData), obtained
#'   from the Cross_Validation.R function.
#' @param ClassifierPath Path to the pretrained classifier (.RData).
#' @param OutputDir Output directory for the exported files.
#' @param Human Whether the dataset is human (`TRUE`) or mouse (`FALSE`);
#'   a logical or a string that `as.logical()` understands ("TRUE", "True",
#'   "false", ...).
#' @return Called for its side effect of writing the three output files.
run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath,
                                   CV_RDataPath, ClassifierPath, OutputDir,
                                   Human) {
  # Values coming from the command line are strings; normalise once and fail
  # with a clear message instead of an obscure error inside `if`.
  Human <- as.logical(Human)
  if (length(Human) != 1 || is.na(Human)) {
    stop("`Human` must be TRUE or FALSE.", call. = FALSE)
  }

  # load needed libraries
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }

  # Loads Cells_to_Keep and col_Index into this function's environment.
  load(CV_RDataPath)
  # Expected to provide `hsPBMC` (human) or `mmLung` (mouse), the object names
  # used in the classify_cells() calls below.
  load(ClassifierPath)

  # Pick the annotation column first, then filter cells. Indexing the label
  # matrix directly with Cells_to_Keep would index it linearly and mix columns
  # when the labels file has more than one column.
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[, col_Index])[Cells_to_Keep]

  # First row holds the gene names and first column the barcodes.
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1, -1]
  data <- data[Cells_to_Keep, ]
  data <- t(data) # ensure that the genes are rows, and the cells are columns

  # Filter the barcodes with the same mask as the expression data so that the
  # phenotype table has exactly one row per retained cell.
  barcodes <- mat[-1, 1][Cells_to_Keep]

  pdata <- data.frame(barcodes = barcodes)
  fdata <- read.table(GenesPath)
  names(fdata) <- "gene_short_name"
  row.names(fdata) <- fdata$gene_short_name

  row.names(data) <- row.names(fdata)
  colnames(data) <- row.names(pdata)

  pd <- new("AnnotatedDataFrame", data = pdata)
  fd <- new("AnnotatedDataFrame", data = fdata)
  pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
                             phenoData = pd,
                             featureData = fd)

  start_time <- Sys.time()

  pbmc_cds <- estimateSizeFactors(pbmc_cds)

  if (Human) {
    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db,
                               cluster_extend = TRUE,
                               cds_gene_id_type = "SYMBOL")
  } else {
    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db,
                               cluster_extend = TRUE,
                               cds_gene_id_type = "SYMBOL")
  }

  end_time <- Sys.time()

  # Fix the unit explicitly: subtracting two POSIXct values picks secs, mins
  # or hours automatically, which makes the stored numbers incomparable.
  test_time <- as.numeric(difftime(end_time, start_time, units = "secs"))

  # Tab-separated and quoted, as in the original output format.
  write.table(pData(pbmc_cds)$cluster_ext_type,
              file = paste0(OutputDir, "/Garnett_Pretrained_pred.csv"),
              quote = TRUE, sep = "\t", row.names = FALSE)

  write.csv(labels, paste0(OutputDir, "/Garnett_Pretrained_true.csv"),
            row.names = FALSE)
  write.csv(test_time, paste0(OutputDir, "/Garnett_Pretrained_test_time.csv"),
            row.names = FALSE)
}

run_Garnett_Pretrained(args[1], args[2], args[3], args[4], args[5], args[6],
                       args[7])
25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 28 | ''' 29 | 30 | # read the Rdata file 31 | robjects.r['load'](CV_RDataPath) 32 | 33 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 34 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 35 | col = np.array(robjects.r['col_Index'], dtype = 'int') 36 | col = col - 1 37 | test_ind = np.array(robjects.r['Test_Idx']) 38 | train_ind = np.array(robjects.r['Train_Idx']) 39 | 40 | # read the data 41 | data = pd.read_csv(DataPath,index_col=0,sep=',') 42 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 43 | 44 | labels = labels.iloc[tokeep] 45 | data = data.iloc[tokeep] 46 | 47 | # read the feature file 48 | if (NumGenes > 0): 49 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = LinearDiscriminantAnalysis() 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = pd.DataFrame(pred) 
88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | OutputDir = Path(OutputDir) 93 | truelab.to_csv(str(OutputDir / Path("LDA_true.csv")), 94 | index = False) 95 | pred.to_csv(str(OutputDir / Path("LDA_pred.csv")), 96 | index = False) 97 | tr_time.to_csv(str(OutputDir / Path("LDA_training_time.csv")), 98 | index = False) 99 | ts_time.to_csv(str(OutputDir / Path("LDA_test_time.csv")), 100 | index = False) 101 | 102 | run_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 103 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_LDA_rejection.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 9 | import rpy2.robjects as robjects 10 | 11 | 12 | def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): 13 | ''' 14 | run baseline classifier: LDA 15 | Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, 16 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 17 | 18 | Parameters 19 | ---------- 20 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 21 | as row names and gene names as column names. 22 | LabelsPath : Cell population annotations file path (.csv). 23 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 24 | OutputDir : Output directory defining the path of the exported file. 25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 
27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 28 | Threshold : Threshold used when rejecting the genes, default is 0.7. 29 | ''' 30 | 31 | # read the Rdata file 32 | robjects.r['load'](CV_RDataPath) 33 | 34 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 35 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 36 | col = np.array(robjects.r['col_Index'], dtype = 'int') 37 | col = col - 1 38 | test_ind = np.array(robjects.r['Test_Idx']) 39 | train_ind = np.array(robjects.r['Train_Idx']) 40 | 41 | # read the data 42 | data = pd.read_csv(DataPath,index_col=0,sep=',') 43 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 44 | 45 | labels = labels.iloc[tokeep] 46 | data = data.iloc[tokeep] 47 | 48 | # read the feature file 49 | if (NumGenes > 0): 50 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 51 | 52 | # normalize data 53 | data = np.log1p(data) 54 | 55 | Classifier = LinearDiscriminantAnalysis() 56 | 57 | tr_time=[] 58 | ts_time=[] 59 | truelab = [] 60 | pred = [] 61 | 62 | for i in range(np.squeeze(nfolds)): 63 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 64 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 65 | 66 | train=data.iloc[train_ind_i] 67 | test=data.iloc[test_ind_i] 68 | y_train=labels.iloc[train_ind_i] 69 | y_test=labels.iloc[test_ind_i] 70 | 71 | if (NumGenes > 0): 72 | feat_to_use = features.iloc[0:NumGenes,i] 73 | train = train.iloc[:,feat_to_use] 74 | test = test.iloc[:,feat_to_use] 75 | 76 | start=tm.time() 77 | Classifier.fit(train, y_train) 78 | tr_time.append(tm.time()-start) 79 | 80 | start=tm.time() 81 | predicted = Classifier.predict(test) 82 | prob = np.max(Classifier.predict_proba(test), axis = 1) 83 | unlabeled = np.where(prob < Threshold) 84 | predicted[unlabeled] = 'Unknown' 85 | ts_time.append(tm.time()-start) 86 | 87 | truelab.extend(y_test.values) 88 | pred.extend(predicted) 89 | 90 | truelab 
= pd.DataFrame(truelab) 91 | pred = pd.DataFrame(pred) 92 | 93 | tr_time = pd.DataFrame(tr_time) 94 | ts_time = pd.DataFrame(ts_time) 95 | 96 | OutputDir = Path(OutputDir) 97 | truelab.to_csv(str(OutputDir / Path("LDA_rejection_true.csv")), 98 | index = False) 99 | pred.to_csv(str(OutputDir / Path("LDA_rejection_pred.csv")), 100 | 101 | index = False) 102 | 103 | tr_time.to_csv(str(OutputDir / Path("LDA_rejection_training_time.csv")), 104 | index = False) 105 | ts_time.to_csv(str(OutputDir / Path("LDA_rejection_test_time.csv")), 106 | index = False) 107 | 108 | run_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 109 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_NMC.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.neighbors import NearestCentroid 9 | import rpy2.robjects as robjects 10 | 11 | 12 | def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 13 | ''' 14 | run baseline classifier: NMC 15 | Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation, 16 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 17 | 18 | Parameters 19 | ---------- 20 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 21 | as row names and gene names as column names. 22 | LabelsPath : Cell population annotations file path (.csv). 23 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 24 | OutputDir : Output directory defining the path of the exported file. 25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 
27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 28 | ''' 29 | 30 | # read the Rdata file 31 | robjects.r['load'](CV_RDataPath) 32 | 33 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 34 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 35 | col = np.array(robjects.r['col_Index'], dtype = 'int') 36 | col = col - 1 37 | test_ind = np.array(robjects.r['Test_Idx']) 38 | train_ind = np.array(robjects.r['Train_Idx']) 39 | 40 | # read the data 41 | data = pd.read_csv(DataPath,index_col=0,sep=',') 42 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 43 | 44 | labels = labels.iloc[tokeep] 45 | data = data.iloc[tokeep] 46 | 47 | # read the feature file 48 | if (NumGenes > 0): 49 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = NearestCentroid() 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = pd.DataFrame(pred) 88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | OutputDir = Path(OutputDir) 93 | truelab.to_csv(str(OutputDir / 
Path("NMC_true.csv")), 94 | index = False) 95 | pred.to_csv(str(OutputDir / Path("NMC_pred.csv")), 96 | index = False) 97 | tr_time.to_csv(str(OutputDir / Path("NMC_training_time.csv")), 98 | index = False) 99 | ts_time.to_csv(str(OutputDir / Path("NMC_test_time.csv")), 100 | index = False) 101 | 102 | run_NMC(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 103 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_RF.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.ensemble import RandomForestClassifier 9 | import rpy2.robjects as robjects 10 | 11 | 12 | def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 13 | ''' 14 | run baseline classifier: RF 15 | Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation, 16 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 17 | 18 | Parameters 19 | ---------- 20 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 21 | as row names and gene names as column names. 22 | LabelsPath : Cell population annotations file path (.csv). 23 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 24 | OutputDir : Output directory defining the path of the exported file. 25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 
28 | ''' 29 | 30 | # read the Rdata file 31 | robjects.r['load'](CV_RDataPath) 32 | 33 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 34 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 35 | col = np.array(robjects.r['col_Index'], dtype = 'int') 36 | col = col - 1 37 | test_ind = np.array(robjects.r['Test_Idx']) 38 | train_ind = np.array(robjects.r['Train_Idx']) 39 | 40 | # read the data 41 | data = pd.read_csv(DataPath,index_col=0,sep=',') 42 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 43 | 44 | labels = labels.iloc[tokeep] 45 | data = data.iloc[tokeep] 46 | 47 | # read the feature file 48 | if (NumGenes > 0): 49 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = RandomForestClassifier(n_estimators = 50) 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = pd.DataFrame(pred) 88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | OutputDir = Path(OutputDir) 93 | truelab.to_csv(str(OutputDir / Path("RF_true.csv")), 94 | index = False) 95 | pred.to_csv(str(OutputDir / 
Path("RF_pred.csv")), 96 | index = False) 97 | tr_time.to_csv(str(OutputDir / Path("RF_training_time.csv")), 98 | index = False) 99 | ts_time.to_csv(str(OutputDir / Path("RF_test_time.csv")), 100 | index = False) 101 | 102 | run_RF(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 103 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_SCINA.R: -------------------------------------------------------------------------------- 1 | run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){ 2 | " 3 | run SCINA 4 | Wrapper script to run SCINA on a benchmark dataset, 5 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 6 | 7 | Parameters 8 | ---------- 9 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 10 | as row names and gene names as column names. 11 | LabelsPath : Cell population annotations file path (.csv). 12 | GeneSigPath : Cell type marker genes file path (.csv) 13 | OutputDir : Output directory defining the path of the exported file. 
14 | " 15 | 16 | Data <- read.csv(DataPath,row.names = 1) 17 | Labels <- as.vector(as.matrix(read.csv(LabelsPath))) 18 | Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),] 19 | Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))] 20 | Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte' 21 | Labels[Labels == 'CD19+ B'] <- 'CD19_B' 22 | Labels[Labels == 'CD56+ NK'] <- 'CD56_NK' 23 | 24 | 25 | ############################################################################# 26 | # SCINA # 27 | ############################################################################# 28 | library(SCINA) 29 | Signature_Genes <- preprocess.signatures(GeneSigPath) 30 | True_Labels_SCINA <- list() 31 | Pred_Labels_SCINA <- list() 32 | Total_Time_SCINA <- list() 33 | 34 | library(preprocessCore) 35 | Data = t(as.matrix(Data)) 36 | Data=log(Data+1) 37 | Data[]=normalize.quantiles(Data) 38 | 39 | start_time <- Sys.time() 40 | results = SCINA(Data, Signature_Genes) 41 | end_time <- Sys.time() 42 | 43 | True_Labels_SCINA <- Labels 44 | Pred_Labels_SCINA <- results$cell_labels 45 | Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs')) 46 | 47 | setwd(OutputDir) 48 | 49 | write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = FALSE) 50 | write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE) 51 | write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE) 52 | } 53 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_SVM.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.svm import LinearSVC 9 | import rpy2.robjects as robjects 10 | 11 | 12 | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 13 | ''' 14 | 
run baseline classifier: SVM 15 | Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, 16 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 17 | 18 | Parameters 19 | ---------- 20 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 21 | as row names and gene names as column names. 22 | LabelsPath : Cell population annotations file path (.csv). 23 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 24 | OutputDir : Output directory defining the path of the exported file. 25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 28 | ''' 29 | 30 | # read the Rdata file 31 | robjects.r['load'](CV_RDataPath) 32 | 33 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 34 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 35 | col = np.array(robjects.r['col_Index'], dtype = 'int') 36 | col = col - 1 37 | test_ind = np.array(robjects.r['Test_Idx']) 38 | train_ind = np.array(robjects.r['Train_Idx']) 39 | 40 | # read the data 41 | data = pd.read_csv(DataPath,index_col=0,sep=',') 42 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 43 | 44 | labels = labels.iloc[tokeep] 45 | data = data.iloc[tokeep] 46 | 47 | # read the feature file 48 | if (NumGenes > 0): 49 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = LinearSVC() 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 
64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = pd.DataFrame(pred) 88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | OutputDir = Path(OutputDir) 93 | truelab.to_csv(str(OutputDir / Path("SVM_true.csv")), 94 | index = False) 95 | pred.to_csv(str(OutputDir / Path("SVM_pred.csv")), 96 | index = False) 97 | tr_time.to_csv(str(OutputDir / Path("SVM_training_time.csv")), 98 | index = False) 99 | ts_time.to_csv(str(OutputDir / Path("SVM_test_time.csv")), 100 | index = False) 101 | 102 | run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 103 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_SVM_rejection.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.svm import LinearSVC 9 | import rpy2.robjects as robjects 10 | from sklearn.calibration import CalibratedClassifierCV 11 | 12 | 13 | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): 14 | ''' 15 | run baseline classifier: SVM 16 | Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, 17 | outputs lists of true and predicted cell labels as csv 
files, as well as computation time. 18 | 19 | Parameters 20 | ---------- 21 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 22 | as row names and gene names as column names. 23 | LabelsPath : Cell population annotations file path (.csv). 24 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 25 | OutputDir : Output directory defining the path of the exported file. 26 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 27 | defining the genes order for each cross validation fold, default is NULL. 28 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 29 | Threshold : Threshold used when rejecting the cells, default is 0.7. 30 | 31 | ''' 32 | 33 | # read the Rdata file 34 | robjects.r['load'](CV_RDataPath) 35 | 36 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 37 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 38 | col = np.array(robjects.r['col_Index'], dtype = 'int') 39 | col = col - 1 40 | test_ind = np.array(robjects.r['Test_Idx']) 41 | train_ind = np.array(robjects.r['Train_Idx']) 42 | 43 | # read the data 44 | data = pd.read_csv(DataPath,index_col=0,sep=',') 45 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 46 | 47 | labels = labels.iloc[tokeep] 48 | data = data.iloc[tokeep] 49 | 50 | # read the feature file 51 | if (NumGenes > 0): 52 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 53 | 54 | # normalize data 55 | data = np.log1p(data) 56 | 57 | Classifier = LinearSVC() 58 | clf = CalibratedClassifierCV(Classifier) 59 | 60 | tr_time=[] 61 | ts_time=[] 62 | truelab = [] 63 | pred = [] 64 | 65 | for i in range(np.squeeze(nfolds)): 66 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 67 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 68 | 69 | train=data.iloc[train_ind_i] 70 | test=data.iloc[test_ind_i] 71 | 
y_train=labels.iloc[train_ind_i] 72 | y_test=labels.iloc[test_ind_i] 73 | 74 | if (NumGenes > 0): 75 | feat_to_use = features.iloc[0:NumGenes,i] 76 | train = train.iloc[:,feat_to_use] 77 | test = test.iloc[:,feat_to_use] 78 | 79 | start=tm.time() 80 | clf.fit(train, y_train) 81 | tr_time.append(tm.time()-start) 82 | 83 | start=tm.time() 84 | predicted = clf.predict(test) 85 | prob = np.max(clf.predict_proba(test), axis = 1) 86 | unlabeled = np.where(prob < Threshold) 87 | predicted[unlabeled] = 'Unknown' 88 | ts_time.append(tm.time()-start) 89 | 90 | truelab.extend(y_test.values) 91 | pred.extend(predicted) 92 | 93 | truelab = pd.DataFrame(truelab) 94 | pred = pd.DataFrame(pred) 95 | 96 | tr_time = pd.DataFrame(tr_time) 97 | ts_time = pd.DataFrame(ts_time) 98 | 99 | OutputDir = Path(OutputDir) 100 | truelab.to_csv(str(OutputDir / Path("SVM_rejection_true.csv")), 101 | index = False) 102 | pred.to_csv(str(OutputDir / Path("SVM_rejection_pred.csv")), 103 | index = False) 104 | tr_time.to_csv(str(OutputDir / Path("SVM_rejection_training_time.csv")), 105 | 106 | index = False) 107 | ts_time.to_csv(str(OutputDir / Path("SVM_rejection_test_time.csv")), 108 | index = False) 109 | 110 | run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 111 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_SingleR.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | 3 | run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 4 | " 5 | run SingleR 6 | Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation, 7 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 8 | 9 | Parameters 10 | ---------- 11 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 12 | as row names and gene names as column names. 
13 | LabelsPath : Cell population annotations file path (.csv). 14 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 15 | OutputDir : Output directory defining the path of the exported file. 16 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 17 | defining the genes order for each cross validation fold, default is NULL. 18 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 19 | " 20 | 21 | Data <- read.csv(DataPath,row.names = 1) 22 | Labels <- as.matrix(read.csv(LabelsPath)) 23 | load(CV_RDataPath) 24 | Labels <- as.vector(Labels[,col_Index]) 25 | Data <- Data[Cells_to_Keep,] 26 | Labels <- Labels[Cells_to_Keep] 27 | if(!is.null(GeneOrderPath) && !is.null (NumGenes)){ 28 | GenesOrder = read.csv(GeneOrderPath) 29 | } 30 | 31 | ############################################################################# 32 | # SingleR # 33 | ############################################################################# 34 | library(SingleR) 35 | library(Seurat) 36 | True_Labels_SingleR <- list() 37 | Pred_Labels_SingleR <- list() 38 | Total_Time_SingleR <- list() 39 | Data = t(as.matrix(Data)) 40 | 41 | for (i in seq_len(n_folds)){ 42 | if(!is.null(GeneOrderPath) && !is.null (NumGenes)){ 43 | start_time <- Sys.time() 44 | singler = SingleR(method = "single", Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], 45 | Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], 46 | Labels[Train_Idx[[i]]], numCores = 1) 47 | end_time <- Sys.time() 48 | } 49 | else{ 50 | start_time <- Sys.time() 51 | singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1) 52 | end_time <- Sys.time() 53 | } 54 | Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 55 | 56 | True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]]) 57 | Pred_Labels_SingleR[i] <- 
list(as.vector(singler$labels)) 58 | } 59 | True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR)) 60 | Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR)) 61 | Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR)) 62 | 63 | write.csv(True_Labels_SingleR,paste0(OutputDir,'/SingleR_true.csv'),row.names = FALSE) 64 | write.csv(Pred_Labels_SingleR,paste0(OutputDir,'/SingleR_pred.csv'),row.names = FALSE) 65 | write.csv(Total_Time_SingleR,paste0(OutputDir,'/SingleR_total_time.csv'),row.names = FALSE) 66 | } 67 | 68 | if (args[6] == "0") { 69 | run_SingleR(args[1], args[2], args[3], args[4]) 70 | } else { 71 | run_SingleR(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) 72 | } 73 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_kNN50.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.neighbors import KNeighborsClassifier 9 | import rpy2.robjects as robjects 10 | 11 | 12 | def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 13 | ''' 14 | run baseline classifiers: kNN 15 | Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation, 16 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 17 | 18 | Parameters 19 | ---------- 20 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 21 | as row names and gene names as column names. 22 | LabelsPath : Cell population annotations file path (.csv). 23 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 24 | OutputDir : Output directory defining the path of the exported file. 
25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 28 | ''' 29 | 30 | # read the Rdata file 31 | robjects.r['load'](CV_RDataPath) 32 | 33 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 34 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 35 | col = np.array(robjects.r['col_Index'], dtype = 'int') 36 | col = col - 1 37 | test_ind = np.array(robjects.r['Test_Idx']) 38 | train_ind = np.array(robjects.r['Train_Idx']) 39 | 40 | # read the data 41 | data = pd.read_csv(DataPath,index_col=0,sep=',') 42 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 43 | 44 | labels = labels.iloc[tokeep] 45 | data = data.iloc[tokeep] 46 | 47 | # read the feature file 48 | if (NumGenes > 0): 49 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = KNeighborsClassifier(n_neighbors=50) 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = 
pd.DataFrame(pred) 88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | OutputDir = Path(OutputDir) 93 | truelab.to_csv(str(OutputDir / Path("kNN50_true.csv")), 94 | index = False) 95 | pred.to_csv(str(OutputDir / Path("kNN50_pred.csv")), 96 | index = False) 97 | tr_time.to_csv(str(OutputDir / Path("kNN50_training_time.csv")), 98 | index = False) 99 | ts_time.to_csv(str(OutputDir / Path("kNN50_test_time.csv")), 100 | index = False) 101 | 102 | run_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 103 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_kNN9.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sys import argv 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time as tm 8 | from sklearn.neighbors import KNeighborsClassifier 9 | import rpy2.robjects as robjects 10 | 11 | 12 | def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 13 | ''' 14 | run baseline classifiers: kNN 15 | Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation, 16 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 17 | 18 | Parameters 19 | ---------- 20 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 21 | as row names and gene names as column names. 22 | LabelsPath : Cell population annotations file path (.csv). 23 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 24 | OutputDir : Output directory defining the path of the exported file. 25 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 26 | defining the genes order for each cross validation fold, default is NULL. 
27 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 28 | ''' 29 | 30 | # read the Rdata file 31 | robjects.r['load'](CV_RDataPath) 32 | 33 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 34 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 35 | col = np.array(robjects.r['col_Index'], dtype = 'int') 36 | col = col - 1 37 | test_ind = np.array(robjects.r['Test_Idx']) 38 | train_ind = np.array(robjects.r['Train_Idx']) 39 | 40 | # read the data 41 | data = pd.read_csv(DataPath,index_col=0,sep=',') 42 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 43 | 44 | labels = labels.iloc[tokeep] 45 | data = data.iloc[tokeep] 46 | 47 | # read the feature file 48 | if (NumGenes > 0): 49 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 50 | 51 | # normalize data 52 | data = np.log1p(data) 53 | 54 | Classifier = KNeighborsClassifier(n_neighbors=9) 55 | 56 | tr_time=[] 57 | ts_time=[] 58 | truelab = [] 59 | pred = [] 60 | 61 | for i in range(np.squeeze(nfolds)): 62 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 63 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 64 | 65 | train=data.iloc[train_ind_i] 66 | test=data.iloc[test_ind_i] 67 | y_train=labels.iloc[train_ind_i] 68 | y_test=labels.iloc[test_ind_i] 69 | 70 | if (NumGenes > 0): 71 | feat_to_use = features.iloc[0:NumGenes,i] 72 | train = train.iloc[:,feat_to_use] 73 | test = test.iloc[:,feat_to_use] 74 | 75 | start=tm.time() 76 | Classifier.fit(train, y_train) 77 | tr_time.append(tm.time()-start) 78 | 79 | start=tm.time() 80 | predicted = Classifier.predict(test) 81 | ts_time.append(tm.time()-start) 82 | 83 | truelab.extend(y_test.values) 84 | pred.extend(predicted) 85 | 86 | truelab = pd.DataFrame(truelab) 87 | pred = pd.DataFrame(pred) 88 | 89 | tr_time = pd.DataFrame(tr_time) 90 | ts_time = pd.DataFrame(ts_time) 91 | 92 | OutputDir = Path(OutputDir) 93 | truelab.to_csv(str(OutputDir / 
Path("kNN9_true.csv")), 94 | index = False) 95 | pred.to_csv(str(OutputDir / Path("kNN9_pred.csv")), 96 | index = False) 97 | tr_time.to_csv(str(OutputDir / Path("kNN9_training_time.csv")), 98 | index = False) 99 | ts_time.to_csv(str(OutputDir / Path("kNN9_test_time.csv")), 100 | index = False) 101 | 102 | run_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 103 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_moana.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from moana.core import ExpMatrix 5 | from moana.classify import CellTypeClassifier 6 | import time as tm 7 | import rpy2.robjects as robjects 8 | 9 | def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 10 | ''' 11 | run moana 12 | Wrapper script to run moana on a benchmark dataset with a pretrained classifier, 13 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 14 | 15 | Parameters 16 | ---------- 17 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 18 | as row names and gene names as column names. 19 | LabelsPath : Cell population annotations file path (.csv). 20 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 21 | ClassifierPath : Data file path to the pretrained classifier. 22 | OutputDir : Output directory defining the path of the exported file. 23 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 24 | defining the genes order for each cross validation fold, default is NULL. 25 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 
26 | ''' 27 | 28 | # # read the Rdata file 29 | # robjects.r['load'](CV_RDataPath) 30 | # 31 | # tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 32 | # col = np.array(robjects.r['col_Index'], dtype = 'int') 33 | # col = col - 1 34 | 35 | matrix = ExpMatrix.read_tsv(DataPath, sep = ',') 36 | # matrix = matrix.iloc[tokeep] 37 | 38 | truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',') 39 | # truelab = truelab.iloc[tokeep] 40 | 41 | ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK'] 42 | ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells'] 43 | 44 | tokeep2 = np.isin(truelab,ct_old) 45 | truelab = truelab[tokeep2] 46 | print(len(truelab)) 47 | matrix = matrix.iloc[np.squeeze(tokeep2)] 48 | 49 | for i in range(len(ct_old)): 50 | truelab.iloc[truelab == ct_old[i]] = ct_new[i] 51 | 52 | # read the feature file 53 | if (NumGenes > 0): 54 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 55 | feat_to_use = features.iloc[0:NumGenes,0] 56 | matrix = matrix.iloc[:,feat_to_use] 57 | 58 | data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes) 59 | data.genes.name = 'Genes' 60 | data.cells.name = 'Cells' 61 | data.index.name = 'Genes' 62 | data.columns.name = 'Cells' 63 | 64 | clf = CellTypeClassifier.read_pickle(ClassifierPath) 65 | 66 | start = tm.time() 67 | predictions = clf.predict(data) 68 | runtime = tm.time() - start 69 | 70 | np.asarray(predictions) 71 | 72 | pred = pd.DataFrame(predictions) 73 | 74 | os.chdir(OutputDir) 75 | 76 | if (NumGenes == 0): 77 | truelab.to_csv("moana_True_Labels.csv", index = False) 78 | pred.to_csv("moana_Pred_Labels.csv", index = False) 79 | with open("moana_Total_Time.csv", 'w') as f: 80 | f.write("%f\n" % runtime) 81 | else: 82 | truelab.to_csv("moana_" + str(NumGenes) + 
"_True_Labels.csv", index = False) 83 | pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False) 84 | with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f: 85 | f.write("%f\n" % runtime) 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_scID.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | 3 | run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 4 | " 5 | run scID 6 | Wrapper script to run scID on a benchmark dataset with 5-fold cross validation, 7 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 8 | 9 | Parameters 10 | ---------- 11 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 12 | as row names and gene names as column names. 13 | LabelsPath : Cell population annotations file path (.csv). 14 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 15 | OutputDir : Output directory defining the path of the exported file. 16 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 17 | defining the genes order for each cross validation fold, default is NULL. 18 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 
19 | " 20 | 21 | Data <- read.csv(DataPath,row.names = 1) 22 | Labels <- as.matrix(read.csv(LabelsPath)) 23 | load(CV_RDataPath) 24 | Labels <- as.vector(Labels[,col_Index]) 25 | Data <- Data[Cells_to_Keep,] 26 | Labels <- Labels[Cells_to_Keep] 27 | if(!is.null(GeneOrderPath) && !is.null (NumGenes)){ 28 | GenesOrder <- read.csv(GeneOrderPath) 29 | } 30 | 31 | ############################################################################# 32 | # scID # 33 | ############################################################################# 34 | library(scID) 35 | library(Seurat) 36 | True_Labels_scID <- list() 37 | Pred_Labels_scID <- list() 38 | Total_Time_scID <- list() 39 | Data <- t(as.matrix(Data)) 40 | 41 | for (i in seq_len(n_folds)){ 42 | if(!is.null(GeneOrderPath) && !is.null (NumGenes)){ 43 | Train_Labels <- list(Labels[Train_Idx[[i]]]) 44 | names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[seq_len(NumGenes),i])+1,Train_Idx[[i]]]) 45 | start_time <- Sys.time() 46 | scID_output <- scid_multiclass(Data[as.vector(GenesOrder[seq_len(NumGenes),i])+1,Test_Idx[[i]]], 47 | Data[as.vector(GenesOrder[seq_len(NumGenes),i])+1,Train_Idx[[i]]], 48 | Train_Labels[[1]]) 49 | end_time <- Sys.time() 50 | } 51 | else{ 52 | Train_Labels <- list(Labels[Train_Idx[[i]]]) 53 | names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]]) 54 | start_time <- Sys.time() 55 | scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]]) 56 | end_time <- Sys.time() 57 | } 58 | Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 59 | 60 | True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]]) 61 | Pred_Labels_scID[i] <- list(as.vector(scID_output$labels)) 62 | } 63 | True_Labels_scID <- as.vector(unlist(True_Labels_scID)) 64 | Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID)) 65 | Total_Time_scID <- as.vector(unlist(Total_Time_scID)) 66 | 67 | write.csv(Pred_Labels_scID, paste0(OutputDir,'/scID_pred.csv'),row.names = FALSE) 68 | 
write.csv(True_Labels_scID, paste0(OutputDir,'/scID_true.csv'),row.names = FALSE) 69 | write.csv(Total_Time_scID,paste0(OutputDir,'/scID_total_time.csv'),row.names = FALSE) 70 | 71 | } 72 | 73 | if (args[6] == "0") { 74 | run_scID(args[1], args[2], args[3], args[4]) 75 | } else { 76 | run_scID(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) 77 | } 78 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_scPred.R: -------------------------------------------------------------------------------- 1 | run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 2 | " 3 | run scPred 4 | Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation, 5 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 6 | 7 | Parameters 8 | ---------- 9 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 10 | as row names and gene names as column names. 11 | LabelsPath : Cell population annotations file path (.csv). 12 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 13 | OutputDir : Output directory defining the path of the exported file. 14 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 15 | defining the genes order for each cross validation fold, default is NULL. 16 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 
17 | " 18 | 19 | Data <- read.csv(DataPath,row.names = 1) 20 | Labels <- as.matrix(read.csv(LabelsPath)) 21 | load(CV_RDataPath) 22 | Labels <- as.vector(Labels[,col_Index]) 23 | Data <- Data[Cells_to_Keep,] 24 | Labels <- Labels[Cells_to_Keep] 25 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 26 | GenesOrder = read.csv(GeneOrderPath) 27 | } 28 | 29 | ############################################################################# 30 | # scPred # 31 | ############################################################################# 32 | library(scPred) 33 | library(tidyverse) 34 | library(SingleCellExperiment) 35 | True_Labels_scPred <- list() 36 | Pred_Labels_scPred <- list() 37 | Training_Time_scPred <- list() 38 | Testing_Time_scPred <- list() 39 | Data = t(as.matrix(Data)) 40 | 41 | for (i in c(1:n_folds)){ 42 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 43 | sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 44 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 45 | sce_counts <- normcounts(sce) 46 | sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) 47 | sce_metadata <- as.data.frame(colData(sce)) 48 | 49 | sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 50 | colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 51 | sce_counts_test <- normcounts(sce_test) 52 | sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) 53 | sce_metadata_test <- as.data.frame(colData(sce_test)) 54 | } 55 | else{ 56 | sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 57 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 58 | sce_counts <- normcounts(sce) 59 | sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) 60 | sce_metadata <- as.data.frame(colData(sce)) 61 | 62 | sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 63 | colData = 
data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 64 | sce_counts_test <- normcounts(sce_test) 65 | sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) 66 | sce_metadata_test <- as.data.frame(colData(sce_test)) 67 | } 68 | 69 | 70 | # scPred Training 71 | start_time <- Sys.time() 72 | set.seed(1234) 73 | scp <- eigenDecompose(sce_cpm) 74 | scPred::metadata(scp) <- sce_metadata 75 | scp <- getFeatureSpace(scp, pVar = 'cell_type1') 76 | # plotEigen(scp, group = 'cell_type1') 77 | scp <- trainModel(scp) 78 | # plotTrainProbs(scp) 79 | end_time <- Sys.time() 80 | Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 81 | 82 | # scPred Prediction 83 | start_time <- Sys.time() 84 | scp <- scPredict(scp,newData = sce_cpm_test) 85 | end_time <- Sys.time() 86 | Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 87 | 88 | True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]]) 89 | Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass) 90 | } 91 | True_Labels_scPred <- as.vector(unlist(True_Labels_scPred)) 92 | Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred)) 93 | Training_Time_scPred <- as.vector(unlist(Training_Time_scPred)) 94 | Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred)) 95 | 96 | setwd(OutputDir) 97 | 98 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 99 | write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) 100 | write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) 101 | write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) 102 | write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) 103 | } 104 | else{ 105 | write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE) 106 | write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = 
FALSE) 107 | write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE) 108 | write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_scVI.py: -------------------------------------------------------------------------------- 1 | from scvi.dataset import CsvDataset 2 | import os 3 | from sys import argv 4 | from pathlib import Path 5 | from scvi.dataset import CsvDataset 6 | import numpy as np 7 | import pandas as pd 8 | from scvi.models import SCANVI 9 | from scvi.inference import SemiSupervisedTrainer 10 | import time as tm 11 | import rpy2.robjects as robjects 12 | 13 | def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): 14 | ''' 15 | run scVI 16 | Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation, 17 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 18 | 19 | Parameters 20 | ---------- 21 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 22 | as row names and gene names as column names. 23 | LabelsPath : Cell population annotations file path (.csv). 24 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 25 | OutputDir : Output directory defining the path of the exported file. 26 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 27 | defining the genes order for each cross validation fold, default is NULL. 28 | NumGenes : Number of genes used in case of feature selection (integer), default is 0. 
29 | ''' 30 | 31 | # read the Rdata file 32 | robjects.r['load'](CV_RDataPath) 33 | 34 | nfolds = np.array(robjects.r['n_folds'], dtype = 'int') 35 | tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') 36 | col = np.array(robjects.r['col_Index'], dtype = 'int') 37 | col = col - 1 38 | test_ind = np.array(robjects.r['Test_Idx']) 39 | train_ind = np.array(robjects.r['Train_Idx']) 40 | 41 | # read the data 42 | data = pd.read_csv(DataPath,index_col=0,sep=',') 43 | labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) 44 | 45 | labels = labels.iloc[tokeep] 46 | data = data.iloc[tokeep] 47 | 48 | # read the feature file 49 | if (NumGenes > 0): 50 | features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') 51 | 52 | if (NumGenes == 0): 53 | #save labels as csv file with header and index column 54 | labels.to_csv('Labels_scvi.csv') 55 | data.to_csv('Data_scvi.csv') 56 | 57 | train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False) 58 | 59 | ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing 60 | scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) 61 | trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) 62 | 63 | n_epochs = 200 64 | 65 | truelab = [] 66 | pred = [] 67 | tr_time = [] 68 | ts_time = [] 69 | 70 | for i in range(np.squeeze(nfolds)): 71 | test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 72 | train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 73 | 74 | if (NumGenes > 0): 75 | feat_to_use = features.iloc[0:NumGenes,i] 76 | data2 = data.iloc[:,feat_to_use] 77 | 78 | labels.to_csv(OutputDir +'/Labels_scvi.csv') 79 | data2.to_csv(OutputDir +'/Data_scvi.csv') 80 | 81 | train = CsvDataset(OutputDir +'/Data_scvi.csv', save_path = "", sep = ",", labels_file = OutputDir +"/Labels_scvi.csv", gene_by_cell = False, new_n_genes = False) 82 | 83 | ## this 
semisupervised trainer automatically uses a part of the input data for training and a part for testing 84 | scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) 85 | trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) 86 | 87 | trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False) 88 | trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy'] 89 | trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False) 90 | trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy'] 91 | 92 | start = tm.time() 93 | trainer_scanvi.train(n_epochs) 94 | tr_time.append(tm.time()-start) 95 | 96 | ## labels of test set are in y_pred 97 | ## labels are returned in numbers, should be mapped back to the real labels 98 | ## indices are permutated 99 | start = tm.time() 100 | y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions() 101 | ts_time.append(tm.time()-start) 102 | 103 | truelab.extend(y_true) 104 | pred.extend(y_pred) 105 | 106 | #write results 107 | 108 | truelab = pd.DataFrame(truelab) 109 | pred = pd.DataFrame(pred) 110 | 111 | tr_time = pd.DataFrame(tr_time) 112 | ts_time = pd.DataFrame(ts_time) 113 | 114 | truelab.to_csv(str(Path(OutputDir + "/scVI_true.csv")), index=False) 115 | pred.to_csv(str(Path(OutputDir + "/scVI_pred.csv")), index=False) 116 | tr_time.to_csv(str(Path(OutputDir + "/scVI_training_time.csv")), index=False) 117 | ts_time.to_csv(str(Path(OutputDir + "/scVI_test_time.csv")), index=False) 118 | 119 | run_scVI(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) 120 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_scmapcell.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | 3 | run_scmapcell <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 
4 | " 5 | run scmapcell 6 | Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, 7 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 8 | 9 | Parameters 10 | ---------- 11 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 12 | as row names and gene names as column names. 13 | LabelsPath : Cell population annotations file path (.csv). 14 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 15 | OutputDir : Output directory defining the path of the exported file. 16 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 17 | defining the genes order for each cross validation fold, default is NULL. 18 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 19 | " 20 | 21 | Data <- read.csv(DataPath,row.names = 1) 22 | Labels <- as.matrix(read.csv(LabelsPath)) 23 | load(CV_RDataPath) 24 | Labels <- as.vector(Labels[,col_Index]) 25 | Data <- Data[Cells_to_Keep,] 26 | Labels <- Labels[Cells_to_Keep] 27 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 28 | GenesOrder = read.csv(GeneOrderPath) 29 | } 30 | 31 | ############################################################################# 32 | # scmap # 33 | ############################################################################# 34 | library(scmap) 35 | library(SingleCellExperiment) 36 | True_Labels_scmapcell <- list() 37 | Pred_Labels_scmapcell <- list() 38 | Training_Time_scmapcell <- list() 39 | Testing_Time_scmapcell <- list() 40 | Data = t(as.matrix(Data)) 41 | 42 | for (i in c(1:n_folds)){ 43 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 44 | sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 45 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 46 | logcounts(sce) <- log2(normcounts(sce) + 1) 47 | # use gene names as feature 
symbols 48 | rowData(sce)$feature_symbol <- rownames(sce) 49 | sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) 50 | 51 | sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 52 | colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 53 | logcounts(sce_test) <- log2(normcounts(sce_test) + 1) 54 | rowData(sce_test)$feature_symbol <- rownames(sce_test) 55 | sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData 56 | } 57 | else{ 58 | sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), 59 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 60 | logcounts(sce) <- log2(normcounts(sce) + 1) 61 | # use gene names as feature symbols 62 | rowData(sce)$feature_symbol <- rownames(sce) 63 | sce <- selectFeatures(sce, suppress_plot = TRUE) 64 | 65 | sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 66 | colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 67 | logcounts(sce_test) <- log2(normcounts(sce_test) + 1) 68 | rowData(sce_test)$feature_symbol <- rownames(sce_test) 69 | sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData 70 | } 71 | 72 | # scmap-cell 73 | start_time <- Sys.time() 74 | set.seed(1) 75 | sce <- indexCell(sce) 76 | end_time <- Sys.time() 77 | Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 78 | 79 | start_time <- Sys.time() 80 | scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index)) 81 | scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1))) 82 | end_time <- Sys.time() 83 | Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 84 | 85 | True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]]) 86 | Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs) 87 | } 88 | 89 | True_Labels_scmapcell <- 
as.vector(unlist(True_Labels_scmapcell)) 90 | Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell)) 91 | Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell)) 92 | Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell)) 93 | 94 | write.csv(True_Labels_scmapcell,paste0(OutputDir,'/scmapcell_true.csv'),row.names = FALSE) 95 | write.csv(Pred_Labels_scmapcell,paste0(OutputDir,'/scmapcell_pred.csv'),row.names = FALSE) 96 | write.csv(Training_Time_scmapcell,paste0(OutputDir,'/scmapcell_training_time.csv'),row.names = FALSE) 97 | write.csv(Testing_Time_scmapcell,paste0(OutputDir,'/scmapcell_test_time.csv'),row.names = FALSE) 98 | } 99 | if (args[6] == "0") { 100 | run_scmapcell(args[1], args[2], args[3], args[4]) 101 | } else { 102 | run_scmapcell(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) 103 | } 104 | 105 | 106 | -------------------------------------------------------------------------------- /Snakemake/Scripts/run_scmapcluster.R: -------------------------------------------------------------------------------- 1 | args <- commandArgs(TRUE) 2 | 3 | run_scmapcluster <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ 4 | " 5 | run scmapcluster 6 | Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, 7 | outputs lists of true and predicted cell labels as csv files, as well as computation time. 8 | 9 | Parameters 10 | ---------- 11 | DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 12 | as row names and gene names as column names. 13 | LabelsPath : Cell population annotations file path (.csv). 14 | CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. 15 | OutputDir : Output directory defining the path of the exported file. 
16 | GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 17 | defining the genes order for each cross validation fold, default is NULL. 18 | NumGenes : Number of genes used in case of feature selection (integer), default is NULL. 19 | " 20 | 21 | Data <- read.csv(DataPath,row.names = 1) 22 | Labels <- as.matrix(read.csv(LabelsPath)) 23 | load(CV_RDataPath) 24 | Labels <- as.vector(Labels[,col_Index]) 25 | Data <- Data[Cells_to_Keep,] 26 | Labels <- Labels[Cells_to_Keep] 27 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 28 | GenesOrder = read.csv(GeneOrderPath) 29 | } 30 | 31 | ############################################################################# 32 | # scmap # 33 | ############################################################################# 34 | library(scmap) 35 | library(SingleCellExperiment) 36 | True_Labels_scmapcluster <- list() 37 | Pred_Labels_scmapcluster <- list() 38 | Training_Time_scmapcluster <- list() 39 | Testing_Time_scmapcluster <- list() 40 | Data = t(as.matrix(Data)) 41 | 42 | for (i in c(1:n_folds)){ 43 | if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ 44 | sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), 45 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 46 | logcounts(sce) <- log2(normcounts(sce) + 1) 47 | # use gene names as feature symbols 48 | rowData(sce)$feature_symbol <- rownames(sce) 49 | sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) 50 | 51 | sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), 52 | colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 53 | logcounts(sce_test) <- log2(normcounts(sce_test) + 1) 54 | rowData(sce_test)$feature_symbol <- rownames(sce_test) 55 | sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData 56 | } 57 | else{ 58 | sce <- SingleCellExperiment(list(normcounts = 
Data[,Train_Idx[[i]]]), 59 | colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) 60 | logcounts(sce) <- log2(normcounts(sce) + 1) 61 | # use gene names as feature symbols 62 | rowData(sce)$feature_symbol <- rownames(sce) 63 | sce <- selectFeatures(sce, suppress_plot = TRUE) 64 | 65 | sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), 66 | colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) 67 | logcounts(sce_test) <- log2(normcounts(sce_test) + 1) 68 | rowData(sce_test)$feature_symbol <- rownames(sce_test) 69 | sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData 70 | } 71 | 72 | # scmap-cluster 73 | start_time <- Sys.time() 74 | sce <- indexCluster(sce) 75 | end_time <- Sys.time() 76 | Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 77 | 78 | start_time <- Sys.time() 79 | scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index)) 80 | end_time <- Sys.time() 81 | Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) 82 | 83 | True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]]) 84 | Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs) 85 | 86 | } 87 | 88 | True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster)) 89 | Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster)) 90 | Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster)) 91 | Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster)) 92 | 93 | write.csv(True_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_true.csv'),row.names = FALSE) 94 | write.csv(Pred_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_pred.csv'),row.names = FALSE) 95 | write.csv(Training_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_training_time.csv'),row.names = FALSE) 96 | 
args <- commandArgs(TRUE)

#' Run singleCellNet on a benchmark dataset with cross validation.
#'
#' Trains and tests singleCellNet on every fold described by the cross
#' validation RData file, then writes the true labels, predicted labels,
#' training times and testing times (in seconds) as csv files to `OutputDir`.
#'
#' @param DataPath Data file path (.csv), cells-genes matrix with unique cell
#'   barcodes as row names and gene names as column names.
#' @param LabelsPath Cell population annotations file path (.csv).
#' @param CV_RDataPath Cross validation RData file path (.RData), obtained
#'   from the Cross_Validation.R function.
#' @param OutputDir Output directory in which the csv files are written.
#' @param GeneOrderPath Gene order file path (.csv) obtained from feature
#'   selection, one column of gene positions per fold. Default is NULL.
#' @param NumGenes Number of genes used in case of feature selection
#'   (integer). Default is NULL. Feature selection is only applied when both
#'   `GeneOrderPath` and `NumGenes` are given.
#' @return Called for its side effect of writing four csv files.
run_singleCellNet <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir,
                              GeneOrderPath = NULL, NumGenes = NULL) {
  Data <- read.csv(DataPath, row.names = 1)
  # Underscores in gene names are replaced by dots before training.
  colnames(Data) <- gsub("_", ".", colnames(Data), fixed = TRUE)
  Labels <- as.matrix(read.csv(LabelsPath))

  # The RData file is expected to define col_Index, Cells_to_Keep, n_folds,
  # Train_Idx and Test_Idx; all of them are used below.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar condition, so use the short-circuiting `&&` rather than `&`.
  use_feature_selection <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_feature_selection) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                              singleCellNet                                #
  #############################################################################
  library(singleCellNet)
  library(dplyr)

  True_Labels_singleCellNet <- vector("list", n_folds)
  Pred_Labels_singleCellNet <- vector("list", n_folds)
  Training_Time_singleCellNet <- vector("list", n_folds)
  Testing_Time_singleCellNet <- vector("list", n_folds)

  # From here on the matrix is genes x cells.
  Data <- t(as.matrix(Data))

  for (i in seq_len(n_folds)) {
    if (use_feature_selection) {
      # NOTE(review): the "+ 1" presumably converts zero-based gene positions
      # (as written by rank_gene_dropouts.py) to R's one-based indexing.
      gene_rows <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      DataTrain <- Data[gene_rows, Train_Idx[[i]]]
      DataTest <- Data[gene_rows, Test_Idx[[i]]]
    } else {
      DataTrain <- Data[, Train_Idx[[i]]]
      DataTest <- Data[, Test_Idx[[i]]]
    }

    # Training: gene selection, top-pair selection and classifier fitting are
    # all included in the training time.
    start_time <- Sys.time()
    cgenes2 <- findClassyGenes(
      DataTrain,
      data.frame(Annotation = Labels[Train_Idx[[i]]]),
      "Annotation"
    )
    cgenesA <- cgenes2[["cgenes"]]
    grps <- cgenes2[["grps"]]
    DataTrain <- as.matrix(DataTrain[cgenesA, ])
    xpairs <- ptGetTop(DataTrain, grps, ncores = 1)
    pdTrain <- query_transform(DataTrain[cgenesA, ], xpairs)
    rf <- sc_makeClassifier(pdTrain[xpairs, ], genes = xpairs, groups = grps)
    end_time <- Sys.time()
    Training_Time_singleCellNet[[i]] <-
      as.numeric(difftime(end_time, start_time, units = "secs"))

    # Testing: transform the test cells with the trained pairs and classify.
    start_time <- Sys.time()
    DataTest <- query_transform(DataTest[cgenesA, ], xpairs)
    classRes <- rf_classPredict(rf, DataTest)
    end_time <- Sys.time()
    Testing_Time_singleCellNet[[i]] <-
      as.numeric(difftime(end_time, start_time, units = "secs"))

    True_Labels_singleCellNet[[i]] <- Labels[Test_Idx[[i]]]
    # The predicted label of a cell is the row (class) with the highest score
    # in its column. Only the first length(Test_Idx[[i]]) columns are kept.
    # NOTE(review): presumably rf_classPredict appends extra columns beyond
    # the query cells -- confirm against the singleCellNet documentation.
    best_class <- rownames(classRes)[apply(classRes, 2, which.max)]
    Pred_Labels_singleCellNet[[i]] <- best_class[seq_along(Test_Idx[[i]])]
  }

  True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))
  Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))
  Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))
  Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))

  write.csv(True_Labels_singleCellNet,
            file.path(OutputDir, "singleCellNet_true.csv"),
            row.names = FALSE)
  write.csv(Pred_Labels_singleCellNet,
            file.path(OutputDir, "singleCellNet_pred.csv"),
            row.names = FALSE)
  write.csv(Training_Time_singleCellNet,
            file.path(OutputDir, "singleCellNet_training_time.csv"),
            row.names = FALSE)
  write.csv(Testing_Time_singleCellNet,
            file.path(OutputDir, "singleCellNet_test_time.csv"),
            row.names = FALSE)
}

# Command-line entry point. The fifth and sixth arguments (gene order file and
# number of genes) are optional; a sixth argument of "0" also disables feature
# selection. Checking length(args) first avoids `if (NA)` when they are absent.
if (length(args) < 4) {
  stop("Usage: Rscript run_singleCellNet.R DataPath LabelsPath CV_RDataPath ",
       "OutputDir [GeneOrderPath NumGenes]", call. = FALSE)
}
if (length(args) < 6 || args[6] == "0") {
  run_singleCellNet(args[1], args[2], args[3], args[4])
} else {
  run_singleCellNet(args[1], args[2], args[3], args[4], args[5],
                    as.numeric(args[6]))
}
args <- commandArgs(TRUE)

TrueLabelsPath <- args[1]
PredLabelsPath <- args[2]
OutputDir <- args[3]
ToolName <- args[4]

#' Evaluate the performance of a classifier.
#'
#' Returns multiple evaluation measures: the confusion matrix, median
#' F1-score, F1-score for each class, accuracy, percentage of unlabeled cells
#' and population size.
#'
#' The percentage of unlabeled cells is found by checking for cells that are
#' labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand'
#' or 'ambiguous'. These predictions are left out of the F1-score and accuracy
#' computations.
#'
#' @param TrueLabelsPath csv file with the true labels (one column, no index).
#' @param PredLabelsPath csv file with the predicted labels (one column, no
#'   index).
#' @param Indices Optional positions of the rows to evaluate (e.g. if more
#'   datasets are tested at the same time). The vector is used directly as a
#'   subscript, so pass every position to keep (e.g. `begin:end`), not only
#'   the two end points.
#' @return A list with elements `Conf` (confusion matrix), `MedF1` (median
#'   F1-score), `F1` (F1-score per class), `Acc` (accuracy, unnamed scalar),
#'   `PercUnl` (fraction of unlabeled cells) and `PopSize` (number of cells
#'   per cell type).
evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL) {
  # Predicted labels that mean "no prediction was made".
  unlabeled <- c("unassigned", "Unassigned", "Unknown", "rand", "Node",
                 "ambiguous", "unknown")

  true_lab <- unlist(read.csv(TrueLabelsPath))
  pred_lab <- unlist(read.csv(PredLabelsPath))

  if (!is.null(Indices)) {
    true_lab <- true_lab[Indices]
    pred_lab <- pred_lab[Indices]
  }

  # Full confusion matrix (rejection labels included) and class sizes.
  conf <- table(true_lab, pred_lab)
  pop_size <- rowSums(conf)
  pop_size <- pop_size[pop_size > 0]

  # Collapse 'Node' followed by two characters into the single label 'Node'.
  pred_lab <- gsub("Node..", "Node", pred_lab)

  # Confusion matrix without the rejection labels, used for F1 and accuracy.
  conf_F1 <- table(true_lab, pred_lab, exclude = unlabeled)

  # Look classes up by name instead of by row position: conf_F1 can have fewer
  # rows than there are true classes (e.g. a true label that is itself in the
  # exclude list), which would otherwise misalign or break the loop.
  classes <- names(pop_size)
  F1 <- numeric(length(classes))
  sum_acc <- 0

  for (k in seq_along(classes)) {
    cl <- classes[k]
    if (cl %in% rownames(conf_F1) && cl %in% colnames(conf_F1)) {
      tp <- conf_F1[cl, cl]
      if (tp > 0) {
        prec <- tp / sum(conf_F1[, cl])
        rec <- tp / sum(conf_F1[cl, ])
        F1[k] <- (2 * prec * rec) / (prec + rec)
      }
      sum_acc <- sum_acc + tp
    }
    # Classes that were never predicted keep an F1-score of 0.
  }
  names(F1) <- classes

  med_F1 <- median(F1)

  per_unlab <- sum(pred_lab %in% unlabeled) / length(pred_lab)

  acc <- sum_acc / sum(conf_F1)

  list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc,
       PercUnl = per_unlab, PopSize = pop_size)
}

results <- evaluate(TrueLabelsPath, PredLabelsPath)

# Make sure the per-measure output folders exist before writing into them.
for (sub_dir in c("Confusion", "F1", "PopSize", "Summary")) {
  dir.create(file.path(OutputDir, sub_dir), recursive = TRUE,
             showWarnings = FALSE)
}

write.csv(results$Conf,
          file.path(OutputDir, "Confusion", paste0(ToolName, ".csv")))
write.csv(results$F1,
          file.path(OutputDir, "F1", paste0(ToolName, ".csv")))
write.csv(results$PopSize,
          file.path(OutputDir, "PopSize", paste0(ToolName, ".csv")))
df <- data.frame(results[c("MedF1", "Acc", "PercUnl")])
write.csv(df, file.path(OutputDir, "Summary", paste0(ToolName, ".csv")))
import os
from sys import argv
from pathlib import Path

import rpy2.robjects as robjects
import numpy as np
import pandas as pd
from sklearn import linear_model


def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
    '''
    Rank the genes in the training set of each cross-validation fold based on
    their dropout level. The ranking is written to
    "rank_genes_dropouts.csv" in OutputDir, one column per fold; genes are
    identified by their zero-based column position in the data file.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # read the Rdata file; it provides n_folds, Cells_to_Keep and Train_Idx
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype='int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    # Keep Train_Idx as the R list and convert one fold at a time: folds can
    # differ in length, and building a single array from ragged sequences is
    # an error in recent NumPy versions.
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    data = data.iloc[tokeep]
    data = np.log2(data + 1)

    # one column of gene positions per fold
    genes = np.zeros([np.shape(data)[1], nfolds], dtype='int')

    for i in range(nfolds):
        # R indices are one-based
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        # refer to genes by position from here on
        train.columns = np.arange(len(train.columns))

        # dropout percentage and mean expression per gene in the training set
        dropout = (train == 0).sum(axis=0)
        dropout = (dropout / train.shape[0]) * 100
        mean = train.mean(axis=0)

        # genes with zero mean or zero dropout cannot enter the log-linear
        # fit; they are appended at the end of the ranking instead
        informative = (np.array(mean) > 0) & (np.array(dropout) > 0)
        notzero = np.where(informative)[0]
        zero = np.where(~informative)[0]
        zero_genes = train.columns[zero]

        if len(notzero) > 0:
            log_dropout = np.log2(np.array(dropout.iloc[notzero])).reshape(-1, 1)
            mean_expr = np.array(mean.iloc[notzero]).reshape(-1, 1)
            reg = linear_model.LinearRegression()
            reg.fit(mean_expr, log_dropout)

            # rank by how far the dropout lies above the fitted line
            residuals = log_dropout - reg.predict(mean_expr)
            residuals = pd.Series(np.array(residuals).ravel(),
                                  index=train.columns[notzero])
            residuals = residuals.sort_values(ascending=False)
            sorted_genes = residuals.index.append(zero_genes)
        else:
            # nothing to fit: keep the genes in their original order
            sorted_genes = zero_genes

        genes[:, i] = sorted_genes.values

    genes = pd.DataFrame(genes)

    genes.to_csv(str(Path(OutputDir) / "rank_genes_dropouts.csv"), index=False)


rank_gene_dropouts(argv[1], argv[2], argv[3])
#' Evaluate the performance of a classifier.
#'
#' Returns multiple evaluation measures: the confusion matrix, median
#' F1-score, F1-score for each class, accuracy, percentage of unlabeled cells
#' and population size.
#'
#' The percentage of unlabeled cells is found by checking for cells that are
#' labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand'
#' or 'ambiguous'. These predictions are left out of the F1-score and accuracy
#' computations.
#'
#' @param TrueLabelsPath csv file with the true labels (one column, no index).
#' @param PredLabelsPath csv file with the predicted labels (one column, no
#'   index).
#' @param Indices Optional positions of the rows to evaluate (e.g. if more
#'   datasets are tested at the same time). The vector is used directly as a
#'   subscript, so pass every position to keep (e.g. `begin:end`), not only
#'   the two end points.
#' @return A list with elements `Conf` (confusion matrix), `MedF1` (median
#'   F1-score), `F1` (F1-score per class), `Acc` (accuracy, unnamed scalar),
#'   `PercUnl` (fraction of unlabeled cells) and `PopSize` (number of cells
#'   per cell type).
evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL) {
  # Predicted labels that mean "no prediction was made".
  unlabeled <- c("unassigned", "Unassigned", "Unknown", "rand", "Node",
                 "ambiguous", "unknown")

  true_lab <- unlist(read.csv(TrueLabelsPath))
  pred_lab <- unlist(read.csv(PredLabelsPath))

  if (!is.null(Indices)) {
    true_lab <- true_lab[Indices]
    pred_lab <- pred_lab[Indices]
  }

  # Full confusion matrix (rejection labels included) and class sizes.
  conf <- table(true_lab, pred_lab)
  pop_size <- rowSums(conf)
  pop_size <- pop_size[pop_size > 0]

  # Collapse 'Node' followed by two characters into the single label 'Node'.
  pred_lab <- gsub("Node..", "Node", pred_lab)

  # Confusion matrix without the rejection labels, used for F1 and accuracy.
  conf_F1 <- table(true_lab, pred_lab, exclude = unlabeled)

  # Look classes up by name instead of by row position: conf_F1 can have fewer
  # rows than there are true classes (e.g. a true label that is itself in the
  # exclude list), which would otherwise misalign or break the loop.
  classes <- names(pop_size)
  F1 <- numeric(length(classes))
  sum_acc <- 0

  for (k in seq_along(classes)) {
    cl <- classes[k]
    if (cl %in% rownames(conf_F1) && cl %in% colnames(conf_F1)) {
      tp <- conf_F1[cl, cl]
      if (tp > 0) {
        prec <- tp / sum(conf_F1[, cl])
        rec <- tp / sum(conf_F1[cl, ])
        F1[k] <- (2 * prec * rec) / (prec + rec)
      }
      sum_acc <- sum_acc + tp
    }
    # Classes that were never predicted keep an F1-score of 0.
  }
  names(F1) <- classes

  med_F1 <- median(F1)

  per_unlab <- sum(pred_lab %in% unlabeled) / length(pred_lab)

  acc <- sum_acc / sum(conf_F1)

  list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc,
       PercUnl = per_unlab, PopSize = pop_size)
}
import os
import rpy2.robjects as robjects
import numpy as np
import pandas as pd
from sklearn import linear_model


def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
    '''
    Rank the genes in the training set of each cross-validation fold based on
    their dropout level. The ranking is written to
    "rank_genes_dropouts.csv" in OutputDir, one column per fold; genes are
    identified by their zero-based column position in the data file.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # read the Rdata file; it provides n_folds, Cells_to_Keep and Train_Idx
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype='int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    # Keep Train_Idx as the R list and convert one fold at a time: folds can
    # differ in length, and building a single array from ragged sequences is
    # an error in recent NumPy versions.
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    data = data.iloc[tokeep]
    data = np.log2(data + 1)

    # one column of gene positions per fold
    genes = np.zeros([np.shape(data)[1], nfolds], dtype='int')

    for i in range(nfolds):
        # R indices are one-based
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        # refer to genes by position from here on
        train.columns = np.arange(len(train.columns))

        # dropout percentage and mean expression per gene in the training set
        dropout = (train == 0).sum(axis=0)
        dropout = (dropout / train.shape[0]) * 100
        mean = train.mean(axis=0)

        # genes with zero mean or zero dropout cannot enter the log-linear
        # fit; they are appended at the end of the ranking instead
        informative = (np.array(mean) > 0) & (np.array(dropout) > 0)
        notzero = np.where(informative)[0]
        zero = np.where(~informative)[0]
        zero_genes = train.columns[zero]

        if len(notzero) > 0:
            log_dropout = np.log2(np.array(dropout.iloc[notzero])).reshape(-1, 1)
            mean_expr = np.array(mean.iloc[notzero]).reshape(-1, 1)
            reg = linear_model.LinearRegression()
            reg.fit(mean_expr, log_dropout)

            # rank by how far the dropout lies above the fitted line
            residuals = log_dropout - reg.predict(mean_expr)
            residuals = pd.Series(np.array(residuals).ravel(),
                                  index=train.columns[notzero])
            residuals = residuals.sort_values(ascending=False)
            sorted_genes = residuals.index.append(zero_genes)
        else:
            # nothing to fit: keep the genes in their original order
            sorted_genes = zero_genes

        genes[:, i] = sorted_genes.values

    genes = pd.DataFrame(genes)

    # Write straight into OutputDir instead of changing the process-wide
    # working directory, which would leak into the caller.
    genes.to_csv(os.path.join(OutputDir, "rank_genes_dropouts.csv"), index=False)