├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── RcppExports.R ├── cellClassifier.R ├── cellQC.R ├── clustCells.R ├── data.R ├── deGenes.R ├── dimensinalityReduction.R ├── gficf.R ├── pathwayAnalisys.R ├── util.R ├── visualization.R └── zzz.R ├── README.md ├── data-raw ├── small_atlas.R └── test_BC_atlas.R ├── data ├── small_BC_atlas.rda └── test_BC_atlas.rda ├── img ├── Cd34_expression.png ├── Cd8a_expression.png ├── tabula_annotated.png └── tabula_clusters.png ├── inst └── doc │ ├── index.Rmd │ ├── index.html │ ├── index_files │ └── figure-html │ │ ├── ccl-1.png │ │ ├── clustering-1.png │ │ └── norm-1.png │ ├── installation.Rmd │ ├── installation.html │ ├── scGSEA.Rmd │ ├── scGSEA.html │ ├── scGSEA_files │ └── figure-html │ │ ├── atlas-1.png │ │ ├── clust-1.png │ │ └── graph-1.png │ ├── scMAP.Rmd │ ├── scMAP.html │ └── scMAP_files │ └── figure-html │ ├── atlas-1.png │ └── map-1.png ├── man ├── Read10X.Rd ├── classify.cells.Rd ├── clustcells.Rd ├── clustcellsBYscGSEA.Rd ├── computePCADim.Rd ├── ensToSymbol.Rd ├── filterCells.Rd ├── findClusterMarkers.Rd ├── findVarGenes.Rd ├── gficf.Rd ├── loadGFICF.Rd ├── plotCells.Rd ├── plotGSEA.Rd ├── plotGSVA.Rd ├── plotGeneViolin.Rd ├── plotGenes.Rd ├── plotPathway.Rd ├── resetScGSEA.Rd ├── runGSEA.Rd ├── runHarmony.Rd ├── runNMF.Rd ├── runPCA.Rd ├── runReduction.Rd ├── runScGSEA.Rd ├── saveGFICF.Rd ├── scMAP.Rd ├── small_BC_atlas.Rd ├── symbolToEns.Rd └── test_BC_atlas.Rd ├── src ├── .gitignore ├── Makevars ├── Makevars.win ├── ModularityOptimizer.cpp ├── ModularityOptimizer.h ├── RModularityOptimizer.cpp ├── RcppExports.cpp ├── detectCores.cpp ├── jaccard_coeff.cpp ├── mann_whitney.cpp ├── mann_whitney.h ├── misc.cpp ├── rcpp_mann_whitney.cpp ├── rcpp_parallel_jaccard_coeff.cpp └── rcpp_parallel_mann_whitney.cpp └── tests └── testthat.R /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | .Rproj 3 | !.gitignore 4 | 5 | # History files 6 | .Rhistory 7 | .Rapp.history 8 | 9 | # Session Data files 10 | .RData 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | /*_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 40 | rsconnect/ 41 | 42 | .Rproj.user 43 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: gficf 2 | Title: Gene Frequency - Inverse Cell Frequency 3 | Version: 2.0.0 4 | Authors@R: person("Gennaro", "Gambardella", email = "gambardella@tigem.it", role = c("aut", "cre")) 5 | Description: Single cell data rapresentation as text 6 | License: GPL-3 7 | URL: https://github.com/gambalab/gficf 8 | BugReports: https://github.com/gambalab/gficf 9 | Encoding: UTF-8 10 | LazyData: true 11 | RoxygenNote: 7.2.2 12 | Depends: R (>= 3.6), RcppML 13 | LinkingTo: RcppArmadillo, Rcpp (>= 0.11.0), RcppGSL, RcppProgress, RcppParallel, RcppEigen 14 | biocViews: 15 | Imports: 16 | uwot, 17 | igraph, 18 | ggplot2, 19 | irlba, 20 | ggrepel, 21 | Rtsne, 22 | fastmatch, 23 | edgeR, 24 | fgsea, 25 | RSpectra, 26 | Matrix, 27 | babelgene, 28 | reshape2, 29 | RcppParallel, 30 | Rcpp, 31 | MASS, 32 | locfit, 33 | reticulate, 34 | leiden, 35 | KernelKnn, 36 | mgcv, 37 | GSVA, 38 | limma, 39 | pointr, 40 | AnnotationHub, 41 | ensembldb, 42 | harmony, 43 | BiocParallel 44 | Remotes: 45 | github::zdebruine/RcppML 46 | SystemRequirements: GNU make 47 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(Read10X) 4 | export(classify.cells) 5 | export(clustcells) 6 | export(clustcellsBYscGSEA) 7 | export(computePCADim) 8 | export(ensToSymbol) 9 | export(filterCells) 10 | export(findClusterMarkers) 11 | export(gficf) 12 | export(loadGFICF) 13 | export(plotCells) 14 | export(plotGSEA) 15 | export(plotGSVA) 16 | export(plotGeneViolin) 17 | export(plotGenes) 18 | export(plotPathway) 19 | export(resetScGSEA) 20 | export(runGSEA) 21 | export(runHarmony) 22 | export(runNMF) 23 | export(runPCA) 24 | export(runReduction) 25 | export(runScGSEA) 26 | export(saveGFICF) 27 | export(scMAP) 28 | export(symbolToEns) 29 | import(AnnotationHub) 30 | import(GSVA) 31 | import(Matrix) 32 | import(RcppML) 33 | import(ensembldb) 34 | import(fastmatch) 35 | import(ggplot2) 36 | import(ggrepel) 37 | import(mgcv) 38 | import(msigdbr) 39 | import(pointr) 40 | import(utils) 41 | import(uwot) 42 | importFrom(BiocParallel,SnowParam) 43 | importFrom(KernelKnn,KernelKnn) 44 | importFrom(MASS,fitdistr) 45 | importFrom(Matrix,readMM) 46 | importFrom(RSpectra,svds) 47 | importFrom(RcppML,nmf) 48 | importFrom(RcppParallel,RcppParallelLibs) 49 | importFrom(RcppParallel,setThreadOptions) 50 | importFrom(Rtsne,Rtsne) 51 | importFrom(babelgene,orthologs) 52 | importFrom(edgeR,DGEList) 53 | importFrom(edgeR,calcNormFactors) 54 | importFrom(edgeR,cpm) 55 | importFrom(fgsea,fgsea) 56 | importFrom(harmony,HarmonyMatrix) 57 | importFrom(igraph,as_adj) 58 | importFrom(igraph,cluster_louvain) 59 | importFrom(igraph,fastgreedy.community) 60 | importFrom(igraph,graph.adjacency) 61 | importFrom(igraph,graph.data.frame) 62 | importFrom(igraph,membership) 63 | importFrom(igraph,simplify) 64 | importFrom(igraph,walktrap.community) 65 | importFrom(irlba,irlba) 66 | importFrom(leiden,leiden) 67 | importFrom(limma,eBayes) 68 | importFrom(limma,lmFit) 69 | importFrom(limma,topTable) 70 | importFrom(locfit,locfit) 71 | importFrom(locfit,locfit.robust) 72 | importFrom(locfit,lp) 73 | importFrom(mgcv,gam) 74 | importFrom(reshape2,acast) 75 | importFrom(reshape2,melt) 76 | importFrom(reticulate,py_module_available) 77 | importFrom(sva,ComBat_seq) 78 | importFrom(utils,read.delim) 79 | importFrom(utils,read.table) 80 | importFrom(utils,write.table) 81 | useDynLib(gficf, .registration = TRUE) 82 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # gf-icf 0.0.0.1 2 | 3 | ## New features 4 | 5 | 6 | ## Bug fixes and minor improvements 7 | 8 | 9 | # gficf 0.0.0.1 (XX XXX 2019) 10 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | RunModularityClusteringCpp <- function(SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename) { 5 | .Call(`_gficf_RunModularityClusteringCpp`, SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename) 6 | } 7 | 8 | jaccard_coeff <- function(idx, printOutput) { 9 | .Call(`_gficf_jaccard_coeff`, idx, printOutput) 10 | } 11 | 12 | armaColSumFull <- function(m, ncores = 1L, verbose = FALSE) { 13 | .Call(`_gficf_armaColSumFull`, m, ncores, verbose) 14 | } 15 | 16 | armaColSumSparse <- function(m, ncores = 1L, verbose = FALSE) { 17 | .Call(`_gficf_armaColSumSparse`, m, ncores, verbose) 18 | } 19 | 20 | colMeanVarS <- function(m, ncores = 1L) { 21 | .Call(`_gficf_colMeanVarS`, m, ncores) 22 | } 23 | 24 | armaManhattan <- function(m, ncores = 1L, verbose = TRUE, full = FALSE, diag = TRUE) { 25 | .Call(`_gficf_armaManhattan`, m, ncores, verbose, full, diag) 26 | } 27 | 28 | armaCorr <- function(m, ncores = 1L, verbose = TRUE, full = FALSE, diag = TRUE, dist = TRUE) { 29 | .Call(`_gficf_armaCorr`, m, ncores, verbose, full, diag, dist) 30 | } 31 | 32 | armaColMeans <- function(m, ncores = 1L, verbose = TRUE) { 33 | .Call(`_gficf_armaColMeans`, m, ncores, verbose) 34 | } 35 | 36 | scaleUMI <- function(m, ncores = 1L, verbose = FALSE) { 37 | .Call(`_gficf_scaleUMI`, m, ncores, verbose) 38 | } 39 | 40 | rcpp_WMU_test <- function(M, idx1, idx2) { 41 | .Call(`_gficf_rcpp_WMU_test`, M, idx1, idx2) 42 | } 43 | 44 | rcpp_parallel_jaccard_coef <- function(mat, printOutput) { 45 | .Call(`_gficf_rcpp_parallel_jaccard_coef`, mat, printOutput) 46 | } 47 | 48 | rcpp_parallel_WMU_test <- function(matX, matY, printOutput) { 49 | .Call(`_gficf_rcpp_parallel_WMU_test`, matX, matY, printOutput) 50 | } 51 | 52 | -------------------------------------------------------------------------------- /R/cellClassifier.R: -------------------------------------------------------------------------------- 1 | #' Classify New Embedded Cells 2 | #' 3 | #' Classify new embedded cells using GF-ICF transformation and K-nn algorithm. 4 | #' Existing cells are used as training set. 5 | #' 6 | #' @param data list; GFICF object 7 | #' @param classes chareachters; Classes of already existing cells in the order of they are in colnames(data$gficf). 8 | #' @param k integer; Number of K-nn to use for classification. Odd number less than 30 are preferred. 9 | #' @param seed integer; Initial seed to use. 10 | #' @param knn_method string; a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'. 11 | #' @param knn_weights_fun value; there are various ways of specifying the kernel function. NULL value (default) correspond to unweighted KNN algorithm. See the details section of KernelKnn package for more values. 12 | #' @param nt numeric; Number of thread to use (default is 0, i.e. all possible CPUs - 1) 13 | #' @return A dataframe containing cell id and predicted classes. 14 | #' @importFrom KernelKnn KernelKnn 15 | #' 16 | #' @export 17 | classify.cells = function(data,classes,k=7,seed=18051982,knn_method="euclidean",knn_weights_fun=NULL,nt=0) 18 | { 19 | if (is.null(data$embedded.predicted)) {stop("Please embed first new cells!")} 20 | if (nt==0) {nt = ifelse(detectCores()>1,detectCores()-1,1)} 21 | set.seed(seed) 22 | if(!is.factor(classes)) {classes = factor(as.character(classes))} 23 | 24 | res = KernelKnn(data = data$embedded[,c(1,2)], TEST_data = data$embedded.predicted[,c(1,2)], y = as.numeric(classes), k = k ,h = 1,method = knn_method, knn_weights_fun, threads = nt, regression = F,Levels = 1:length(levels(classes))) 25 | colnames(res) = levels(classes) 26 | res <- apply(res, 1, function(x) {i = which.max(x); return(data.frame(predicted.class=names(x)[i],class.prob=x[i],stringsAsFactors = F))} ) 27 | res <- do.call("rbind",res) 28 | rownames(res) <- NULL 29 | #res = class::knn(data$embedded[,c(1,2)],data$embedded.predicted[,c(1,2)],classes,k = k,prob = T) 30 | data$embedded.predicted$predicted.class <- res$predicted.class 31 | data$embedded.predicted$class.prob <- res$class.prob 32 | return(data) 33 | } 34 | 35 | #' Embed new cells in an existing space 36 | #' 37 | #' This function embed new cells in an already existing space. For now it supports only UMAP and t-UMAP. Briefly new cells are first normalized with GF-ICF method but using as ICF weigth estimated on the existing cells and than projected in the existing PCA/NMF space before to be embedded in the already existing UMAP space via umap_transform function. 38 | #' 39 | #' @param data list; GFICF object 40 | #' @param x Matrix; UMI counts matrix of cells to embedd. 41 | #' @param nt integer; Number of thread to use (default 2). 42 | #' @param seed integer; Initial seed to use. 43 | #' @param normalize boolean; If counts must be normalized before to be rescaled with GFICF. 44 | #' @param verbose boolean; Icrease verbosity. 45 | #' @return The updated gficf object. 46 | #' @import Matrix 47 | #' @import uwot 48 | #' @importFrom edgeR DGEList calcNormFactors cpm 49 | #' @importFrom Rtsne Rtsne 50 | #' @import RcppML 51 | #' 52 | #' @export 53 | scMAP = function(data,x,nt=2,seed=18051982, normalize=TRUE,verbose=TRUE) 54 | { 55 | if(data$reduction=="tsne") {stop("Not supported with t-SNE reduction!!")} 56 | if(length(intersect(rownames(data$gficf),rownames(x)))==0) {stop("No common genes between the two dataset! Please check if gene identifiers beween two dataset correspond")} 57 | 58 | tsmessage("Gene filtering..",verbose = verbose) 59 | g = union(rownames(filter_genes_cell2loc_style(data = x,data$param$cell_count_cutoff,data$param$cell_percentage_cutoff2,data$param$nonz_mean_cutoff)),rownames(data$gficf)) 60 | x = x[rownames(x)%in%g,] 61 | rm(g) 62 | 63 | if (normalize){ 64 | tsmessage("Normalize counts..",verbose = verbose) 65 | x <- Matrix::Matrix(edgeR::cpm(edgeR::calcNormFactors(edgeR::DGEList(counts=x),normalized.lib.sizes = T)),sparse = T) 66 | } 67 | 68 | x = tf(x,verbose=verbose) 69 | x = idf(x,w = data$w,verbose=verbose) 70 | x = t(l.norm(t(x),norm = "l2",verbose=verbose)) 71 | 72 | if(!is.null(data$pca$odgenes) & data$pca$rescale) 73 | { 74 | x = Matrix::Matrix(data = x,sparse = T) 75 | x@x <- x@x*rep(data$pca$odgenes[colnames(x),'gsf'],diff(x@p)) 76 | } 77 | 78 | if (data$pca$type=="NMF") { 79 | cells = colnames(x) 80 | x = t(predict.nmf(w = data$pca$genes,data = x[rownames(data$pca$genes),])) 81 | rownames(x) = cells 82 | } else { 83 | x = t(x[rownames(data$pca$genes),]) %*% data$pca$genes 84 | } 85 | 86 | if(data$reduction%in%c("tumap","umap")) { 87 | data$embedded.predicted = as.data.frame(uwot::umap_transform(as.matrix(x),data$uwot,verbose = verbose)) 88 | rownames(data$embedded.predicted) = rownames(x) 89 | colnames(data$embedded.predicted) = c("X","Y") 90 | } 91 | 92 | data$pca$pred = x;rm(x);gc() 93 | return(data) 94 | } 95 | 96 | predict.nmf = function (w, data, L1 = 0, L2 = 0, mask = NULL, ...) 97 | { 98 | m <- new("nmf", w = w, d = rep(1:ncol(w)), h = matrix(0,nrow = ncol(w), 1)) 99 | RcppML::predict(m, data, L1 = L1, L2 = L2, mask = mask, ...) 100 | } 101 | 102 | -------------------------------------------------------------------------------- /R/cellQC.R: -------------------------------------------------------------------------------- 1 | #' Cell QC 2 | #' 3 | #' Filter Cells with low gene ratio detection and high MT ratio. 4 | #' Loess and GAM regression are used to fit relationships between the number of UMI and either the ratio of detected genes or the MT ratio. 5 | #' 6 | #' @param counts Matrix; Raw counts matrix 7 | #' @param organism characters; Organism (supported human and mouse). 8 | #' @param plot boolean; If regression plots must be showed. 9 | #' @param verbose boolean; Increase verbosity. 10 | #' @param minUMI numeric; Minimium number of UMI per cell (default 800). 11 | #' @return The updated gficf object. 12 | #' @import AnnotationHub 13 | #' @import ensembldb 14 | #' 15 | #' @export 16 | filterCells = function(counts,organism,plot=F,verbose=T,minUMI=800) { 17 | organism = ifelse(tolower(organism) == "human","Homo sapiens","Mus musculus") 18 | organism = base::match.arg(arg = organism,choices = c("Homo sapiens","Mus musculus"),several.ok = F) 19 | 20 | metadata = data.frame(cell.id = colnames(counts),nUMI=armaColSum(counts),nGenes=armaColSum(counts!=0),stringsAsFactors = F) 21 | rownames(metadata) = metadata$cell.id 22 | metadata$geneRatio = metadata$nGenes/metadata$nUMI 23 | 24 | tsmessage("... Filtering Cells by minUMI",verbose = verbose) 25 | b = nrow(metadata) 26 | metadata <- subset(metadata,nUMI>=minUMI) 27 | counts <- counts[,colnames(counts)%in%metadata$cell.id] 28 | a = nrow(metadata) 29 | tsmessage(paste0("Cells passing the coverage filter ",a," out of ",b," (",round(a/b*100,2),")"),verbose = verbose) 30 | 31 | tsmessage("... Retrieving gene annotation",verbose = verbose) 32 | ah <- AnnotationHub() 33 | # Access the Ensembl database for organism 34 | ahDb <- query(ah,pattern = c(organism,"EnsDb"), ignore.case = TRUE) 35 | # Acquire the latest annotation files 36 | id <- tail(rownames(mcols(ahDb)),n=1) 37 | # Download the appropriate Ensembldb database 38 | edb <- ah[[id]] 39 | # Extract gene-level information from database 40 | annotations <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT")) 41 | # Extract IDs for mitochondrial genes 42 | mt = annotations$gene_id[annotations$seq_name%in%"MT"] 43 | 44 | tsmessage("... Filtering Cells by Gene/nUMI ~ log(UMI)",verbose = verbose) 45 | tmp = plot.UMIxGene(metadata = metadata,method = "less",fdr.th = .1,plot = plot,family = "loess") 46 | metadata$covFilter = !tmp$toremove 47 | a = sum(metadata$covFilter) 48 | b = nrow(metadata) 49 | tsmessage(paste0("Cells passing the coverage filter ",a," out of ",b," (",round(a/b*100,2),")"),verbose = verbose) 50 | 51 | if (sum(mt %in% rownames(counts))==0) { 52 | tsmessage("No mitochondrial genes found! MT filter will be not applied.") 53 | metadata$mtFilter = TRUE 54 | } else { 55 | # Number of UMIs assigned to mitochondrial genes 56 | metadata$mtUMI <- Matrix::colSums(counts[which(rownames(counts) %in% mt),], na.rm = T) 57 | 58 | # Calculate of mitoRatio per cell 59 | metadata$mitoRatio <- metadata$mtUMI/metadata$nUMI 60 | tsmessage("... Filtering Cells by mtRatio ~ log(UMI)") 61 | tmp = plot.UMIxMT(metadata,method="greater",plot = plot,family = "loess") 62 | metadata$mtFilter = !tmp$toremove 63 | a = sum(metadata$mtFilter) 64 | tsmessage(paste0("Cells passing the MT filter ",a," out of ",b," (",round(a/b*100,2),")"),verbose = verbose) 65 | } 66 | data = list() 67 | data$counts = counts[,metadata$cell.id[metadata$covFilter & metadata$mtFilter]]; 68 | rm(counts);gc() 69 | data$QC.metadata = metadata[colnames(data$counts),] 70 | data$ann.hub.id = id 71 | return(data) 72 | } 73 | 74 | #' 75 | #' @import ggplot2 76 | #' @importFrom MASS fitdistr 77 | #' @importFrom mgcv gam 78 | #' 79 | plot.UMIxMT = function(metadata,method="greater",fdr.th=0.1,plot=F,family="gam") 80 | { 81 | if (family == "loess") {fit <- stats::loess(formula = mitoRatio ~ log(nUMI),data = metadata,span = 1, degree = 1,family = "gaussian")} 82 | if (family == "poly") {fit <- stats::glm(formula = mitoRatio ~ poly(log(nUMI),degree = 2,raw = T),data = metadata)} 83 | if (family == "gam") {fit <- mgcv::gam(mitoRatio ~ log(nUMI), data = metadata)} 84 | metadata$ypred = predict(fit,metadata) 85 | metadata$diff = metadata$mitoRatio - metadata$ypred 86 | fitDist = MASS::fitdistr(metadata$diff, "normal") 87 | 88 | if (method == "two.sided") { 89 | metadata$p.val[metadata$diff>0] = pnorm(metadata$diff[metadata$diff>0], mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE) 90 | metadata$p.val[metadata$diff<0] = 1 - pnorm(metadata$diff[metadata$diff<0], mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE) 91 | } 92 | 93 | if (method == "greater") {metadata$p.val = stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)} 94 | if (method == "less") {metadata$p.val = 1 - stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)} 95 | metadata$p.val = stats::p.adjust(metadata$p.val,method = "fdr") 96 | metadata$toremove = metadata$p.val0] = stats::pnorm(metadata$diff[metadata$diff>0], mean = 0, sd = fitDist$estimate[2], lower.tail = FALSE) 123 | metadata$p.val[metadata$diff<0] = 1 - stats::pnorm(metadata$diff[metadata$diff<0], mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE) 124 | } 125 | if (method == "greater") {metadata$p.val = stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)} 126 | if (method == "less") {metadata$p.val = 1 - stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)} 127 | 128 | metadata$p.val = p.adjust(metadata$p.val,method = "fdr") 129 | metadata$toremove = metadata$p.val0, ] 67 | relations <- as.data.frame(relations) 68 | colnames(relations)<- c("from","to","weight") 69 | g <- igraph::graph.data.frame(relations, directed=FALSE) 70 | rm(relations,neigh);gc() 71 | 72 | if (community.algo=="louvain") 73 | { 74 | tsmessage("Performing louvain...",verbose = verbose) 75 | community <- igraph::cluster_louvain(g) 76 | } 77 | 78 | if (community.algo=="louvain 2") 79 | { 80 | tsmessage("Performing louvain with modularity optimization...",verbose = verbose) 81 | community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,1,n.start,n.iter,seed,verbose) 82 | } 83 | 84 | if (community.algo=="louvain 3") 85 | { 86 | tsmessage("Performing louvain with modularity optimization...",verbose = verbose) 87 | community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,2,n.start,n.iter,seed,verbose) 88 | } 89 | 90 | if (community.algo=="walktrap") 91 | { 92 | tsmessage("Performing walktrap...",verbose = verbose) 93 | community <- igraph::walktrap.community(g) 94 | } 95 | if (community.algo=="fastgreedy") { 96 | tsmessage("Performing fastgreedy...",verbose = verbose) 97 | community <- igraph::fastgreedy.community(g) 98 | } 99 | 100 | if (community.algo=="leiden") 101 | { 102 | if (!(reticulate::py_module_available("leidenalg") && reticulate::py_module_available("igraph"))) 103 | stop("Cannot find Leiden algorithm, please install through pip (e.g. sudo -H pip install leidenalg igraph).") 104 | 105 | tsmessage("Performing leiden...",verbose = verbose) 106 | community <- leiden::leiden(object = g,resolution_parameter=resolution) 107 | } 108 | 109 | 110 | if(community.algo %in% c("louvain 2","louvain 3","leiden")) { 111 | if(community.algo %in% c("louvain 2","louvain 3")) {community = community + 1} 112 | data$embedded$cluster = as.character(community) 113 | } else { 114 | data$embedded$cluster <- as.character(igraph::membership(community)) 115 | } 116 | 117 | if (store.graph) {data$community=community;data$cell.graph=g} else {data$community=community} 118 | 119 | # get centroid of clusters 120 | tsmessage("Computing Cluster Signatures...",verbose = verbose) 121 | cluster.map = data$embedded$cluster 122 | u = base::unique(cluster.map) 123 | data$cluster.gene.rnk = base::sapply(u, function(x,y=data$gficf,z=cluster.map) armaRowSum(y[,z%in%x])) 124 | 125 | tsmessage(paste("Detected Clusters:",length(unique(data$embedded$cluster))),verbose = verbose) 126 | 127 | return(data) 128 | } 129 | 130 | # Runs the modularity optimizer (C++ function from seurat package https://github.com/satijalab/seurat) 131 | # 132 | # @param SNN SNN matrix to use as input for the clustering algorithms 133 | # @param modularity Modularity function to use in clustering (1 = standard; 2 = alternative) 134 | # @param resolution Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities 135 | # @param algorithm Algorithm for modularity optimization (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM algorithm; 4 = Leiden algorithm). Leiden requires the leidenalg python module. 136 | # @param n.start Number of random starts 137 | # @param n.iter Maximal number of iterations per random start 138 | # @param random.seed Seed of the random number generator 139 | # @param print.output Whether or not to print output to the console 140 | # @param temp.file.location Deprecated and no longer used 141 | # @param edge.file.name Path to edge file to use 142 | # 143 | # @return clusters 144 | # 145 | #' @importFrom utils read.table write.table 146 | # 147 | RunModularityClustering <- function(SNN = matrix(), modularity = 1, resolution = 0.8, algorithm = 1, n.start = 10, n.iter = 10, random.seed = 0, print.output = TRUE, temp.file.location = NULL, edge.file.name = "") 148 | { 149 | clusters <- RunModularityClusteringCpp(SNN,modularity,resolution,algorithm,n.start,n.iter,random.seed,print.output,edge.file.name) 150 | return(clusters) 151 | } 152 | 153 | 154 | #' Cell clustering by pathway's activity 155 | #' 156 | #' Cell clustering using pathway expression instead of gene expression 157 | #' 158 | #' Cells are clustered using the estimated pathway activity levels by scGSEA() function. 159 | #' Cells are clustered either by using PhenoGraph algorithm or hierarchical clustering. In the case of PhenoGraph method is used 160 | #' identified clusters with louvain method are stored into data$embedded$cluster.by.scGSEA while the prduced graph into data$scgsea$cell.graph. 161 | #' In case hierarchical clusteringis used the resulting dendogram is stored into data$scgsea$h.dendo. 162 | #' 163 | #' 164 | #' @param data list; Input data (gficf object) 165 | #' @param method character; Method used to produce the cell network, such as phenograph or fgraph. 166 | #' @param pca numeric; If different from NULL data are reduced using pca component before to apply clustering algorithm. 167 | #' @param k integer; number of nearest neighbours (default:10). 168 | #' @param resolution Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities. 169 | #' @param n.start Number of random starts. 170 | #' @param n.iter Maximal number of iterations per random start. 171 | #' @param nt integer; Number of cpus to use for k-nn search. If zero all cpu are used. 172 | #' @param seed integer; Seed to use for replication. 173 | #' @param verbose logical; Increase verbosity. 174 | #' @return the updated gficf object 175 | #' @importFrom igraph graph.data.frame graph.adjacency 176 | #' @importFrom RcppParallel setThreadOptions RcppParallelLibs 177 | #' @import uwot 178 | #' @importFrom irlba irlba 179 | #' @import Matrix 180 | #' @export 181 | clustcellsBYscGSEA <- function(data,method="fgraph",pca=NULL,k=10, resolution = 0.25, n.start = 50, n.iter = 250, nt=0, verbose=T, seed=180582) 182 | { 183 | if (is.null(data$scgsea)) {stop("Run first runScGSEA function")} 184 | method = base::match.arg(arg = method,choices = c("phenograph","fgraph"),several.ok = F) 185 | 186 | nt=detectCores() 187 | 188 | if (method=="phenograph") { 189 | x = data$scgsea$x 190 | 191 | tsmessage("Finding Neighboors..",verbose = verbose) 192 | 193 | if (!is.null(pca)) { 194 | x <- irlba::irlba(A = x,nv=pca,center = Matrix::rowMeans(t(x))) 195 | x <- x$u %*% diag(x$d) 196 | rownames(x) <- rownames(data$scgsea$x) 197 | } 198 | 199 | if (ncol(x)>100) { 200 | neigh = uwot:::find_nn(x,k=k,include_self = F,n_threads = nt,verbose = verbose,method = "annoy",metric="manhattan",n_trees = 100)$idx 201 | } else { 202 | neigh = uwot:::find_nn(x,k=k,include_self = F,n_threads = nt,verbose = verbose,method = "fnn",metric="manhattan")$idx 203 | } 204 | rm(x) 205 | 206 | RcppParallel::setThreadOptions(numThreads = nt) 207 | relations <- rcpp_parallel_jaccard_coef(neigh,verbose) 208 | relations <- relations[relations[,3]>0, ] 209 | relations <- as.data.frame(relations) 210 | colnames(relations)<- c("from","to","weight") 211 | g <- igraph::graph.data.frame(relations, directed=FALSE) 212 | rm(relations,neigh);gc() 213 | 214 | tsmessage("Performing louvain with modularity optimization...",verbose = verbose) 215 | community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,2,n.start,n.iter,seed,verbose) 216 | community = community + 1 217 | data$embedded$cluster.by.scGSEA = as.character(community) 218 | data$scgsea$cell.graph=g; rm(g) 219 | 220 | # get centroid of clusters 221 | tsmessage("Computing Cluster Signatures...",verbose = verbose) 222 | cluster.map = data$embedded$cluster.by.scGSEA 223 | u = base::unique(cluster.map) 224 | data$scgsea$cluster.gsea.mu = base::sapply(u, function(x,y=t(data$scgsea$x),z=cluster.map) Matrix::rowMeans(y[,z%in%x])) 225 | tsmessage(paste("Detected Clusters:",length(unique(data$embedded$cluster))),verbose = verbose) 226 | } 227 | 228 | if (method=="fgraph") { 229 | g = uwot::umap(X = as.matrix(data$scgsea$x),n_neighbors = k,metric = "manhattan",scale = "Z",n_trees = 100,pca = pca,ret_extra = "fgraph",n_threads = nt,verbose = verbose)$fgraph 230 | g <- igraph::graph.adjacency(adjmatrix = g, mode = "undirected",weighted = TRUE,diag = F) 231 | 232 | tsmessage("Performing louvain with modularity optimization...",verbose = verbose) 233 | community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,2,n.start,n.iter,seed,verbose) 234 | community = community + 1 235 | data$embedded$cluster.by.scGSEA = as.character(community) 236 | data$scgsea$cell.graph=g; rm(g) 237 | 238 | # get centroid of clusters 239 | tsmessage("Computing Cluster Signatures...",verbose = verbose) 240 | cluster.map = data$embedded$cluster.by.scGSEA 241 | u = base::unique(cluster.map) 242 | data$scgsea$cluster.gsea.mu = base::sapply(u, function(x,y=t(data$scgsea$x),z=cluster.map) Matrix::rowMeans(y[,z%in%x])) 243 | tsmessage(paste("Detected Clusters:",length(unique(data$embedded$cluster))),verbose = verbose) 244 | } 245 | 246 | gc() 247 | return(data) 248 | } 249 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Small Single cell breast cancer atlas 2 | #' 3 | #' A subset of data from the the Single cell breast cancer atlas 4 | #' Only 150 cells per cell-line are used 5 | #' 6 | #' @format ## `small_BC_atlas` 7 | #' A Matrix of 4,760 cells 8 | #' @source 9 | "small_BC_atlas" 10 | 11 | #' Test set for scMAP method 12 | #' 13 | #' A subset of data from the the Single cell breast cancer atlas 14 | #' Only 30 cells per cell-line are used. They do not ovrlapp with cells into small_BC_atlas dataset 15 | #' 16 | #' @format ## `test_BC_atlas` 17 | #' A Matrix of 930 cells 18 | #' @source 19 | "test_BC_atlas" -------------------------------------------------------------------------------- /R/deGenes.R: -------------------------------------------------------------------------------- 1 | #' Find Marker Genes from cell clusters. 2 | #' 3 | #' Try to identify marker genes across clusters performing Mann-Whitney U test. 4 | #' DE genes are identified the expression in each cluster versus the all the other. 5 | #' 6 | #' @param data list; GFICF object 7 | #' @param nt integer; Number of thread to use (default 2). 8 | #' @param hvg boolean; Use only High Variable Genes (default is TRUE). 9 | #' @param verbose boolean; Icrease verbosity. 10 | #' @return The updated gficf object. 11 | #' @import Matrix 12 | #' @importFrom RcppParallel setThreadOptions 13 | #' 14 | #' @export 15 | findClusterMarkers = function(data,nt=2,hvg=T,verbose=T) 16 | { 17 | if (is.null(data$community)) {stop("Please identify cluster first! Run clustcells function.")} 18 | if (is.null(data$rawCounts)) {stop("No raw/normalized counts stored. You should have run gficf normalization with storeRaw = T")} 19 | 20 | RcppParallel::setThreadOptions(numThreads = nt) 21 | 22 | u = unique(data$embedded$cluster) 23 | 24 | # Normalize if were not 25 | cpms = normCounts(data$rawCounts,doc_proportion_max = 2,doc_proportion_min = 0,normalizeCounts = !data$param$normalized,verbose=verbose) 26 | 27 | if (hvg) 28 | { 29 | tsmessage("... Detecting HVGs") 30 | df = findVarGenes(cpms,fitMethod = "locfit",verbose = verbose) 31 | df = df[order(df$FDR),] 32 | ix = df$FDR<.1 33 | tsmessage(paste("... Detected",sum(ix),"significant HVGs with an FDR < 10%")) 34 | if(sum(ix)<=1000) {ix = 1:min(1000,nrow(df))} 35 | cpms = cpms[df$gene[ix],] 36 | rm(df) 37 | } 38 | 39 | cpms = as.matrix(cpms) 40 | 41 | tsmessage("... Start identify marker genes") 42 | res = NULL 43 | #progress_for(n=0,tot = length(u),display = T) 44 | for (i in 1:length(u)) { 45 | cells.1 = which(data$embedded$cluster%in%u[i]) 46 | cells.2 = which(!data$embedded$cluster%in%u[i]) 47 | tmp = rcpp_parallel_WMU_test(matX = cpms[,cells.1],matY = cpms[,cells.2],printOutput = F) 48 | tmp = data.frame(ens=rownames(cpms),log2FC=tmp[,2],p.value=tmp[,1],fdr=p.adjust(tmp[,1],method = "fdr"),stringsAsFactors = F) 49 | tmp = subset(tmp,fdr<.05 & log2FC>0) 50 | tmp = tmp[order(tmp$fdr,decreasing = F),] 51 | tmp$cluster = u[i] 52 | res = rbind(res,tmp) 53 | #progress_for(n=i,tot = length(u),display = T) 54 | } 55 | 56 | res = res[order(res$log2FC,decreasing = T),] 57 | rownames(res) = NULL 58 | data$de.genes = res 59 | return(data) 60 | } 61 | 62 | #' Find high variable genes following the approach 63 | #' proposed by Chen et al. in BMC Genomics (2016) 64 | #' Code adapted from https://github.com/hillas/scVEGs 65 | #' @param data list; GFICF object 66 | #' @param fitMethod charachter; Method to use to fit variance and mean expression relationship (loess or locfit). 67 | #' @param verbose boolean; Increase verbosity. 68 | #' @import Matrix 69 | #' @importFrom locfit locfit locfit.robust lp 70 | #' @importFrom MASS fitdistr 71 | #' 72 | findVarGenes = function(data,fitMethod="locfit",verbose=T) 73 | { 74 | m <- dim(data)[1] 75 | std <- apply(data, 1, stats::sd) 76 | avg <- Matrix::rowMeans(data) 77 | cv <- std / avg 78 | # over dispersion sigma (var = u(1 + u * sigma^2)) 79 | xdata <- (avg) 80 | ydata <- log10(cv) 81 | xdata <- xdata[is.na(ydata) != "TRUE"] 82 | ydata <- ydata[is.na(ydata) != "TRUE"] 83 | 84 | if (fitMethod=="loess"){ 85 | fitLoc <- stats::loess(formula = ydata ~ log10(x = xdata),data = data.frame("ydata"=ydata,"xdata"=xdata,stringsAsFactors = F),span = 0.8) 86 | } else { 87 | fitLoc <- locfit::locfit.robust(ydata ~ locfit::lp(log10(xdata), nn = .2)) 88 | } 89 | 90 | xSeq <- seq(min(log10(xdata)), max(log10(xdata)), 0.005) 91 | gapNum <- matrix(0, length(xSeq), 1) 92 | for(i in 1:length(xSeq)) { 93 | cdx <- which((log10(xdata) >= xSeq[i] - 0.05) & (log10(xdata) < xSeq[i] + 0.05)) 94 | gapNum[i,1] <- length(cdx) 95 | } 96 | cdx <- which(gapNum > m*0.005) 97 | xSeq <- 10 ^ xSeq 98 | ySeq <- predict(fitLoc,log10(xSeq)) 99 | yDiff <- diff(ySeq) 100 | ix <- which(yDiff > 0 & log10(xSeq[-1]) > 0) 101 | if(length(ix) == 0) 102 | ix <- length(ySeq) - 1 103 | xSeq_all <- 10^seq(min(log10(xdata)), max(log10(xdata)), 0.001) 104 | xSeq <- xSeq[cdx[1]:ix[1] + 1] 105 | ySeq <- ySeq[cdx[1]:ix[1] + 1] 106 | 107 | b <- 1 108 | a <- 0 109 | df <- data.frame(x=xSeq, y = ySeq) 110 | fit = stats::nls(y ~ 0.5 * log10(b / x + a), data = df, start=list(b = b,a = a), stats::nls.control(maxiter = 500), na.action = 'na.exclude') 111 | newdf <- data.frame(x = xSeq_all) 112 | ydataFit <- stats::predict(fit,newdata = newdf) 113 | 114 | # Calculate CV difference 115 | logX <- log10(xdata) 116 | 117 | logXseq <- log10(xSeq_all) 118 | cvDist <- matrix(0,length(xdata),1) 119 | 120 | #progress_for(n=0,tot = length(logX),display = verbose) 121 | 122 | for (i in 1:length(logX)) 123 | { 124 | cx <- which(logXseq >= logX[i] - 0.2 & logXseq < logX[i] + 0.2) 125 | tmp <- sqrt((logXseq[cx] - logX[i])^2 + (ydataFit[cx] - ydata[i])^2) 126 | tx <- which.min(tmp) 127 | 128 | if(logXseq[cx[tx]] > logX[i]) { 129 | if(ydataFit[cx[tx]] > ydata[i]) { 130 | cvDist[i] <- -1*tmp[tx] 131 | } else { 132 | cvDist[i] <- tmp[tx] 133 | } 134 | cvDist[i] <- -1*tmp[tx] 135 | } else if (logXseq[cx[tx]] <= logX[i]) { 136 | if(ydataFit[cx[tx]] < ydata[i]) { 137 | cvDist[i] <- tmp[tx] 138 | } else { 139 | cvDist[i] <- -1*tmp[tx] 140 | } 141 | } 142 | 143 | #progress_for(n=i,tot = length(logX),display = verbose) 144 | 145 | } 146 | 147 | cvDist <- log2(10^cvDist) 148 | 149 | # use kernel density estimate to find the peak 150 | dor <- stats::density(cvDist, kernel = "gaussian") 151 | distMid <-dor$x[which.max(dor$y)] 152 | dist2 <- cvDist - distMid 153 | tmpDist <- c(dist2[dist2 <= 0], abs(dist2[dist2 < 0])) + distMid 154 | distFit <- MASS::fitdistr(tmpDist, "normal") 155 | 156 | res = data.frame(gene=rownames(data), 157 | mean=avg, 158 | "cv"=cv, 159 | P=pnorm(cvDist, mean = distFit$estimate[1], sd = distFit$estimate[2], lower.tail = FALSE) 160 | ) 161 | res$FDR <- stats::p.adjust(res$P, 'fdr') 162 | 163 | return(res) 164 | } 165 | 166 | -------------------------------------------------------------------------------- /R/dimensinalityReduction.R: -------------------------------------------------------------------------------- 1 | #' Non-Negative Matrix Factorization (NMF) 2 | #' 3 | #' Reduce dimensionality of the single cell dataset using Non-Negative Matrix Factorization (NMF) 4 | #' 5 | #' @param data list; GFICF object 6 | #' @param dim integer; Number of dimension which to reduce the dataset. 7 | #' @param centre logical; Centre gficf scores before applying reduction (increase separation). 8 | #' @param randomized logical; Use randomized (faster) version for matrix decomposition (default is TRUE). 9 | #' @param seed integer; Initial seed to use. 10 | #' @param use.odgenes boolean; Use only significant overdispersed genes respect to ICF values. 11 | #' @param n.odgenes integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000. 12 | #' @param plot.odgenes boolean; Show significant overdispersed genes respect to ICF values. 13 | #' @param nt numeric; Numbmber of thread to use (default is 0, i.e. all available CPU cores). 14 | #' @param ... Additional arguments to pass to nfm call (see ?RcppML::nmf). 15 | #' @return The updated gficf object. 16 | #' @import RcppML 17 | #' @import Matrix 18 | #' 19 | #' @export 20 | runNMF = function(data,dim=NULL,seed=180582,use.odgenes=F,n.odgenes=NULL,plot.odgenes=F, nt=0, ...) 21 | { 22 | if(use.odgenes & is.null(data$rawCounts)) {stop("Raw Counts absent! Please run gficf normalization with storeRaw = T")} 23 | 24 | if (nt==0) {nt = ifelse(detectCores()>1,detectCores()-1,1)} 25 | options(RcppML.threads = nt) 26 | 27 | if (is.null(dim)) 28 | { 29 | if (is.null(data$dimPCA)) {stop("Specify the number of dims or run computePCADim first")} else {dim=data$dimPCA} 30 | } else { 31 | data$dimPCA = dim 32 | } 33 | 34 | set.seed(seed) 35 | 36 | data$pca = list() 37 | 38 | if(use.odgenes) { 39 | overD=suppressWarnings(findOverDispersed(data = data,alpha = 0.1,verbose = F,plot = plot.odgenes)) 40 | overD$lpa[is.na(overD$lpa)] <- 0 41 | odgenes <- rownames(overD[overD$lpalength(odgenes)) { 44 | odgenes <- rownames(overD)[(order(overD$lp,decreasing=F)[1:min(nrow(data$gficf),n.odgenes)])] 45 | } else { 46 | odgenes <- odgenes[1:n.odgenes] 47 | } 48 | } 49 | data$pca$cells = t(data$gficf)[,odgenes] 50 | data$pca$odgenes = overD 51 | tsmessage("... using ",length(odgenes)," OD genes",verbose = T) 52 | } 53 | 54 | if (is.null(data$pca$cells)){ 55 | tsmessage("Performing NFM..") 56 | nfm = RcppML::nmf(data$gficf,k = dim, ...) 57 | } else { 58 | nfm = RcppML::nmf(t(data$pca$cells),k = dim, ...) 59 | } 60 | 61 | data$pca$cells <- t(nfm$h) 62 | data$pca$genes <- nfm$w 63 | rm(nfm);gc() 64 | data$pca$centre <- F # for legacy 65 | data$pca$rescale <- F # for legacy 66 | data$pca$type = "NMF" 67 | data$pca$use.odgenes = use.odgenes 68 | 69 | if(use.odgenes) {rownames(data$pca$genes)=odgenes} else {rownames(data$pca$genes) = rownames(data$gficf)} 70 | rownames(data$pca$cells) = colnames(data$gficf) 71 | return(data) 72 | } 73 | 74 | #' Principal Component Analysis (PCA) 75 | #' 76 | #' Reduce dimensionality of the single cell dataset using Principal Component Analysis (PCA) 77 | #' 78 | #' @param data list; GFICF object 79 | #' @param dim integer; Number of dimension which to reduce the dataset. 80 | #' @param centre logical; Centre gficf scores before applying reduction (increase separation). 81 | #' @param seed integer; Initial seed to use. 82 | #' @param use.odgenes boolean; Use only significant overdispersed genes respect to ICF values. 83 | #' @param n.odgenes integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000. 84 | #' @param plot.odgenes boolean; Show significant overdispersed genes respect to ICF values. 85 | #' @return The updated gficf object. 86 | #' @return The updated gficf object. 87 | #' @importFrom irlba irlba 88 | #' 89 | #' @export 90 | runPCA = function(data,dim=NULL,var.scale=F,centre=F,seed=180582,use.odgenes=F,n.odgenes=NULL,plot.odgenes=F) 91 | { 92 | 93 | if(use.odgenes & is.null(data$rawCounts)) {stop("Raw Counts absent! Please run gficf normalization with storeRaw = T")} 94 | 95 | if (is.null(dim)) 96 | { 97 | if (is.null(data$dimPCA)) {stop("Specify the number of dims or run computePCADim first")} else {dim=data$dimPCA} 98 | } else { 99 | data$dimPCA = dim 100 | } 101 | 102 | set.seed(seed) 103 | 104 | data$pca = list() 105 | data$pca$cells = t(data$gficf) 106 | 107 | if(use.odgenes) { 108 | overD=suppressWarnings(findOverDispersed(data = data,alpha = 0.1,verbose = F,plot = plot.odgenes)) 109 | overD$lpa[is.na(overD$lpa)] <- 0 110 | odgenes <- rownames(overD[overD$lpalength(odgenes)) { 113 | odgenes <- rownames(overD)[(order(overD$lp,decreasing=F)[1:min(nrow(data$gficf),n.odgenes)])] 114 | } else { 115 | odgenes <- odgenes[1:n.odgenes] 116 | } 117 | } 118 | data$pca$cells = data$pca$cells[,odgenes] 119 | data$pca$odgenes = overD 120 | tsmessage("... using ",length(odgenes)," OD genes",verbose = T) 121 | } 122 | 123 | #x = rsvd::rpca(data$pca$cells,k=dim,center=centre,scale=F,rand=randomized) 124 | if (centre) { 125 | x <- irlba::irlba(A = data$pca$cells,nv=dim,center = Matrix::rowMeans(t(data$pca$cells))) 126 | } else { 127 | x <- irlba::irlba(A = data$pca$cells,nv=dim) 128 | } 129 | 130 | x$x <- x$u %*% diag(x$d) 131 | data$pca$cells = x$x 132 | data$pca$centre <- centre 133 | data$pca$rescale <- F 134 | data$pca$genes <- x$v 135 | data$pca$use.odgenes = use.odgenes 136 | rm(x); gc() 137 | data$pca$type = "PCA" 138 | if(use.odgenes) {rownames(data$pca$genes)=odgenes} else {rownames(data$pca$genes) = rownames(data$gficf)} 139 | rownames(data$pca$cells) = colnames(data$gficf) 140 | colnames(data$pca$cells) = colnames(data$pca$genes) = paste("C",1:dim,sep = "") 141 | return(data) 142 | } 143 | 144 | #' Dimensionality reduction 145 | #' 146 | #' Run t-SNE or UMAP or t-UMAP dimensionality reduction on selected features from PCA or NMF. 147 | #' See ?umap or ?Rtsne for additional parameter to use. 148 | #' 149 | #' @param data list; GFICF object 150 | #' @param reduction characters; Reduction method to use. One of: 151 | #' \itemize{ 152 | #' \item \code{"tsne"} 153 | #' \item \code{"umap"} 154 | #' \item \code{"tumap"} (the default) 155 | #' } 156 | #' @param nt integer; Number of thread to use (default 2). 157 | #' @param seed integer; Initial seed to use. 158 | #' @param verbose boolean; Icrease verbosity. 159 | #' @param ... Additional arguments to pass to Rtsne/umap/tumap call. 160 | #' @return The updated gficf object. 161 | #' @import uwot 162 | #' @importFrom Rtsne Rtsne 163 | #' 164 | #' @export 165 | runReduction = function(data,reduction="tumap",nt=2,seed=18051982, verbose=T, ...) 166 | { 167 | 168 | reduction = base::match.arg(arg = reduction,choices = c("umap","tumap","tsne"),several.ok = F) 169 | 170 | set.seed(seed) 171 | if (!is.null(data$pca)) 172 | { 173 | if(reduction=="tumap"){ 174 | if (is.null(data$pca$harmony)){ 175 | data$uwot = uwot::tumap(X = data$pca$cells,scale = F,n_threads = nt,verbose = verbose,ret_model = T, ...) 176 | } else { 177 | data$uwot = uwot::tumap(X = t(data$pca$harmony$Z_corr),scale = F,n_threads = nt,verbose = verbose,ret_model = T, ...) 178 | } 179 | data$embedded = base::as.data.frame(data$uwot$embedding) 180 | } 181 | 182 | if(reduction=="umap"){ 183 | if (is.null(data$pca$harmony)){ 184 | data$uwot = uwot::umap(X = data$pca$cells, scale = F,n_threads = nt,verbose = verbose, ret_model = T, ...) 185 | } else { 186 | data$uwot = uwot::umap(X = t(data$pca$harmony$Z_corr), scale = F,n_threads = nt,verbose = verbose, ret_model = T, ...) 187 | } 188 | data$embedded = base::as.data.frame(data$uwot$embedding) 189 | } 190 | 191 | if(reduction=="tsne"){ 192 | data$uwot = NULL 193 | if (is.null(data$pca$harmony)){ 194 | data$embedded = base::as.data.frame(Rtsne::Rtsne(X = data$pca$cells,dims = 2, pca = F,verbose = verbose,max_iter=1000,num_threads=nt, ...)$Y) 195 | } else { 196 | data$embedded = base::as.data.frame(Rtsne::Rtsne(X = t(data$pca$harmony$Z_corr),dims = 2, pca = F,verbose = verbose,max_iter=1000,num_threads=nt, ...)$Y) 197 | } 198 | } 199 | } else { 200 | message("Warning: Reduction is applied directly on GF-ICF values.. can be slow if the dataset is big!") 201 | 202 | if(reduction=="tumap"){data$embedded = base::as.data.frame(uwot::tumap(X = as.matrix(t(data$gficf)),scale = F,n_threads = nt,verbose = verbose, ...))} 203 | 204 | if(reduction=="umap"){data$embedded = base::as.data.frame(uwot::umap(X = as.matrix(t(data$gficf)), scale = F,n_threads = nt,verbose = verbose, ...))} 205 | 206 | if(reduction=="tsne"){data$embedded = base::as.data.frame(Rtsne::Rtsne(X = as.matrix(t(data$gficf)), dims = 2, pca = F, verbose = verbose, max_iter=1000,num_threads=nt, ...)$Y)} 207 | } 208 | rownames(data$embedded) = base::colnames(data$gficf) 209 | colnames(data$embedded) = base::c("X","Y") 210 | data$reduction = reduction 211 | return(data) 212 | } 213 | 214 | #' Number of features to use 215 | #' 216 | #' Compute the number of dimension to use for either PCA or LSA. 217 | #' 218 | #' @param data list; GFICF object 219 | #' @param seed numeric; seed to use. 220 | #' @param subsampling logical; Use only a subset of the data for the imputation of dimensions to use. 221 | #' @param plot logical; Show eblow plot. 222 | #' @importFrom RSpectra svds 223 | #' 224 | #' @export 225 | computePCADim = function(data,seed=180582,subsampling=F,plot=T) 226 | { 227 | set.seed(seed) 228 | dim = min(50,ncol(data$gficf)) 229 | 230 | if (subsampling) 231 | { 232 | x = data$gficf[,sample(x = 1:ncol(data$gficf),size = round(ncol(data$gficf)/100*5))] 233 | ppk<- RSpectra::svds(t(x),k=dim) 234 | rm(x) 235 | } else { 236 | ppk<- RSpectra::svds(t(data$gficf),k=dim) 237 | } 238 | 239 | explained.var = ppk$d^2 / sum(ppk$d^2) 240 | if(plot) {plot(explained.var,xlab="components",ylab="explained.var")} 241 | 242 | ratio_to_first_diff <- diff(ppk$d^2 / sum(ppk$d^2)) / diff(ppk$d^2 / sum(ppk$d^2))[1] 243 | #reduction_dim <- (which(ratio_to_first_diff < 0.1) + 1)[1] 244 | ix = which(cumsum(diff((which(ratio_to_first_diff < 0.1))) == 1)>1)[1] 245 | reduction_dim = which(ratio_to_first_diff < 0.1)[ix] 246 | 247 | cat("Number of estimated dimensions =",reduction_dim) 248 | data$dimPCA = reduction_dim 249 | return(data) 250 | } 251 | 252 | # find over dispersed genes respect to computed ICF 253 | # ispired by pagoda2 function. Thanks to them. 254 | #' @import mgcv 255 | findOverDispersed=function(data,gam.k=5, alpha=5e-2, plot=FALSE, use.unadjusted.pvals=FALSE,do.par=T,max.adjusted.variance=1e3,min.adjusted.variance=1e-3,verbose=TRUE,min.gene.cells=0) 256 | { 257 | rowSel <- NULL; 258 | 259 | tsmessage("calculating variance fit ...",verbose=verbose) 260 | df = colMeanVarS(t(data$rawCounts),ncores = ifelse(detectCores()>1,detectCores()-1,1)) 261 | df$m = data$w 262 | 263 | # gene-relative normalizaton 264 | df$v <- log(df$v); 265 | rownames(df) <- rownames(data$gficf); 266 | vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells); 267 | if(length(vi)0) { 301 | points(df$m[ods],df$v[ods],pch='.',col=2,cex=1) 302 | } 303 | smoothScatter(df$m[vi],df$qv[vi],xlab='ICF value',ylab='',main='adjusted') 304 | abline(h=1,lty=2,col=8) 305 | if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) } 306 | points(df$m[ods],df$qv[ods],col=2,pch='.') 307 | } 308 | tsmessage("done.\n",verbose=verbose) 309 | return(df) 310 | } 311 | 312 | # BH P-value adjustment with a log option 313 | bh.adjust <- function(x, log = FALSE, verbose = F) 314 | { 315 | nai <- which(!is.na(x)) 316 | ox <- x 317 | x<-x[nai] 318 | id <- order(x, decreasing = FALSE) 319 | if(log) { 320 | q <- x[id] + log(length(x)/seq_along(x)) 321 | } else { 322 | q <- x[id]*length(x)/seq_along(x) 323 | } 324 | a <- rev(cummin(rev(q)))[order(id)] 325 | ox[nai]<-a 326 | ox 327 | } 328 | 329 | #' Number of features to use 330 | #' 331 | #' Compute the number of dimension to use for either PCA or LSA. 332 | #' 333 | #' @param data list; GFICF object 334 | #' @param metadata dataframe; Either (1) Dataframe with variables to integrate or (2) vector with labels. 335 | #' @param var.to.use character; If meta_data is dataframe, this defined which variable(s) to remove (character vector). 336 | #' @param verbose boolean; Increase verbosity. 337 | #' @param ... Additional arguments to pass to HarmonyMatrix function. 338 | #' @importFrom harmony HarmonyMatrix 339 | #' 340 | #' @export 341 | runHarmony <- function(data,metadata, var.to.use, verbose = T, ...) 342 | { 343 | tsmessage(".. Running Harmony on PCA/NMF space",verbose=verbose) 344 | if (is.null(data$pca)) {stop("Please run fist PCA or NMF reduction!")} 345 | data$pca$harmony <- harmony::HarmonyMatrix(data$pca$cells, meta_data = metadata, vars_use = var.to.use,do_pca = F, verbose = F, return_object = T, ...) 346 | colnames(data$pca$harmony$Z_corr) <- rownames(data$pca$cells) 347 | rownames(data$pca$harmony$Z_corr) <- colnames(data$pca$cells) 348 | tsmessage(".. Finished!",verbose=verbose) 349 | return(data) 350 | } 351 | -------------------------------------------------------------------------------- /R/gficf.R: -------------------------------------------------------------------------------- 1 | #' Gene Frequency - Inverse Cell Frequency (GF-ICF) 2 | #' 3 | #' R implementation of the GF-ICF 4 | #' Thanks to 3’-end scRNA-seq approaches, we can now have an accurate estimation of gene expression without having to account for gene length, 5 | #' thus the number of transcripts (i.e. UMI) associated to each gene, strictly reflects the frequency of a gene in a cell, exactly like a word in a document. 6 | #' GFICF (Gene Frequency - Inverse Cell Frequency) is analogous of TF-IDF scoring method as defined for tex mining With GFICF we consider a cell to be 7 | #' analogous to a document, genes analogous to words and gene counts to be analogous of the word’s occurrence in a document. 8 | #' 9 | #' @param M Matrix; UMI cell count matrix 10 | #' @param QCdata list; QC cell object. 11 | #' @param cell_count_cutoff numeric; All genes detected in less than cell_count_cutoff cells will be excluded (default 5). 12 | #' @param cell_percentage_cutoff2 numeric; All genes detected in at least this percentage of cells will be included (default 0.03, i.e. 3 percent of cells). 13 | #' @param nonz_mean_cutoff numeric genes detected in the number of cells between the above mentioned cutoffs are selected only when their average expression in non-zero cells is above this cutoff (default 1.12). 14 | #' @param normalize logical; Rescale UMI counts before apply GFICF. Rescaling is done using EdgeR normalization. 15 | #' @param storeRaw logical; Store UMI counts. 16 | #' @param batches vector; Vector or factor for batch. 17 | #' @param groups vector; Vector or factor for biological condition of interest. 18 | #' @param filterGenes logical; Apply gene filter (default TRUE). 19 | #' @param verbose boolean; Increase verbosity. 20 | #' @param ... Additional arguments to pass to ComBat_seq call. 21 | #' @return The updated gficf object. 22 | #' 23 | #' @export 24 | gficf = function(M=NULL,QCdata=NULL,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12,normalize=TRUE,storeRaw=TRUE,batches=NULL,groups=NULL,filterGenes=TRUE,verbose=TRUE,fastNomalization=F, ...) 25 | { 26 | if(is.null(M) & is.null(QCdata)) {stop("Input data is missing!!")} 27 | 28 | data = list() 29 | if (!is.null(QCdata)) { 30 | data = QCdata 31 | rm(QCdata);gc(reset = T) 32 | if (!is.null(M)) {rm(M);gc()} 33 | } else { 34 | data$counts = M;rm(M);gc() 35 | } 36 | 37 | data = normCountsData(data,cell_count_cutoff,cell_percentage_cutoff2,nonz_mean_cutoff,normalize,batches,groups,verbose,filterGenes,fastNomalization, ...) 38 | data$gficf = tf(data$rawCounts,verbose = verbose) 39 | if (!storeRaw) {data$rawCounts=NULL;data$counts=NULL;gc()} 40 | data$w = getIdfW(data$gficf,verbose = verbose) 41 | data$gficf = idf(data$gficf,data$w,verbose = verbose) 42 | data$gficf = t(l.norm(t(data$gficf),norm = "l2",verbose = verbose)) 43 | 44 | data$param <- list() 45 | data$param$cell_count_cutoff = cell_count_cutoff 46 | data$param$cell_percentage_cutoff2 = cell_percentage_cutoff2 47 | data$param$nonz_mean_cutoff = nonz_mean_cutoff 48 | data$param$normalized = normalize 49 | return(data) 50 | } 51 | 52 | #' @import Matrix 53 | #' @importFrom edgeR DGEList calcNormFactors cpm 54 | #' @importFrom sva ComBat_seq 55 | #' 56 | normCounts = function(M,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12,normalizeCounts=TRUE,batches=NULL,groups=NULL,verbose=TRUE,filterGene=FALSE,fastNomalization=FALSE, ...) 57 | { 58 | if (filterGene) { 59 | tsmessage("Gene filtering..",verbose = verbose) 60 | M = filter_genes_cell2loc_style(data = M,cell_count_cutoff,cell_percentage_cutoff2,nonz_mean_cutoff) 61 | } 62 | 63 | if (normalizeCounts) 64 | { 65 | if(!is.null(batches)){ 66 | tsmessage("Correcting batches..",verbose = verbose) 67 | M = Matrix::Matrix(data = sva::ComBat_seq(counts = as.matrix(M),batch = batches,group = groups, ...),sparse = T) 68 | } 69 | tsmessage("Normalize counts..",verbose = verbose) 70 | if (!fastNomalization) { 71 | M <- Matrix::Matrix(edgeR::cpm(edgeR::calcNormFactors(edgeR::DGEList(counts=M),normalized.lib.sizes = T)),sparse = T) 72 | } else { 73 | nt = ifelse(detectCores()>1,detectCores()-1,1) 74 | M <- scaleUMI(F,nt,F) 75 | } 76 | } else { 77 | data$rawCounts <- data$counts 78 | data$counts <- NULL; gc() 79 | } 80 | 81 | return(M) 82 | } 83 | 84 | #' @import Matrix 85 | #' @importFrom edgeR DGEList calcNormFactors cpm 86 | #' @importFrom sva ComBat_seq 87 | #' 88 | normCountsData = function(data,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12,normalizeCounts=TRUE,batches=NULL,groups=NULL,verbose=TRUE,filterGene=TRUE, fastNomalization=FALSE, ...) 89 | { 90 | if (filterGene) { 91 | tsmessage("Gene filtering..",verbose = verbose) 92 | data$counts = filter_genes_cell2loc_style(data = data$counts,cell_count_cutoff,cell_percentage_cutoff2,nonz_mean_cutoff) 93 | } 94 | 95 | if (normalizeCounts) 96 | { 97 | if(!is.null(batches)){ 98 | tsmessage("Correcting batches..",verbose = verbose) 99 | data$counts = Matrix::Matrix(data = sva::ComBat_seq(counts = as.matrix(data$counts),batch = batches,group = groups, ...),sparse = T) 100 | gc() 101 | } 102 | tsmessage("Normalize counts..",verbose = verbose) 103 | if (!fastNomalization) { 104 | data$rawCounts <- Matrix::Matrix(edgeR::cpm(edgeR::calcNormFactors(edgeR::DGEList(counts=data$counts),normalized.lib.sizes = T)),sparse = T) 105 | } else { 106 | nt = ifelse(detectCores()>1,detectCores()-1,1) 107 | data$rawCounts <- scaleUMI(data$counts,nt,F) 108 | } 109 | } else { 110 | data$rawCounts <- data$counts 111 | data$counts <- NULL; gc() 112 | } 113 | 114 | if(is.null(batches)){data$counts=NULL;gc()} 115 | 116 | return(data) 117 | } 118 | 119 | #' @import Matrix 120 | #' 121 | tf = function(M,verbose) 122 | { 123 | 124 | tsmessage("Apply GF transformation..",verbose = verbose) 125 | M =t(t(M) / armaColSum(M)) 126 | 127 | return(M) 128 | } 129 | 130 | #' @import Matrix 131 | #' 132 | idf = function(M,w,verbose) 133 | { 134 | tsmessage("Applay ICF..",verbose = verbose) 135 | M = M[rownames(M) %in% names(w),] 136 | if(nrow(M)0] 45 | pathways 46 | } 47 | 48 | #' Gene Set Enrichement Analysi on GF-ICF 49 | #' 50 | #' Compute GSEA for each cluster across a set of input pathways. 51 | #' 52 | #' @param data list; GFICF object 53 | #' @param gmt.file characters; Path to gmt file from MSigDB 54 | #' @param nsim integer; number of simulation used to compute ES significance. 55 | #' @param convertToEns boolean: Convert gene sets from gene symbols to Ensable id. 56 | #' @param convertHu2Mm boolean: Convert gene sets from human symbols to Mouse Ensable id. 57 | #' @param nt numeric; Number of cpu to use for the GSEA 58 | #' @param minSize numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded. 59 | #' @param maxSize numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded. 60 | #' @param verbose boolean; Show the progress bar. 61 | #' @param seed integer; Seed to use for random number generation. 62 | #' @param method string; Method to use GSEA or GSVA. Default is GSEA. 63 | #' @return The updated gficf object. 64 | #' @importFrom fgsea fgsea 65 | #' @import fastmatch 66 | #' @importFrom limma lmFit eBayes topTable 67 | #' @import GSVA 68 | #' @export 69 | runGSEA <- function(data,gmt.file,nsim=1000,convertToEns=T,convertHu2Mm=F,nt=2,minSize=15,maxSize=Inf,verbose=TRUE,seed=180582,method="GSEA") 70 | { 71 | set.seed(seed) 72 | 73 | if (is.null(data$cluster.gene.rnk)) {stop("Please run clustcell function first")} 74 | mthod = base::match.arg(arg = method,choices = c("GSEA","GSVA"),several.ok = F) 75 | 76 | if (method == "GSEA") 77 | { 78 | tsmessage("Choosen method is GSEA...",verbose=verbose) 79 | data$gsea = list() 80 | data$gsea$pathways = gmtPathways(gmt.file,convertToEns,convertHu2Mm,verbose) 81 | data$gsea$es = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk)) 82 | data$gsea$nes = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk)) 83 | data$gsea$pval = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk)) 84 | data$gsea$fdr = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk)) 85 | 86 | rownames(data$gsea$es) = rownames(data$gsea$nes) = rownames(data$gsea$pval) = rownames(data$gsea$fdr) = names(data$gsea$pathways) 87 | colnames(data$gsea$es) = colnames(data$gsea$nes) = colnames(data$gsea$pval) = colnames(data$gsea$fdr) = colnames(data$cluster.gene.rnk) 88 | 89 | pb = utils::txtProgressBar(min = 0, max = ncol(data$cluster.gene.rnk), initial = 0,style = 3) 90 | for (i in 1:ncol(data$cluster.gene.rnk)) 91 | { 92 | df = as.data.frame(fgsea::fgseaMultilevel(pathways = data$gsea$pathways,stats = data$cluster.gene.rnk[,i],nPermSimple = nsim,gseaParam = 0,nproc = nt,minSize = minSize,maxSize = maxSize))[,1:7] 93 | data$gsea$es[df$pathway,i] = df$ES 94 | data$gsea$nes[df$pathway,i] = df$NES 95 | data$gsea$pval[df$pathway,i] = df$pval 96 | data$gsea$fdr[df$pathway,i] = df$padj 97 | utils::setTxtProgressBar(pb,i) 98 | } 99 | close(pb) 100 | 101 | data$gsea$stat = df[,c("pathway","size")] 102 | } else { 103 | tsmessage("Choosen method is GSVA...",verbose=verbose) 104 | data$gsva = list() 105 | data$gsva$pathways = gmtPathways(gmt.file,convertToEns,convertHu2Mm,verbose) 106 | data$gsva$DEpathways = NULL 107 | data$gsva$res = Matrix::Matrix(data = 0,nrow = length(data$gsva$pathways),ncol = ncol(data$gficf)) 108 | rownames(data$gsva$res) = names(data$gsva$pathways) 109 | colnames(data$gsva$res) = colnames(data$gficf) 110 | 111 | tsmessage("Start executiong GSVA cluster by cluster",verbose=verbose) 112 | options(warn=-1) 113 | u = unique(data$embedded$cluster) 114 | for (i in 1:length(u)) 115 | { 116 | tsmessage(paste0("..Executing GSVA for cluster ",i," out of ",length(u))) 117 | cells = rownames(data$embedded)[data$embedded$cluster%in%u[i]] 118 | res = GSVA::gsva(expr = as.matrix(data$gficf[,cells]),gset.idx.list = data$gsva$pathways,kcdf="Gaussian",min.sz=minSize,max.sz=maxSize,parallel.sz=nt,method="gsva",verbose=F) 119 | data$gsva$res[rownames(res),cells] = res 120 | rm(res) 121 | } 122 | options(warn=0) 123 | data$gsva$res = data$gsva$res[armaRowSum(data$gsva$res!=0)>0,] 124 | 125 | tsmessage("Start executiong Limma cluster by cluster",verbose=verbose) 126 | for (i in 1:length(u)) 127 | { 128 | tsmessage(paste0("..Calling DE pathways for cluster ",i," out of ",length(u))) 129 | clusters = data$embedded$cluster 130 | clusters[!clusters%in%u[i]] = "other" 131 | clusters[!clusters%in%"other"] = paste0("C",clusters[!clusters%in%"other"]) 132 | design <- model.matrix(~ factor(clusters)) 133 | colnames(design) <- c("ALL", paste0("C",u[i],"vsOTHER")) 134 | fit <- limma::lmFit(data$gsva$res, design) 135 | fit <- limma::eBayes(fit) 136 | df <- as.data.frame(limma::topTable(fit, coef=paste0("C",u[i],"vsOTHER"), number=Inf)) 137 | df$pathway = rownames(df) 138 | df$cluster = u[i] 139 | data$gsva$DEpathways = rbind(data$gsva$DEpathways,df) 140 | rm(df) 141 | } 142 | rownames(data$gsva$DEpathways) = NULL 143 | } 144 | return(data) 145 | } 146 | 147 | #' Single cell Gene Set Enrichement Analysis on GF-ICF 148 | #' 149 | #' Compute GSEA for each cells across a set of input pathways by using NMF. 150 | #' 151 | #' @param data list; GFICF object 152 | #' @param geneID characters; The type of gene identifier to use, such as ensamble of symbol. 153 | #' @param species characters; Species name, such as human or mouse. 154 | #' @param category characters; MSigDB collection abbreviation, such as H or C1. 155 | #' @param subcategory characters; MSigDB sub-collection abbreviation, such as CGP or BP. 156 | #' @param pathway.list list; Custom list of pathways. Each element correspond to a pathway a and contains a vector of genes. 157 | #' @param nsim integer; number of simulation used to compute ES significance. 158 | #' @param nt numeric; Number of cpu to use for the GSEA and NMF. Default is 0 (i.e., all available cores minus one) 159 | #' @param minSize numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded. 160 | #' @param maxSize numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded. 161 | #' @param verbose boolean; Show the progress bar. 162 | #' @param seed integer; Seed to use for random number generation. 163 | #' @param nmf.k numeric; Rank of NMF. 164 | #' @param fdr.th numeric; FDR threshold for GSEA. 165 | #' @param rescale string; If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none. 166 | #' @param normalization; normalization to use before to apply NMF. Possible values are gficf or cpm. Default and highly raccomanded is gficf. 167 | #' @return The updated gficf object. 168 | #' @importFrom fgsea fgsea 169 | #' @import fastmatch 170 | #' @importFrom RcppML nmf 171 | #' @import utils 172 | #' @import pointr 173 | #' @import msigdbr 174 | #' @importFrom BiocParallel SnowParam 175 | #' @export 176 | runScGSEA <- function(data,geneID,species,category,subcategory=NULL,pathway.list=NULL,nsim=10000,nt=0,minSize=15,maxSize=Inf,verbose=TRUE,seed=180582,nmf.k=100,fdr.th=0.05,gp=0,rescale="none",normalization="gficf") 177 | { 178 | if(nt==0) {nt=detectCores()} 179 | #species = base::match.arg(arg = species,choices = c("human","mouse"),several.ok = F) 180 | geneID = base::match.arg(arg = geneID,choices = c("ensamble","symbol"),several.ok = F) 181 | rescale = base::match.arg(arg = rescale,choices = c("none","byGS","byCell"),several.ok = F) 182 | normalization = base::match.arg(arg = normalization,choices = c("gficf","cpm"),several.ok = F) 183 | options(RcppML.threads = nt) 184 | set.seed(seed) 185 | 186 | if (is.null(data$scgsea)) 187 | { 188 | data$scgsea = list() 189 | if (normalization=="gficf") 190 | { 191 | if (!is.null(data$pca) && data$pca$type == "NMF"){ 192 | if (data$dimPCAnt,nt,nt_fgsea) 258 | bpparameters <- BiocParallel::SnowParam(nt_fgsea) 259 | for (i in 1:ncol(data$scgsea$nmf.w)) 260 | { 261 | df = as.data.frame(fgsea::fgseaMultilevel(pathways = data$scgsea$pathways,stats = data$scgsea$nmf.w[,i],nPermSimple = nsim,gseaParam = gp,BPPARAM = bpparameters,minSize = minSize,maxSize = maxSize))[,1:7] 262 | data$scgsea$es[df$pathway,i] = df$ES 263 | data$scgsea$nes[df$pathway,i] = df$NES 264 | data$scgsea$pval[df$pathway,i] = df$pval 265 | data$scgsea$fdr[df$pathway,i] = df$padj 266 | utils::setTxtProgressBar(pb,i) 267 | } 268 | base::close(pb) 269 | on.exit(options(warn = oldw)) 270 | 271 | ix = is.na(data$scgsea$nes) 272 | if(sum(ix)>0) { 273 | data$scgsea$nes[ix] = 0 274 | data$scgsea$pval[ix] = 1 275 | data$scgsea$fdr[ix] = 1 276 | } 277 | 278 | data$scgsea$x = data$scgsea$nes 279 | data$scgsea$x[data$scgsea$x<0 | data$scgsea$fdr>=fdr.th] = 0 280 | data$scgsea$x = Matrix::Matrix(data = data$scgsea$nmf.h %*% t(data$scgsea$x),sparse = T) 281 | 282 | data$scgsea$stat = df[,c("pathway","size")] 283 | data$scgsea$x = data$scgsea$x[,armaColSum(data$scgsea$x)>0] 284 | 285 | if(rescale!="none"){ 286 | if(rescale=="byGS") { 287 | data$scgsea$x = t(data$scgsea$x) 288 | data$scgsea$x = t( (data$scgsea$x - rowMeans(data$scgsea$x)) / apply(data$scgsea$x, 1, sd)) 289 | } 290 | if(rescale=="byCell") { 291 | data$scgsea$x = (data$scgsea$x - rowMeans(data$scgsea$x)) / apply(data$scgsea$x, 1, sd) 292 | } 293 | } 294 | 295 | return(data) 296 | } 297 | 298 | #' Remove previous scGSEA analysis 299 | #' 300 | #' Remove previous scGSEA analysis 301 | #' @param data list; GFICF object 302 | #' @export 303 | resetScGSEA <- function(data){ 304 | data$scgsea <- NULL 305 | return(data) 306 | } 307 | 308 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | #' @import Matrix 2 | #' 3 | scaleMatrix = function(x,rescale,centre) 4 | { 5 | if (FALSE) 6 | { 7 | message("Rescaling..") 8 | bc_tot <- armaRowSum(x) 9 | median_tot <- stats::median(bc_tot) 10 | x <- base::sweep(x, 1, median_tot/bc_tot, '*') 11 | message("Rescaling Done!") 12 | } 13 | 14 | if (FALSE) 15 | { 16 | message("Centering data..") 17 | x <- base::sweep(x, 2, Matrix::colMeans(x), '-') 18 | x <- base::sweep(x, 2, base::apply(x, 2, sd), '/') 19 | message("Centering Done!") 20 | } 21 | return(x) 22 | } 23 | 24 | 25 | stime <- function() { 26 | format(Sys.time(), "%T") 27 | } 28 | 29 | # message with a time stamp 30 | tsmessage <- function(..., domain = NULL, appendLF = TRUE, verbose = TRUE,time_stamp = TRUE) { 31 | if (verbose) { 32 | msg <- "" 33 | if (time_stamp) { 34 | msg <- paste0(stime(), " ") 35 | } 36 | message(msg, ..., domain = domain, appendLF = appendLF) 37 | utils::flush.console() 38 | } 39 | } 40 | 41 | #' Convert Ensamble IDs to Official Gene Symbols 42 | #' 43 | #' It uses biomart. If more the one gene is associated to the enamble, the first one retrived from 44 | #' Biomart is used. 45 | #' 46 | #' @param df data frame; Data frame containing the IDs to convert. 47 | #' @param col characters; Name of column containing the ensamble ids. 48 | #' @param organism characters; Organism of origin (i.e. human or mouse). 49 | #' @param verbose boolean; Icrease verbosity. 50 | #' @return The updated data frame with a new column called symb. 51 | #' 52 | #' @import AnnotationHub 53 | #' @import fastmatch 54 | #' 55 | #' @export 56 | ensToSymbol = function(df,col,organism,verbose=T) 57 | { 58 | organism = tolower(organism) 59 | organism = base::match.arg(arg = organism,choices = c("human","mouse"),several.ok = F) 60 | org.map = c("Homo Sapiens","Mus Musculus") 61 | names(org.map) = c("human","mouse") 62 | 63 | tsmessage("... Retrieving gene annotation from AnnotationHub()",verbose = verbose) 64 | ah <- AnnotationHub::AnnotationHub() 65 | ahDb <- AnnotationHub::query(ah,pattern = c(org.map[organism],"EnsDb"), ignore.case = TRUE) 66 | id <- tail(rownames(AnnotationHub::mcols(ahDb)),n=1) 67 | edb <- ah[[id]] 68 | ens.map <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT") & !gene_biotype%in%"LRG_gene") 69 | 70 | if(organism %in% "human") 71 | { 72 | tsmessage(".. Start converting human ensamble id to symbols",verbose = verbose) 73 | df$symb = NA 74 | df$symb = ens.map$gene_name[fastmatch::fmatch(df[,col],ens.map$gene_id)] 75 | tsmessage("Done!",verbose = verbose) 76 | } 77 | 78 | if (organism %in% "mouse") 79 | { 80 | tsmessage(".. Start converting mouse ensamble id to symbols",verbose = verbose) 81 | df$symb = NA 82 | df$symb = ens.map$gene_name[fastmatch::fmatch(df[,col],ens.map$gene_id)] 83 | tsmessage("Done!",verbose = verbose) 84 | } 85 | 86 | return(df) 87 | } 88 | 89 | #' Convert Official Gene Symbols to Ensamble IDs 90 | #' 91 | #' It uses biomart. If more the one gene is associated to the enamble, the first one retrived from 92 | #' Biomart is used. 93 | #' 94 | #' @param df data frame; Data frame containing the IDs to convert. 95 | #' @param col characters; Name of column containing the gene symbol. 96 | #' @param organism characters; Organism of origin (i.e. human or mouse). 97 | #' @param verbose boolean; Icrease verbosity. 98 | #' @return The updated data frame with a new column called ens. 99 | #' 100 | #' @import AnnotationHub 101 | #' @import fastmatch 102 | #' 103 | #' @export 104 | symbolToEns = function(df,col,organism,verbose=T) 105 | { 106 | organism = tolower(organism) 107 | organism = base::match.arg(arg = organism,choices = c("human","mouse"),several.ok = F) 108 | org.map = c("Homo Sapiens","Mus Musculus") 109 | names(org.map) = c("human","mouse") 110 | 111 | tsmessage("... Retrieving gene annotation from AnnotationHub()",verbose = verbose) 112 | ah <- AnnotationHub::AnnotationHub() 113 | ahDb <- AnnotationHub::query(ah,pattern = c(org.map[organism],"EnsDb"), ignore.case = TRUE) 114 | id <- tail(rownames(AnnotationHub::mcols(ahDb)),n=1) 115 | edb <- ah[[id]] 116 | ens.map <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT") & !gene_biotype%in%"LRG_gene") 117 | 118 | if(organism %in% "human") 119 | { 120 | tsmessage(".. Start converting human symbols to human ensamble id",verbose = verbose) 121 | df$ens = NA 122 | df$ens = ens.map$gene_id[fastmatch::fmatch(df[,col],ens.map$symbol)] 123 | tsmessage("Done!",verbose = verbose) 124 | } 125 | 126 | if (organism %in% "mouse") 127 | { 128 | tsmessage(".. Start converting human symbols to mouse ensamble id",verbose = verbose) 129 | df$ens = NA 130 | df$ens = ens.map$gene_id[fastmatch::fmatch(df[,col],ens.map$symbol)] 131 | tsmessage("Done!",verbose = verbose) 132 | } 133 | 134 | return(df) 135 | } 136 | 137 | 138 | # Rcpp progress bar style 139 | progress_for <- function(n, tot,display) { 140 | if (display) { 141 | message("0% 10 20 30 40 50 60 70 80 90 100%") 142 | message("[----|----|----|----|----|----|----|----|----|----|") 143 | # n:tot = nstars:50 -> nstars = (n*50)/tot 144 | nstars = floor((n*50)/tot) 145 | if(nstars>0) 146 | for (i in 1:nstars) { 147 | message("*", appendLF = FALSE) 148 | utils::flush.console() 149 | } 150 | message("|") 151 | } 152 | } 153 | 154 | # Filter cells with a cell2location style 155 | # https://cell2location.readthedocs.io/en/latest/cell2location.utils.filtering.html 156 | filter_genes_cell2loc_style = function(data,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12) 157 | { 158 | nt = ifelse(detectCores()>1,detectCores()-1,1) 159 | data = data[armaRowSum(data)>0,] 160 | csums = armaRowSum(data!=0) 161 | gene_to_remove = csums <= cell_count_cutoff | csums/ncol(data) <= cell_percentage_cutoff2 162 | gene_to_remove = gene_to_remove & armaColMeans(t(data),nt,FALSE)$mu1 <= nonz_mean_cutoff 163 | data = data[!gene_to_remove,] 164 | return(data) 165 | } 166 | 167 | #' Load in data from 10X 168 | #' 169 | #' Enables easy loading of sparse data matrices provided by 10X genomics. 170 | #' 171 | #' @param data.dir Directory containing the matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv 172 | #' files provided by 10X. A vector or named vector can be given in order to load 173 | #' several data directories. If a named vector is given, the cell barcode names 174 | #' will be prefixed with the name. 175 | #' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2 176 | #' @param cell.column Specify which column of barcodes.tsv to use for cell names; default is 1 177 | #' @param unique.features Make feature names unique (default TRUE) 178 | #' @param strip.suffix Remove trailing "-1" if present in all cell barcodes. 179 | #' 180 | #' @return If features.csv indicates the data has multiple data types, a list 181 | #' containing a sparse matrix of the data from each type will be returned. 182 | #' Otherwise a sparse matrix containing the expression data will be returned. 183 | #' 184 | #' @importFrom Matrix readMM 185 | #' @importFrom utils read.delim 186 | #' 187 | #' @export 188 | #' 189 | #' @examples 190 | #' \dontrun{ 191 | #' # For output from CellRanger < 3.0 192 | #' data_dir <- 'path/to/data/directory' 193 | #' list.files(data_dir) # Should show barcodes.tsv, genes.tsv, and matrix.mtx 194 | #' expression_matrix <- Read10X(data.dir = data_dir) 195 | #' seurat_object = CreateSeuratObject(counts = expression_matrix) 196 | #' 197 | #' # For output from CellRanger >= 3.0 with multiple data types 198 | #' data_dir <- 'path/to/data/directory' 199 | #' list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz 200 | #' data <- Read10X(data.dir = data_dir) 201 | #' seurat_object = CreateSeuratObject(counts = data$`Gene Expression`) 202 | #' seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`) 203 | #' } 204 | #' 205 | Read10X <- function( 206 | data.dir, 207 | gene.column = 1, 208 | cell.column = 1, 209 | unique.features = TRUE, 210 | strip.suffix = FALSE 211 | ) { 212 | full.data <- list() 213 | for (i in seq_along(along.with = data.dir)) { 214 | run <- data.dir[i] 215 | if (!dir.exists(paths = run)) { 216 | stop("Directory provided does not exist") 217 | } 218 | barcode.loc <- file.path(run, 'barcodes.tsv') 219 | gene.loc <- file.path(run, 'genes.tsv') 220 | features.loc <- file.path(run, 'features.tsv.gz') 221 | matrix.loc <- file.path(run, 'matrix.mtx') 222 | # Flag to indicate if this data is from CellRanger >= 3.0 223 | pre_ver_3 <- file.exists(gene.loc) 224 | if (!pre_ver_3) { 225 | addgz <- function(s) { 226 | return(paste0(s, ".gz")) 227 | } 228 | barcode.loc <- addgz(s = barcode.loc) 229 | matrix.loc <- addgz(s = matrix.loc) 230 | } 231 | if (!file.exists(barcode.loc)) { 232 | stop("Barcode file missing. Expecting ", basename(path = barcode.loc)) 233 | } 234 | if (!pre_ver_3 && !file.exists(features.loc) ) { 235 | stop("Gene name or features file missing. Expecting ", basename(path = features.loc)) 236 | } 237 | if (!file.exists(matrix.loc)) { 238 | stop("Expression matrix file missing. Expecting ", basename(path = matrix.loc)) 239 | } 240 | data <- readMM(file = matrix.loc) 241 | cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL) 242 | if (ncol(x = cell.barcodes) > 1) { 243 | cell.names <- cell.barcodes[, cell.column] 244 | } else { 245 | cell.names <- readLines(con = barcode.loc) 246 | } 247 | if (all(grepl(pattern = "\\-1$", x = cell.names)) & strip.suffix) { 248 | cell.names <- as.vector(x = as.character(x = sapply( 249 | X = cell.names, 250 | FUN = ExtractField, 251 | field = 1, 252 | delim = "-" 253 | ))) 254 | } 255 | if (is.null(x = names(x = data.dir))) { 256 | if (length(x = data.dir) < 2) { 257 | colnames(x = data) <- cell.names 258 | } else { 259 | colnames(x = data) <- paste0(i, "_", cell.names) 260 | } 261 | } else { 262 | colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names) 263 | } 264 | feature.names <- read.delim( 265 | file = ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc), 266 | header = FALSE, 267 | stringsAsFactors = FALSE 268 | ) 269 | if (any(is.na(x = feature.names[, gene.column]))) { 270 | warning( 271 | 'Some features names are NA. Replacing NA names with ID from the opposite column requested', 272 | call. = FALSE, 273 | immediate. = TRUE 274 | ) 275 | na.features <- which(x = is.na(x = feature.names[, gene.column])) 276 | replacement.column <- ifelse(test = gene.column == 2, yes = 1, no = 2) 277 | feature.names[na.features, gene.column] <- feature.names[na.features, replacement.column] 278 | } 279 | if (unique.features) { 280 | fcols = ncol(x = feature.names) 281 | if (fcols < gene.column) { 282 | stop(paste0("gene.column was set to ", gene.column, 283 | " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.", 284 | " Try setting the gene.column argument to a value <= to ", fcols, ".")) 285 | } 286 | rownames(x = data) <- make.unique(names = feature.names[, gene.column]) 287 | } 288 | # In cell ranger 3.0, a third column specifying the type of data was added 289 | # and we will return each type of data as a separate matrix 290 | if (ncol(x = feature.names) > 2) { 291 | data_types <- factor(x = feature.names$V3) 292 | lvls <- levels(x = data_types) 293 | if (length(x = lvls) > 1 && length(x = full.data) == 0) { 294 | message("10X data contains more than one type and is being returned as a list containing matrices of each type.") 295 | } 296 | expr_name <- "Gene Expression" 297 | if (expr_name %in% lvls) { # Return Gene Expression first 298 | lvls <- c(expr_name, lvls[-which(x = lvls == expr_name)]) 299 | } 300 | data <- lapply( 301 | X = lvls, 302 | FUN = function(l) { 303 | return(data[data_types == l, , drop = FALSE]) 304 | } 305 | ) 306 | names(x = data) <- lvls 307 | } else{ 308 | data <- list(data) 309 | } 310 | full.data[[length(x = full.data) + 1]] <- data 311 | } 312 | # Combine all the data from different directories into one big matrix, note this 313 | # assumes that all data directories essentially have the same features files 314 | list_of_data <- list() 315 | for (j in 1:length(x = full.data[[1]])) { 316 | list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, FUN = `[[`, j)) 317 | # Fix for Issue #913 318 | list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix") 319 | } 320 | names(x = list_of_data) <- names(x = full.data[[1]]) 321 | # If multiple features, will return a list, otherwise 322 | # a matrix. 323 | if (length(x = list_of_data) == 1) { 324 | return(list_of_data[[1]]) 325 | } else { 326 | return(list_of_data) 327 | } 328 | } 329 | 330 | detectCores <- function() { 331 | .Call("detectCoresCpp") 332 | } 333 | 334 | armaColSum <- function(M,nt=0,verbose=FALSE) { 335 | res = NULL 336 | c = class(M) 337 | if (nt==0) {nt = ifelse(detectCores()>1,detectCores()-1,1)} 338 | if(c[1]=="matrix") { 339 | res = armaColSumFull(M,nt,verbose) 340 | } else { 341 | if (c[1]=="lgCMatrix" || c[1]=="lgTMatrix") { 342 | res = Matrix::colSums(M) 343 | } else { 344 | if(c[1]!="dgCMatrix") {M = as(M,"CsparseMatrix")} 345 | res = armaColSumSparse(M,nt,verbose) 346 | } 347 | } 348 | res = as.numeric(res) 349 | names(res) = colnames(M) 350 | return(res) 351 | } 352 | 353 | armaRowSum <- function(M,nt=0,verbose=FALSE) { 354 | return(armaColSum(t(M),nt,verbose)) 355 | } 356 | 357 | #' @param M A sparse matrix from the Matrix package. 358 | #' @param dir The directori in which to write the files. 359 | Write10X = function(M,dir) { 360 | if (!dir.exists(paths = dir)) { 361 | dir.create(path = dir,showWarnings = F,recursive = T) 362 | } 363 | writeMMgz(x = M,file = paste0(dir,"/matrix.mtx.gz")) 364 | write.table(x = rownames(M),file = gzfile(paste0(dir,"/features.tsv.gz")),col.names = F,row.names = F) 365 | write.table(x = colnames(M),file = gzfile(paste0(dir,"/barcodes.tsv.gz")),col.names = F,row.names = F) 366 | } 367 | 368 | #' @param x A sparse matrix from the Matrix package. 369 | #' @param file A filename that ends in ".gz". 370 | writeMMgz <- function(x, file) { 371 | mtype <- "real" 372 | if (is(x, "ngCMatrix")) { 373 | mtype <- "integer" 374 | } 375 | writeLines( 376 | c( 377 | sprintf("%%%%MatrixMarket matrix coordinate %s general", mtype), 378 | sprintf("%s %s %s", x@Dim[1], x@Dim[2], length(x@x)) 379 | ), 380 | gzfile(file) 381 | ) 382 | data.table::fwrite( 383 | x = summary(x), 384 | file = file, 385 | append = TRUE, 386 | sep = " ", 387 | row.names = FALSE, 388 | col.names = FALSE 389 | ) 390 | } 391 | -------------------------------------------------------------------------------- /R/visualization.R: -------------------------------------------------------------------------------- 1 | #' Plot cells in the ebedded space 2 | #' 3 | #' Plot cells in the bidimensional space and color it according to a specific parameter. 4 | #' 5 | #' @param data list; GFICF object 6 | #' @param colorBy characters; Color cells according to a column contained in data$embedded data frame. Default is NULL. 7 | #' @param pointSize integer; Size of the points in the plot. Default is 0.5. 8 | #' @param pointShape integer; Shape of the points in the plot. Default is 46. 9 | #' @return The updated gficf object. 10 | #' @export 11 | #' @import ggrepel 12 | #' @import ggplot2 13 | #' 14 | #' @export 15 | plotCells = function(data,colorBy=NULL,pointSize=.5,pointShape=46) 16 | { 17 | if (is.null(colorBy)) {return(ggplot(data = data$embedded) + geom_point(aes(x=X,y=Y),size=pointSize,color="blue") + theme_bw())} 18 | 19 | if (!colorBy%in%colnames(data$embedded)) {stop("colorBy parameter not found")} 20 | df = data$embedded 21 | u = unique(df[,colorBy]) 22 | c = NULL 23 | for (i in u) 24 | { 25 | d = as.matrix(dist(df[df[,colorBy]%in%i,c(1,2)])) 26 | d = apply(d, 1, sum) 27 | ix = which.min(d) 28 | c = rbind(c,data.frame(cluster=i,xx=df[names(d[ix]),"X"],yy=df[names(d[ix]),"Y"],stringsAsFactors = F)) 29 | } 30 | 31 | tmp = df[,colorBy] 32 | 33 | ggplot(data = data$embedded) + geom_point(aes(x=X,y=Y,color=tmp),size=pointSize,shape=pointShape) + theme_bw() + geom_text_repel(data = c,aes(x=xx,y=yy,label=cluster),min.segment.length = 0) + geom_point(data = c,aes(x=xx,y=yy),size=2) + theme(legend.position = "none") 34 | } 35 | 36 | #' Plot gene expression across cells 37 | #' 38 | #' Plot the expression of a group of genes across cells. 39 | #' 40 | #' @param data list; GFICF object 41 | #' @param genes characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix. 42 | #' @param log2Expr boolean; Relative expression of a gene is computed on rescaled in log2 expression (default TRUE). 43 | #' @param x Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL. 44 | #' @param rescale boolean; Rescale expression between 0 and 1. Default is false. 45 | #' @return A list of plots. 46 | #' @import Matrix 47 | #' @import ggplot2 48 | #' 49 | #' @export 50 | plotGenes = function(data,genes,log2Expr=T,x=NULL,rescale=F) 51 | { 52 | if (is.null(data$embedded)) {stop("Please run reduction in the embedded space first!")} 53 | if (!is.null(x)) {data$rawCounts=x} 54 | if (is.null(data$rawCounts)) {stop("Raw or normalized counts absent.")} 55 | 56 | data$rawCounts = normCounts(data$rawCounts,doc_proportion_max = 2, 57 | doc_proportion_min = 0, 58 | normalizeCounts = !data$param$normalized & is.null(x), 59 | verbose=T) 60 | 61 | genes = genes[genes%in%rownames(data$rawCounts)] 62 | 63 | if (length(genes)==0) {stop("Genes are absent in the Expression matrix")} 64 | 65 | l = vector(mode = "list",length = length(genes)) 66 | names(l) = genes 67 | for (i in genes) 68 | { 69 | if ("predicted" %in% colnames(data$embedded)) 70 | { 71 | df = subset(data$embedded, predicted %in% "NO") 72 | } else { 73 | df = data$embedded 74 | } 75 | 76 | if(log2Expr) 77 | { 78 | df$expr = log2(data$rawCounts[i,rownames(df)]+1) 79 | } else { 80 | df$expr = data$rawCounts[i,rownames(df)] 81 | } 82 | 83 | if(rescale) {df$expr = df$expr/max(df$expr)} 84 | df = df[order(df$expr,decreasing = F),] 85 | l[[i]] = ggplot(data = df,aes(x=X,y=Y,color=expr)) + geom_point(size=.5,shape=19) + theme_bw() + scale_color_gradient2(low = "gray",mid = "#2171b5",high = "#08306b",midpoint = .5) + ggtitle(i) 86 | 87 | } 88 | 89 | return(l) 90 | } 91 | 92 | #' Plot the expression of a gene across group of cells. 93 | #' 94 | #' Plot the expression of a gene across group of cells with violion plot. 95 | #' 96 | #' @param data list; GFICF object 97 | #' @param gene characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix. 98 | #' @param ncol numeric; Number of columns of the final plot (defaul is 3). 99 | #' @param x Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL. 100 | #' @return A list of plots. 101 | #' 102 | #' @import fastmatch 103 | #' @import Matrix 104 | #' @import ggplot2 105 | #' @importFrom reshape2 melt 106 | #' 107 | #' @export 108 | plotGeneViolin = function(data,gene,ncol=3,x=NULL) 109 | { 110 | if (is.null(data$community)) {stop("Please run clustcells first!")} 111 | if (!is.null(x)) {data$rawCounts=x} 112 | if (is.null(data$rawCounts)) {stop("Raw or normalized counts absent.")} 113 | 114 | cpms = normCounts(data$rawCounts,doc_proportion_max = 2, 115 | doc_proportion_min = 0, 116 | normalizeCounts = !data$param$normalized & is.null(x), 117 | verbose=T) 118 | 119 | df = reshape2::melt(as.matrix(cpms[gene,])) 120 | colnames(df) = c("ens","cell.id","value") 121 | 122 | if(!is.null(names(gene))) { 123 | ix = is.na(names(gene)) | names(gene) %in% "" | is.null(names(gene)) 124 | if(sum(ix)>0) {names(gene)[ix] = gene[ix]} 125 | if(length(unique(names(gene))) == length(gene)) { 126 | df$ens = names(gene)[fastmatch::fmatch(df$ens,gene)] 127 | } 128 | } 129 | 130 | df$value = log2(df$value+1) 131 | df$cluster = data$embedded$cluster[match(df$cell.id,rownames(data$embedded))] 132 | df$cluster = factor(as.character(df$cluster),levels = as.character(1:length(unique(df$cluster)))) 133 | p = ggplot2::ggplot(data = df,ggplot2::aes(x=cluster,y=value)) + 134 | ggplot2::geom_violin(scale = "width") + 135 | ggplot2::facet_wrap(~ens,scales = "free_y",ncol=ncol) + 136 | ggplot2::ylab("log2(CPM+1)") + ggplot2::xlab("") + ggplot2::theme_bw() 137 | 138 | return(p) 139 | } 140 | 141 | #' Plot GSEA results 142 | #' 143 | #' Circle plot for gene set enrichement analysis results. 144 | #' 145 | #' @param data list; GFICF object 146 | #' @param fdr number; FDR threshold to select significant pathways to plot. 147 | #' @param clusterRowCol boolean; if TRUE row and col of the plot are clustered. 148 | #' @return plot from ggplot2 package. 149 | #' @import Matrix 150 | #' @import ggplot2 151 | #' @importFrom reshape2 melt 152 | #' 153 | #' @export 154 | plotGSEA = function(data,fdr=.05,clusterRowCol=F) 155 | { 156 | if (is.null(data$gsea)) {stop("Please run runGSEA function first")} 157 | nes = data$gsea$nes 158 | nes[data$gsea$es<=0 | data$gsea$fdr>=fdr] = 0 159 | nes = nes[armaRowSum(nes)>0,] 160 | 161 | if (clusterRowCol) 162 | { 163 | h.c = hclust(dist(t(nes),method = "binary")) 164 | h.p = hclust(dist(nes,method = "binary")) 165 | } 166 | 167 | df = reshape2::melt(as.matrix(nes)) 168 | colnames(df) = c("pathway","cluster","nes") 169 | 170 | if (clusterRowCol) 171 | { 172 | df$cluster = factor(as.character(df$cluster),levels = rev(h.c$labels[h.c$order])) 173 | df$pathway = factor(as.character(df$pathway),levels = rev(h.p$labels[h.p$order])) 174 | } else { 175 | df$cluster = factor(as.character(df$cluster),levels = as.character(1:length(unique(data$embedded$cluster)))) 176 | } 177 | 178 | ggplot(data = df,aes(x=pathway,y=cluster)) + geom_point(aes(size=nes)) + scale_size_continuous(range = c(0,7)) + theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + xlab("") + ylab("Cluster name") 179 | } 180 | 181 | #' Plot GSEA results 182 | #' 183 | #' Plot GSEA values on top of UMAP/TSNE coordinates. 184 | #' 185 | #' @param data list; GFICF object 186 | #' @param pathwayName characters; Name of the pathway to plot. 187 | #' @param fdr number; FDR threshold to select significant pathways to plot. 188 | #' @return plot from ggplot2 package. 189 | #' @import Matrix 190 | #' @import ggplot2 191 | #' 192 | #' @export 193 | plotPathway = function(data,pathwayName,fdr=.05) 194 | { 195 | if (is.null(data$gsea)) {stop("Please run runGSEA function first")} 196 | nes = data$gsea$nes 197 | nes[data$gsea$es<=0 | data$gsea$fdr>=fdr] = 0 198 | nes = nes[armaRowSum(nes)>0,] 199 | nes = nes[pathwayName,] 200 | df = data$embedded 201 | df$NES = nes[match(df$cluster,names(nes))] 202 | ggplot(data = df,aes(x=X,y=Y)) + geom_point(aes(color=NES),shape=20) + theme_bw() + scale_color_gradient(low = "gray",high = "red") 203 | } 204 | 205 | #' Plot GSEA results 206 | #' 207 | #' Circle plot for gene set enrichement analysis results. 208 | #' 209 | #' @param data list; GFICF object 210 | #' @param fdr number; FDR threshold to select significant pathways to plot. 211 | #' @param clusterRowCol boolean; if TRUE row and col of the plot are clustered. 212 | #' @param logFCth number; LogFC threshold to select pathways to plot. 213 | #' @return plot from ggplot2 package. 214 | #' @import Matrix 215 | #' @import ggplot2 216 | #' @importFrom reshape2 melt acast 217 | #' 218 | #' @export 219 | plotGSVA = function(data,fdr=.05,clusterRowCol=T,logFCth=0) 220 | { 221 | if (is.null(data$gsva)) {stop("Please run runGSEA function first")} 222 | M = reshape2::acast(subset(data$gsva$DEpathways,adj.P.VallogFCth),pathway~cluster,fill = 0,value.var = "logFC") 223 | 224 | df = reshape2::melt(M) 225 | if(clusterRowCol) 226 | { 227 | h.col = hclust(dist(t(M)),method = "ward.D2") 228 | h.row = hclust(dist(M),method = "ward.D2") 229 | df$Var1 = factor(as.character(df$Var1),levels = h.row$labels[h.row$order]) 230 | df$Var2 = factor(as.character(df$Var2),levels = h.col$labels[h.col$order]) 231 | } 232 | 233 | ggplot(data = df,aes(x=Var1,y=Var2,fill=value)) + geom_tile() + scale_fill_gradient2(low = "blue",high = "red") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + xlab("") + ylab("") 234 | 235 | } 236 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(libname, pkgname) { 2 | require(Matrix) 3 | packageStartupMessage(packageVersion("gficf")) 4 | invisible() 5 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gficf v2 - Single-cell gene set enrichment analysis 2 | 3 | Details on ***scGSEA and scMAP*** implemented into the version 2 of GFICF package can be found in the NAR genomics and bioinformatics manuscript [Franchini et al. 2023](https://doi.org/10.1093/nargab/lqad024) 4 | 5 | GFICF v1 manuscript can be found here at [Gambardella et al. 2019](https://www.frontiersin.org/articles/10.3389/fgene.2019.00734/abstract). 6 | 7 | The package also includes [Phenograph](https://biorxiv.org/cgi/content/short/2022.10.24.513476v1) 8 | [Louvain method](https://sites.google.com/site/findcommunities/) 9 | clustering using [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy) library 10 | from [uwot](https://github.com/jlmelville/uwot) and a naive but fast parallel implementation 11 | of Jaccard Coefficient estimation using [RcppParallel](https://cran.r-project.org/package=RcppParallel). 12 | The package also include data reduction with either Principal Component Analisys (PCA) or 13 | non-negative matrix factorization [RcppML](https://github.com/zdebruine/RcppML) before to apply t-SNE or UMAP for single cell data visualization. 14 | 15 | **Examples & Functionality**: 16 | * [Install GFICF package](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/installation.html) 17 | * [Getting Started](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/index.html) 18 | * [Single-cell Gene Set Enrichement Analysis (scGSEA)](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/scGSEA.html) 19 | * [Single-cell Mapper (scMAP)](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/scMAP.html) 20 | * Batch Effect Correction (Cooming soon) 21 | -------------------------------------------------------------------------------- /data-raw/small_atlas.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | 3 | M.raw = readRDS(file = "~/work/current/BRCA_AIRC_paper/paper_git/RData/RAW.filtered.BRCA.UMI.counts.5K.umi.rds") 4 | 5 | set.seed(0) 6 | sample = sapply(strsplit(x = colnames(M.raw),split = "_",fixed = T), function(x) x[1]) 7 | u = unique(sample) 8 | cells=NULL 9 | for (i in 1:length(u)) { 10 | if (u[i]=="HDQP1") { 11 | cells = c(cells,sample(x=colnames(M.raw)[sample%in%u[i]],size=110)) 12 | } else { 13 | cells = c(cells,sample(x=colnames(M.raw)[sample%in%u[i]],size=150)) 14 | } 15 | } 16 | 17 | small_BC_atlas = M.raw[,cells] 18 | small_BC_atlas = small_BC_atlas[rowSums(small_BC_atlas)>0,] 19 | usethis::use_data(small_BC_atlas,overwrite = T) 20 | -------------------------------------------------------------------------------- /data-raw/test_BC_atlas.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | 3 | M.raw = readRDS(file = "~/work/current/BRCA_AIRC_paper/paper_git/RData/RAW.filtered.BRCA.UMI.counts.5K.umi.rds") 4 | load("~/work/package/gficf/data/small_BC_atlas.rda") 5 | M.raw = M.raw[,!colnames(M.raw)%in%colnames(small_BC_atlas)] 6 | 7 | set.seed(0) 8 | sample = sapply(strsplit(x = colnames(M.raw),split = "_",fixed = T), function(x) x[1]) 9 | u = unique(sample) 10 | cells=NULL 11 | for (i in 1:length(u)) { 12 | cells = c(cells,sample(x=colnames(M.raw)[sample%in%u[i]],size=30)) 13 | } 14 | 15 | test_BC_atlas = M.raw[,cells] 16 | test_BC_atlas = test_BC_atlas[rowSums(test_BC_atlas)>0,] 17 | usethis::use_data(test_BC_atlas,overwrite = T) 18 | -------------------------------------------------------------------------------- /data/small_BC_atlas.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/data/small_BC_atlas.rda -------------------------------------------------------------------------------- /data/test_BC_atlas.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/data/test_BC_atlas.rda -------------------------------------------------------------------------------- /img/Cd34_expression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/Cd34_expression.png -------------------------------------------------------------------------------- /img/Cd8a_expression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/Cd8a_expression.png -------------------------------------------------------------------------------- /img/tabula_annotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/tabula_annotated.png -------------------------------------------------------------------------------- /img/tabula_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/tabula_clusters.png -------------------------------------------------------------------------------- /inst/doc/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "GFICF Getting Started" 3 | author: 4 | - name: Gennaro Gambardella 5 | affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 6 | package: gficf 7 | output: 8 | BiocStyle::html_document 9 | vignette: | 10 | %\VignetteIndexEntry{GFICF Getting Started} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | 16 | 17 | 24 | 25 | # Getting started with `gficf` package 26 | 27 | Welcome to GF-ICF! `gficf` is an R package for normalization, visualization and analysis of of single-cell RNA sequencing data, based on a data transformation model called term frequency–inverse document frequency [(TF-IDF)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), which has been extensively used in the field of text mining. This vignette gives an overview and introduction to `gficf`’s functionality. 28 | 29 | For this tutorial, we will be analyzing the small_BC_atlas dataset included in the `gficf` package. This dataset is a small version (comprising only 4,760 cells) of the Breast cancer cell-line atlas we recently published ([Gambardella et al.](https://www.nature.com/articles/s41467-022-29358-6)). 30 | 31 | # QC and selecting cells for further analysis 32 | `gficf` allows you to automatically filter cells based on several criteria. These include 33 | 34 | 1. The total number of UMI detected within a cell (this correlates strongly with unique genes) 35 | 2. The number of unique genes detected in each cell. 36 | a. This because low-quality cells or empty droplets will often have very few genes 37 | b. While cell doublets or multiplets may exhibit an aberrant high gene count 38 | 3. The percentage of reads that map to the mitochondrial genome 39 | a. This because low-quality / dying cells often exhibit extensive mitochondrial contamination 40 | 41 | Both point 2 and 3 are done automatically by the tool filtering out cells with low gene ratio detection and high MT ratio. This is accomplished using loess regression to fit the relationships between the total number of UMI in a cell (in log scale) and the ratio of detected genes over total UMI (or total MT counts over total UMI). 42 | 43 | **IMPORTANT:** For now only datasets for which genes are reported as ENSEMBLE id are supported! 44 | 45 | ```{r qc,echo=TRUE,cache=TRUE,warning=FALSE,prompt=T,results='hold'} 46 | require(gficf) 47 | require(ggplot2) 48 | 49 | # Load the small BC atlas 50 | data("small_BC_atlas") 51 | 52 | # Filter out cells 53 | data = filterCells(counts = small_BC_atlas, 54 | organism = "human", 55 | plot = F, 56 | verbose = T, 57 | minUMI = 5000) 58 | ``` 59 | 60 | # Data Normalization and Visualization 61 | After cell QC, we can start to normalize raw UMI counts and filtering out lowly and rarely expressed genes. In particular here we discard genes expressed in less then 15 cells or in less then 5% of total cells but having an average expression in non-zero cells less then 1.12 UMI. We next perform PCA on the normalized data but using only over-dispersed (i.e.,highly variable) genes. Finally t-UMAP non-linear dimensional reduction is used to visualize the dataset (see Figure \@ref(fig:norm)). 62 | 63 | ```{r norm, fig.cap="UMAP Plot. UMAP plot of the small_BC_atlas dataset after cells have been normalized with GF-ICF model.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=T,results='hide'} 64 | # Data normalization and gene filtering 65 | data <- gficf( QCdata = data, 66 | cell_count_cutoff = 15, 67 | nonz_mean_cutoff = 1.12, 68 | cell_percentage_cutoff2 = 0.05, 69 | normalize = T, 70 | verbose = T) 71 | 72 | # Create PCA-subspace using overdispersed genes 73 | data <- runPCA(data = data,dim = 10,use.odgenes = T) 74 | 75 | # Create t-UMAP space 76 | data <-runReduction(data = data,reduction = "tumap",nt = 2,verbose = T) 77 | 78 | # Plot cells 79 | p = plotCells(data = data,pointShape = 19) + 80 | xlab("UMAP 1") + 81 | ylab("UMAP 2") 82 | 83 | plot(p) 84 | ``` 85 | 86 | 87 | # Cell Clustering 88 | In the package `gficf` the function `clustcells` implement the [Phenograph](https://www.cell.com/cell/fulltext/S0092-8674(15)00637-6) algorithm, 89 | which is a clustering method designed for high-dimensional single-cell data analysis. It works by creating a graph ("network") representing phenotypic similarities between cells by calculating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities using the well known [Louvain method](https://sites.google.com/site/findcommunities/) or [Leiden algorithm](https://www.nature.com/articles/s41598-019-41695-z) in this graph. 90 | 91 | In this particular implementation of Phenograph we use approximate nearest neighbors found using [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy) 92 | libraries present in the `uwot` package. The supported distance metrics for KNN that can be set trough the `dist.method` parameter are: 93 | 94 | * Euclidean (default) 95 | * Cosine 96 | * Manhattan 97 | * Hamming 98 | 99 | Please note that the Hamming support is a lot slower than the other metrics. It is not recomadded to use it if you have more than a few hundred features, and even then expect it to take several minutes during the index building phase in situations where the Euclidean metric would take only a few seconds. 100 | 101 | After computation of Jaccard distances among cells (with custom [RcppParallel](https://cran.r-project.org/package=RcppParallel) implementation), the Louvain or Leiden community detection algorithms can be run to identify cell clusters. The supported communities detection algorithm that can be set trough the `community.algo` parameter are: 102 | 103 | * Louvain classic (igraph implementation) 104 | * Louvian with modularity optimization 105 | * Louvain algorithm with multilevel refinement (default) 106 | * Leiden algorithm from [Traag et al. 2019](https://www.nature.com/articles/s41598-019-41695-z) (need to be installed first via `sudo -H pip install leidenalg igraph`) 107 | * Walktrap 108 | * Fastgreedy 109 | 110 | ```{r clustering, fig.cap="Cell clusters. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to the cluster they belong.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=T,results='hold'} 111 | 112 | # Identify clusters 113 | data <- clustcells(data = data, 114 | community.algo = "louvain 3", 115 | nt = 2, 116 | resolution = .25, 117 | verbose = T) 118 | 119 | # Plot cells color coded acording to their cluster 120 | p = plotCells(data = data,colorBy = "cluster",pointShape = 19) + 121 | xlab("UMAP 1") + 122 | ylab("UMAP 2") 123 | plot(p) 124 | ``` 125 | 126 | 127 | 128 | # Adding and plot cell metadata 129 | Cell metadata are stored into the data.frame named `embedded` of the `gficf` object (i.e., it can be accessed via `data$embedded`). Any column can be added and later plotted on top of the UMAP/t-SNE plot (Figure \@ref(fig:ccl)) thanks to the function `plotCells` and specifying in parameter `colorBy` the name of the column with which is intended to color-code cells. This data frame contains by default the UMAP/t-SNE coordinates of each cell. The row names of this data.frame correspond instead to the cell barcode. 130 | 131 | ```{r ccl, fig.cap="UMAP plot. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to their cell-line of origin.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=T,results='hide'} 132 | 133 | # Cell meta-data can stored in the data$embedded data.frame 134 | # Let' add the info about the cell-line, stripping this information 135 | # from the name of the cell and storing it into ccl column. 136 | data$embedded$ccl = sapply( 137 | strsplit(x = rownames(data$embedded), 138 | split = "_",fixed = T) 139 | ,function(x) x[1] 140 | ) 141 | 142 | # We can now plot cell according to their cell-line of origin 143 | p = plotCells(data = data,colorBy = "ccl",pointShape = 19) + 144 | xlab("UMAP 1") + 145 | ylab("UMAP 2") 146 | 147 | plot(p) 148 | ``` 149 | 150 | ``` 151 | # show top 20 cells 152 | > head(data$embedded,20) 153 | ``` 154 | 155 | ```{r metadata,echo=FALSE,cache=TRUE} 156 | library(DT) 157 | df = head(data$embedded,20) 158 | df$X = round(df$X,2) 159 | df$Y = round(df$Y,2) 160 | DT::datatable(df) 161 | ``` 162 | 163 | # Save and load `gficf` object. 164 | `gficf` object need to be saved/loaded with its specific functions. This because annoy index used by `uwot` package for UMAP is stored in memory and cannot be saved with standard `saveRDS()` or `save()` functions. 165 | 166 | ``` 167 | # save GFICF object 168 | > saveGFICF(data,file = "/path/where/to/save/object.gficf") 169 | 170 | # load GFICF object 171 | > data = loadGFICF(file = "/path/where/is/object.gficf") 172 | ``` 173 | 174 | # Tips and suggestions. 175 | ## How can I read 10X files produced by Cell Ranger? 176 | Cell Ranger produce 3 output files (i.e., barcodes, features and matrix) usually stored in the same folder. `gficf` includes the `Read10X()` function that reads the output file prduced by cellranger pipeline and return as output the raw UMI count matrix. The values in this matrix represent the number of molecules for each gene (row) that are detected in each cell (column). 177 | ``` 178 | # Load 10x dataset 179 | > M = Read10X(data.dir = "/path/where/cellranger/files/are/") 180 | ``` 181 | 182 | ## Where are my normalized counts? 183 | `gficf` normalized data are stored in the gfifc matrix that can be accessd via `data$gficf` 184 | 185 | ``` 186 | # show 10 rows of gficf normalized data 187 | > head(data$gficf) 188 | ``` 189 | 190 | ## How can I run `gficf` without performing cell QC? 191 | In case you want to run gficf directley from raw UMI matrix this can be done using the parameter `M` of `gficf function` as showed below. 192 | ``` 193 | # Data normalization and gene filtering 194 | # from RAW counts 195 | > data <- gficf( M = small_BC_atlas, 196 | cell_count_cutoff = 15, 197 | nonz_mean_cutoff = 1.12, 198 | cell_percentage_cutoff2 = 0.05, 199 | normalize = T, 200 | verbose = T) 201 | ``` 202 | 203 | ## Where are QC metrics stored in `gficf`? 204 | The number of unique genes and total molecules calculated by `filterCells` function are in the `QC.metadata` data.frame. It can be accessed as below: 205 | 206 | ``` 207 | # Top ten row of QC metrics data.frame 208 | > head(data$QC.metadata) 209 | ``` 210 | 211 | ## Are there alternative to PCA? 212 | `gficf` package inlude non-negative matrix factorization (NMF) as alternative to PCA before to apply t-SNE or UMAP reduction. NMF can be executed with the following command. 213 | ``` 214 | # Run NMF with all available cores minus one 215 | > data = runNMF(data = data,dim = 50,use.odgenes = T) 216 | ``` 217 | 218 | # Session info {.unnumbered} 219 | 220 | ```{r sessionInfo, echo=FALSE} 221 | sessionInfo() 222 | ``` 223 | -------------------------------------------------------------------------------- /inst/doc/index_files/figure-html/ccl-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/index_files/figure-html/ccl-1.png -------------------------------------------------------------------------------- /inst/doc/index_files/figure-html/clustering-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/index_files/figure-html/clustering-1.png -------------------------------------------------------------------------------- /inst/doc/index_files/figure-html/norm-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/index_files/figure-html/norm-1.png -------------------------------------------------------------------------------- /inst/doc/installation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "GFICF Installation" 3 | author: 4 | - name: Gennaro Gambardella 5 | affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 6 | package: gficf 7 | output: 8 | BiocStyle::html_document 9 | vignette: | 10 | %\VignetteIndexEntry{GFICF Installation} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | 16 | 17 | 24 | 25 | # Install Dependancies (Officially supported only Linux) 26 | 27 | `gficf` makes use of `Rcpp`, `RcppParallel` and `RcppGSL`. So you have to carry out 28 | a few extra steps before being able to build this package. The steps are reported below for each platform. 29 | 30 | 31 | ## Ubuntu/Debian 32 | 33 | You need gsl dev library to successfully install RcppGSL library. 34 | On Ubuntu/Debian systems this can be accomplished by running from terminal the command 35 | 36 | ```bash 37 | sudo apt-get install libgsl-dev libcurl4-openssl-dev libssl-dev libxml2-dev 38 | ``` 39 | 40 | ## Mac OS X (Not Officially Supported) 41 | 42 | 1.2.1 Open terminal and run `xcode-select --install` to install the command line developer tools. 43 | 44 | 1.2.1. We than need to install gsl libraries. This can be done via [Homebrew](https://brew.sh/). So, still from terminal 45 | ```bash 46 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 47 | ``` 48 | and than use `homebrew` to install gsl with following command 49 | ```bash 50 | brew install gsl 51 | ``` 52 | 53 | 54 | ## Windows 55 | 56 | 1.3.1 Skip this first step if you are using RStudio because it will ask you automatically. Otherwise install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) and ensure `path\to\Rtools\bin` is on your path. 57 | 58 | 1.3.2 [Download gsl library for Windows](https://sourceforge.net/projects/gnu-scientific-library-windows/) from sourceforge and exctract it in `C:\` or where you want. 59 | 60 | 1.3.3 Open R/Rstudio and before to istall the package from github exec the following command in the R terminal. 61 | ```R 62 | # Change the path if you installed gsl librarie not in the default path. 63 | # Be sure to use the format '"path/to/gsl-xxx_mingw-xxx/gsl-xxx-static"' 64 | # In this way " characters will be mainteined and spaces 65 | # in the path preserved if there are. 66 | 67 | # For example for gsl-2.2.1 compiled with mingw-6.2.0: 68 | Sys.setenv(GSL_LIBS = '"C:/gsl-2.2.1_mingw-6.2.0/gsl-2.2.1-static"') 69 | ``` 70 | 71 | 72 | # Install GF-ICF package 73 | 74 | Exec in R terminal the following commands 75 | ```R 76 | # Install required bioconductor packages 77 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 78 | install.packages("BiocManager") 79 | } 80 | 81 | BiocManager::install(setdiff(c("sva","edgeR", "fgsea"),rownames(installed.packages())),update = F) 82 | 83 | # We rquire RcppML package from github (not the cran version) 84 | if("RcppML" %in% rownames(installed.packages())) {remove.packages("RcppML")} 85 | devtools::install_github("zdebruine/RcppML") 86 | 87 | # Install gficf from github 88 | if(!require(devtools)){ install.packages("devtools")} 89 | devtools::install_github("gambalab/gficf") 90 | ``` 91 | -------------------------------------------------------------------------------- /inst/doc/scGSEA.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Single-cell Gene Set Enrichement Analysis" 3 | author: 4 | - name: Gennaro Gambardella 5 | affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 6 | package: gficf 7 | output: 8 | BiocStyle::html_document 9 | vignette: | 10 | %\VignetteIndexEntry{Single-cell Gene Set Enrichement Analysis} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | 16 | 17 | 24 | 25 | # Introduction {.unnumbered} 26 | 27 | single-cell Gene Set Enrichment Analysis (scGSEA) is a bioinformatic method that could measure the activity of an a priori defined collection of gene sets (i.e., pathways) at the single cell resolution. It that takes advantage of the informative biological signals spreading across the latent factors of gene expression values obtained from non-negative matrix factorization. The scGSEA method starts from a set of single-cell expression profiles and a collection of gene sets and scores their cumulative expression (i.e., pathway activity) in each of the profiled cells (see manucript for details). 28 | 29 | # Data Normalization and Visualization 30 | For this tutorial, we will be using as a reference atlas the `small_BC_atlas dataset` included in the `gficf` package (Figure \@ref(fig:atlas)). This dataset is a small version (comprising only 4,760 cells) of the Breast cancer cell-line atlas we recently published ([Gambardella et al.](https://www.nature.com/articles/s41467-022-29358-6)). 31 | 32 | ```{R atlas, fig.cap="UMAP Plot. UMAP plot of the small_BC_atlas dataset after cells have been normalized with GF-ICF model. Cell are color coded according to thei cell line of origin.",echo=TRUE,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE,results='hide'} 33 | require(gficf) 34 | require(ggplot2) 35 | 36 | # Load the RAW UMI count matrix of small bc atlas 37 | data("small_BC_atlas") 38 | 39 | # Data normalization and gene filtering 40 | data <- gficf( M = small_BC_atlas) 41 | 42 | # Create PCA-subspace using overdispersed genes 43 | data <- runPCA(data = data,dim = 10,use.odgenes = T) 44 | 45 | # Create t-UMAP space 46 | data <-runReduction(data = data,reduction = "umap",nt = 2,verbose = T,n_neighbors=150) 47 | 48 | # Cell meta-data can stored in the data$embedded data.frame 49 | # Let' add the info about the cell-line, stripping this information 50 | # from the name of the cell and storing it into ccl column. 51 | data$embedded$ccl = sapply( 52 | strsplit(x = rownames(data$embedded), 53 | split = "_",fixed = T) 54 | ,function(x) x[1] 55 | ) 56 | 57 | 58 | # Plot cells 59 | p = plotCells(data = data,pointShape = 19,colorBy = "ccl") + 60 | xlab("UMAP 1") + 61 | ylab("UMAP 2") 62 | 63 | plot(p) 64 | ``` 65 | 66 | # Performing scGSEA 67 | Single cell gene set enrichment analysis is performed by the function `runScGSEA()` of `gficf` package. All available gene sets from [msigdb database](http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp). The list of gene setes to use can can be specified trough the `category` parameter. Here we access to msigdb gene set collection through [`msigdbr`](https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html) package (see it for further details). 68 | 69 | ```{r collecion,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE} 70 | require(msigdbr) 71 | 72 | # Show supported gene sets colections 73 | print(msigdbr::msigdbr_collections(),n=30) 74 | ``` 75 | ```{r scgsea, cache=TRUE, warning=FALSE,message=FALSE,prompt=FALSE,results='hide'} 76 | # Run scGSEA using 50 hallmarks genes 77 | data = runScGSEA(data = data, 78 | geneID = "ensamble", 79 | species = "human", 80 | category = "H", 81 | nmf.k = 100, 82 | fdr.th = .1, 83 | rescale = "none", 84 | verbose = T) 85 | ``` 86 | 87 | # Cluster cells by pathway's activity levels 88 | Now that we have reconstructed pathway's activity at single cell level we can try to cluster cell according to these values. To transform cells into a graph `gficf` package uses UMAP (i.e., the fuzzy graph produced by umap) or the phenograph algorithm. More information about how UMAP constructs a high dimensional graph representation of the data can be found [HERE](https://pair-code.github.io/understanding-umap/). Phenograph is instead a clustering method designed for high-dimensional single-cell data analysis. It works by first creating a graph ("network") that represents phenotypic similarities among cells by calculating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities using the [Louvain method](https://sites.google.com/site/findcommunities/) in the reconstructed graph. Results are stored into the column `cluster.by.scGSEA` of the cell metadata data.frame `data$embedded`. 89 | 90 | ```{r clust, fig.cap="",echo=TRUE,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE,results='hide'} 91 | # Cluster cells with phenograph method but using 92 | # estimate cells pathway's activity levels 93 | data = clustcellsBYscGSEA(data, 94 | method = "fgraph", 95 | pca = 10, 96 | k = 10, 97 | resolution = .05, 98 | n.start = 10, 99 | n.iter = 50, 100 | verbose = T) 101 | 102 | # Plot clusters on top of UMAP representation 103 | p = plotCells(data = data,pointShape = 19,colorBy = "cluster.by.scGSEA") + 104 | xlab("UMAP 1") + 105 | ylab("UMAP 2") 106 | 107 | plot(p) 108 | ``` 109 | 110 | # Tips and Suggestions 111 | ## Where are my inferred pathway activity levels? 112 | Reconstructed pathway's activity levels in each cell are stored in the matrix `data$scgsea$x`. In this matrix rows are cells while columns are pathways. 113 | 114 | ```{r head,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE} 115 | head(data$scgsea$x[,1:2]) 116 | ``` 117 | 118 | ## How many factors of NMF I have to use for scGSEA? 119 | scGSEA is a tool that leverages NMF expression latent factors to infer pathway activity at a single cell level. Thus, by design, it inherits both benefits and limitations of the NMF model. A well know limit of this model, like other matrix decomposition techniques, is the choice of the exact number of factors to use. We generally recommend using at least 100 NMF factors. 120 | 121 | ## How can I use a custom gene set list for scGSEA? 122 | If you want to use a custom list of gene sets you can pass it trough the parameter `pathway.list` of `runScGSEA()` function. Name of each element of this list **must be unique** and must represents the name of the pathway. 123 | 124 | ## Where is the cell graph produced by phenograph or UMAP? 125 | The cell graph produced by UMAP (or phenograph) algorithm is stored into the `data$scgsea$cell.graph` igraph object, while the identified cell communities into the column `cluster.by.scGSEA` of the cell metadata data.frame `data$embedded`. That graph can be then plotted and manipulated with any network package supporting igraph objects. Here, for example, we use [`netbiov`](https://www.bioconductor.org/packages/release/bioc/html/netbiov.html) package to visualize this network. Remember that in this weighted network the **edge weights represents similarities among cells** in terms of probability (for UMAP) or jaccard coefficients for phenograph (i.e., they are not distances). Thus, remember to transform these similarities into distances before to apply the minimum spanning tree (MST) algorithm or any other graph theory method that use distances. 126 | 127 | ```{r stats,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE} 128 | require(igraph) 129 | 130 | # Extract the graph and print info 131 | g = data$scgsea$cell.graph 132 | 133 | # Transform similarities into distances 134 | # you have to use MST 135 | # E(g)$weight <- 1 - E(g)$weight 136 | 137 | # simplify the network 138 | g = igraph::simplify(g,edge.attr.comb = "min") 139 | 140 | # print the network stats 141 | summary(g) 142 | ``` 143 | ```{r graph, fig.cap="Cell Network. The cell network reconstructed by UMAP using the reconstructed pathway's activity levels in each cell.",echo=TRUE,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE,results='hide'} 144 | require(netbiov) 145 | 146 | # Plot the network (bolder edges are MST edges) 147 | hc <- rgb(t(col2rgb(heat.colors(20)))/255,alpha=.2) 148 | cl <- rgb(r=0, b=.7, g=1, alpha=.05) 149 | xx <- plot.modules(g, 150 | color.random=TRUE, 151 | v.size=1, 152 | layout.function=layout.graphopt) 153 | 154 | ``` 155 | 156 | # Session info {.unnumbered} 157 | 158 | ```{r sessionInfo, echo=FALSE} 159 | sessionInfo() 160 | ``` 161 | -------------------------------------------------------------------------------- /inst/doc/scGSEA_files/figure-html/atlas-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scGSEA_files/figure-html/atlas-1.png -------------------------------------------------------------------------------- /inst/doc/scGSEA_files/figure-html/clust-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scGSEA_files/figure-html/clust-1.png -------------------------------------------------------------------------------- /inst/doc/scGSEA_files/figure-html/graph-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scGSEA_files/figure-html/graph-1.png -------------------------------------------------------------------------------- /inst/doc/scMAP.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Single-cell Mapper" 3 | author: 4 | - name: Gennaro Gambardella 5 | affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 6 | package: gficf 7 | output: 8 | BiocStyle::html_document 9 | vignette: | 10 | %\VignetteIndexEntry{Single-cell Mapper} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | 16 | 17 | 24 | 25 | # Introduction {.unnumbered} 26 | 27 | Single-cell Mapper (scMAP) is a transfer learning algorithm that combines text mining data transformation and a k-nearest neighbours’ (KNN) classifier to map a query set of single-cell transcriptional profiles on top of a reference atlas. scMAP consists of three main steps: 28 | 29 | 1. In the first step query cell profiles are normalized with the GF-ICF method but using the ICF weights learned by using the reference atlas; 30 | 31 | 2. In the second step, normalized cell profiles are first projected into the NMF (or PC) sub-space of the reference atlas and then mapped into its UMAP embedding space; 32 | 33 | 3. Finally, the KNN algorithm is used to contextualize mapped cells and annotate them. 34 | 35 | Please read the manuscript for additional details and estimated performances of the method. 36 | 37 | # Building the reference atlas 38 | For this tutorial, we will be using as a reference atlas the `small_BC_atlas dataset` included in the `gficf` package (Figure \@ref(fig:atlas)). This dataset is a small version (comprising only 4,760 cells) of the Breast cancer cell-line atlas we recently published ([Gambardella et al.](https://www.nature.com/articles/s41467-022-29358-6)). 39 | 40 | ```{r atlas, fig.cap="UMAP of referece BC Atlas. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to their cell-line of origin.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=FALSE,results='hide'} 41 | require(gficf) 42 | require(ggplot2) 43 | 44 | # Step 1. Build the reference atlas 45 | # Load the RAW UMI count matrix on which to build the reference atlas 46 | data("small_BC_atlas") 47 | 48 | # 1.1. Normalization and gene filtering 49 | data <- gficf( M = small_BC_atlas, 50 | cell_count_cutoff = 15, 51 | nonz_mean_cutoff = 1.12, 52 | cell_percentage_cutoff2 = 0.05, 53 | normalize = T, 54 | verbose = T) 55 | 56 | # 1.2 Create NMF (or PCA) subspace 57 | # using all genes (usually improves performances) 58 | data <- runNMF(data = data,dim = 50) 59 | 60 | # 1.3 Create t-UMAP space 61 | data <-runReduction(data = data,reduction = "umap",nt = 2,verbose = T) 62 | 63 | # 1.4 Let's add info about the cell-line of origin 64 | # Cell meta-data can stored in the data$embedded data.frame 65 | # Let' add the info about the cell-line, stripping this information 66 | # from the name of the cell and storing it into ccl column. 67 | data$embedded$ccl = sapply( 68 | strsplit(x = rownames(data$embedded), 69 | split = "_",fixed = T) 70 | ,function(x) x[1] 71 | ) 72 | 73 | # Plot cells by cell line of origin 74 | p = plotCells(data = data,colorBy = "ccl",pointShape = 19) + 75 | xlab("UMAP 1") + 76 | ylab("UMAP 2") 77 | 78 | plot(p) 79 | ``` 80 | 81 | # How to map new cells into a reference atlas 82 | 83 | After we have built our reference atlas (Figure \@ref(fig:atlas)), we use the 930 cells available into the `test_BC_atlas` dataset to test our mapping algorithm (Figure \@ref(fig:map)). Mapping of new cells into the embedded space of the reference atlas is performed by the `scMAP()` function. Metadata and coordinate of new mapped cells are stored into `data$embedded.predicted` data.frame. 84 | 85 | ```{r map, fig.cap="UMAP of referece BC Atlas and mapped cells. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to their cell-line of origin. Black points are the 930 mapped cells contained in the test_BC_atlas dataset",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=FALSE,results='hide'} 86 | 87 | # Step 2. Map new cells into reference atlas. 88 | # 2.1 Load the new cells to map 89 | data("test_BC_atlas") 90 | 91 | # 2.2 Cell mapping 92 | data = scMAP(data = data, x = test_BC_atlas,nt = 2,normalize = T,verbose = T) 93 | 94 | # 2.3 Plot mapped cells 95 | p = ggplot() + 96 | geom_point(data = data$embedded,aes(x=X,y=Y,color=ccl), 97 | shape=19,size=.5) + 98 | geom_point(data = data$embedded.predicted,aes(x=X,y=Y), 99 | color="black",shape=19,size=.2) + 100 | theme_bw() + 101 | xlab("UMAP 1") + 102 | ylab("UMAP 2") + 103 | theme(legend.position = "none") 104 | 105 | plot(p) 106 | ``` 107 | 108 | # How to contextualize mapped cells 109 | 110 | Finally we can use KNN to classify and thus contextualize mapped cells into the reference atlas. In this example we try to infer cell-line of origin of each mapped cell for winch. After the mapping we also compute the classification accuracy that is for this example of 93.23%. Specifically, mapping task can be performed with the function `classify.cells()` of `gficf` package. In this function the class of reference cells must be specified by the `classes` parameter. Here we use as class of each cell its cell-line of origin. Classification results are stored into `data$embedded.predicted` data.frame where the column `predicted.class` reports the predicted class and the column `class.prob` its predicted probability. 111 | 112 | The `classify.cells()` function is built on top of [`KernelKnn`](https://github.com/mlampros/KernelKnn) package and thus implement several measures to compute the distances among cells that can be specified trough the `knn_method` parameter. At the same time the parameter `knn_weights_fun` can be used to specify the kernel function to use for cell classification. Default is `knn_weights_fun = NULL` that correspond the to [unweighted KNN algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm). For further details on possible kernel functions to use, plese see this [blog-post](http://mlampros.github.io/2016/07/10/KernelKnn/) of original `KernelKnn` package. 113 | 114 | ```{r classify, echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=FALSE} 115 | 116 | # Step 3. Cell classification with KNN. 117 | data = classify.cells(data = data, 118 | classes = data$embedded$ccl, 119 | k = 11, 120 | knn_method = "manhattan", 121 | knn_weights_fun = NULL) 122 | 123 | # show top ten classified cells 124 | head(data$embedded.predicted) 125 | ``` 126 | 127 | ```{r performance, echo=TRUE,cache=TRUE,warning=FALSE,prompt=FALSE} 128 | 129 | # Strip from cell name the cell-line 130 | # of origin of each mapped cell 131 | data$embedded.predicted$ccl = sapply( 132 | strsplit(x = rownames(data$embedded.predicted), 133 | split = "_",fixed = T) 134 | ,function(x) x[1] 135 | ) 136 | 137 | # Now we can compute the classification accuracy 138 | acc = sum(data$embedded.predicted$predicted.class==data$embedded.predicted$ccl)/nrow(data$embedded.predicted)*100 139 | cat("Classification accuracy is",round(acc,2),"%") 140 | ``` 141 | 142 | 143 | 144 | # Session info {.unnumbered} 145 | 146 | ```{r sessionInfo, echo=FALSE} 147 | sessionInfo() 148 | ``` 149 | -------------------------------------------------------------------------------- /inst/doc/scMAP_files/figure-html/atlas-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scMAP_files/figure-html/atlas-1.png -------------------------------------------------------------------------------- /inst/doc/scMAP_files/figure-html/map-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scMAP_files/figure-html/map-1.png -------------------------------------------------------------------------------- /man/Read10X.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{Read10X} 4 | \alias{Read10X} 5 | \title{Load in data from 10X} 6 | \usage{ 7 | Read10X( 8 | data.dir, 9 | gene.column = 1, 10 | cell.column = 1, 11 | unique.features = TRUE, 12 | strip.suffix = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{data.dir}{Directory containing the matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv 17 | files provided by 10X. A vector or named vector can be given in order to load 18 | several data directories. If a named vector is given, the cell barcode names 19 | will be prefixed with the name.} 20 | 21 | \item{gene.column}{Specify which column of genes.tsv or features.tsv to use for gene names; default is 2} 22 | 23 | \item{cell.column}{Specify which column of barcodes.tsv to use for cell names; default is 1} 24 | 25 | \item{unique.features}{Make feature names unique (default TRUE)} 26 | 27 | \item{strip.suffix}{Remove trailing "-1" if present in all cell barcodes.} 28 | } 29 | \value{ 30 | If features.csv indicates the data has multiple data types, a list 31 | containing a sparse matrix of the data from each type will be returned. 32 | Otherwise a sparse matrix containing the expression data will be returned. 33 | } 34 | \description{ 35 | Enables easy loading of sparse data matrices provided by 10X genomics. 36 | } 37 | \examples{ 38 | \dontrun{ 39 | # For output from CellRanger < 3.0 40 | data_dir <- 'path/to/data/directory' 41 | list.files(data_dir) # Should show barcodes.tsv, genes.tsv, and matrix.mtx 42 | expression_matrix <- Read10X(data.dir = data_dir) 43 | seurat_object = CreateSeuratObject(counts = expression_matrix) 44 | 45 | # For output from CellRanger >= 3.0 with multiple data types 46 | data_dir <- 'path/to/data/directory' 47 | list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz 48 | data <- Read10X(data.dir = data_dir) 49 | seurat_object = CreateSeuratObject(counts = data$`Gene Expression`) 50 | seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`) 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /man/classify.cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cellClassifier.R 3 | \name{classify.cells} 4 | \alias{classify.cells} 5 | \title{Classify New Embedded Cells} 6 | \usage{ 7 | classify.cells( 8 | data, 9 | classes, 10 | k = 7, 11 | seed = 18051982, 12 | knn_method = "euclidean", 13 | knn_weights_fun = NULL, 14 | nt = 0 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{list; GFICF object} 19 | 20 | \item{classes}{chareachters; Classes of already existing cells in the order of they are in colnames(data$gficf).} 21 | 22 | \item{k}{integer; Number of K-nn to use for classification. Odd number less than 30 are preferred.} 23 | 24 | \item{seed}{integer; Initial seed to use.} 25 | 26 | \item{knn_method}{string; a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'.} 27 | 28 | \item{knn_weights_fun}{value; there are various ways of specifying the kernel function. NULL value (default) correspond to unweighted KNN algorithm. See the details section of KernelKnn package for more values.} 29 | 30 | \item{nt}{numeric; Number of thread to use (default is 0, i.e. all possible CPUs - 1)} 31 | } 32 | \value{ 33 | A dataframe containing cell id and predicted classes. 34 | } 35 | \description{ 36 | Classify new embedded cells using GF-ICF transformation and K-nn algorithm. 37 | Existing cells are used as training set. 38 | } 39 | -------------------------------------------------------------------------------- /man/clustcells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clustCells.R 3 | \name{clustcells} 4 | \alias{clustcells} 5 | \title{PhenoGraph clustering} 6 | \usage{ 7 | clustcells( 8 | data, 9 | from.embedded = F, 10 | k = 15, 11 | dist.method = "manhattan", 12 | nt = 2, 13 | community.algo = "louvain 3", 14 | store.graph = T, 15 | seed = 180582, 16 | verbose = TRUE, 17 | resolution = 0.25, 18 | n.start = 50, 19 | n.iter = 250 20 | ) 21 | } 22 | \arguments{ 23 | \item{data}{list; Input data (gficf object)} 24 | 25 | \item{from.embedded}{logical; Use embeddedd (UMAP or tSNA) space for clustering cells. Best results are usually obtained not using the embedded space.} 26 | 27 | \item{k}{integer; number of nearest neighbours (default:15)} 28 | 29 | \item{dist.method}{character; Dist to use for K-nn. Type of distance metric to use to find nearest neighbors. One of: 30 | \itemize{ 31 | \item \code{"euclidean"} (the default) 32 | \item \code{"cosine"} 33 | \item \code{"manhattan"} 34 | \item \code{"hamming"} (very slow) 35 | }} 36 | 37 | \item{nt}{integer; Number of cpus to use for k-nn search} 38 | 39 | \item{community.algo}{characthers; Community algorithm to use for clustering. Supported are: 40 | \itemize{ 41 | \item \code{"louvain"} (the default, the original louvain method) 42 | \item \code{"louvain 2"} (louvain with modularity optimization from Seurat) 43 | \item \code{"louvain 3"} (Louvain algorithm with multilevel refinement from Seurat) 44 | \item \code{"leiden"} (Leiden algorithm see Traag et al. 2019) 45 | \item \code{"walktrap"} 46 | \item \code{"fastgreedy"} 47 | }} 48 | 49 | \item{store.graph}{logical; Store produced phenograph in the gficf object} 50 | 51 | \item{seed}{integer; Seed to use for replication.} 52 | 53 | \item{verbose}{logical; Increase verbosity.} 54 | 55 | \item{resolution}{Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities (used only for leiden and louvain 2 or 3 methods).} 56 | 57 | \item{n.start}{Number of random starts (used only for louvain 2 or 3 methods).} 58 | 59 | \item{n.iter}{Maximal number of iterations per random start (used only for louvain 2 or 3 methods).} 60 | } 61 | \value{ 62 | the updated gficf object 63 | } 64 | \description{ 65 | R implementation of the PhenoGraph algorithm 66 | } 67 | \details{ 68 | A custom R implementation of the PhenoGraph (http://www.cell.com/cell/abstract/S0092-8674(15)00637-6) algorithm, 69 | which is a clustering method designed for high-dimensional single-cell data analysis. It works by creating a graph ("network") representing 70 | phenotypic similarities between cells by calclating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities 71 | using the well known Louvain method (https://sites.google.com/site/findcommunities/) in this graph. 72 | 73 | That version used PCA or LSA reduced meta-cells and multithreading annoy version for K-nn search (from uwot package). 74 | } 75 | -------------------------------------------------------------------------------- /man/clustcellsBYscGSEA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clustCells.R 3 | \name{clustcellsBYscGSEA} 4 | \alias{clustcellsBYscGSEA} 5 | \title{Cell clustering by pathway's activity} 6 | \usage{ 7 | clustcellsBYscGSEA( 8 | data, 9 | method = "fgraph", 10 | pca = NULL, 11 | k = 10, 12 | resolution = 0.25, 13 | n.start = 50, 14 | n.iter = 250, 15 | nt = 0, 16 | verbose = T, 17 | seed = 180582 18 | ) 19 | } 20 | \arguments{ 21 | \item{data}{list; Input data (gficf object)} 22 | 23 | \item{method}{character; Method used to produce the cell network, such as phenograph or fgraph.} 24 | 25 | \item{pca}{numeric; If different from NULL data are reduced using pca component before to apply clustering algorithm.} 26 | 27 | \item{k}{integer; number of nearest neighbours (default:10).} 28 | 29 | \item{resolution}{Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.} 30 | 31 | \item{n.start}{Number of random starts.} 32 | 33 | \item{n.iter}{Maximal number of iterations per random start.} 34 | 35 | \item{nt}{integer; Number of cpus to use for k-nn search. If zero all cpu are used.} 36 | 37 | \item{verbose}{logical; Increase verbosity.} 38 | 39 | \item{seed}{integer; Seed to use for replication.} 40 | } 41 | \value{ 42 | the updated gficf object 43 | } 44 | \description{ 45 | Cell clustering using pathway expression instead of gene expression 46 | } 47 | \details{ 48 | Cells are clustered using the estimated pathway activity levels by scGSEA() function. 49 | Cells are clustered either by using PhenoGraph algorithm or hierarchical clustering. In the case of PhenoGraph method is used 50 | identified clusters with louvain method are stored into data$embedded$cluster.by.scGSEA while the prduced graph into data$scgsea$cell.graph. 51 | In case hierarchical clusteringis used the resulting dendogram is stored into data$scgsea$h.dendo. 52 | } 53 | -------------------------------------------------------------------------------- /man/computePCADim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dimensinalityReduction.R 3 | \name{computePCADim} 4 | \alias{computePCADim} 5 | \title{Number of features to use} 6 | \usage{ 7 | computePCADim(data, seed = 180582, subsampling = F, plot = T) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{seed}{numeric; seed to use.} 13 | 14 | \item{subsampling}{logical; Use only a subset of the data for the imputation of dimensions to use.} 15 | 16 | \item{plot}{logical; Show eblow plot.} 17 | } 18 | \description{ 19 | Compute the number of dimension to use for either PCA or LSA. 20 | } 21 | -------------------------------------------------------------------------------- /man/ensToSymbol.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{ensToSymbol} 4 | \alias{ensToSymbol} 5 | \title{Convert Ensamble IDs to Official Gene Symbols} 6 | \usage{ 7 | ensToSymbol(df, col, organism, verbose = T) 8 | } 9 | \arguments{ 10 | \item{df}{data frame; Data frame containing the IDs to convert.} 11 | 12 | \item{col}{characters; Name of column containing the ensamble ids.} 13 | 14 | \item{organism}{characters; Organism of origin (i.e. human or mouse).} 15 | 16 | \item{verbose}{boolean; Icrease verbosity.} 17 | } 18 | \value{ 19 | The updated data frame with a new column called symb. 20 | } 21 | \description{ 22 | It uses biomart. If more the one gene is associated to the enamble, the first one retrived from 23 | Biomart is used. 24 | } 25 | -------------------------------------------------------------------------------- /man/filterCells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cellQC.R 3 | \name{filterCells} 4 | \alias{filterCells} 5 | \title{Cell QC} 6 | \usage{ 7 | filterCells(counts, organism, plot = F, verbose = T, minUMI = 800) 8 | } 9 | \arguments{ 10 | \item{counts}{Matrix; Raw counts matrix} 11 | 12 | \item{organism}{characters; Organism (supported human and mouse).} 13 | 14 | \item{plot}{boolean; If regression plots must be showed.} 15 | 16 | \item{verbose}{boolean; Increase verbosity.} 17 | 18 | \item{minUMI}{numeric; Minimium number of UMI per cell (default 800).} 19 | } 20 | \value{ 21 | The updated gficf object. 22 | } 23 | \description{ 24 | Filter Cells with low gene ratio detection and high MT ratio. 25 | Loess and GAM regression are used to fit relationships between the number of UMI and either the ratio of detected genes or the MT ratio. 26 | } 27 | -------------------------------------------------------------------------------- /man/findClusterMarkers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deGenes.R 3 | \name{findClusterMarkers} 4 | \alias{findClusterMarkers} 5 | \title{Find Marker Genes from cell clusters.} 6 | \usage{ 7 | findClusterMarkers(data, nt = 2, hvg = T, verbose = T) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{nt}{integer; Number of thread to use (default 2).} 13 | 14 | \item{hvg}{boolean; Use only High Variable Genes (default is TRUE).} 15 | 16 | \item{verbose}{boolean; Icrease verbosity.} 17 | } 18 | \value{ 19 | The updated gficf object. 20 | } 21 | \description{ 22 | Try to identify marker genes across clusters performing Mann-Whitney U test. 23 | DE genes are identified the expression in each cluster versus the all the other. 24 | } 25 | -------------------------------------------------------------------------------- /man/findVarGenes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deGenes.R 3 | \name{findVarGenes} 4 | \alias{findVarGenes} 5 | \title{Find high variable genes following the approach 6 | proposed by Chen et al. in BMC Genomics (2016) 7 | Code adapted from https://github.com/hillas/scVEGs} 8 | \usage{ 9 | findVarGenes(data, fitMethod = "locfit", verbose = T) 10 | } 11 | \arguments{ 12 | \item{data}{list; GFICF object} 13 | 14 | \item{fitMethod}{charachter; Method to use to fit variance and mean expression relationship (loess or locfit).} 15 | 16 | \item{verbose}{boolean; Increase verbosity.} 17 | } 18 | \description{ 19 | Find high variable genes following the approach 20 | proposed by Chen et al. in BMC Genomics (2016) 21 | Code adapted from https://github.com/hillas/scVEGs 22 | } 23 | -------------------------------------------------------------------------------- /man/gficf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gficf.R 3 | \name{gficf} 4 | \alias{gficf} 5 | \title{Gene Frequency - Inverse Cell Frequency (GF-ICF)} 6 | \usage{ 7 | gficf( 8 | M = NULL, 9 | QCdata = NULL, 10 | cell_count_cutoff = 5, 11 | cell_percentage_cutoff2 = 0.03, 12 | nonz_mean_cutoff = 1.12, 13 | normalize = TRUE, 14 | storeRaw = TRUE, 15 | batches = NULL, 16 | groups = NULL, 17 | filterGenes = TRUE, 18 | verbose = TRUE, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{M}{Matrix; UMI cell count matrix} 24 | 25 | \item{QCdata}{list; QC cell object.} 26 | 27 | \item{cell_count_cutoff}{numeric; All genes detected in less than cell_count_cutoff cells will be excluded (default 5).} 28 | 29 | \item{cell_percentage_cutoff2}{numeric; All genes detected in at least this percentage of cells will be included (default 0.03, i.e. 3 percent of cells).} 30 | 31 | \item{nonz_mean_cutoff}{numeric genes detected in the number of cells between the above mentioned cutoffs are selected only when their average expression in non-zero cells is above this cutoff (default 1.12).} 32 | 33 | \item{normalize}{logical; Rescale UMI counts before apply GFICF. Rescaling is done using EdgeR normalization.} 34 | 35 | \item{storeRaw}{logical; Store UMI counts.} 36 | 37 | \item{batches}{vector; Vector or factor for batch.} 38 | 39 | \item{groups}{vector; Vector or factor for biological condition of interest.} 40 | 41 | \item{filterGenes}{logical; Apply gene filter (default TRUE).} 42 | 43 | \item{verbose}{boolean; Increase verbosity.} 44 | 45 | \item{...}{Additional arguments to pass to ComBat_seq call.} 46 | } 47 | \value{ 48 | The updated gficf object. 49 | } 50 | \description{ 51 | R implementation of the GF-ICF 52 | Thanks to 3’-end scRNA-seq approaches, we can now have an accurate estimation of gene expression without having to account for gene length, 53 | thus the number of transcripts (i.e. UMI) associated to each gene, strictly reflects the frequency of a gene in a cell, exactly like a word in a document. 54 | GFICF (Gene Frequency - Inverse Cell Frequency) is analogous of TF-IDF scoring method as defined for tex mining With GFICF we consider a cell to be 55 | analogous to a document, genes analogous to words and gene counts to be analogous of the word’s occurrence in a document. 56 | } 57 | -------------------------------------------------------------------------------- /man/loadGFICF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gficf.R 3 | \name{loadGFICF} 4 | \alias{loadGFICF} 5 | \title{Restore GFICF object.} 6 | \usage{ 7 | loadGFICF(file, verbose = T) 8 | } 9 | \arguments{ 10 | \item{file}{name of the file where the object is stored.} 11 | } 12 | \description{ 13 | Function to read a GFICF object from a file saved with \code{\link{saveGFICF}}. 14 | } 15 | \examples{ 16 | gficf_file <- tempfile("gficf_data") 17 | 18 | # restore 19 | data2 <- loadGFICF(file = gficf_file) 20 | 21 | } 22 | -------------------------------------------------------------------------------- /man/plotCells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{plotCells} 4 | \alias{plotCells} 5 | \title{Plot cells in the ebedded space} 6 | \usage{ 7 | plotCells(data, colorBy = NULL, pointSize = 0.5, pointShape = 46) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{colorBy}{characters; Color cells according to a column contained in data$embedded data frame. Default is NULL.} 13 | 14 | \item{pointSize}{integer; Size of the points in the plot. Default is 0.5.} 15 | 16 | \item{pointShape}{integer; Shape of the points in the plot. Default is 46.} 17 | } 18 | \value{ 19 | The updated gficf object. 20 | } 21 | \description{ 22 | Plot cells in the bidimensional space and color it according to a specific parameter. 23 | } 24 | -------------------------------------------------------------------------------- /man/plotGSEA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{plotGSEA} 4 | \alias{plotGSEA} 5 | \title{Plot GSEA results} 6 | \usage{ 7 | plotGSEA(data, fdr = 0.05, clusterRowCol = F) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{fdr}{number; FDR threshold to select significant pathways to plot.} 13 | 14 | \item{clusterRowCol}{boolean; if TRUE row and col of the plot are clustered.} 15 | } 16 | \value{ 17 | plot from ggplot2 package. 18 | } 19 | \description{ 20 | Circle plot for gene set enrichement analysis results. 21 | } 22 | -------------------------------------------------------------------------------- /man/plotGSVA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{plotGSVA} 4 | \alias{plotGSVA} 5 | \title{Plot GSEA results} 6 | \usage{ 7 | plotGSVA(data, fdr = 0.05, clusterRowCol = T, logFCth = 0) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{fdr}{number; FDR threshold to select significant pathways to plot.} 13 | 14 | \item{clusterRowCol}{boolean; if TRUE row and col of the plot are clustered.} 15 | 16 | \item{logFCth}{number; LogFC threshold to select pathways to plot.} 17 | } 18 | \value{ 19 | plot from ggplot2 package. 20 | } 21 | \description{ 22 | Circle plot for gene set enrichement analysis results. 23 | } 24 | -------------------------------------------------------------------------------- /man/plotGeneViolin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{plotGeneViolin} 4 | \alias{plotGeneViolin} 5 | \title{Plot the expression of a gene across group of cells.} 6 | \usage{ 7 | plotGeneViolin(data, gene, ncol = 3, x = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{gene}{characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix.} 13 | 14 | \item{ncol}{numeric; Number of columns of the final plot (defaul is 3).} 15 | 16 | \item{x}{Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL.} 17 | } 18 | \value{ 19 | A list of plots. 20 | } 21 | \description{ 22 | Plot the expression of a gene across group of cells with violion plot. 23 | } 24 | -------------------------------------------------------------------------------- /man/plotGenes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{plotGenes} 4 | \alias{plotGenes} 5 | \title{Plot gene expression across cells} 6 | \usage{ 7 | plotGenes(data, genes, log2Expr = T, x = NULL, rescale = F) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{genes}{characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix.} 13 | 14 | \item{log2Expr}{boolean; Relative expression of a gene is computed on rescaled in log2 expression (default TRUE).} 15 | 16 | \item{x}{Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL.} 17 | 18 | \item{rescale}{boolean; Rescale expression between 0 and 1. Default is false.} 19 | } 20 | \value{ 21 | A list of plots. 22 | } 23 | \description{ 24 | Plot the expression of a group of genes across cells. 25 | } 26 | -------------------------------------------------------------------------------- /man/plotPathway.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualization.R 3 | \name{plotPathway} 4 | \alias{plotPathway} 5 | \title{Plot GSEA results} 6 | \usage{ 7 | plotPathway(data, pathwayName, fdr = 0.05) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{pathwayName}{characters; Name of the pathway to plot.} 13 | 14 | \item{fdr}{number; FDR threshold to select significant pathways to plot.} 15 | } 16 | \value{ 17 | plot from ggplot2 package. 18 | } 19 | \description{ 20 | Plot GSEA values on top of UMAP/TSNE coordinates. 21 | } 22 | -------------------------------------------------------------------------------- /man/resetScGSEA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pathwayAnalisys.R 3 | \name{resetScGSEA} 4 | \alias{resetScGSEA} 5 | \title{Remove previous scGSEA analysis} 6 | \usage{ 7 | resetScGSEA(data) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | } 12 | \description{ 13 | Remove previous scGSEA analysis 14 | } 15 | -------------------------------------------------------------------------------- /man/runGSEA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pathwayAnalisys.R 3 | \name{runGSEA} 4 | \alias{runGSEA} 5 | \title{Gene Set Enrichement Analysi on GF-ICF} 6 | \usage{ 7 | runGSEA( 8 | data, 9 | gmt.file, 10 | nsim = 1000, 11 | convertToEns = T, 12 | convertHu2Mm = F, 13 | nt = 2, 14 | minSize = 15, 15 | maxSize = Inf, 16 | verbose = TRUE, 17 | seed = 180582, 18 | method = "GSEA" 19 | ) 20 | } 21 | \arguments{ 22 | \item{data}{list; GFICF object} 23 | 24 | \item{gmt.file}{characters; Path to gmt file from MSigDB} 25 | 26 | \item{nsim}{integer; number of simulation used to compute ES significance.} 27 | 28 | \item{convertToEns}{boolean: Convert gene sets from gene symbols to Ensable id.} 29 | 30 | \item{convertHu2Mm}{boolean: Convert gene sets from human symbols to Mouse Ensable id.} 31 | 32 | \item{nt}{numeric; Number of cpu to use for the GSEA} 33 | 34 | \item{minSize}{numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded.} 35 | 36 | \item{maxSize}{numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded.} 37 | 38 | \item{verbose}{boolean; Show the progress bar.} 39 | 40 | \item{seed}{integer; Seed to use for random number generation.} 41 | 42 | \item{method}{string; Method to use GSEA or GSVA. Default is GSEA.} 43 | } 44 | \value{ 45 | The updated gficf object. 46 | } 47 | \description{ 48 | Compute GSEA for each cluster across a set of input pathways. 49 | } 50 | -------------------------------------------------------------------------------- /man/runHarmony.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dimensinalityReduction.R 3 | \name{runHarmony} 4 | \alias{runHarmony} 5 | \title{Number of features to use} 6 | \usage{ 7 | runHarmony(data, metadata, var.to.use, verbose = T, ...) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{metadata}{dataframe; Either (1) Dataframe with variables to integrate or (2) vector with labels.} 13 | 14 | \item{var.to.use}{character; If meta_data is dataframe, this defined which variable(s) to remove (character vector).} 15 | 16 | \item{verbose}{boolean; Increase verbosity.} 17 | 18 | \item{...}{Additional arguments to pass to HarmonyMatrix function.} 19 | } 20 | \description{ 21 | Compute the number of dimension to use for either PCA or LSA. 22 | } 23 | -------------------------------------------------------------------------------- /man/runNMF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dimensinalityReduction.R 3 | \name{runNMF} 4 | \alias{runNMF} 5 | \title{Non-Negative Matrix Factorization (NMF)} 6 | \usage{ 7 | runNMF( 8 | data, 9 | dim = NULL, 10 | seed = 180582, 11 | use.odgenes = F, 12 | n.odgenes = NULL, 13 | plot.odgenes = F, 14 | nt = 0, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{list; GFICF object} 20 | 21 | \item{dim}{integer; Number of dimension which to reduce the dataset.} 22 | 23 | \item{seed}{integer; Initial seed to use.} 24 | 25 | \item{use.odgenes}{boolean; Use only significant overdispersed genes respect to ICF values.} 26 | 27 | \item{n.odgenes}{integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000.} 28 | 29 | \item{plot.odgenes}{boolean; Show significant overdispersed genes respect to ICF values.} 30 | 31 | \item{nt}{numeric; Numbmber of thread to use (default is 0, i.e. all available CPU cores).} 32 | 33 | \item{...}{Additional arguments to pass to nfm call (see ?RcppML::nmf).} 34 | 35 | \item{centre}{logical; Centre gficf scores before applying reduction (increase separation).} 36 | 37 | \item{randomized}{logical; Use randomized (faster) version for matrix decomposition (default is TRUE).} 38 | } 39 | \value{ 40 | The updated gficf object. 41 | } 42 | \description{ 43 | Reduce dimensionality of the single cell dataset using Non-Negative Matrix Factorization (NMF) 44 | } 45 | -------------------------------------------------------------------------------- /man/runPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dimensinalityReduction.R 3 | \name{runPCA} 4 | \alias{runPCA} 5 | \title{Principal Component Analysis (PCA)} 6 | \usage{ 7 | runPCA( 8 | data, 9 | dim = NULL, 10 | var.scale = F, 11 | centre = F, 12 | seed = 180582, 13 | use.odgenes = F, 14 | n.odgenes = NULL, 15 | plot.odgenes = F 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{list; GFICF object} 20 | 21 | \item{dim}{integer; Number of dimension which to reduce the dataset.} 22 | 23 | \item{centre}{logical; Centre gficf scores before applying reduction (increase separation).} 24 | 25 | \item{seed}{integer; Initial seed to use.} 26 | 27 | \item{use.odgenes}{boolean; Use only significant overdispersed genes respect to ICF values.} 28 | 29 | \item{n.odgenes}{integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000.} 30 | 31 | \item{plot.odgenes}{boolean; Show significant overdispersed genes respect to ICF values.} 32 | } 33 | \value{ 34 | The updated gficf object. 35 | 36 | The updated gficf object. 37 | } 38 | \description{ 39 | Reduce dimensionality of the single cell dataset using Principal Component Analysis (PCA) 40 | } 41 | -------------------------------------------------------------------------------- /man/runReduction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dimensinalityReduction.R 3 | \name{runReduction} 4 | \alias{runReduction} 5 | \title{Dimensionality reduction} 6 | \usage{ 7 | runReduction( 8 | data, 9 | reduction = "tumap", 10 | nt = 2, 11 | seed = 18051982, 12 | verbose = T, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{list; GFICF object} 18 | 19 | \item{reduction}{characters; Reduction method to use. One of: 20 | \itemize{ 21 | \item \code{"tsne"} 22 | \item \code{"umap"} 23 | \item \code{"tumap"} (the default) 24 | }} 25 | 26 | \item{nt}{integer; Number of thread to use (default 2).} 27 | 28 | \item{seed}{integer; Initial seed to use.} 29 | 30 | \item{verbose}{boolean; Icrease verbosity.} 31 | 32 | \item{...}{Additional arguments to pass to Rtsne/umap/tumap call.} 33 | } 34 | \value{ 35 | The updated gficf object. 36 | } 37 | \description{ 38 | Run t-SNE or UMAP or t-UMAP dimensionality reduction on selected features from PCA or NMF. 39 | See ?umap or ?Rtsne for additional parameter to use. 40 | } 41 | -------------------------------------------------------------------------------- /man/runScGSEA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pathwayAnalisys.R 3 | \name{runScGSEA} 4 | \alias{runScGSEA} 5 | \title{Single cell Gene Set Enrichement Analysis on GF-ICF} 6 | \usage{ 7 | runScGSEA( 8 | data, 9 | geneID, 10 | species, 11 | category, 12 | subcategory = NULL, 13 | pathway.list = NULL, 14 | nsim = 10000, 15 | nt = 0, 16 | minSize = 15, 17 | maxSize = Inf, 18 | verbose = TRUE, 19 | seed = 180582, 20 | nmf.k = 100, 21 | fdr.th = 0.05, 22 | gp = 0, 23 | rescale = "none", 24 | normalization = "gficf" 25 | ) 26 | } 27 | \arguments{ 28 | \item{data}{list; GFICF object} 29 | 30 | \item{geneID}{characters; The type of gene identifier to use, such as ensamble of symbol.} 31 | 32 | \item{species}{characters; Species name, such as human or mouse.} 33 | 34 | \item{category}{characters; MSigDB collection abbreviation, such as H or C1.} 35 | 36 | \item{subcategory}{characters; MSigDB sub-collection abbreviation, such as CGP or BP.} 37 | 38 | \item{pathway.list}{list; Custom list of pathways. Each element correspond to a pathway a and contains a vector of genes.} 39 | 40 | \item{nsim}{integer; number of simulation used to compute ES significance.} 41 | 42 | \item{nt}{numeric; Number of cpu to use for the GSEA and NMF. Default is 0 (i.e., all available cores minus one)} 43 | 44 | \item{minSize}{numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded.} 45 | 46 | \item{maxSize}{numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded.} 47 | 48 | \item{verbose}{boolean; Show the progress bar.} 49 | 50 | \item{seed}{integer; Seed to use for random number generation.} 51 | 52 | \item{nmf.k}{numeric; Rank of NMF.} 53 | 54 | \item{fdr.th}{numeric; FDR threshold for GSEA.} 55 | 56 | \item{rescale}{string; If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none.} 57 | 58 | \item{normalization;}{normalization to use before to apply NMF. Possible values are gficf or cpm. Default and highly raccomanded is gficf.} 59 | } 60 | \value{ 61 | The updated gficf object. 62 | } 63 | \description{ 64 | Compute GSEA for each cells across a set of input pathways by using NMF. 65 | } 66 | -------------------------------------------------------------------------------- /man/saveGFICF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gficf.R 3 | \name{saveGFICF} 4 | \alias{saveGFICF} 5 | \title{Save GFICF object.} 6 | \usage{ 7 | saveGFICF(data, file, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{data}{a GFICF object create by \code{\link{gficf}}.} 11 | 12 | \item{file}{name of the file where the model is to be saved.} 13 | } 14 | \description{ 15 | Function to write a GFICF object to a file, preserving UMAP model. 16 | } 17 | \examples{ 18 | # save 19 | gficf_file <- tempfile("gficf_test") 20 | saveGFICF(data, file = gficf_file) 21 | 22 | # restore 23 | data2 <- loadGFICF(file = gficf_file) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/scMAP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cellClassifier.R 3 | \name{scMAP} 4 | \alias{scMAP} 5 | \title{Embed new cells in an existing space} 6 | \usage{ 7 | scMAP(data, x, nt = 2, seed = 18051982, normalize = TRUE, verbose = TRUE) 8 | } 9 | \arguments{ 10 | \item{data}{list; GFICF object} 11 | 12 | \item{x}{Matrix; UMI counts matrix of cells to embedd.} 13 | 14 | \item{nt}{integer; Number of thread to use (default 2).} 15 | 16 | \item{seed}{integer; Initial seed to use.} 17 | 18 | \item{normalize}{boolean; If counts must be normalized before to be rescaled with GFICF.} 19 | 20 | \item{verbose}{boolean; Icrease verbosity.} 21 | } 22 | \value{ 23 | The updated gficf object. 24 | } 25 | \description{ 26 | This function embed new cells in an already existing space. For now it supports only UMAP and t-UMAP. Briefly new cells are first normalized with GF-ICF method but using as ICF weigth estimated on the existing cells and than projected in the existing PCA/NMF space before to be embedded in the already existing UMAP space via umap_transform function. 27 | } 28 | -------------------------------------------------------------------------------- /man/small_BC_atlas.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{small_BC_atlas} 5 | \alias{small_BC_atlas} 6 | \title{Small Single cell breast cancer atlas} 7 | \format{ 8 | ## `small_BC_atlas` 9 | A Matrix of 4,760 cells 10 | } 11 | \source{ 12 | 13 | } 14 | \usage{ 15 | small_BC_atlas 16 | } 17 | \description{ 18 | A subset of data from the the Single cell breast cancer atlas 19 | Only 150 cells per cell-line are used 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/symbolToEns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{symbolToEns} 4 | \alias{symbolToEns} 5 | \title{Convert Official Gene Symbols to Ensamble IDs} 6 | \usage{ 7 | symbolToEns(df, col, organism, verbose = T) 8 | } 9 | \arguments{ 10 | \item{df}{data frame; Data frame containing the IDs to convert.} 11 | 12 | \item{col}{characters; Name of column containing the gene symbol.} 13 | 14 | \item{organism}{characters; Organism of origin (i.e. human or mouse).} 15 | 16 | \item{verbose}{boolean; Icrease verbosity.} 17 | } 18 | \value{ 19 | The updated data frame with a new column called ens. 20 | } 21 | \description{ 22 | It uses biomart. If more the one gene is associated to the enamble, the first one retrived from 23 | Biomart is used. 24 | } 25 | -------------------------------------------------------------------------------- /man/test_BC_atlas.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{test_BC_atlas} 5 | \alias{test_BC_atlas} 6 | \title{Test set for scMAP method} 7 | \format{ 8 | ## `test_BC_atlas` 9 | A Matrix of 930 cells 10 | } 11 | \source{ 12 | 13 | } 14 | \usage{ 15 | test_BC_atlas 16 | } 17 | \description{ 18 | A subset of data from the the Single cell breast cancer atlas 19 | Only 30 cells per cell-line are used. They do not ovrlapp with cells into small_BC_atlas dataset 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | # Turn on C++11 support to get access to long long (guaranteed 64-bit ints) 2 | CXX_STD = CXX11 3 | 4 | # RcppGSL 5 | GSL_CFLAGS = $(shell ${R_HOME}/bin/Rscript -e "RcppGSL:::CFlags()") 6 | GSL_LIBS = $(shell ${R_HOME}/bin/Rscript -e "RcppGSL:::LdFlags()") `gsl-config --libs` 7 | PKG_CPPFLAGS += $(GSL_CFLAGS) 8 | PKG_LIBS += $(GSL_LIBS) 9 | 10 | # RcppParallel 11 | PKG_CXXFLAGS += -DRCPP_PARALLEL_USE_TBB=1 -DSTRICT_R_HEADERS -DRCPP_NO_RTTI 12 | PKG_LIBS += $(shell ${R_HOME}/bin/Rscript -e "RcppParallel::RcppParallelLibs()") 13 | 14 | # OMP 15 | PKG_CXXFLAGS += -I"./include" $(SHLIB_OPENMP_CXXFLAGS) 16 | PKG_LIBS += $(SHLIB_OPENMP_CXXFLAGS) 17 | PKG_CPPFLAGS += -DARMA_64BIT_WORD=1 18 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | # Turn on C++11 support to get access to long long (guaranteed 64-bit ints) 2 | CXX_STD = CXX11 3 | 4 | # RcppGSL 5 | PKG_CPPFLAGS= -I$(GSL_LIBS)/include 6 | PKG_LIBS= -L$(GSL_LIBS)/lib/x64 -lgsl -lgslcblas 7 | 8 | # RcppParallel 9 | PKG_CXXFLAGS += -DRCPP_PARALLEL_USE_TBB=1 -DSTRICT_R_HEADERS -DRCPP_NO_RTTI 10 | PKG_LIBS += $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::RcppParallelLibs()") 11 | 12 | # OMP 13 | PKG_CXXFLAGS += -I"./include" $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS += $(SHLIB_OPENMP_CXXFLAGS) 15 | PKG_CPPFLAGS += -DARMA_64BIT_WORD=1 16 | 17 | # LAPACK AND BLAS 18 | PKG_LIBS += $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 19 | -------------------------------------------------------------------------------- /src/ModularityOptimizer.h: -------------------------------------------------------------------------------- 1 | // This C++ code is from Seurat package available at https://github.com/satijalab/seurat 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | typedef std::vector IVector; 15 | typedef std::vector DVector; 16 | 17 | namespace ModularityOptimizer { 18 | 19 | class JavaRandom { 20 | private: 21 | uint64_t seed; 22 | int next(int bits); 23 | public: 24 | JavaRandom(uint64_t seed); 25 | int nextInt(int n); 26 | void setSeed(uint64_t seed); 27 | }; 28 | 29 | namespace Arrays2 { 30 | IVector generateRandomPermutation(int nElements); 31 | IVector generateRandomPermutation(int nElements, JavaRandom& random); 32 | } 33 | 34 | 35 | class Clustering { 36 | private: 37 | int nNodes; 38 | public: 39 | // Note: These two variables were "protected" in java, which means it is accessible to the whole package/public. 40 | // Although we could have used friend classes, this allows for better mirroring of the original code. 41 | int nClusters; 42 | IVector cluster; 43 | 44 | Clustering(int nNodes); 45 | Clustering(IVector cluster); 46 | int getNNodes() const {return nNodes;}; 47 | int getNClusters() const {return nClusters;}; 48 | IVector getClusters() const {return cluster;}; 49 | int getCluster(int node) const {return cluster[node];}; 50 | IVector getNNodesPerCluster() const; 51 | std::vector getNodesPerCluster() const; 52 | void setCluster(int node, int cluster); 53 | void initSingletonClusters(); 54 | void orderClustersByNNodes(); 55 | void mergeClusters(const Clustering& clustering); 56 | 57 | }; 58 | 59 | class Network { 60 | friend class VOSClusteringTechnique; 61 | protected: 62 | int nNodes; 63 | int nEdges; 64 | DVector nodeWeight; 65 | IVector firstNeighborIndex; 66 | IVector neighbor; 67 | DVector edgeWeight; 68 | double totalEdgeWeightSelfLinks; 69 | public: 70 | Network(); 71 | Network(int nNodes, DVector* nodeWeight, std::vector& edge, DVector* edgeWeight); 72 | Network(int nNodes, std::vector& edge) : 73 | Network(nNodes, nullptr, edge, nullptr) { }; 74 | Network(int nNodes, DVector* nodeWeight, std::vector edge) : 75 | Network(nNodes, nodeWeight, edge, nullptr) {}; 76 | Network(int nNodes, std::vector& edge, DVector* edgeWeight) : 77 | Network(nNodes, nullptr, edge, edgeWeight) {}; 78 | 79 | Network(int nNodes, DVector* nodeWeight, IVector& firstNeighborIndex, IVector& neighbor, DVector* edgeWeight); 80 | Network(int nNodes, IVector& firstNeighborIndex, IVector& neighbor) : 81 | Network(nNodes, nullptr, firstNeighborIndex, neighbor, nullptr) {}; 82 | 83 | Network(int nNodes, DVector* nodeWeight, IVector& firstNeighborIndex, IVector& neighbor) : 84 | Network(nNodes, nodeWeight, firstNeighborIndex, neighbor, nullptr){}; 85 | 86 | Network(int nNodes, IVector& firstNeighborIndex, IVector& neighbor, DVector* edgeWeight) : 87 | Network(nNodes, nullptr, firstNeighborIndex, neighbor, edgeWeight) {}; 88 | 89 | 90 | int getNNodes() {return nNodes;}; 91 | double getTotalNodeWeight(); 92 | DVector getNodeWeights(); 93 | double getNodeWeight(int node) { return nodeWeight.at(node);}; 94 | int getNEdges() {return nEdges / 2;}; 95 | int getNEdges(int node) {return firstNeighborIndex.at(node + 1) - firstNeighborIndex.at(node);}; 96 | IVector getNEdgesPerNode(); 97 | std::vector getEdges(); 98 | IVector getEdges(int node); 99 | std::vector getEdgesPerNode(); 100 | double getTotalEdgeWeight(); 101 | double getTotalEdgeWeight(int node); 102 | DVector getTotalEdgeWeightPerNode(); 103 | DVector getEdgeWeights() {return edgeWeight;}; 104 | DVector getEdgeWeights(int node); 105 | std::vector getEdgeWeightsPerNode(); 106 | double getTotalEdgeWeightSelfLinks() 107 | { 108 | return totalEdgeWeightSelfLinks; 109 | }; 110 | // Added these to avoid making these values public 111 | int getFirstNeighborIndexValue(int i) const { 112 | return firstNeighborIndex.at(i); 113 | }; 114 | int getNeighborValue(int index) const { 115 | return neighbor.at(index); 116 | } 117 | 118 | std::vector createSubnetworks(Clustering clustering) const; 119 | Network createReducedNetwork(const Clustering& clustering) const; 120 | Clustering identifyComponents(); 121 | private: 122 | double generateRandomNumber(int node1, int node2, const IVector& nodePermutation); 123 | Network createSubnetwork(const Clustering& clustering, int cluster, IVector& node, 124 | IVector& subnetworkNode, IVector& subnetworkNeighbor, DVector& subnetworkEdgeWeight) const; 125 | }; 126 | 127 | 128 | class VOSClusteringTechnique { 129 | private: 130 | std::shared_ptr network; 131 | std::shared_ptr clustering; 132 | double resolution; 133 | 134 | public: 135 | VOSClusteringTechnique(std::shared_ptr network, double resolution); 136 | VOSClusteringTechnique(std::shared_ptr network, std::shared_ptr clustering, double resolution); 137 | std::shared_ptr getNetwork() { return network;} 138 | std::shared_ptr getClustering() { return clustering; } 139 | double getResolution() {return resolution; } 140 | void setNetwork(std::shared_ptr network) {this->network = network;} 141 | void setClustering(std::shared_ptr clustering) {this->clustering = clustering;} 142 | void setResolution(double resolution) {this->resolution = resolution;} 143 | double calcQualityFunction(); 144 | 145 | bool runLocalMovingAlgorithm(JavaRandom& random); 146 | bool runLouvainAlgorithm(JavaRandom& random); 147 | bool runIteratedLouvainAlgorithm(int maxNIterations, JavaRandom& random); 148 | bool runLouvainAlgorithmWithMultilevelRefinement(JavaRandom& random); 149 | bool runIteratedLouvainAlgorithmWithMultilevelRefinement(int maxNIterations, JavaRandom& random); 150 | bool runSmartLocalMovingAlgorithm(JavaRandom& random); 151 | bool runIteratedSmartLocalMovingAlgorithm(int nIterations, JavaRandom& random); 152 | 153 | int removeCluster(int cluster); 154 | void removeSmallClusters(int minNNodesPerCluster); 155 | }; 156 | 157 | 158 | std::shared_ptr matrixToNetwork(IVector& node1, IVector& node2, DVector& edgeWeight1, int modularityFunction); 159 | std::shared_ptr readInputFile(std::string fname, int modularityFunction); 160 | std::vector split(const std::string& s, char delimiter); 161 | }; 162 | -------------------------------------------------------------------------------- /src/RModularityOptimizer.cpp: -------------------------------------------------------------------------------- 1 | // This C++ code is from Seurat package available at https://github.com/satijalab/seurat 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "ModularityOptimizer.h" 16 | 17 | using namespace ModularityOptimizer; 18 | using namespace std::chrono; 19 | using namespace Rcpp; 20 | 21 | 22 | // [[Rcpp::depends(RcppEigen)]] 23 | // [[Rcpp::depends(RcppProgress)]] 24 | // [[Rcpp::export]] 25 | IntegerVector RunModularityClusteringCpp(Eigen::SparseMatrix SNN, 26 | int modularityFunction, 27 | double resolution, 28 | int algorithm, 29 | int nRandomStarts, 30 | int nIterations, 31 | int randomSeed, 32 | bool printOutput, 33 | std::string edgefilename) { 34 | 35 | // validate arguments 36 | if(modularityFunction != 1 && modularityFunction != 2) 37 | stop("Modularity parameter must be equal to 1 or 2."); 38 | if(algorithm != 1 && algorithm !=2 && algorithm !=3 && algorithm !=4) 39 | stop("Algorithm for modularity optimization must be 1, 2, 3, or 4"); 40 | if(nRandomStarts < 1) 41 | stop("Have to have at least one start"); 42 | if(nIterations < 1) 43 | stop("Need at least one interation"); 44 | if (modularityFunction == 2 && resolution > 1.0) 45 | stop("error: resolution<1 for alternative modularity"); 46 | try { 47 | bool update; 48 | double modularity, maxModularity, resolution2; 49 | int i, j; 50 | 51 | std::string msg = "Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck"; 52 | if (printOutput) 53 | Rcout << msg << std::endl << std::endl; 54 | 55 | // Load netwrok 56 | std::shared_ptr network; 57 | if(edgefilename != "") { 58 | if (printOutput) 59 | Rcout << "Reading input file..." << std::endl << std::endl; 60 | try{ 61 | network = readInputFile(edgefilename, modularityFunction); 62 | } catch(...) { 63 | stop("Could not parse edge file."); 64 | } 65 | } else { 66 | // Load lower triangle 67 | int network_size = (SNN.nonZeros() / 2) + 3; 68 | IVector node1; 69 | IVector node2; 70 | DVector edgeweights; 71 | node1.reserve(network_size); 72 | node2.reserve(network_size); 73 | edgeweights.reserve(network_size); 74 | for (int k=0; k < SNN.outerSize(); ++k){ 75 | for (Eigen::SparseMatrix::InnerIterator it(SNN, k); it; ++it){ 76 | if(it.col() >= it.row()){ 77 | continue; 78 | } 79 | node1.emplace_back(it.col()); 80 | node2.emplace_back(it.row()); 81 | edgeweights.emplace_back(it.value()); 82 | } 83 | } 84 | if(node1.size() == 0) { 85 | stop("Matrix contained no network data. Check format."); 86 | } 87 | 88 | network = matrixToNetwork(node1, node2, edgeweights, modularityFunction); 89 | Rcpp::checkUserInterrupt(); 90 | } 91 | 92 | if (printOutput) 93 | { 94 | Rprintf("Number of nodes: %d\n", network->getNNodes()); 95 | Rprintf("Number of edges: %d\n", network->getNEdges()); 96 | Rcout << std::endl; 97 | Rcout << "Running " << ((algorithm == 1) ? "Louvain algorithm" : ((algorithm == 2) ? "Louvain algorithm with multilevel refinement" : "smart local moving algorithm")) << "..."; 98 | Rcout << std::endl; 99 | } 100 | 101 | resolution2 = ((modularityFunction == 1) ? (resolution / (2 * network->getTotalEdgeWeight() + network->getTotalEdgeWeightSelfLinks())) : resolution); 102 | 103 | auto beginTime = duration_cast(system_clock::now().time_since_epoch()); 104 | std::shared_ptr clustering; 105 | maxModularity = -std::numeric_limits::infinity(); 106 | JavaRandom random(randomSeed); 107 | 108 | Progress p(nRandomStarts, printOutput); 109 | for (i = 0; i < nRandomStarts; i++) 110 | { 111 | //if (printOutput && (nRandomStarts > 1)) 112 | //Rprintf("Random start: %d\n", i + 1); 113 | 114 | VOSClusteringTechnique vosClusteringTechnique(network, resolution2); 115 | 116 | j = 0; 117 | update = true; 118 | do 119 | { 120 | /*if (printOutput && (nIterations > 1)) 121 | Rprintf("Iteration: %d\n", j + 1); 122 | */ 123 | if (algorithm == 1) 124 | update = vosClusteringTechnique.runLouvainAlgorithm(random); 125 | else if (algorithm == 2) 126 | update = vosClusteringTechnique.runLouvainAlgorithmWithMultilevelRefinement(random); 127 | else if (algorithm == 3) 128 | vosClusteringTechnique.runSmartLocalMovingAlgorithm(random); 129 | j++; 130 | 131 | modularity = vosClusteringTechnique.calcQualityFunction(); 132 | 133 | //if (printOutput && (nIterations > 1)) 134 | // Rprintf("Modularity: %.4f\n", modularity); 135 | Rcpp::checkUserInterrupt(); 136 | } 137 | while ((j < nIterations) && update); 138 | 139 | if (modularity > maxModularity) 140 | { 141 | clustering = vosClusteringTechnique.getClustering(); 142 | maxModularity = modularity; 143 | } 144 | 145 | /*if (printOutput && (nRandomStarts > 1)) 146 | { 147 | if (nIterations == 1) 148 | Rprintf("Modularity: %.4f\n", modularity); 149 | Rcout << std::endl; 150 | }*/ 151 | p.increment(); 152 | } 153 | auto endTime = duration_cast(system_clock::now().time_since_epoch()); 154 | if(clustering == nullptr) { 155 | stop("Clustering step failed."); 156 | } 157 | if (printOutput) 158 | { 159 | if (nRandomStarts == 1) 160 | { 161 | if (nIterations > 1) 162 | Rcout << std::endl; 163 | Rprintf("Modularity: %.4f\n", maxModularity); 164 | } 165 | else 166 | Rprintf("Maximum modularity in %d random starts: %.4f\n", nRandomStarts, maxModularity); 167 | Rprintf("Number of communities: %d\n", clustering->getNClusters()); 168 | Rprintf("Elapsed time: %d seconds\n", static_cast((endTime - beginTime).count() / 1000.0)); 169 | } 170 | 171 | // Return results 172 | clustering->orderClustersByNNodes(); 173 | IntegerVector iv(clustering->cluster.cbegin(), clustering->cluster.cend()); 174 | return iv; 175 | } catch(std::exception &ex) { 176 | forward_exception_to_r(ex); 177 | } catch(...) { 178 | ::Rf_error("c++ exception (unknown reason)"); 179 | } 180 | return IntegerVector(1); 181 | } 182 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace Rcpp; 10 | 11 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 12 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 13 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 14 | #endif 15 | 16 | // RunModularityClusteringCpp 17 | IntegerVector RunModularityClusteringCpp(Eigen::SparseMatrix SNN, int modularityFunction, double resolution, int algorithm, int nRandomStarts, int nIterations, int randomSeed, bool printOutput, std::string edgefilename); 18 | RcppExport SEXP _gficf_RunModularityClusteringCpp(SEXP SNNSEXP, SEXP modularityFunctionSEXP, SEXP resolutionSEXP, SEXP algorithmSEXP, SEXP nRandomStartsSEXP, SEXP nIterationsSEXP, SEXP randomSeedSEXP, SEXP printOutputSEXP, SEXP edgefilenameSEXP) { 19 | BEGIN_RCPP 20 | Rcpp::RObject rcpp_result_gen; 21 | Rcpp::RNGScope rcpp_rngScope_gen; 22 | Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type SNN(SNNSEXP); 23 | Rcpp::traits::input_parameter< int >::type modularityFunction(modularityFunctionSEXP); 24 | Rcpp::traits::input_parameter< double >::type resolution(resolutionSEXP); 25 | Rcpp::traits::input_parameter< int >::type algorithm(algorithmSEXP); 26 | Rcpp::traits::input_parameter< int >::type nRandomStarts(nRandomStartsSEXP); 27 | Rcpp::traits::input_parameter< int >::type nIterations(nIterationsSEXP); 28 | Rcpp::traits::input_parameter< int >::type randomSeed(randomSeedSEXP); 29 | Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP); 30 | Rcpp::traits::input_parameter< std::string >::type edgefilename(edgefilenameSEXP); 31 | rcpp_result_gen = Rcpp::wrap(RunModularityClusteringCpp(SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename)); 32 | return rcpp_result_gen; 33 | END_RCPP 34 | } 35 | // jaccard_coeff 36 | NumericMatrix jaccard_coeff(NumericMatrix idx, bool printOutput); 37 | RcppExport SEXP _gficf_jaccard_coeff(SEXP idxSEXP, SEXP printOutputSEXP) { 38 | BEGIN_RCPP 39 | Rcpp::RObject rcpp_result_gen; 40 | Rcpp::RNGScope rcpp_rngScope_gen; 41 | Rcpp::traits::input_parameter< NumericMatrix >::type idx(idxSEXP); 42 | Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP); 43 | rcpp_result_gen = Rcpp::wrap(jaccard_coeff(idx, printOutput)); 44 | return rcpp_result_gen; 45 | END_RCPP 46 | } 47 | // armaColSumFull 48 | arma::mat armaColSumFull(const arma::mat& m, int ncores, bool verbose); 49 | RcppExport SEXP _gficf_armaColSumFull(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) { 50 | BEGIN_RCPP 51 | Rcpp::RObject rcpp_result_gen; 52 | Rcpp::RNGScope rcpp_rngScope_gen; 53 | Rcpp::traits::input_parameter< const arma::mat& >::type m(mSEXP); 54 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 55 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 56 | rcpp_result_gen = Rcpp::wrap(armaColSumFull(m, ncores, verbose)); 57 | return rcpp_result_gen; 58 | END_RCPP 59 | } 60 | // armaColSumSparse 61 | arma::mat armaColSumSparse(const arma::sp_mat& m, int ncores, bool verbose); 62 | RcppExport SEXP _gficf_armaColSumSparse(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) { 63 | BEGIN_RCPP 64 | Rcpp::RObject rcpp_result_gen; 65 | Rcpp::RNGScope rcpp_rngScope_gen; 66 | Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP); 67 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 68 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 69 | rcpp_result_gen = Rcpp::wrap(armaColSumSparse(m, ncores, verbose)); 70 | return rcpp_result_gen; 71 | END_RCPP 72 | } 73 | // colMeanVarS 74 | Rcpp::DataFrame colMeanVarS(const arma::sp_mat& m, int ncores); 75 | RcppExport SEXP _gficf_colMeanVarS(SEXP mSEXP, SEXP ncoresSEXP) { 76 | BEGIN_RCPP 77 | Rcpp::RObject rcpp_result_gen; 78 | Rcpp::RNGScope rcpp_rngScope_gen; 79 | Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP); 80 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 81 | rcpp_result_gen = Rcpp::wrap(colMeanVarS(m, ncores)); 82 | return rcpp_result_gen; 83 | END_RCPP 84 | } 85 | // armaManhattan 86 | arma::sp_mat armaManhattan(const arma::sp_mat& m, int ncores, bool verbose, bool full, bool diag); 87 | RcppExport SEXP _gficf_armaManhattan(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP, SEXP fullSEXP, SEXP diagSEXP) { 88 | BEGIN_RCPP 89 | Rcpp::RObject rcpp_result_gen; 90 | Rcpp::RNGScope rcpp_rngScope_gen; 91 | Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP); 92 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 93 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 94 | Rcpp::traits::input_parameter< bool >::type full(fullSEXP); 95 | Rcpp::traits::input_parameter< bool >::type diag(diagSEXP); 96 | rcpp_result_gen = Rcpp::wrap(armaManhattan(m, ncores, verbose, full, diag)); 97 | return rcpp_result_gen; 98 | END_RCPP 99 | } 100 | // armaCorr 101 | arma::sp_mat armaCorr(const arma::mat& m, int ncores, bool verbose, bool full, bool diag, bool dist); 102 | RcppExport SEXP _gficf_armaCorr(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP, SEXP fullSEXP, SEXP diagSEXP, SEXP distSEXP) { 103 | BEGIN_RCPP 104 | Rcpp::RObject rcpp_result_gen; 105 | Rcpp::RNGScope rcpp_rngScope_gen; 106 | Rcpp::traits::input_parameter< const arma::mat& >::type m(mSEXP); 107 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 108 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 109 | Rcpp::traits::input_parameter< bool >::type full(fullSEXP); 110 | Rcpp::traits::input_parameter< bool >::type diag(diagSEXP); 111 | Rcpp::traits::input_parameter< bool >::type dist(distSEXP); 112 | rcpp_result_gen = Rcpp::wrap(armaCorr(m, ncores, verbose, full, diag, dist)); 113 | return rcpp_result_gen; 114 | END_RCPP 115 | } 116 | // armaColMeans 117 | Rcpp::DataFrame armaColMeans(const arma::sp_mat& m, int ncores, bool verbose); 118 | RcppExport SEXP _gficf_armaColMeans(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) { 119 | BEGIN_RCPP 120 | Rcpp::RObject rcpp_result_gen; 121 | Rcpp::RNGScope rcpp_rngScope_gen; 122 | Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP); 123 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 124 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 125 | rcpp_result_gen = Rcpp::wrap(armaColMeans(m, ncores, verbose)); 126 | return rcpp_result_gen; 127 | END_RCPP 128 | } 129 | // scaleUMI 130 | arma::sp_mat scaleUMI(const arma::sp_mat& m, int ncores, bool verbose); 131 | RcppExport SEXP _gficf_scaleUMI(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) { 132 | BEGIN_RCPP 133 | Rcpp::RObject rcpp_result_gen; 134 | Rcpp::RNGScope rcpp_rngScope_gen; 135 | Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP); 136 | Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); 137 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 138 | rcpp_result_gen = Rcpp::wrap(scaleUMI(m, ncores, verbose)); 139 | return rcpp_result_gen; 140 | END_RCPP 141 | } 142 | // rcpp_WMU_test 143 | Rcpp::NumericMatrix rcpp_WMU_test(Rcpp::NumericMatrix M, Rcpp::NumericVector idx1, Rcpp::NumericVector idx2); 144 | RcppExport SEXP _gficf_rcpp_WMU_test(SEXP MSEXP, SEXP idx1SEXP, SEXP idx2SEXP) { 145 | BEGIN_RCPP 146 | Rcpp::RObject rcpp_result_gen; 147 | Rcpp::RNGScope rcpp_rngScope_gen; 148 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type M(MSEXP); 149 | Rcpp::traits::input_parameter< Rcpp::NumericVector >::type idx1(idx1SEXP); 150 | Rcpp::traits::input_parameter< Rcpp::NumericVector >::type idx2(idx2SEXP); 151 | rcpp_result_gen = Rcpp::wrap(rcpp_WMU_test(M, idx1, idx2)); 152 | return rcpp_result_gen; 153 | END_RCPP 154 | } 155 | // rcpp_parallel_jaccard_coef 156 | Rcpp::NumericMatrix rcpp_parallel_jaccard_coef(Rcpp::NumericMatrix mat, bool printOutput); 157 | RcppExport SEXP _gficf_rcpp_parallel_jaccard_coef(SEXP matSEXP, SEXP printOutputSEXP) { 158 | BEGIN_RCPP 159 | Rcpp::RObject rcpp_result_gen; 160 | Rcpp::RNGScope rcpp_rngScope_gen; 161 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type mat(matSEXP); 162 | Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP); 163 | rcpp_result_gen = Rcpp::wrap(rcpp_parallel_jaccard_coef(mat, printOutput)); 164 | return rcpp_result_gen; 165 | END_RCPP 166 | } 167 | // rcpp_parallel_WMU_test 168 | Rcpp::NumericMatrix rcpp_parallel_WMU_test(Rcpp::NumericMatrix matX, Rcpp::NumericMatrix matY, bool printOutput); 169 | RcppExport SEXP _gficf_rcpp_parallel_WMU_test(SEXP matXSEXP, SEXP matYSEXP, SEXP printOutputSEXP) { 170 | BEGIN_RCPP 171 | Rcpp::RObject rcpp_result_gen; 172 | Rcpp::RNGScope rcpp_rngScope_gen; 173 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type matX(matXSEXP); 174 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type matY(matYSEXP); 175 | Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP); 176 | rcpp_result_gen = Rcpp::wrap(rcpp_parallel_WMU_test(matX, matY, printOutput)); 177 | return rcpp_result_gen; 178 | END_RCPP 179 | } 180 | 181 | RcppExport SEXP detectCoresCpp(void); 182 | 183 | static const R_CallMethodDef CallEntries[] = { 184 | {"_gficf_RunModularityClusteringCpp", (DL_FUNC) &_gficf_RunModularityClusteringCpp, 9}, 185 | {"_gficf_jaccard_coeff", (DL_FUNC) &_gficf_jaccard_coeff, 2}, 186 | {"_gficf_armaColSumFull", (DL_FUNC) &_gficf_armaColSumFull, 3}, 187 | {"_gficf_armaColSumSparse", (DL_FUNC) &_gficf_armaColSumSparse, 3}, 188 | {"_gficf_colMeanVarS", (DL_FUNC) &_gficf_colMeanVarS, 2}, 189 | {"_gficf_armaManhattan", (DL_FUNC) &_gficf_armaManhattan, 5}, 190 | {"_gficf_armaCorr", (DL_FUNC) &_gficf_armaCorr, 6}, 191 | {"_gficf_armaColMeans", (DL_FUNC) &_gficf_armaColMeans, 3}, 192 | {"_gficf_scaleUMI", (DL_FUNC) &_gficf_scaleUMI, 3}, 193 | {"_gficf_rcpp_WMU_test", (DL_FUNC) &_gficf_rcpp_WMU_test, 3}, 194 | {"_gficf_rcpp_parallel_jaccard_coef", (DL_FUNC) &_gficf_rcpp_parallel_jaccard_coef, 2}, 195 | {"_gficf_rcpp_parallel_WMU_test", (DL_FUNC) &_gficf_rcpp_parallel_WMU_test, 3}, 196 | {"detectCoresCpp", (DL_FUNC) &detectCoresCpp, 0}, 197 | {NULL, NULL, 0} 198 | }; 199 | 200 | RcppExport void R_init_gficf(DllInfo *dll) { 201 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 202 | R_useDynamicSymbols(dll, FALSE); 203 | } 204 | -------------------------------------------------------------------------------- /src/detectCores.cpp: -------------------------------------------------------------------------------- 1 | // thnks to https://github.com/tnagler/RcppThread/commit/c26fc2b0d56555fa434c33352747822691334fe8 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | extern "C" { 8 | 9 | SEXP detectCoresCpp() { 10 | SEXP result; 11 | PROTECT(result = NEW_INTEGER(1)); 12 | INTEGER(result)[0] = std::thread::hardware_concurrency(); 13 | UNPROTECT(1); 14 | return result; 15 | } 16 | 17 | 18 | static const R_CallMethodDef callMethods[] = { 19 | {"detectCoresCpp", (DL_FUNC) &detectCoresCpp, 0}, 20 | {NULL, NULL, 0} 21 | }; 22 | 23 | void R_init_RcppThread(DllInfo *info) 24 | { 25 | R_registerRoutines(info, NULL, callMethods, NULL, NULL); 26 | R_useDynamicSymbols(info, TRUE); 27 | } 28 | 29 | } -------------------------------------------------------------------------------- /src/jaccard_coeff.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace Rcpp; 4 | 5 | // Original C++ code is from JinmiaoChenLab/Rphenograph package available at https://github.com/JinmiaoChenLab/Rphenograph/tree/master/src 6 | // 7 | // Compute jaccard coefficient between nearest-neighbor sets 8 | // 9 | // Weights of both i->j and j->i are recorded if they have intersection. In this case 10 | // w(i->j) should be equal to w(j->i). In some case i->j has weights while j<-i has no 11 | // intersections, only w(i->j) is recorded. This is determinded in code `if(u>0)`. 12 | // In this way, the undirected graph is symmetrized by halfing the weight 13 | // in code `weights(r, 2) = u/(2.0*ncol - u)/2`. 14 | // 15 | // Author: Chen Hao, Date: 25/09/2015 16 | 17 | // [[Rcpp::depends(RcppProgress)]] 18 | // [[Rcpp::export]] 19 | NumericMatrix jaccard_coeff(NumericMatrix idx, bool printOutput) { 20 | int nrow = idx.nrow(), ncol = idx.ncol(); 21 | NumericMatrix weights(nrow*ncol, 3); 22 | int r = 0; 23 | if (printOutput) 24 | { 25 | Rprintf("Running Jaccard Coefficient Estimation...\n"); 26 | } 27 | Progress p(nrow, printOutput); 28 | for (int i = 0; i < nrow; i++) { 29 | for (int j = 0; j < ncol; j++) { 30 | int k = idx(i,j)-1; 31 | NumericVector nodei = idx(i,_); 32 | NumericVector nodej = idx(k,_); 33 | int u = intersect(nodei, nodej).size(); // count intersection number 34 | if(u>0){ 35 | weights(r, 0) = i+1; 36 | weights(r, 1) = k+1; 37 | weights(r, 2) = u/(2.0*ncol - u); 38 | r++; 39 | } 40 | } 41 | p.increment(); 42 | } 43 | 44 | return weights; 45 | } 46 | -------------------------------------------------------------------------------- /src/mann_whitney.cpp: -------------------------------------------------------------------------------- 1 | #include "mann_whitney.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // Compute two-sided Mann–Whitney U test with coninuity correction between two umpaired samples. 12 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = T) 13 | // 14 | // Author: Gennaro Gambardella, Date: 20/08/2019 15 | 16 | 17 | template 18 | std::vector sort_indexes(std::vector const &v) 19 | { 20 | 21 | // initialize original index locations 22 | std::vector idx(v.size()); 23 | iota(idx.begin(), idx.end(), 0); 24 | 25 | // sort indexes based on comparing values in v 26 | sort(idx.begin(), idx.end(), 27 | [&v](size_t i1, size_t i2) {return v[i1] < v[i2];}); 28 | 29 | return idx; 30 | } 31 | 32 | std::vector getRanks(std::vector const &absoluteValues) 33 | { 34 | std::vector ranks(absoluteValues.size()); 35 | 36 | size_t i = 0; 37 | while (i < absoluteValues.size()) 38 | { 39 | size_t j = i + 1; 40 | while (j < absoluteValues.size()) 41 | { 42 | if(absoluteValues[i] != absoluteValues[j]) 43 | { 44 | break; 45 | } 46 | j++; 47 | } 48 | for(size_t k = i; k <= j-1; k++) 49 | { 50 | ranks[k] = 1 + (double)(i + j-1)/(double)2; 51 | } 52 | i = j; 53 | } 54 | return ranks; 55 | } 56 | 57 | std::vector getUniq(std::vector u) 58 | { 59 | // using default comparison: 60 | std::vector::iterator it; 61 | it = std::unique (u.begin(), u.end()); 62 | u.resize( std::distance(u.begin(),it) ); 63 | return u; 64 | } 65 | 66 | std::vector getCounts(std::vector const &valuesOrd) 67 | { 68 | std::vector u = getUniq(valuesOrd); 69 | std::vector u_counts(u.size(),0); 70 | size_t k=0; 71 | double prev=valuesOrd[0]; 72 | 73 | for(size_t i=0;i& u_counts,double n1, double n2) 88 | { 89 | double nties = 0; 90 | //int n = n1 + n2; 91 | 92 | if(u_counts.size() < n1 + n2) 93 | { 94 | for(std::vector::iterator it = u_counts.begin(); it != u_counts.end(); ++it) 95 | nties += ( ((*it) * (*it) * (*it)) - *it ); 96 | } 97 | 98 | return sqrt( (n1 * n2 / 12) * ( (n1 + n2 + 1) - nties / ((n1 + n2) * (n1 + n2 - 1))) ); 99 | } 100 | 101 | double getPvalue(double& z) 102 | { 103 | double p =1; 104 | if(z<0){ 105 | p = gsl_cdf_gaussian_P(z,1)*2; 106 | } else { 107 | p = gsl_cdf_gaussian_Q(z,1)*2; 108 | } 109 | return p; 110 | } 111 | 112 | Rcpp::NumericVector subset(Rcpp::NumericVector const &v, Rcpp::NumericVector idx) 113 | { 114 | Rcpp::NumericVector res(idx.size()); 115 | size_t k=0; 116 | for(Rcpp::NumericVector::iterator it = idx.begin(); it != idx.end(); ++it, k++) 117 | res(k) = v[*it - 1]; 118 | return res; 119 | } 120 | 121 | double avg(Rcpp::NumericVector const &v) 122 | { 123 | return std::accumulate(v.begin(), v.end(), 0.0) / v.size(); 124 | } 125 | 126 | double avg(std::vector const &v) 127 | { 128 | return std::accumulate(v.begin(), v.end(), 0.0) / v.size(); 129 | } 130 | 131 | 132 | double MWUtest(Rcpp::NumericVector const &v1, Rcpp::NumericVector const &v2) 133 | { 134 | double pval = 1; 135 | std::vector values(v1.length()); 136 | std::vector valuesOrd(v1.length()+v2.length()); 137 | std::vector valuesRnk(v1.length()+v2.length()); 138 | 139 | // concat vectors 140 | std::copy(v1.begin(), v1.end(), values.begin()); 141 | values.insert( values.end(),v2.begin(),v2.end()); 142 | 143 | // sort 144 | std::vector idx = sort_indexes(values); 145 | for(size_t i=0;i nties = getCounts(valuesOrd); 154 | 155 | if(nties.size()>1) { 156 | 157 | double long U1 = (v1.size() * (v1.size()+1)) * -0.5; 158 | double long U2 = (v2.size() * (v2.size()+1)) * -0.5; 159 | std::vector u1_rnk(v1.size(),0); 160 | std::vector u2_rnk(v2.size(),0); 161 | std::vector v1_ord(v1.size(),0); 162 | std::vector v2_ord(v2.size(),0); 163 | int u1_i=0; 164 | int u2_i = 0; 165 | 166 | for(size_t i=0;i 2 | 3 | // Compute two-sided Mann–Whitney U test between two umpaired samples. 4 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = F) 5 | // 6 | // Author: Gennaro Gambardella, Date: 20/08/2019 7 | 8 | 9 | template 10 | std::vector sort_indexes(std::vector const &v); 11 | 12 | std::vector getRanks(std::vector const &absoluteValues); 13 | 14 | std::vector getUniq(std::vector u); 15 | 16 | std::vector getCounts(std::vector const &valuesOrd); 17 | 18 | double getSigma(std::vector& u_counts,double n1, double n2); 19 | 20 | double getPvalue(double& z); 21 | 22 | Rcpp::NumericVector subset(Rcpp::NumericVector const &v, Rcpp::NumericVector idx); 23 | 24 | double avg(Rcpp::NumericVector const &v); 25 | 26 | double avg(std::vector const &v); 27 | 28 | double MWUtest(Rcpp::NumericVector const &v1, Rcpp::NumericVector const &v2); 29 | -------------------------------------------------------------------------------- /src/misc.cpp: -------------------------------------------------------------------------------- 1 | #define ARMA_64BIT_WORD 2 | #include 3 | #include 4 | 5 | #ifdef _OPENMP 6 | #include 7 | #endif 8 | 9 | using namespace std; 10 | using namespace Rcpp; 11 | 12 | // [[Rcpp::depends(RcppArmadillo)]] 13 | // [[Rcpp::plugins(openmp)]] 14 | // [[Rcpp::plugins(cpp11)]] 15 | // [[Rcpp::depends(RcppProgress)]] 16 | 17 | 18 | // [[Rcpp::export]] 19 | arma::mat armaColSumFull(const arma::mat& m, int ncores=1, bool verbose=false) 20 | { 21 | arma::vec s(m.n_cols, arma::fill::zeros); 22 | int tot = m.n_cols; 23 | Progress p(tot, verbose); 24 | #pragma omp parallel for num_threads(ncores) shared(s) 25 | for(int i=0;i<(m.n_cols);i++) { 26 | s(i) = arma::sum(m.col(i)); 27 | } 28 | return(s); 29 | } 30 | 31 | // [[Rcpp::export]] 32 | arma::mat armaColSumSparse(const arma::sp_mat& m, int ncores=1, bool verbose=false) 33 | { 34 | arma::vec s(m.n_cols, arma::fill::zeros); 35 | int tot = m.n_cols; 36 | Progress p(tot, verbose); 37 | #pragma omp parallel for num_threads(ncores) shared(s) 38 | for(int i=0;i<(m.n_cols);i++) { 39 | s(i) = arma::sum(m.col(i)); 40 | } 41 | return(s); 42 | } 43 | 44 | // calculate column mean and variance 45 | // [[Rcpp::export]] 46 | Rcpp::DataFrame colMeanVarS(const arma::sp_mat& m, int ncores=1) { 47 | 48 | arma::vec meanV(m.n_cols,arma::fill::zeros); arma::vec varV(m.n_cols,arma::fill::zeros); arma::vec nobsV(m.n_cols,arma::fill::zeros); 49 | #pragma omp parallel for num_threads(ncores) shared(meanV,varV,nobsV) 50 | for(int i=0;i<(m.n_cols);i++) { 51 | meanV(i) = arma::mean(m.col(i)); 52 | nobsV(i) = m.col(i).n_nonzero; 53 | varV(i) = arma::var(m.col(i)); 54 | } 55 | return Rcpp::DataFrame::create(Named("m")=meanV,Named("v")=varV,Named("nobs",nobsV)); 56 | } 57 | 58 | // [[Rcpp::export]] 59 | arma::sp_mat armaManhattan(const arma::sp_mat& m, int ncores=1,bool verbose=true, bool full=false, bool diag=true) 60 | { 61 | typedef arma::sp_mat::const_col_iterator iter; 62 | arma::sp_mat d(m.n_cols,m.n_cols); 63 | int tot = (full) ? m.n_cols*m.n_cols - ((diag) ? m.n_cols : 0) : (m.n_cols*m.n_cols + ((diag) ? m.n_cols : 0))/2; 64 | Progress p(tot, verbose); 65 | #pragma omp parallel for num_threads(ncores) shared(d) 66 | for(int i=0;i<(m.n_cols);i++) { 67 | for(int j = diag ? i : i+1;j0) { 145 | meanV[i] = tot/n; 146 | meanAll[i] = tot/m.n_cols; 147 | nobsV[i] = n; 148 | } 149 | p.increment(); // update progress 150 | } 151 | } 152 | return Rcpp::DataFrame::create(Named("mu1")=meanV,Named("mu2")=meanAll,Named("nobs",nobsV)); 153 | } 154 | 155 | // [[Rcpp::export]] 156 | arma::sp_mat scaleUMI(const arma::sp_mat& m, int ncores=1,bool verbose=false) 157 | { 158 | typedef arma::sp_mat::const_col_iterator iter; 159 | typedef arma::sp_mat::const_row_col_iterator iter2; 160 | 161 | arma::sp_mat d(m.n_rows,m.n_cols); 162 | d.zeros(); 163 | arma::vec totV(m.n_cols,arma::fill::zeros); 164 | arma::vec medianV(m.n_cols,arma::fill::zeros); 165 | Progress p(m.n_cols, verbose); 166 | 167 | #pragma omp parallel for num_threads(ncores) shared(totV) 168 | for(unsigned int i=0;i 3 | 4 | // Compute two-sided Mann–Whitney U test between two umpaired samples. 5 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = F) 6 | // 7 | // Author: Gennaro Gambardella, Date: 20/08/2019 8 | 9 | 10 | 11 | // [[Rcpp::export]] 12 | Rcpp::NumericMatrix rcpp_WMU_test(Rcpp::NumericMatrix M,Rcpp::NumericVector idx1,Rcpp::NumericVector idx2) { 13 | Rcpp::NumericMatrix res(M.nrow(),2); 14 | 15 | for(int i=0;i 2 | #include 3 | 4 | // Compute jaccard coefficient between nearest-neighbor sets in parallell 5 | // 6 | // Author: Gennaro Gambardella, Date: 12/08/2019 7 | 8 | 9 | // [[Rcpp::depends(RcppParallel)]] 10 | struct JCoefficient : public RcppParallel::Worker { 11 | 12 | // input matrix to read from 13 | const RcppParallel::RMatrix mat; 14 | 15 | // output matrix to write to 16 | RcppParallel::RMatrix rmat; 17 | 18 | // initialize from Rcpp input and output matrixes (the RMatrix class 19 | // can be automatically converted to from the Rcpp matrix type) 20 | JCoefficient(const Rcpp::NumericMatrix& mat, Rcpp::NumericMatrix& rmat) 21 | : mat(mat), rmat(rmat) {} 22 | 23 | // function call operator that work for the specified range (begin/end) 24 | void operator()(std::size_t begin, std::size_t end) { 25 | for (std::size_t i = begin; i < end; i++) { 26 | for (std::size_t j = 0; j < mat.ncol(); j++) { 27 | 28 | int k = mat(i,j)-1; 29 | 30 | RcppParallel::RMatrix::Row row1(mat.row(i)); 31 | std::vector v1(row1.length()); 32 | std::copy(row1.begin(), row1.end(), v1.begin()); 33 | 34 | RcppParallel::RMatrix::Row row2(mat.row(k)); 35 | std::vector v2(row2.length()); 36 | std::copy(row2.begin(), row2.end(), v2.begin()); 37 | 38 | std::sort(v1.begin(), v1.end()); 39 | std::sort(v2.begin(), v2.end()); 40 | 41 | std::vector v_intersection; 42 | 43 | std::set_intersection(v1.begin(), v1.end(), 44 | v2.begin(), v2.end(), 45 | std::back_inserter(v_intersection)); 46 | int u = v_intersection.size(); 47 | 48 | if(u>0){ 49 | rmat((i*mat.ncol())+j, 0) = i+1; 50 | rmat((i*mat.ncol())+j, 1) = k+1; 51 | rmat((i*mat.ncol())+j, 2) = u/(2.0*mat.ncol() - u); 52 | } 53 | } 54 | } 55 | } 56 | }; 57 | 58 | // [[Rcpp::export]] 59 | Rcpp::NumericMatrix rcpp_parallel_jaccard_coef(Rcpp::NumericMatrix mat, bool printOutput) { 60 | 61 | if (printOutput) 62 | { 63 | Rprintf("Running Parallell Jaccard Coefficient Estimation...\n"); 64 | } 65 | 66 | // allocate the matrix we will return 67 | Rcpp::NumericMatrix rmat(mat.nrow()*mat.ncol(),3); 68 | 69 | // create the worker 70 | JCoefficient JCoefficient(mat, rmat); 71 | 72 | // call it with parallelFor 73 | parallelFor(0, mat.nrow(), JCoefficient); 74 | 75 | if (printOutput) 76 | { 77 | Rprintf("Done!!\n"); 78 | } 79 | return rmat; 80 | } 81 | -------------------------------------------------------------------------------- /src/rcpp_parallel_mann_whitney.cpp: -------------------------------------------------------------------------------- 1 | #include "mann_whitney.h" 2 | #include 3 | #include 4 | #include 5 | 6 | // Compute two-sided Mann–Whitney U test with coninuity correction between two umpaired samples. 7 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = F) 8 | // 9 | // Author: Gennaro Gambardella, Date: 20/08/2019 10 | 11 | // [[Rcpp::depends(RcppParallel)]] 12 | struct WMU_test : public RcppParallel::Worker { 13 | 14 | // input matrix to read from 15 | const RcppParallel::RMatrix matX; 16 | const RcppParallel::RMatrix matY; 17 | 18 | // output matrix to write to 19 | RcppParallel::RMatrix rmat; 20 | 21 | // initialize from Rcpp input and output matrixes (the RMatrix class 22 | // can be automatically converted to from the Rcpp matrix type) 23 | WMU_test(const Rcpp::NumericMatrix& matX, const Rcpp::NumericMatrix& matY, Rcpp::NumericMatrix& rmat) 24 | : matX(matX), matY(matY), rmat(rmat) {} 25 | 26 | // function call operator that work for the specified range (begin/end) 27 | void operator()(std::size_t begin, std::size_t end) { 28 | for (std::size_t k = begin; k < end; k++) { 29 | 30 | RcppParallel::RMatrix::Row row1(matX.row(k)); 31 | RcppParallel::RMatrix::Row row2(matY.row(k)); 32 | 33 | std::vector v1(row1.length()); 34 | std::copy(row1.begin(), row1.end(), v1.begin()); 35 | 36 | std::vector v2(row2.length()); 37 | std::copy(row2.begin(), row2.end(), v2.begin()); 38 | 39 | double pval = 1; 40 | std::vector values(v1.size()); 41 | std::vector valuesOrd(v1.size()+v2.size()); 42 | std::vector valuesRnk(v1.size()+v2.size()); 43 | 44 | // concat vectors 45 | std::copy(v1.begin(), v1.end(), values.begin()); 46 | values.insert( values.end(),v2.begin(),v2.end()); 47 | 48 | // sort 49 | std::vector idx = sort_indexes(values); 50 | for(size_t i=0;i nties = getCounts(valuesOrd); 59 | 60 | if(nties.size()>1) { 61 | 62 | double long U1 = (v1.size() * (v1.size()+1)) * -0.5; 63 | double long U2 = (v2.size() * (v2.size()+1)) * -0.5; 64 | std::vector u1_rnk(v1.size(),0); 65 | std::vector u2_rnk(v2.size(),0); 66 | std::vector v1_ord(v1.size(),0); 67 | std::vector v2_ord(v2.size(),0); 68 | int u1_i=0; 69 | int u2_i = 0; 70 | 71 | for(size_t i=0;i(), 1.0)); 99 | transform(v2.begin(), v2.end(), v2.begin(),bind2nd(std::plus(), 1.0)); 100 | rmat(k,1) = log2( (double) (avg(v1)/avg(v2)) ); 101 | } 102 | } 103 | }; 104 | 105 | // [[Rcpp::export]] 106 | Rcpp::NumericMatrix rcpp_parallel_WMU_test(Rcpp::NumericMatrix matX,Rcpp::NumericMatrix matY, bool printOutput) { 107 | 108 | if (printOutput) 109 | { 110 | Rprintf("Running Parallell WM-U test...\n"); 111 | } 112 | 113 | // allocate the matrix we will return 114 | Rcpp::NumericMatrix rmat(matX.nrow(),2); 115 | 116 | // create the worker 117 | WMU_test WMU_test(matX, matY, rmat); 118 | 119 | // call it with parallelFor 120 | parallelFor(0, matX.nrow(), WMU_test); 121 | 122 | if (printOutput) 123 | { 124 | Rprintf("Done!!\n"); 125 | } 126 | return rmat; 127 | } 128 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(gficf) 3 | 4 | # test_check("gficf") 5 | --------------------------------------------------------------------------------