├── .gitignore
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS.md
├── R
    ├── RcppExports.R
    ├── cellClassifier.R
    ├── cellQC.R
    ├── clustCells.R
    ├── data.R
    ├── deGenes.R
    ├── dimensinalityReduction.R
    ├── gficf.R
    ├── pathwayAnalisys.R
    ├── util.R
    ├── visualization.R
    └── zzz.R
├── README.md
├── data-raw
    ├── small_atlas.R
    └── test_BC_atlas.R
├── data
    ├── small_BC_atlas.rda
    └── test_BC_atlas.rda
├── img
    ├── Cd34_expression.png
    ├── Cd8a_expression.png
    ├── tabula_annotated.png
    └── tabula_clusters.png
├── inst
    └── doc
    │   ├── index.Rmd
    │   ├── index.html
    │   ├── index_files
    │       └── figure-html
    │       │   ├── ccl-1.png
    │       │   ├── clustering-1.png
    │       │   └── norm-1.png
    │   ├── installation.Rmd
    │   ├── installation.html
    │   ├── scGSEA.Rmd
    │   ├── scGSEA.html
    │   ├── scGSEA_files
    │       └── figure-html
    │       │   ├── atlas-1.png
    │       │   ├── clust-1.png
    │       │   └── graph-1.png
    │   ├── scMAP.Rmd
    │   ├── scMAP.html
    │   └── scMAP_files
    │       └── figure-html
    │           ├── atlas-1.png
    │           └── map-1.png
├── man
    ├── Read10X.Rd
    ├── classify.cells.Rd
    ├── clustcells.Rd
    ├── clustcellsBYscGSEA.Rd
    ├── computePCADim.Rd
    ├── ensToSymbol.Rd
    ├── filterCells.Rd
    ├── findClusterMarkers.Rd
    ├── findVarGenes.Rd
    ├── gficf.Rd
    ├── loadGFICF.Rd
    ├── plotCells.Rd
    ├── plotGSEA.Rd
    ├── plotGSVA.Rd
    ├── plotGeneViolin.Rd
    ├── plotGenes.Rd
    ├── plotPathway.Rd
    ├── resetScGSEA.Rd
    ├── runGSEA.Rd
    ├── runHarmony.Rd
    ├── runNMF.Rd
    ├── runPCA.Rd
    ├── runReduction.Rd
    ├── runScGSEA.Rd
    ├── saveGFICF.Rd
    ├── scMAP.Rd
    ├── small_BC_atlas.Rd
    ├── symbolToEns.Rd
    └── test_BC_atlas.Rd
├── src
    ├── .gitignore
    ├── Makevars
    ├── Makevars.win
    ├── ModularityOptimizer.cpp
    ├── ModularityOptimizer.h
    ├── RModularityOptimizer.cpp
    ├── RcppExports.cpp
    ├── detectCores.cpp
    ├── jaccard_coeff.cpp
    ├── mann_whitney.cpp
    ├── mann_whitney.h
    ├── misc.cpp
    ├── rcpp_mann_whitney.cpp
    ├── rcpp_parallel_jaccard_coeff.cpp
    └── rcpp_parallel_mann_whitney.cpp
└── tests
    └── testthat.R


/.gitignore:
--------------------------------------------------------------------------------
 1 | .*
 2 | .Rproj
 3 | !.gitignore
 4 | 
 5 | # History files
 6 | .Rhistory
 7 | .Rapp.history
 8 | 
 9 | # Session Data files
10 | .RData
11 | 
12 | # Example code in package build process
13 | *-Ex.R
14 | 
15 | # Output files from R CMD build
16 | /*.tar.gz
17 | 
18 | # Output files from R CMD check
19 | /*.Rcheck/
20 | 
21 | # RStudio files
22 | .Rproj.user/
23 | 
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 | 
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 | 
31 | # knitr and R markdown default cache directories
32 | /*_cache/
33 | /cache/
34 | 
35 | # Temporary files created by R markdown
36 | *.utf8.md
37 | *.knit.md
38 | 
39 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
40 | rsconnect/
41 | 
42 | .Rproj.user
43 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: gficf
 2 | Title: Gene Frequency - Inverse Cell Frequency
 3 | Version: 2.0.0
 4 | Authors@R: person("Gennaro", "Gambardella", email = "gambardella@tigem.it", role = c("aut", "cre"))
 5 | Description: Single cell data rapresentation as text
 6 | License: GPL-3
 7 | URL: https://github.com/gambalab/gficf
 8 | BugReports: https://github.com/gambalab/gficf
 9 | Encoding: UTF-8
10 | LazyData: true
11 | RoxygenNote: 7.2.2
12 | Depends: R (>= 3.6), RcppML
13 | LinkingTo: RcppArmadillo, Rcpp (>= 0.11.0), RcppGSL, RcppProgress, RcppParallel, RcppEigen
14 | biocViews:
15 | Imports: 
16 |     uwot,
17 |     igraph,
18 |     ggplot2,
19 |     irlba,
20 |     ggrepel,
21 |     Rtsne,
22 |     fastmatch,
23 |     edgeR,
24 |     fgsea,
25 |     RSpectra,
26 |     Matrix,
27 |     babelgene,
28 |     reshape2,
29 |     RcppParallel,
30 |     Rcpp,
31 |     MASS,
32 |     locfit,
33 |     reticulate,
34 |     leiden,
35 |     KernelKnn,
36 |     mgcv,
37 |     GSVA,
38 |     limma,
39 |     pointr,
40 |     AnnotationHub,
41 |     ensembldb,
42 |     harmony,
43 |     BiocParallel
44 | Remotes:
45 |     github::zdebruine/RcppML
46 | SystemRequirements: GNU make
47 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(Read10X)
 4 | export(classify.cells)
 5 | export(clustcells)
 6 | export(clustcellsBYscGSEA)
 7 | export(computePCADim)
 8 | export(ensToSymbol)
 9 | export(filterCells)
10 | export(findClusterMarkers)
11 | export(gficf)
12 | export(loadGFICF)
13 | export(plotCells)
14 | export(plotGSEA)
15 | export(plotGSVA)
16 | export(plotGeneViolin)
17 | export(plotGenes)
18 | export(plotPathway)
19 | export(resetScGSEA)
20 | export(runGSEA)
21 | export(runHarmony)
22 | export(runNMF)
23 | export(runPCA)
24 | export(runReduction)
25 | export(runScGSEA)
26 | export(saveGFICF)
27 | export(scMAP)
28 | export(symbolToEns)
29 | import(AnnotationHub)
30 | import(GSVA)
31 | import(Matrix)
32 | import(RcppML)
33 | import(ensembldb)
34 | import(fastmatch)
35 | import(ggplot2)
36 | import(ggrepel)
37 | import(mgcv)
38 | import(msigdbr)
39 | import(pointr)
40 | import(utils)
41 | import(uwot)
42 | importFrom(BiocParallel,SnowParam)
43 | importFrom(KernelKnn,KernelKnn)
44 | importFrom(MASS,fitdistr)
45 | importFrom(Matrix,readMM)
46 | importFrom(RSpectra,svds)
47 | importFrom(RcppML,nmf)
48 | importFrom(RcppParallel,RcppParallelLibs)
49 | importFrom(RcppParallel,setThreadOptions)
50 | importFrom(Rtsne,Rtsne)
51 | importFrom(babelgene,orthologs)
52 | importFrom(edgeR,DGEList)
53 | importFrom(edgeR,calcNormFactors)
54 | importFrom(edgeR,cpm)
55 | importFrom(fgsea,fgsea)
56 | importFrom(harmony,HarmonyMatrix)
57 | importFrom(igraph,as_adj)
58 | importFrom(igraph,cluster_louvain)
59 | importFrom(igraph,fastgreedy.community)
60 | importFrom(igraph,graph.adjacency)
61 | importFrom(igraph,graph.data.frame)
62 | importFrom(igraph,membership)
63 | importFrom(igraph,simplify)
64 | importFrom(igraph,walktrap.community)
65 | importFrom(irlba,irlba)
66 | importFrom(leiden,leiden)
67 | importFrom(limma,eBayes)
68 | importFrom(limma,lmFit)
69 | importFrom(limma,topTable)
70 | importFrom(locfit,locfit)
71 | importFrom(locfit,locfit.robust)
72 | importFrom(locfit,lp)
73 | importFrom(mgcv,gam)
74 | importFrom(reshape2,acast)
75 | importFrom(reshape2,melt)
76 | importFrom(reticulate,py_module_available)
77 | importFrom(sva,ComBat_seq)
78 | importFrom(utils,read.delim)
79 | importFrom(utils,read.table)
80 | importFrom(utils,write.table)
81 | useDynLib(gficf, .registration = TRUE)
82 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # gf-icf 0.0.0.1
 2 | 
 3 | ## New features
 4 | 
 5 | 
 6 | ## Bug fixes and minor improvements
 7 | 
 8 | 
 9 | # gficf 0.0.0.1 (XX XXX 2019)
10 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | RunModularityClusteringCpp <- function(SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename) {
 5 |     .Call(`_gficf_RunModularityClusteringCpp`, SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename)
 6 | }
 7 | 
 8 | jaccard_coeff <- function(idx, printOutput) {
 9 |     .Call(`_gficf_jaccard_coeff`, idx, printOutput)
10 | }
11 | 
12 | armaColSumFull <- function(m, ncores = 1L, verbose = FALSE) {
13 |     .Call(`_gficf_armaColSumFull`, m, ncores, verbose)
14 | }
15 | 
16 | armaColSumSparse <- function(m, ncores = 1L, verbose = FALSE) {
17 |     .Call(`_gficf_armaColSumSparse`, m, ncores, verbose)
18 | }
19 | 
20 | colMeanVarS <- function(m, ncores = 1L) {
21 |     .Call(`_gficf_colMeanVarS`, m, ncores)
22 | }
23 | 
24 | armaManhattan <- function(m, ncores = 1L, verbose = TRUE, full = FALSE, diag = TRUE) {
25 |     .Call(`_gficf_armaManhattan`, m, ncores, verbose, full, diag)
26 | }
27 | 
28 | armaCorr <- function(m, ncores = 1L, verbose = TRUE, full = FALSE, diag = TRUE, dist = TRUE) {
29 |     .Call(`_gficf_armaCorr`, m, ncores, verbose, full, diag, dist)
30 | }
31 | 
32 | armaColMeans <- function(m, ncores = 1L, verbose = TRUE) {
33 |     .Call(`_gficf_armaColMeans`, m, ncores, verbose)
34 | }
35 | 
36 | scaleUMI <- function(m, ncores = 1L, verbose = FALSE) {
37 |     .Call(`_gficf_scaleUMI`, m, ncores, verbose)
38 | }
39 | 
40 | rcpp_WMU_test <- function(M, idx1, idx2) {
41 |     .Call(`_gficf_rcpp_WMU_test`, M, idx1, idx2)
42 | }
43 | 
44 | rcpp_parallel_jaccard_coef <- function(mat, printOutput) {
45 |     .Call(`_gficf_rcpp_parallel_jaccard_coef`, mat, printOutput)
46 | }
47 | 
48 | rcpp_parallel_WMU_test <- function(matX, matY, printOutput) {
49 |     .Call(`_gficf_rcpp_parallel_WMU_test`, matX, matY, printOutput)
50 | }
51 | 
52 | 


--------------------------------------------------------------------------------
/R/cellClassifier.R:
--------------------------------------------------------------------------------
  1 | #' Classify New Embedded Cells 
  2 | #'
  3 | #' Classify new embedded cells using GF-ICF transformation and K-nn algorithm.
  4 | #' Existing cells are used as training set.
  5 | #' 
  6 | #' @param data list; GFICF object
  7 | #' @param classes chareachters; Classes of already existing cells in the order of they are in colnames(data$gficf).
  8 | #' @param k integer; Number of K-nn to use for classification. Odd number less than 30 are preferred.
  9 | #' @param seed integer; Initial seed to use.
 10 | #' @param knn_method string; a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'.
 11 | #' @param knn_weights_fun value; there are various ways of specifying the kernel function. NULL value (default) correspond to unweighted KNN algorithm. See the details section of KernelKnn package for more values.
 12 | #' @param nt numeric; Number of thread to use (default is 0, i.e. all possible CPUs - 1)
 13 | #' @return A dataframe containing cell id and predicted classes.
 14 | #' @importFrom KernelKnn KernelKnn
 15 | #' 
 16 | #' @export
 17 | classify.cells = function(data,classes,k=7,seed=18051982,knn_method="euclidean",knn_weights_fun=NULL,nt=0)
 18 | {
 19 |   if (is.null(data$embedded.predicted)) {stop("Please embed first new cells!")}
 20 |   if (nt==0) {nt = ifelse(detectCores()>1,detectCores()-1,1)}
 21 |   set.seed(seed)
 22 |   if(!is.factor(classes)) {classes = factor(as.character(classes))}
 23 |   
 24 |   res = KernelKnn(data = data$embedded[,c(1,2)], TEST_data = data$embedded.predicted[,c(1,2)], y = as.numeric(classes), k = k ,h = 1,method = knn_method, knn_weights_fun, threads = nt, regression = F,Levels = 1:length(levels(classes)))
 25 |   colnames(res) = levels(classes)
 26 |   res <- apply(res, 1, function(x) {i = which.max(x); return(data.frame(predicted.class=names(x)[i],class.prob=x[i],stringsAsFactors = F))} )
 27 |   res <- do.call("rbind",res)
 28 |   rownames(res) <- NULL
 29 |   #res = class::knn(data$embedded[,c(1,2)],data$embedded.predicted[,c(1,2)],classes,k = k,prob = T)
 30 |   data$embedded.predicted$predicted.class <- res$predicted.class
 31 |   data$embedded.predicted$class.prob <- res$class.prob
 32 |   return(data)
 33 | }
 34 | 
 35 | #' Embed new cells in an existing space 
 36 | #'
 37 | #' This function embed new cells in an already existing space. For now it supports only UMAP and t-UMAP. Briefly new cells are first normalized with GF-ICF method but using as ICF weigth estimated on the existing cells and than projected in the existing PCA/NMF space before to be embedded in the already existing UMAP space via umap_transform function. 
 38 | #' 
 39 | #' @param data list; GFICF object
 40 | #' @param x Matrix; UMI counts matrix of cells to embedd.
 41 | #' @param nt integer; Number of thread to use (default 2).
 42 | #' @param seed integer; Initial seed to use.
 43 | #' @param normalize boolean; If counts must be normalized before to be rescaled with GFICF.
 44 | #' @param verbose boolean; Icrease verbosity.
 45 | #' @return The updated gficf object.
 46 | #' @import Matrix
 47 | #' @import uwot
 48 | #' @importFrom edgeR DGEList calcNormFactors cpm
 49 | #' @importFrom Rtsne Rtsne
 50 | #' @import RcppML
 51 | #' 
 52 | #' @export
 53 | scMAP = function(data,x,nt=2,seed=18051982, normalize=TRUE,verbose=TRUE)
 54 | {
 55 |   if(data$reduction=="tsne") {stop("Not supported with t-SNE reduction!!")}
 56 |   if(length(intersect(rownames(data$gficf),rownames(x)))==0) {stop("No common genes between the two dataset! Please check if gene identifiers beween two dataset correspond")}
 57 |   
 58 |   tsmessage("Gene filtering..",verbose = verbose)
 59 |   g = union(rownames(filter_genes_cell2loc_style(data = x,data$param$cell_count_cutoff,data$param$cell_percentage_cutoff2,data$param$nonz_mean_cutoff)),rownames(data$gficf))
 60 |   x = x[rownames(x)%in%g,]
 61 |   rm(g)
 62 |   
 63 |   if (normalize){
 64 |     tsmessage("Normalize counts..",verbose = verbose)
 65 |     x <- Matrix::Matrix(edgeR::cpm(edgeR::calcNormFactors(edgeR::DGEList(counts=x),normalized.lib.sizes = T)),sparse = T)
 66 |   }
 67 |   
 68 |   x = tf(x,verbose=verbose)
 69 |   x = idf(x,w = data$w,verbose=verbose)
 70 |   x = t(l.norm(t(x),norm = "l2",verbose=verbose))
 71 |   
 72 |   if(!is.null(data$pca$odgenes) & data$pca$rescale)
 73 |   {
 74 |     x = Matrix::Matrix(data = x,sparse = T)
 75 |     x@x <- x@x*rep(data$pca$odgenes[colnames(x),'gsf'],diff(x@p))
 76 |   }
 77 |   
 78 |   if (data$pca$type=="NMF") {
 79 |     cells = colnames(x)
 80 |     x = t(predict.nmf(w = data$pca$genes,data = x[rownames(data$pca$genes),]))
 81 |     rownames(x) = cells
 82 |   } else {
 83 |     x = t(x[rownames(data$pca$genes),]) %*% data$pca$genes
 84 |   }
 85 | 
 86 |   if(data$reduction%in%c("tumap","umap")) {
 87 |     data$embedded.predicted = as.data.frame(uwot::umap_transform(as.matrix(x),data$uwot,verbose = verbose))
 88 |     rownames(data$embedded.predicted) = rownames(x)
 89 |     colnames(data$embedded.predicted) = c("X","Y")
 90 |   }
 91 |   
 92 |   data$pca$pred = x;rm(x);gc()
 93 |   return(data)
 94 | }
 95 | 
 96 | predict.nmf = function (w, data, L1 = 0, L2 = 0, mask = NULL, ...) 
 97 | {
 98 |   m <- new("nmf", w = w, d = rep(1:ncol(w)), h = matrix(0,nrow = ncol(w), 1))
 99 |   RcppML::predict(m, data, L1 = L1, L2 = L2, mask = mask, ...)
100 | }
101 | 
102 | 


--------------------------------------------------------------------------------
/R/cellQC.R:
--------------------------------------------------------------------------------
  1 | #' Cell QC 
  2 | #'
  3 | #' Filter Cells with low gene ratio detection and high MT ratio.
  4 | #' Loess and GAM regression are used to fit relationships between the number of UMI and either the ratio of detected genes or the MT ratio.
  5 | #' 
  6 | #' @param counts Matrix; Raw counts matrix
  7 | #' @param organism characters; Organism (supported human and mouse).
  8 | #' @param plot boolean; If regression plots must be showed.
  9 | #' @param verbose boolean; Increase verbosity.
 10 | #' @param minUMI numeric; Minimium number of UMI per cell (default 800).
 11 | #' @return The updated gficf object.
 12 | #' @import AnnotationHub
 13 | #' @import ensembldb
 14 | #' 
 15 | #' @export
 16 | filterCells = function(counts,organism,plot=F,verbose=T,minUMI=800) {
 17 |   organism = ifelse(tolower(organism) == "human","Homo sapiens","Mus musculus")
 18 |   organism = base::match.arg(arg = organism,choices = c("Homo sapiens","Mus musculus"),several.ok = F)
 19 |   
 20 |   metadata = data.frame(cell.id = colnames(counts),nUMI=armaColSum(counts),nGenes=armaColSum(counts!=0),stringsAsFactors = F)
 21 |   rownames(metadata) = metadata$cell.id
 22 |   metadata$geneRatio = metadata$nGenes/metadata$nUMI
 23 |   
 24 |   tsmessage("... Filtering Cells by minUMI",verbose = verbose)
 25 |   b = nrow(metadata)
 26 |   metadata <- subset(metadata,nUMI>=minUMI)
 27 |   counts <- counts[,colnames(counts)%in%metadata$cell.id]
 28 |   a = nrow(metadata)
 29 |   tsmessage(paste0("Cells passing the coverage filter ",a," out of ",b," (",round(a/b*100,2),")"),verbose = verbose)
 30 |   
 31 |   tsmessage("... Retrieving gene annotation",verbose = verbose)
 32 |   ah <- AnnotationHub()
 33 |   # Access the Ensembl database for organism
 34 |   ahDb <- query(ah,pattern = c(organism,"EnsDb"), ignore.case = TRUE)
 35 |   # Acquire the latest annotation files
 36 |   id <- tail(rownames(mcols(ahDb)),n=1)
 37 |   # Download the appropriate Ensembldb database
 38 |   edb <- ah[[id]]
 39 |   # Extract gene-level information from database
 40 |   annotations <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT"))
 41 |   # Extract IDs for mitochondrial genes
 42 |   mt = annotations$gene_id[annotations$seq_name%in%"MT"]
 43 |   
 44 |   tsmessage("... Filtering Cells by Gene/nUMI ~ log(UMI)",verbose = verbose)
 45 |   tmp = plot.UMIxGene(metadata = metadata,method = "less",fdr.th = .1,plot = plot,family = "loess")
 46 |   metadata$covFilter = !tmp$toremove
 47 |   a = sum(metadata$covFilter)
 48 |   b = nrow(metadata)
 49 |   tsmessage(paste0("Cells passing the coverage filter ",a," out of ",b," (",round(a/b*100,2),")"),verbose = verbose)
 50 |   
 51 |   if (sum(mt %in% rownames(counts))==0) {
 52 |     tsmessage("No mitochondrial genes found! MT filter will be not applied.")
 53 |     metadata$mtFilter = TRUE
 54 |   } else {
 55 |     # Number of UMIs assigned to mitochondrial genes
 56 |     metadata$mtUMI <- Matrix::colSums(counts[which(rownames(counts) %in% mt),], na.rm = T)
 57 |     
 58 |     # Calculate of mitoRatio per cell
 59 |     metadata$mitoRatio <- metadata$mtUMI/metadata$nUMI
 60 |     tsmessage("... Filtering Cells by mtRatio ~ log(UMI)")
 61 |     tmp = plot.UMIxMT(metadata,method="greater",plot = plot,family = "loess")
 62 |     metadata$mtFilter = !tmp$toremove
 63 |     a = sum(metadata$mtFilter)
 64 |     tsmessage(paste0("Cells passing the MT filter ",a," out of ",b," (",round(a/b*100,2),")"),verbose = verbose)
 65 |   }
 66 |   data = list()
 67 |   data$counts = counts[,metadata$cell.id[metadata$covFilter & metadata$mtFilter]];
 68 |   rm(counts);gc()
 69 |   data$QC.metadata = metadata[colnames(data$counts),]
 70 |   data$ann.hub.id = id
 71 |   return(data)
 72 | }
 73 | 
 74 | #'
 75 | #' @import ggplot2
 76 | #' @importFrom MASS fitdistr
 77 | #' @importFrom mgcv gam
 78 | #' 
 79 | plot.UMIxMT = function(metadata,method="greater",fdr.th=0.1,plot=F,family="gam")
 80 | {
 81 |   if (family == "loess") {fit <- stats::loess(formula = mitoRatio ~ log(nUMI),data = metadata,span = 1, degree = 1,family = "gaussian")}
 82 |   if (family == "poly") {fit <- stats::glm(formula = mitoRatio ~ poly(log(nUMI),degree = 2,raw = T),data = metadata)}
 83 |   if (family == "gam") {fit <- mgcv::gam(mitoRatio ~ log(nUMI), data = metadata)}
 84 |   metadata$ypred = predict(fit,metadata)
 85 |   metadata$diff = metadata$mitoRatio - metadata$ypred
 86 |   fitDist = MASS::fitdistr(metadata$diff, "normal")
 87 |   
 88 |   if (method == "two.sided") {
 89 |     metadata$p.val[metadata$diff>0] = pnorm(metadata$diff[metadata$diff>0], mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)
 90 |     metadata$p.val[metadata$diff<0] = 1 - pnorm(metadata$diff[metadata$diff<0], mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)
 91 |   }
 92 |   
 93 |   if (method == "greater") {metadata$p.val = stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)}
 94 |   if (method == "less") {metadata$p.val = 1 - stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)}
 95 |   metadata$p.val = stats::p.adjust(metadata$p.val,method = "fdr")
 96 |   metadata$toremove = metadata$p.val<fdr.th;
 97 | 
 98 |   p1 = ggplot(data = metadata,aes(x=log(nUMI),y=mitoRatio)) + geom_point(aes(color=toremove),shape=19,size=.25) + theme_bw() + theme(legend.position = "none")
 99 |   if (family == "gam") {p1 = p1 + stat_smooth(method = gam, formula = y ~ s(x))}
100 |   if (family == "loess") {p1 = p1 + stat_smooth(method = loess, formula = y ~ x,span = 1)}
101 |   if (family == "poly") {p1 = p1 + stat_smooth(method = glm, formula = y ~ poly(x, 2, raw = TRUE))}
102 |   if(plot) {print(p1)}
103 |   
104 |   return(metadata)
105 | }
106 | 
107 | #'
108 | #' @import ggplot2
109 | #' @importFrom MASS fitdistr
110 | #' @importFrom mgcv gam
111 | #' 
112 | plot.UMIxGene = function(metadata,method="less",fdr.th=0.1,plot=F,family="gam")
113 | {
114 |   #fit <- stats::glm(formula = geneRatio ~ poly(log(nUMI), d, raw = TRUE),data = metadata)
115 |   if (family == "loess") {fit <- stats::loess(formula = geneRatio ~ log(nUMI),data = metadata,span = 1, degree = 2,family = "gaussian")}
116 |   if (family == "gam") {fit <- mgcv::gam(geneRatio ~ log(nUMI), data = metadata)}
117 |   metadata$ypred = predict(fit,metadata)
118 |   metadata$diff = metadata$geneRatio - metadata$ypred
119 |   fitDist = MASS::fitdistr(metadata$diff, "normal")
120 |   
121 |   if (method == "two.sided") {
122 |     metadata$p.val[metadata$diff>0] = stats::pnorm(metadata$diff[metadata$diff>0], mean = 0, sd = fitDist$estimate[2], lower.tail = FALSE)
123 |     metadata$p.val[metadata$diff<0] = 1 - stats::pnorm(metadata$diff[metadata$diff<0], mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)
124 |   }
125 |   if (method == "greater") {metadata$p.val = stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)}
126 |   if (method == "less") {metadata$p.val = 1 - stats::pnorm(metadata$diff, mean = fitDist$estimate[1], sd = fitDist$estimate[2], lower.tail = FALSE)}
127 |   
128 |   metadata$p.val = p.adjust(metadata$p.val,method = "fdr")
129 |   metadata$toremove = metadata$p.val<fdr.th;
130 |   
131 |   p1 = ggplot(data = metadata,aes(x=log(nUMI),y=geneRatio)) + geom_point(aes(color=toremove),shape=19,size=.25) + theme_bw() + theme(legend.position = "none")
132 |   if (family == "gam") {p1 = p1 + stat_smooth(method = gam, formula = y ~ s(x))}
133 |   if (family == "loess") {p1 = p1 + stat_smooth(method = loess, formula = y ~ x,span = 1)}
134 |   if(plot) {print(p1)}
135 | 
136 |   return(metadata)
137 | }
138 | 
139 | 


--------------------------------------------------------------------------------
/R/clustCells.R:
--------------------------------------------------------------------------------
  1 | #' PhenoGraph clustering
  2 | #' 
  3 | #' R implementation of the PhenoGraph algorithm
  4 | #' 
  5 | #' A custom R implementation of the PhenoGraph (http://www.cell.com/cell/abstract/S0092-8674(15)00637-6) algorithm, 
  6 | #' which is a clustering method designed for high-dimensional single-cell data analysis. It works by creating a graph ("network") representing 
  7 | #' phenotypic similarities between cells by calclating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities 
  8 | #' using the well known Louvain method (https://sites.google.com/site/findcommunities/) in this graph. 
  9 | #' 
 10 | #' That version used PCA or LSA reduced meta-cells and multithreading annoy version for K-nn search (from uwot package).  
 11 | #' 
 12 | #' @param data list; Input data (gficf object)
 13 | #' @param from.embedded logical; Use embeddedd (UMAP or tSNA) space for clustering cells. Best results are usually obtained not using the embedded space.
 14 | #' @param k integer; number of nearest neighbours (default:15)
 15 | #' @param dist.method character; Dist to use for K-nn. Type of distance metric to use to find nearest neighbors. One of:
 16 | #' \itemize{
 17 | #'   \item \code{"euclidean"} (the default)
 18 | #'   \item \code{"cosine"}
 19 | #'   \item \code{"manhattan"}
 20 | #'   \item \code{"hamming"} (very slow)
 21 | #' }
 22 | #' @param  nt integer; Number of cpus to use for k-nn search
 23 | #' @param community.algo characthers; Community algorithm to use for clustering. Supported are:
 24 | #' \itemize{
 25 | #'   \item \code{"louvain"} (the default, the original louvain method)
 26 | #'   \item \code{"louvain 2"} (louvain with modularity optimization from Seurat)
 27 | #'   \item \code{"louvain 3"} (Louvain algorithm with multilevel refinement from Seurat)
 28 | #'   \item \code{"leiden"} (Leiden algorithm see Traag et al. 2019)
 29 | #'   \item \code{"walktrap"}
 30 | #'   \item \code{"fastgreedy"}
 31 | #' }
 32 | #' @param store.graph logical; Store produced phenograph in the gficf object
 33 | #' @param seed integer; Seed to use for replication.
 34 | #' @param verbose logical; Increase verbosity.
 35 | #' @param resolution Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities (used only for leiden and louvain 2 or 3 methods).
 36 | #' @param n.start Number of random starts (used only for louvain 2 or 3 methods).
 37 | #' @param n.iter Maximal number of iterations per random start (used only for louvain 2 or 3 methods).
 38 | #' @return the updated gficf object
 39 | #' @importFrom  igraph graph.data.frame simplify cluster_louvain walktrap.community fastgreedy.community membership as_adj
 40 | #' @importFrom RcppParallel setThreadOptions RcppParallelLibs
 41 | #' @importFrom reticulate py_module_available
 42 | #' @importFrom leiden leiden
 43 | #' @import uwot
 44 | #' @import Matrix
 45 | #' @export
 46 | clustcells <- function(data,from.embedded=F,k=15,dist.method="manhattan",nt=2,community.algo="louvain 3",store.graph=T,seed=180582,verbose=TRUE, resolution = 0.25, n.start = 50, n.iter = 250)
 47 | {
 48 |   community.algo = base::match.arg(arg = community.algo,choices = c("louvain","louvain 2","louvain 3","walktrap","fastgreedy","leiden"),several.ok = F)
 49 |   
 50 |   if (is.null(data$embedded)) {stop("Run first runReduction function")}
 51 |   set.seed(seed)
 52 |   tsmessage("Finding Neighboors..",verbose = verbose)
 53 |   
 54 |   if (from.embedded)
 55 |   {
 56 |     if(is.null(data$embedded)) {stop("First run runReduction to embed your cells")}
 57 |     neigh = uwot:::find_nn(as.matrix(data$embedded[,c(1,2)]),k=k+1,include_self = T,n_threads = nt,verbose = TRUE,method = "annoy",metric=dist.method)$idx
 58 |   } else {
 59 |     if(is.null(data$pca)) {stop("First run runPCA or runLSA to reduce dimensionality")}
 60 |     neigh = uwot:::find_nn(data$pca$cells,k=k+1,include_self = T,n_threads = nt,verbose = verbose,method = "annoy",metric=dist.method)$idx
 61 |   }
 62 |   
 63 |   neigh = neigh[,-1]
 64 |   RcppParallel::setThreadOptions(numThreads = nt)
 65 |   relations <- rcpp_parallel_jaccard_coef(neigh,verbose)
 66 |   relations <- relations[relations[,3]>0, ]
 67 |   relations <- as.data.frame(relations)
 68 |   colnames(relations)<- c("from","to","weight")
 69 |   g <- igraph::graph.data.frame(relations, directed=FALSE)
 70 |   rm(relations,neigh);gc()
 71 |   
 72 |   if (community.algo=="louvain")
 73 |   {
 74 |     tsmessage("Performing louvain...",verbose = verbose)
 75 |     community <- igraph::cluster_louvain(g)
 76 |   }
 77 |   
 78 |   if (community.algo=="louvain 2")
 79 |   {
 80 |     tsmessage("Performing louvain with modularity optimization...",verbose = verbose)
 81 |     community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,1,n.start,n.iter,seed,verbose)
 82 |   }
 83 |   
 84 |   if (community.algo=="louvain 3")
 85 |   {
 86 |     tsmessage("Performing louvain with modularity optimization...",verbose = verbose)
 87 |     community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,2,n.start,n.iter,seed,verbose)
 88 |   }
 89 |   
 90 |   if (community.algo=="walktrap")
 91 |   {
 92 |     tsmessage("Performing walktrap...",verbose = verbose)
 93 |     community <- igraph::walktrap.community(g)
 94 |   } 
 95 |   if (community.algo=="fastgreedy") {
 96 |     tsmessage("Performing fastgreedy...",verbose = verbose)
 97 |     community <- igraph::fastgreedy.community(g)
 98 |   }
 99 |   
100 |   if (community.algo=="leiden")
101 |   {
102 |     if (!(reticulate::py_module_available("leidenalg") && reticulate::py_module_available("igraph")))
103 |       stop("Cannot find Leiden algorithm, please install through pip (e.g. sudo -H pip install leidenalg igraph).")
104 |     
105 |     tsmessage("Performing leiden...",verbose = verbose)
106 |     community <- leiden::leiden(object = g,resolution_parameter=resolution)
107 |   }
108 |   
109 |   
110 |   if(community.algo %in% c("louvain 2","louvain 3","leiden")) {
111 |     if(community.algo %in% c("louvain 2","louvain 3")) {community = community + 1}
112 |     data$embedded$cluster = as.character(community)
113 |   } else {
114 |     data$embedded$cluster <- as.character(igraph::membership(community))
115 |   }
116 |   
117 |   if (store.graph) {data$community=community;data$cell.graph=g} else {data$community=community}
118 |   
119 |   # get centroid of clusters
120 |   tsmessage("Computing Cluster Signatures...",verbose = verbose)
121 |   cluster.map = data$embedded$cluster
122 |   u = base::unique(cluster.map)
123 |   data$cluster.gene.rnk = base::sapply(u, function(x,y=data$gficf,z=cluster.map) armaRowSum(y[,z%in%x]))
124 |   
125 |   tsmessage(paste("Detected Clusters:",length(unique(data$embedded$cluster))),verbose = verbose)
126 |   
127 |   return(data)
128 | }
129 | 
130 | # Runs the modularity optimizer (C++ function from seurat package https://github.com/satijalab/seurat)
131 | #
132 | # @param SNN SNN matrix to use as input for the clustering algorithms
133 | # @param modularity Modularity function to use in clustering (1 = standard; 2 = alternative)
134 | # @param resolution Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities
135 | # @param algorithm Algorithm for modularity optimization (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM algorithm; 4 = Leiden algorithm). Leiden requires the leidenalg python module.
136 | # @param n.start Number of random starts
137 | # @param n.iter Maximal number of iterations per random start
138 | # @param random.seed Seed of the random number generator
139 | # @param print.output Whether or not to print output to the console
140 | # @param temp.file.location Deprecated and no longer used
141 | # @param edge.file.name Path to edge file to use
142 | #
143 | # @return clusters
144 | #
145 | #' @importFrom utils read.table write.table
146 | #
147 | RunModularityClustering <- function(SNN = matrix(), modularity = 1, resolution = 0.8, algorithm = 1, n.start = 10, n.iter = 10, random.seed = 0, print.output = TRUE, temp.file.location = NULL, edge.file.name = "") 
148 | {
149 |   clusters <- RunModularityClusteringCpp(SNN,modularity,resolution,algorithm,n.start,n.iter,random.seed,print.output,edge.file.name)
150 |   return(clusters)
151 | }
152 | 
153 | 
154 | #' Cell clustering by pathway's activity
155 | #' 
156 | #' Cell clustering using pathway expression instead of gene expression
157 | #' 
158 | #' Cells are clustered using the estimated pathway activity levels by scGSEA() function.
159 | #' Cells are clustered either by using PhenoGraph algorithm or hierarchical clustering. In the case of PhenoGraph method is used
160 | #' identified clusters with louvain method are stored into data$embedded$cluster.by.scGSEA while the prduced graph into data$scgsea$cell.graph. 
161 | #' In case hierarchical clusteringis used the resulting dendogram is stored into data$scgsea$h.dendo.
162 | #' 
163 | #' 
164 | #' @param data list; Input data (gficf object)
165 | #' @param method character; Method used to produce the cell network, such as phenograph or fgraph.
166 | #' @param pca numeric; If different from NULL data are reduced using pca component before to apply clustering algorithm.
167 | #' @param k integer; number of nearest neighbours (default:10).
168 | #' @param resolution Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
169 | #' @param n.start Number of random starts.
170 | #' @param n.iter Maximal number of iterations per random start.
171 | #' @param nt integer; Number of cpus to use for k-nn search. If zero all cpu are used.
172 | #' @param seed integer; Seed to use for replication.
173 | #' @param verbose logical; Increase verbosity.
174 | #' @return the updated gficf object
175 | #' @importFrom  igraph graph.data.frame graph.adjacency
176 | #' @importFrom RcppParallel setThreadOptions RcppParallelLibs
177 | #' @import uwot
178 | #' @importFrom irlba irlba
179 | #' @import Matrix
180 | #' @export
181 | clustcellsBYscGSEA <- function(data,method="fgraph",pca=NULL,k=10, resolution = 0.25, n.start = 50, n.iter = 250, nt=0, verbose=T, seed=180582)
182 | {
183 |   if (is.null(data$scgsea)) {stop("Run first runScGSEA function")}
184 |   method = base::match.arg(arg = method,choices = c("phenograph","fgraph"),several.ok = F)
185 |   
186 |   nt=detectCores()
187 |   
188 |   if (method=="phenograph") {
189 |     x = data$scgsea$x
190 |     
191 |     tsmessage("Finding Neighboors..",verbose = verbose)
192 |     
193 |     if (!is.null(pca)) {
194 |       x <- irlba::irlba(A = x,nv=pca,center = Matrix::rowMeans(t(x)))
195 |       x <- x$u %*% diag(x$d)
196 |       rownames(x) <- rownames(data$scgsea$x)
197 |     }
198 |     
199 |     if (ncol(x)>100) {
200 |       neigh = uwot:::find_nn(x,k=k,include_self = F,n_threads = nt,verbose = verbose,method = "annoy",metric="manhattan",n_trees = 100)$idx
201 |     } else {
202 |       neigh = uwot:::find_nn(x,k=k,include_self = F,n_threads = nt,verbose = verbose,method = "fnn",metric="manhattan")$idx
203 |     }
204 |     rm(x)
205 |     
206 |     RcppParallel::setThreadOptions(numThreads = nt)
207 |     relations <- rcpp_parallel_jaccard_coef(neigh,verbose)
208 |     relations <- relations[relations[,3]>0, ]
209 |     relations <- as.data.frame(relations)
210 |     colnames(relations)<- c("from","to","weight")
211 |     g <- igraph::graph.data.frame(relations, directed=FALSE)
212 |     rm(relations,neigh);gc()
213 |     
214 |     tsmessage("Performing louvain with modularity optimization...",verbose = verbose)
215 |     community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,2,n.start,n.iter,seed,verbose)
216 |     community = community + 1
217 |     data$embedded$cluster.by.scGSEA = as.character(community)
218 |     data$scgsea$cell.graph=g; rm(g)
219 |     
220 |     # get centroid of clusters
221 |     tsmessage("Computing Cluster Signatures...",verbose = verbose)
222 |     cluster.map = data$embedded$cluster.by.scGSEA
223 |     u = base::unique(cluster.map)
224 |     data$scgsea$cluster.gsea.mu = base::sapply(u, function(x,y=t(data$scgsea$x),z=cluster.map) Matrix::rowMeans(y[,z%in%x]))
225 |     tsmessage(paste("Detected Clusters:",length(unique(data$embedded$cluster))),verbose = verbose)
226 |   }
227 |   
228 |   if (method=="fgraph") {
229 |     g = uwot::umap(X = as.matrix(data$scgsea$x),n_neighbors = k,metric = "manhattan",scale = "Z",n_trees = 100,pca = pca,ret_extra = "fgraph",n_threads = nt,verbose = verbose)$fgraph
230 |     g <- igraph::graph.adjacency(adjmatrix = g, mode = "undirected",weighted = TRUE,diag = F)
231 |     
232 |     tsmessage("Performing louvain with modularity optimization...",verbose = verbose)
233 |     community <- RunModularityClustering(igraph::as_adjacency_matrix(g,attr = "weight",sparse = T),1,resolution,2,n.start,n.iter,seed,verbose)
234 |     community = community + 1
235 |     data$embedded$cluster.by.scGSEA = as.character(community)
236 |     data$scgsea$cell.graph=g; rm(g)
237 |     
238 |     # get centroid of clusters
239 |     tsmessage("Computing Cluster Signatures...",verbose = verbose)
240 |     cluster.map = data$embedded$cluster.by.scGSEA
241 |     u = base::unique(cluster.map)
242 |     data$scgsea$cluster.gsea.mu = base::sapply(u, function(x,y=t(data$scgsea$x),z=cluster.map) Matrix::rowMeans(y[,z%in%x]))
243 |     tsmessage(paste("Detected Clusters:",length(unique(data$embedded$cluster))),verbose = verbose)
244 |   }
245 |   
246 |   gc()
247 |   return(data)
248 | }
249 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' Small Single cell breast cancer atlas
 2 | #'
 3 | #' A subset of data from the the Single cell breast cancer atlas
 4 | #' Only 150 cells per cell-line are used
 5 | #'
 6 | #' @format ## `small_BC_atlas`
 7 | #' A Matrix of 4,760 cells
 8 | #' @source <https://www.nature.com/articles/s41467-022-29358-6>
 9 | "small_BC_atlas"
10 | 
11 | #' Test set for scMAP method
12 | #'
13 | #' A subset of data from the the Single cell breast cancer atlas
14 | #' Only 30 cells per cell-line are used. They do not ovrlapp with cells into small_BC_atlas dataset
15 | #'
16 | #' @format ## `test_BC_atlas`
17 | #' A Matrix of 930 cells
18 | #' @source <https://www.nature.com/articles/s41467-022-29358-6>
19 | "test_BC_atlas"


--------------------------------------------------------------------------------
/R/deGenes.R:
--------------------------------------------------------------------------------
  1 | #' Find Marker Genes from cell clusters.
  2 | #'
  3 | #' Try to identify marker genes across clusters performing Mann-Whitney U test.
  4 | #' DE genes are identified the expression in each cluster versus the all the other.
  5 | #'
  6 | #' @param data list; GFICF object
  7 | #' @param nt integer; Number of thread to use (default 2).
  8 | #' @param hvg boolean; Use only High Variable Genes (default is TRUE).
  9 | #' @param verbose boolean; Icrease verbosity.
 10 | #' @return The updated gficf object.
 11 | #' @import Matrix
 12 | #' @importFrom RcppParallel setThreadOptions
 13 | #' 
 14 | #' @export
 15 | findClusterMarkers = function(data,nt=2,hvg=T,verbose=T)
 16 | {
 17 |   if (is.null(data$community)) {stop("Please identify cluster first! Run clustcells function.")}
 18 |   if (is.null(data$rawCounts)) {stop("No raw/normalized counts stored. You should have run gficf normalization with storeRaw = T")}
 19 |   
 20 |   RcppParallel::setThreadOptions(numThreads = nt)
 21 |   
 22 |   u = unique(data$embedded$cluster)
 23 |   
 24 |   # Normalize if were not
 25 |   cpms = normCounts(data$rawCounts,doc_proportion_max = 2,doc_proportion_min = 0,normalizeCounts = !data$param$normalized,verbose=verbose)
 26 |   
 27 |   if (hvg)
 28 |   {
 29 |     tsmessage("... Detecting HVGs")
 30 |     df = findVarGenes(cpms,fitMethod = "locfit",verbose = verbose)
 31 |     df = df[order(df$FDR),]
 32 |     ix = df$FDR<.1
 33 |     tsmessage(paste("... Detected",sum(ix),"significant HVGs with an FDR < 10%"))
 34 |     if(sum(ix)<=1000) {ix = 1:min(1000,nrow(df))} 
 35 |     cpms = cpms[df$gene[ix],]
 36 |     rm(df)
 37 |   }
 38 |   
 39 |   cpms = as.matrix(cpms)
 40 |   
 41 |   tsmessage("... Start identify marker genes")
 42 |   res = NULL
 43 |   #progress_for(n=0,tot = length(u),display = T)
 44 |   for (i in 1:length(u)) {
 45 |     cells.1 = which(data$embedded$cluster%in%u[i])
 46 |     cells.2 = which(!data$embedded$cluster%in%u[i])
 47 |     tmp = rcpp_parallel_WMU_test(matX = cpms[,cells.1],matY = cpms[,cells.2],printOutput = F)
 48 |     tmp = data.frame(ens=rownames(cpms),log2FC=tmp[,2],p.value=tmp[,1],fdr=p.adjust(tmp[,1],method = "fdr"),stringsAsFactors = F)
 49 |     tmp = subset(tmp,fdr<.05 & log2FC>0)
 50 |     tmp = tmp[order(tmp$fdr,decreasing = F),]
 51 |     tmp$cluster = u[i]
 52 |     res = rbind(res,tmp)
 53 |     #progress_for(n=i,tot = length(u),display = T)
 54 |   }
 55 |   
 56 |   res = res[order(res$log2FC,decreasing = T),]
 57 |   rownames(res) = NULL
 58 |   data$de.genes = res
 59 |   return(data)
 60 | }
 61 | 
 62 | #'  Find high variable genes following the approach
 63 | #'  proposed by Chen et al. in BMC Genomics (2016)
 64 | #'  Code adapted from https://github.com/hillas/scVEGs
 65 | #' @param data list; GFICF object
 66 | #' @param fitMethod charachter; Method to use to fit variance and mean expression relationship (loess or locfit).
 67 | #' @param verbose boolean; Increase verbosity.
 68 | #' @import Matrix
 69 | #' @importFrom locfit locfit locfit.robust lp
 70 | #' @importFrom MASS fitdistr
 71 | #' 
 72 | findVarGenes = function(data,fitMethod="locfit",verbose=T)
 73 | {
 74 |   m <- dim(data)[1]
 75 |   std <- apply(data, 1, stats::sd)
 76 |   avg <- Matrix::rowMeans(data)
 77 |   cv <- std / avg
 78 |   # over dispersion sigma  (var = u(1 + u * sigma^2))
 79 |   xdata <- (avg)
 80 |   ydata <- log10(cv)
 81 |   xdata <- xdata[is.na(ydata) != "TRUE"]
 82 |   ydata <- ydata[is.na(ydata) != "TRUE"]
 83 |   
 84 |   if (fitMethod=="loess"){
 85 |     fitLoc <- stats::loess(formula = ydata ~ log10(x = xdata),data = data.frame("ydata"=ydata,"xdata"=xdata,stringsAsFactors = F),span = 0.8)
 86 |   } else {
 87 |     fitLoc <- locfit::locfit.robust(ydata ~ locfit::lp(log10(xdata), nn = .2))
 88 |   }
 89 |   
 90 |   xSeq <- seq(min(log10(xdata)), max(log10(xdata)), 0.005)
 91 |   gapNum <- matrix(0, length(xSeq), 1)
 92 |   for(i in 1:length(xSeq)) {
 93 |     cdx <- which((log10(xdata) >= xSeq[i] - 0.05) & (log10(xdata) < xSeq[i] + 0.05))
 94 |     gapNum[i,1] <- length(cdx)
 95 |   }
 96 |   cdx <- which(gapNum > m*0.005)
 97 |   xSeq <- 10 ^ xSeq
 98 |   ySeq <- predict(fitLoc,log10(xSeq))
 99 |   yDiff <- diff(ySeq)
100 |   ix <- which(yDiff > 0 & log10(xSeq[-1]) > 0)
101 |   if(length(ix) == 0)
102 |     ix <- length(ySeq) - 1
103 |   xSeq_all <- 10^seq(min(log10(xdata)), max(log10(xdata)), 0.001)
104 |   xSeq <- xSeq[cdx[1]:ix[1] + 1]
105 |   ySeq <- ySeq[cdx[1]:ix[1] + 1]
106 |   
107 |   b <- 1
108 |   a <- 0
109 |   df <- data.frame(x=xSeq, y = ySeq)
110 |   fit = stats::nls(y ~ 0.5 * log10(b / x + a), data = df, start=list(b = b,a = a), stats::nls.control(maxiter = 500), na.action =  'na.exclude')
111 |   newdf <- data.frame(x = xSeq_all)
112 |   ydataFit <- stats::predict(fit,newdata = newdf)
113 |   
114 |   # Calculate CV difference
115 |   logX <- log10(xdata)
116 |   
117 |   logXseq <- log10(xSeq_all)
118 |   cvDist <- matrix(0,length(xdata),1)
119 |   
120 |   #progress_for(n=0,tot = length(logX),display = verbose)
121 | 
122 |   for (i in 1:length(logX)) 
123 |   {
124 |     cx <- which(logXseq >= logX[i] - 0.2 & logXseq < logX[i] + 0.2)
125 |     tmp <- sqrt((logXseq[cx] - logX[i])^2 + (ydataFit[cx] - ydata[i])^2)
126 |     tx <- which.min(tmp)
127 |     
128 |     if(logXseq[cx[tx]] > logX[i]) {
129 |       if(ydataFit[cx[tx]] > ydata[i]) {
130 |         cvDist[i] <- -1*tmp[tx]
131 |       } else {
132 |         cvDist[i] <- tmp[tx]
133 |       }
134 |       cvDist[i] <- -1*tmp[tx]
135 |     } else if (logXseq[cx[tx]] <= logX[i]) {
136 |       if(ydataFit[cx[tx]] < ydata[i]) {
137 |         cvDist[i] <- tmp[tx]
138 |       } else {
139 |         cvDist[i] <- -1*tmp[tx]
140 |       }
141 |     }
142 |     
143 |     #progress_for(n=i,tot = length(logX),display = verbose)
144 |     
145 |   }
146 |   
147 |   cvDist <- log2(10^cvDist)
148 |   
149 |   # use kernel density estimate to find the peak
150 |   dor <- stats::density(cvDist, kernel = "gaussian")
151 |   distMid <-dor$x[which.max(dor$y)]
152 |   dist2 <- cvDist - distMid
153 |   tmpDist <- c(dist2[dist2 <= 0], abs(dist2[dist2 < 0])) + distMid
154 |   distFit <- MASS::fitdistr(tmpDist, "normal")
155 |   
156 |   res = data.frame(gene=rownames(data),
157 |                   mean=avg,
158 |                   "cv"=cv,
159 |                   P=pnorm(cvDist, mean = distFit$estimate[1], sd = distFit$estimate[2], lower.tail = FALSE)
160 |   )
161 |   res$FDR <- stats::p.adjust(res$P, 'fdr')
162 |   
163 |   return(res)
164 | }
165 | 
166 | 


--------------------------------------------------------------------------------
/R/dimensinalityReduction.R:
--------------------------------------------------------------------------------
  1 | #' Non-Negative Matrix Factorization (NMF) 
  2 | #'
  3 | #' Reduce dimensionality of the single cell dataset using Non-Negative Matrix Factorization (NMF)
  4 | #' 
  5 | #' @param data list; GFICF object
  6 | #' @param dim integer; Number of dimension which to reduce the dataset.
  7 | #' @param centre logical; Centre gficf scores before applying reduction (increase separation).
  8 | #' @param randomized logical; Use randomized (faster) version for matrix decomposition (default is TRUE).
  9 | #' @param seed integer; Initial seed to use.
 10 | #' @param use.odgenes boolean; Use only significant overdispersed genes respect to ICF values.
 11 | #' @param n.odgenes integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000.
 12 | #' @param plot.odgenes boolean; Show significant overdispersed genes respect to ICF values.
 13 | #' @param nt numeric; Numbmber of thread to use (default is 0, i.e. all available CPU cores).
 14 | #' @param ... Additional arguments to pass to nfm call (see ?RcppML::nmf).
 15 | #' @return The updated gficf object.
 16 | #' @import RcppML
 17 | #' @import Matrix
 18 | #' 
 19 | #' @export
 20 | runNMF = function(data,dim=NULL,seed=180582,use.odgenes=F,n.odgenes=NULL,plot.odgenes=F, nt=0, ...)
 21 | {
 22 |   if(use.odgenes & is.null(data$rawCounts)) {stop("Raw Counts absent! Please run gficf normalization with storeRaw = T")}
 23 |   
 24 |   if (nt==0) {nt = ifelse(detectCores()>1,detectCores()-1,1)}
 25 |   options(RcppML.threads = nt)
 26 |   
 27 |   if (is.null(dim))
 28 |   {
 29 |     if (is.null(data$dimPCA)) {stop("Specify the number of dims or run computePCADim first")} else {dim=data$dimPCA}
 30 |   } else {
 31 |     data$dimPCA = dim
 32 |   }
 33 |   
 34 |   set.seed(seed)
 35 |   
 36 |   data$pca = list()
 37 | 
 38 |   if(use.odgenes) {
 39 |     overD=suppressWarnings(findOverDispersed(data = data,alpha = 0.1,verbose = F,plot = plot.odgenes))
 40 |     overD$lpa[is.na(overD$lpa)] <- 0
 41 |     odgenes <- rownames(overD[overD$lpa<log(0.1),])
 42 |     if(!is.null(n.odgenes)) {
 43 |       if(n.odgenes>length(odgenes)) {
 44 |         odgenes <- rownames(overD)[(order(overD$lp,decreasing=F)[1:min(nrow(data$gficf),n.odgenes)])]
 45 |       } else {
 46 |         odgenes <- odgenes[1:n.odgenes]
 47 |       }
 48 |     }
 49 |     data$pca$cells = t(data$gficf)[,odgenes]
 50 |     data$pca$odgenes = overD
 51 |     tsmessage("... using ",length(odgenes)," OD genes",verbose = T)
 52 |   } 
 53 |   
 54 |   if (is.null(data$pca$cells)){
 55 |     tsmessage("Performing NFM..")
 56 |     nfm = RcppML::nmf(data$gficf,k = dim, ...)
 57 |   } else {
 58 |     nfm = RcppML::nmf(t(data$pca$cells),k = dim, ...)
 59 |   }
 60 |   
 61 |   data$pca$cells <- t(nfm$h)
 62 |   data$pca$genes <- nfm$w
 63 |   rm(nfm);gc()
 64 |   data$pca$centre <- F # for legacy
 65 |   data$pca$rescale <- F # for legacy
 66 |   data$pca$type = "NMF"
 67 |   data$pca$use.odgenes = use.odgenes
 68 |   
 69 |   if(use.odgenes) {rownames(data$pca$genes)=odgenes} else {rownames(data$pca$genes) = rownames(data$gficf)}
 70 |   rownames(data$pca$cells) = colnames(data$gficf)
 71 |   return(data)
 72 | }
 73 | 
 74 | #' Principal Component Analysis (PCA) 
 75 | #'
 76 | #' Reduce dimensionality of the single cell dataset using Principal Component Analysis (PCA)
 77 | #' 
 78 | #' @param data list; GFICF object
 79 | #' @param dim integer; Number of dimension which to reduce the dataset.
 80 | #' @param centre logical; Centre gficf scores before applying reduction (increase separation).
 81 | #' @param seed integer; Initial seed to use.
 82 | #' @param use.odgenes boolean; Use only significant overdispersed genes respect to ICF values.
 83 | #' @param n.odgenes integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000.
 84 | #' @param plot.odgenes boolean; Show significant overdispersed genes respect to ICF values.
 85 | #' @return The updated gficf object. 
 86 | #' @return The updated gficf object.
 87 | #' @importFrom irlba irlba
 88 | #' 
 89 | #' @export
 90 | runPCA = function(data,dim=NULL,var.scale=F,centre=F,seed=180582,use.odgenes=F,n.odgenes=NULL,plot.odgenes=F)
 91 | {
 92 |   
 93 |   if(use.odgenes & is.null(data$rawCounts)) {stop("Raw Counts absent! Please run gficf normalization with storeRaw = T")}
 94 |   
 95 |   if (is.null(dim))
 96 |   {
 97 |     if (is.null(data$dimPCA)) {stop("Specify the number of dims or run computePCADim first")} else {dim=data$dimPCA}
 98 |   } else {
 99 |     data$dimPCA = dim
100 |   }
101 |   
102 |   set.seed(seed)
103 |   
104 |   data$pca = list()
105 |   data$pca$cells = t(data$gficf)
106 | 
107 |   if(use.odgenes) {
108 |     overD=suppressWarnings(findOverDispersed(data = data,alpha = 0.1,verbose = F,plot = plot.odgenes))
109 |     overD$lpa[is.na(overD$lpa)] <- 0
110 |     odgenes <- rownames(overD[overD$lpa<log(0.1),])
111 |     if(!is.null(n.odgenes)) {
112 |       if(n.odgenes>length(odgenes)) {
113 |         odgenes <- rownames(overD)[(order(overD$lp,decreasing=F)[1:min(nrow(data$gficf),n.odgenes)])]
114 |       } else {
115 |         odgenes <- odgenes[1:n.odgenes]
116 |       }
117 |     }
118 |     data$pca$cells = data$pca$cells[,odgenes]
119 |     data$pca$odgenes = overD
120 |     tsmessage("... using ",length(odgenes)," OD genes",verbose = T)
121 |   }
122 |   
123 |   #x = rsvd::rpca(data$pca$cells,k=dim,center=centre,scale=F,rand=randomized)
124 |   if (centre) {
125 |     x <- irlba::irlba(A = data$pca$cells,nv=dim,center = Matrix::rowMeans(t(data$pca$cells)))
126 |   } else {
127 |     x <- irlba::irlba(A = data$pca$cells,nv=dim)
128 |   }
129 |   
130 |   x$x <- x$u %*% diag(x$d)
131 |   data$pca$cells = x$x
132 |   data$pca$centre <- centre
133 |   data$pca$rescale <- F
134 |   data$pca$genes <- x$v
135 |   data$pca$use.odgenes = use.odgenes
136 |   rm(x); gc()
137 |   data$pca$type = "PCA"
138 |   if(use.odgenes) {rownames(data$pca$genes)=odgenes} else {rownames(data$pca$genes) = rownames(data$gficf)}
139 |   rownames(data$pca$cells) = colnames(data$gficf)
140 |   colnames(data$pca$cells) = colnames(data$pca$genes) = paste("C",1:dim,sep = "")
141 |   return(data)
142 | }
143 |   
144 | #' Dimensionality reduction
145 | #'
146 | #' Run t-SNE or UMAP or t-UMAP dimensionality reduction on selected features from PCA or NMF.
147 | #' See ?umap or ?Rtsne for additional parameter to use. 
148 | #' 
149 | #' @param data list; GFICF object
150 | #' @param reduction characters; Reduction method to use. One of:
151 | #' \itemize{
152 | #'   \item \code{"tsne"}
153 | #'   \item \code{"umap"}
154 | #'   \item \code{"tumap"} (the default)
155 | #' }
156 | #' @param nt integer; Number of thread to use (default 2).
157 | #' @param seed integer; Initial seed to use.
158 | #' @param verbose boolean; Icrease verbosity. 
159 | #' @param ... Additional arguments to pass to Rtsne/umap/tumap call.
160 | #' @return The updated gficf object.
161 | #' @import uwot
162 | #' @importFrom Rtsne Rtsne
163 | #' 
164 | #' @export
165 | runReduction = function(data,reduction="tumap",nt=2,seed=18051982, verbose=T, ...)
166 | {
167 | 
168 |   reduction = base::match.arg(arg = reduction,choices = c("umap","tumap","tsne"),several.ok = F)
169 |   
170 |   set.seed(seed)
171 |   if (!is.null(data$pca))
172 |   {
173 |     if(reduction=="tumap"){
174 |       if (is.null(data$pca$harmony)){
175 |         data$uwot = uwot::tumap(X = data$pca$cells,scale = F,n_threads = nt,verbose = verbose,ret_model = T, ...)
176 |       } else {
177 |         data$uwot = uwot::tumap(X = t(data$pca$harmony$Z_corr),scale = F,n_threads = nt,verbose = verbose,ret_model = T, ...)
178 |       }
179 |       data$embedded = base::as.data.frame(data$uwot$embedding)
180 |     }
181 |     
182 |     if(reduction=="umap"){
183 |       if (is.null(data$pca$harmony)){
184 |         data$uwot = uwot::umap(X = data$pca$cells, scale = F,n_threads = nt,verbose = verbose, ret_model = T, ...)
185 |       } else {
186 |         data$uwot = uwot::umap(X = t(data$pca$harmony$Z_corr), scale = F,n_threads = nt,verbose = verbose, ret_model = T, ...)
187 |       }
188 |       data$embedded = base::as.data.frame(data$uwot$embedding)
189 |     }
190 |     
191 |     if(reduction=="tsne"){
192 |       data$uwot = NULL
193 |       if (is.null(data$pca$harmony)){
194 |         data$embedded = base::as.data.frame(Rtsne::Rtsne(X = data$pca$cells,dims = 2, pca = F,verbose = verbose,max_iter=1000,num_threads=nt, ...)$Y)
195 |       } else {
196 |         data$embedded = base::as.data.frame(Rtsne::Rtsne(X = t(data$pca$harmony$Z_corr),dims = 2, pca = F,verbose = verbose,max_iter=1000,num_threads=nt, ...)$Y)
197 |       }
198 |     }
199 |   } else {
200 |     message("Warning: Reduction is applied directly on GF-ICF values.. can be slow if the dataset is big!")
201 |     
202 |     if(reduction=="tumap"){data$embedded = base::as.data.frame(uwot::tumap(X = as.matrix(t(data$gficf)),scale = F,n_threads = nt,verbose = verbose, ...))}
203 |     
204 |     if(reduction=="umap"){data$embedded = base::as.data.frame(uwot::umap(X = as.matrix(t(data$gficf)), scale = F,n_threads = nt,verbose = verbose, ...))}
205 |     
206 |     if(reduction=="tsne"){data$embedded = base::as.data.frame(Rtsne::Rtsne(X = as.matrix(t(data$gficf)), dims = 2, pca = F, verbose = verbose, max_iter=1000,num_threads=nt, ...)$Y)}
207 |   }  
208 |   rownames(data$embedded) = base::colnames(data$gficf)
209 |   colnames(data$embedded) = base::c("X","Y")
210 |   data$reduction = reduction
211 |   return(data)
212 | }
213 | 
214 | #' Number of features to use 
215 | #'
216 | #' Compute the number of dimension to use for either PCA or LSA.
217 | #' 
218 | #' @param data list; GFICF object
219 | #' @param seed numeric; seed to use.
220 | #' @param subsampling logical; Use only a subset of the data for the imputation of dimensions to use.
221 | #' @param plot logical; Show eblow plot.
222 | #' @importFrom RSpectra svds
223 | #' 
224 | #' @export
225 | computePCADim = function(data,seed=180582,subsampling=F,plot=T)
226 | {
227 |   set.seed(seed)
228 |   dim = min(50,ncol(data$gficf))
229 |   
230 |   if (subsampling)
231 |   {
232 |     x = data$gficf[,sample(x = 1:ncol(data$gficf),size = round(ncol(data$gficf)/100*5))]
233 |     ppk<- RSpectra::svds(t(x),k=dim)
234 |     rm(x)
235 |   } else {
236 |     ppk<- RSpectra::svds(t(data$gficf),k=dim)
237 |   }
238 |   
239 |   explained.var = ppk$d^2 / sum(ppk$d^2)
240 |   if(plot) {plot(explained.var,xlab="components",ylab="explained.var")}
241 |   
242 |   ratio_to_first_diff <- diff(ppk$d^2 / sum(ppk$d^2)) / diff(ppk$d^2 / sum(ppk$d^2))[1]
243 |   #reduction_dim <- (which(ratio_to_first_diff < 0.1) + 1)[1]
244 |   ix = which(cumsum(diff((which(ratio_to_first_diff < 0.1))) == 1)>1)[1]
245 |   reduction_dim = which(ratio_to_first_diff < 0.1)[ix]
246 |   
247 |   cat("Number of estimated dimensions =",reduction_dim)
248 |   data$dimPCA = reduction_dim
249 |   return(data)
250 | }
251 | 
252 | # find over dispersed genes respect to computed ICF
253 | # ispired by pagoda2 function. Thanks to them.
254 | #' @import mgcv
255 | findOverDispersed=function(data,gam.k=5, alpha=5e-2, plot=FALSE, use.unadjusted.pvals=FALSE,do.par=T,max.adjusted.variance=1e3,min.adjusted.variance=1e-3,verbose=TRUE,min.gene.cells=0)
256 | {
257 |   rowSel <- NULL;
258 |   
259 |   tsmessage("calculating variance fit ...",verbose=verbose)
260 |   df = colMeanVarS(t(data$rawCounts),ncores = ifelse(detectCores()>1,detectCores()-1,1))
261 |   df$m = data$w
262 |   
263 |   # gene-relative normalizaton
264 |   df$v <- log(df$v);
265 |   rownames(df) <- rownames(data$gficf);
266 |   vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
267 |   if(length(vi)<gam.k*1.5) { gam.k=1 };# too few genes
268 |   if(gam.k<2) {
269 |     tsmessage(" using lm ",verbose=verbose)
270 |     m <- lm(v ~ m, data = df[vi,])
271 |   } else {
272 |     tsmessage(" using gam ",verbose=verbose)
273 |     m <- mgcv::gam(as.formula(paste0('v ~ s(m, k = ',gam.k,')')), data = df[vi,])
274 |   }
275 |   df$res <- -Inf;  df$res[vi] <- resid(m,type='response')
276 |   n.obs <- df$nobs;
277 |   suppressWarnings(df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=F,log.p=T)))
278 |   df$lpa <- bh.adjust(df$lp,log=TRUE)
279 |   n.cells <- ncol(data$gficf)
280 |   df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE,log.p=TRUE)/n.cells)
281 |   
282 |   if(use.unadjusted.pvals) {
283 |     ods <- which(df$lp<log(alpha))
284 |   } else {
285 |     ods <- which(df$lpa<log(alpha))
286 |   }
287 |   
288 |   tsmessage(paste0(length(ods),'overdispersed genes ...',length(ods) ),verbose=verbose)
289 |   
290 |   df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
291 |   df$gsf[!is.finite(df$gsf)] <- 0;
292 |   
293 |   if(plot) {
294 |     if(do.par) {
295 |       par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
296 |     }
297 |     smoothScatter(df$m,df$v,main='',xlab='ICF value',ylab='log10[ variance ]')
298 |     grid <- seq(min(df$m[vi]),max(df$m[vi]),length.out=1000)
299 |     lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
300 |     if(length(ods)>0) {
301 |       points(df$m[ods],df$v[ods],pch='.',col=2,cex=1)
302 |     }
303 |     smoothScatter(df$m[vi],df$qv[vi],xlab='ICF value',ylab='',main='adjusted')
304 |     abline(h=1,lty=2,col=8)
305 |     if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
306 |     points(df$m[ods],df$qv[ods],col=2,pch='.')
307 |   }
308 |   tsmessage("done.\n",verbose=verbose)
309 |   return(df)
310 | }
311 | 
312 | # BH P-value adjustment with a log option
313 | bh.adjust <- function(x, log = FALSE, verbose = F)
314 | {
315 |   nai <- which(!is.na(x))
316 |   ox <- x
317 |   x<-x[nai]
318 |   id <- order(x, decreasing = FALSE)
319 |   if(log) {
320 |     q <- x[id] + log(length(x)/seq_along(x))
321 |   } else {
322 |     q <- x[id]*length(x)/seq_along(x)
323 |   }
324 |   a <- rev(cummin(rev(q)))[order(id)]
325 |   ox[nai]<-a
326 |   ox
327 | }
328 | 
329 | #' Number of features to use 
330 | #'
331 | #' Compute the number of dimension to use for either PCA or LSA.
332 | #' 
333 | #' @param data list; GFICF object
334 | #' @param metadata dataframe; Either (1) Dataframe with variables to integrate or (2) vector with labels.
335 | #' @param var.to.use character; If meta_data is dataframe, this defined which variable(s) to remove (character vector).
336 | #' @param verbose boolean; Increase verbosity.
337 | #' @param ... Additional arguments to pass to HarmonyMatrix function.
338 | #' @importFrom harmony HarmonyMatrix
339 | #' 
340 | #' @export
341 | runHarmony <- function(data,metadata, var.to.use, verbose = T, ...)
342 | {
343 |   tsmessage(".. Running Harmony on PCA/NMF space",verbose=verbose)
344 |   if (is.null(data$pca)) {stop("Please run fist PCA or NMF reduction!")}
345 |   data$pca$harmony <- harmony::HarmonyMatrix(data$pca$cells, meta_data = metadata, vars_use = var.to.use,do_pca = F, verbose = F, return_object = T, ...)
346 |   colnames(data$pca$harmony$Z_corr) <- rownames(data$pca$cells)
347 |   rownames(data$pca$harmony$Z_corr) <- colnames(data$pca$cells)
348 |   tsmessage(".. Finished!",verbose=verbose)
349 |   return(data)
350 | }
351 | 


--------------------------------------------------------------------------------
/R/gficf.R:
--------------------------------------------------------------------------------
  1 | #' Gene Frequency - Inverse Cell Frequency (GF-ICF)
  2 | #'
  3 | #' R implementation of the GF-ICF
  4 | #' Thanks to 3’-end scRNA-seq approaches, we can now have an accurate estimation of gene expression without having to account for gene length,
  5 | #' thus the number of transcripts (i.e. UMI) associated to each gene, strictly reflects the frequency of a gene in a cell, exactly like a word in a document.
  6 | #' GFICF (Gene Frequency - Inverse Cell Frequency) is analogous of TF-IDF scoring method as defined for tex mining With GFICF we consider a cell to be
  7 | #' analogous to a document, genes analogous to words and gene counts to be analogous of the word’s occurrence in a document.
  8 | #' 
  9 | #' @param M Matrix; UMI cell count matrix
 10 | #' @param QCdata list; QC cell object. 
 11 | #' @param cell_count_cutoff numeric; All genes detected in less than cell_count_cutoff cells will be excluded (default 5).
 12 | #' @param cell_percentage_cutoff2 numeric; All genes detected in at least this percentage of cells will be included (default 0.03, i.e. 3 percent of cells).
 13 | #' @param nonz_mean_cutoff numeric genes detected in the number of cells between the above mentioned cutoffs are selected only when their average expression in non-zero cells is above this cutoff (default 1.12).
 14 | #' @param  normalize logical; Rescale UMI counts before apply GFICF. Rescaling is done using EdgeR normalization.
 15 | #' @param storeRaw logical; Store UMI counts.
 16 | #' @param batches vector; Vector or factor for batch.
 17 | #' @param groups vector; Vector or factor for biological condition of interest.
 18 | #' @param filterGenes logical; Apply gene filter (default TRUE).
 19 | #' @param verbose boolean; Increase verbosity.
 20 | #' @param ... Additional arguments to pass to ComBat_seq call.
 21 | #' @return The updated gficf object.
 22 | #' 
 23 | #' @export
 24 | gficf = function(M=NULL,QCdata=NULL,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12,normalize=TRUE,storeRaw=TRUE,batches=NULL,groups=NULL,filterGenes=TRUE,verbose=TRUE,fastNomalization=F, ...)
 25 | {
 26 |   if(is.null(M) & is.null(QCdata)) {stop("Input data is missing!!")}
 27 |   
 28 |   data = list()
 29 |   if (!is.null(QCdata)) {
 30 |     data = QCdata
 31 |     rm(QCdata);gc(reset = T)
 32 |     if (!is.null(M)) {rm(M);gc()}
 33 |   } else {
 34 |     data$counts = M;rm(M);gc()
 35 |   }
 36 |   
 37 |   data = normCountsData(data,cell_count_cutoff,cell_percentage_cutoff2,nonz_mean_cutoff,normalize,batches,groups,verbose,filterGenes,fastNomalization, ...)
 38 |   data$gficf = tf(data$rawCounts,verbose = verbose)
 39 |   if (!storeRaw) {data$rawCounts=NULL;data$counts=NULL;gc()}
 40 |   data$w = getIdfW(data$gficf,verbose = verbose)
 41 |   data$gficf = idf(data$gficf,data$w,verbose = verbose)
 42 |   data$gficf = t(l.norm(t(data$gficf),norm = "l2",verbose = verbose))
 43 |   
 44 |   data$param <- list()
 45 |   data$param$cell_count_cutoff = cell_count_cutoff
 46 |   data$param$cell_percentage_cutoff2 = cell_percentage_cutoff2
 47 |   data$param$nonz_mean_cutoff = nonz_mean_cutoff
 48 |   data$param$normalized = normalize
 49 |   return(data)
 50 | }
 51 | 
 52 | #' @import Matrix
 53 | #' @importFrom edgeR DGEList calcNormFactors cpm
 54 | #' @importFrom sva ComBat_seq
 55 | #' 
 56 | normCounts = function(M,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12,normalizeCounts=TRUE,batches=NULL,groups=NULL,verbose=TRUE,filterGene=FALSE,fastNomalization=FALSE, ...)
 57 | {
 58 |   if (filterGene) {
 59 |     tsmessage("Gene filtering..",verbose = verbose)
 60 |     M = filter_genes_cell2loc_style(data = M,cell_count_cutoff,cell_percentage_cutoff2,nonz_mean_cutoff)
 61 |   }
 62 |   
 63 |   if (normalizeCounts) 
 64 |   {
 65 |     if(!is.null(batches)){
 66 |       tsmessage("Correcting batches..",verbose = verbose)
 67 |       M = Matrix::Matrix(data = sva::ComBat_seq(counts = as.matrix(M),batch = batches,group = groups, ...),sparse = T)
 68 |     }
 69 |     tsmessage("Normalize counts..",verbose = verbose)
 70 |     if (!fastNomalization) {
 71 |       M <- Matrix::Matrix(edgeR::cpm(edgeR::calcNormFactors(edgeR::DGEList(counts=M),normalized.lib.sizes = T)),sparse = T)
 72 |     } else {
 73 |       nt = ifelse(detectCores()>1,detectCores()-1,1)
 74 |       M <- scaleUMI(F,nt,F)
 75 |     }
 76 |   } else {
 77 |     data$rawCounts <- data$counts
 78 |     data$counts <- NULL; gc()
 79 |   }
 80 |   
 81 |   return(M)
 82 | }
 83 | 
 84 | #' @import Matrix
 85 | #' @importFrom edgeR DGEList calcNormFactors cpm
 86 | #' @importFrom sva ComBat_seq
 87 | #' 
 88 | normCountsData = function(data,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12,normalizeCounts=TRUE,batches=NULL,groups=NULL,verbose=TRUE,filterGene=TRUE, fastNomalization=FALSE, ...)
 89 | {
 90 |   if (filterGene) {
 91 |     tsmessage("Gene filtering..",verbose = verbose)
 92 |     data$counts = filter_genes_cell2loc_style(data = data$counts,cell_count_cutoff,cell_percentage_cutoff2,nonz_mean_cutoff)
 93 |   }
 94 |   
 95 |   if (normalizeCounts) 
 96 |   {
 97 |     if(!is.null(batches)){
 98 |       tsmessage("Correcting batches..",verbose = verbose)
 99 |       data$counts = Matrix::Matrix(data = sva::ComBat_seq(counts = as.matrix(data$counts),batch = batches,group = groups, ...),sparse = T)
100 |       gc()
101 |     }
102 |     tsmessage("Normalize counts..",verbose = verbose)
103 |     if (!fastNomalization) {
104 |       data$rawCounts <- Matrix::Matrix(edgeR::cpm(edgeR::calcNormFactors(edgeR::DGEList(counts=data$counts),normalized.lib.sizes = T)),sparse = T)
105 |     } else {
106 |       nt = ifelse(detectCores()>1,detectCores()-1,1)
107 |       data$rawCounts <- scaleUMI(data$counts,nt,F)
108 |     }
109 |   } else {
110 |     data$rawCounts <- data$counts
111 |     data$counts <- NULL; gc()
112 |   }
113 |   
114 |   if(is.null(batches)){data$counts=NULL;gc()}
115 |   
116 |   return(data)
117 | }
118 | 
119 | #' @import Matrix
120 | #' 
121 | tf = function(M,verbose)
122 | {
123 | 
124 |   tsmessage("Apply GF transformation..",verbose = verbose)
125 |   M =t(t(M) / armaColSum(M))
126 |   
127 |   return(M)
128 | }
129 | 
130 | #' @import Matrix
131 | #' 
132 | idf = function(M,w,verbose)
133 | {
134 |   tsmessage("Applay ICF..",verbose = verbose)
135 |   M = M[rownames(M) %in% names(w),]
136 |   if(nrow(M)<length(w))
137 |   {
138 |     g = names(w)[!names(w)%in%rownames(M)]
139 |     tmp = Matrix::Matrix(data = 0,nrow = length(g),ncol = ncol(M))
140 |     rownames(tmp) = g
141 |     colnames(tmp) = colnames(M)
142 |     M = rbind(M,tmp)
143 |   }
144 |   M = M[names(w),]
145 |   M = M * w
146 |   return(M)
147 | }
148 | 
149 | #' @import Matrix
150 | #' 
151 | getIdfW = function(M,type="classic",verbose)
152 | {
153 |   tsmessage("Compute ICF weigth..",verbose = verbose)
154 |   nt = armaRowSum(M!=0)
155 |   if (type == "classic") {w = log( (ncol(M)+1) / (nt+1) );rm(nt)}
156 |   if (type == "prob") {w = log( (ncol(M) - nt) / nt );rm(nt)}
157 |   if (type == "smooth") {w = log( 1 + ncol(M)/nt );rm(nt)}
158 |   return(w)
159 | }
160 | 
161 | 
162 | 
163 | l.norm = function (m, norm = c("l1", "l2"),verbose) 
164 | {
165 |   tsmessage(paste("Apply",norm),verbose = verbose)
166 |   norm_vec = switch(norm, l1 = 1/armaRowSum(m), l2 = 1/sqrt(armaRowSum(m^2)))
167 |   norm_vec[is.infinite(norm_vec)] = 0
168 |   if (inherits(m, "sparseMatrix")) {
169 |     norm_vec <- Diagonal(x = norm_vec)
170 |     rownames(norm_vec) <- rownames(m)
171 |     m <- norm_vec %*% m
172 |   }
173 |   else m * norm_vec
174 | }
175 | 
176 | #' Save GFICF object.
177 | #' 
178 | #' Function to write a GFICF object to a file, preserving UMAP model.
179 | #' 
180 | #' @param data a GFICF object create by \code{\link{gficf}}.
181 | #' @param file name of the file where the model is to be saved.
182 | #' 
183 | #' @examples 
184 | #' # save
185 | #' gficf_file <- tempfile("gficf_test")
186 | #' saveGFICF(data, file = gficf_file)
187 | #' 
188 | #' # restore
189 | #' data2 <- loadGFICF(file = gficf_file)
190 | #' 
191 | #' @export
192 | saveGFICF <- function(data, file, verbose = TRUE) 
193 | {
194 |   wd <- getwd()
195 |   tryCatch({
196 |     # create directory to store files in
197 |     mod_dir <- tempfile(pattern = "dir")
198 |     #dir.create(mod_dir)
199 |     gficf_dir <- file.path(mod_dir, "gficf")
200 |     dir.create(gficf_dir,recursive = T)
201 |     
202 |     # save uwot object
203 |     if (!is.null(data$reduction)) {
204 |       if( data$reduction %in% c("umap","tumap"))
205 |       {
206 |         uwot_dir <- file.path(gficf_dir, "uwot")
207 |         dir.create(uwot_dir,recursive = T)
208 |         uwot_tmpfname <- file.path(uwot_dir,"uwot_obj")
209 |         data$uwot <- uwot::save_uwot(model = data$uwot,file = uwot_tmpfname,verbose = verbose)
210 |       }
211 |     }
212 |     
213 |     # save gficf object
214 |     gficfl_tmpfname <- file.path(gficf_dir, "data")
215 |     saveRDS(data, file = gficfl_tmpfname)
216 |     
217 |     # archive the files under the temp dir into the single target file
218 |     # change directory so the archive only contains one directory
219 |     setwd(mod_dir)
220 |     if(file.exists(file)) {file.rename(file,paste0(file,".bak"))}
221 |     utils::tar(tarfile = file, files = "gficf/")
222 |   },
223 |   finally = {
224 |     setwd(wd)
225 |     if (file.exists(mod_dir)) {
226 |       unlink(mod_dir, recursive = TRUE)
227 |     }
228 |     if(file.exists(paste0(file,".bak"))) {
229 |       file.remove(paste0(file,".bak"))
230 |     }
231 |   })
232 | }
233 | 
234 | #' Restore GFICF object.
235 | #' 
236 | #' Function to read a GFICF object from a file saved with \code{\link{saveGFICF}}.
237 | #' 
238 | #' @param file name of the file where the object is stored.
239 | #' 
240 | #' @examples 
241 | #' gficf_file <- tempfile("gficf_data")
242 | #' 
243 | #' # restore
244 | #' data2 <- loadGFICF(file = gficf_file)
245 | #' 
246 | #' @export
247 | loadGFICF <- function(file,verbose = T)
248 | {
249 |   model <- NULL
250 |   
251 |   tryCatch({
252 |     # create directory to store files in
253 |     mod_dir <- tempfile(pattern = "dir")
254 |     dir.create(mod_dir)
255 |     utils::untar(file, exdir = mod_dir)
256 |     
257 |     # load gficf object
258 |     gficf_fname <- file.path(mod_dir, "gficf/data")
259 |     if (!file.exists(gficf_fname)) {
260 |       stop("Can't find gficf data in ", file)
261 |     }
262 |     data <- readRDS(file = gficf_fname)
263 |     
264 |     # load umap obj if necessary
265 |     if(data$reduction %in% c("umap","tumap"))
266 |     {
267 |       nn_fname <- file.path(mod_dir,"gficf/uwot/uwot_obj")
268 |       if (!file.exists(nn_fname)) {
269 |         stop("Can't find uwot object ", nn_fname, " in ", file)
270 |       }
271 |       data$uwot <- uwot::load_uwot(file = nn_fname, verbose = verbose)
272 |     }
273 |   },
274 |   finally = {
275 |     if (file.exists(mod_dir)) {
276 |       unlink(mod_dir, recursive = TRUE)
277 |     }
278 |   })
279 |   
280 |   return(data)
281 | }
282 | 


--------------------------------------------------------------------------------
/R/pathwayAnalisys.R:
--------------------------------------------------------------------------------
  1 | #' @import AnnotationHub
  2 | #' @importFrom  babelgene orthologs
  3 | gmtPathways <- function(gmt.file,convertToEns,convertHu2Mm,verbose)
  4 | {
  5 |   pathwayLines <- strsplit(readLines(gmt.file), "\t")
  6 |   pathways <- lapply(pathwayLines, tail, -2)
  7 |   names(pathways) <- sapply(pathwayLines, head, 1)
  8 |   
  9 |   if (convertToEns & !convertHu2Mm)
 10 |   {
 11 |     tsmessage("... Retrieving gene annotation from AnnotationHub()",verbose = verbose)
 12 |     ah <- AnnotationHub::AnnotationHub()
 13 |     ahDb <- AnnotationHub::query(ah,pattern = c("Homo sapiens","EnsDb"), ignore.case = TRUE)
 14 |     id <- tail(rownames(mcols(ahDb)),n=1)
 15 |     edb <- ah[[id]]
 16 |     # Extract gene-level information from database
 17 |     ens.map <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT") & !gene_biotype%in%"LRG_gene")
 18 |     tsmessage(".. Start converting human symbols to human ensamble id",verbose = verbose)
 19 |     g = as.character(unique(unlist(pathways)))
 20 |     pathways = lapply(pathways, function(x,y=ens.map){r=y$gene_id[y$gene_name%in%x];r=r[!is.na(r)];return(unique(r))})
 21 |     tsmessage("Done!",verbose = verbose)
 22 |   }
 23 |   
 24 |   if (convertToEns & convertHu2Mm )
 25 |   {
 26 |     tsmessage(".. Start converting human symbols to mouse ensamble id",verbose = verbose)
 27 |     g = as.character(unique(unlist(pathways)))
 28 |     g = g[!startsWith(g,prefix = "ENSG")] 
 29 |     map.gene = babelgene::orthologs(genes = g,species = "mouse")
 30 |     pathways = lapply(pathways, function(x,y=map.gene) {r = unique(y$ensembl[y$human_symbol%in%x]);return(r[!is.na(r)])})
 31 |     tsmessage("Done!",verbose = verbose)
 32 |   }
 33 |   
 34 |   if (!convertToEns & convertHu2Mm )
 35 |   {
 36 |     tsmessage(".. Start converting human symbols to mouse symbols",verbose = verbose)
 37 |     g = as.character(unique(unlist(pathways)))
 38 |     g = g[!startsWith(g,prefix = "ENSG")]
 39 |     map.gene = babelgene::orthologs(genes = g,species = "mouse")
 40 |     pathways = lapply(pathways, function(x,y=map.gene) {r = unique(y$symbol[y$human_symbol%in%x]);return(r[!is.na(r)])})
 41 |     tsmessage("Done!",verbose = verbose)
 42 |   }
 43 |   
 44 |   pathways = pathways[sapply(pathways, length)>0]  
 45 |   pathways
 46 | }
 47 | 
 48 | #' Gene Set Enrichement Analysi on GF-ICF
 49 | #'
 50 | #' Compute GSEA for each cluster across a set of input pathways.
 51 | #' 
 52 | #' @param data list; GFICF object
 53 | #' @param gmt.file characters; Path to gmt file from MSigDB
 54 | #' @param nsim integer; number of simulation used to compute ES significance.
 55 | #' @param convertToEns boolean: Convert gene sets from gene symbols to Ensable id.
 56 | #' @param convertHu2Mm boolean: Convert gene sets from human symbols to Mouse Ensable id.
 57 | #' @param nt numeric; Number of cpu to use for the GSEA
 58 | #' @param minSize numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded.
 59 | #' @param maxSize numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded.
 60 | #' @param verbose boolean; Show the progress bar.
 61 | #' @param seed integer; Seed to use for random number generation.
 62 | #' @param method string; Method to use GSEA or GSVA. Default is GSEA.
 63 | #' @return The updated gficf object.
 64 | #' @importFrom fgsea fgsea
 65 | #' @import fastmatch
 66 | #' @importFrom limma lmFit eBayes topTable
 67 | #' @import GSVA 
 68 | #' @export
 69 | runGSEA <- function(data,gmt.file,nsim=1000,convertToEns=T,convertHu2Mm=F,nt=2,minSize=15,maxSize=Inf,verbose=TRUE,seed=180582,method="GSEA")
 70 | {
 71 |   set.seed(seed)
 72 |   
 73 |   if (is.null(data$cluster.gene.rnk)) {stop("Please run clustcell function first")}
 74 |   mthod = base::match.arg(arg = method,choices = c("GSEA","GSVA"),several.ok = F)
 75 |   
 76 |   if (method == "GSEA")
 77 |   {
 78 |     tsmessage("Choosen method is GSEA...",verbose=verbose)
 79 |     data$gsea = list()
 80 |     data$gsea$pathways = gmtPathways(gmt.file,convertToEns,convertHu2Mm,verbose)
 81 |     data$gsea$es = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk))
 82 |     data$gsea$nes = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk))
 83 |     data$gsea$pval = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk))
 84 |     data$gsea$fdr = Matrix::Matrix(data = 0,nrow = length(data$gsea$pathways),ncol = ncol(data$cluster.gene.rnk))
 85 |     
 86 |     rownames(data$gsea$es) = rownames(data$gsea$nes) = rownames(data$gsea$pval) = rownames(data$gsea$fdr) = names(data$gsea$pathways)
 87 |     colnames(data$gsea$es) = colnames(data$gsea$nes) = colnames(data$gsea$pval) = colnames(data$gsea$fdr) = colnames(data$cluster.gene.rnk)
 88 |     
 89 |     pb = utils::txtProgressBar(min = 0, max = ncol(data$cluster.gene.rnk), initial = 0,style = 3)
 90 |     for (i in 1:ncol(data$cluster.gene.rnk))
 91 |     {
 92 |       df = as.data.frame(fgsea::fgseaMultilevel(pathways = data$gsea$pathways,stats = data$cluster.gene.rnk[,i],nPermSimple = nsim,gseaParam = 0,nproc = nt,minSize = minSize,maxSize = maxSize))[,1:7]
 93 |       data$gsea$es[df$pathway,i] = df$ES
 94 |       data$gsea$nes[df$pathway,i] = df$NES
 95 |       data$gsea$pval[df$pathway,i] = df$pval
 96 |       data$gsea$fdr[df$pathway,i] = df$padj
 97 |       utils::setTxtProgressBar(pb,i)
 98 |     }
 99 |     close(pb)
100 |   
101 |   data$gsea$stat = df[,c("pathway","size")]
102 |   } else {
103 |     tsmessage("Choosen method is GSVA...",verbose=verbose)
104 |     data$gsva = list()
105 |     data$gsva$pathways = gmtPathways(gmt.file,convertToEns,convertHu2Mm,verbose)
106 |     data$gsva$DEpathways = NULL 
107 |     data$gsva$res = Matrix::Matrix(data = 0,nrow = length(data$gsva$pathways),ncol = ncol(data$gficf))
108 |     rownames(data$gsva$res) = names(data$gsva$pathways)
109 |     colnames(data$gsva$res) = colnames(data$gficf)
110 |     
111 |     tsmessage("Start executiong GSVA cluster by cluster",verbose=verbose)
112 |     options(warn=-1)
113 |     u = unique(data$embedded$cluster)
114 |     for (i in 1:length(u))
115 |     {
116 |       tsmessage(paste0("..Executing GSVA for cluster ",i," out of ",length(u)))
117 |       cells = rownames(data$embedded)[data$embedded$cluster%in%u[i]]
118 |       res = GSVA::gsva(expr = as.matrix(data$gficf[,cells]),gset.idx.list = data$gsva$pathways,kcdf="Gaussian",min.sz=minSize,max.sz=maxSize,parallel.sz=nt,method="gsva",verbose=F)
119 |       data$gsva$res[rownames(res),cells] = res
120 |       rm(res)
121 |     }
122 |     options(warn=0)
123 |     data$gsva$res = data$gsva$res[armaRowSum(data$gsva$res!=0)>0,]
124 |     
125 |     tsmessage("Start executiong Limma cluster by cluster",verbose=verbose)
126 |     for (i in 1:length(u))
127 |     {
128 |       tsmessage(paste0("..Calling DE pathways for cluster ",i," out of ",length(u)))
129 |       clusters = data$embedded$cluster
130 |       clusters[!clusters%in%u[i]] = "other"
131 |       clusters[!clusters%in%"other"] = paste0("C",clusters[!clusters%in%"other"])
132 |       design <- model.matrix(~ factor(clusters))
133 |       colnames(design) <- c("ALL", paste0("C",u[i],"vsOTHER"))
134 |       fit <- limma::lmFit(data$gsva$res, design)
135 |       fit <- limma::eBayes(fit)
136 |       df <- as.data.frame(limma::topTable(fit, coef=paste0("C",u[i],"vsOTHER"), number=Inf))
137 |       df$pathway = rownames(df)
138 |       df$cluster = u[i]
139 |       data$gsva$DEpathways = rbind(data$gsva$DEpathways,df)
140 |       rm(df)
141 |     }
142 |     rownames(data$gsva$DEpathways) = NULL
143 |   }
144 |   return(data)
145 | }
146 | 
147 | #' Single cell Gene Set Enrichement Analysis on GF-ICF
148 | #'
149 | #' Compute GSEA for each cells across a set of input pathways by using NMF.
150 | #' 
151 | #' @param data list; GFICF object
152 | #' @param geneID characters; The type of gene identifier to use, such as ensamble of symbol. 
153 | #' @param species characters; Species name, such as human or mouse.
154 | #' @param category characters; MSigDB collection abbreviation, such as H or C1.
155 | #' @param subcategory characters; MSigDB sub-collection abbreviation, such as CGP or BP.
156 | #' @param pathway.list list; Custom list of pathways. Each element correspond to a pathway a and contains a vector of genes.
157 | #' @param nsim integer; number of simulation used to compute ES significance.
158 | #' @param nt numeric; Number of cpu to use for the GSEA and NMF. Default is 0 (i.e., all available cores minus one)
159 | #' @param minSize numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded.
160 | #' @param maxSize numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded.
161 | #' @param verbose boolean; Show the progress bar.
162 | #' @param seed integer; Seed to use for random number generation.
163 | #' @param nmf.k numeric; Rank of NMF.
164 | #' @param fdr.th numeric; FDR threshold for GSEA.
165 | #' @param rescale string; If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none.
166 | #' @param normalization; normalization to use before to apply NMF. Possible values are gficf or cpm. Default and highly raccomanded is gficf.
167 | #' @return The updated gficf object.
168 | #' @importFrom fgsea fgsea
169 | #' @import fastmatch
170 | #' @importFrom RcppML nmf
171 | #' @import utils
172 | #' @import pointr
173 | #' @import msigdbr
174 | #' @importFrom BiocParallel SnowParam
175 | #' @export
176 | runScGSEA <- function(data,geneID,species,category,subcategory=NULL,pathway.list=NULL,nsim=10000,nt=0,minSize=15,maxSize=Inf,verbose=TRUE,seed=180582,nmf.k=100,fdr.th=0.05,gp=0,rescale="none",normalization="gficf")
177 | {
178 |   if(nt==0) {nt=detectCores()}
179 |   #species = base::match.arg(arg = species,choices = c("human","mouse"),several.ok = F)
180 |   geneID = base::match.arg(arg = geneID,choices = c("ensamble","symbol"),several.ok = F)
181 |   rescale = base::match.arg(arg = rescale,choices = c("none","byGS","byCell"),several.ok = F)
182 |   normalization = base::match.arg(arg = normalization,choices = c("gficf","cpm"),several.ok = F)
183 |   options(RcppML.threads = nt)
184 |   set.seed(seed)
185 |   
186 |   if (is.null(data$scgsea))
187 |   {
188 |     data$scgsea = list()
189 |     if (normalization=="gficf")
190 |     {
191 |       if (!is.null(data$pca) && data$pca$type == "NMF"){
192 |         if (data$dimPCA<nmf.k || data$pca$use.odgenes) {
193 |           tsmessage("... Performing NMF",verbose=verbose)
194 |           tmp = RcppML::nmf(data = data$gficf,k=nmf.k)
195 |           data$scgsea$nmf.w <- Matrix::Matrix(data = tmp@w,sparse = T)
196 |           data$scgsea$nmf.h <- t(Matrix::Matrix(data = tmp@h,sparse = T))
197 |           rm(tmp);gc()
198 |         } else {
199 |           tsmessage(paste0("Found NMF reduction with k greaten or equal to ", nmf.k),verbose=T)
200 |           pointr::ptr("tmp", "data$pca$genes")
201 |           data$scgsea$nmf.w = tmp
202 |           pointr::ptr("tmp2", "data$pca$cells")
203 |           data$scgsea$nmf.h = tmp2
204 |           rm(tmp,tmp2);gc()
205 |         }
206 |       } else {
207 |         tsmessage("... Performing NMF",verbose=verbose)
208 |         tmp = RcppML::nmf(data = data$gficf,k=nmf.k)
209 |         data$scgsea$nmf.w <- Matrix::Matrix(data = tmp@w,sparse = T)
210 |         data$scgsea$nmf.h <- t(Matrix::Matrix(data = tmp@h,sparse = T))
211 |         rm(tmp);gc()
212 |       }
213 |     } else {
214 |       tsmessage("... Performing NMF",verbose=verbose)
215 |       tmp = RcppML::nmf(data = log1p(data$rawCounts),k=nmf.k)
216 |       data$scgsea$nmf.w <- Matrix::Matrix(data = tmp@w,sparse = T)
217 |       data$scgsea$nmf.h <- t(Matrix::Matrix(data = tmp@h,sparse = T))
218 |       rm(tmp);gc()
219 |     }
220 |   } else {
221 |     tsmessage("Found a previous scGSEA, thus the already computed NMF will be used",verbose=T)
222 |     tsmessage("If you want to recompute NMF, please call resetScGSEA first",verbose=T)
223 |     data$scgsea$es <- NULL
224 |     data$scgsea$nes <- NULL
225 |     data$scgsea$pval <- NULL
226 |     data$scgsea$fdr <- NULL
227 |     data$scgsea$pathways <- NULL
228 |     data$scgsea$x <- NULL
229 |     data$scgsea$stat <- NULL
230 |   }
231 |   
232 |   tsmessage("Loading pathways...",verbose=verbose)
233 |   if (!is.list(pathway.list)) {
234 |     gs = msigdbr::msigdbr(species = species, category = category,subcategory = subcategory)
235 |     if (geneID=="symbol") {
236 |       data$scgsea$pathways = split(x = gs$gene_symbol, f = gs$gs_name)
237 |     } else {
238 |       data$scgsea$pathways = split(x = gs$ensembl_gene, f = gs$gs_name)
239 |     }
240 |   } else {
241 |     data$scgsea$pathways = pathway.list; rm(pathway.list)
242 |   }
243 |   data$scgsea$es = Matrix::Matrix(data = 0,nrow = length(data$scgsea$pathways),ncol = ncol(data$scgsea$nmf.w))
244 |   data$scgsea$nes = Matrix::Matrix(data = 0,nrow = length(data$scgsea$pathways),ncol = ncol(data$scgsea$nmf.w))
245 |   data$scgsea$pval = Matrix::Matrix(data = 0,nrow = length(data$scgsea$pathways),ncol = ncol(data$scgsea$nmf.w))
246 |   data$scgsea$fdr = Matrix::Matrix(data = 0,nrow = length(data$scgsea$pathways),ncol = ncol(data$scgsea$nmf.w))
247 |     
248 |   rownames(data$scgsea$es) = rownames(data$scgsea$nes) = rownames(data$scgsea$pval) = rownames(data$scgsea$fdr) = names(data$scgsea$pathways)
249 |   
250 |   tsmessage("Performing GSEA...",verbose=verbose)
251 |   oldw <- getOption("warn")
252 |   options(warn = -1)
253 |   pb = utils::txtProgressBar(min = 0, max = ncol(data$scgsea$nmf.w), initial = 0,style = 3)
254 |   
255 |   # Optimize fgsea cpus assigning at least 100 pathways for cpus
256 |   nt_fgsea <- ceiling(length(data$scgsea$pathways)/100)
257 |   nt_fgsea <- ifelse(nt_fgsea>nt,nt,nt_fgsea)
258 |   bpparameters <- BiocParallel::SnowParam(nt_fgsea)
259 |   for (i in 1:ncol(data$scgsea$nmf.w))
260 |   {
261 |       df = as.data.frame(fgsea::fgseaMultilevel(pathways = data$scgsea$pathways,stats = data$scgsea$nmf.w[,i],nPermSimple = nsim,gseaParam = gp,BPPARAM = bpparameters,minSize = minSize,maxSize = maxSize))[,1:7]
262 |       data$scgsea$es[df$pathway,i] = df$ES
263 |       data$scgsea$nes[df$pathway,i] = df$NES
264 |       data$scgsea$pval[df$pathway,i] = df$pval
265 |       data$scgsea$fdr[df$pathway,i] = df$padj
266 |       utils::setTxtProgressBar(pb,i)
267 |   }
268 |   base::close(pb)
269 |   on.exit(options(warn = oldw))
270 |   
271 |   ix = is.na(data$scgsea$nes)
272 |   if(sum(ix)>0) {
273 |     data$scgsea$nes[ix] = 0
274 |     data$scgsea$pval[ix] = 1
275 |     data$scgsea$fdr[ix] = 1
276 |   }
277 |   
278 |   data$scgsea$x = data$scgsea$nes
279 |   data$scgsea$x[data$scgsea$x<0 | data$scgsea$fdr>=fdr.th] = 0
280 |   data$scgsea$x = Matrix::Matrix(data = data$scgsea$nmf.h %*% t(data$scgsea$x),sparse = T)
281 |   
282 |   data$scgsea$stat = df[,c("pathway","size")]
283 |   data$scgsea$x = data$scgsea$x[,armaColSum(data$scgsea$x)>0]
284 |   
285 |   if(rescale!="none"){
286 |     if(rescale=="byGS") {
287 |       data$scgsea$x = t(data$scgsea$x)
288 |       data$scgsea$x = t( (data$scgsea$x - rowMeans(data$scgsea$x)) / apply(data$scgsea$x, 1, sd))
289 |     }
290 |     if(rescale=="byCell") {
291 |       data$scgsea$x = (data$scgsea$x - rowMeans(data$scgsea$x)) / apply(data$scgsea$x, 1, sd)
292 |     }
293 |   }
294 |   
295 |   return(data)
296 | }
297 | 
298 | #' Remove previous scGSEA analysis
299 | #'
300 | #' Remove previous scGSEA analysis
301 | #' @param data list; GFICF object
302 | #' @export
303 | resetScGSEA <- function(data){
304 |   data$scgsea <- NULL
305 |   return(data)
306 | }
307 | 
308 | 


--------------------------------------------------------------------------------
/R/util.R:
--------------------------------------------------------------------------------
  1 | #' @import Matrix
  2 | #' 
  3 | scaleMatrix = function(x,rescale,centre)
  4 | {
  5 |   if (FALSE)
  6 |   {
  7 |     message("Rescaling..")
  8 |     bc_tot <- armaRowSum(x)
  9 |     median_tot <- stats::median(bc_tot)
 10 |     x <- base::sweep(x, 1, median_tot/bc_tot, '*')
 11 |     message("Rescaling Done!")
 12 |   }
 13 |   
 14 |   if (FALSE)
 15 |   {
 16 |     message("Centering data..")
 17 |     x <- base::sweep(x, 2, Matrix::colMeans(x), '-')
 18 |     x <- base::sweep(x, 2, base::apply(x, 2, sd), '/')
 19 |     message("Centering Done!")
 20 |   }
 21 |   return(x)
 22 | }
 23 | 
 24 | 
 25 | stime <- function() {
 26 |   format(Sys.time(), "%T")
 27 | }
 28 | 
 29 | # message with a time stamp
 30 | tsmessage <- function(..., domain = NULL, appendLF = TRUE, verbose = TRUE,time_stamp = TRUE) {
 31 |   if (verbose) {
 32 |     msg <- ""
 33 |     if (time_stamp) {
 34 |       msg <- paste0(stime(), " ")
 35 |     }
 36 |     message(msg, ..., domain = domain, appendLF = appendLF)
 37 |     utils::flush.console()
 38 |   }
 39 | }
 40 | 
 41 | #' Convert Ensamble IDs to Official Gene Symbols
 42 | #'
 43 | #' It uses biomart. If more the one gene is associated to the enamble, the first one retrived from
 44 | #' Biomart is used.
 45 | #' 
 46 | #' @param df data frame; Data frame containing the IDs to convert.
 47 | #' @param col characters; Name of column containing the ensamble ids.
 48 | #' @param organism characters; Organism of origin (i.e. human or mouse).
 49 | #' @param verbose boolean; Icrease verbosity.
 50 | #' @return The updated data frame with a new column called symb.
 51 | #'
 52 | #' @import AnnotationHub
 53 | #' @import fastmatch
 54 | #' 
 55 | #' @export
 56 | ensToSymbol = function(df,col,organism,verbose=T)
 57 | {
 58 |   organism = tolower(organism)
 59 |   organism = base::match.arg(arg = organism,choices = c("human","mouse"),several.ok = F)
 60 |   org.map = c("Homo Sapiens","Mus Musculus")
 61 |   names(org.map) = c("human","mouse")
 62 |   
 63 |   tsmessage("... Retrieving gene annotation from AnnotationHub()",verbose = verbose)
 64 |   ah <- AnnotationHub::AnnotationHub()
 65 |   ahDb <- AnnotationHub::query(ah,pattern = c(org.map[organism],"EnsDb"), ignore.case = TRUE)
 66 |   id <- tail(rownames(AnnotationHub::mcols(ahDb)),n=1)
 67 |   edb <- ah[[id]]
 68 |   ens.map <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT") & !gene_biotype%in%"LRG_gene")
 69 |   
 70 |   if(organism %in% "human")
 71 |   {
 72 |     tsmessage(".. Start converting human ensamble id to symbols",verbose = verbose)
 73 |     df$symb = NA
 74 |     df$symb = ens.map$gene_name[fastmatch::fmatch(df[,col],ens.map$gene_id)]
 75 |     tsmessage("Done!",verbose = verbose)
 76 |   }
 77 |   
 78 |   if (organism %in% "mouse")
 79 |   {
 80 |     tsmessage(".. Start converting mouse ensamble id to symbols",verbose = verbose)
 81 |     df$symb = NA
 82 |     df$symb = ens.map$gene_name[fastmatch::fmatch(df[,col],ens.map$gene_id)]
 83 |     tsmessage("Done!",verbose = verbose)
 84 |   }
 85 |   
 86 |   return(df)
 87 | }
 88 | 
 89 | #' Convert Official Gene Symbols to Ensamble IDs 
 90 | #'
 91 | #' It uses biomart. If more the one gene is associated to the enamble, the first one retrived from
 92 | #' Biomart is used.
 93 | #' 
 94 | #' @param df data frame; Data frame containing the IDs to convert.
 95 | #' @param col characters; Name of column containing the gene symbol.
 96 | #' @param organism characters; Organism of origin (i.e. human or mouse).
 97 | #' @param verbose boolean; Icrease verbosity.
 98 | #' @return The updated data frame with a new column called ens.
 99 | #'
100 | #' @import AnnotationHub
101 | #' @import fastmatch
102 | #' 
103 | #' @export
104 | symbolToEns = function(df,col,organism,verbose=T)
105 | {
106 |   organism = tolower(organism)
107 |   organism = base::match.arg(arg = organism,choices = c("human","mouse"),several.ok = F)
108 |   org.map = c("Homo Sapiens","Mus Musculus")
109 |   names(org.map) = c("human","mouse")
110 |   
111 |   tsmessage("... Retrieving gene annotation from AnnotationHub()",verbose = verbose)
112 |   ah <- AnnotationHub::AnnotationHub()
113 |   ahDb <- AnnotationHub::query(ah,pattern = c(org.map[organism],"EnsDb"), ignore.case = TRUE)
114 |   id <- tail(rownames(AnnotationHub::mcols(ahDb)),n=1)
115 |   edb <- ah[[id]]
116 |   ens.map <- subset(genes(edb,return.type = "data.frame"),seq_name%in%c(as.character(1:22),"X","Y","MT") & !gene_biotype%in%"LRG_gene")
117 |   
118 |   if(organism %in% "human")
119 |   {
120 |     tsmessage(".. Start converting human symbols to human ensamble id",verbose = verbose)
121 |     df$ens = NA
122 |     df$ens = ens.map$gene_id[fastmatch::fmatch(df[,col],ens.map$symbol)]
123 |     tsmessage("Done!",verbose = verbose)
124 |   }
125 |   
126 |   if (organism %in% "mouse")
127 |   {
128 |     tsmessage(".. Start converting human symbols to mouse ensamble id",verbose = verbose)
129 |     df$ens = NA
130 |     df$ens = ens.map$gene_id[fastmatch::fmatch(df[,col],ens.map$symbol)]
131 |     tsmessage("Done!",verbose = verbose)
132 |   }
133 |   
134 |   return(df)
135 | }
136 | 
137 | 
138 | # Rcpp progress bar style
139 | progress_for <- function(n, tot,display) {
140 |   if (display) {
141 |     message("0%   10   20   30   40   50   60   70   80   90   100%")
142 |     message("[----|----|----|----|----|----|----|----|----|----|")
143 |     # n:tot = nstars:50 -> nstars = (n*50)/tot
144 |     nstars = floor((n*50)/tot)
145 |     if(nstars>0)
146 |       for (i in 1:nstars) {
147 |         message("*", appendLF = FALSE)
148 |         utils::flush.console()
149 |       }
150 |     message("|")
151 |   }
152 | }
153 | 
154 | # Filter cells with a cell2location style
155 | # https://cell2location.readthedocs.io/en/latest/cell2location.utils.filtering.html
156 | filter_genes_cell2loc_style = function(data,cell_count_cutoff=5,cell_percentage_cutoff2=0.03,nonz_mean_cutoff=1.12)
157 | {
158 |   nt = ifelse(detectCores()>1,detectCores()-1,1)
159 |   data = data[armaRowSum(data)>0,]
160 |   csums = armaRowSum(data!=0)
161 |   gene_to_remove = csums <= cell_count_cutoff |  csums/ncol(data) <= cell_percentage_cutoff2
162 |   gene_to_remove = gene_to_remove & armaColMeans(t(data),nt,FALSE)$mu1 <= nonz_mean_cutoff
163 |   data = data[!gene_to_remove,]
164 |   return(data)
165 | }
166 | 
167 | #' Load in data from 10X
168 | #'
169 | #' Enables easy loading of sparse data matrices provided by 10X genomics.
170 | #'
171 | #' @param data.dir Directory containing the matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv
172 | #' files provided by 10X. A vector or named vector can be given in order to load
173 | #' several data directories. If a named vector is given, the cell barcode names
174 | #' will be prefixed with the name.
175 | #' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2
176 | #' @param cell.column Specify which column of barcodes.tsv to use for cell names; default is 1
177 | #' @param unique.features Make feature names unique (default TRUE)
178 | #' @param strip.suffix Remove trailing "-1" if present in all cell barcodes.
179 | #'
180 | #' @return If features.csv indicates the data has multiple data types, a list
181 | #'   containing a sparse matrix of the data from each type will be returned.
182 | #'   Otherwise a sparse matrix containing the expression data will be returned.
183 | #'
184 | #' @importFrom Matrix readMM
185 | #' @importFrom utils read.delim
186 | #'
187 | #' @export
188 | #'
189 | #' @examples
190 | #' \dontrun{
191 | #' # For output from CellRanger < 3.0
192 | #' data_dir <- 'path/to/data/directory'
193 | #' list.files(data_dir) # Should show barcodes.tsv, genes.tsv, and matrix.mtx
194 | #' expression_matrix <- Read10X(data.dir = data_dir)
195 | #' seurat_object = CreateSeuratObject(counts = expression_matrix)
196 | #'
197 | #' # For output from CellRanger >= 3.0 with multiple data types
198 | #' data_dir <- 'path/to/data/directory'
199 | #' list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz
200 | #' data <- Read10X(data.dir = data_dir)
201 | #' seurat_object = CreateSeuratObject(counts = data$`Gene Expression`)
202 | #' seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`)
203 | #' }
204 | #'
205 | Read10X <- function(
206 |     data.dir,
207 |     gene.column = 1,
208 |     cell.column = 1,
209 |     unique.features = TRUE,
210 |     strip.suffix = FALSE
211 | ) {
212 |   full.data <- list()
213 |   for (i in seq_along(along.with = data.dir)) {
214 |     run <- data.dir[i]
215 |     if (!dir.exists(paths = run)) {
216 |       stop("Directory provided does not exist")
217 |     }
218 |     barcode.loc <- file.path(run, 'barcodes.tsv')
219 |     gene.loc <- file.path(run, 'genes.tsv')
220 |     features.loc <- file.path(run, 'features.tsv.gz')
221 |     matrix.loc <- file.path(run, 'matrix.mtx')
222 |     # Flag to indicate if this data is from CellRanger >= 3.0
223 |     pre_ver_3 <- file.exists(gene.loc)
224 |     if (!pre_ver_3) {
225 |       addgz <- function(s) {
226 |         return(paste0(s, ".gz"))
227 |       }
228 |       barcode.loc <- addgz(s = barcode.loc)
229 |       matrix.loc <- addgz(s = matrix.loc)
230 |     }
231 |     if (!file.exists(barcode.loc)) {
232 |       stop("Barcode file missing. Expecting ", basename(path = barcode.loc))
233 |     }
234 |     if (!pre_ver_3 && !file.exists(features.loc) ) {
235 |       stop("Gene name or features file missing. Expecting ", basename(path = features.loc))
236 |     }
237 |     if (!file.exists(matrix.loc)) {
238 |       stop("Expression matrix file missing. Expecting ", basename(path = matrix.loc))
239 |     }
240 |     data <- readMM(file = matrix.loc)
241 |     cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL)
242 |     if (ncol(x = cell.barcodes) > 1) {
243 |       cell.names <- cell.barcodes[, cell.column]
244 |     } else {
245 |       cell.names <- readLines(con = barcode.loc)
246 |     }
247 |     if (all(grepl(pattern = "\\-1$", x = cell.names)) & strip.suffix) {
248 |       cell.names <- as.vector(x = as.character(x = sapply(
249 |         X = cell.names,
250 |         FUN = ExtractField,
251 |         field = 1,
252 |         delim = "-"
253 |       )))
254 |     }
255 |     if (is.null(x = names(x = data.dir))) {
256 |       if (length(x = data.dir) < 2) {
257 |         colnames(x = data) <- cell.names
258 |       } else {
259 |         colnames(x = data) <- paste0(i, "_", cell.names)
260 |       }
261 |     } else {
262 |       colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names)
263 |     }
264 |     feature.names <- read.delim(
265 |       file = ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc),
266 |       header = FALSE,
267 |       stringsAsFactors = FALSE
268 |     )
269 |     if (any(is.na(x = feature.names[, gene.column]))) {
270 |       warning(
271 |         'Some features names are NA. Replacing NA names with ID from the opposite column requested',
272 |         call. = FALSE,
273 |         immediate. = TRUE
274 |       )
275 |       na.features <- which(x = is.na(x = feature.names[, gene.column]))
276 |       replacement.column <- ifelse(test = gene.column == 2, yes = 1, no = 2)
277 |       feature.names[na.features, gene.column] <- feature.names[na.features, replacement.column]
278 |     }
279 |     if (unique.features) {
280 |       fcols = ncol(x = feature.names)
281 |       if (fcols < gene.column) {
282 |         stop(paste0("gene.column was set to ", gene.column,
283 |                     " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
284 |                     " Try setting the gene.column argument to a value <= to ", fcols, "."))
285 |       }
286 |       rownames(x = data) <- make.unique(names = feature.names[, gene.column])
287 |     }
288 |     # In cell ranger 3.0, a third column specifying the type of data was added
289 |     # and we will return each type of data as a separate matrix
290 |     if (ncol(x = feature.names) > 2) {
291 |       data_types <- factor(x = feature.names$V3)
292 |       lvls <- levels(x = data_types)
293 |       if (length(x = lvls) > 1 && length(x = full.data) == 0) {
294 |         message("10X data contains more than one type and is being returned as a list containing matrices of each type.")
295 |       }
296 |       expr_name <- "Gene Expression"
297 |       if (expr_name %in% lvls) { # Return Gene Expression first
298 |         lvls <- c(expr_name, lvls[-which(x = lvls == expr_name)])
299 |       }
300 |       data <- lapply(
301 |         X = lvls,
302 |         FUN = function(l) {
303 |           return(data[data_types == l, , drop = FALSE])
304 |         }
305 |       )
306 |       names(x = data) <- lvls
307 |     } else{
308 |       data <- list(data)
309 |     }
310 |     full.data[[length(x = full.data) + 1]] <- data
311 |   }
312 |   # Combine all the data from different directories into one big matrix, note this
313 |   # assumes that all data directories essentially have the same features files
314 |   list_of_data <- list()
315 |   for (j in 1:length(x = full.data[[1]])) {
316 |     list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, FUN = `[[`, j))
317 |     # Fix for Issue #913
318 |     list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix")
319 |   }
320 |   names(x = list_of_data) <- names(x = full.data[[1]])
321 |   # If multiple features, will return a list, otherwise
322 |   # a matrix.
323 |   if (length(x = list_of_data) == 1) {
324 |     return(list_of_data[[1]])
325 |   } else {
326 |     return(list_of_data)
327 |   }
328 | }
329 | 
330 | detectCores <- function() {
331 |   .Call("detectCoresCpp")
332 | }
333 | 
334 | armaColSum <- function(M,nt=0,verbose=FALSE) {
335 |   res = NULL
336 |   c = class(M)
337 |   if (nt==0) {nt = ifelse(detectCores()>1,detectCores()-1,1)}
338 |   if(c[1]=="matrix") {
339 |     res = armaColSumFull(M,nt,verbose)
340 |   } else {
341 |     if (c[1]=="lgCMatrix" || c[1]=="lgTMatrix") {
342 |       res = Matrix::colSums(M)
343 |     } else { 
344 |       if(c[1]!="dgCMatrix") {M = as(M,"CsparseMatrix")}
345 |       res = armaColSumSparse(M,nt,verbose)
346 |     }
347 |   }
348 |   res = as.numeric(res)
349 |   names(res) = colnames(M)
350 |   return(res)
351 | }
352 | 
353 | armaRowSum <- function(M,nt=0,verbose=FALSE) {
354 |   return(armaColSum(t(M),nt,verbose))
355 | }
356 | 
357 | #' @param M A sparse matrix from the Matrix package.
358 | #' @param dir The directori in which to write the files. 
359 | Write10X = function(M,dir) {
360 |   if (!dir.exists(paths = dir)) {
361 |     dir.create(path = dir,showWarnings = F,recursive = T)
362 |   } 
363 |   writeMMgz(x = M,file = paste0(dir,"/matrix.mtx.gz"))
364 |   write.table(x = rownames(M),file = gzfile(paste0(dir,"/features.tsv.gz")),col.names = F,row.names = F)
365 |   write.table(x = colnames(M),file = gzfile(paste0(dir,"/barcodes.tsv.gz")),col.names = F,row.names = F)
366 | }
367 | 
368 | #' @param x A sparse matrix from the Matrix package.
369 | #' @param file A filename that ends in ".gz".
370 | writeMMgz <- function(x, file) {
371 |   mtype <- "real"
372 |   if (is(x, "ngCMatrix")) {
373 |     mtype <- "integer"
374 |   }
375 |   writeLines(
376 |     c(
377 |       sprintf("%%%%MatrixMarket matrix coordinate %s general", mtype),
378 |       sprintf("%s %s %s", x@Dim[1], x@Dim[2], length(x@x))
379 |     ),
380 |     gzfile(file)
381 |   )
382 |   data.table::fwrite(
383 |     x = summary(x),
384 |     file = file,
385 |     append = TRUE,
386 |     sep = " ",
387 |     row.names = FALSE,
388 |     col.names = FALSE
389 |   )
390 | }
391 | 


--------------------------------------------------------------------------------
/R/visualization.R:
--------------------------------------------------------------------------------
  1 | #' Plot cells in the ebedded space
  2 | #'
  3 | #' Plot cells in the bidimensional space and color it according to a specific parameter.
  4 | #' 
  5 | #' @param data list; GFICF object
  6 | #' @param colorBy characters; Color cells according to a column contained in data$embedded data frame. Default is NULL.
  7 | #' @param pointSize integer; Size of the points in the plot. Default is 0.5.
  8 | #' @param pointShape integer; Shape of the points in the plot. Default is 46.
  9 | #' @return The updated gficf object.
 10 | #' @export
 11 | #' @import ggrepel
 12 | #' @import ggplot2
 13 | #' 
 14 | #' @export
 15 | plotCells = function(data,colorBy=NULL,pointSize=.5,pointShape=46)
 16 | {
 17 |   if (is.null(colorBy)) {return(ggplot(data = data$embedded) + geom_point(aes(x=X,y=Y),size=pointSize,color="blue") + theme_bw())}
 18 |   
 19 |   if (!colorBy%in%colnames(data$embedded)) {stop("colorBy parameter not found")}
 20 |   df = data$embedded
 21 |   u = unique(df[,colorBy])
 22 |   c = NULL
 23 |   for (i in u)
 24 |   {
 25 |     d = as.matrix(dist(df[df[,colorBy]%in%i,c(1,2)]))
 26 |     d = apply(d, 1, sum)
 27 |     ix = which.min(d)
 28 |     c = rbind(c,data.frame(cluster=i,xx=df[names(d[ix]),"X"],yy=df[names(d[ix]),"Y"],stringsAsFactors = F))
 29 |   }
 30 |   
 31 |   tmp = df[,colorBy]
 32 |   
 33 |   ggplot(data = data$embedded) + geom_point(aes(x=X,y=Y,color=tmp),size=pointSize,shape=pointShape) + theme_bw() + geom_text_repel(data = c,aes(x=xx,y=yy,label=cluster),min.segment.length = 0) + geom_point(data = c,aes(x=xx,y=yy),size=2) + theme(legend.position = "none")
 34 | }
 35 | 
 36 | #' Plot gene expression across cells
 37 | #'
 38 | #' Plot the expression of a group of genes across cells.
 39 | #' 
 40 | #' @param data list; GFICF object
 41 | #' @param genes characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix.
 42 | #' @param log2Expr boolean; Relative expression of a gene is computed on rescaled in log2 expression (default TRUE).
 43 | #' @param x Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL.
 44 | #' @param rescale boolean; Rescale expression between 0 and 1. Default is false.
 45 | #' @return A list of plots.
 46 | #' @import Matrix
 47 | #' @import ggplot2
 48 | #' 
 49 | #' @export
 50 | plotGenes = function(data,genes,log2Expr=T,x=NULL,rescale=F)
 51 | {
 52 |   if (is.null(data$embedded)) {stop("Please run reduction in the embedded space first!")}
 53 |   if (!is.null(x)) {data$rawCounts=x}
 54 |   if (is.null(data$rawCounts)) {stop("Raw or normalized counts absent.")}
 55 |   
 56 |   data$rawCounts = normCounts(data$rawCounts,doc_proportion_max = 2,
 57 |                               doc_proportion_min = 0,
 58 |                               normalizeCounts = !data$param$normalized & is.null(x),
 59 |                               verbose=T)
 60 |   
 61 |   genes = genes[genes%in%rownames(data$rawCounts)]
 62 |   
 63 |   if (length(genes)==0) {stop("Genes are absent in the Expression matrix")}
 64 |   
 65 |   l  = vector(mode = "list",length = length(genes))
 66 |   names(l) = genes
 67 |   for (i in genes)
 68 |   {
 69 |     if ("predicted" %in% colnames(data$embedded))
 70 |     {
 71 |       df = subset(data$embedded, predicted %in% "NO")
 72 |     } else {
 73 |       df = data$embedded
 74 |     }
 75 |     
 76 |     if(log2Expr)
 77 |     {
 78 |       df$expr = log2(data$rawCounts[i,rownames(df)]+1)
 79 |     } else {
 80 |       df$expr = data$rawCounts[i,rownames(df)]
 81 |     }
 82 |     
 83 |     if(rescale) {df$expr = df$expr/max(df$expr)}
 84 |     df = df[order(df$expr,decreasing = F),]
 85 |     l[[i]] = ggplot(data = df,aes(x=X,y=Y,color=expr)) + geom_point(size=.5,shape=19) + theme_bw() +  scale_color_gradient2(low = "gray",mid = "#2171b5",high = "#08306b",midpoint = .5) + ggtitle(i)
 86 |     
 87 |   }
 88 |   
 89 |   return(l)
 90 | }
 91 | 
 92 | #' Plot the expression of a gene across group of cells.
 93 | #'
 94 | #' Plot the expression of a gene across group of cells with violion plot.
 95 | #' 
 96 | #' @param data list; GFICF object
 97 | #' @param gene characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix.
 98 | #' @param ncol numeric; Number of columns of the final plot (defaul is 3).
 99 | #' @param x Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL.
100 | #' @return A list of plots.
101 | #' 
102 | #' @import fastmatch
103 | #' @import Matrix
104 | #' @import ggplot2
105 | #' @importFrom reshape2 melt
106 | #' 
107 | #' @export
108 | plotGeneViolin = function(data,gene,ncol=3,x=NULL)
109 | {
110 |   if (is.null(data$community)) {stop("Please run clustcells first!")}
111 |   if (!is.null(x)) {data$rawCounts=x}
112 |   if (is.null(data$rawCounts)) {stop("Raw or normalized counts absent.")}
113 |   
114 |   cpms = normCounts(data$rawCounts,doc_proportion_max = 2,
115 |                     doc_proportion_min = 0,
116 |                     normalizeCounts = !data$param$normalized & is.null(x),
117 |                     verbose=T)
118 |   
119 |   df = reshape2::melt(as.matrix(cpms[gene,]))
120 |   colnames(df) = c("ens","cell.id","value")
121 |   
122 |   if(!is.null(names(gene))) {
123 |     ix = is.na(names(gene)) | names(gene) %in% "" | is.null(names(gene))
124 |     if(sum(ix)>0) {names(gene)[ix] = gene[ix]}
125 |     if(length(unique(names(gene))) == length(gene)) {
126 |       df$ens = names(gene)[fastmatch::fmatch(df$ens,gene)]
127 |     }
128 |   }
129 |   
130 |   df$value = log2(df$value+1)
131 |   df$cluster = data$embedded$cluster[match(df$cell.id,rownames(data$embedded))]
132 |   df$cluster = factor(as.character(df$cluster),levels = as.character(1:length(unique(df$cluster))))
133 |   p = ggplot2::ggplot(data = df,ggplot2::aes(x=cluster,y=value)) + 
134 |     ggplot2::geom_violin(scale = "width") + 
135 |     ggplot2::facet_wrap(~ens,scales = "free_y",ncol=ncol) + 
136 |     ggplot2::ylab("log2(CPM+1)") + ggplot2::xlab("") + ggplot2::theme_bw()
137 |   
138 |   return(p)
139 | }
140 | 
141 | #' Plot GSEA results
142 | #'
143 | #' Circle plot for gene set enrichement analysis results.
144 | #' 
145 | #' @param data list; GFICF object
146 | #' @param fdr number; FDR threshold to select significant pathways to plot.
147 | #' @param clusterRowCol boolean; if TRUE row and col of the plot are clustered. 
148 | #' @return plot from ggplot2 package.
149 | #' @import Matrix
150 | #' @import ggplot2
151 | #' @importFrom reshape2 melt
152 | #' 
153 | #' @export
154 | plotGSEA = function(data,fdr=.05,clusterRowCol=F)
155 | {
156 |   if (is.null(data$gsea)) {stop("Please run runGSEA function first")}
157 |   nes = data$gsea$nes
158 |   nes[data$gsea$es<=0 | data$gsea$fdr>=fdr] = 0
159 |   nes = nes[armaRowSum(nes)>0,]
160 |   
161 |   if (clusterRowCol)
162 |   {
163 |     h.c = hclust(dist(t(nes),method = "binary"))
164 |     h.p = hclust(dist(nes,method = "binary"))
165 |   }
166 |   
167 |   df = reshape2::melt(as.matrix(nes))
168 |   colnames(df) = c("pathway","cluster","nes")
169 |   
170 |   if (clusterRowCol)
171 |   {
172 |     df$cluster = factor(as.character(df$cluster),levels = rev(h.c$labels[h.c$order]))
173 |     df$pathway = factor(as.character(df$pathway),levels = rev(h.p$labels[h.p$order]))
174 |   } else {
175 |     df$cluster = factor(as.character(df$cluster),levels = as.character(1:length(unique(data$embedded$cluster))))
176 |   }
177 |   
178 |   ggplot(data = df,aes(x=pathway,y=cluster)) + geom_point(aes(size=nes)) + scale_size_continuous(range = c(0,7)) + theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + xlab("") + ylab("Cluster name")
179 | }
180 | 
181 | #' Plot GSEA results
182 | #'
183 | #' Plot GSEA values on top of UMAP/TSNE coordinates.
184 | #' 
185 | #' @param data list; GFICF object
186 | #' @param pathwayName characters; Name of the pathway to plot.
187 | #' @param fdr number; FDR threshold to select significant pathways to plot.
188 | #' @return plot from ggplot2 package.
189 | #' @import Matrix
190 | #' @import ggplot2
191 | #' 
192 | #' @export
193 | plotPathway = function(data,pathwayName,fdr=.05)
194 | {
195 |   if (is.null(data$gsea)) {stop("Please run runGSEA function first")}
196 |   nes = data$gsea$nes
197 |   nes[data$gsea$es<=0 | data$gsea$fdr>=fdr] = 0 
198 |   nes = nes[armaRowSum(nes)>0,]
199 |   nes = nes[pathwayName,]
200 |   df = data$embedded
201 |   df$NES = nes[match(df$cluster,names(nes))]
202 |   ggplot(data = df,aes(x=X,y=Y)) + geom_point(aes(color=NES),shape=20) + theme_bw() + scale_color_gradient(low = "gray",high = "red")
203 | }
204 | 
205 | #' Plot GSEA results
206 | #'
207 | #' Circle plot for gene set enrichement analysis results.
208 | #' 
209 | #' @param data list; GFICF object
210 | #' @param fdr number; FDR threshold to select significant pathways to plot.
211 | #' @param clusterRowCol boolean; if TRUE row and col of the plot are clustered.
212 | #' @param logFCth number; LogFC threshold to select pathways to plot. 
213 | #' @return plot from ggplot2 package.
214 | #' @import Matrix
215 | #' @import ggplot2
216 | #' @importFrom reshape2 melt acast
217 | #' 
218 | #' @export
219 | plotGSVA = function(data,fdr=.05,clusterRowCol=T,logFCth=0)
220 | {
221 |   if (is.null(data$gsva)) {stop("Please run runGSEA function first")}
222 |   M = reshape2::acast(subset(data$gsva$DEpathways,adj.P.Val<fdr & abs(logFC)>logFCth),pathway~cluster,fill = 0,value.var = "logFC")
223 |   
224 |   df = reshape2::melt(M)
225 |   if(clusterRowCol)
226 |   {
227 |     h.col  = hclust(dist(t(M)),method = "ward.D2")
228 |     h.row  = hclust(dist(M),method = "ward.D2")
229 |     df$Var1 = factor(as.character(df$Var1),levels = h.row$labels[h.row$order])
230 |     df$Var2 = factor(as.character(df$Var2),levels = h.col$labels[h.col$order])
231 |   }
232 |   
233 |   ggplot(data = df,aes(x=Var1,y=Var2,fill=value)) + geom_tile() + scale_fill_gradient2(low = "blue",high = "red") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + xlab("") + ylab("")
234 |   
235 | }
236 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onAttach <- function(libname, pkgname) {
2 |   require(Matrix)
3 |   packageStartupMessage(packageVersion("gficf"))
4 |   invisible()
5 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # gficf v2 - Single-cell gene set enrichment analysis
 2 | 
 3 | Details on ***scGSEA and scMAP*** implemented into the version 2 of GFICF package can be found in the NAR genomics and bioinformatics manuscript [Franchini et al. 2023](https://doi.org/10.1093/nargab/lqad024)  
 4 | 
 5 | GFICF v1 manuscript can be found here at [Gambardella et al. 2019](https://www.frontiersin.org/articles/10.3389/fgene.2019.00734/abstract).  
 6 | 
 7 | The package also includes [Phenograph](https://biorxiv.org/cgi/content/short/2022.10.24.513476v1)
 8 | [Louvain method](https://sites.google.com/site/findcommunities/)
 9 | clustering using [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy) library
10 | from [uwot](https://github.com/jlmelville/uwot) and a naive but fast parallel implementation
11 | of Jaccard Coefficient estimation using [RcppParallel](https://cran.r-project.org/package=RcppParallel).
12 | The package also include data reduction with either Principal Component Analisys (PCA) or
13 | non-negative matrix factorization [RcppML](https://github.com/zdebruine/RcppML) before to apply t-SNE or UMAP for single cell data visualization.   
14 | 
15 | **Examples & Functionality**:  
16 | * [Install GFICF package](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/installation.html)  
17 | * [Getting Started](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/index.html)  
18 | * [Single-cell Gene Set Enrichement Analysis (scGSEA)](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/scGSEA.html)  
19 | * [Single-cell Mapper (scMAP)](https://htmlpreview.github.io/?https://github.com/gambalab/gficf/blob/master/inst/doc/scMAP.html)  
20 | * Batch Effect Correction (Cooming soon) 
21 | 


--------------------------------------------------------------------------------
/data-raw/small_atlas.R:
--------------------------------------------------------------------------------
 1 | library(Matrix)
 2 | 
 3 | M.raw = readRDS(file = "~/work/current/BRCA_AIRC_paper/paper_git/RData/RAW.filtered.BRCA.UMI.counts.5K.umi.rds")
 4 | 
 5 | set.seed(0)
 6 | sample = sapply(strsplit(x = colnames(M.raw),split = "_",fixed = T), function(x) x[1])
 7 | u = unique(sample)
 8 | cells=NULL
 9 | for (i in 1:length(u)) {
10 |   if (u[i]=="HDQP1") {
11 |     cells = c(cells,sample(x=colnames(M.raw)[sample%in%u[i]],size=110))
12 |   } else {
13 |     cells = c(cells,sample(x=colnames(M.raw)[sample%in%u[i]],size=150))
14 |   }
15 | }
16 | 
17 | small_BC_atlas = M.raw[,cells]
18 | small_BC_atlas = small_BC_atlas[rowSums(small_BC_atlas)>0,]
19 | usethis::use_data(small_BC_atlas,overwrite = T)
20 | 


--------------------------------------------------------------------------------
/data-raw/test_BC_atlas.R:
--------------------------------------------------------------------------------
 1 | library(Matrix)
 2 | 
 3 | M.raw = readRDS(file = "~/work/current/BRCA_AIRC_paper/paper_git/RData/RAW.filtered.BRCA.UMI.counts.5K.umi.rds")
 4 | load("~/work/package/gficf/data/small_BC_atlas.rda")
 5 | M.raw = M.raw[,!colnames(M.raw)%in%colnames(small_BC_atlas)]
 6 | 
 7 | set.seed(0)
 8 | sample = sapply(strsplit(x = colnames(M.raw),split = "_",fixed = T), function(x) x[1])
 9 | u = unique(sample)
10 | cells=NULL
11 | for (i in 1:length(u)) {
12 |   cells = c(cells,sample(x=colnames(M.raw)[sample%in%u[i]],size=30))
13 | }
14 | 
15 | test_BC_atlas = M.raw[,cells]
16 | test_BC_atlas = test_BC_atlas[rowSums(test_BC_atlas)>0,]
17 | usethis::use_data(test_BC_atlas,overwrite = T)
18 | 


--------------------------------------------------------------------------------
/data/small_BC_atlas.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/data/small_BC_atlas.rda


--------------------------------------------------------------------------------
/data/test_BC_atlas.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/data/test_BC_atlas.rda


--------------------------------------------------------------------------------
/img/Cd34_expression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/Cd34_expression.png


--------------------------------------------------------------------------------
/img/Cd8a_expression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/Cd8a_expression.png


--------------------------------------------------------------------------------
/img/tabula_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/tabula_annotated.png


--------------------------------------------------------------------------------
/img/tabula_clusters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/img/tabula_clusters.png


--------------------------------------------------------------------------------
/inst/doc/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "GFICF Getting Started"
  3 | author:
  4 | - name: Gennaro Gambardella
  5 |   affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 
  6 | package: gficf
  7 | output:
  8 |   BiocStyle::html_document
  9 | vignette: |
 10 |   %\VignetteIndexEntry{GFICF Getting Started}
 11 |   %\VignetteEngine{knitr::rmarkdown}
 12 |   %\VignetteEncoding{UTF-8}
 13 | ---
 14 | 
 15 | <!-- Google tag (gtag.js) -->
 16 | <script async src="https://www.googletagmanager.com/gtag/js?id=G-XD8QKTWJ1D"></script>
 17 | <script>
 18 |   window.dataLayer = window.dataLayer || [];
 19 |   function gtag(){dataLayer.push(arguments);}
 20 |   gtag('js', new Date());
 21 | 
 22 |   gtag('config', 'G-XD8QKTWJ1D');
 23 | </script>
 24 | 
 25 | # Getting started with `gficf` package
 26 | 
 27 | Welcome to GF-ICF! `gficf` is an R package for normalization, visualization and analysis of of single-cell RNA sequencing data, based on a data transformation model called term frequency–inverse document frequency [(TF-IDF)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), which has been extensively used in the field of text mining. This vignette gives an overview and introduction to `gficf`’s functionality.  
 28 | 
 29 | For this tutorial, we will be analyzing the small_BC_atlas dataset included in the `gficf` package. This dataset is a small version (comprising only 4,760 cells) of the Breast cancer cell-line atlas we recently published ([Gambardella et al.](https://www.nature.com/articles/s41467-022-29358-6)).
 30 | 
 31 | # QC and selecting cells for further analysis
 32 | `gficf` allows you to automatically filter cells based on several criteria. These include
 33 | 
 34 | 1. The total number of UMI detected within a cell (this correlates strongly with unique genes)
 35 | 2. The number of unique genes detected in each cell.
 36 |     a. This because low-quality cells or empty droplets will often have very few genes
 37 |     b. While cell doublets or multiplets may exhibit an aberrant high gene count
 38 | 3. The percentage of reads that map to the mitochondrial genome
 39 |     a. This because low-quality / dying cells often exhibit extensive mitochondrial contamination
 40 | 
 41 | Both point 2 and 3 are done automatically by the tool filtering out cells with low gene ratio detection and high MT ratio. This is accomplished  using loess regression to fit the relationships between the total number of UMI in a cell (in log scale) and the ratio of detected genes over total UMI (or total MT counts over total UMI).
 42 | 
 43 | **IMPORTANT:** For now only datasets for which genes are reported as ENSEMBLE id are supported! 
 44 | 
 45 | ```{r qc,echo=TRUE,cache=TRUE,warning=FALSE,prompt=T,results='hold'}
 46 | require(gficf)
 47 | require(ggplot2)
 48 | 
 49 | # Load the small BC atlas
 50 | data("small_BC_atlas")
 51 | 
 52 | # Filter out cells
 53 | data = filterCells(counts = small_BC_atlas,
 54 |                    organism = "human",
 55 |                    plot = F,
 56 |                    verbose = T,
 57 |                    minUMI = 5000)
 58 | ```
 59 | 
 60 | # Data Normalization and Visualization
 61 | After cell QC, we can start to normalize raw UMI counts and filtering out lowly and rarely expressed genes. In particular here we discard genes expressed in less then 15 cells or in less then 5% of total cells but having an average expression in non-zero cells less then 1.12 UMI. We next perform PCA on the normalized data but using only over-dispersed (i.e.,highly variable) genes. Finally t-UMAP non-linear dimensional reduction is used to visualize the dataset (see Figure \@ref(fig:norm)).
 62 | 
 63 | ```{r norm, fig.cap="UMAP Plot. UMAP plot of the small_BC_atlas dataset after cells have been normalized with GF-ICF model.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=T,results='hide'}
 64 | # Data normalization and gene filtering
 65 | data <- gficf( QCdata = data,
 66 |                cell_count_cutoff = 15,
 67 |                nonz_mean_cutoff = 1.12,
 68 |                cell_percentage_cutoff2 = 0.05,
 69 |                normalize = T,
 70 |                verbose = T)
 71 | 
 72 | # Create PCA-subspace using overdispersed genes
 73 | data <- runPCA(data = data,dim = 10,use.odgenes = T)
 74 | 
 75 | # Create t-UMAP space
 76 | data <-runReduction(data = data,reduction = "tumap",nt = 2,verbose = T)
 77 | 
 78 | # Plot cells
 79 | p = plotCells(data = data,pointShape = 19) + 
 80 |     xlab("UMAP 1") + 
 81 |     ylab("UMAP 2")
 82 | 
 83 | plot(p)
 84 | ```
 85 | 
 86 | 
 87 | # Cell Clustering
 88 | In the package `gficf` the function `clustcells` implement the [Phenograph](https://www.cell.com/cell/fulltext/S0092-8674(15)00637-6) algorithm,
 89 | which is a clustering method designed for high-dimensional single-cell data analysis. It works by creating a graph ("network") representing phenotypic similarities between cells by calculating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities using the well known [Louvain method](https://sites.google.com/site/findcommunities/) or [Leiden algorithm](https://www.nature.com/articles/s41598-019-41695-z) in this graph. 
 90 | 
 91 | In this particular implementation of Phenograph we use approximate nearest neighbors found using [RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy)
 92 | libraries present in the `uwot` package. The supported distance metrics for KNN that can be set trough the `dist.method` parameter are:
 93 | 
 94 | * Euclidean (default)
 95 | * Cosine
 96 | * Manhattan
 97 | * Hamming
 98 | 
 99 | Please note that the Hamming support is a lot slower than the other metrics. It is not recomadded to use it if you have more than a few hundred features, and even then expect it to take several minutes during the index  building phase in situations where the Euclidean metric would take only a few seconds.  
100 | 
101 | After computation of Jaccard distances among cells (with custom [RcppParallel](https://cran.r-project.org/package=RcppParallel) implementation), the Louvain or Leiden community detection algorithms can be run to identify cell clusters. The supported communities detection algorithm that can be set trough the `community.algo` parameter are:
102 | 
103 | * Louvain classic (igraph implementation)
104 | * Louvian with modularity optimization
105 | * Louvain algorithm with multilevel refinement (default)
106 | * Leiden algorithm from [Traag et al. 2019](https://www.nature.com/articles/s41598-019-41695-z) (need to be installed first via `sudo -H pip install leidenalg igraph`)
107 | * Walktrap
108 | * Fastgreedy
109 | 
110 | ```{r clustering, fig.cap="Cell clusters. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to the cluster they belong.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=T,results='hold'}
111 | 
112 | # Identify clusters
113 | data <- clustcells(data = data,
114 |                    community.algo = "louvain 3",
115 |                    nt = 2,
116 |                    resolution = .25,
117 |                    verbose = T)
118 | 
119 | # Plot cells color coded acording to their cluster
120 | p = plotCells(data = data,colorBy = "cluster",pointShape = 19) + 
121 |               xlab("UMAP 1") +
122 |               ylab("UMAP 2")
123 | plot(p)
124 | ```
125 | 
126 | 
127 | 
128 | # Adding and plot cell metadata
129 | Cell metadata are stored into the data.frame named `embedded` of the `gficf` object (i.e., it can be accessed via `data$embedded`). Any column can be added and later plotted on top of the UMAP/t-SNE plot (Figure \@ref(fig:ccl)) thanks to the function `plotCells` and specifying in parameter `colorBy` the name of the column with which is intended to color-code cells. This data frame contains by default the UMAP/t-SNE coordinates of each cell. The row names of this data.frame correspond instead to the cell barcode.
130 | 
131 | ```{r ccl, fig.cap="UMAP plot. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to their cell-line of origin.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=T,results='hide'}
132 | 
133 | # Cell meta-data can stored in the data$embedded data.frame
134 | # Let' add the info about the cell-line, stripping this information
135 | # from the name of the cell and storing it into ccl column.
136 | data$embedded$ccl = sapply(
137 |                             strsplit(x = rownames(data$embedded),
138 |                                      split = "_",fixed = T)
139 |                             ,function(x) x[1]
140 |                           )
141 | 
142 | # We can now plot cell according to their cell-line of origin
143 | p = plotCells(data = data,colorBy = "ccl",pointShape = 19) + 
144 |     xlab("UMAP 1") + 
145 |     ylab("UMAP 2")
146 | 
147 | plot(p)
148 | ```
149 | 
150 | ```
151 | # show top 20 cells
152 | > head(data$embedded,20)
153 | ```
154 | 
155 | ```{r metadata,echo=FALSE,cache=TRUE}
156 | library(DT)
157 | df = head(data$embedded,20)
158 | df$X = round(df$X,2)
159 | df$Y = round(df$Y,2)
160 | DT::datatable(df)
161 | ```
162 | 
163 | # Save and load `gficf` object.
164 | `gficf` object need to be saved/loaded with its specific functions. This because annoy index used by `uwot` package for UMAP is stored in memory and cannot be saved with standard `saveRDS()` or `save()` functions.
165 | 
166 | ```
167 | # save GFICF object
168 | > saveGFICF(data,file = "/path/where/to/save/object.gficf")
169 | 
170 | # load GFICF object
171 | > data = loadGFICF(file = "/path/where/is/object.gficf")
172 | ```
173 | 
174 | # Tips and suggestions.
175 | ## How can I read 10X files produced by Cell Ranger?
176 | Cell Ranger produce 3 output files (i.e., barcodes, features and matrix) usually stored in the same folder. `gficf` includes the `Read10X()` function that reads the output file prduced by cellranger pipeline and return as output the raw UMI count matrix. The values in this matrix represent the number of molecules for each gene (row) that are detected in each cell (column).
177 | ```
178 | # Load 10x dataset
179 | > M = Read10X(data.dir = "/path/where/cellranger/files/are/")
180 | ```
181 | 
182 | ## Where are my normalized counts?
183 | `gficf` normalized data are stored in the gfifc matrix that can be accessd via `data$gficf`
184 | 
185 | ```
186 | # show 10 rows of gficf normalized data
187 | > head(data$gficf)
188 | ```
189 | 
190 | ## How can I run `gficf` without performing cell QC?
191 | In case you want to run gficf directley from raw UMI matrix this can be done using the parameter `M` of `gficf function` as showed below.
192 | ```
193 | # Data normalization and gene filtering
194 | # from RAW counts
195 | > data <- gficf( M = small_BC_atlas,
196 |                cell_count_cutoff = 15,
197 |                nonz_mean_cutoff = 1.12,
198 |                cell_percentage_cutoff2 = 0.05,
199 |                normalize = T,
200 |                verbose = T)
201 | ```
202 | 
203 | ## Where are QC metrics stored in `gficf`?
204 | The number of unique genes and total molecules calculated by `filterCells` function are in the `QC.metadata` data.frame. It can be accessed as below:
205 | 
206 | ```
207 | # Top ten row of QC metrics data.frame
208 | > head(data$QC.metadata)
209 | ```
210 | 
211 | ## Are there alternative to PCA?
212 | `gficf` package inlude non-negative matrix factorization (NMF) as alternative to PCA before to apply t-SNE or UMAP reduction. NMF can be executed with the following command.
213 | ```
214 | # Run NMF with all available cores minus one
215 | > data = runNMF(data = data,dim = 50,use.odgenes = T)
216 | ```
217 | 
218 | # Session info {.unnumbered}
219 | 
220 | ```{r sessionInfo, echo=FALSE}
221 | sessionInfo()
222 | ```
223 | 


--------------------------------------------------------------------------------
/inst/doc/index_files/figure-html/ccl-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/index_files/figure-html/ccl-1.png


--------------------------------------------------------------------------------
/inst/doc/index_files/figure-html/clustering-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/index_files/figure-html/clustering-1.png


--------------------------------------------------------------------------------
/inst/doc/index_files/figure-html/norm-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/index_files/figure-html/norm-1.png


--------------------------------------------------------------------------------
/inst/doc/installation.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "GFICF Installation"
 3 | author:
 4 | - name: Gennaro Gambardella
 5 |   affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 
 6 | package: gficf
 7 | output:
 8 |   BiocStyle::html_document
 9 | vignette: |
10 |   %\VignetteIndexEntry{GFICF Installation}
11 |   %\VignetteEngine{knitr::rmarkdown}
12 |   %\VignetteEncoding{UTF-8}
13 | ---
14 | 
15 | <!-- Google tag (gtag.js) -->
16 | <script async src="https://www.googletagmanager.com/gtag/js?id=G-XD8QKTWJ1D"></script>
17 | <script>
18 |   window.dataLayer = window.dataLayer || [];
19 |   function gtag(){dataLayer.push(arguments);}
20 |   gtag('js', new Date());
21 | 
22 |   gtag('config', 'G-XD8QKTWJ1D');
23 | </script>
24 | 
25 | # Install Dependancies (Officially supported only Linux)
26 | 
27 | `gficf` makes use of `Rcpp`, `RcppParallel` and `RcppGSL`. So you have to carry out
28 | a few extra steps before being able to build this package. The steps are reported below for each platform.
29 | 
30 | 
31 | ## Ubuntu/Debian
32 | 
33 | You need gsl dev library to successfully install RcppGSL library.
34 | On Ubuntu/Debian systems this can be accomplished by running from terminal the command 
35 | 
36 | ```bash
37 | sudo apt-get install libgsl-dev libcurl4-openssl-dev libssl-dev libxml2-dev
38 | ```
39 | 
40 | ## Mac OS X (Not Officially Supported)
41 | 
42 | 1.2.1 Open terminal and run `xcode-select --install` to install the command line developer tools.
43 | 
44 | 1.2.1. We than need to install gsl libraries. This can be done via [Homebrew](https://brew.sh/). So, still from terminal
45 | ```bash
46 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
47 | ```
48 | and than use `homebrew` to install gsl with following command
49 | ```bash
50 | brew install gsl
51 | ```
52 | 
53 | 
54 | ## Windows
55 | 
56 | 1.3.1 Skip this first step if you are using RStudio because it will ask you automatically. Otherwise install  [Rtools](https://cran.r-project.org/bin/windows/Rtools/) and ensure  `path\to\Rtools\bin` is on your path.   
57 | 
58 | 1.3.2 [Download gsl library for Windows](https://sourceforge.net/projects/gnu-scientific-library-windows/) from sourceforge and exctract it in `C:\` or where you want.   
59 | 
60 | 1.3.3 Open R/Rstudio and before to istall the package from github exec the following command in the R terminal.
61 | ```R
62 | # Change the path if you installed gsl librarie not in the default path.
63 | # Be sure to use the format '"path/to/gsl-xxx_mingw-xxx/gsl-xxx-static"'
64 | # In this way " characters will be mainteined and spaces 
65 | # in the path preserved if there are.
66 | 
67 | # For example for gsl-2.2.1 compiled with mingw-6.2.0:
68 | Sys.setenv(GSL_LIBS = '"C:/gsl-2.2.1_mingw-6.2.0/gsl-2.2.1-static"')
69 | ```
70 | 
71 | 
72 | # Install GF-ICF package
73 | 
74 | Exec in R terminal the following commands
75 | ```R
76 | # Install required bioconductor packages
77 | if (!requireNamespace("BiocManager", quietly = TRUE)) {
78 |   install.packages("BiocManager")
79 | }
80 | 
81 | BiocManager::install(setdiff(c("sva","edgeR", "fgsea"),rownames(installed.packages())),update = F)
82 | 
83 | # We rquire RcppML package from github (not the cran version)
84 | if("RcppML" %in% rownames(installed.packages())) {remove.packages("RcppML")}
85 | devtools::install_github("zdebruine/RcppML")
86 | 
87 | # Install gficf from github
88 | if(!require(devtools)){ install.packages("devtools")}
89 | devtools::install_github("gambalab/gficf")
90 | ```
91 | 


--------------------------------------------------------------------------------
/inst/doc/scGSEA.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Single-cell Gene Set Enrichement Analysis"
  3 | author:
  4 | - name: Gennaro Gambardella
  5 |   affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 
  6 | package: gficf
  7 | output:
  8 |   BiocStyle::html_document
  9 | vignette: |
 10 |   %\VignetteIndexEntry{Single-cell Gene Set Enrichement Analysis}
 11 |   %\VignetteEngine{knitr::rmarkdown}
 12 |   %\VignetteEncoding{UTF-8}
 13 | ---
 14 | 
 15 | <!-- Google tag (gtag.js) -->
 16 | <script async src="https://www.googletagmanager.com/gtag/js?id=G-XD8QKTWJ1D"></script>
 17 | <script>
 18 |   window.dataLayer = window.dataLayer || [];
 19 |   function gtag(){dataLayer.push(arguments);}
 20 |   gtag('js', new Date());
 21 | 
 22 |   gtag('config', 'G-XD8QKTWJ1D');
 23 | </script>
 24 | 
 25 | # Introduction {.unnumbered}
 26 | 
 27 | single-cell Gene Set Enrichment Analysis (scGSEA) is a bioinformatic method that could measure the activity of an a priori defined collection of gene sets (i.e., pathways) at the single cell resolution. It that takes advantage of the informative biological signals spreading across the latent factors of gene expression values obtained from non-negative matrix factorization. The scGSEA method starts from a set of single-cell expression profiles and a collection of gene sets and scores their cumulative expression (i.e., pathway activity) in each of the profiled cells (see manucript for details).
 28 | 
 29 | # Data Normalization and Visualization
 30 | For this tutorial, we will be using as a reference atlas the `small_BC_atlas dataset` included in the `gficf` package (Figure \@ref(fig:atlas)). This dataset is a small version (comprising only 4,760 cells) of the Breast cancer cell-line atlas we recently published ([Gambardella et al.](https://www.nature.com/articles/s41467-022-29358-6)).
 31 | 
 32 | ```{R atlas, fig.cap="UMAP Plot. UMAP plot of the small_BC_atlas dataset after cells have been normalized with GF-ICF model. Cell are color coded according to thei cell line of origin.",echo=TRUE,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE,results='hide'}
 33 | require(gficf)
 34 | require(ggplot2)
 35 | 
 36 | # Load the RAW UMI count matrix of small bc atlas
 37 | data("small_BC_atlas")
 38 | 
 39 | # Data normalization and gene filtering
 40 | data <- gficf( M = small_BC_atlas)
 41 | 
 42 | # Create PCA-subspace using overdispersed genes
 43 | data <- runPCA(data = data,dim = 10,use.odgenes = T)
 44 | 
 45 | # Create t-UMAP space
 46 | data <-runReduction(data = data,reduction = "umap",nt = 2,verbose = T,n_neighbors=150)
 47 | 
 48 | # Cell meta-data can stored in the data$embedded data.frame
 49 | # Let' add the info about the cell-line, stripping this information
 50 | # from the name of the cell and storing it into ccl column.
 51 | data$embedded$ccl = sapply(
 52 |                             strsplit(x = rownames(data$embedded),
 53 |                                      split = "_",fixed = T)
 54 |                             ,function(x) x[1]
 55 |                           )
 56 | 
 57 | 
 58 | # Plot cells
 59 | p = plotCells(data = data,pointShape = 19,colorBy = "ccl") + 
 60 |     xlab("UMAP 1") + 
 61 |     ylab("UMAP 2")
 62 | 
 63 | plot(p)
 64 | ```
 65 | 
 66 | # Performing scGSEA
 67 | Single cell gene set enrichment analysis is performed by the function `runScGSEA()` of `gficf` package. All available gene sets from [msigdb database](http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp). The list of gene setes to use can can be specified trough the `category` parameter. Here we access to msigdb gene set collection through [`msigdbr`](https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html) package (see it for further details).
 68 | 
 69 | ```{r collecion,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE}
 70 | require(msigdbr)
 71 | 
 72 | # Show supported gene sets colections
 73 | print(msigdbr::msigdbr_collections(),n=30)
 74 | ```
 75 | ```{r scgsea, cache=TRUE, warning=FALSE,message=FALSE,prompt=FALSE,results='hide'}
 76 | # Run scGSEA using 50 hallmarks genes
 77 | data = runScGSEA(data = data,
 78 |                  geneID = "ensamble",
 79 |                  species = "human",
 80 |                  category = "H",
 81 |                  nmf.k = 100,
 82 |                  fdr.th = .1,
 83 |                  rescale = "none",
 84 |                  verbose = T)
 85 | ```
 86 | 
 87 | # Cluster cells by pathway's activity levels
 88 | Now that we have reconstructed pathway's activity at single cell level we can try to cluster cell according to these values. To transform cells into a graph `gficf` package uses UMAP (i.e., the fuzzy graph produced by umap) or the phenograph algorithm. More information about how UMAP constructs a high dimensional graph representation of the data can be found [HERE](https://pair-code.github.io/understanding-umap/). Phenograph is instead a clustering method designed for high-dimensional single-cell data analysis. It works by first creating a graph ("network") that represents phenotypic similarities among cells by calculating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities using the [Louvain method](https://sites.google.com/site/findcommunities/) in the reconstructed graph. Results are stored into the column `cluster.by.scGSEA` of the cell metadata data.frame `data$embedded`.  
 89 | 
 90 | ```{r clust, fig.cap="",echo=TRUE,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE,results='hide'}
 91 | # Cluster cells with phenograph method but using
 92 | # estimate cells pathway's activity levels
 93 | data = clustcellsBYscGSEA(data,
 94 |                           method = "fgraph",
 95 |                           pca = 10,
 96 |                           k = 10,
 97 |                           resolution = .05,
 98 |                           n.start = 10,
 99 |                           n.iter = 50,
100 |                           verbose = T)
101 | 
102 | # Plot clusters on top of UMAP representation
103 | p = plotCells(data = data,pointShape = 19,colorBy = "cluster.by.scGSEA") + 
104 |   xlab("UMAP 1") + 
105 |   ylab("UMAP 2")
106 | 
107 | plot(p)
108 | ```
109 | 
110 | # Tips and Suggestions
111 | ## Where are my inferred pathway activity levels?
112 | Reconstructed pathway's activity levels in each cell are stored in the matrix `data$scgsea$x`. In this matrix rows are cells while columns are pathways.
113 | 
114 | ```{r head,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE}
115 | head(data$scgsea$x[,1:2])
116 | ```
117 | 
118 | ## How many factors of NMF I have to use for scGSEA?
119 | scGSEA is a tool that leverages NMF expression latent factors to infer pathway activity at a single cell level. Thus, by design, it inherits both benefits and limitations of the NMF model. A well know limit of this model, like other matrix decomposition techniques, is the choice of the exact number of factors to use. We generally recommend using at least 100 NMF factors.
120 | 
121 | ## How can I use a custom gene set list for scGSEA?
122 | If you want to use a custom list of gene sets you can pass it trough the parameter `pathway.list` of `runScGSEA()` function. Name of each element of this list **must be unique** and must represents the name of the pathway.
123 | 
124 | ## Where is the cell graph produced by phenograph or UMAP?
125 | The cell graph produced by UMAP (or phenograph) algorithm is stored into the `data$scgsea$cell.graph` igraph object, while the identified cell communities into the column `cluster.by.scGSEA` of the cell metadata data.frame `data$embedded`. That graph can be then plotted and manipulated with any network package supporting igraph objects. Here, for example, we use [`netbiov`](https://www.bioconductor.org/packages/release/bioc/html/netbiov.html) package to visualize this network. Remember that in this weighted network the **edge weights represents similarities among cells** in terms of probability (for UMAP) or jaccard coefficients for phenograph (i.e., they are not distances). Thus, remember to transform these similarities into distances before to apply the minimum spanning tree (MST) algorithm or any other graph theory method that use distances.
126 | 
127 | ```{r stats,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE}
128 | require(igraph)
129 | 
130 | # Extract the graph and print info
131 | g = data$scgsea$cell.graph
132 | 
133 | # Transform similarities into distances
134 | # you have to use MST
135 | # E(g)$weight <- 1 - E(g)$weight
136 | 
137 | # simplify the network
138 | g = igraph::simplify(g,edge.attr.comb = "min")
139 | 
140 | # print the network stats
141 | summary(g)
142 | ```
143 | ```{r graph, fig.cap="Cell Network. The cell network reconstructed by UMAP using the reconstructed pathway's activity levels in each cell.",echo=TRUE,cache=TRUE,warning=FALSE,message=FALSE,prompt=FALSE,results='hide'}
144 | require(netbiov)
145 | 
146 | # Plot the network (bolder edges are MST edges)
147 | hc <- rgb(t(col2rgb(heat.colors(20)))/255,alpha=.2)
148 | cl <- rgb(r=0, b=.7, g=1, alpha=.05)
149 | xx <- plot.modules(g,
150 |                    color.random=TRUE,
151 |                    v.size=1,
152 |                    layout.function=layout.graphopt)
153 | 
154 | ```
155 | 
156 | # Session info {.unnumbered}
157 | 
158 | ```{r sessionInfo, echo=FALSE}
159 | sessionInfo()
160 | ```
161 | 


--------------------------------------------------------------------------------
/inst/doc/scGSEA_files/figure-html/atlas-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scGSEA_files/figure-html/atlas-1.png


--------------------------------------------------------------------------------
/inst/doc/scGSEA_files/figure-html/clust-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scGSEA_files/figure-html/clust-1.png


--------------------------------------------------------------------------------
/inst/doc/scGSEA_files/figure-html/graph-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scGSEA_files/figure-html/graph-1.png


--------------------------------------------------------------------------------
/inst/doc/scMAP.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Single-cell Mapper"
  3 | author:
  4 | - name: Gennaro Gambardella
  5 |   affiliation: TIGEM (Telethon Institute of Genetics and Medicine) 
  6 | package: gficf
  7 | output:
  8 |   BiocStyle::html_document
  9 | vignette: |
 10 |   %\VignetteIndexEntry{Single-cell Mapper}
 11 |   %\VignetteEngine{knitr::rmarkdown}
 12 |   %\VignetteEncoding{UTF-8}
 13 | ---
 14 | 
 15 | <!-- Google tag (gtag.js) -->
 16 | <script async src="https://www.googletagmanager.com/gtag/js?id=G-XD8QKTWJ1D"></script>
 17 | <script>
 18 |   window.dataLayer = window.dataLayer || [];
 19 |   function gtag(){dataLayer.push(arguments);}
 20 |   gtag('js', new Date());
 21 | 
 22 |   gtag('config', 'G-XD8QKTWJ1D');
 23 | </script>
 24 | 
 25 | # Introduction {.unnumbered}
 26 | 
 27 | Single-cell Mapper (scMAP) is a transfer learning algorithm that combines text mining data transformation and a k-nearest neighbours’ (KNN) classifier to map a query set of single-cell transcriptional profiles on top of a reference atlas. scMAP consists of three main steps:  
 28 | 
 29 | 1. In the first step query cell profiles are normalized with the GF-ICF  method but using the ICF weights learned by using the reference atlas;  
 30 | 
 31 | 2.  In the second step,  normalized cell profiles are first projected into the NMF (or PC)  sub-space of the reference atlas and then mapped into its UMAP embedding space;  
 32 | 
 33 | 3. Finally, the KNN algorithm is used to contextualize mapped cells and annotate them.  
 34 | 
 35 | Please read the manuscript for additional details and estimated performances of the method.
 36 | 
 37 | # Building the reference atlas
 38 | For this tutorial, we will be using as a reference atlas the `small_BC_atlas dataset` included in the `gficf` package (Figure \@ref(fig:atlas)). This dataset is a small version (comprising only 4,760 cells) of the Breast cancer cell-line atlas we recently published ([Gambardella et al.](https://www.nature.com/articles/s41467-022-29358-6)). 
 39 | 
 40 | ```{r atlas, fig.cap="UMAP of referece BC Atlas. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to their cell-line of origin.",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=FALSE,results='hide'}
 41 | require(gficf)
 42 | require(ggplot2)
 43 | 
 44 | # Step 1. Build the reference atlas
 45 | # Load the RAW UMI count matrix on which to build the reference atlas
 46 | data("small_BC_atlas")
 47 | 
 48 | # 1.1. Normalization and gene filtering
 49 | data <- gficf( M = small_BC_atlas,
 50 |                cell_count_cutoff = 15,
 51 |                nonz_mean_cutoff = 1.12,
 52 |                cell_percentage_cutoff2 = 0.05,
 53 |                normalize = T,
 54 |                verbose = T)
 55 | 
 56 | # 1.2 Create NMF (or PCA) subspace 
 57 | # using all genes (usually improves performances)
 58 | data <- runNMF(data = data,dim = 50)
 59 | 
 60 | # 1.3 Create t-UMAP space
 61 | data <-runReduction(data = data,reduction = "umap",nt = 2,verbose = T)
 62 | 
 63 | # 1.4 Let's add info about the cell-line of origin
 64 | # Cell meta-data can stored in the data$embedded data.frame
 65 | # Let' add the info about the cell-line, stripping this information
 66 | # from the name of the cell and storing it into ccl column.
 67 | data$embedded$ccl = sapply(
 68 |                             strsplit(x = rownames(data$embedded),
 69 |                                      split = "_",fixed = T)
 70 |                             ,function(x) x[1]
 71 |                           )
 72 | 
 73 | # Plot cells by cell line of origin
 74 | p = plotCells(data = data,colorBy = "ccl",pointShape = 19) + 
 75 |     xlab("UMAP 1") + 
 76 |     ylab("UMAP 2")
 77 | 
 78 | plot(p)
 79 | ```
 80 | 
 81 | # How to map new cells into a reference atlas
 82 | 
 83 | After we have built our reference atlas (Figure \@ref(fig:atlas)), we use the 930 cells available into the `test_BC_atlas` dataset to test our mapping algorithm (Figure \@ref(fig:map)). Mapping of new cells into the embedded space of the reference atlas is performed by the `scMAP()` function. Metadata and coordinate of new mapped cells are stored into `data$embedded.predicted` data.frame.
 84 | 
 85 | ```{r map, fig.cap="UMAP of referece BC Atlas and mapped cells. UMAP plot of the small_BC_atlas dataset where cells are color-coded according to their cell-line of origin. Black points are the 930 mapped cells contained in the test_BC_atlas dataset",echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=FALSE,results='hide'}
 86 | 
 87 | # Step 2. Map new cells into reference atlas.
 88 | # 2.1 Load the new cells to map
 89 | data("test_BC_atlas")
 90 | 
 91 | # 2.2 Cell mapping
 92 | data = scMAP(data = data, x = test_BC_atlas,nt = 2,normalize = T,verbose = T)
 93 | 
 94 | # 2.3 Plot mapped cells
 95 | p = ggplot() + 
 96 |     geom_point(data = data$embedded,aes(x=X,y=Y,color=ccl),
 97 |                shape=19,size=.5) + 
 98 |     geom_point(data = data$embedded.predicted,aes(x=X,y=Y),
 99 |                color="black",shape=19,size=.2) + 
100 |     theme_bw() +
101 |     xlab("UMAP 1") + 
102 |     ylab("UMAP 2") +
103 |     theme(legend.position = "none")
104 | 
105 | plot(p)
106 | ```
107 | 
108 | # How to contextualize mapped cells
109 | 
110 | Finally we can use KNN to classify and thus contextualize mapped cells into the reference atlas. In this example we try to infer cell-line of origin of each mapped cell for winch. After the mapping we also compute the classification accuracy that is for this example of 93.23%. Specifically, mapping task can be performed with the function `classify.cells()` of `gficf` package. In this function the class of reference cells must be specified by the `classes` parameter. Here we use as class of each cell its cell-line of origin. Classification results are stored into `data$embedded.predicted` data.frame where the column `predicted.class` reports the predicted class and the column `class.prob` its predicted probability.   
111 | 
112 | The `classify.cells()` function is built on top of [`KernelKnn`](https://github.com/mlampros/KernelKnn) package and thus implement several measures to compute the distances among cells that can be specified trough the `knn_method` parameter. At the same time the parameter `knn_weights_fun` can be used to specify the kernel function to use for cell classification. Default is `knn_weights_fun = NULL` that correspond the to [unweighted KNN algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm). For further details on possible kernel functions to use, plese see this [blog-post](http://mlampros.github.io/2016/07/10/KernelKnn/) of original `KernelKnn` package.
113 | 
114 | ```{r classify, echo=TRUE,cache=TRUE,warning=FALSE,message=F,prompt=FALSE}
115 | 
116 | # Step 3. Cell classification with KNN.
117 | data = classify.cells(data = data,
118 |                       classes = data$embedded$ccl,
119 |                       k = 11,
120 |                       knn_method = "manhattan",
121 |                       knn_weights_fun = NULL)
122 | 
123 | # show top ten classified cells
124 | head(data$embedded.predicted)
125 | ```
126 | 
127 | ```{r performance, echo=TRUE,cache=TRUE,warning=FALSE,prompt=FALSE}
128 | 
129 | # Strip from cell name the cell-line
130 | # of origin of each mapped cell
131 | data$embedded.predicted$ccl = sapply(
132 |                             strsplit(x = rownames(data$embedded.predicted),
133 |                                      split = "_",fixed = T)
134 |                             ,function(x) x[1]
135 |                           )
136 | 
137 | # Now we can compute the classification accuracy
138 | acc = sum(data$embedded.predicted$predicted.class==data$embedded.predicted$ccl)/nrow(data$embedded.predicted)*100
139 | cat("Classification accuracy is",round(acc,2),"%")
140 | ```
141 | 
142 | 
143 | 
144 | # Session info {.unnumbered}
145 | 
146 | ```{r sessionInfo, echo=FALSE}
147 | sessionInfo()
148 | ```
149 | 


--------------------------------------------------------------------------------
/inst/doc/scMAP_files/figure-html/atlas-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scMAP_files/figure-html/atlas-1.png


--------------------------------------------------------------------------------
/inst/doc/scMAP_files/figure-html/map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gambalab/gficf/8d28f4f7bc421e8ffa2cf40e0ba99cebd0049c1b/inst/doc/scMAP_files/figure-html/map-1.png


--------------------------------------------------------------------------------
/man/Read10X.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util.R
 3 | \name{Read10X}
 4 | \alias{Read10X}
 5 | \title{Load in data from 10X}
 6 | \usage{
 7 | Read10X(
 8 |   data.dir,
 9 |   gene.column = 1,
10 |   cell.column = 1,
11 |   unique.features = TRUE,
12 |   strip.suffix = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{data.dir}{Directory containing the matrix.mtx, genes.tsv (or features.tsv), and barcodes.tsv
17 | files provided by 10X. A vector or named vector can be given in order to load
18 | several data directories. If a named vector is given, the cell barcode names
19 | will be prefixed with the name.}
20 | 
21 | \item{gene.column}{Specify which column of genes.tsv or features.tsv to use for gene names; default is 2}
22 | 
23 | \item{cell.column}{Specify which column of barcodes.tsv to use for cell names; default is 1}
24 | 
25 | \item{unique.features}{Make feature names unique (default TRUE)}
26 | 
27 | \item{strip.suffix}{Remove trailing "-1" if present in all cell barcodes.}
28 | }
29 | \value{
30 | If features.csv indicates the data has multiple data types, a list
31 |   containing a sparse matrix of the data from each type will be returned.
32 |   Otherwise a sparse matrix containing the expression data will be returned.
33 | }
34 | \description{
35 | Enables easy loading of sparse data matrices provided by 10X genomics.
36 | }
37 | \examples{
38 | \dontrun{
39 | # For output from CellRanger < 3.0
40 | data_dir <- 'path/to/data/directory'
41 | list.files(data_dir) # Should show barcodes.tsv, genes.tsv, and matrix.mtx
42 | expression_matrix <- Read10X(data.dir = data_dir)
43 | seurat_object = CreateSeuratObject(counts = expression_matrix)
44 | 
45 | # For output from CellRanger >= 3.0 with multiple data types
46 | data_dir <- 'path/to/data/directory'
47 | list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz
48 | data <- Read10X(data.dir = data_dir)
49 | seurat_object = CreateSeuratObject(counts = data$`Gene Expression`)
50 | seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`)
51 | }
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/man/classify.cells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellClassifier.R
 3 | \name{classify.cells}
 4 | \alias{classify.cells}
 5 | \title{Classify New Embedded Cells}
 6 | \usage{
 7 | classify.cells(
 8 |   data,
 9 |   classes,
10 |   k = 7,
11 |   seed = 18051982,
12 |   knn_method = "euclidean",
13 |   knn_weights_fun = NULL,
14 |   nt = 0
15 | )
16 | }
17 | \arguments{
18 | \item{data}{list; GFICF object}
19 | 
20 | \item{classes}{chareachters; Classes of already existing cells in the order of they are in colnames(data$gficf).}
21 | 
22 | \item{k}{integer; Number of K-nn to use for classification. Odd number less than 30 are preferred.}
23 | 
24 | \item{seed}{integer; Initial seed to use.}
25 | 
26 | \item{knn_method}{string; a string specifying the method. Valid methods are 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski' (by default the order 'p' of the minkowski parameter equals k), 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'.}
27 | 
28 | \item{knn_weights_fun}{value; there are various ways of specifying the kernel function. NULL value (default) correspond to unweighted KNN algorithm. See the details section of KernelKnn package for more values.}
29 | 
30 | \item{nt}{numeric; Number of thread to use (default is 0, i.e. all possible CPUs - 1)}
31 | }
32 | \value{
33 | A dataframe containing cell id and predicted classes.
34 | }
35 | \description{
36 | Classify new embedded cells using GF-ICF transformation and K-nn algorithm.
37 | Existing cells are used as training set.
38 | }
39 | 


--------------------------------------------------------------------------------
/man/clustcells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clustCells.R
 3 | \name{clustcells}
 4 | \alias{clustcells}
 5 | \title{PhenoGraph clustering}
 6 | \usage{
 7 | clustcells(
 8 |   data,
 9 |   from.embedded = F,
10 |   k = 15,
11 |   dist.method = "manhattan",
12 |   nt = 2,
13 |   community.algo = "louvain 3",
14 |   store.graph = T,
15 |   seed = 180582,
16 |   verbose = TRUE,
17 |   resolution = 0.25,
18 |   n.start = 50,
19 |   n.iter = 250
20 | )
21 | }
22 | \arguments{
23 | \item{data}{list; Input data (gficf object)}
24 | 
25 | \item{from.embedded}{logical; Use embeddedd (UMAP or tSNA) space for clustering cells. Best results are usually obtained not using the embedded space.}
26 | 
27 | \item{k}{integer; number of nearest neighbours (default:15)}
28 | 
29 | \item{dist.method}{character; Dist to use for K-nn. Type of distance metric to use to find nearest neighbors. One of:
30 | \itemize{
31 |   \item \code{"euclidean"} (the default)
32 |   \item \code{"cosine"}
33 |   \item \code{"manhattan"}
34 |   \item \code{"hamming"} (very slow)
35 | }}
36 | 
37 | \item{nt}{integer; Number of cpus to use for k-nn search}
38 | 
39 | \item{community.algo}{characthers; Community algorithm to use for clustering. Supported are:
40 | \itemize{
41 |   \item \code{"louvain"} (the default, the original louvain method)
42 |   \item \code{"louvain 2"} (louvain with modularity optimization from Seurat)
43 |   \item \code{"louvain 3"} (Louvain algorithm with multilevel refinement from Seurat)
44 |   \item \code{"leiden"} (Leiden algorithm see Traag et al. 2019)
45 |   \item \code{"walktrap"}
46 |   \item \code{"fastgreedy"}
47 | }}
48 | 
49 | \item{store.graph}{logical; Store produced phenograph in the gficf object}
50 | 
51 | \item{seed}{integer; Seed to use for replication.}
52 | 
53 | \item{verbose}{logical; Increase verbosity.}
54 | 
55 | \item{resolution}{Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities (used only for leiden and louvain 2 or 3 methods).}
56 | 
57 | \item{n.start}{Number of random starts (used only for louvain 2 or 3 methods).}
58 | 
59 | \item{n.iter}{Maximal number of iterations per random start (used only for louvain 2 or 3 methods).}
60 | }
61 | \value{
62 | the updated gficf object
63 | }
64 | \description{
65 | R implementation of the PhenoGraph algorithm
66 | }
67 | \details{
68 | A custom R implementation of the PhenoGraph (http://www.cell.com/cell/abstract/S0092-8674(15)00637-6) algorithm, 
69 | which is a clustering method designed for high-dimensional single-cell data analysis. It works by creating a graph ("network") representing 
70 | phenotypic similarities between cells by calclating the Jaccard coefficient between nearest-neighbor sets, and then identifying communities 
71 | using the well known Louvain method (https://sites.google.com/site/findcommunities/) in this graph. 
72 | 
73 | That version used PCA or LSA reduced meta-cells and multithreading annoy version for K-nn search (from uwot package).
74 | }
75 | 


--------------------------------------------------------------------------------
/man/clustcellsBYscGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clustCells.R
 3 | \name{clustcellsBYscGSEA}
 4 | \alias{clustcellsBYscGSEA}
 5 | \title{Cell clustering by pathway's activity}
 6 | \usage{
 7 | clustcellsBYscGSEA(
 8 |   data,
 9 |   method = "fgraph",
10 |   pca = NULL,
11 |   k = 10,
12 |   resolution = 0.25,
13 |   n.start = 50,
14 |   n.iter = 250,
15 |   nt = 0,
16 |   verbose = T,
17 |   seed = 180582
18 | )
19 | }
20 | \arguments{
21 | \item{data}{list; Input data (gficf object)}
22 | 
23 | \item{method}{character; Method used to produce the cell network, such as phenograph or fgraph.}
24 | 
25 | \item{pca}{numeric; If different from NULL data are reduced using pca component before to apply clustering algorithm.}
26 | 
27 | \item{k}{integer; number of nearest neighbours (default:10).}
28 | 
29 | \item{resolution}{Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.}
30 | 
31 | \item{n.start}{Number of random starts.}
32 | 
33 | \item{n.iter}{Maximal number of iterations per random start.}
34 | 
35 | \item{nt}{integer; Number of cpus to use for k-nn search. If zero all cpu are used.}
36 | 
37 | \item{verbose}{logical; Increase verbosity.}
38 | 
39 | \item{seed}{integer; Seed to use for replication.}
40 | }
41 | \value{
42 | the updated gficf object
43 | }
44 | \description{
45 | Cell clustering using pathway expression instead of gene expression
46 | }
47 | \details{
48 | Cells are clustered using the estimated pathway activity levels by scGSEA() function.
49 | Cells are clustered either by using PhenoGraph algorithm or hierarchical clustering. In the case of PhenoGraph method is used
50 | identified clusters with louvain method are stored into data$embedded$cluster.by.scGSEA while the prduced graph into data$scgsea$cell.graph. 
51 | In case hierarchical clusteringis used the resulting dendogram is stored into data$scgsea$h.dendo.
52 | }
53 | 


--------------------------------------------------------------------------------
/man/computePCADim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dimensinalityReduction.R
 3 | \name{computePCADim}
 4 | \alias{computePCADim}
 5 | \title{Number of features to use}
 6 | \usage{
 7 | computePCADim(data, seed = 180582, subsampling = F, plot = T)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{seed}{numeric; seed to use.}
13 | 
14 | \item{subsampling}{logical; Use only a subset of the data for the imputation of dimensions to use.}
15 | 
16 | \item{plot}{logical; Show eblow plot.}
17 | }
18 | \description{
19 | Compute the number of dimension to use for either PCA or LSA.
20 | }
21 | 


--------------------------------------------------------------------------------
/man/ensToSymbol.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util.R
 3 | \name{ensToSymbol}
 4 | \alias{ensToSymbol}
 5 | \title{Convert Ensamble IDs to Official Gene Symbols}
 6 | \usage{
 7 | ensToSymbol(df, col, organism, verbose = T)
 8 | }
 9 | \arguments{
10 | \item{df}{data frame; Data frame containing the IDs to convert.}
11 | 
12 | \item{col}{characters; Name of column containing the ensamble ids.}
13 | 
14 | \item{organism}{characters; Organism of origin (i.e. human or mouse).}
15 | 
16 | \item{verbose}{boolean; Icrease verbosity.}
17 | }
18 | \value{
19 | The updated data frame with a new column called symb.
20 | }
21 | \description{
22 | It uses biomart. If more the one gene is associated to the enamble, the first one retrived from
23 | Biomart is used.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/filterCells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellQC.R
 3 | \name{filterCells}
 4 | \alias{filterCells}
 5 | \title{Cell QC}
 6 | \usage{
 7 | filterCells(counts, organism, plot = F, verbose = T, minUMI = 800)
 8 | }
 9 | \arguments{
10 | \item{counts}{Matrix; Raw counts matrix}
11 | 
12 | \item{organism}{characters; Organism (supported human and mouse).}
13 | 
14 | \item{plot}{boolean; If regression plots must be showed.}
15 | 
16 | \item{verbose}{boolean; Increase verbosity.}
17 | 
18 | \item{minUMI}{numeric; Minimium number of UMI per cell (default 800).}
19 | }
20 | \value{
21 | The updated gficf object.
22 | }
23 | \description{
24 | Filter Cells with low gene ratio detection and high MT ratio.
25 | Loess and GAM regression are used to fit relationships between the number of UMI and either the ratio of detected genes or the MT ratio.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/findClusterMarkers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deGenes.R
 3 | \name{findClusterMarkers}
 4 | \alias{findClusterMarkers}
 5 | \title{Find Marker Genes from cell clusters.}
 6 | \usage{
 7 | findClusterMarkers(data, nt = 2, hvg = T, verbose = T)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{nt}{integer; Number of thread to use (default 2).}
13 | 
14 | \item{hvg}{boolean; Use only High Variable Genes (default is TRUE).}
15 | 
16 | \item{verbose}{boolean; Icrease verbosity.}
17 | }
18 | \value{
19 | The updated gficf object.
20 | }
21 | \description{
22 | Try to identify marker genes across clusters performing Mann-Whitney U test.
23 | DE genes are identified the expression in each cluster versus the all the other.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/findVarGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deGenes.R
 3 | \name{findVarGenes}
 4 | \alias{findVarGenes}
 5 | \title{Find high variable genes following the approach
 6 |  proposed by Chen et al. in BMC Genomics (2016)
 7 |  Code adapted from https://github.com/hillas/scVEGs}
 8 | \usage{
 9 | findVarGenes(data, fitMethod = "locfit", verbose = T)
10 | }
11 | \arguments{
12 | \item{data}{list; GFICF object}
13 | 
14 | \item{fitMethod}{charachter; Method to use to fit variance and mean expression relationship (loess or locfit).}
15 | 
16 | \item{verbose}{boolean; Increase verbosity.}
17 | }
18 | \description{
19 | Find high variable genes following the approach
20 |  proposed by Chen et al. in BMC Genomics (2016)
21 |  Code adapted from https://github.com/hillas/scVEGs
22 | }
23 | 


--------------------------------------------------------------------------------
/man/gficf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gficf.R
 3 | \name{gficf}
 4 | \alias{gficf}
 5 | \title{Gene Frequency - Inverse Cell Frequency (GF-ICF)}
 6 | \usage{
 7 | gficf(
 8 |   M = NULL,
 9 |   QCdata = NULL,
10 |   cell_count_cutoff = 5,
11 |   cell_percentage_cutoff2 = 0.03,
12 |   nonz_mean_cutoff = 1.12,
13 |   normalize = TRUE,
14 |   storeRaw = TRUE,
15 |   batches = NULL,
16 |   groups = NULL,
17 |   filterGenes = TRUE,
18 |   verbose = TRUE,
19 |   ...
20 | )
21 | }
22 | \arguments{
23 | \item{M}{Matrix; UMI cell count matrix}
24 | 
25 | \item{QCdata}{list; QC cell object.}
26 | 
27 | \item{cell_count_cutoff}{numeric; All genes detected in less than cell_count_cutoff cells will be excluded (default 5).}
28 | 
29 | \item{cell_percentage_cutoff2}{numeric; All genes detected in at least this percentage of cells will be included (default 0.03, i.e. 3 percent of cells).}
30 | 
31 | \item{nonz_mean_cutoff}{numeric genes detected in the number of cells between the above mentioned cutoffs are selected only when their average expression in non-zero cells is above this cutoff (default 1.12).}
32 | 
33 | \item{normalize}{logical; Rescale UMI counts before apply GFICF. Rescaling is done using EdgeR normalization.}
34 | 
35 | \item{storeRaw}{logical; Store UMI counts.}
36 | 
37 | \item{batches}{vector; Vector or factor for batch.}
38 | 
39 | \item{groups}{vector; Vector or factor for biological condition of interest.}
40 | 
41 | \item{filterGenes}{logical; Apply gene filter (default TRUE).}
42 | 
43 | \item{verbose}{boolean; Increase verbosity.}
44 | 
45 | \item{...}{Additional arguments to pass to ComBat_seq call.}
46 | }
47 | \value{
48 | The updated gficf object.
49 | }
50 | \description{
51 | R implementation of the GF-ICF
52 | Thanks to 3’-end scRNA-seq approaches, we can now have an accurate estimation of gene expression without having to account for gene length,
53 | thus the number of transcripts (i.e. UMI) associated to each gene, strictly reflects the frequency of a gene in a cell, exactly like a word in a document.
54 | GFICF (Gene Frequency - Inverse Cell Frequency) is analogous of TF-IDF scoring method as defined for tex mining With GFICF we consider a cell to be
55 | analogous to a document, genes analogous to words and gene counts to be analogous of the word’s occurrence in a document.
56 | }
57 | 


--------------------------------------------------------------------------------
/man/loadGFICF.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gficf.R
 3 | \name{loadGFICF}
 4 | \alias{loadGFICF}
 5 | \title{Restore GFICF object.}
 6 | \usage{
 7 | loadGFICF(file, verbose = T)
 8 | }
 9 | \arguments{
10 | \item{file}{name of the file where the object is stored.}
11 | }
12 | \description{
13 | Function to read a GFICF object from a file saved with \code{\link{saveGFICF}}.
14 | }
15 | \examples{
16 | gficf_file <- tempfile("gficf_data")
17 | 
18 | # restore
19 | data2 <- loadGFICF(file = gficf_file)
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/man/plotCells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plotCells}
 4 | \alias{plotCells}
 5 | \title{Plot cells in the ebedded space}
 6 | \usage{
 7 | plotCells(data, colorBy = NULL, pointSize = 0.5, pointShape = 46)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{colorBy}{characters; Color cells according to a column contained in data$embedded data frame. Default is NULL.}
13 | 
14 | \item{pointSize}{integer; Size of the points in the plot. Default is 0.5.}
15 | 
16 | \item{pointShape}{integer; Shape of the points in the plot. Default is 46.}
17 | }
18 | \value{
19 | The updated gficf object.
20 | }
21 | \description{
22 | Plot cells in the bidimensional space and color it according to a specific parameter.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/plotGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plotGSEA}
 4 | \alias{plotGSEA}
 5 | \title{Plot GSEA results}
 6 | \usage{
 7 | plotGSEA(data, fdr = 0.05, clusterRowCol = F)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{fdr}{number; FDR threshold to select significant pathways to plot.}
13 | 
14 | \item{clusterRowCol}{boolean; if TRUE row and col of the plot are clustered.}
15 | }
16 | \value{
17 | plot from ggplot2 package.
18 | }
19 | \description{
20 | Circle plot for gene set enrichement analysis results.
21 | }
22 | 


--------------------------------------------------------------------------------
/man/plotGSVA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plotGSVA}
 4 | \alias{plotGSVA}
 5 | \title{Plot GSEA results}
 6 | \usage{
 7 | plotGSVA(data, fdr = 0.05, clusterRowCol = T, logFCth = 0)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{fdr}{number; FDR threshold to select significant pathways to plot.}
13 | 
14 | \item{clusterRowCol}{boolean; if TRUE row and col of the plot are clustered.}
15 | 
16 | \item{logFCth}{number; LogFC threshold to select pathways to plot.}
17 | }
18 | \value{
19 | plot from ggplot2 package.
20 | }
21 | \description{
22 | Circle plot for gene set enrichement analysis results.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/plotGeneViolin.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plotGeneViolin}
 4 | \alias{plotGeneViolin}
 5 | \title{Plot the expression of a gene across group of cells.}
 6 | \usage{
 7 | plotGeneViolin(data, gene, ncol = 3, x = NULL)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{gene}{characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix.}
13 | 
14 | \item{ncol}{numeric; Number of columns of the final plot (defaul is 3).}
15 | 
16 | \item{x}{Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL.}
17 | }
18 | \value{
19 | A list of plots.
20 | }
21 | \description{
22 | Plot the expression of a gene across group of cells with violion plot.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/plotGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plotGenes}
 4 | \alias{plotGenes}
 5 | \title{Plot gene expression across cells}
 6 | \usage{
 7 | plotGenes(data, genes, log2Expr = T, x = NULL, rescale = F)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{genes}{characters; Id of genes to plot. It must correspond to the IDs on the rows of raw count matrix.}
13 | 
14 | \item{log2Expr}{boolean; Relative expression of a gene is computed on rescaled in log2 expression (default TRUE).}
15 | 
16 | \item{x}{Matrix; Custom normalized raw counts. If present will be used instead of the ones normalized by gficf. Default is NULL.}
17 | 
18 | \item{rescale}{boolean; Rescale expression between 0 and 1. Default is false.}
19 | }
20 | \value{
21 | A list of plots.
22 | }
23 | \description{
24 | Plot the expression of a group of genes across cells.
25 | }
26 | 


--------------------------------------------------------------------------------
/man/plotPathway.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualization.R
 3 | \name{plotPathway}
 4 | \alias{plotPathway}
 5 | \title{Plot GSEA results}
 6 | \usage{
 7 | plotPathway(data, pathwayName, fdr = 0.05)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{pathwayName}{characters; Name of the pathway to plot.}
13 | 
14 | \item{fdr}{number; FDR threshold to select significant pathways to plot.}
15 | }
16 | \value{
17 | plot from ggplot2 package.
18 | }
19 | \description{
20 | Plot GSEA values on top of UMAP/TSNE coordinates.
21 | }
22 | 


--------------------------------------------------------------------------------
/man/resetScGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pathwayAnalisys.R
 3 | \name{resetScGSEA}
 4 | \alias{resetScGSEA}
 5 | \title{Remove previous scGSEA analysis}
 6 | \usage{
 7 | resetScGSEA(data)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | }
12 | \description{
13 | Remove previous scGSEA analysis
14 | }
15 | 


--------------------------------------------------------------------------------
/man/runGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pathwayAnalisys.R
 3 | \name{runGSEA}
 4 | \alias{runGSEA}
 5 | \title{Gene Set Enrichement Analysi on GF-ICF}
 6 | \usage{
 7 | runGSEA(
 8 |   data,
 9 |   gmt.file,
10 |   nsim = 1000,
11 |   convertToEns = T,
12 |   convertHu2Mm = F,
13 |   nt = 2,
14 |   minSize = 15,
15 |   maxSize = Inf,
16 |   verbose = TRUE,
17 |   seed = 180582,
18 |   method = "GSEA"
19 | )
20 | }
21 | \arguments{
22 | \item{data}{list; GFICF object}
23 | 
24 | \item{gmt.file}{characters; Path to gmt file from MSigDB}
25 | 
26 | \item{nsim}{integer; number of simulation used to compute ES significance.}
27 | 
28 | \item{convertToEns}{boolean: Convert gene sets from gene symbols to Ensable id.}
29 | 
30 | \item{convertHu2Mm}{boolean: Convert gene sets from human symbols to Mouse Ensable id.}
31 | 
32 | \item{nt}{numeric; Number of cpu to use for the GSEA}
33 | 
34 | \item{minSize}{numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded.}
35 | 
36 | \item{maxSize}{numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded.}
37 | 
38 | \item{verbose}{boolean; Show the progress bar.}
39 | 
40 | \item{seed}{integer; Seed to use for random number generation.}
41 | 
42 | \item{method}{string; Method to use GSEA or GSVA. Default is GSEA.}
43 | }
44 | \value{
45 | The updated gficf object.
46 | }
47 | \description{
48 | Compute GSEA for each cluster across a set of input pathways.
49 | }
50 | 


--------------------------------------------------------------------------------
/man/runHarmony.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dimensinalityReduction.R
 3 | \name{runHarmony}
 4 | \alias{runHarmony}
 5 | \title{Number of features to use}
 6 | \usage{
 7 | runHarmony(data, metadata, var.to.use, verbose = T, ...)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{metadata}{dataframe; Either (1) Dataframe with variables to integrate or (2) vector with labels.}
13 | 
14 | \item{var.to.use}{character; If meta_data is dataframe, this defined which variable(s) to remove (character vector).}
15 | 
16 | \item{verbose}{boolean; Increase verbosity.}
17 | 
18 | \item{...}{Additional arguments to pass to HarmonyMatrix function.}
19 | }
20 | \description{
21 | Compute the number of dimension to use for either PCA or LSA.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/runNMF.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dimensinalityReduction.R
 3 | \name{runNMF}
 4 | \alias{runNMF}
 5 | \title{Non-Negative Matrix Factorization (NMF)}
 6 | \usage{
 7 | runNMF(
 8 |   data,
 9 |   dim = NULL,
10 |   seed = 180582,
11 |   use.odgenes = F,
12 |   n.odgenes = NULL,
13 |   plot.odgenes = F,
14 |   nt = 0,
15 |   ...
16 | )
17 | }
18 | \arguments{
19 | \item{data}{list; GFICF object}
20 | 
21 | \item{dim}{integer; Number of dimension which to reduce the dataset.}
22 | 
23 | \item{seed}{integer; Initial seed to use.}
24 | 
25 | \item{use.odgenes}{boolean; Use only significant overdispersed genes respect to ICF values.}
26 | 
27 | \item{n.odgenes}{integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000.}
28 | 
29 | \item{plot.odgenes}{boolean; Show significant overdispersed genes respect to ICF values.}
30 | 
31 | \item{nt}{numeric; Numbmber of thread to use (default is 0, i.e. all available CPU cores).}
32 | 
33 | \item{...}{Additional arguments to pass to nfm call (see ?RcppML::nmf).}
34 | 
35 | \item{centre}{logical; Centre gficf scores before applying reduction (increase separation).}
36 | 
37 | \item{randomized}{logical; Use randomized (faster) version for matrix decomposition (default is TRUE).}
38 | }
39 | \value{
40 | The updated gficf object.
41 | }
42 | \description{
43 | Reduce dimensionality of the single cell dataset using Non-Negative Matrix Factorization (NMF)
44 | }
45 | 


--------------------------------------------------------------------------------
/man/runPCA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dimensinalityReduction.R
 3 | \name{runPCA}
 4 | \alias{runPCA}
 5 | \title{Principal Component Analysis (PCA)}
 6 | \usage{
 7 | runPCA(
 8 |   data,
 9 |   dim = NULL,
10 |   var.scale = F,
11 |   centre = F,
12 |   seed = 180582,
13 |   use.odgenes = F,
14 |   n.odgenes = NULL,
15 |   plot.odgenes = F
16 | )
17 | }
18 | \arguments{
19 | \item{data}{list; GFICF object}
20 | 
21 | \item{dim}{integer; Number of dimension which to reduce the dataset.}
22 | 
23 | \item{centre}{logical; Centre gficf scores before applying reduction (increase separation).}
24 | 
25 | \item{seed}{integer; Initial seed to use.}
26 | 
27 | \item{use.odgenes}{boolean; Use only significant overdispersed genes respect to ICF values.}
28 | 
29 | \item{n.odgenes}{integer; Number of overdispersed genes to use. A good choise seems to be usually between 1000 and 3000.}
30 | 
31 | \item{plot.odgenes}{boolean; Show significant overdispersed genes respect to ICF values.}
32 | }
33 | \value{
34 | The updated gficf object.
35 | 
36 | The updated gficf object.
37 | }
38 | \description{
39 | Reduce dimensionality of the single cell dataset using Principal Component Analysis (PCA)
40 | }
41 | 


--------------------------------------------------------------------------------
/man/runReduction.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dimensinalityReduction.R
 3 | \name{runReduction}
 4 | \alias{runReduction}
 5 | \title{Dimensionality reduction}
 6 | \usage{
 7 | runReduction(
 8 |   data,
 9 |   reduction = "tumap",
10 |   nt = 2,
11 |   seed = 18051982,
12 |   verbose = T,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{data}{list; GFICF object}
18 | 
19 | \item{reduction}{characters; Reduction method to use. One of:
20 | \itemize{
21 |   \item \code{"tsne"}
22 |   \item \code{"umap"}
23 |   \item \code{"tumap"} (the default)
24 | }}
25 | 
26 | \item{nt}{integer; Number of thread to use (default 2).}
27 | 
28 | \item{seed}{integer; Initial seed to use.}
29 | 
30 | \item{verbose}{boolean; Icrease verbosity.}
31 | 
32 | \item{...}{Additional arguments to pass to Rtsne/umap/tumap call.}
33 | }
34 | \value{
35 | The updated gficf object.
36 | }
37 | \description{
38 | Run t-SNE or UMAP or t-UMAP dimensionality reduction on selected features from PCA or NMF.
39 | See ?umap or ?Rtsne for additional parameter to use.
40 | }
41 | 


--------------------------------------------------------------------------------
/man/runScGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pathwayAnalisys.R
 3 | \name{runScGSEA}
 4 | \alias{runScGSEA}
 5 | \title{Single cell Gene Set Enrichement Analysis on GF-ICF}
 6 | \usage{
 7 | runScGSEA(
 8 |   data,
 9 |   geneID,
10 |   species,
11 |   category,
12 |   subcategory = NULL,
13 |   pathway.list = NULL,
14 |   nsim = 10000,
15 |   nt = 0,
16 |   minSize = 15,
17 |   maxSize = Inf,
18 |   verbose = TRUE,
19 |   seed = 180582,
20 |   nmf.k = 100,
21 |   fdr.th = 0.05,
22 |   gp = 0,
23 |   rescale = "none",
24 |   normalization = "gficf"
25 | )
26 | }
27 | \arguments{
28 | \item{data}{list; GFICF object}
29 | 
30 | \item{geneID}{characters; The type of gene identifier to use, such as ensamble of symbol.}
31 | 
32 | \item{species}{characters; Species name, such as human or mouse.}
33 | 
34 | \item{category}{characters; MSigDB collection abbreviation, such as H or C1.}
35 | 
36 | \item{subcategory}{characters; MSigDB sub-collection abbreviation, such as CGP or BP.}
37 | 
38 | \item{pathway.list}{list; Custom list of pathways. Each element correspond to a pathway a and contains a vector of genes.}
39 | 
40 | \item{nsim}{integer; number of simulation used to compute ES significance.}
41 | 
42 | \item{nt}{numeric; Number of cpu to use for the GSEA and NMF. Default is 0 (i.e., all available cores minus one)}
43 | 
44 | \item{minSize}{numeric; Minimal size of a gene set to test (default 15). All pathways below the threshold are excluded.}
45 | 
46 | \item{maxSize}{numeric; Maximal size of a gene set to test (default Inf). All pathways above the threshold are excluded.}
47 | 
48 | \item{verbose}{boolean; Show the progress bar.}
49 | 
50 | \item{seed}{integer; Seed to use for random number generation.}
51 | 
52 | \item{nmf.k}{numeric; Rank of NMF.}
53 | 
54 | \item{fdr.th}{numeric; FDR threshold for GSEA.}
55 | 
56 | \item{rescale}{string; If different by none, pathway's activity scores are resealed as Z-score. Possible values are none, byGS or byCell. Default is none.}
57 | 
58 | \item{normalization;}{normalization to use before to apply NMF. Possible values are gficf or cpm. Default and highly raccomanded is gficf.}
59 | }
60 | \value{
61 | The updated gficf object.
62 | }
63 | \description{
64 | Compute GSEA for each cells across a set of input pathways by using NMF.
65 | }
66 | 


--------------------------------------------------------------------------------
/man/saveGFICF.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gficf.R
 3 | \name{saveGFICF}
 4 | \alias{saveGFICF}
 5 | \title{Save GFICF object.}
 6 | \usage{
 7 | saveGFICF(data, file, verbose = TRUE)
 8 | }
 9 | \arguments{
10 | \item{data}{a GFICF object create by \code{\link{gficf}}.}
11 | 
12 | \item{file}{name of the file where the model is to be saved.}
13 | }
14 | \description{
15 | Function to write a GFICF object to a file, preserving UMAP model.
16 | }
17 | \examples{
18 | # save
19 | gficf_file <- tempfile("gficf_test")
20 | saveGFICF(data, file = gficf_file)
21 | 
22 | # restore
23 | data2 <- loadGFICF(file = gficf_file)
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/man/scMAP.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cellClassifier.R
 3 | \name{scMAP}
 4 | \alias{scMAP}
 5 | \title{Embed new cells in an existing space}
 6 | \usage{
 7 | scMAP(data, x, nt = 2, seed = 18051982, normalize = TRUE, verbose = TRUE)
 8 | }
 9 | \arguments{
10 | \item{data}{list; GFICF object}
11 | 
12 | \item{x}{Matrix; UMI counts matrix of cells to embedd.}
13 | 
14 | \item{nt}{integer; Number of thread to use (default 2).}
15 | 
16 | \item{seed}{integer; Initial seed to use.}
17 | 
18 | \item{normalize}{boolean; If counts must be normalized before to be rescaled with GFICF.}
19 | 
20 | \item{verbose}{boolean; Icrease verbosity.}
21 | }
22 | \value{
23 | The updated gficf object.
24 | }
25 | \description{
26 | This function embed new cells in an already existing space. For now it supports only UMAP and t-UMAP. Briefly new cells are first normalized with GF-ICF method but using as ICF weigth estimated on the existing cells and than projected in the existing PCA/NMF space before to be embedded in the already existing UMAP space via umap_transform function.
27 | }
28 | 


--------------------------------------------------------------------------------
/man/small_BC_atlas.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{small_BC_atlas}
 5 | \alias{small_BC_atlas}
 6 | \title{Small Single cell breast cancer atlas}
 7 | \format{
 8 | ## `small_BC_atlas`
 9 | A Matrix of 4,760 cells
10 | }
11 | \source{
12 | <https://www.nature.com/articles/s41467-022-29358-6>
13 | }
14 | \usage{
15 | small_BC_atlas
16 | }
17 | \description{
18 | A subset of data from the the Single cell breast cancer atlas
19 | Only 150 cells per cell-line are used
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/man/symbolToEns.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/util.R
 3 | \name{symbolToEns}
 4 | \alias{symbolToEns}
 5 | \title{Convert Official Gene Symbols to Ensamble IDs}
 6 | \usage{
 7 | symbolToEns(df, col, organism, verbose = T)
 8 | }
 9 | \arguments{
10 | \item{df}{data frame; Data frame containing the IDs to convert.}
11 | 
12 | \item{col}{characters; Name of column containing the gene symbol.}
13 | 
14 | \item{organism}{characters; Organism of origin (i.e. human or mouse).}
15 | 
16 | \item{verbose}{boolean; Icrease verbosity.}
17 | }
18 | \value{
19 | The updated data frame with a new column called ens.
20 | }
21 | \description{
22 | It uses biomart. If more the one gene is associated to the enamble, the first one retrived from
23 | Biomart is used.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/test_BC_atlas.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{test_BC_atlas}
 5 | \alias{test_BC_atlas}
 6 | \title{Test set for scMAP method}
 7 | \format{
 8 | ## `test_BC_atlas`
 9 | A Matrix of 930 cells
10 | }
11 | \source{
12 | <https://www.nature.com/articles/s41467-022-29358-6>
13 | }
14 | \usage{
15 | test_BC_atlas
16 | }
17 | \description{
18 | A subset of data from the the Single cell breast cancer atlas
19 | Only 30 cells per cell-line are used. They do not ovrlapp with cells into small_BC_atlas dataset
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
 1 | # Turn on C++11 support to get access to long long (guaranteed 64-bit ints)
 2 | CXX_STD = CXX11
 3 | 
 4 | # RcppGSL
 5 | GSL_CFLAGS = $(shell ${R_HOME}/bin/Rscript -e "RcppGSL:::CFlags()")
 6 | GSL_LIBS   = $(shell ${R_HOME}/bin/Rscript -e "RcppGSL:::LdFlags()") `gsl-config --libs`
 7 | PKG_CPPFLAGS += $(GSL_CFLAGS) 
 8 | PKG_LIBS += $(GSL_LIBS)
 9 | 
10 | # RcppParallel
11 | PKG_CXXFLAGS += -DRCPP_PARALLEL_USE_TBB=1 -DSTRICT_R_HEADERS -DRCPP_NO_RTTI
12 | PKG_LIBS += $(shell ${R_HOME}/bin/Rscript -e "RcppParallel::RcppParallelLibs()")
13 | 
14 | # OMP
15 | PKG_CXXFLAGS += -I"./include" $(SHLIB_OPENMP_CXXFLAGS)
16 | PKG_LIBS += $(SHLIB_OPENMP_CXXFLAGS)
17 | PKG_CPPFLAGS += -DARMA_64BIT_WORD=1
18 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
 1 | # Turn on C++11 support to get access to long long (guaranteed 64-bit ints)
 2 | CXX_STD = CXX11
 3 | 
 4 | # RcppGSL
 5 | PKG_CPPFLAGS= -I$(GSL_LIBS)/include
 6 | PKG_LIBS= -L$(GSL_LIBS)/lib/x64 -lgsl -lgslcblas 
 7 | 
 8 | # RcppParallel
 9 | PKG_CXXFLAGS += -DRCPP_PARALLEL_USE_TBB=1 -DSTRICT_R_HEADERS -DRCPP_NO_RTTI
10 | PKG_LIBS += $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "RcppParallel::RcppParallelLibs()")
11 | 
12 | # OMP
13 | PKG_CXXFLAGS += -I"./include" $(SHLIB_OPENMP_CXXFLAGS)
14 | PKG_LIBS += $(SHLIB_OPENMP_CXXFLAGS)
15 | PKG_CPPFLAGS += -DARMA_64BIT_WORD=1
16 | 
17 | # LAPACK AND BLAS
18 | PKG_LIBS += $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
19 | 


--------------------------------------------------------------------------------
/src/ModularityOptimizer.h:
--------------------------------------------------------------------------------
  1 | // This C++ code is from Seurat package available at https://github.com/satijalab/seurat
  2 | 
  3 | #pragma once
  4 | 
  5 | #include <chrono>
  6 | #include <exception>
  7 | #include <fstream>
  8 | #include <limits>
  9 | #include <memory>
 10 | #include <sstream>
 11 | #include <iostream>
 12 | #include <vector>
 13 | 
 14 | typedef std::vector<int> IVector;
 15 | typedef std::vector<double> DVector;
 16 | 
 17 | namespace ModularityOptimizer {
 18 | 
 19 | class JavaRandom {
 20 | private:
 21 |   uint64_t seed;
 22 |   int next(int bits);
 23 | public:
 24 |   JavaRandom(uint64_t seed);
 25 |   int nextInt(int n);
 26 |   void setSeed(uint64_t seed);
 27 | };
 28 | 
 29 | namespace Arrays2 {
 30 |   IVector generateRandomPermutation(int nElements);
 31 |   IVector generateRandomPermutation(int nElements, JavaRandom& random);
 32 | }
 33 | 
 34 | 
 35 | class Clustering {
 36 | private:
 37 |   int nNodes;
 38 | public:
 39 |   // Note: These two variables were "protected" in java, which means it is accessible to the whole package/public.
 40 |   // Although we could have used friend classes, this allows for better mirroring of the original code.
 41 |   int nClusters;
 42 |   IVector cluster;
 43 |   
 44 |   Clustering(int nNodes);
 45 |   Clustering(IVector cluster);
 46 |   int getNNodes() const {return nNodes;};
 47 |   int getNClusters() const {return nClusters;};
 48 |   IVector getClusters() const {return cluster;};
 49 |   int getCluster(int node) const {return cluster[node];};
 50 |   IVector getNNodesPerCluster() const;
 51 |   std::vector<IVector> getNodesPerCluster() const;
 52 |   void setCluster(int node, int cluster);
 53 |   void initSingletonClusters();
 54 |   void orderClustersByNNodes();
 55 |   void mergeClusters(const Clustering& clustering);
 56 | 
 57 | };
 58 | 
 59 | class Network {
 60 |   friend class VOSClusteringTechnique;
 61 | protected:
 62 |   int nNodes;
 63 |   int nEdges;
 64 |   DVector nodeWeight;
 65 |   IVector firstNeighborIndex;
 66 |   IVector neighbor;
 67 |   DVector edgeWeight;
 68 |   double totalEdgeWeightSelfLinks;
 69 | public:
 70 |   Network();
 71 |   Network(int nNodes, DVector* nodeWeight, std::vector<IVector>& edge, DVector* edgeWeight);
 72 |   Network(int nNodes, std::vector<IVector>& edge) : 
 73 |     Network(nNodes, nullptr, edge, nullptr) { };
 74 |   Network(int nNodes, DVector* nodeWeight, std::vector<IVector> edge) : 
 75 |     Network(nNodes, nodeWeight, edge, nullptr) {};
 76 |   Network(int nNodes, std::vector<IVector>& edge, DVector* edgeWeight) :
 77 |     Network(nNodes, nullptr, edge, edgeWeight) {};
 78 |   
 79 |   Network(int nNodes, DVector* nodeWeight, IVector& firstNeighborIndex, IVector& neighbor, DVector* edgeWeight);
 80 |   Network(int nNodes, IVector& firstNeighborIndex, IVector& neighbor) : 
 81 |     Network(nNodes, nullptr, firstNeighborIndex, neighbor, nullptr) {};
 82 |   
 83 |   Network(int nNodes, DVector* nodeWeight, IVector& firstNeighborIndex, IVector& neighbor) :
 84 |     Network(nNodes, nodeWeight, firstNeighborIndex, neighbor, nullptr){};
 85 |   
 86 |   Network(int nNodes, IVector& firstNeighborIndex, IVector& neighbor, DVector* edgeWeight) :
 87 |     Network(nNodes, nullptr, firstNeighborIndex, neighbor, edgeWeight) {};
 88 |   
 89 | 
 90 |  int getNNodes() {return nNodes;};
 91 |  double getTotalNodeWeight();
 92 |  DVector getNodeWeights();
 93 |  double getNodeWeight(int node) { return nodeWeight.at(node);};
 94 |  int getNEdges() {return nEdges / 2;};
 95 |  int getNEdges(int node) {return firstNeighborIndex.at(node + 1) - firstNeighborIndex.at(node);};
 96 |  IVector getNEdgesPerNode();
 97 |  std::vector<IVector> getEdges();
 98 |  IVector getEdges(int node);
 99 |  std::vector<IVector> getEdgesPerNode();
100 |  double getTotalEdgeWeight();
101 |  double getTotalEdgeWeight(int node);
102 |  DVector getTotalEdgeWeightPerNode();
103 |  DVector getEdgeWeights() {return edgeWeight;};
104 |  DVector getEdgeWeights(int node);
105 |  std::vector<DVector> getEdgeWeightsPerNode();
106 |  double getTotalEdgeWeightSelfLinks()
107 |  {
108 |    return totalEdgeWeightSelfLinks;
109 |  };
110 |  // Added these to avoid making these values public
111 |  int getFirstNeighborIndexValue(int i) const {
112 |    return firstNeighborIndex.at(i);
113 |  };
114 |  int getNeighborValue(int index) const {
115 |    return neighbor.at(index);
116 |  }
117 | 
118 |  std::vector<Network> createSubnetworks(Clustering clustering) const;
119 |  Network createReducedNetwork(const Clustering& clustering) const;
120 |  Clustering identifyComponents();
121 | private:
122 |   double generateRandomNumber(int node1, int node2, const IVector& nodePermutation);
123 |   Network createSubnetwork(const Clustering& clustering, int cluster, IVector& node, 
124 |                            IVector& subnetworkNode, IVector& subnetworkNeighbor, DVector& subnetworkEdgeWeight) const;
125 | };
126 | 
127 | 
128 | class VOSClusteringTechnique {
129 | private:
130 |   std::shared_ptr<Network> network;
131 |   std::shared_ptr<Clustering> clustering;
132 |   double resolution;
133 | 
134 | public:
135 |   VOSClusteringTechnique(std::shared_ptr<Network> network, double resolution);
136 |   VOSClusteringTechnique(std::shared_ptr<Network> network, std::shared_ptr<Clustering> clustering, double resolution);
137 |   std::shared_ptr<Network> getNetwork() { return network;}
138 |   std::shared_ptr<Clustering> getClustering()  { return clustering; }
139 |   double getResolution() {return resolution; }
140 |   void setNetwork(std::shared_ptr<Network> network) {this->network = network;}
141 |   void setClustering(std::shared_ptr<Clustering> clustering) {this->clustering = clustering;}
142 |   void setResolution(double resolution) {this->resolution = resolution;}
143 |   double calcQualityFunction();
144 | 
145 |   bool runLocalMovingAlgorithm(JavaRandom& random);
146 |   bool runLouvainAlgorithm(JavaRandom& random);
147 |   bool runIteratedLouvainAlgorithm(int maxNIterations, JavaRandom& random);
148 |   bool runLouvainAlgorithmWithMultilevelRefinement(JavaRandom& random);
149 |   bool runIteratedLouvainAlgorithmWithMultilevelRefinement(int maxNIterations, JavaRandom& random);
150 |   bool runSmartLocalMovingAlgorithm(JavaRandom& random);
151 |   bool runIteratedSmartLocalMovingAlgorithm(int nIterations, JavaRandom& random);
152 | 
153 |   int removeCluster(int cluster);
154 |   void removeSmallClusters(int minNNodesPerCluster);
155 | };
156 | 
157 | 
158 | std::shared_ptr<Network> matrixToNetwork(IVector& node1, IVector& node2, DVector& edgeWeight1, int modularityFunction);
159 | std::shared_ptr<Network> readInputFile(std::string fname, int modularityFunction);
160 | std::vector<std::string> split(const std::string& s, char delimiter);
161 | };
162 | 


--------------------------------------------------------------------------------
/src/RModularityOptimizer.cpp:
--------------------------------------------------------------------------------
  1 | // This C++ code is from Seurat package available at https://github.com/satijalab/seurat
  2 | 
  3 | #include <chrono>
  4 | #include <exception>
  5 | #include <fstream>
  6 | #include <iostream>
  7 | #include <limits>
  8 | #include <memory>
  9 | #include <sstream>
 10 | 
 11 | #include <RcppEigen.h>
 12 | #include <Rcpp.h>
 13 | #include <progress.hpp>
 14 | 
 15 | #include "ModularityOptimizer.h"
 16 | 
 17 | using namespace ModularityOptimizer;
 18 | using namespace std::chrono;
 19 | using namespace Rcpp;
 20 | 
 21 | 
 22 | // [[Rcpp::depends(RcppEigen)]]
 23 | // [[Rcpp::depends(RcppProgress)]]
 24 | // [[Rcpp::export]]
 25 | IntegerVector RunModularityClusteringCpp(Eigen::SparseMatrix<double> SNN,
 26 |     int modularityFunction,
 27 |     double resolution,
 28 |     int algorithm,
 29 |     int nRandomStarts,
 30 |     int nIterations,
 31 |     int randomSeed,
 32 |     bool printOutput,
 33 |     std::string edgefilename) {
 34 | 
 35 |   // validate arguments
 36 |   if(modularityFunction != 1 && modularityFunction != 2)
 37 |     stop("Modularity parameter must be equal to 1 or 2.");
 38 |   if(algorithm != 1 && algorithm !=2 && algorithm !=3 && algorithm !=4)
 39 |     stop("Algorithm for modularity optimization must be 1, 2, 3, or 4");
 40 |   if(nRandomStarts < 1)
 41 |     stop("Have to have at least one start");
 42 |   if(nIterations < 1)
 43 |     stop("Need at least one interation");
 44 |   if (modularityFunction == 2 && resolution > 1.0)
 45 |     stop("error: resolution<1 for alternative modularity");
 46 |   try {
 47 |   bool update;
 48 |   double modularity, maxModularity, resolution2;
 49 |   int i, j;
 50 | 
 51 |   std::string msg = "Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck";
 52 |   if (printOutput)
 53 |     Rcout << msg << std::endl << std::endl;
 54 | 
 55 |   // Load netwrok
 56 |   std::shared_ptr<Network> network;
 57 |   if(edgefilename != "") {
 58 |     if (printOutput)
 59 |       Rcout << "Reading input file..." << std::endl << std::endl;
 60 |     try{
 61 |       network = readInputFile(edgefilename, modularityFunction);
 62 |     } catch(...) {
 63 |       stop("Could not parse edge file.");
 64 |     }
 65 |   } else {
 66 |     // Load lower triangle
 67 |     int network_size = (SNN.nonZeros() / 2) + 3;
 68 |     IVector node1;
 69 |     IVector node2;
 70 |     DVector edgeweights;
 71 |     node1.reserve(network_size);
 72 |     node2.reserve(network_size);
 73 |     edgeweights.reserve(network_size);
 74 |     for (int k=0; k < SNN.outerSize(); ++k){
 75 |       for (Eigen::SparseMatrix<double>::InnerIterator it(SNN, k); it; ++it){
 76 |         if(it.col() >= it.row()){
 77 |           continue;
 78 |         }
 79 |         node1.emplace_back(it.col());
 80 |         node2.emplace_back(it.row());
 81 |         edgeweights.emplace_back(it.value());
 82 |       }
 83 |     }
 84 |     if(node1.size() == 0) {
 85 |       stop("Matrix contained no network data.  Check format.");
 86 |     }
 87 | 
 88 |     network = matrixToNetwork(node1, node2, edgeweights, modularityFunction);
 89 |     Rcpp::checkUserInterrupt();
 90 |   }
 91 | 
 92 |   if (printOutput)
 93 |   {
 94 |     Rprintf("Number of nodes: %d\n", network->getNNodes());
 95 |     Rprintf("Number of edges: %d\n", network->getNEdges());
 96 |     Rcout << std::endl;
 97 |     Rcout << "Running " <<  ((algorithm == 1) ? "Louvain algorithm" : ((algorithm == 2) ? "Louvain algorithm with multilevel refinement" : "smart local moving algorithm")) << "...";
 98 |     Rcout << std::endl;
 99 |   }
100 | 
101 |   resolution2 = ((modularityFunction == 1) ? (resolution / (2 * network->getTotalEdgeWeight() + network->getTotalEdgeWeightSelfLinks())) : resolution);
102 | 
103 |   auto beginTime = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
104 |   std::shared_ptr<Clustering> clustering;
105 |   maxModularity = -std::numeric_limits<double>::infinity();
106 |   JavaRandom random(randomSeed);
107 | 
108 |   Progress p(nRandomStarts, printOutput);
109 |   for (i = 0; i < nRandomStarts; i++)
110 |   {
111 |     //if (printOutput && (nRandomStarts > 1))
112 |     //Rprintf("Random start: %d\n", i + 1);
113 | 
114 |     VOSClusteringTechnique vosClusteringTechnique(network, resolution2);
115 | 
116 |     j = 0;
117 |     update = true;
118 |     do
119 |     {
120 |       /*if (printOutput && (nIterations > 1))
121 |         Rprintf("Iteration: %d\n", j + 1);
122 |       */
123 |       if (algorithm == 1)
124 |         update = vosClusteringTechnique.runLouvainAlgorithm(random);
125 |       else if (algorithm == 2)
126 |         update = vosClusteringTechnique.runLouvainAlgorithmWithMultilevelRefinement(random);
127 |       else if (algorithm == 3)
128 |         vosClusteringTechnique.runSmartLocalMovingAlgorithm(random);
129 |       j++;
130 | 
131 |       modularity = vosClusteringTechnique.calcQualityFunction();
132 | 
133 |       //if (printOutput && (nIterations > 1))
134 |       //  Rprintf("Modularity: %.4f\n", modularity);
135 |       Rcpp::checkUserInterrupt();
136 |     }
137 |     while ((j < nIterations) && update);
138 | 
139 |     if (modularity > maxModularity)
140 |     {
141 |       clustering = vosClusteringTechnique.getClustering();
142 |       maxModularity = modularity;
143 |     }
144 | 
145 |     /*if (printOutput && (nRandomStarts > 1))
146 |     {
147 |       if (nIterations == 1)
148 |         Rprintf("Modularity: %.4f\n", modularity);
149 |       Rcout << std::endl;
150 |     }*/
151 |     p.increment();
152 |   }
153 |   auto endTime = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
154 |   if(clustering == nullptr) {
155 |     stop("Clustering step failed.");
156 |   }
157 |   if (printOutput)
158 |   {
159 |     if (nRandomStarts == 1)
160 |     {
161 |       if (nIterations > 1)
162 |         Rcout << std::endl;
163 |       Rprintf("Modularity: %.4f\n", maxModularity);
164 |     }
165 |     else
166 |       Rprintf("Maximum modularity in %d random starts: %.4f\n", nRandomStarts, maxModularity);
167 |       Rprintf("Number of communities: %d\n", clustering->getNClusters());
168 |       Rprintf("Elapsed time: %d seconds\n", static_cast<int>((endTime - beginTime).count() / 1000.0));
169 |   }
170 | 
171 |   // Return results
172 |   clustering->orderClustersByNNodes();
173 |   IntegerVector iv(clustering->cluster.cbegin(), clustering->cluster.cend());
174 |   return iv;
175 |   } catch(std::exception &ex) {
176 |     forward_exception_to_r(ex);
177 |   } catch(...) {
178 |     ::Rf_error("c++ exception (unknown reason)");
179 |   }
180 |   return IntegerVector(1);
181 | }
182 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
  1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | #include <RcppArmadillo.h>
  5 | #include <RcppGSL.h>
  6 | #include <RcppEigen.h>
  7 | #include <Rcpp.h>
  8 | 
  9 | using namespace Rcpp;
 10 | 
 11 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 12 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
 13 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 14 | #endif
 15 | 
 16 | // RunModularityClusteringCpp
 17 | IntegerVector RunModularityClusteringCpp(Eigen::SparseMatrix<double> SNN, int modularityFunction, double resolution, int algorithm, int nRandomStarts, int nIterations, int randomSeed, bool printOutput, std::string edgefilename);
 18 | RcppExport SEXP _gficf_RunModularityClusteringCpp(SEXP SNNSEXP, SEXP modularityFunctionSEXP, SEXP resolutionSEXP, SEXP algorithmSEXP, SEXP nRandomStartsSEXP, SEXP nIterationsSEXP, SEXP randomSeedSEXP, SEXP printOutputSEXP, SEXP edgefilenameSEXP) {
 19 | BEGIN_RCPP
 20 |     Rcpp::RObject rcpp_result_gen;
 21 |     Rcpp::RNGScope rcpp_rngScope_gen;
 22 |     Rcpp::traits::input_parameter< Eigen::SparseMatrix<double> >::type SNN(SNNSEXP);
 23 |     Rcpp::traits::input_parameter< int >::type modularityFunction(modularityFunctionSEXP);
 24 |     Rcpp::traits::input_parameter< double >::type resolution(resolutionSEXP);
 25 |     Rcpp::traits::input_parameter< int >::type algorithm(algorithmSEXP);
 26 |     Rcpp::traits::input_parameter< int >::type nRandomStarts(nRandomStartsSEXP);
 27 |     Rcpp::traits::input_parameter< int >::type nIterations(nIterationsSEXP);
 28 |     Rcpp::traits::input_parameter< int >::type randomSeed(randomSeedSEXP);
 29 |     Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP);
 30 |     Rcpp::traits::input_parameter< std::string >::type edgefilename(edgefilenameSEXP);
 31 |     rcpp_result_gen = Rcpp::wrap(RunModularityClusteringCpp(SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename));
 32 |     return rcpp_result_gen;
 33 | END_RCPP
 34 | }
 35 | // jaccard_coeff
 36 | NumericMatrix jaccard_coeff(NumericMatrix idx, bool printOutput);
 37 | RcppExport SEXP _gficf_jaccard_coeff(SEXP idxSEXP, SEXP printOutputSEXP) {
 38 | BEGIN_RCPP
 39 |     Rcpp::RObject rcpp_result_gen;
 40 |     Rcpp::RNGScope rcpp_rngScope_gen;
 41 |     Rcpp::traits::input_parameter< NumericMatrix >::type idx(idxSEXP);
 42 |     Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP);
 43 |     rcpp_result_gen = Rcpp::wrap(jaccard_coeff(idx, printOutput));
 44 |     return rcpp_result_gen;
 45 | END_RCPP
 46 | }
 47 | // armaColSumFull
 48 | arma::mat armaColSumFull(const arma::mat& m, int ncores, bool verbose);
 49 | RcppExport SEXP _gficf_armaColSumFull(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) {
 50 | BEGIN_RCPP
 51 |     Rcpp::RObject rcpp_result_gen;
 52 |     Rcpp::RNGScope rcpp_rngScope_gen;
 53 |     Rcpp::traits::input_parameter< const arma::mat& >::type m(mSEXP);
 54 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
 55 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
 56 |     rcpp_result_gen = Rcpp::wrap(armaColSumFull(m, ncores, verbose));
 57 |     return rcpp_result_gen;
 58 | END_RCPP
 59 | }
 60 | // armaColSumSparse
 61 | arma::mat armaColSumSparse(const arma::sp_mat& m, int ncores, bool verbose);
 62 | RcppExport SEXP _gficf_armaColSumSparse(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) {
 63 | BEGIN_RCPP
 64 |     Rcpp::RObject rcpp_result_gen;
 65 |     Rcpp::RNGScope rcpp_rngScope_gen;
 66 |     Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP);
 67 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
 68 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
 69 |     rcpp_result_gen = Rcpp::wrap(armaColSumSparse(m, ncores, verbose));
 70 |     return rcpp_result_gen;
 71 | END_RCPP
 72 | }
 73 | // colMeanVarS
 74 | Rcpp::DataFrame colMeanVarS(const arma::sp_mat& m, int ncores);
 75 | RcppExport SEXP _gficf_colMeanVarS(SEXP mSEXP, SEXP ncoresSEXP) {
 76 | BEGIN_RCPP
 77 |     Rcpp::RObject rcpp_result_gen;
 78 |     Rcpp::RNGScope rcpp_rngScope_gen;
 79 |     Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP);
 80 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
 81 |     rcpp_result_gen = Rcpp::wrap(colMeanVarS(m, ncores));
 82 |     return rcpp_result_gen;
 83 | END_RCPP
 84 | }
 85 | // armaManhattan
 86 | arma::sp_mat armaManhattan(const arma::sp_mat& m, int ncores, bool verbose, bool full, bool diag);
 87 | RcppExport SEXP _gficf_armaManhattan(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP, SEXP fullSEXP, SEXP diagSEXP) {
 88 | BEGIN_RCPP
 89 |     Rcpp::RObject rcpp_result_gen;
 90 |     Rcpp::RNGScope rcpp_rngScope_gen;
 91 |     Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP);
 92 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
 93 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
 94 |     Rcpp::traits::input_parameter< bool >::type full(fullSEXP);
 95 |     Rcpp::traits::input_parameter< bool >::type diag(diagSEXP);
 96 |     rcpp_result_gen = Rcpp::wrap(armaManhattan(m, ncores, verbose, full, diag));
 97 |     return rcpp_result_gen;
 98 | END_RCPP
 99 | }
100 | // armaCorr
101 | arma::sp_mat armaCorr(const arma::mat& m, int ncores, bool verbose, bool full, bool diag, bool dist);
102 | RcppExport SEXP _gficf_armaCorr(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP, SEXP fullSEXP, SEXP diagSEXP, SEXP distSEXP) {
103 | BEGIN_RCPP
104 |     Rcpp::RObject rcpp_result_gen;
105 |     Rcpp::RNGScope rcpp_rngScope_gen;
106 |     Rcpp::traits::input_parameter< const arma::mat& >::type m(mSEXP);
107 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
108 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
109 |     Rcpp::traits::input_parameter< bool >::type full(fullSEXP);
110 |     Rcpp::traits::input_parameter< bool >::type diag(diagSEXP);
111 |     Rcpp::traits::input_parameter< bool >::type dist(distSEXP);
112 |     rcpp_result_gen = Rcpp::wrap(armaCorr(m, ncores, verbose, full, diag, dist));
113 |     return rcpp_result_gen;
114 | END_RCPP
115 | }
116 | // armaColMeans
117 | Rcpp::DataFrame armaColMeans(const arma::sp_mat& m, int ncores, bool verbose);
118 | RcppExport SEXP _gficf_armaColMeans(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) {
119 | BEGIN_RCPP
120 |     Rcpp::RObject rcpp_result_gen;
121 |     Rcpp::RNGScope rcpp_rngScope_gen;
122 |     Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP);
123 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
124 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
125 |     rcpp_result_gen = Rcpp::wrap(armaColMeans(m, ncores, verbose));
126 |     return rcpp_result_gen;
127 | END_RCPP
128 | }
129 | // scaleUMI
130 | arma::sp_mat scaleUMI(const arma::sp_mat& m, int ncores, bool verbose);
131 | RcppExport SEXP _gficf_scaleUMI(SEXP mSEXP, SEXP ncoresSEXP, SEXP verboseSEXP) {
132 | BEGIN_RCPP
133 |     Rcpp::RObject rcpp_result_gen;
134 |     Rcpp::RNGScope rcpp_rngScope_gen;
135 |     Rcpp::traits::input_parameter< const arma::sp_mat& >::type m(mSEXP);
136 |     Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP);
137 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
138 |     rcpp_result_gen = Rcpp::wrap(scaleUMI(m, ncores, verbose));
139 |     return rcpp_result_gen;
140 | END_RCPP
141 | }
142 | // rcpp_WMU_test
143 | Rcpp::NumericMatrix rcpp_WMU_test(Rcpp::NumericMatrix M, Rcpp::NumericVector idx1, Rcpp::NumericVector idx2);
144 | RcppExport SEXP _gficf_rcpp_WMU_test(SEXP MSEXP, SEXP idx1SEXP, SEXP idx2SEXP) {
145 | BEGIN_RCPP
146 |     Rcpp::RObject rcpp_result_gen;
147 |     Rcpp::RNGScope rcpp_rngScope_gen;
148 |     Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type M(MSEXP);
149 |     Rcpp::traits::input_parameter< Rcpp::NumericVector >::type idx1(idx1SEXP);
150 |     Rcpp::traits::input_parameter< Rcpp::NumericVector >::type idx2(idx2SEXP);
151 |     rcpp_result_gen = Rcpp::wrap(rcpp_WMU_test(M, idx1, idx2));
152 |     return rcpp_result_gen;
153 | END_RCPP
154 | }
155 | // rcpp_parallel_jaccard_coef
156 | Rcpp::NumericMatrix rcpp_parallel_jaccard_coef(Rcpp::NumericMatrix mat, bool printOutput);
157 | RcppExport SEXP _gficf_rcpp_parallel_jaccard_coef(SEXP matSEXP, SEXP printOutputSEXP) {
158 | BEGIN_RCPP
159 |     Rcpp::RObject rcpp_result_gen;
160 |     Rcpp::RNGScope rcpp_rngScope_gen;
161 |     Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type mat(matSEXP);
162 |     Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP);
163 |     rcpp_result_gen = Rcpp::wrap(rcpp_parallel_jaccard_coef(mat, printOutput));
164 |     return rcpp_result_gen;
165 | END_RCPP
166 | }
167 | // rcpp_parallel_WMU_test
168 | Rcpp::NumericMatrix rcpp_parallel_WMU_test(Rcpp::NumericMatrix matX, Rcpp::NumericMatrix matY, bool printOutput);
169 | RcppExport SEXP _gficf_rcpp_parallel_WMU_test(SEXP matXSEXP, SEXP matYSEXP, SEXP printOutputSEXP) {
170 | BEGIN_RCPP
171 |     Rcpp::RObject rcpp_result_gen;
172 |     Rcpp::RNGScope rcpp_rngScope_gen;
173 |     Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type matX(matXSEXP);
174 |     Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type matY(matYSEXP);
175 |     Rcpp::traits::input_parameter< bool >::type printOutput(printOutputSEXP);
176 |     rcpp_result_gen = Rcpp::wrap(rcpp_parallel_WMU_test(matX, matY, printOutput));
177 |     return rcpp_result_gen;
178 | END_RCPP
179 | }
180 | 
181 | RcppExport SEXP detectCoresCpp(void);
182 | 
183 | static const R_CallMethodDef CallEntries[] = {
184 |     {"_gficf_RunModularityClusteringCpp", (DL_FUNC) &_gficf_RunModularityClusteringCpp, 9},
185 |     {"_gficf_jaccard_coeff", (DL_FUNC) &_gficf_jaccard_coeff, 2},
186 |     {"_gficf_armaColSumFull", (DL_FUNC) &_gficf_armaColSumFull, 3},
187 |     {"_gficf_armaColSumSparse", (DL_FUNC) &_gficf_armaColSumSparse, 3},
188 |     {"_gficf_colMeanVarS", (DL_FUNC) &_gficf_colMeanVarS, 2},
189 |     {"_gficf_armaManhattan", (DL_FUNC) &_gficf_armaManhattan, 5},
190 |     {"_gficf_armaCorr", (DL_FUNC) &_gficf_armaCorr, 6},
191 |     {"_gficf_armaColMeans", (DL_FUNC) &_gficf_armaColMeans, 3},
192 |     {"_gficf_scaleUMI", (DL_FUNC) &_gficf_scaleUMI, 3},
193 |     {"_gficf_rcpp_WMU_test", (DL_FUNC) &_gficf_rcpp_WMU_test, 3},
194 |     {"_gficf_rcpp_parallel_jaccard_coef", (DL_FUNC) &_gficf_rcpp_parallel_jaccard_coef, 2},
195 |     {"_gficf_rcpp_parallel_WMU_test", (DL_FUNC) &_gficf_rcpp_parallel_WMU_test, 3},
196 |     {"detectCoresCpp", (DL_FUNC) &detectCoresCpp, 0},
197 |     {NULL, NULL, 0}
198 | };
199 | 
200 | RcppExport void R_init_gficf(DllInfo *dll) {
201 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
202 |     R_useDynamicSymbols(dll, FALSE);
203 | }
204 | 


--------------------------------------------------------------------------------
/src/detectCores.cpp:
--------------------------------------------------------------------------------
 1 | // thnks to https://github.com/tnagler/RcppThread/commit/c26fc2b0d56555fa434c33352747822691334fe8
 2 | 
 3 | #include <R.h>
 4 | #include <Rdefines.h>
 5 | #include <thread>
 6 | 
 7 | extern "C" {
 8 |   
 9 |   SEXP detectCoresCpp() {
10 |     SEXP result;
11 |     PROTECT(result = NEW_INTEGER(1));
12 |     INTEGER(result)[0] = std::thread::hardware_concurrency();
13 |     UNPROTECT(1);
14 |     return result;
15 |   }
16 |   
17 |   
18 |   static const R_CallMethodDef callMethods[] = {
19 |     {"detectCoresCpp", (DL_FUNC) &detectCoresCpp, 0},
20 |     {NULL, NULL, 0}
21 |   };
22 |   
23 |   void R_init_RcppThread(DllInfo *info)
24 |   {
25 |     R_registerRoutines(info, NULL, callMethods, NULL, NULL);
26 |     R_useDynamicSymbols(info, TRUE);
27 |   }
28 |   
29 | }


--------------------------------------------------------------------------------
/src/jaccard_coeff.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include <progress.hpp>
 3 | using namespace Rcpp;
 4 | 
 5 | // Original C++ code is from JinmiaoChenLab/Rphenograph package available at https://github.com/JinmiaoChenLab/Rphenograph/tree/master/src
 6 | //
 7 | // Compute jaccard coefficient between nearest-neighbor sets
 8 | //
 9 | // Weights of both i->j and j->i are recorded if they have intersection. In this case
10 | // w(i->j) should be equal to w(j->i). In some case i->j has weights while j<-i has no
11 | // intersections, only w(i->j) is recorded. This is determinded in code `if(u>0)`. 
12 | // In this way, the undirected graph is symmetrized by halfing the weight 
13 | // in code `weights(r, 2) = u/(2.0*ncol - u)/2`.
14 | //
15 | // Author: Chen Hao, Date: 25/09/2015
16 | 
17 | // [[Rcpp::depends(RcppProgress)]]
18 | // [[Rcpp::export]]
19 | NumericMatrix jaccard_coeff(NumericMatrix idx, bool printOutput) {
20 |     int nrow = idx.nrow(), ncol = idx.ncol();
21 |     NumericMatrix weights(nrow*ncol, 3);
22 |     int r = 0;
23 |     if (printOutput)
24 |     {
25 |       Rprintf("Running Jaccard Coefficient Estimation...\n");
26 |     }
27 |     Progress p(nrow, printOutput);
28 |     for (int i = 0; i < nrow; i++) {
29 |         for (int j = 0; j < ncol; j++) {
30 |             int k = idx(i,j)-1;
31 |             NumericVector nodei = idx(i,_);
32 |             NumericVector nodej = idx(k,_);
33 |             int u = intersect(nodei, nodej).size();  // count intersection number
34 |             if(u>0){ 
35 |                 weights(r, 0) = i+1;
36 |                 weights(r, 1) = k+1;
37 |                 weights(r, 2) = u/(2.0*ncol - u);
38 |                 r++;
39 |             }
40 |         }
41 |         p.increment();
42 |     }
43 |     
44 |     return weights;
45 | }
46 | 


--------------------------------------------------------------------------------
/src/mann_whitney.cpp:
--------------------------------------------------------------------------------
  1 | #include "mann_whitney.h"
  2 | #include <iostream>
  3 | #include <vector>
  4 | #include <numeric>
  5 | #include <algorithm>
  6 | #include <cmath>
  7 | #include <Rcpp.h>
  8 | #include <RcppGSL.h>
  9 | #include <gsl/gsl_cdf.h>
 10 | 
 11 | // Compute two-sided Mann–Whitney U test with coninuity correction between two umpaired samples. 
 12 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = T)
 13 | //
 14 | // Author: Gennaro Gambardella, Date: 20/08/2019
 15 | 
 16 | 
 17 | template <typename T>
 18 | std::vector<size_t> sort_indexes(std::vector<T> const &v)
 19 | {
 20 |   
 21 |   // initialize original index locations
 22 |   std::vector<size_t> idx(v.size());
 23 |   iota(idx.begin(), idx.end(), 0);
 24 |   
 25 |   // sort indexes based on comparing values in v
 26 |   sort(idx.begin(), idx.end(),
 27 |        [&v](size_t i1, size_t i2) {return v[i1] < v[i2];});
 28 |   
 29 |   return idx;
 30 | }
 31 | 
 32 | std::vector<double> getRanks(std::vector<double> const &absoluteValues)
 33 | {
 34 |   std::vector<double> ranks(absoluteValues.size());
 35 |   
 36 |   size_t i = 0;
 37 |   while (i < absoluteValues.size())
 38 |   {
 39 |     size_t j = i + 1;
 40 |     while (j < absoluteValues.size())
 41 |     {
 42 |       if(absoluteValues[i] != absoluteValues[j])
 43 |       {
 44 |         break;
 45 |       }
 46 |       j++;
 47 |     }
 48 |     for(size_t k = i; k <= j-1; k++)
 49 |     {   
 50 |       ranks[k] = 1 + (double)(i + j-1)/(double)2;
 51 |     }
 52 |     i = j;
 53 |   }
 54 |   return ranks;
 55 | }
 56 | 
 57 | std::vector<double> getUniq(std::vector<double> u)
 58 | {
 59 |   // using default comparison:
 60 |   std::vector<double>::iterator it;
 61 |   it = std::unique (u.begin(), u.end());
 62 |   u.resize( std::distance(u.begin(),it) );
 63 |   return u;
 64 | }
 65 | 
 66 | std::vector<double> getCounts(std::vector<double> const &valuesOrd)
 67 | {
 68 |   std::vector<double> u = getUniq(valuesOrd);
 69 |   std::vector<double> u_counts(u.size(),0);
 70 |   size_t k=0;
 71 |   double prev=valuesOrd[0];
 72 |   
 73 |   for(size_t i=0;i<valuesOrd.size();i++)
 74 |   {
 75 |     if(prev==valuesOrd[i])
 76 |     {
 77 |       u_counts[k]++;
 78 |     } else {
 79 |       k++;
 80 |       u_counts[k]++;
 81 |       prev=valuesOrd[i];
 82 |     }
 83 |   }
 84 |   return u_counts;
 85 | }
 86 | 
 87 | double getSigma(std::vector<double>& u_counts,double n1, double n2)
 88 | {
 89 |   double nties = 0;
 90 |   //int n = n1 + n2;
 91 |   
 92 |   if(u_counts.size() < n1 + n2)
 93 |   {
 94 |     for(std::vector<double>::iterator it = u_counts.begin(); it != u_counts.end(); ++it)
 95 |       nties += ( ((*it) * (*it) * (*it)) - *it );
 96 |   }
 97 |   
 98 |   return sqrt( (n1 * n2 / 12) * ( (n1 + n2 + 1) - nties / ((n1 + n2) * (n1 + n2 - 1))) );
 99 | }
100 | 
101 | double getPvalue(double& z)
102 | {
103 |   double p =1;
104 |   if(z<0){
105 |     p = gsl_cdf_gaussian_P(z,1)*2;
106 |   } else {
107 |     p = gsl_cdf_gaussian_Q(z,1)*2;
108 |   }
109 |   return p;
110 | }
111 | 
112 | Rcpp::NumericVector subset(Rcpp::NumericVector const &v, Rcpp::NumericVector idx)
113 | {
114 |   Rcpp::NumericVector res(idx.size());
115 |   size_t k=0;
116 |   for(Rcpp::NumericVector::iterator it = idx.begin(); it != idx.end(); ++it, k++)
117 |     res(k) = v[*it - 1];
118 |   return res;
119 | }
120 | 
121 | double avg(Rcpp::NumericVector const &v)
122 | {
123 |   return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
124 | }
125 | 
126 | double avg(std::vector<double> const &v)
127 | {
128 |   return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
129 | }
130 | 
131 | 
132 | double MWUtest(Rcpp::NumericVector const &v1, Rcpp::NumericVector const &v2)
133 | {
134 |   double pval = 1;
135 |   std::vector<double> values(v1.length());
136 |   std::vector<double> valuesOrd(v1.length()+v2.length());
137 |   std::vector<double> valuesRnk(v1.length()+v2.length());
138 |   
139 |   // concat vectors
140 |   std::copy(v1.begin(), v1.end(), values.begin());
141 |   values.insert( values.end(),v2.begin(),v2.end());
142 |   
143 |   // sort
144 |   std::vector<size_t> idx = sort_indexes(values);
145 |   for(size_t i=0;i<values.size();i++)
146 |   {
147 |     //std::cout << values[idx[i]] << std::endl;
148 |     valuesOrd[i] = values[idx[i]];
149 |   }
150 |   // get ranks
151 |   valuesRnk = getRanks(valuesOrd);
152 |   
153 |   std::vector<double> nties = getCounts(valuesOrd);
154 |   
155 |   if(nties.size()>1) {
156 |     
157 |     double long U1 = (v1.size() * (v1.size()+1)) * -0.5; 
158 |     double long U2 = (v2.size() * (v2.size()+1)) * -0.5;
159 |     std::vector<double> u1_rnk(v1.size(),0);
160 |     std::vector<double> u2_rnk(v2.size(),0);
161 |     std::vector<double> v1_ord(v1.size(),0);
162 |     std::vector<double> v2_ord(v2.size(),0);
163 |     int u1_i=0;
164 |     int u2_i = 0;
165 |     
166 |     for(size_t i=0;i<valuesOrd.size();i++)
167 |     {
168 |       if(idx[i]<v1.size())
169 |       {
170 |         U1 +=valuesRnk[i];
171 |         u1_rnk[u1_i] = valuesRnk[i];
172 |         v1_ord[u1_i] = valuesOrd[i];
173 |         u1_i++;
174 |       } else {
175 |         U2 +=valuesRnk[i];
176 |         u2_rnk[u2_i] = valuesRnk[i];
177 |         v2_ord[u2_i] = valuesOrd[i];
178 |         u2_i++;
179 |       }
180 |     }
181 |     
182 |     double mu = v1.size()*v2.size()/2, sig=0, z = 0;
183 |     sig = getSigma(nties,v1.size(),v2.size());
184 |     
185 |     z = U1<U2 ? U1-mu : U2-mu;
186 |     z = z<0 ? z+0.5 : z-0.5; // Continuity Correction
187 |     z = z/sig;
188 |     
189 |     pval = getPvalue(z);
190 |   }
191 |   return pval;
192 | }
193 | 


--------------------------------------------------------------------------------
/src/mann_whitney.h:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | 
 3 | // Compute two-sided Mann–Whitney U test between two umpaired samples. 
 4 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = F)
 5 | //
 6 | // Author: Gennaro Gambardella, Date: 20/08/2019
 7 | 
 8 | 
 9 | template <typename T>
10 | std::vector<size_t> sort_indexes(std::vector<T> const &v);
11 | 
12 | std::vector<double> getRanks(std::vector<double> const &absoluteValues);
13 | 
14 | std::vector<double> getUniq(std::vector<double> u);
15 | 
16 | std::vector<double> getCounts(std::vector<double> const &valuesOrd);
17 | 
18 | double getSigma(std::vector<double>& u_counts,double n1, double n2);
19 | 
20 | double getPvalue(double& z);
21 | 
22 | Rcpp::NumericVector subset(Rcpp::NumericVector const &v, Rcpp::NumericVector idx);
23 | 
24 | double avg(Rcpp::NumericVector const &v);
25 | 
26 | double avg(std::vector<double> const &v);
27 | 
28 | double MWUtest(Rcpp::NumericVector const &v1, Rcpp::NumericVector const &v2);
29 | 


--------------------------------------------------------------------------------
/src/misc.cpp:
--------------------------------------------------------------------------------
  1 | #define ARMA_64BIT_WORD
  2 | #include <RcppArmadillo.h>
  3 | #include <progress.hpp>
  4 | 
  5 | #ifdef _OPENMP
  6 | #include <omp.h>
  7 | #endif
  8 | 
  9 | using namespace std;
 10 | using namespace Rcpp;
 11 | 
 12 | // [[Rcpp::depends(RcppArmadillo)]]
 13 | // [[Rcpp::plugins(openmp)]]
 14 | // [[Rcpp::plugins(cpp11)]]
 15 | // [[Rcpp::depends(RcppProgress)]]
 16 | 
 17 | 
 18 | // [[Rcpp::export]]
 19 | arma::mat armaColSumFull(const arma::mat& m, int ncores=1, bool verbose=false)
 20 | {
 21 |   arma::vec s(m.n_cols, arma::fill::zeros);
 22 |   int tot = m.n_cols;
 23 |   Progress p(tot, verbose);
 24 | #pragma omp parallel for num_threads(ncores) shared(s)
 25 |   for(int i=0;i<(m.n_cols);i++) {
 26 |     s(i) = arma::sum(m.col(i));
 27 |   }
 28 |   return(s);
 29 | }
 30 | 
 31 | // [[Rcpp::export]]
 32 | arma::mat armaColSumSparse(const arma::sp_mat& m, int ncores=1, bool verbose=false)
 33 | {
 34 |   arma::vec s(m.n_cols, arma::fill::zeros);
 35 |   int tot = m.n_cols;
 36 |   Progress p(tot, verbose);
 37 | #pragma omp parallel for num_threads(ncores) shared(s)
 38 |   for(int i=0;i<(m.n_cols);i++) {
 39 |     s(i) = arma::sum(m.col(i));
 40 |   }
 41 |   return(s);
 42 | }
 43 | 
 44 | // calculate column mean and variance
 45 | // [[Rcpp::export]]
 46 | Rcpp::DataFrame colMeanVarS(const arma::sp_mat& m, int ncores=1) {
 47 |   
 48 |   arma::vec meanV(m.n_cols,arma::fill::zeros); arma::vec varV(m.n_cols,arma::fill::zeros); arma::vec nobsV(m.n_cols,arma::fill::zeros);
 49 | #pragma omp parallel for num_threads(ncores) shared(meanV,varV,nobsV)
 50 |   for(int i=0;i<(m.n_cols);i++) {
 51 |     meanV(i) = arma::mean(m.col(i));
 52 |     nobsV(i) = m.col(i).n_nonzero;
 53 |     varV(i) = arma::var(m.col(i));
 54 |   }
 55 |   return Rcpp::DataFrame::create(Named("m")=meanV,Named("v")=varV,Named("nobs",nobsV));
 56 | }
 57 | 
 58 | // [[Rcpp::export]]
 59 | arma::sp_mat armaManhattan(const arma::sp_mat& m, int ncores=1,bool verbose=true, bool full=false, bool diag=true)
 60 | {
 61 |   typedef arma::sp_mat::const_col_iterator iter;
 62 |   arma::sp_mat d(m.n_cols,m.n_cols);
 63 |   int tot = (full) ? m.n_cols*m.n_cols - ((diag) ? m.n_cols : 0) : (m.n_cols*m.n_cols + ((diag) ? m.n_cols : 0))/2;
 64 |   Progress p(tot, verbose);
 65 | #pragma omp parallel for num_threads(ncores) shared(d)
 66 |   for(int i=0;i<(m.n_cols);i++) {
 67 |     for(int j = diag ? i : i+1;j<m.n_cols;j++) {
 68 |       if ( !Progress::check_abort())
 69 |       {
 70 |         p.increment(); // update progress
 71 |         iter i_iter = m.begin_col(i);
 72 |         iter j_iter = m.begin_col(j);
 73 |         double num=0;
 74 |         
 75 |         while( (i_iter != m.end_col(i)) && (j_iter != m.end_col(j)) )
 76 |         {
 77 |           if(i_iter.row() == j_iter.row())
 78 |           {
 79 |             num+= std::abs((*i_iter)-(*j_iter));
 80 |             ++i_iter;
 81 |             ++j_iter;
 82 |           } else {
 83 |             if(i_iter.row() < j_iter.row())
 84 |             {
 85 |               num+=std::abs(*i_iter);
 86 |               ++i_iter;
 87 |             } else {
 88 |               num+=std::abs(*j_iter);
 89 |               ++j_iter;
 90 |             }
 91 |           }
 92 |         }
 93 |         for(; i_iter != m.end_col(i); ++i_iter) { num+=std::abs(*i_iter); }
 94 |         for(; j_iter != m.end_col(j); ++j_iter){ num+=std::abs(*j_iter); }
 95 |         d(j,i) = num;
 96 |         if (full) {d(i,j) = d(j,i);}
 97 |       }
 98 |     }
 99 |   }
100 |   
101 |   return(d);
102 | }
103 | 
104 | 
105 | // Correlation coefficient between the columns of a dense matrix m
106 | // [[Rcpp::export]]
107 | arma::sp_mat armaCorr(const arma::mat& m, int ncores=1, bool verbose=true, bool full=false, bool diag=true, bool dist=true)
108 | {
109 |   arma::sp_mat d(m.n_cols,m.n_cols);
110 |   d.zeros();
111 |   int tot = (full) ? m.n_cols*m.n_cols - ((diag) ? m.n_cols : 0) : (m.n_cols*m.n_cols + ((diag) ? m.n_cols : 0))/2;
112 |   Progress p(tot, verbose);
113 | #pragma omp parallel for num_threads(ncores) shared(d)
114 |   for(int i=0;i<(m.n_cols);i++) {
115 |     arma::vec li = m.col(i);
116 |     for(int j = diag ? i : i+1;j<m.n_cols;j++) {
117 |       if ( !Progress::check_abort())
118 |       {
119 |         p.increment(); // update progress
120 |         arma::vec lj = m.col(j);
121 |         d(i,j) = (dist) ? 1-arma::as_scalar(arma::cor(li,lj,0)) : arma::as_scalar(arma::cor(li,lj,0));
122 |         d(j,i) = d(i,j);
123 |       }
124 |     }
125 |   }
126 |   return(d);
127 | }
128 | 
129 | // [[Rcpp::export]]
130 | Rcpp::DataFrame armaColMeans(const arma::sp_mat& m, int ncores=1,bool verbose=true)
131 | {
132 |   arma::vec meanV(m.n_cols,arma::fill::zeros); arma::vec meanAll(m.n_cols,arma::fill::zeros); arma::vec nobsV(m.n_cols,arma::fill::zeros);
133 |   typedef arma::sp_mat::const_col_iterator iter;
134 |   Progress p(m.n_cols, verbose);
135 | #pragma omp parallel for num_threads(ncores) shared(meanV,meanAll,nobsV)
136 |   for(unsigned int i=0;i<m.n_cols;i++) {
137 |     if ( !Progress::check_abort())
138 |     {
139 |       double tot=0; int n=0;
140 |       for(iter i_iter = m.begin_col(i); i_iter != m.end_col(i); ++i_iter) {
141 |         tot+=(*i_iter);
142 |         n++;
143 |       }
144 |       if(tot>0) {
145 |         meanV[i] = tot/n;
146 |         meanAll[i] = tot/m.n_cols;
147 |         nobsV[i] = n;
148 |       }
149 |       p.increment(); // update progress
150 |     }
151 |   }
152 |   return Rcpp::DataFrame::create(Named("mu1")=meanV,Named("mu2")=meanAll,Named("nobs",nobsV));
153 | }
154 | 
155 | // [[Rcpp::export]]
156 | arma::sp_mat scaleUMI(const arma::sp_mat& m, int ncores=1,bool verbose=false)
157 | {
158 |   typedef arma::sp_mat::const_col_iterator iter;
159 |   typedef arma::sp_mat::const_row_col_iterator iter2;
160 |   
161 |   arma::sp_mat d(m.n_rows,m.n_cols);
162 |   d.zeros();
163 |   arma::vec totV(m.n_cols,arma::fill::zeros);
164 |   arma::vec medianV(m.n_cols,arma::fill::zeros);
165 |   Progress p(m.n_cols, verbose);
166 |   
167 | #pragma omp parallel for num_threads(ncores) shared(totV)
168 |   for(unsigned int i=0;i<m.n_cols;i++) {
169 |     if ( !Progress::check_abort())
170 |     {
171 |       double tot=0; int n=0;
172 |       for(iter i_iter = m.begin_col(i); i_iter != m.end_col(i); ++i_iter) {
173 |         tot+=(*i_iter);
174 |       }
175 |       totV(i) = tot;
176 |       p.increment(); // update progress
177 |     }
178 |   }
179 |   
180 |   
181 |   //medianV = arma::median(arma::mat(m),1);
182 |   
183 |   Progress p2(m.n_nonzero, verbose);
184 | #pragma omp parallel for num_threads(ncores) shared(d)
185 |   for(unsigned int i=0;i<m.n_cols;i++) {
186 |     if ( !Progress::check_abort())
187 |     {
188 |       for(iter i_iter = m.begin_col(i); i_iter != m.end_col(i); ++i_iter) {
189 |         d(i_iter.row(),i) = log10( (*i_iter) * 1000000 / totV(i) + 1);
190 |         p2.increment(); // update progress
191 |       }
192 |     }
193 |   }
194 |   
195 |   return(d);
196 | }
197 | 
198 | 


--------------------------------------------------------------------------------
/src/rcpp_mann_whitney.cpp:
--------------------------------------------------------------------------------
 1 | #include "mann_whitney.h"
 2 | #include <Rcpp.h>
 3 | 
 4 | // Compute two-sided Mann–Whitney U test between two umpaired samples. 
 5 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = F)
 6 | //
 7 | // Author: Gennaro Gambardella, Date: 20/08/2019
 8 | 
 9 | 
10 | 
11 | // [[Rcpp::export]]
12 | Rcpp::NumericMatrix rcpp_WMU_test(Rcpp::NumericMatrix M,Rcpp::NumericVector idx1,Rcpp::NumericVector idx2) {
13 |   Rcpp::NumericMatrix res(M.nrow(),2);
14 |   
15 |   for(int i=0;i<M.nrow();i++)
16 |   {
17 |     Rcpp::NumericVector g1 = subset(M.row(i),idx1);
18 |     Rcpp::NumericVector g2 = subset(M.row(i),idx2);
19 |     res(i,0) = MWUtest(g1,g2);
20 |     res(i,1) = std::log2( (double) (avg(g1+1)/avg(g2+1)) );
21 |   }
22 |   
23 |   return res;
24 | }
25 | 


--------------------------------------------------------------------------------
/src/rcpp_parallel_jaccard_coeff.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include <RcppParallel.h>
 3 | 
 4 | // Compute jaccard coefficient between nearest-neighbor sets in parallell
 5 | //
 6 | // Author: Gennaro Gambardella, Date: 12/08/2019
 7 | 
 8 | 
 9 | // [[Rcpp::depends(RcppParallel)]]
10 | struct JCoefficient : public RcppParallel::Worker {
11 |   
12 |   // input matrix to read from
13 |   const RcppParallel::RMatrix<double> mat;
14 |   
15 |   // output matrix to write to
16 |   RcppParallel::RMatrix<double> rmat;
17 |   
18 |   // initialize from Rcpp input and output matrixes (the RMatrix class
19 |   // can be automatically converted to from the Rcpp matrix type)
20 |   JCoefficient(const Rcpp::NumericMatrix& mat, Rcpp::NumericMatrix& rmat)
21 |     : mat(mat), rmat(rmat) {}
22 |   
23 |   // function call operator that work for the specified range (begin/end)
24 |   void operator()(std::size_t begin, std::size_t end) {
25 |     for (std::size_t i = begin; i < end; i++) {
26 |       for (std::size_t j = 0; j < mat.ncol(); j++) {
27 |         
28 |         int k = mat(i,j)-1;
29 |         
30 |         RcppParallel::RMatrix<double>::Row row1(mat.row(i));
31 |         std::vector<double> v1(row1.length());
32 |         std::copy(row1.begin(), row1.end(), v1.begin());
33 |         
34 |         RcppParallel::RMatrix<double>::Row row2(mat.row(k));
35 |         std::vector<double> v2(row2.length());
36 |         std::copy(row2.begin(), row2.end(), v2.begin());
37 |         
38 |         std::sort(v1.begin(), v1.end());
39 |         std::sort(v2.begin(), v2.end());
40 |         
41 |         std::vector<int> v_intersection;
42 |         
43 |         std::set_intersection(v1.begin(), v1.end(),
44 |                               v2.begin(), v2.end(),
45 |                               std::back_inserter(v_intersection));
46 |         int u = v_intersection.size();
47 |         
48 |         if(u>0){
49 |           rmat((i*mat.ncol())+j, 0) = i+1;
50 |           rmat((i*mat.ncol())+j, 1) = k+1;
51 |           rmat((i*mat.ncol())+j, 2) = u/(2.0*mat.ncol() - u);
52 |         }
53 |       }
54 |     }
55 |   }
56 | };
57 | 
58 | // [[Rcpp::export]]
59 | Rcpp::NumericMatrix rcpp_parallel_jaccard_coef(Rcpp::NumericMatrix mat, bool printOutput) {
60 |   
61 |   if (printOutput)
62 |   {
63 |     Rprintf("Running Parallell Jaccard Coefficient Estimation...\n");
64 |   }
65 |   
66 |   // allocate the matrix we will return
67 |   Rcpp::NumericMatrix rmat(mat.nrow()*mat.ncol(),3);
68 |   
69 |   // create the worker
70 |   JCoefficient JCoefficient(mat, rmat);
71 |   
72 |   // call it with parallelFor
73 |   parallelFor(0, mat.nrow(), JCoefficient);
74 |   
75 |   if (printOutput)
76 |   {
77 |     Rprintf("Done!!\n");
78 |   }
79 |   return rmat;
80 | }
81 | 


--------------------------------------------------------------------------------
/src/rcpp_parallel_mann_whitney.cpp:
--------------------------------------------------------------------------------
  1 | #include "mann_whitney.h"
  2 | #include <cmath>
  3 | #include <Rcpp.h>
  4 | #include <RcppParallel.h>
  5 | 
  6 | // Compute two-sided Mann–Whitney U test with coninuity correction between two umpaired samples.
  7 | // In R this c++ function corresond to wilcox.test(x,y,alternative = "two.sided", paired = F,exact = F,correct = F)
  8 | //
  9 | // Author: Gennaro Gambardella, Date: 20/08/2019
 10 | 
 11 | // [[Rcpp::depends(RcppParallel)]]
 12 | struct WMU_test : public RcppParallel::Worker {
 13 |   
 14 |   // input matrix to read from
 15 |   const RcppParallel::RMatrix<double> matX;
 16 |   const RcppParallel::RMatrix<double> matY;
 17 | 
 18 |   // output matrix to write to
 19 |   RcppParallel::RMatrix<double> rmat;
 20 |   
 21 |   // initialize from Rcpp input and output matrixes (the RMatrix class
 22 |   // can be automatically converted to from the Rcpp matrix type)
 23 |   WMU_test(const Rcpp::NumericMatrix& matX, const Rcpp::NumericMatrix& matY, Rcpp::NumericMatrix& rmat)
 24 |     : matX(matX), matY(matY), rmat(rmat) {}
 25 |   
 26 |   // function call operator that work for the specified range (begin/end)
 27 |   void operator()(std::size_t begin, std::size_t end) {
 28 |     for (std::size_t k = begin; k < end; k++) {
 29 |         
 30 |         RcppParallel::RMatrix<double>::Row row1(matX.row(k));
 31 |         RcppParallel::RMatrix<double>::Row row2(matY.row(k));
 32 |       
 33 |         std::vector<double> v1(row1.length());
 34 |         std::copy(row1.begin(), row1.end(), v1.begin());
 35 |         
 36 |         std::vector<double> v2(row2.length());
 37 |         std::copy(row2.begin(), row2.end(), v2.begin());
 38 |         
 39 |         double pval = 1;
 40 |         std::vector<double> values(v1.size());
 41 |         std::vector<double> valuesOrd(v1.size()+v2.size());
 42 |         std::vector<double> valuesRnk(v1.size()+v2.size());
 43 |         
 44 |         // concat vectors
 45 |         std::copy(v1.begin(), v1.end(), values.begin());
 46 |         values.insert( values.end(),v2.begin(),v2.end());
 47 |         
 48 |         // sort
 49 |         std::vector<size_t> idx = sort_indexes(values);
 50 |         for(size_t i=0;i<values.size();i++)
 51 |         {
 52 |           //std::cout << values[idx[i]] << std::endl;
 53 |           valuesOrd[i] = values[idx[i]];
 54 |         }
 55 |         // get ranks
 56 |         valuesRnk = getRanks(valuesOrd);
 57 |         
 58 |         std::vector<double> nties = getCounts(valuesOrd);
 59 |         
 60 |         if(nties.size()>1) {
 61 |           
 62 |           double long U1 = (v1.size() * (v1.size()+1)) * -0.5; 
 63 |           double long U2 = (v2.size() * (v2.size()+1)) * -0.5;
 64 |           std::vector<double> u1_rnk(v1.size(),0);
 65 |           std::vector<double> u2_rnk(v2.size(),0);
 66 |           std::vector<double> v1_ord(v1.size(),0);
 67 |           std::vector<double> v2_ord(v2.size(),0);
 68 |           int u1_i=0;
 69 |           int u2_i = 0;
 70 |           
 71 |           for(size_t i=0;i<valuesOrd.size();i++)
 72 |           {
 73 |             if(idx[i]<v1.size())
 74 |             {
 75 |               U1 +=valuesRnk[i];
 76 |               u1_rnk[u1_i] = valuesRnk[i];
 77 |               v1_ord[u1_i] = valuesOrd[i];
 78 |               u1_i++;
 79 |             } else {
 80 |               U2 +=valuesRnk[i];
 81 |               u2_rnk[u2_i] = valuesRnk[i];
 82 |               v2_ord[u2_i] = valuesOrd[i];
 83 |               u2_i++;
 84 |             }
 85 |           }
 86 |           
 87 |           double mu = v1.size()*v2.size()/2, sig=0, z = 0;
 88 |           sig = getSigma(nties,v1.size(),v2.size());
 89 |           
 90 |           z = U1<U2 ? U1-mu : U2-mu;
 91 |           z = z<0 ? z+0.5 : z-0.5; // Continuity Correction
 92 |           z = z/sig;
 93 |           
 94 |           pval = getPvalue(z);
 95 |         }
 96 |         
 97 |         rmat(k,0) = pval;
 98 |         transform(v1.begin(), v1.end(), v1.begin(),bind2nd(std::plus<double>(), 1.0));
 99 |         transform(v2.begin(), v2.end(), v2.begin(),bind2nd(std::plus<double>(), 1.0));
100 |         rmat(k,1) = log2( (double) (avg(v1)/avg(v2)) );
101 |     }
102 |   }
103 | };
104 | 
105 | // [[Rcpp::export]]
106 | Rcpp::NumericMatrix rcpp_parallel_WMU_test(Rcpp::NumericMatrix matX,Rcpp::NumericMatrix matY, bool printOutput) {
107 |   
108 |   if (printOutput)
109 |   {
110 |     Rprintf("Running Parallell WM-U test...\n");
111 |   }
112 |   
113 |   // allocate the matrix we will return
114 |   Rcpp::NumericMatrix rmat(matX.nrow(),2);
115 |   
116 |   // create the worker
117 |   WMU_test WMU_test(matX, matY, rmat);
118 |   
119 |   // call it with parallelFor
120 |   parallelFor(0, matX.nrow(), WMU_test);
121 |   
122 |   if (printOutput)
123 |   {
124 |     Rprintf("Done!!\n");
125 |   }
126 |   return rmat;
127 | }
128 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(gficf)
3 | 
4 | # test_check("gficf")
5 | 


--------------------------------------------------------------------------------