├── .Rbuildignore ├── .gitignore ├── data ├── all_genes.RData ├── snames_data.RData ├── all_expn_data.RData ├── heatmap_genes.RData └── selected_genes.RData ├── phemd_1.15.0.tar.gz ├── vignettes ├── melanomaData.RData └── phemd.Rmd ├── R ├── zzz.R ├── functions-getResults.R ├── Phemd-methods.R ├── functions-plotting.R └── functions-core.R ├── phemd.Rproj ├── man ├── phateInfo.Rd ├── GDM.Rd ├── monocleInfo.Rd ├── seuratInfo.Rd ├── rawExpn.Rd ├── batchIDs.Rd ├── sNames.Rd ├── all_genes.Rd ├── selectMarkers.Rd ├── heatmap_genes.Rd ├── snames_data.Rd ├── celltypeFreqs.Rd ├── all_expn_data.Rd ├── pooledCells.Rd ├── subsampledIdx.Rd ├── subsampledBool.Rd ├── getSampleSizes.Rd ├── drawColnames45.Rd ├── selected_genes.Rd ├── identifyCentroids.Rd ├── getArithmeticCentroids.Rd ├── removeTinySamples.Rd ├── selectFeatures.Rd ├── gaussianffLocal.Rd ├── aggregateSamples.Rd ├── orderCellsMonocle.Rd ├── createDataObj.Rd ├── retrieveRefClusters.Rd ├── getCellYield.Rd ├── assignCellClusterNearestNode.Rd ├── bindSeuratObj.Rd ├── plotSummaryHistograms.Rd ├── compareSamples.Rd ├── getSampleCelltypeFreqs.Rd ├── plotCellYield.Rd ├── Phemd.Rd ├── generateGDM.Rd ├── printClusterAssignments.Rd ├── getSampleHistsByCluster.Rd ├── plotEmbeddings.Rd ├── clusterIndividualSamples.Rd ├── plotHeatmaps.Rd ├── groupSamples.Rd ├── embedCells.Rd ├── plotGroupedSamplesDmap.Rd └── Phemd-methods.Rd ├── README.md ├── DESCRIPTION ├── LICENSE.md └── NAMESPACE /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | .Ruserdata 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /data/all_genes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/all_genes.RData -------------------------------------------------------------------------------- /phemd_1.15.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/phemd_1.15.0.tar.gz -------------------------------------------------------------------------------- /data/snames_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/snames_data.RData -------------------------------------------------------------------------------- /data/all_expn_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/all_expn_data.RData -------------------------------------------------------------------------------- /data/heatmap_genes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/heatmap_genes.RData -------------------------------------------------------------------------------- /data/selected_genes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/selected_genes.RData -------------------------------------------------------------------------------- /vignettes/melanomaData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/vignettes/melanomaData.RData -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(libname, pkgname) { 2 | msg <- sprintf( 3 | "Package '%s' is deprecated and will be removed from Bioconductor 4 | version %s", pkgname, "3.18") 5 | .Deprecated(msg=paste(strwrap(msg, exdent=2), collapse="\n")) 6 | } 7 | 8 | -------------------------------------------------------------------------------- /phemd.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/phateInfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{phateInfo} 4 | \alias{phateInfo} 5 | \title{Accessor function for stored phate object} 6 | \usage{ 7 | phateInfo(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{A Phemd object.} 11 | } 12 | \value{ 13 | An object of class 'phate' (from phateR) 14 | } 15 | \description{ 16 | Accessor function for stored phate object 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | phateobj <- phateInfo(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/GDM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{GDM} 4 | \alias{GDM} 5 | \title{Accessor function for EMD ground distance matrix} 6 | \usage{ 7 | GDM(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{A Phemd object} 11 | } 12 | \value{ 13 | Sqaure matrix representing pairwise distances between cell subtypes 14 | } 15 | \description{ 16 | Accessor function for EMD ground distance matrix 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | gdm <- GDM(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/monocleInfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{monocleInfo} 4 | \alias{monocleInfo} 5 | \title{Accessor function for stored Monocle object} 6 | \usage{ 7 | monocleInfo(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{A Phemd object.} 11 | } 12 | \value{ 13 | An object of class 'CellDataSet' (from Monocle) 14 | } 15 | \description{ 16 | Accessor function for stored Monocle object 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | monocle_obj <- monocleInfo(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/seuratInfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{seuratInfo} 4 | \alias{seuratInfo} 5 | \title{Accessor function for stored Seurat object within Phemd object} 6 | \usage{ 7 | seuratInfo(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{A Phemd object.} 11 | } 12 | \value{ 13 | An object of class 'Seurat' 14 | } 15 | \description{ 16 | Accessor function for stored Seurat object within Phemd object 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | seurat_obj <- seuratInfo(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/rawExpn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{rawExpn} 4 | \alias{rawExpn} 5 | \title{Accessor function for stored multi-sample raw expression data} 6 | \usage{ 7 | rawExpn(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{A Phemd object.} 11 | } 12 | \value{ 13 | List of matrices, each of which represents a single-cell sample 14 | } 15 | \description{ 16 | Accessor function for stored multi-sample raw expression data 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | raw_expn_data <- rawExpn(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/batchIDs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{batchIDs} 4 | \alias{batchIDs} 5 | \title{Accessor function for batch ID for each sample} 6 | \usage{ 7 | batchIDs(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | Vector of length num_samples representing the experiment (batch) in which the sample was profiled 14 | } 15 | \description{ 16 | Accessor function for batch ID for each sample 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | batch_metadata <- batchIDs(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/sNames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{sNames} 4 | \alias{sNames} 5 | \title{Accessor function for identifiers of all single-cell samples in experiment} 6 | \usage{ 7 | sNames(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | Vector representing sample names corresponding to expression matrices 14 | } 15 | \description{ 16 | Accessor function for identifiers of all single-cell samples in experiment 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | sampleIDs <- sNames(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/all_genes.Rd: -------------------------------------------------------------------------------- 1 | \name{all_genes} 2 | \alias{all_genes} 3 | \docType{data} 4 | \title{All genes included in (subsampled) melanoma single-cell RNA-seq expression data} 5 | \description{This object contains 100 genes measured in melanoma single-cell RNA-seq expression data.} 6 | \usage{ 7 | data(all_genes) 8 | } 9 | \format{ 10 | Vector of length 100 representing row names of each matrix in melanoma expression dataset 11 | } 12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056} 13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)} 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/selectMarkers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{selectMarkers} 4 | \alias{selectMarkers} 5 | \title{Accessor function for gene/protein markers measured in experiment} 6 | \usage{ 7 | selectMarkers(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | Vector representing gene/protein markers corresponding to expression matrices 14 | } 15 | \description{ 16 | Accessor function for gene/protein markers measured in experiment 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | genes <- selectMarkers(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/heatmap_genes.Rd: -------------------------------------------------------------------------------- 1 | \name{heatmap_genes} 2 | \alias{heatmap_genes} 3 | \docType{data} 4 | \title{Genes to be used when plotting heatmap for melanoma single-cell RNA-seq expression data} 5 | \description{This object contains genes to be used when plotting heatmap for melanoma single-cell RNA-seq expression data.} 6 | \usage{ 7 | data(heatmap_genes) 8 | } 9 | \format{ 10 | Vector of length 42 representing selected genes for plotting heatmap. 11 | } 12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056} 13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)} 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/snames_data.Rd: -------------------------------------------------------------------------------- 1 | \name{snames_data} 2 | \alias{snames_data} 3 | \docType{data} 4 | \title{Sample names for melanoma single-cell RNA-seq expression data} 5 | \description{This object contains sample names corresponding to samples contained in melanoma expression data.} 6 | \usage{ 7 | data("snames_data") 8 | } 9 | \format{ 10 | Vector of length 19 representing sample names corresponding to order of samples in all_expn_data in melanomaData dataset. 11 | } 12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056} 13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)} 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/celltypeFreqs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{celltypeFreqs} 4 | \alias{celltypeFreqs} 5 | \title{Accessor function for cell subtype distribution for each sample} 6 | \usage{ 7 | celltypeFreqs(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes) 14 | } 15 | \description{ 16 | Accessor function for cell subtype distribution for each sample 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | celltype_weights <- celltypeFreqs(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/all_expn_data.Rd: -------------------------------------------------------------------------------- 1 | \name{all_expn_data} 2 | \alias{all_expn_data} 3 | \docType{data} 4 | \title{Single-cell RNA-seq expression data for melanoma samples} 5 | \description{This dataset contains normalized single-cell RNA-seq expression data for 19 melanoma samples (immune cells).} 6 | \usage{ 7 | data(all_expn_data) 8 | } 9 | \format{ 10 | A list of length 19 with each element representing a distinct sample. Each list element (sample) is a matrix with dimension num_genes x num_cells. 11 | } 12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056} 13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)} 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/pooledCells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{pooledCells} 4 | \alias{pooledCells} 5 | \title{Accessor function for aggregated cells used for cell subtype definition} 6 | \usage{ 7 | pooledCells(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells) 14 | } 15 | \description{ 16 | Accessor function for aggregated cells used for cell subtype definition 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | cells_aggregated <- pooledCells(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/subsampledIdx.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{subsampledIdx} 4 | \alias{subsampledIdx} 5 | \title{Accessor function for aggregated cells used for cell subtype definition} 6 | \usage{ 7 | subsampledIdx(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | List of vectors each representing the indices of elements in rawExpn(obj) that were subsampled and combined to form "data_aggregate" 14 | } 15 | \description{ 16 | Accessor function for aggregated cells used for cell subtype definition 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | subsampled_idx_list <- subsampledIdx(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/subsampledBool.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \name{subsampledBool} 4 | \alias{subsampledBool} 5 | \title{Accessor function for whether or not cells were subsampled when aggregated for cell subtype analysis} 6 | \usage{ 7 | subsampledBool(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{Phemd object} 11 | } 12 | \value{ 13 | Boolean represent whether or not subsampling was performed in the data aggregation process 14 | } 15 | \description{ 16 | Accessor function for whether or not cells were subsampled when aggregated for cell subtype analysis 17 | } 18 | \examples{ 19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 20 | subsampled <- subsampledBool(phemdObj) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/getSampleSizes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{getSampleSizes} 4 | \alias{getSampleSizes} 5 | \title{Retrieve single-cell sample sizes} 6 | \usage{ 7 | getSampleSizes(data_list) 8 | } 9 | \arguments{ 10 | \item{data_list}{List of length num_samples (each element has dimension num_cells x num_markers)} 11 | } 12 | \value{ 13 | Vector of length num_samples representing number of cells in each sample 14 | } 15 | \description{ 16 | Takes initial list of single-cell samples and returns vector containing number of cells in each sample. 17 | } 18 | \details{ 19 | Private method (not exported in namespace) 20 | } 21 | \examples{ 22 | \dontrun{ 23 | sample_sizes <- getSampleSizes(all_expn_data) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/drawColnames45.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-plotting.R 3 | \name{drawColnames45} 4 | \alias{drawColnames45} 5 | \title{Rotates heatmap marker labels 45 degrees} 6 | \usage{ 7 | drawColnames45(coln, gaps, ...) 8 | } 9 | \arguments{ 10 | \item{coln}{Column names} 11 | 12 | \item{gaps}{Spacing of labels} 13 | 14 | \item{...}{Additional parameters to be passed to \code{gpar}} 15 | } 16 | \value{ 17 | Formatted marker labels in heatmap 18 | } 19 | \description{ 20 | Overwrites default draw_colnames in the pheatmap package 21 | } 22 | \details{ 23 | To be used with pheatmap plotting function; not to be called directly. Thanks to Josh O'Brien at http://stackoverflow.com/questions/15505607 24 | } 25 | \examples{ 26 | #Not to be called directly 27 | } 28 | -------------------------------------------------------------------------------- /man/selected_genes.Rd: -------------------------------------------------------------------------------- 1 | \name{selected_genes} 2 | \alias{selected_genes} 3 | \docType{data} 4 | \title{Genes to be used when performing clustering and trajectory analyses on melanoma single-cell RNA-seq expression data} 5 | \description{This object contains genes to be used when performing clustering and trajectory analyses on melanoma single-cell RNA-seq expression data.} 6 | \usage{ 7 | data(selected_genes) 8 | } 9 | \format{ 10 | Vector of length 44 representing selected genes for performing computational analyses such as generating cell embeddings and clustering cell subtypes. 11 | } 12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056} 13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)} 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/identifyCentroids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{identifyCentroids} 4 | \alias{identifyCentroids} 5 | \title{Identify cluster centroids (cell names)} 6 | \usage{ 7 | identifyCentroids(ref_clusters) 8 | } 9 | \arguments{ 10 | \item{ref_clusters}{list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers)} 11 | } 12 | \value{ 13 | List of names; element \var{i} represents the name of the cell in cluster \var{i} that is closest to the centroid (arithmetic mean) of cluster \var{i} 14 | } 15 | \description{ 16 | Takes initial list and returns list of cell names representing centroid of cluster 17 | } 18 | \details{ 19 | Private method (not exported in namespace) 20 | } 21 | \examples{ 22 | \dontrun{ 23 | centroid_names <- identifyCentroids(ref_clusters) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/getArithmeticCentroids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{getArithmeticCentroids} 4 | \alias{getArithmeticCentroids} 5 | \title{Get arithmetic centroids (coordinates)} 6 | \usage{ 7 | getArithmeticCentroids(ref_clusters) 8 | } 9 | \arguments{ 10 | \item{ref_clusters}{list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers)} 11 | } 12 | \value{ 13 | Matrix of dimension num_cluster x num_markers; row \var{i} representing the arithmetic centroid of cluster \var{i} 14 | } 15 | \description{ 16 | Takes initial list and returns a matrix with row \var{i} representing the arithmetic centroid of cluster \var{i} 17 | } 18 | \details{ 19 | Private method (not exported in namespace) 20 | } 21 | \examples{ 22 | \dontrun{ 23 | cluster_centroids <- getArithmeticCentroids(ref_clusters) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PhEMD - "Phenotypic Earth Mover's Distance" 2 | PhEMD is a package for relating a network of single-cell samples. By first modeling the cell-state landscape of a model system and then computing pairwise distances between samples using EMD, PhEMD constructs a low-dimensional embedding that uncovers intrinsic manifold structure. 3 | 4 | 5 | ## Installing PhEMD 6 | PhEMD can currently be installed directly through Bioconductor as follows: 7 | ``` 8 | BiocManager::install("phemd") 9 | ``` 10 | Alternatively, the package can be cloned and installed directly from Git as below (although installation via Bioconductor is preferred). 11 | ``` 12 | git clone --recursive git://github.com/wschen/phemd.git 13 | cd phemd 14 | R CMD INSTALL . 15 | ``` 16 | 17 | ## Running PhEMD 18 | See https://bioconductor.org/packages/release/bioc/vignettes/phemd/inst/doc/phemd.html for a step-by-step tutorial on applying PhEMD to a multi-sample single-cell dataset. 19 | -------------------------------------------------------------------------------- /man/removeTinySamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{removeTinySamples} 4 | \alias{removeTinySamples} 5 | \title{Remove samples with too few cells} 6 | \usage{ 7 | removeTinySamples(obj, min_sz = 20) 8 | } 9 | \arguments{ 10 | \item{obj}{'Phemd' object containing raw expression data and associated metadata} 11 | 12 | \item{min_sz}{Minimum number of cells in each sample to be retained} 13 | } 14 | \value{ 15 | 'Phemd' object containing raw multi-sample expression data and associated metadata (same as input minus removed samples) 16 | } 17 | \description{ 18 | Removes samples from Phemd that have fewer cells than \code{min_sz} 19 | } 20 | \details{ 21 | Note: If used, this function must be called before (and not after) the \code{aggregateSamples} function is called 22 | } 23 | \examples{ 24 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 25 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) #removes samples with fewer than 10 cells 26 | 27 | } 28 | -------------------------------------------------------------------------------- /man/selectFeatures.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{selectFeatures} 4 | \alias{selectFeatures} 5 | \title{Perform feature selection on aggregated data} 6 | \usage{ 7 | selectFeatures(obj, selected_genes) 8 | } 9 | \arguments{ 10 | \item{obj}{'Phemd' object containing aggregated data} 11 | 12 | \item{selected_genes}{Vector containing names of genes to use for downstream analyses} 13 | } 14 | \value{ 15 | Same as input 'Phemd' object after performing feature-selection based dimensionality reduction on aggregated expression data 16 | } 17 | \description{ 18 | Takes as input a Phemd object with aggregated data and returns updated object after performing feature selection on aggregated data 19 | } 20 | \details{ 21 | \code{aggregateSamples} needs to be called before running this function 22 | } 23 | \examples{ 24 | 25 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 26 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 27 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 28 | my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes=c('TP53', 29 | 'EGFR', 'KRAS', 'FOXP3', 'LAG3')) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/gaussianffLocal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{gaussianffLocal} 4 | \alias{gaussianffLocal} 5 | \title{Models expression data using generalized linear model with Gaussian error} 6 | \usage{ 7 | gaussianffLocal(dispersion = 0, parallel = FALSE, zero = NULL) 8 | } 9 | \arguments{ 10 | \item{dispersion}{Dispersion parameter. If 0, then estimate as described in VGAM 1.0-5 documentation.} 11 | 12 | \item{parallel}{A logical or formula. If a formula, the response of the formula should be a logical and the terms of the formula indicates whether or not those terms are parallel.} 13 | 14 | \item{zero}{An integer-valued vector specifying which linear/additive predictors are modelled as intercepts only. The values must be from the set {1...M} where Mis the number of columns of the matrix response.} 15 | } 16 | \value{ 17 | Generalized linear model with Gaussian error 18 | } 19 | \description{ 20 | Useful for modeling pre-normalized single-cell expression data. 21 | } 22 | \details{ 23 | Private method (not to be called by user directly). Requires VGAM package. Obtained from VGAM v1.0-5 (https://www.rdocumentation.org/packages/VGAM/versions/1.0-5/topics/gaussianff) 24 | } 25 | -------------------------------------------------------------------------------- /man/aggregateSamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{aggregateSamples} 4 | \alias{aggregateSamples} 5 | \title{Aggregate expression data from all samples} 6 | \usage{ 7 | aggregateSamples(obj, max_cells = 12000) 8 | } 9 | \arguments{ 10 | \item{obj}{'Phemd' object containing raw expression data and associated metadata} 11 | 12 | \item{max_cells}{Maximum number of cells across all samples to be included in final matrix on which Monocle 2 will be run} 13 | } 14 | \value{ 15 | Same as input 'Phemd' object with additional slot 'data_aggregate' containing aggregated expression data (num_markers x num_cells) 16 | } 17 | \description{ 18 | Takes initial Phemd object and returns object with additional data frame in slot @data_aggregate containing cells aggregated from all samples (to be used for further analyses e.g. Monocle 2 trajectory building / pseudotime mapping / cell clustering) 19 | } 20 | \details{ 21 | Subsamples cells as necessary based on \code{max_cells}. If subsampling is performed, an equal number of cells are subsampled from each sample 22 | } 23 | \examples{ 24 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 25 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 26 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/orderCellsMonocle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{orderCellsMonocle} 4 | \alias{orderCellsMonocle} 5 | \title{Compute Monocle2 cell state and pseudotime assignments} 6 | \usage{ 7 | orderCellsMonocle(obj, ...) 8 | } 9 | \arguments{ 10 | \item{obj}{'Phemd' object containing Monocle2 object initialized using embedCells} 11 | 12 | \item{...}{Additional parameters to be passed into \code{orderCells} function} 13 | } 14 | \value{ 15 | Same as input 'Phemd' object with updated cell-state embedding object containing cell state assignments 16 | } 17 | \description{ 18 | Takes as input a Phemd object with Monocle2 object and returns updated object with Monocle2 object containing cell state and pseudotime assignments 19 | } 20 | \details{ 21 | Wrapper function for \code{orderCells} in Monocle 2 package. \code{embedCells} needs to be called before calling this function. 22 | } 23 | \examples{ 24 | 25 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 26 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 27 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 28 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model='gaussianff', sigma=0.02, maxIter=2) 29 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 30 | } 31 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: phemd 2 | Type: Package 3 | Title: Phenotypic EMD for comparison of single-cell samples 4 | Version: 1.15.1 5 | Authors@R: person("William S", "Chen", email="wil.yum.chen@gmail.com",role=c("aut","cre")) 6 | Description: Package for comparing and generating a low-dimensional embedding 7 | of multiple single-cell samples. 8 | License: GPL-2 9 | Encoding: UTF-8 10 | LazyData: true 11 | Depends: R (>= 4.0), monocle, Seurat 12 | Imports: 13 | SingleCellExperiment, 14 | RColorBrewer, 15 | igraph, 16 | transport, 17 | pracma, 18 | cluster, 19 | Rtsne, 20 | destiny, 21 | RANN, 22 | ggplot2, 23 | maptree, 24 | pheatmap, 25 | scatterplot3d, 26 | VGAM, 27 | methods, 28 | grDevices, 29 | graphics, 30 | stats, 31 | utils, 32 | cowplot, 33 | S4Vectors, 34 | BiocGenerics, 35 | SummarizedExperiment, 36 | Biobase, 37 | phateR, 38 | reticulate 39 | Config/reticulate: 40 | list( 41 | packages = list( 42 | list(package = "phate") 43 | ) 44 | ) 45 | Suggests: knitr 46 | VignetteBuilder: knitr 47 | biocViews: 48 | Clustering, 49 | ComparativeGenomics, 50 | Proteomics, 51 | Transcriptomics, 52 | Sequencing, 53 | DimensionReduction, 54 | SingleCell, 55 | DataRepresentation, 56 | Visualization, 57 | MultipleComparison 58 | RoxygenNote: 7.0.2 59 | PackageStatus: Deprecated 60 | -------------------------------------------------------------------------------- /man/createDataObj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{createDataObj} 4 | \alias{createDataObj} 5 | \title{Create 'Phemd' object} 6 | \usage{ 7 | createDataObj(data, markers, snames, datatype = "list", valtype = "counts") 8 | } 9 | \arguments{ 10 | \item{data}{List of length \var{num_samples} containing expression data; each element is of size \var{num_cells} x \var{num_markers}. Alternately a SingleCellExperiment object.} 11 | 12 | \item{markers}{Vector containing marker names (i.e. column names of \code{all_data})} 13 | 14 | \item{snames}{Vector containing sample names (i.e. names of samples contained in \code{all_data})} 15 | 16 | \item{datatype}{Either "list" or "sce" (SingleCellExperiment with genes x cells)} 17 | 18 | \item{valtype}{Type of assay data (i.e. "counts", "normcounts", "logcounts", "tpm", "cpm") if datatype is "sce"} 19 | } 20 | \value{ 21 | 'Phemd' object containing raw multi-sample expression data and associated metadata 22 | } 23 | \description{ 24 | Wrapper function to create 'Phemd' object containing raw expression data and metadata 25 | } 26 | \details{ 27 | Note that each element in list can have different number of rows (i.e. number of cells in each sample can vary). 28 | } 29 | \examples{ 30 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/retrieveRefClusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{retrieveRefClusters} 4 | \alias{retrieveRefClusters} 5 | \title{Retrieve reference cell clusters} 6 | \usage{ 7 | retrieveRefClusters( 8 | obj, 9 | cell_model = c("monocle2", "seurat", "phate"), 10 | expn_type = "reduced", 11 | ndim = 10 12 | ) 13 | } 14 | \arguments{ 15 | \item{obj}{Phemd struct containing cell-state embedding object and underlying expression data} 16 | 17 | \item{cell_model}{String representing data model for cell-state space ("seurat", "monocle2", or "phate")} 18 | 19 | \item{expn_type}{String representing whether to return raw expression values or coordinates in dimensionality-reduced feature space} 20 | 21 | \item{ndim}{Number of dimensions in reduced dimensionality space (e.g. PHATE / CCA) to use (only relevant in reduced dimensionality space)} 22 | } 23 | \value{ 24 | List of data matrices; each list element is of size num_cells_in_cluster x num_markers and represents a distinct cell cluster 25 | } 26 | \description{ 27 | Takes initial Phemd struct and returns cell clusters as assigned by clustering algorithm (e.g. PHATE or Monocle2) 28 | } 29 | \details{ 30 | Private method (not exported in namespace) 31 | } 32 | \examples{ 33 | \dontrun{ 34 | cluster_expression_data <- retrieveRefClusters(my_phemdObj) 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /man/getCellYield.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-getResults.R 3 | \name{getCellYield} 4 | \alias{getCellYield} 5 | \title{Gets cell yield of each sample as a table} 6 | \usage{ 7 | getCellYield(myobj, cluster_assignments = NULL) 8 | } 9 | \arguments{ 10 | \item{myobj}{phemdObj object containing expression data for each sample in 'data' slot} 11 | 12 | \item{cluster_assignments}{Vector of cluster assignments to be included as additional column in output table (optional)} 13 | } 14 | \value{ 15 | Data frame representing cell yield of each sample 16 | } 17 | \description{ 18 | Gets cell yield (number of viable cells) of each single-cell sample in decreasing order 19 | } 20 | \examples{ 21 | 22 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 23 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 24 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 25 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 26 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 27 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 28 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 29 | my_EMD_mat <- compareSamples(my_phemdObj_final) 30 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 31 | getCellYield(my_phemdObj_final, cluster_assignments) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/assignCellClusterNearestNode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{assignCellClusterNearestNode} 4 | \alias{assignCellClusterNearestNode} 5 | \title{Assign cells to a reference cell subtype} 6 | \usage{ 7 | assignCellClusterNearestNode( 8 | cur_cells, 9 | ref_cells, 10 | ref_cell_labels, 11 | cell_model = c("monocle2", "seurat", "phate") 12 | ) 13 | } 14 | \arguments{ 15 | \item{cur_cells}{Matrix of cells to be assigned to clusters (Dim: \var{num_cells} x \var{num_markers})} 16 | 17 | \item{ref_cells}{Matrix of cells used to build reference Monocle 2 tree (Dim: \var{num_monocle_cells} x \var{num_markers})} 18 | 19 | \item{ref_cell_labels}{Vector of length \var{num_monocle_cells} containing Monocle 2 cell branch assignments} 20 | 21 | \item{cell_model}{Either "monocle2", "seurat", or "phate" depending on method used to model cell state space} 22 | } 23 | \value{ 24 | Vector of length \var{num_cells} representing cluster assignments for each cell in \var{cur_cells} 25 | } 26 | \description{ 27 | Assigns each cell in \code{cur_cells} to a cluster based on nearest cell in Monocle 2 tree 28 | } 29 | \details{ 30 | Private method (not exported in namespace). Uses RANN package for fast knn search 31 | } 32 | \examples{ 33 | \dontrun{ 34 | cur_cells_cluster_labels <- assignCellClusterNearestNode(cur_cells_expn_data, 35 | clustered_cells_expn_data, clustered_cells_cluster_labels, cell_model='monocle2') 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/bindSeuratObj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{bindSeuratObj} 4 | \alias{bindSeuratObj} 5 | \title{Attach 'Seurat' object to 'Phemd' object} 6 | \usage{ 7 | bindSeuratObj(phemd_obj, seurat_obj, batch.colname = "plt") 8 | } 9 | \arguments{ 10 | \item{phemd_obj}{Phemd object initialized using createDataObj} 11 | 12 | \item{seurat_obj}{S4 'seurat' object containing batch-normalized reference cell data} 13 | 14 | \item{batch.colname}{Name of column in Seurat object that denotes batch ID} 15 | } 16 | \value{ 17 | 'Phemd' object containing with attached Seurat object 18 | } 19 | \description{ 20 | Allows user to attach batch-normalized reference cell data from Seurat into 'Phemd' object containing raw expression data and metadata 21 | } 22 | \examples{ 23 | 24 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 25 | my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A") 26 | my_seuratObj <- Seurat::FindVariableFeatures(object = my_seuratObj) 27 | my_seuratObj <- Seurat::ScaleData(object = my_seuratObj, do.scale=FALSE, do.center=FALSE) 28 | my_seuratObj <- Seurat::RunPCA(object = my_seuratObj, pc.genes = colnames(all_expn_data[[1]]), do.print = FALSE) 29 | my_seuratObj <- Seurat::FindNeighbors(my_seuratObj, reduction = "pca", dims.use = 1:10) 30 | my_seuratObj <- Seurat::FindClusters(my_seuratObj, resolution = 0.6, print.output = 0, save.SNN = TRUE) 31 | my_phemdObj <- bindSeuratObj(my_phemdObj, my_seuratObj) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ---------------------------------- 2 | 3 | Non-Commercial License 4 | Yale Copyright © 2024 Yale University. 5 | 6 | Permission is hereby granted to use, copy, modify, and distribute this Software for any non-commercial purpose. Any distribution or modification or derivations of the Software (together “Derivative Works”) must be made available on GitHub and shall include this copyright notice and this permission notice in all copies or substantial portions of the Software. For the purposes of this license, "non-commercial" means not intended for or directed towards commercial advantage or monetary compensation either via the Software itself or Derivative Works or uses of either which lead to or generate any commercial products. In any event, the use and modification of the Software or Derivative Works shall remain governed by the terms and conditions of this Agreement; Any commercial use of the Software requires a separate commercial license from the copyright holder at Yale University. Direct any requests for commercial licenses to Yale Ventures at yaleventures@yale.edu. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | 10 | ---------------------------------- 11 | -------------------------------------------------------------------------------- /man/plotSummaryHistograms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-plotting.R 3 | \name{plotSummaryHistograms} 4 | \alias{plotSummaryHistograms} 5 | \title{Plots cell subtype frequency histograms summarizing each group of samples} 6 | \usage{ 7 | plotSummaryHistograms( 8 | myobj, 9 | cluster_assignments, 10 | cell_model = c("monocle2", "seurat", "phate"), 11 | cmap = NULL, 12 | ncol.plot = 4, 13 | ax.lab.sz = 2.5, 14 | title.sz = 3 15 | ) 16 | } 17 | \arguments{ 18 | \item{myobj}{Phemd object containing cell subtype relative frequency in @data_cluster_weights slot} 19 | 20 | \item{cluster_assignments}{Vector containing group assignments for each sample in myobj} 21 | 22 | \item{cell_model}{Method by which cell state was modeled (either "monocle2", "seurat", or "phate")} 23 | 24 | \item{cmap}{Vector containing colors by which histogram bars should be colored (optional)} 25 | 26 | \item{ncol.plot}{Number of columns to use to plot multi-panel histogram plot} 27 | 28 | \item{ax.lab.sz}{Scaling factor for axis labels (default 2.5)} 29 | 30 | \item{title.sz}{Scaling factor for plot title (default 3)} 31 | } 32 | \value{ 33 | None 34 | } 35 | \description{ 36 | Visualizes plots of relative frequency ("weights") of cell subtypes ("bins" or "signatures") summarizing each group of single-cell samples. Each summary histogram is computed by taking the bin-wise mean of all samples in the group 37 | } 38 | \details{ 39 | \code{groupSamples} must be called before calling this function. Saves plots in directory called "summary_inhibs" 40 | } 41 | -------------------------------------------------------------------------------- /man/compareSamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{compareSamples} 4 | \alias{compareSamples} 5 | \title{Computes EMD distance matrix representing pairwise dissimilarity between samples} 6 | \usage{ 7 | compareSamples(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{'Phemd' object containing cell subtype relative frequencies for each sample in @data_cluster_weights slot and ground distance matrix (representing cell subtype dissimilarity) in @emd_dist_mat slot} 11 | } 12 | \value{ 13 | Distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples 14 | } 15 | \description{ 16 | Takes as input a Phemd object with cell subtype relative frequencies for each sample in @data_cluster_weights slot and ground distance matrix (representing cell subtype pairwise dissimilarity) in @emd_dist_mat slot. Returns distance matrix representing pairwise dissimilarity between samples 17 | } 18 | \details{ 19 | Requires 'transport' and 'pracma' packages 20 | } 21 | \examples{ 22 | 23 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 24 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 25 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 26 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 27 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 28 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 29 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 30 | my_EMD_mat <- compareSamples(my_phemdObj_final) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/getSampleCelltypeFreqs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-getResults.R 3 | \name{getSampleCelltypeFreqs} 4 | \alias{getSampleCelltypeFreqs} 5 | \title{Returns cell subtype distribution for each sample as a table} 6 | \usage{ 7 | getSampleCelltypeFreqs(myobj, cluster_assignments = NULL) 8 | } 9 | \arguments{ 10 | \item{myobj}{phemdObj object containing expression data for each sample in 'data' slot} 11 | 12 | \item{cluster_assignments}{Vector of cluster assignments to be included as additional column in output table (optional)} 13 | } 14 | \value{ 15 | Data frame representing relative frequencies of each cell subtype along with (optional) final inhibitor cluster assignment for each single-cell sample 16 | } 17 | \description{ 18 | Returns cell subtype distribution for each single-cell sample along with (optional) final inhibitor cluster assignment 19 | } 20 | \examples{ 21 | 22 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 23 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 24 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 25 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 26 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 27 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 28 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 29 | my_EMD_mat <- compareSamples(my_phemdObj_final) 30 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 31 | getSampleCelltypeFreqs(my_phemdObj_final, cluster_assignments) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/plotCellYield.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-plotting.R 3 | \name{plotCellYield} 4 | \alias{plotCellYield} 5 | \title{Plot cell yield of each sample as bar plot} 6 | \usage{ 7 | plotCellYield(myobj, labels = NULL, cmap = NULL, font_sz = 0.6, w = 8, h = 9.5) 8 | } 9 | \arguments{ 10 | \item{myobj}{Phmed object containing expression data for each sample in 'data' slot} 11 | 12 | \item{labels}{Vector containing group labels for samples (optional). If not provided, bars will be of uniform color (blue)} 13 | 14 | \item{cmap}{Vector containing colors by which histogram bars should be colored (optional)} 15 | 16 | \item{font_sz}{Scaling factor for font size of sample names in barplot} 17 | 18 | \item{w}{Width of plot in inches} 19 | 20 | \item{h}{Height of plot in inches} 21 | } 22 | \value{ 23 | None 24 | } 25 | \description{ 26 | Plots cell yield (number of viable cells) of each single-cell sample in decreasing order as horizontal bar plot 27 | } 28 | \examples{ 29 | 30 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 31 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 32 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 33 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 34 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 35 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 36 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 37 | my_EMD_mat <- compareSamples(my_phemdObj_final) 38 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 39 | plotCellYield(my_phemdObj_final, labels=cluster_assignments, font_sz = 0.8) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/Phemd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \docType{class} 4 | \name{Phemd} 5 | \alias{Phemd} 6 | \alias{Phemd-class} 7 | \title{Phemd class} 8 | \description{ 9 | The main PhEMD class to store single-cell expression data. 10 | } 11 | \section{Fields}{ 12 | 13 | \describe{ 14 | \item{\code{data}}{List of matrices, each of which represents a single-cell sample (num_cells x num_genes)} 15 | 16 | \item{\code{markers}}{Column names (e.g. genes) for each element (i.e. data matrix) in "data"} 17 | 18 | \item{\code{snames}}{Sample ID for each element in "data"} 19 | 20 | \item{\code{data_aggregate}}{Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells)} 21 | 22 | \item{\code{data_subsample_idx}}{List of vectors each representing the indices of elements in "data" that were subsampled and combined to form "data_aggregate"} 23 | 24 | \item{\code{subsampled_bool}}{Boolean represent whether or not subsampling was performed in the data aggregation process} 25 | 26 | \item{\code{monocle_obj}}{Data object of type "CellDataSet" that is the core Monocle data structure} 27 | 28 | \item{\code{data_cluster_weights}}{Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes)} 29 | 30 | \item{\code{emd_dist_mat}}{Matrix representing pairwise distances between each pair of cell subtypes} 31 | 32 | \item{\code{seurat_obj}}{Object of class "Seurat" that is the core Seurat data structure} 33 | 34 | \item{\code{phate_obj}}{Object of class "phate" that is the core PHATE data structure} 35 | 36 | \item{\code{experiment_ids}}{Vector of length num_samples representing the experiment (batch) in which the sample was profiled} 37 | }} 38 | 39 | -------------------------------------------------------------------------------- /man/generateGDM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{generateGDM} 4 | \alias{generateGDM} 5 | \title{Computes ground distance matrix based on cell embedding} 6 | \usage{ 7 | generateGDM( 8 | obj, 9 | cell_model = c("monocle2", "seurat", "phate"), 10 | expn_type = "reduced", 11 | ndim = 8 12 | ) 13 | } 14 | \arguments{ 15 | \item{obj}{'Phemd' object containing cell-state embedding object} 16 | 17 | \item{cell_model}{Method by which cell state was modeled (either "monocle2", "seurat", or "phate")} 18 | 19 | \item{expn_type}{Data type to use to determine cell-type dissimilarities} 20 | 21 | \item{ndim}{Number of embedding dimensions to be used for computing cell-type dissimilarity (optional)} 22 | } 23 | \value{ 24 | Phemd object with ground distance matrix (to be used in EMD computation) in @data_cluster_weights slot 25 | } 26 | \description{ 27 | Takes as input a Phemd object containing cell-state embedding object. Returns updated object with ground distance matrix representing pairwise distances between distinct cell subtypes based on cell state embedding. 28 | } 29 | \details{ 30 | \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Requires 'igraph' package 31 | } 32 | \examples{ 33 | 34 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 35 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 36 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 37 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 38 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 39 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 40 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /man/printClusterAssignments.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-getResults.R 3 | \name{printClusterAssignments} 4 | \alias{printClusterAssignments} 5 | \title{Writes samples to file based on community detection group assignments} 6 | \usage{ 7 | printClusterAssignments(cluster_assignments, obj, dest, overwrite = FALSE) 8 | } 9 | \arguments{ 10 | \item{cluster_assignments}{Vector containing group assignments for each sample} 11 | 12 | \item{obj}{phemdObj object containing sample names in @snames slot} 13 | 14 | \item{dest}{Path to existing directory where output should be saved} 15 | 16 | \item{overwrite}{Boolean representing whether or not to overwrite contents of "dest" with output of printClusterAssignments} 17 | } 18 | \value{ 19 | None 20 | } 21 | \description{ 22 | Takes vector of cluster assignments and phemdObj containing sample names and writes sample groups to file 23 | } 24 | \details{ 25 | Order of samples in obj@snames is assumed to be the same as the order of group assignments in cluster_assignments 26 | } 27 | \examples{ 28 | 29 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 30 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 31 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 32 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 33 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 34 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 35 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 36 | my_EMD_mat <- compareSamples(my_phemdObj_final) 37 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 38 | printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE) 39 | 40 | } 41 | -------------------------------------------------------------------------------- /man/getSampleHistsByCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-getResults.R 3 | \name{getSampleHistsByCluster} 4 | \alias{getSampleHistsByCluster} 5 | \title{Gets cell subtype frequency histograms for each sample by cluster ID} 6 | \usage{ 7 | getSampleHistsByCluster( 8 | myobj, 9 | cluster_assignments, 10 | cell_model = c("monocle2", "seurat") 11 | ) 12 | } 13 | \arguments{ 14 | \item{myobj}{phemdObj object containing cell subtype relative frequency in @data_cluster_weights slot} 15 | 16 | \item{cluster_assignments}{Vector containing group assignments for each sample in myobj} 17 | 18 | \item{cell_model}{Method by which cell state was modeled (either "monocle2" or "seurat")} 19 | } 20 | \value{ 21 | List of lists, with outer list representing sample cluster ID and inner list representing cell subtype frequencies of given sample 22 | } 23 | \description{ 24 | Gets relative frequency ("weights") of cell subtypes ("bins" or "signatures") in each single-cell sample 25 | } 26 | \details{ 27 | \code{groupSamples} must be called before calling this function. Saves plots in directory called "individual_inhibs" 28 | } 29 | \examples{ 30 | 31 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 32 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 33 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 34 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 35 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 36 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 37 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 38 | my_EMD_mat <- compareSamples(my_phemdObj_final) 39 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 40 | weights_by_cluster <- getSampleHistsByCluster(my_phemdObj_final, cluster_assignments) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /man/plotEmbeddings.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-plotting.R 3 | \name{plotEmbeddings} 4 | \alias{plotEmbeddings} 5 | \title{Plots Monocle2 cell embedding plots} 6 | \usage{ 7 | plotEmbeddings( 8 | obj, 9 | cell_model = c("monocle2", "seurat", "phate"), 10 | cmap = NULL, 11 | w = 4, 12 | h = 5, 13 | pt_sz = 1, 14 | ndims = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{obj}{'Phemd' object containing Monocle 2 object} 19 | 20 | \item{cell_model}{Method by which cell state was modeled (either "monocle2", "seurat", or "phate)} 21 | 22 | \item{cmap}{User-specified colormap to use to color cell state embedding (optional)} 23 | 24 | \item{w}{Width of plot in inches} 25 | 26 | \item{h}{Height of plot in inches} 27 | 28 | \item{pt_sz}{Scalar factor for point size} 29 | 30 | \item{ndims}{Number of dimensions to use for dimensionality reduction in case it hasn't been performed yet (only relevant when using Seurat data as input)} 31 | } 32 | \value{ 33 | Colormap (vector of colors) used to color Monocle2 cell state embedding 34 | } 35 | \description{ 36 | Takes as input a Phemd object containing either a Monocle2 object or Seurat object (already embedded and ordered) and plots cell embedding plots side by side. Optionally saves to specified folder. 37 | } 38 | \details{ 39 | \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Required additional packages: 'RColorBrewer', 'cowplot' 40 | } 41 | \examples{ 42 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 43 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 44 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 45 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model='gaussianff', sigma=0.02, maxIter=2) 46 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 47 | cmap <- plotEmbeddings(my_phemdObj_monocle) 48 | } 49 | -------------------------------------------------------------------------------- /man/clusterIndividualSamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{clusterIndividualSamples} 4 | \alias{clusterIndividualSamples} 5 | \title{Computes cell subtype abundances for each sample} 6 | \usage{ 7 | clusterIndividualSamples( 8 | obj, 9 | verbose = FALSE, 10 | cell_model = c("monocle2", "seurat", "phate") 11 | ) 12 | } 13 | \arguments{ 14 | \item{obj}{'Phemd' object containing single-cell expression data of all samples in @data slot and cell-state embedding object generated and stored using the embedCells function.} 15 | 16 | \item{verbose}{Boolean that determines whether progress (sequential processing of samples) should be printed. FALSE by default} 17 | 18 | \item{cell_model}{Either "monocle2", "seurat", or "phate" depending on method used to model cell state space} 19 | } 20 | \value{ 21 | 'Phemd' object with cell subtype frequencies of each sample that can be retrieved using the 'celltypeFreqs' accessor function 22 | } 23 | \description{ 24 | Takes as input a Phemd object with all single-cell expression data of all single-cell samples in @data slot and cell-state embedding generated by embedCells. Returns updated object with cell subtype frequencies of each sample that may be retrieved by the 'celltypeFreqs' accessor function. 25 | } 26 | \details{ 27 | \code{embedCells} (and \code{orderCellsMonocle} if using the Monocle2 embedding technique) needs to be called before calling this function. 28 | } 29 | \examples{ 30 | 31 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 32 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 33 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 34 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 35 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 36 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/plotHeatmaps.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-plotting.R 3 | \name{plotHeatmaps} 4 | \alias{plotHeatmaps} 5 | \title{Plot heatmap of cell subtypes} 6 | \usage{ 7 | plotHeatmaps( 8 | obj, 9 | cell_model = c("monocle2", "seurat", "phate"), 10 | selected_genes = NULL, 11 | w = 8, 12 | h = 5, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{obj}{'Phemd' object containing cell-state embedding object} 18 | 19 | \item{cell_model}{Method by which cell state was modeled ("monocle2", "seurat", or "phate")} 20 | 21 | \item{selected_genes}{Vector containing gene names to include in heatmap (optional)} 22 | 23 | \item{w}{Width of plot in inches} 24 | 25 | \item{h}{Height of plot in inches} 26 | 27 | \item{...}{Additional parameters to be passed on to pheatmap function} 28 | } 29 | \value{ 30 | Heatmap containing expression values for each cell subtype. If cell_model is 'seurat', then returns a list of heatmaps (1 for each batch) that may be subsequently plotted individually 31 | } 32 | \description{ 33 | Takes as input a Phemd object containing either a Monocle2, Seurat, or PHATE object (already embedded and clustered) and plots heatmap characterizing cell subtypes 34 | } 35 | \details{ 36 | \code{embedCells} (and \code{orderCellsMonocle} if using Monocle2) need to be called before calling this function. Required additional package: 'pheatmap' 37 | } 38 | \examples{ 39 | 40 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 41 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 42 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 43 | my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes) 44 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', 45 | pseudo_expr=0, sigma=0.02, maxIter=2) 46 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 47 | myheatmap <- plotHeatmaps(my_phemdObj_monocle, cell_model='monocle2') 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/groupSamples.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{groupSamples} 4 | \alias{groupSamples} 5 | \title{Performs community detection on sample-sample distance matrix to identify groups of similar samples} 6 | \usage{ 7 | groupSamples( 8 | distmat, 9 | distfun = "hclust", 10 | ncluster = NULL, 11 | method = "complete", 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{distmat}{A distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples} 17 | 18 | \item{distfun}{Method of partitioning network of samples (currently either 'hclust' or 'pam')} 19 | 20 | \item{ncluster}{Optional parameter specifying total number of sample groups} 21 | 22 | \item{method}{Optional parameter for hierarchical clustering (see "hclust" documentation)} 23 | 24 | \item{...}{Optional additional parameters to be passed to diffusionKmeans method} 25 | } 26 | \value{ 27 | Vector containing group assignments for each sample (same order as row-order of distmat) based on user-specified partitioning method (e.g. hierarchical clustering) 28 | } 29 | \description{ 30 | Takes sample-sample distance matrix as input and returns group assignments for each sample 31 | } 32 | \details{ 33 | By default, uses 'kgs' (Kelley-Gardner-Sutcliffe) method for determining optimal number of groups. Alternatively, can take user-specified number of groups). Requires 'cluster' and 'maptree' packages. 34 | } 35 | \examples{ 36 | 37 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 38 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 39 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 40 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model = 'monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2) 41 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 42 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 43 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 44 | my_EMD_mat <- compareSamples(my_phemdObj_final) 45 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /man/embedCells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-core.R 3 | \name{embedCells} 4 | \alias{embedCells} 5 | \title{Generate cell-state embedding} 6 | \usage{ 7 | embedCells( 8 | obj, 9 | cell_model = c("monocle2", "seurat", "phate"), 10 | data_model = "negbinomial_sz", 11 | phate_ncluster = 8, 12 | phate_cluster_seed = NULL, 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{obj}{'Phemd' object containing aggregated data} 18 | 19 | \item{cell_model}{Method to use to generate cell-state embedding. Currently supports "phate" and "monocle2". If using the Seurat to model the cell-state space, please identify cell subtypes as outlined in the Seurat software package and then use the \code{bindSeuratObj} function.} 20 | 21 | \item{data_model}{Only relevant if cell_model = "monocle2". One of the following: 'negbinomial_sz', 'negbinomial', 'tobit', 'uninormal', 'gaussianff'. See "Family Function" table at the following link for more details on selecting the proper one. \url{http://cole-trapnell-lab.github.io/monocle-release/docs/#getting-started-with-monocle}} 22 | 23 | \item{phate_ncluster}{Only relevant if cell_model = "phate". Number of cell state clusters to return when using PHATE} 24 | 25 | \item{phate_cluster_seed}{Only relevant if cell_model = "phate". Seed to use when performing cell state clustering (optional)} 26 | 27 | \item{...}{Additional parameters to be passed to \code{reduceDimension} function for Monocle or \code{phate} function for PHATE} 28 | } 29 | \value{ 30 | Same as input 'Phemd' object containing additional cell-state embedding object 31 | } 32 | \description{ 33 | Takes as input a Phemd object with aggregated data and returns updated object containing cell-state embedding 34 | } 35 | \details{ 36 | \code{aggregateSamples} needs to be called before running this function. 37 | } 38 | \examples{ 39 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 40 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 41 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 42 | my_phemdObj_lg <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2) 43 | } 44 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportClasses(Phemd) 2 | export(retrieveRefClusters) 3 | export(createDataObj) 4 | export(bindSeuratObj) 5 | export(removeTinySamples) 6 | export(aggregateSamples) 7 | export(selectFeatures) 8 | export(embedCells) 9 | export(orderCellsMonocle) 10 | export(plotEmbeddings) 11 | export(plotHeatmaps) 12 | export(clusterIndividualSamples) 13 | export(generateGDM) 14 | export(compareSamples) 15 | export(groupSamples) 16 | export(printClusterAssignments) 17 | export(plotGroupedSamplesDmap) 18 | export(getSampleHistsByCluster) 19 | export(plotSummaryHistograms) 20 | export(plotCellYield) 21 | export(getCellYield) 22 | export(drawColnames45) 23 | export(getSampleCelltypeFreqs) 24 | export(rawExpn) 25 | export(monocleInfo) 26 | export(seuratInfo) 27 | export(phateInfo) 28 | export(GDM) 29 | export(selectMarkers) 30 | export(sNames) 31 | export(pooledCells) 32 | export(subsampledIdx) 33 | export(subsampledBool) 34 | export(celltypeFreqs) 35 | export(batchIDs) 36 | export("rawExpn<-") 37 | export("selectMarkers<-") 38 | export("pooledCells<-") 39 | export("subsampledIdx<-") 40 | export("subsampledBool<-") 41 | export("monocleInfo<-") 42 | export("seuratInfo<-") 43 | export("phateInfo<-") 44 | export("celltypeFreqs<-") 45 | export("batchIDs<-") 46 | export("GDM<-") 47 | 48 | importFrom(pracma, squareform, isempty, strcat) 49 | import(reticulate) 50 | import(monocle) 51 | import(RColorBrewer) 52 | import(igraph) 53 | import(transport) 54 | import(cluster) 55 | import(Rtsne) 56 | import(destiny) 57 | import(ggplot2) 58 | import(RANN) 59 | import(scatterplot3d) 60 | import(maptree) 61 | import(pheatmap) 62 | import(Seurat) 63 | import(VGAM) 64 | import(methods) 65 | import(SingleCellExperiment) 66 | import(Biobase) 67 | import(phateR) 68 | importFrom("grDevices", "colorRampPalette", "dev.off", "palette", "png", 69 | "rainbow") 70 | importFrom("graphics", "barplot", "par", "text", "title") 71 | importFrom("stats", "as.dist", "cov", "cutree", "dist", "hclust", 72 | "mahalanobis") 73 | importFrom("utils", "assignInNamespace", "write.table") 74 | importFrom("cowplot", "plot_grid") 75 | importFrom("S4Vectors", "isEmpty") 76 | importFrom("BiocGenerics", "estimateSizeFactors", "estimateDispersions") 77 | importFrom("SummarizedExperiment", "assay", "assays") 78 | importFrom("utils", "packageVersion") -------------------------------------------------------------------------------- /man/plotGroupedSamplesDmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions-plotting.R 3 | \name{plotGroupedSamplesDmap} 4 | \alias{plotGroupedSamplesDmap} 5 | \title{Plot diffusion map embedding of samples based on distance matrix} 6 | \usage{ 7 | plotGroupedSamplesDmap( 8 | my_distmat, 9 | cluster_assignments = NULL, 10 | pt_sz = 1, 11 | n_dim = 3, 12 | pt_label = NULL, 13 | cmap = NULL, 14 | w = 8, 15 | h = 5, 16 | scale.y = 1, 17 | angle = 40, 18 | autosave = FALSE, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{my_distmat}{phemdObj object containing sample names in @snames slot} 24 | 25 | \item{cluster_assignments}{Vector containing group assignments for each sample} 26 | 27 | \item{pt_sz}{Size of points representing samples in plot (scaling factor)} 28 | 29 | \item{n_dim}{Number of dimensions for embedding (either 2 or 3)} 30 | 31 | \item{pt_label}{Vector of sample names corresponding to each point (same order as samples in \code{my_distmat} and \code{cluster_assignments})} 32 | 33 | \item{cmap}{Vector containing colors by which points should be colored (corresponding to cluster_assignments)} 34 | 35 | \item{w}{Width of plot in inches} 36 | 37 | \item{h}{Height of plot in inches} 38 | 39 | \item{scale.y}{Scaling factor for diffusion map y-axis} 40 | 41 | \item{angle}{Rotation factor for diffusion map plot} 42 | 43 | \item{autosave}{Boolean denoting whether or not to save output diffusion map} 44 | 45 | \item{...}{Additional parameters to be passed to \code{DiffusionMap} function} 46 | } 47 | \value{ 48 | DiffusionMap object containing biological sample embedding and associated metadata 49 | } 50 | \description{ 51 | Visualizes diffusion map for network of samples based on square distance matrix (sample-sample pairwise dissimilarity) 52 | } 53 | \details{ 54 | Requires 'destiny' package 55 | } 56 | \examples{ 57 | 58 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 59 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 60 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 61 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 62 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 63 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 64 | my_phemdObj_final <- generateGDM(my_phemdObj_final) 65 | my_EMD_mat <- compareSamples(my_phemdObj_final) 66 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 67 | printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE) 68 | dm <- plotGroupedSamplesDmap(my_EMD_mat, cluster_assignments, pt_sz=2) 69 | 70 | } 71 | -------------------------------------------------------------------------------- /man/Phemd-methods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Phemd-methods.R 3 | \docType{methods} 4 | \name{Phemd-methods} 5 | \alias{Phemd-methods} 6 | \alias{selectMarkers<-} 7 | \alias{selectMarkers<-,Phemd-method} 8 | \alias{Phemd,ANY,ANY-method} 9 | \alias{rawExpn<-} 10 | \alias{Phemd,character,ANY-method} 11 | \alias{rawExpn<-,Phemd-method} 12 | \alias{pooledCells<-} 13 | \alias{pooledCells<-,Phemd-method} 14 | \alias{subsampledIdx<-} 15 | \alias{subsampledIdx<-,Phemd-method} 16 | \alias{subsampledBool<-} 17 | \alias{subsampledBool<-,Phemd-method} 18 | \alias{monocleInfo<-} 19 | \alias{monocleInfo<-,Phemd-method} 20 | \alias{seuratInfo<-} 21 | \alias{seuratInfo<-,Phemd-method} 22 | \alias{phateInfo<-} 23 | \alias{phateInfo<-,Phemd-method} 24 | \alias{celltypeFreqs<-} 25 | \alias{celltypeFreqs<-,Phemd-method} 26 | \alias{batchIDs<-} 27 | \alias{batchIDs<-,Phemd-method} 28 | \alias{GDM<-} 29 | \alias{GDM<-,Phemd-method} 30 | \title{Setter function for protein / gene markers} 31 | \usage{ 32 | selectMarkers(obj) <- value 33 | 34 | \S4method{selectMarkers}{Phemd}(obj) <- value 35 | 36 | rawExpn(obj) <- value 37 | 38 | \S4method{rawExpn}{Phemd}(obj) <- value 39 | 40 | pooledCells(obj) <- value 41 | 42 | \S4method{pooledCells}{Phemd}(obj) <- value 43 | 44 | subsampledIdx(obj) <- value 45 | 46 | \S4method{subsampledIdx}{Phemd}(obj) <- value 47 | 48 | subsampledBool(obj) <- value 49 | 50 | \S4method{subsampledBool}{Phemd}(obj) <- value 51 | 52 | monocleInfo(obj) <- value 53 | 54 | \S4method{monocleInfo}{Phemd}(obj) <- value 55 | 56 | seuratInfo(obj) <- value 57 | 58 | \S4method{seuratInfo}{Phemd}(obj) <- value 59 | 60 | phateInfo(obj) <- value 61 | 62 | \S4method{phateInfo}{Phemd}(obj) <- value 63 | 64 | celltypeFreqs(obj) <- value 65 | 66 | \S4method{celltypeFreqs}{Phemd}(obj) <- value 67 | 68 | batchIDs(obj) <- value 69 | 70 | \S4method{batchIDs}{Phemd}(obj) <- value 71 | 72 | GDM(obj) <- value 73 | 74 | \S4method{GDM}{Phemd}(obj) <- value 75 | } 76 | \arguments{ 77 | \item{obj}{A Phemd object} 78 | 79 | \item{value}{Assignment object} 80 | } 81 | \value{ 82 | Updated Phemd object 83 | 84 | Updated Phemd object 85 | 86 | Updated Phemd object 87 | 88 | Updated Phemd object 89 | 90 | Updated Phemd object 91 | 92 | Updated Phemd object containing Seurat object 93 | 94 | Updated Phemd object containing phate object 95 | 96 | Updated Phemd object 97 | 98 | Updated Phemd object 99 | 100 | Updated Phemd object 101 | } 102 | \description{ 103 | Setter function for protein / gene markers 104 | 105 | Setter function for stored expression data 106 | 107 | Setter function for single-cell expression data aggregated from multiple samples 108 | 109 | Setter function for indices of cells subsampled from each sample during aggregation 110 | 111 | Setter function for boolean denoting whether cells were subsampled from each sample during aggregation 112 | 113 | Setter function for Monocle2 CellDataSet object for experiment 114 | 115 | Setter function for Seurat object for experiment 116 | 117 | Setter function for phate object for experiment 118 | 119 | Setter function for cell subtype frequencies of each single-cell sample 120 | 121 | Setter function for batch IDs of each single-cell sample 122 | 123 | Setter function for EMD ground distance matrix 124 | } 125 | \examples{ 126 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 127 | new_genes <- all_genes 128 | new_genes[1] <- 'IL2R' 129 | selectMarkers(phemdObj) <- new_genes 130 | 131 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 132 | new_expn_data <- all_expn_data 133 | new_expn_data <- lapply(new_expn_data, function(x) {log2(x+1)}) 134 | rawExpn(phemdObj) <- new_expn_data 135 | 136 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 137 | aggregated_data <- t(do.call(rbind,all_expn_data)) 138 | pooledCells(phemdObj) <- aggregated_data 139 | 140 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 141 | subsampledIdxList<- rep(list(1:10), length(all_expn_data)) #subsampled cells 1-10 from each sample 142 | subsampledIdx(phemdObj) <- subsampledIdxList 143 | 144 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 145 | subsampledBool(phemdObj) <- TRUE 146 | 147 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 148 | mydata <- pooledCells(phemdObj) 149 | myCellDataSet <- newCellDataSet(mydata,phenoData=NULL, expressionFamily=VGAM::negbinomial.size()) 150 | monocleInfo(phemdObj) <- myCellDataSet 151 | 152 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 153 | my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A") 154 | seuratInfo(phemdObj) <- my_seuratObj 155 | 156 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 157 | #my_phateObj <- phateR::phate(all_expn_data[[1]]) 158 | phateInfo(phemdObj) <- list() 159 | 160 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 161 | myCellTypeFreqs <- matrix(rexp(length(all_expn_data)*10, rate=.1), ncol=10) 162 | myCellTypeFreqs <- apply(myCellTypeFreqs, 1, function(x) {x / sum(x)}) 163 | celltypeFreqs(phemdObj) <- myCellTypeFreqs 164 | 165 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 166 | my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A") 167 | seuratInfo(phemdObj) <- my_seuratObj 168 | batchIDs(phemdObj) <- rep('A', length(all_expn_data)) 169 | 170 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 171 | cluster_locs <- 1:10 172 | myGDM <- as.matrix(dist(cluster_locs)) 173 | GDM(phemdObj) <- myGDM 174 | 175 | } 176 | -------------------------------------------------------------------------------- /R/functions-getResults.R: -------------------------------------------------------------------------------- 1 | #' @title Writes samples to file based on community detection group assignments 2 | #' @description Takes vector of cluster assignments and phemdObj containing sample names and writes sample groups to file 3 | #' @details Order of samples in obj@@snames is assumed to be the same as the order of group assignments in cluster_assignments 4 | #' @param cluster_assignments Vector containing group assignments for each sample 5 | #' @param obj phemdObj object containing sample names in @@snames slot 6 | #' @param dest Path to existing directory where output should be saved 7 | #' @param overwrite Boolean representing whether or not to overwrite contents of "dest" with output of printClusterAssignments 8 | #' @return None 9 | #' @examples 10 | #' 11 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 12 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 13 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 14 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 15 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 16 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 17 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 18 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 19 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 20 | #' printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE) 21 | #' 22 | printClusterAssignments <- function(cluster_assignments, obj, dest, overwrite=FALSE) { 23 | snames <- sNames(obj) 24 | if(dir.exists(paste(dest, 'sample_groups', sep='')) && overwrite==FALSE) { 25 | stop('Directory "sample_groups" already exists in specified path. Set "overwrite" parameter to TRUE if you want to overwrite existing directory') 26 | } 27 | unlink(paste(dest, 'sample_groups', sep=''), recursive=TRUE) 28 | dir.create(file.path(paste(dest, 'sample_groups', sep='')), showWarnings = FALSE) # create folder for output 29 | for(i in seq_len(max(cluster_assignments))) { 30 | cur_cluster_idx <- which(cluster_assignments == i) 31 | cur_file <- sprintf('sample_groups/scluster_%s.txt', intToUtf8(64+i)) 32 | cur_file <- strcat(dest, cur_file) 33 | write(snames[cur_cluster_idx], file=cur_file, sep="\n") 34 | } 35 | } 36 | 37 | #' @title Gets cell subtype frequency histograms for each sample by cluster ID 38 | #' @description Gets relative frequency ("weights") of cell subtypes ("bins" or "signatures") in each single-cell sample 39 | #' @details \code{groupSamples} must be called before calling this function. Saves plots in directory called "individual_inhibs" 40 | #' @param myobj phemdObj object containing cell subtype relative frequency in @@data_cluster_weights slot 41 | #' @param cluster_assignments Vector containing group assignments for each sample in myobj 42 | #' @param cell_model Method by which cell state was modeled (either "monocle2" or "seurat") 43 | #' @return List of lists, with outer list representing sample cluster ID and inner list representing cell subtype frequencies of given sample 44 | #' @examples 45 | #' 46 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 47 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 48 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 49 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 50 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 51 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 52 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 53 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 54 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 55 | #' weights_by_cluster <- getSampleHistsByCluster(my_phemdObj_final, cluster_assignments) 56 | #' 57 | getSampleHistsByCluster <- function(myobj, cluster_assignments, cell_model=c('monocle2', 'seurat')) { 58 | cell_model <- match.arg(cell_model, c('monocle2','seurat')) 59 | if(cell_model == 'monocle2') { 60 | monocle_obj <- monocleInfo(myobj) 61 | labels <- pData(phenoData(monocle_obj)) 62 | state_labels <- as.numeric(labels$State) 63 | } else if(cell_model == 'seurat') { 64 | seurat_obj <- seuratInfo(myobj) 65 | state_labels <- as.numeric(as.character(Idents(seurat_obj))) 66 | } else { 67 | stop('Error: cell_model must either be "monocle2" or "seurat"') 68 | } 69 | 70 | cluster_weights <- celltypeFreqs(myobj) 71 | snames <- sNames(myobj) 72 | 73 | weights_by_cluster <- list() 74 | for(i in seq_len(max(cluster_assignments))) { 75 | cur_inhibs <- which(cluster_assignments == i) 76 | for(j in seq_len(length(cur_inhibs))) { 77 | cur_idx <- cur_inhibs[j] 78 | cur_sname <- snames[cur_idx] 79 | if(is.null(weights_by_cluster[[intToUtf8(64+i)]])) { 80 | weights_by_cluster[[intToUtf8(64+i)]] <- list() 81 | } 82 | weights_by_cluster[[intToUtf8(64+i)]][[cur_sname]] <- cluster_weights[cur_idx,] 83 | } 84 | } 85 | return(weights_by_cluster) 86 | } 87 | 88 | 89 | #' @title Gets cell yield of each sample as a table 90 | #' @description Gets cell yield (number of viable cells) of each single-cell sample in decreasing order 91 | #' @param myobj phemdObj object containing expression data for each sample in 'data' slot 92 | #' @param cluster_assignments Vector of cluster assignments to be included as additional column in output table (optional) 93 | #' @return Data frame representing cell yield of each sample 94 | #' @examples 95 | #' 96 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 97 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 98 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 99 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 100 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 101 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 102 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 103 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 104 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 105 | #' getCellYield(my_phemdObj_final, cluster_assignments) 106 | #' 107 | getCellYield <- function(myobj, cluster_assignments=NULL) { 108 | nsample <- length(rawExpn(myobj)) 109 | cell_yield <- vapply(rawExpn(myobj), nrow, integer(1L)) 110 | 111 | order_idx <- order(cell_yield, decreasing=FALSE) 112 | cell_yield_ordered <- cell_yield[order_idx] 113 | snames_ordered <- sNames(myobj)[order_idx] 114 | cell_yield_tab <- cbind.data.frame(snames_ordered, cell_yield_ordered) 115 | colnames(cell_yield_tab) <- c('sample_ID', 'cell_yield') 116 | if(!is.null(cluster_assignments)) { 117 | cluster_assignments_reordered <- cluster_assignments[order_idx] 118 | cell_yield_tab$cluster_ID <- vapply(cluster_assignments_reordered, function(x) intToUtf8(64+x), '') 119 | } 120 | return(cell_yield_tab) 121 | } 122 | 123 | #' @title Returns cell subtype distribution for each sample as a table 124 | #' @description Returns cell subtype distribution for each single-cell sample along with (optional) final inhibitor cluster assignment 125 | #' @param myobj phemdObj object containing expression data for each sample in 'data' slot 126 | #' @param cluster_assignments Vector of cluster assignments to be included as additional column in output table (optional) 127 | #' @return Data frame representing relative frequencies of each cell subtype along with (optional) final inhibitor cluster assignment for each single-cell sample 128 | #' @examples 129 | #' 130 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 131 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 132 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 133 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 134 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 135 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 136 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 137 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 138 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 139 | #' getSampleCelltypeFreqs(my_phemdObj_final, cluster_assignments) 140 | #' 141 | getSampleCelltypeFreqs <- function(myobj, cluster_assignments=NULL) { 142 | celltype_freqs <- as.data.frame(celltypeFreqs(myobj)) 143 | celltype_freqs <- round(celltype_freqs, digits=3) 144 | colnames(celltype_freqs) <- paste('C-', seq_len(ncol(celltype_freqs)), sep='') 145 | 146 | if(!is.null(cluster_assignments)) { 147 | celltype_freqs$Sample.Cluster.ID <- vapply(cluster_assignments, function(x) intToUtf8(64+x), '') 148 | } 149 | 150 | weightsTab <- cbind.data.frame(Sample.Name=sNames(myobj), celltype_freqs) 151 | return(weightsTab) 152 | } 153 | -------------------------------------------------------------------------------- /vignettes/phemd.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tutorial on using PhEMD to analyze multi-sample single-cell experiments" 3 | author: "William Chen" 4 | date: "October 20, 2018" 5 | output: BiocStyle::html_document 6 | 7 | vignette: > 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteIndexEntry{PhEMD vignette} 10 | \usepackage[UTF-8]{inputenc} 11 | --- 12 | 13 | ```{r setup, include=FALSE} 14 | knitr::opts_chunk$set(echo = TRUE) 15 | ``` 16 | 17 | 18 | ## Overview 19 | 20 | PhEMD is a package for comparing multiple heterogeneous single-cell samples to 21 | one another. It currently does so by first defining cell subtypes and relating 22 | them to one another using Monocle 2. It then computes Earth Mover's Distance 23 | (EMD) between each pair of samples, incorporating information about intrinsic 24 | cell subtype dissimilarity (i.e. manifold distance between cell subtypes) and 25 | differences between samples with respect to relative abundance of each cell 26 | subtype. PhEMD uses these pairwise distances to construct a network of 27 | single-cell samples that may be visually related in 2D or 3D using a diffusion 28 | map and partitioned to identify groups of similar samples. 29 | 30 | ## 1. Installation 31 | 32 | PhEMD requires R version >= 3.4.0 (recommended 3.5.0+), Bioconductor 33 | version >= 3.5 (recommended 3.7+), and Monocle 2 version >= 2.4.0 34 | (recommended 2.8.0) 35 | 36 | ###Install from Bioconductor 37 | 38 | ```{r echo=T, results = 'hide',message=F, warning=F, eval=F} 39 | BiocManager::install("phemd") 40 | ``` 41 | 42 | ###Install from Github (though direct installation from Bioconductor is preferred) 43 | 44 | ```{r echo=T, results = 'hide',message=F, warning=F, eval=F} 45 | library(devtools) 46 | install_github("wschen/phemd") 47 | ``` 48 | 49 | ###Load library after installation 50 | 51 | ```{r echo=T, results = 'hide', message=F, warning=F} 52 | library('phemd') 53 | library('monocle') 54 | ``` 55 | 56 | ## 2. Preparing data for cell state definition and embedding 57 | 58 | PhEMD expects single-cell data to be represented as an R list of samples. Each 59 | sample (i.e. list element) is expected to be a matrix of dimension *num_cells* 60 | x *num_markers*, where markers may represent genes or cytometry protein markers. 61 | For this vignette, we will be demonstrating our analysis pipeline on a melanoma 62 | dataset consisting of tumor-infiltrating immune cell scRNA-seq data (selected 63 | genes) that were log-transformed following TPM-normalization (first published 64 | by Tirosh et al., 2016). 65 | 66 | We first start by creating a PhEMD data object, specifying the multi-sample 67 | expression data (R list), marker names (i.e. column names of the data matrices 68 | in the list of expression data), and sample names (in the order they appear in 69 | the list of expression data). 70 | 71 | ```{r echo=T, results = 'hide', message=F, warning=F} 72 | load('melanomaData.RData') 73 | myobj <- createDataObj(all_expn_data, all_genes, as.character(snames)) 74 | ``` 75 | 76 | We can optionally remove samples in the PhEMD data object that have fewer than 77 | min_sz number of cells as follows: 78 | 79 | ```{r echo=T} 80 | myobj <- removeTinySamples(myobj, min_sz = 20) 81 | ``` 82 | 83 | Note that samples that don't meet the meet the minimum cell yield criteria are 84 | removed from rawExpn(myobj) and from the list of sample names in 85 | sampleNames(myobj). 86 | 87 | Next, aggregate data from all samples into a matrix that is stored in the PhEMD 88 | data object (in slot 'data_aggregate'). This aggregated data will then be used 89 | for initial cell subtype definition and embedding. If there are more cells 90 | collectively in all samples than max_cells, an equal number of cells from each 91 | sample will be subsampled and stored in pooledCells(myobj). 92 | 93 | ```{r echo=T, results = 'hide'} 94 | myobj <- aggregateSamples(myobj, max_cells=12000) 95 | ``` 96 | 97 | ## 3. Generate Monocle 2 cell embedding with cell state definitions 98 | 99 | Now that we have aggregated single-cell data from all samples, we are ready to 100 | perform cell subtype definition and dimensionality reduction to visually and 101 | spatially relate cells and cell subtypes. For this, we use Monocle 2. Before we 102 | begin, we first perform feature selection by selecting 44 important genes. 103 | Suggestions on how to choose important genes can be found here: http://cole-trapnell-lab.github.io/monocle-release/docs/#trajectory-step-1-choose-genes-that-define-a-cell-s-progress 104 | 105 | ```{r echo=T, results = 'hide'} 106 | myobj <- selectFeatures(myobj, selected_genes) 107 | ``` 108 | 109 | We are now ready to generate a Monocle 2 embedding. Our *embedCells()* function 110 | is a wrapper function for the *reduceDimension()* function in Monocle 2. For our 111 | example dataset, we specify the expression distribution model as *'gaussianff'* 112 | as is recommended in the Monocle 2 tutorial for log-transformed scRNA-seq TPM 113 | values (http://cole-trapnell-lab.github.io/monocle-release/docs/#choosing-a-distribution-for-your-data-required). 114 | *'negbinomial_sz'* is the recommended data type for most unnormalized scRNA-seq 115 | data (raw read counts) and *'gaussianff'* is recommended for log-transformed 116 | data or arcsin-transformed mass cytometry data. See above link for more details. 117 | 118 | Additional parameters may be passed to Monocle 2 *reduceDimension()* as optional 119 | named parameters in *embed_cells()*. We found that Monocle 2 is robust to a 120 | range of parameters. Sigma can be thought of as a "noise" parameter and we 121 | empirically found that sigma in the range of [0.01, 0.1] often works well for 122 | log-transformed scRNA-seq data or normalized CyTOF data. Greater values of sigma 123 | generally result in fewer total number of clusters. See Monocle 2 publication 124 | (Qiu et al., 2017) for additional details on parameter selection. 125 | 126 | ```{r echo=T, results = 'hide', message=F, warning=F} 127 | # generate 2D cell embedding and cell subtype assignments 128 | myobj <- embedCells(myobj, data_model = 'gaussianff', pseudo_expr=0, sigma=0.02) 129 | # generate pseudotime ordering of cells 130 | myobj <- orderCellsMonocle(myobj) 131 | ``` 132 | 133 | The result of the code above is a Monocle 2 object stored in 134 | *monocleInfo(myobj)*. This object contains cell subtype and pseudotime 135 | assignments for each cell in the aggregated data matrix (stored in 136 | *pooledCells(myobj)*). A 2D embedding of these cells has also been generated. 137 | We can visualize the embedding by writing them to file in this way: 138 | 139 | ```{r echo=T, results = 'hide', fig.width=8, fig.height=4} 140 | cmap <- plotEmbeddings(myobj, cell_model='monocle2') 141 | ``` 142 | 143 | To visualize the expression profiles of the cell subtypes, we can plot a heatmap 144 | and save to file as such: 145 | 146 | ```{r echo=T, results = 'hide', fig.width=8, fig.height=6} 147 | plotHeatmaps(myobj, cell_model='monocle2', selected_genes=heatmap_genes) 148 | ``` 149 | 150 | ## 4. Deconvolute single-cell samples and compare using Earth Mover's Distance 151 | 152 | Now that we have identified a comprehensive set of cell subtypes across all 153 | single-cell samples and related them in a low-dimensional embedding by 154 | aggregating cells from all samples, we want to perform deconvolution to 155 | determine the abundance of each cell subtype on a per sample basis. To do so, 156 | we call this function: 157 | 158 | ```{r echo=T, results = 'hide'} 159 | # Determine cell subtype breakdown of each sample 160 | myobj <- clusterIndividualSamples(myobj) 161 | ``` 162 | 163 | The results of this process are stored in *celltypeFreqs(myobj)*. Row 164 | *i* column *j* represents the fraction of all cells in sample *i* assigned to 165 | cell subtype *j*. 166 | 167 | To compare single-cell samples, we use Earth Mover's Distance, which is a metric 168 | that takes into account both the difference in relative frequencies of matching 169 | cell subtypes (e.g. % of all cells in each sample that are CD8+ T-cells) and the 170 | dissimilarity of the cell subtypes themselves (e.g. intrinsic dissimilarity 171 | between CD8+ and CD4+ T-cells). To compute the intrinsic dissimilarity between 172 | cell subtypes, we call the following function: 173 | 174 | ```{r echo=T, results = 'hide'} 175 | # Determine (dis)similarity of different cell subtypes 176 | myobj <- generateGDM(myobj) 177 | ``` 178 | 179 | *generateGDM()* stores the pairwise dissimilarity (i.e. "ground-distance" or 180 | "tree-distance") between cell subtypes in *GDM(myobj)*. 181 | 182 | We are now ready to compare single-cell samples using EMD. To do so, we simply 183 | call the function *compareSamples()*: 184 | 185 | ```{r echo=T, results = 'hide'} 186 | # Perform inter-sample comparisons using EMD 187 | my_distmat <- compareSamples(myobj) 188 | ``` 189 | 190 | *compareSamples()* returns a distance matrix representing the pairwise EMD 191 | between single-cell samples; *my_distmat[i,j]* represents the dissimilarity 192 | between samples *i* and *j* (i.e. samples represented by rows *i* and *j* in 193 | celltypeFreqs(myobj)). We can use this distance matrix to identify similar 194 | groups of samples as such: 195 | 196 | ```{r echo=T, results = 'hide'} 197 | ## Identify similar groups of inhibitors 198 | group_assignments <- groupSamples(my_distmat, distfun = 'hclust', ncluster=5) 199 | ``` 200 | 201 | ## 5. Visualize single-cell samples based on PhEMD-based similarity 202 | 203 | We can also use the PhEMD-based distance matrix to generate an embedding of 204 | single-cell samples, colored by group assignments. 205 | 206 | ```{r echo=T, results = 'hide', fig.width=8, fig.height=7, fig.keep='none'} 207 | dmap_obj <- plotGroupedSamplesDmap(my_distmat, group_assignments, pt_sz = 1.5) 208 | ``` 209 | 210 | ```{r echo=F, results = 'hide', fig.keep='none'} 211 | plotGroupedSamplesDmap(my_distmat, group_assignments, pt_sz = 1.5) 212 | ``` 213 | 214 | ```{r echo=F, results = 'hide', fig.width=8, fig.height=7} 215 | plotGroupedSamplesDmap(my_distmat, group_assignments, pt_sz = 1.5) 216 | ``` 217 | 218 | To retrieve the cell subtype distribution on a per-sample basis, use the 219 | following function. Histograms can be subsequently plotted for a given sample 220 | as desired. 221 | 222 | ```{r echo=T, results = 'hide'} 223 | # Plot cell subtype distribution (histogram) for each sample 224 | sample.cellfreqs <- getSampleHistsByCluster(myobj, group_assignments, cell_model='monocle2') 225 | ``` 226 | 227 | To plot cell subtype histograms summarizing groups of similar samples (bin-wise 228 | mean of each cell subtype across all samples assigned to a particular group), 229 | use the following plotting function: 230 | 231 | ```{r echo=T, results = 'hide', fig.width=10, fig.height=2.5} 232 | # Plot representative cell subtype distribution for each group of samples 233 | plotSummaryHistograms(myobj, group_assignments, cell_model='monocle2', cmap, 234 | ncol.plot = 5, ax.lab.sz=1.3, title.sz=2) 235 | ``` 236 | 237 | To plot cell yield of each samples as a barplot, use the following function: 238 | 239 | ```{r echo=T, results='hide', fig.width=8, fig.height=5.5} 240 | # Plot cell yield of each experimental condition 241 | plotCellYield(myobj, group_assignments, font_sz = 0.7, w=8, h=5) 242 | ``` 243 | 244 | ```{r echo=T} 245 | sessionInfo() 246 | ``` 247 | -------------------------------------------------------------------------------- /R/Phemd-methods.R: -------------------------------------------------------------------------------- 1 | ################## 2 | # Class constructor 3 | ################# 4 | 5 | #' Phemd class 6 | #' 7 | #' The main PhEMD class to store single-cell expression data. 8 | #' @field data List of matrices, each of which represents a single-cell sample (num_cells x num_genes) 9 | #' @field markers Column names (e.g. genes) for each element (i.e. data matrix) in "data" 10 | #' @field snames Sample ID for each element in "data" 11 | #' @field data_aggregate Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells) 12 | #' @field data_subsample_idx List of vectors each representing the indices of elements in "data" that were subsampled and combined to form "data_aggregate" 13 | #' @field subsampled_bool Boolean represent whether or not subsampling was performed in the data aggregation process 14 | #' @field monocle_obj Data object of type "CellDataSet" that is the core Monocle data structure 15 | #' @field data_cluster_weights Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes) 16 | #' @field emd_dist_mat Matrix representing pairwise distances between each pair of cell subtypes 17 | #' @field seurat_obj Object of class "Seurat" that is the core Seurat data structure 18 | #' @field phate_obj Object of class "phate" that is the core PHATE data structure 19 | #' @field experiment_ids Vector of length num_samples representing the experiment (batch) in which the sample was profiled 20 | #' @name Phemd 21 | #' @rdname Phemd 22 | #' @aliases Phemd-class 23 | #' @exportClass Phemd 24 | #' @importClassesFrom Seurat Seurat seurat 25 | 26 | setClassUnion("CDSorNULL",members=c('CellDataSet', "NULL")) 27 | setClassUnion("SeuratorNULL",members=c('Seurat', "NULL")) 28 | setClass("Phemd", 29 | contains=c('CellDataSet', 'Seurat'), 30 | slots=c(data = "list", 31 | markers = "character", 32 | snames = "character", 33 | data_aggregate = "matrix", 34 | data_subsample_idx = "list", 35 | subsampled_bool = "logical", 36 | monocle_obj = "CDSorNULL", 37 | data_cluster_weights = "matrix", 38 | emd_dist_mat = "matrix", 39 | seurat_obj = "SeuratorNULL", 40 | phate_obj = "list", 41 | cellstate_assignments = "list", 42 | experiment_ids = "character", 43 | version='package_version')) 44 | 45 | ########################### 46 | # Methods for Phemd class 47 | ########################## 48 | #' @name Phemd-methods 49 | #' @docType methods 50 | #' @rdname Phemd-methods 51 | #' 52 | setValidity("Phemd", function(object) { 53 | if(length(rawExpn(object)) < 1) { 54 | return('Phemd object must have at least 1 sample in rawExpn(object)') 55 | } 56 | if(length(sNames(object)) != length(rawExpn(object))) { 57 | return('sNames(object) must be the same length as rawExpn(object)') 58 | } 59 | if(sum(dim(pooledCells(object))) == 0 && ncol(rawExpn(object)[[1]]) != length(selectMarkers(object))) { 60 | return('Number of markers measured in rawExpn(object) must equal number of markers listed in selectMarkers(object)') 61 | } 62 | if(sum(dim(pooledCells(object))) != 0 && nrow(pooledCells(object)) != length(selectMarkers(object))) { 63 | return('Number of markers measured in pooledCells(object) must equal number of markers listed in selectMarkers(object)') 64 | } 65 | return(TRUE) 66 | }) 67 | 68 | 69 | ################## 70 | # Accessor functions 71 | ################### 72 | 73 | #' Accessor function for stored multi-sample raw expression data 74 | #' 75 | #' @param obj A Phemd object. 76 | #' @return List of matrices, each of which represents a single-cell sample 77 | #' @export 78 | #' @examples 79 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 80 | #' raw_expn_data <- rawExpn(phemdObj) 81 | #' 82 | rawExpn <- function(obj) { 83 | stopifnot(is(obj,"Phemd")) 84 | obj@data 85 | } 86 | 87 | 88 | #' Accessor function for stored Monocle object 89 | #' 90 | #' @param obj A Phemd object. 91 | #' @return An object of class 'CellDataSet' (from Monocle) 92 | #' @export 93 | #' @examples 94 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 95 | #' monocle_obj <- monocleInfo(phemdObj) 96 | #' 97 | monocleInfo <- function(obj) { 98 | stopifnot(is(obj,"Phemd")) 99 | obj@monocle_obj 100 | } 101 | 102 | #' Accessor function for stored Seurat object within Phemd object 103 | #' 104 | #' @param obj A Phemd object. 105 | #' @return An object of class 'Seurat' 106 | #' @export 107 | #' @examples 108 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 109 | #' seurat_obj <- seuratInfo(phemdObj) 110 | #' 111 | seuratInfo <- function(obj) { 112 | stopifnot(is(obj,"Phemd")) 113 | obj@seurat_obj 114 | } 115 | 116 | #' Accessor function for stored phate object 117 | #' 118 | #' @param obj A Phemd object. 119 | #' @return An object of class 'phate' (from phateR) 120 | #' @export 121 | #' @examples 122 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 123 | #' phateobj <- phateInfo(phemdObj) 124 | #' 125 | phateInfo <- function(obj) { 126 | stopifnot(is(obj,"Phemd")) 127 | obj@phate_obj 128 | } 129 | 130 | #' Accessor function for EMD ground distance matrix 131 | #' 132 | #' @param obj A Phemd object 133 | #' @return Sqaure matrix representing pairwise distances between cell subtypes 134 | #' @export 135 | #' @examples 136 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 137 | #' gdm <- GDM(phemdObj) 138 | #' 139 | GDM <- function(obj) { 140 | stopifnot(is(obj,"Phemd")) 141 | obj@emd_dist_mat 142 | } 143 | 144 | #' Accessor function for gene/protein markers measured in experiment 145 | #' 146 | #' @param obj Phemd object 147 | #' @return Vector representing gene/protein markers corresponding to expression matrices 148 | #' @export 149 | #' @examples 150 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 151 | #' genes <- selectMarkers(phemdObj) 152 | #' 153 | selectMarkers <- function(obj) { 154 | stopifnot(is(obj,"Phemd")) 155 | obj@markers 156 | } 157 | 158 | #' Accessor function for identifiers of all single-cell samples in experiment 159 | #' 160 | #' @param obj Phemd object 161 | #' @return Vector representing sample names corresponding to expression matrices 162 | #' @export 163 | #' @examples 164 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 165 | #' sampleIDs <- sNames(phemdObj) 166 | #' 167 | sNames <- function(obj) { 168 | stopifnot(is(obj,"Phemd")) 169 | obj@snames 170 | } 171 | 172 | 173 | #' Accessor function for aggregated cells used for cell subtype definition 174 | #' 175 | #' @param obj Phemd object 176 | #' @return Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells) 177 | #' @export 178 | #' @examples 179 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 180 | #' cells_aggregated <- pooledCells(phemdObj) 181 | #' 182 | pooledCells <- function(obj) { 183 | stopifnot(is(obj,"Phemd")) 184 | obj@data_aggregate 185 | } 186 | 187 | #' Accessor function for aggregated cells used for cell subtype definition 188 | #' 189 | #' @param obj Phemd object 190 | #' @return List of vectors each representing the indices of elements in rawExpn(obj) that were subsampled and combined to form "data_aggregate" 191 | #' @export 192 | #' @examples 193 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 194 | #' subsampled_idx_list <- subsampledIdx(phemdObj) 195 | #' 196 | subsampledIdx <- function(obj) { 197 | stopifnot(is(obj,"Phemd")) 198 | obj@data_subsample_idx 199 | } 200 | 201 | #' Accessor function for whether or not cells were subsampled when aggregated for cell subtype analysis 202 | #' 203 | #' @param obj Phemd object 204 | #' @return Boolean represent whether or not subsampling was performed in the data aggregation process 205 | #' @export 206 | #' @examples 207 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 208 | #' subsampled <- subsampledBool(phemdObj) 209 | #' 210 | subsampledBool <- function(obj) { 211 | stopifnot(is(obj,"Phemd")) 212 | obj@subsampled_bool 213 | } 214 | 215 | #' Accessor function for cell subtype distribution for each sample 216 | #' 217 | #' @param obj Phemd object 218 | #' @return Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes) 219 | #' @export 220 | #' @examples 221 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 222 | #' celltype_weights <- celltypeFreqs(phemdObj) 223 | #' 224 | celltypeFreqs <- function(obj) { 225 | stopifnot(is(obj,"Phemd")) 226 | obj@data_cluster_weights 227 | } 228 | 229 | #' Accessor function for batch ID for each sample 230 | #' 231 | #' @param obj Phemd object 232 | #' @return Vector of length num_samples representing the experiment (batch) in which the sample was profiled 233 | #' @export 234 | #' @examples 235 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 236 | #' batch_metadata <- batchIDs(phemdObj) 237 | #' 238 | batchIDs <- function(obj) { 239 | stopifnot(is(obj,"Phemd")) 240 | obj@experiment_ids 241 | } 242 | 243 | ################## 244 | # Setter functions 245 | ################### 246 | 247 | 248 | #' Setter function for protein / gene markers 249 | #' 250 | #' @rdname Phemd-methods 251 | #' @docType methods 252 | #' @return Updated Phemd object 253 | #' @export 254 | #' @examples 255 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 256 | #' new_genes <- all_genes 257 | #' new_genes[1] <- 'IL2R' 258 | #' selectMarkers(phemdObj) <- new_genes 259 | #' 260 | setGeneric("selectMarkers<-", function(obj, value) standardGeneric("selectMarkers<-")) 261 | 262 | #' @rdname Phemd-methods 263 | #' @aliases Phemd,ANY,ANY-method 264 | setMethod("selectMarkers<-", "Phemd", function(obj, value) { 265 | obj@markers <- value 266 | validObject(obj) 267 | obj 268 | }) 269 | 270 | #' Setter function for stored expression data 271 | #' 272 | #' @rdname Phemd-methods 273 | #' @aliases Phemd,character,ANY-method 274 | #' @export 275 | #' @examples 276 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 277 | #' new_expn_data <- all_expn_data 278 | #' new_expn_data <- lapply(new_expn_data, function(x) {log2(x+1)}) 279 | #' rawExpn(phemdObj) <- new_expn_data 280 | #' 281 | setGeneric("rawExpn<-", function(obj, value) standardGeneric("rawExpn<-")) 282 | 283 | #' @rdname Phemd-methods 284 | #' @aliases Phemd,ANY,ANY-method 285 | setMethod("rawExpn<-", "Phemd", function(obj, value) { 286 | obj@data <- value 287 | validObject(obj) 288 | obj 289 | }) 290 | 291 | #' Setter function for single-cell expression data aggregated from multiple samples 292 | #' 293 | #' @rdname Phemd-methods 294 | #' @docType methods 295 | #' @return Updated Phemd object 296 | #' @export 297 | #' @examples 298 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 299 | #' aggregated_data <- t(do.call(rbind,all_expn_data)) 300 | #' pooledCells(phemdObj) <- aggregated_data 301 | #' 302 | setGeneric("pooledCells<-", function(obj, value) standardGeneric("pooledCells<-")) 303 | 304 | #' @rdname Phemd-methods 305 | #' @aliases Phemd,ANY,ANY-method 306 | setMethod("pooledCells<-", "Phemd", function(obj, value) { 307 | obj@data_aggregate <- value 308 | validObject(obj) 309 | obj 310 | }) 311 | 312 | #' Setter function for indices of cells subsampled from each sample during aggregation 313 | #' 314 | #' @rdname Phemd-methods 315 | #' @docType methods 316 | #' @return Updated Phemd object 317 | #' @export 318 | #' @examples 319 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 320 | #' subsampledIdxList<- rep(list(1:10), length(all_expn_data)) #subsampled cells 1-10 from each sample 321 | #' subsampledIdx(phemdObj) <- subsampledIdxList 322 | #' 323 | setGeneric("subsampledIdx<-", function(obj, value) standardGeneric("subsampledIdx<-")) 324 | 325 | #' @rdname Phemd-methods 326 | #' @aliases Phemd,ANY,ANY-method 327 | setMethod("subsampledIdx<-", "Phemd", function(obj, value) { 328 | obj@data_subsample_idx <- value 329 | validObject(obj) 330 | obj 331 | }) 332 | 333 | #' Setter function for boolean denoting whether cells were subsampled from each sample during aggregation 334 | #' 335 | #' @rdname Phemd-methods 336 | #' @docType methods 337 | #' @return Updated Phemd object 338 | #' @export 339 | #' @examples 340 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 341 | #' subsampledBool(phemdObj) <- TRUE 342 | #' 343 | setGeneric("subsampledBool<-", function(obj, value) standardGeneric("subsampledBool<-")) 344 | 345 | #' @rdname Phemd-methods 346 | #' @aliases Phemd,ANY,ANY-method 347 | setMethod("subsampledBool<-", "Phemd", function(obj, value) { 348 | obj@subsampled_bool <- value 349 | validObject(obj) 350 | obj 351 | }) 352 | 353 | #' Setter function for Monocle2 CellDataSet object for experiment 354 | #' 355 | #' @rdname Phemd-methods 356 | #' @param obj A Phemd object 357 | #' @param value Assignment object 358 | #' @return Updated Phemd object 359 | #' @export 360 | #' @examples 361 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 362 | #' mydata <- pooledCells(phemdObj) 363 | #' myCellDataSet <- newCellDataSet(mydata,phenoData=NULL, expressionFamily=VGAM::negbinomial.size()) 364 | #' monocleInfo(phemdObj) <- myCellDataSet 365 | #' 366 | setGeneric("monocleInfo<-", function(obj, value) standardGeneric("monocleInfo<-")) 367 | 368 | #' @rdname Phemd-methods 369 | #' @aliases Phemd,ANY,ANY-method 370 | setMethod("monocleInfo<-", "Phemd", function(obj, value) { 371 | obj@monocle_obj <- value 372 | validObject(obj) 373 | obj 374 | }) 375 | 376 | #' Setter function for Seurat object for experiment 377 | #' 378 | #' @rdname Phemd-methods 379 | #' @docType methods 380 | #' @return Updated Phemd object containing Seurat object 381 | #' @export 382 | #' @examples 383 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 384 | #' my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A") 385 | #' seuratInfo(phemdObj) <- my_seuratObj 386 | #' 387 | setGeneric("seuratInfo<-", function(obj, value) standardGeneric("seuratInfo<-")) 388 | 389 | #' @rdname Phemd-methods 390 | #' @aliases Phemd,ANY,ANY-method 391 | setMethod("seuratInfo<-", "Phemd", function(obj, value) { 392 | obj@seurat_obj <- value 393 | validObject(obj) 394 | obj 395 | }) 396 | 397 | 398 | #' Setter function for phate object for experiment 399 | #' 400 | #' @rdname Phemd-methods 401 | #' @docType methods 402 | #' @return Updated Phemd object containing phate object 403 | #' @export 404 | #' @examples 405 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 406 | #' #my_phateObj <- phateR::phate(all_expn_data[[1]]) 407 | #' phateInfo(phemdObj) <- list() 408 | #' 409 | setGeneric("phateInfo<-", function(obj, value) standardGeneric("phateInfo<-")) 410 | 411 | #' @rdname Phemd-methods 412 | #' @aliases Phemd,ANY,ANY-method 413 | setMethod("phateInfo<-", "Phemd", function(obj, value) { 414 | obj@phate_obj <- value 415 | validObject(obj) 416 | obj 417 | }) 418 | 419 | #' Setter function for cell subtype frequencies of each single-cell sample 420 | #' 421 | #' @rdname Phemd-methods 422 | #' @docType methods 423 | #' @return Updated Phemd object 424 | #' @export 425 | #' @examples 426 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 427 | #' myCellTypeFreqs <- matrix(rexp(length(all_expn_data)*10, rate=.1), ncol=10) 428 | #' myCellTypeFreqs <- apply(myCellTypeFreqs, 1, function(x) {x / sum(x)}) 429 | #' celltypeFreqs(phemdObj) <- myCellTypeFreqs 430 | #' 431 | setGeneric("celltypeFreqs<-", function(obj, value) standardGeneric("celltypeFreqs<-")) 432 | 433 | #' @rdname Phemd-methods 434 | #' @aliases Phemd,ANY,ANY-method 435 | setMethod("celltypeFreqs<-", "Phemd", function(obj, value) { 436 | obj@data_cluster_weights <- value 437 | validObject(obj) 438 | obj 439 | }) 440 | 441 | #' Setter function for batch IDs of each single-cell sample 442 | #' 443 | #' @rdname Phemd-methods 444 | #' @docType methods 445 | #' @return Updated Phemd object 446 | #' @export 447 | #' @examples 448 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 449 | #' my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A") 450 | #' seuratInfo(phemdObj) <- my_seuratObj 451 | #' batchIDs(phemdObj) <- rep('A', length(all_expn_data)) 452 | #' 453 | setGeneric("batchIDs<-", function(obj, value) standardGeneric("batchIDs<-")) 454 | 455 | #' @rdname Phemd-methods 456 | #' @aliases Phemd,ANY,ANY-method 457 | setMethod("batchIDs<-", "Phemd", function(obj, value) { 458 | obj@experiment_ids <- value 459 | validObject(obj) 460 | obj 461 | }) 462 | 463 | #' Setter function for EMD ground distance matrix 464 | #' 465 | #' @rdname Phemd-methods 466 | #' @docType methods 467 | #' @return Updated Phemd object 468 | #' @export 469 | #' @examples 470 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 471 | #' cluster_locs <- 1:10 472 | #' myGDM <- as.matrix(dist(cluster_locs)) 473 | #' GDM(phemdObj) <- myGDM 474 | #' 475 | setGeneric("GDM<-", function(obj, value) standardGeneric("GDM<-")) 476 | 477 | #' @rdname Phemd-methods 478 | #' @aliases Phemd,ANY,ANY-method 479 | setMethod("GDM<-", "Phemd", function(obj, value) { 480 | obj@emd_dist_mat <- value 481 | validObject(obj) 482 | obj 483 | }) 484 | 485 | -------------------------------------------------------------------------------- /R/functions-plotting.R: -------------------------------------------------------------------------------- 1 | #' @title Plots Monocle2 cell embedding plots 2 | #' @description Takes as input a Phemd object containing either a Monocle2 object or Seurat object (already embedded and ordered) and plots cell embedding plots side by side. Optionally saves to specified folder. 3 | #' @details \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Required additional packages: 'RColorBrewer', 'cowplot' 4 | #' @param obj 'Phemd' object containing Monocle 2 object 5 | #' @param cell_model Method by which cell state was modeled (either "monocle2", "seurat", or "phate) 6 | #' @param cmap User-specified colormap to use to color cell state embedding (optional) 7 | #' @param w Width of plot in inches 8 | #' @param h Height of plot in inches 9 | #' @param pt_sz Scalar factor for point size 10 | #' @param ndims Number of dimensions to use for dimensionality reduction in case it hasn't been performed yet (only relevant when using Seurat data as input) 11 | #' @return Colormap (vector of colors) used to color Monocle2 cell state embedding 12 | #' @examples 13 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 14 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 15 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 16 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model='gaussianff', sigma=0.02, maxIter=2) 17 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 18 | #' cmap <- plotEmbeddings(my_phemdObj_monocle) 19 | plotEmbeddings <- function(obj, cell_model=c('monocle2', 'seurat', 'phate'), cmap=NULL, w=4, h=5, pt_sz=1, ndims=NULL) { 20 | saved_palette <- palette() 21 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 22 | if(cell_model == 'monocle2') { 23 | monocle_obj <- monocleInfo(obj) 24 | cell_embedding <- reducedDimS(monocle_obj) 25 | cell_embedding_t <- as.data.frame(t(cell_embedding)) 26 | mydata <- pooledCells(obj) 27 | 28 | # Extract state labels from monocle data object 29 | labels <- pData(phenoData(monocle_obj)) 30 | state_labels <- as.numeric(labels$State) 31 | 32 | levels <- levels(factor(state_labels)) 33 | levels_renamed <- vapply(levels, function(x) paste("C-", x, sep=""), "") 34 | 35 | if(is.null(cmap)) { 36 | getPalette <- colorRampPalette(brewer.pal(11, "Spectral")) 37 | cmap <- getPalette(max(state_labels)) 38 | if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey 39 | cmap <- sample(cmap) 40 | } 41 | palette(cmap) 42 | 43 | # visualize traj colored by state 44 | myplot_state <- ggplot(cell_embedding_t, aes(x=cell_embedding_t[,1], y=cell_embedding_t[,2], color=factor(state_labels))) + 45 | geom_point(size=0.4) + 46 | scale_color_manual(labels = levels_renamed, 47 | values = cmap) + 48 | guides(colour = guide_legend(override.aes = list(size=2))) + 49 | labs(x="", y = "", color = "Cell subtype") + 50 | theme_classic() + 51 | theme(axis.title.x=element_blank(), 52 | axis.text.x=element_blank(), 53 | axis.ticks.x=element_blank(), 54 | axis.title.y=element_blank(), 55 | axis.text.y=element_blank(), 56 | axis.ticks.y=element_blank(), 57 | axis.line = element_line(colour = "black", 58 | size = 1, linetype = "solid")) 59 | 60 | # visualize traj colored by pseudotime 61 | ncolor <- 9 62 | palette(brewer.pal(ncolor, "Blues")) 63 | 64 | col.labels <- labels$Pseudotime 65 | 66 | # visualize traj colored by pseudotime 67 | myplot_pt <- ggplot(cell_embedding_t, aes(x=cell_embedding_t[,1], y=cell_embedding_t[,2], color=col.labels)) + 68 | geom_point(size=0.4) + 69 | labs(x="", y = "", color = "Pseudotime") + 70 | theme_classic() + 71 | theme(axis.title.x=element_blank(), 72 | axis.text.x=element_blank(), 73 | axis.ticks.x=element_blank(), 74 | axis.title.y=element_blank(), 75 | axis.text.y=element_blank(), 76 | axis.ticks.y=element_blank(), 77 | axis.line = element_line(colour = "black", 78 | size = 1, linetype = "solid")) 79 | 80 | print(plot_grid(myplot_state, myplot_pt, ncol=2)) 81 | return(cmap) 82 | } else if(cell_model == 'seurat') { 83 | seurat_obj <- seuratInfo(obj) 84 | 85 | if(!'tsne' %in% names(seurat_obj@dr)) { 86 | print('Running t-SNE...') 87 | if(is.null(ndims)) ndims <- 10 88 | seurat_obj <- RunTSNE(seurat_obj, reduction.use="cca.aligned", dims.use=seq_len(ndims)) 89 | } 90 | 91 | # define color map 92 | if(is.null(cmap)) { 93 | getPalette <- colorRampPalette(brewer.pal(11, "Spectral")) 94 | cmap <- getPalette(max(as.numeric(seurat_obj@ident))) 95 | if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey 96 | cmap <- sample(cmap) 97 | } 98 | 99 | TSNEPlot(seurat_obj, do.label=FALSE, pt.size=pt_sz, colors.use=cmap) 100 | 101 | } else if(cell_model == 'phate') { 102 | # Extract PHATE embedding 103 | phate_obj <- phateInfo(obj) 104 | cell_embedding_t <- as.data.frame(phate_obj$embedding) 105 | state_labels <- phate_obj$cellstate.labels 106 | 107 | levels <- levels(factor(state_labels)) 108 | levels_renamed <- vapply(levels, function(x) paste("C-", x, sep=""), "") 109 | 110 | if(is.null(cmap)) { 111 | getPalette <- colorRampPalette(brewer.pal(11, "Spectral")) 112 | cmap <- getPalette(max(state_labels)) 113 | if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey 114 | cmap <- sample(cmap) 115 | } 116 | palette(cmap) 117 | 118 | # visualize traj colored by state 119 | myplot_state <- ggplot(cell_embedding_t, aes(x=cell_embedding_t[,1], y=cell_embedding_t[,2], color=factor(state_labels))) + 120 | geom_point(size=0.4) + 121 | scale_color_manual(labels = levels_renamed, 122 | values = cmap) + 123 | guides(colour = guide_legend(override.aes = list(size=2))) + 124 | labs(x="", y = "", color = "Cell subtype") + 125 | theme_classic() + 126 | theme(axis.title.x=element_blank(), 127 | axis.text.x=element_blank(), 128 | axis.ticks.x=element_blank(), 129 | axis.title.y=element_blank(), 130 | axis.text.y=element_blank(), 131 | axis.ticks.y=element_blank(), 132 | axis.line = element_line(colour = "black", 133 | size = 1, linetype = "solid")) 134 | print(myplot_state) 135 | } else { 136 | stop('Error: cell_model must be either "monocle2", "seurat", or "phate"') 137 | } 138 | palette(saved_palette) 139 | return(cmap) 140 | } 141 | 142 | #' @title Plot heatmap of cell subtypes 143 | #' @description Takes as input a Phemd object containing either a Monocle2, Seurat, or PHATE object (already embedded and clustered) and plots heatmap characterizing cell subtypes 144 | #' @details \code{embedCells} (and \code{orderCellsMonocle} if using Monocle2) need to be called before calling this function. Required additional package: 'pheatmap' 145 | #' @param obj 'Phemd' object containing cell-state embedding object 146 | #' @param cell_model Method by which cell state was modeled ("monocle2", "seurat", or "phate") 147 | #' @param selected_genes Vector containing gene names to include in heatmap (optional) 148 | #' @param w Width of plot in inches 149 | #' @param h Height of plot in inches 150 | #' @param ... Additional parameters to be passed on to pheatmap function 151 | #' @return Heatmap containing expression values for each cell subtype. If cell_model is 'seurat', then returns a list of heatmaps (1 for each batch) that may be subsequently plotted individually 152 | #' @examples 153 | #' 154 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 155 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 156 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 157 | #' my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes) 158 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', 159 | #' pseudo_expr=0, sigma=0.02, maxIter=2) 160 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 161 | #' myheatmap <- plotHeatmaps(my_phemdObj_monocle, cell_model='monocle2') 162 | #' 163 | plotHeatmaps <- function(obj, cell_model=c('monocle2','seurat', 'phate'), selected_genes=NULL, w=8, h=5, ...) { 164 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 165 | if(cell_model %in% c('monocle2', 'phate')) { 166 | # retrieve reference clusters 167 | ref_clusters <- retrieveRefClusters(obj, cell_model=cell_model, expn_type='raw') 168 | selected_clusters <- seq_len(length(ref_clusters)) 169 | myheatmap <- matrix(0, nrow=length(selected_clusters), ncol=ncol(ref_clusters[[1]])) 170 | for(i in selected_clusters) { 171 | cur_cluster <- ref_clusters[[i]] 172 | if(!is.null(cur_cluster)) { #at least 1 cell 173 | if(nrow(cur_cluster) > 1) { 174 | myheatmap[i,] <- colMeans(cur_cluster) 175 | } else { 176 | myheatmap[i,] <- cur_cluster #only 1 cell 177 | } 178 | } 179 | } 180 | 181 | selected_clusters_renamed <- vapply(names(ref_clusters), function(x) paste("C-", x, sep=""), "") 182 | 183 | rownames(myheatmap) <- selected_clusters_renamed 184 | colnames(myheatmap) <- selectMarkers(obj) 185 | 186 | if(!is.null(selected_genes)) { 187 | col_tokeep <- match(selected_genes, selectMarkers(obj)) 188 | if(sum(is.na(col_tokeep)) > 0) { 189 | genes_not_found <- '' 190 | missing_idx <- which(is.na(col_tokeep)) 191 | for(i in seq_len(length(missing_idx))) { 192 | if(i == 1) genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep='') 193 | else genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep=', ') 194 | } 195 | print(sprintf("Genes not found: %s", genes_not_found, sep="")) 196 | } 197 | col_tokeep <- col_tokeep[!is.na(col_tokeep)] 198 | myheatmap <- myheatmap[,col_tokeep] 199 | } 200 | 201 | myheatmap[is.nan(myheatmap)] <- 0 #this in the event of empty clusters 202 | 203 | myheatmap2 <- log2(myheatmap - min(myheatmap) + 1) 204 | 205 | pheatmap(myheatmap2, 206 | cluster_rows=FALSE, 207 | cluster_cols=TRUE, 208 | border_color=NA, 209 | show_colnames=TRUE, 210 | show_rownames=TRUE, 211 | fontsize_col=8, 212 | fontsize_row=12, 213 | cellwidth=10, 214 | width=w, 215 | height=h, 216 | ... 217 | ) 218 | return(myheatmap2) 219 | } else if(cell_model == 'seurat') { 220 | seurat_obj <- seuratInfo(obj) 221 | state_labels <- as.numeric(as.character(Idents(seurat_obj))) 222 | 223 | 224 | names(state_labels) <- rownames(seurat_obj@meta.data) 225 | ref_data <- t(as.matrix(GetAssayData(seurat_obj, assay.type='RNA', slot='counts'))) 226 | 227 | batches <- unique(batchIDs(obj)) 228 | myheatmaps_all <- list() 229 | for(batch_id in batches) { 230 | cell_idx_curplt <- which(seurat_obj@meta.data$plt == batch_id) 231 | if(length(cell_idx_curplt) == 0) { 232 | stop(sprintf('Error: no cells in reference set match the experiment_id %s. Please check batchIDs(phemdObj).', batch_id)) 233 | } 234 | cur_ref_data <- ref_data[cell_idx_curplt,] 235 | cur_state_labels <- state_labels[cell_idx_curplt] 236 | 237 | myheatmap <- matrix(0, nrow=max(state_labels), ncol=ncol(cur_ref_data)) 238 | for(i in seq_len(max(state_labels))) { 239 | cur_idx <- which(cur_state_labels == i) 240 | cur_cluster <- cur_ref_data[cur_idx,] 241 | if(length(cur_idx) > 1) myheatmap[i,] <- colMeans(cur_cluster) 242 | if(length(cur_idx) == 1) myheatmap[i,] <- cur_cluster 243 | } 244 | 245 | selected_clusters_renamed <- vapply(seq_len(max(state_labels)), function(x) paste("C-", x, sep=""), "") 246 | 247 | rownames(myheatmap) <- selected_clusters_renamed 248 | colnames(myheatmap) <- selectMarkers(obj) 249 | 250 | if(!is.null(selected_genes)) { 251 | col_tokeep <- match(selected_genes, selectMarkers(obj)) 252 | if(sum(is.na(col_tokeep)) > 0) { 253 | genes_not_found <- '' 254 | missing_idx <- which(is.na(col_tokeep)) 255 | for(i in seq_len(length(missing_idx))) { 256 | if(i == 1) genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep='') 257 | else genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep=', ') 258 | } 259 | print(sprintf("Genes not found: %s", genes_not_found, sep="")) 260 | } 261 | col_tokeep <- col_tokeep[!is.na(col_tokeep)] 262 | myheatmap <- myheatmap[,col_tokeep] 263 | } 264 | 265 | myheatmap[is.nan(myheatmap)] <- 0 #this in the event of empty clusters 266 | 267 | myheatmap2 <- log2(myheatmap - min(myheatmap) + 1) 268 | myheatmaps_all[[batch_id]] <- myheatmap2 269 | } 270 | 271 | for(i in seq_len(length(myheatmaps_all))) { 272 | if(!exists('myheatmaps_avg')) myheatmaps_avg <- myheatmaps_all[[i]] 273 | else myheatmaps_avg <- myheatmaps_avg + myheatmaps_all[[i]] 274 | } 275 | myheatmaps_avg <- myheatmaps_avg / length(myheatmaps_all) 276 | myheatmaps_all[['average']] <- myheatmaps_avg 277 | pheatmap(myheatmaps_avg, 278 | cluster_rows=TRUE, 279 | cluster_cols=FALSE, 280 | border_color=NA, 281 | show_colnames=TRUE, 282 | show_rownames=TRUE, 283 | fontsize_col=8, 284 | fontsize_row=12, 285 | cellwidth=10, 286 | width=w, 287 | height=h, 288 | ...) 289 | return(myheatmaps_all) 290 | } else { 291 | stop('Error: cell_model must be either "monocle2", "seurat", or "phate"') 292 | } 293 | } 294 | 295 | 296 | #' @title Rotates heatmap marker labels 45 degrees 297 | #' @description Overwrites default draw_colnames in the pheatmap package 298 | #' @details To be used with pheatmap plotting function; not to be called directly. Thanks to Josh O'Brien at http://stackoverflow.com/questions/15505607 299 | #' @param coln Column names 300 | #' @param gaps Spacing of labels 301 | #' @param ... Additional parameters to be passed to \code{gpar} 302 | #' @return Formatted marker labels in heatmap 303 | #' @examples 304 | #' #Not to be called directly 305 | drawColnames45 <- function(coln, gaps, ...) { 306 | coord <- pheatmap:::find_coordinates(length(coln), gaps) 307 | x <- coord$coord - 0.5 * coord$size 308 | res <- grid::textGrob( 309 | coln, x = x, y = unit(1, "npc") - unit(3,"bigpts"), 310 | vjust = 0.75, hjust = 1, rot = 45, gp = grid::gpar(...) 311 | ) 312 | return(res) 313 | } 314 | 315 | 316 | #' @title Plot diffusion map embedding of samples based on distance matrix 317 | #' @description Visualizes diffusion map for network of samples based on square distance matrix (sample-sample pairwise dissimilarity) 318 | #' @details Requires 'destiny' package 319 | #' @param my_distmat phemdObj object containing sample names in @@snames slot 320 | #' @param cluster_assignments Vector containing group assignments for each sample 321 | #' @param pt_sz Size of points representing samples in plot (scaling factor) 322 | #' @param n_dim Number of dimensions for embedding (either 2 or 3) 323 | #' @param pt_label Vector of sample names corresponding to each point (same order as samples in \code{my_distmat} and \code{cluster_assignments}) 324 | #' @param cmap Vector containing colors by which points should be colored (corresponding to cluster_assignments) 325 | #' @param w Width of plot in inches 326 | #' @param h Height of plot in inches 327 | #' @param scale.y Scaling factor for diffusion map y-axis 328 | #' @param angle Rotation factor for diffusion map plot 329 | #' @param autosave Boolean denoting whether or not to save output diffusion map 330 | #' @param ... Additional parameters to be passed to \code{DiffusionMap} function 331 | #' @return DiffusionMap object containing biological sample embedding and associated metadata 332 | #' @examples 333 | #' 334 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 335 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 336 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 337 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 338 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 339 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 340 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 341 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 342 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 343 | #' printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE) 344 | #' dm <- plotGroupedSamplesDmap(my_EMD_mat, cluster_assignments, pt_sz=2) 345 | #' 346 | plotGroupedSamplesDmap <- function(my_distmat, cluster_assignments=NULL, pt_sz=1, n_dim=3, pt_label = NULL, cmap = NULL, w=8, h=5, scale.y=1, angle=40, autosave=FALSE, ...) { 347 | extra_args <- list(...) 348 | if(nrow(my_distmat) != ncol(my_distmat)) { 349 | stop('Error: my_distmat must be a square distance matrix') 350 | } 351 | 352 | # Plot inhibitor groups using diffusion map 353 | covars <- data.frame(covar1 = seq_len(nrow(my_distmat))) 354 | if(nrow(my_distmat) < 30) { 355 | extra_args['n_local'] <- 3 356 | } 357 | 358 | dm_args <- c(list(data=covars, distance = as.dist(my_distmat)), 359 | extra_args[names(extra_args) %in% c("n_local", "density_norm", "rotate", "k", "sigma", "verbose")]) 360 | dm <- do.call(DiffusionMap, dm_args) 361 | 362 | # Check whether user supplied clustering info 363 | if(is.null(cluster_assignments)) { 364 | cluster_assignments <- rep(1, nrow(my_distmat)) 365 | cluster_assignments_named <- rep('A', nrow(my_distmat)) 366 | legend_bool <- FALSE 367 | } else { 368 | if(nrow(my_distmat) != length(cluster_assignments)) { 369 | stop('Error: cluster_assignments must be the same length as the number of rows in my_distmat') 370 | } 371 | if(is.null(cmap)) { 372 | getPalette <- colorRampPalette(brewer.pal(11, "Spectral")) 373 | cmap <- getPalette(max(c(cluster_assignments),3)) # min palette size = 3 374 | if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey 375 | } 376 | if(length(cmap) > 1) palette(cmap) 377 | 378 | cluster_assignments_named <- vapply(cluster_assignments, function(x) intToUtf8(64+x), "") 379 | legend_bool <- TRUE 380 | } 381 | 382 | if(n_dim >= 3) { 383 | plot(dm, c(1,2,3), pch=20, col=factor(cluster_assignments_named), pal=cmap, cex.symbols = pt_sz, box=FALSE, xlab="", ylab="", zlab="", y.margin.add = -0.5, draw_legend=legend_bool, legend_opts = list(posx = c(0.85,0.88), posy = c(0.05, 0.7)), scale.y=scale.y, angle=angle) 384 | 385 | } else { 386 | dm.embedding <- as.data.frame(eigenvectors(dm)) 387 | # visualize traj colored by state 388 | myplot <- ggplot(dm.embedding, aes(x=dm.embedding[,1], y=dm.embedding[,2], color=factor(cluster_assignments_named))) + 389 | geom_point(size=pt_sz) + 390 | labs(x="", y = "", color = "Sample cluster") + 391 | scale_color_manual(breaks = levels(factor(cluster_assignments_named)), 392 | values=cmap[1:length(levels(factor(cluster_assignments_named)))]) + 393 | theme_classic() + 394 | theme(axis.title.x=element_blank(), 395 | axis.text.x=element_blank(), 396 | axis.ticks.x=element_blank(), 397 | axis.title.y=element_blank(), 398 | axis.text.y=element_blank(), 399 | axis.ticks.y=element_blank(), 400 | axis.line = element_line(colour = "black", 401 | size = 1, linetype = "solid")) 402 | print(myplot) 403 | } 404 | 405 | if(!is.null(pt_label)) { 406 | cluster_assignments_named <- vapply(cluster_assignments, function(x) paste("G-", x, sep=""), "") 407 | if(n_dim >= 3) { 408 | s3d <- scatterplot3d(eigenvectors(dm)[,1], eigenvectors(dm)[,2], eigenvectors(dm)[,3], color=as.numeric(factor(cluster_assignments_named)), pch=20, grid=F, box=F) 409 | s3d.coords <- s3d$xyz.convert(eigenvectors(dm)[,1], eigenvectors(dm)[,2], eigenvectors(dm)[,3]) 410 | text(s3d.coords$x, s3d.coords$y, # x and y coordinates 411 | labels=pt_label, # text to plot 412 | cex=.3, pos=2) 413 | } else { 414 | plot(eigenvectors(dm)[,1], eigenvectors(dm)[,2], main = '', xlab = '', ylab = '', xaxt = 'n', yaxt = 'n', pch=20, col=factor(cluster_assignments_named), cex = pt_sz) 415 | text(eigenvectors(dm)[,c(1,2)],labels = pt_label, pos = 2, cex=0.4) 416 | } 417 | } 418 | return(dm) 419 | } 420 | 421 | #' @title Plots cell subtype frequency histograms summarizing each group of samples 422 | #' @description Visualizes plots of relative frequency ("weights") of cell subtypes ("bins" or "signatures") summarizing each group of single-cell samples. Each summary histogram is computed by taking the bin-wise mean of all samples in the group 423 | #' @details \code{groupSamples} must be called before calling this function. Saves plots in directory called "summary_inhibs" 424 | #' @param myobj Phemd object containing cell subtype relative frequency in @@data_cluster_weights slot 425 | #' @param cluster_assignments Vector containing group assignments for each sample in myobj 426 | #' @param cell_model Method by which cell state was modeled (either "monocle2", "seurat", or "phate") 427 | #' @param cmap Vector containing colors by which histogram bars should be colored (optional) 428 | #' @param ncol.plot Number of columns to use to plot multi-panel histogram plot 429 | #' @param ax.lab.sz Scaling factor for axis labels (default 2.5) 430 | #' @param title.sz Scaling factor for plot title (default 3) 431 | #' @return None 432 | #' @examples 433 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 434 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 435 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 436 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 437 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 438 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 439 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 440 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 441 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 442 | #' printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE) 443 | #' dm <- plotGroupedSamplesDmap(my_EMD_mat, cluster_assignments, '.', pt_sz=2, pt_label = sNames(my_phemdObj_final)) 444 | #' plotSummaryHistograms(my_phemdObj_final, cluster_assignments, cell_model='monocle2) 445 | #' 446 | plotSummaryHistograms <- function(myobj, cluster_assignments, cell_model=c('monocle2','seurat', 'phate'), cmap=NULL, ncol.plot=4, ax.lab.sz=2.5, title.sz=3) { 447 | cell_model <- match.arg(cell_model, c('monocle2','seurat')) 448 | if(cell_model == 'monocle2') { 449 | monocle_obj <- monocleInfo(myobj) 450 | labels <- pData(phenoData(monocle_obj)) 451 | state_labels <- as.numeric(labels$State) 452 | 453 | } else if(cell_model == 'seurat') { 454 | seurat_obj <- seuratInfo(myobj) 455 | state_labels <- as.numeric(as.character(Idents(seurat_obj))) 456 | } else if(cell_model == 'phate') { 457 | state_labels <- as.numeric(phateInfo(myobj)$cellstate.labels) 458 | } else { 459 | stop('Error: cell_model must be either "monocle2" or "seurat"') 460 | } 461 | 462 | 463 | cluster_weights <- celltypeFreqs(myobj) 464 | 465 | if(is.null(cmap)) { 466 | getPalette <- colorRampPalette(brewer.pal(11, "Spectral")) 467 | cmap <- getPalette(max(state_labels)) 468 | } 469 | 470 | proto_inhibs <- matrix(0, max(cluster_assignments), ncol(cluster_weights)) 471 | for(i in seq_len(max(cluster_assignments))) { 472 | if(sum(cluster_assignments == i) == 1) { 473 | proto_inhibs[i,] <- cluster_weights[which(cluster_assignments == i),] 474 | } else { 475 | proto_inhibs[i,] <- colMeans(cluster_weights[which(cluster_assignments == i),]) 476 | } 477 | } 478 | 479 | nrow.plot <- ceiling(max(cluster_assignments) / ncol.plot) 480 | par(mfrow=c(nrow.plot,ncol.plot)) 481 | for(i in seq_len(max(cluster_assignments))) { 482 | if(max(proto_inhibs[i,]) > 0.4) ymax <- max(proto_inhibs[i,])+0.1 483 | else ymax <- 0.4 484 | barplot(proto_inhibs[i,], col=cmap, main='', xlab='', ylab = "Frequency (%)", ylim = c(0, ymax), cex.axis=1.5, cex.names = 2, cex.lab = ax.lab.sz, names.arg = seq_len(ncol(proto_inhibs))) 485 | 486 | title(xlab="Cell subtype", line=3.5, cex.lab=ax.lab.sz) 487 | title(main=sprintf("Group %s", intToUtf8(64+i)), line=0, cex.main=title.sz) 488 | } 489 | } 490 | 491 | #' @title Plot cell yield of each sample as bar plot 492 | #' @description Plots cell yield (number of viable cells) of each single-cell sample in decreasing order as horizontal bar plot 493 | #' @param myobj Phmed object containing expression data for each sample in 'data' slot 494 | #' @param labels Vector containing group labels for samples (optional). If not provided, bars will be of uniform color (blue) 495 | #' @param cmap Vector containing colors by which histogram bars should be colored (optional) 496 | #' @param font_sz Scaling factor for font size of sample names in barplot 497 | #' @param w Width of plot in inches 498 | #' @param h Height of plot in inches 499 | #' @return None 500 | #' @examples 501 | #' 502 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 503 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 504 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 505 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 506 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 507 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 508 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 509 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 510 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 511 | #' plotCellYield(my_phemdObj_final, labels=cluster_assignments, font_sz = 0.8) 512 | #' 513 | plotCellYield <- function(myobj, labels=NULL, cmap=NULL, font_sz = 0.6, w=8, h=9.5) { 514 | nsample <- length(rawExpn(myobj)) 515 | cell_yield <- vapply(rawExpn(myobj), nrow, integer(1L)) 516 | 517 | order_idx <- order(cell_yield, decreasing=FALSE) 518 | cell_yield_ordered <- cell_yield[order_idx] 519 | snames_ordered <- sNames(myobj)[order_idx] 520 | 521 | 522 | par(mar=c(6,6,2,2)) 523 | 524 | if(!is.null(labels)) { 525 | if(length(labels) != nsample) { 526 | stop('Error: length of "labels" vector must be equal to length of rawExpn(myobj)') 527 | } 528 | labels_ordered <- labels[order_idx] 529 | if(is.null(cmap)) { 530 | getPalette <- colorRampPalette(brewer.pal(11, "Spectral")) 531 | cmap <- getPalette(max(labels)) 532 | if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey 533 | } 534 | color_vec <- cmap[labels_ordered] 535 | xx <- barplot(cell_yield_ordered, main='', horiz=TRUE, names.arg=snames_ordered, las=1, cex.names=font_sz, col=color_vec) 536 | } else { 537 | xx <- barplot(cell_yield_ordered, main='', horiz=TRUE, names.arg=snames_ordered, las=1, cex.names=font_sz, col='blue') 538 | } 539 | title(xlab="Cell yield (number of cells)", line=3, cex.lab=1.5) 540 | 541 | } 542 | 543 | 544 | -------------------------------------------------------------------------------- /R/functions-core.R: -------------------------------------------------------------------------------- 1 | ################ 2 | ## FUNCTIONS ### 3 | ################ 4 | 5 | ######################################################### 6 | ### Private methods below (not exported in namespace) ### 7 | ######################################################### 8 | 9 | #' @title Retrieve single-cell sample sizes 10 | #' @description Takes initial list of single-cell samples and returns vector containing number of cells in each sample. 11 | #' @details Private method (not exported in namespace) 12 | #' @param data_list List of length num_samples (each element has dimension num_cells x num_markers) 13 | #' @return Vector of length num_samples representing number of cells in each sample 14 | #' @examples 15 | #' \dontrun{ 16 | #' sample_sizes <- getSampleSizes(all_expn_data) 17 | #' } 18 | 19 | getSampleSizes <- function(data_list) { 20 | return(vapply(data_list, nrow, integer(1L))) 21 | } 22 | 23 | #' @title Retrieve reference cell clusters 24 | #' @description Takes initial Phemd struct and returns cell clusters as assigned by clustering algorithm (e.g. PHATE or Monocle2) 25 | #' @details Private method (not exported in namespace) 26 | #' @param obj Phemd struct containing cell-state embedding object and underlying expression data 27 | #' @param cell_model String representing data model for cell-state space ("seurat", "monocle2", or "phate") 28 | #' @param expn_type String representing whether to return raw expression values or coordinates in dimensionality-reduced feature space 29 | #' @param ndim Number of dimensions in reduced dimensionality space (e.g. PHATE / CCA) to use (only relevant in reduced dimensionality space) 30 | #' @return List of data matrices; each list element is of size num_cells_in_cluster x num_markers and represents a distinct cell cluster 31 | #' @examples 32 | #' \dontrun{ 33 | #' cluster_expression_data <- retrieveRefClusters(my_phemdObj) 34 | #' } 35 | #' 36 | 37 | retrieveRefClusters <- function(obj, cell_model=c('monocle2','seurat', 'phate'), expn_type='reduced', ndim=10) { 38 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 39 | if(cell_model == 'monocle2') { 40 | # Extract state labels from monocle data object 41 | monocle_obj <- monocleInfo(obj) 42 | labels <- pData(phenoData(monocle_obj)) 43 | state_labels <- as.numeric(labels$State) 44 | 45 | # Split data frame based on cluster assignments 46 | if(expn_type == 'reduced') { 47 | mydata <- as.data.frame(t(reducedDimS(obj@monocle_obj))) 48 | if(ndim < ncol(mydata)) { 49 | mydata <- mydata[,1:ndim] 50 | } 51 | } else if(expn_type == 'raw') { 52 | mydata <- as.data.frame(t(pooledCells(obj))) 53 | } 54 | } else if(cell_model == 'seurat') { 55 | seurat_obj <- seuratInfo(obj) 56 | state_labels <- as.numeric(as.character(Idents(seurat_obj))) 57 | if(min(state_labels) == 0) state_labels <- state_labels + 1 #ensure cluster labels are 1 indexed instead of zero indexed 58 | names(state_labels) <- names(Idents(seurat_obj)) # label cluster assignments with cell name 59 | if(expn_type == 'cca.aligned') { 60 | # aligned CCA expression data (num_cells x num_markers) 61 | mydata <- Embeddings(object = seurat_obj, reduction = 'cca.aligned')[,seq_len(ndim)] 62 | } else if(expn_type %in% c('pca', 'reduced')) { 63 | mydata <- Embeddings(object = seurat_obj, reduction = 'pca')[,seq_len(ndim)] 64 | } else if(expn_type == 'tsne') { 65 | mydata <- Embeddings(object = seurat_obj, reduction = 'tsne')[,seq_len(ndim)] 66 | } else if(expn_type == 'umap') { 67 | mydata <- Embeddings(object = seurat_obj, reduction = 'umap')[,seq_len(ndim)] 68 | } else if(expn_type == 'raw') { 69 | mydata <- t(as.matrix(GetAssayData(seurat_obj, assay='RNA', slot='counts'))) 70 | } else { 71 | stop('Error: expn_type must be one of the following: "raw", "pca", "umap", "tsne", "cca.aligned", "reduced"') 72 | } 73 | 74 | # Split data frame based on cluster assignments 75 | mydata <- as.data.frame(mydata) 76 | } else if(cell_model == 'phate') { 77 | phate_obj <- phateInfo(obj) 78 | state_labels <- phate_obj$cellstate.labels 79 | 80 | # Split data frame based on cluster assignments 81 | if(expn_type == 'reduced') { 82 | mydata <- as.data.frame(phate_obj$embedding) 83 | if(ndim < ncol(mydata)) { 84 | mydata <- mydata[,1:ndim] 85 | } 86 | } else if(expn_type == 'raw') { 87 | mydata <- as.data.frame(t(pooledCells(obj))) 88 | } else { 89 | stop('Error: expn_type must be either "raw" or "reduced"') 90 | } 91 | } else { 92 | stop('Error: cell_model must be either "monocle2", "seurat", or "phate"') 93 | } 94 | ref_clusters <- split(mydata, state_labels) 95 | return(ref_clusters) 96 | } 97 | 98 | 99 | #' @title Identify cluster centroids (cell names) 100 | #' @description Takes initial list and returns list of cell names representing centroid of cluster 101 | #' @details Private method (not exported in namespace) 102 | #' @param ref_clusters list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers) 103 | #' @return List of names; element \var{i} represents the name of the cell in cluster \var{i} that is closest to the centroid (arithmetic mean) of cluster \var{i} 104 | #' @examples 105 | #' \dontrun{ 106 | #' centroid_names <- identifyCentroids(ref_clusters) 107 | #' } 108 | identifyCentroids <- function(ref_clusters) { 109 | centroids <- lapply(ref_clusters, function(cur_cluster) { 110 | arith_centroid <- colMeans(cur_cluster) 111 | curdist <- t(as.matrix(apply(cur_cluster, 1, function(x) norm(x-arith_centroid, type="2")))) 112 | closest_cell <- colnames(curdist)[which.min(curdist)] #closest cell to arithmetic centroid 113 | return(closest_cell) 114 | }) 115 | return(centroids) 116 | } 117 | 118 | 119 | #' @title Get arithmetic centroids (coordinates) 120 | #' @description Takes initial list and returns a matrix with row \var{i} representing the arithmetic centroid of cluster \var{i} 121 | #' @details Private method (not exported in namespace) 122 | #' @param ref_clusters list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers) 123 | #' @return Matrix of dimension num_cluster x num_markers; row \var{i} representing the arithmetic centroid of cluster \var{i} 124 | #' @examples 125 | #' \dontrun{ 126 | #' cluster_centroids <- getArithmeticCentroids(ref_clusters) 127 | #' } 128 | getArithmeticCentroids <- function(ref_clusters) { 129 | if(length(ref_clusters) < 1) stop('Error: input requires at least 1 reference cluster') 130 | 131 | #centroids <- matrix(0, nrow=length(ref_clusters), ncol = ncol(ref_clusters[[1]])) 132 | #for(i in seq_len(length(ref_clusters))) { 133 | # cur_cluster <- ref_clusters[[i]] 134 | # centroids[i,] <- colMeans(cur_cluster) 135 | #} 136 | 137 | centroids_list <- lapply(ref_clusters, colMeans) 138 | centroids <- do.call(rbind, centroids_list) 139 | return(centroids) 140 | } 141 | 142 | #' @title Assign cells to a reference cell subtype 143 | #' @description Assigns each cell in \code{cur_cells} to a cluster based on nearest cell in Monocle 2 tree 144 | #' @details Private method (not exported in namespace). Uses RANN package for fast knn search 145 | #' @param cur_cells Matrix of cells to be assigned to clusters (Dim: \var{num_cells} x \var{num_markers}) 146 | #' @param ref_cells Matrix of cells used to build reference Monocle 2 tree (Dim: \var{num_monocle_cells} x \var{num_markers}) 147 | #' @param ref_cell_labels Vector of length \var{num_monocle_cells} containing Monocle 2 cell branch assignments 148 | #' @param cell_model Either "monocle2", "seurat", or "phate" depending on method used to model cell state space 149 | #' @return Vector of length \var{num_cells} representing cluster assignments for each cell in \var{cur_cells} 150 | #' @examples 151 | #' \dontrun{ 152 | #' cur_cells_cluster_labels <- assignCellClusterNearestNode(cur_cells_expn_data, 153 | #' clustered_cells_expn_data, clustered_cells_cluster_labels, cell_model='monocle2') 154 | #' } 155 | assignCellClusterNearestNode <- function(cur_cells, ref_cells, ref_cell_labels, cell_model=c('monocle2', 'seurat', 'phate')) { 156 | if(nrow(ref_cells) != length(ref_cell_labels)) stop("Error: number of cells and cell labels do not match") 157 | 158 | closest <- RANN::nn2(data = ref_cells, query = cur_cells, k = 1) #fast nearest neighbor search 159 | nearest_cell <- closest$nn.idx 160 | 161 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 162 | if(cell_model %in% c('monocle2', 'phate')) { 163 | assigned <- ref_cell_labels[nearest_cell] 164 | } else if(cell_model == 'seurat') { 165 | nearest_cell_names <- rownames(ref_cells)[nearest_cell] 166 | assigned <- as.numeric(ref_cell_labels[nearest_cell_names]) 167 | } else { 168 | stop('Error: cell_model must be either monocle2, seurat, or phate') 169 | } 170 | 171 | return(assigned) 172 | } 173 | 174 | #' @title Models expression data using generalized linear model with Gaussian error 175 | #' @description Useful for modeling pre-normalized single-cell expression data. 176 | #' @details Private method (not to be called by user directly). Requires VGAM package. Obtained from VGAM v1.0-5 (https://www.rdocumentation.org/packages/VGAM/versions/1.0-5/topics/gaussianff) 177 | #' @param dispersion Dispersion parameter. If 0, then estimate as described in VGAM 1.0-5 documentation. 178 | #' @param parallel A logical or formula. If a formula, the response of the formula should be a logical and the terms of the formula indicates whether or not those terms are parallel. 179 | #' @param zero An integer-valued vector specifying which linear/additive predictors are modelled as intercepts only. The values must be from the set {1...M} where Mis the number of columns of the matrix response. 180 | #' @return Generalized linear model with Gaussian error 181 | #' 182 | gaussianffLocal <- function(dispersion = 0, parallel = FALSE, zero = NULL) { 183 | if (!VGAM::is.Numeric(dispersion, length.arg = 1) || dispersion < 184 | 0) { 185 | stop("bad input for argument 'dispersion'") 186 | } 187 | estimated.dispersion <- dispersion == 0 188 | cur_constraints <- expression({ 189 | cur_constraints <- VGAM::cm.VGAM(matrix(1, M, 1), 190 | x = x, bool = parallel, 191 | constraints = constraints 192 | ) 193 | cur_constraints <- VGAM::cm.zero.VGAM(constraints, 194 | x = x, zero, 195 | M = M, predictors.names = predictors.names, M1 = 1 196 | ) 197 | }) 198 | deviance_fn <- function(mu,y, w, residuals = FALSE, eta, extra = NULL) { 199 | M <- if (is.matrix(y)) { 200 | ncol(y) 201 | } else { 202 | 1 203 | } 204 | n <- if (is.matrix(y)) { 205 | nrow(y) 206 | } else { 207 | length(y) 208 | } 209 | wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n) 210 | if (residuals) { 211 | if (M > 1) { 212 | U <- vchol(wz, M = M, n = n) 213 | temp <- mux22(U, y - mu, 214 | M = M, upper = TRUE, 215 | as.matrix = TRUE 216 | ) 217 | dimnames(temp) <- dimnames(y) 218 | temp 219 | } 220 | else { 221 | (y - mu) * sqrt(wz) 222 | } 223 | } 224 | else { 225 | ResSS.vgam(y - mu, wz = wz, M = M) 226 | } 227 | } 228 | 229 | infos_fn <- function(...) { 230 | list( 231 | M1 = 1, Q1 = 1, expected = TRUE, multipleResponses = TRUE, 232 | quasi.type = TRUE, zero = zero 233 | ) 234 | } 235 | 236 | init_expn <- expression({ 237 | if (is.R()) assign("CQO.FastAlgorithm", TRUE, envir = VGAM::VGAMenv) else CQO.FastAlgorithm <<- TRUE 238 | if (any(function.name == c("cqo", "cao")) && (length(zero) || 239 | (is.logical(parallel) && parallel))) { 240 | stop("cannot handle non-default arguments for cqo() and cao()") 241 | } 242 | temp5 <- w.y.check( 243 | w = w, y = y, ncol.w.max = Inf, ncol.y.max = Inf, 244 | out.wy = TRUE, colsyperw = 1, maximize = TRUE 245 | ) 246 | w <- temp5$w 247 | y <- temp5$y 248 | M <- if (is.matrix(y)) ncol(y) else 1 249 | dy <- dimnames(y) 250 | predictors.names <- if (!is.null(dy[[2]])) { 251 | dy[[2]] 252 | } else { 253 | param.names( 254 | "Y", 255 | M 256 | ) 257 | } 258 | if (!length(etastart)) etastart <- 0 * y 259 | }) 260 | 261 | last_expn <- expression({ 262 | dy <- dimnames(y) 263 | if (!is.null(dy[[2]])) dimnames(fit$fitted.values) <- dy 264 | dpar <- dispersion 265 | if (!dpar) { 266 | wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n) 267 | temp5 <- ResSS.vgam(y - mu, wz = wz, M = M) 268 | dpar <- temp5 / (length(y) - (if (is.numeric(ncol(X.vlm.save))) ncol(X.vlm.save) else 0)) 269 | } 270 | misc$dispersion <- dpar 271 | misc$default.dispersion <- 0 272 | misc$estimated.dispersion <- estimated.dispersion 273 | misc$link <- rep_len("identitylink", M) 274 | names(misc$link) <- predictors.names 275 | misc$earg <- vector("list", M) 276 | for (ilocal in seq_len(M)) misc$earg[[ilocal]] <- list() 277 | names(misc$link) <- predictors.names 278 | if (is.R()) { 279 | if (exists("CQO.FastAlgorithm", envir = VGAM::VGAMenv)) { 280 | rm("CQO.FastAlgorithm", 281 | envir = VGAM::VGAMenv 282 | ) 283 | } 284 | } else { 285 | while (exists("CQO.FastAlgorithm")) remove("CQO.FastAlgorithm") 286 | } 287 | misc$expected <- TRUE 288 | misc$multipleResponses <- TRUE 289 | }) 290 | 291 | loglikelihood_fn <- function(mu, y, w, residuals = FALSE, 292 | eta, extra = NULL, summation = TRUE) { 293 | M <- if (is.matrix(y)) { 294 | ncol(y) 295 | } else { 296 | 1 297 | } 298 | n <- if (is.matrix(y)) { 299 | nrow(y) 300 | } else { 301 | length(y) 302 | } 303 | wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n) 304 | if (residuals) { 305 | stop("loglikelihood residuals not implemented yet") 306 | } 307 | else { 308 | temp1 <- ResSS.vgam(y - mu, wz = wz, M = M) 309 | ll.elts <- if (M == 1 || ncol(wz) == M) { 310 | -0.5 * temp1 + 0.5 * (log(wz)) - n * (M / 2) * 311 | log(2 * pi) 312 | } 313 | else { 314 | if (all(wz[1, ] == apply(wz, 2, min)) && all(wz[1, ] == apply(wz, 2, max))) { 315 | onewz <- m2a(wz[1, , drop = FALSE], M = M) 316 | onewz <- onewz[, , 1] 317 | logdet <- determinant(onewz)$modulus 318 | logretval <- -0.5 * temp1 + 0.5 * n * logdet - 319 | n * (M / 2) * log(2 * pi) 320 | distval <- stop("variable 'distval' not computed yet") 321 | logretval <- -(ncol(onewz) * log(2 * pi) + 322 | logdet + distval) / 2 323 | logretval 324 | } 325 | else { 326 | logretval <- -0.5 * temp1 - n * (M / 2) * log(2 * pi) 327 | for (ii in seq_len(n)) { 328 | onewz <- m2a(wz[ii, , drop = FALSE], M = M) 329 | onewz <- onewz[, , 1] 330 | logdet <- determinant(onewz)$modulus 331 | logretval <- logretval + 0.5 * logdet 332 | } 333 | logretval 334 | } 335 | } 336 | if (summation) { 337 | sum(ll.elts) 338 | } 339 | else { 340 | ll.elts 341 | } 342 | } 343 | } 344 | validparams_fn <- function(eta, y, extra = NULL) { 345 | okay1 <- all(is.finite(eta)) 346 | okay1 347 | } 348 | 349 | new("vglmff", 350 | blurb = c( 351 | "Vector linear/additive model\n", 352 | "Links: identitylink for Y1,...,YM" 353 | ), constraints = cur_constraints, deviance = deviance_fn, infos = infos_fn, 354 | initialize = init_expn, linkinv = function(eta, extra = NULL) eta, 355 | last = last_expn, 356 | loglikelihood = loglikelihood_fn, linkfun = function(mu, extra = NULL) mu, 357 | vfamily = "gaussianff", 358 | validparams = validparams_fn, deriv = expression({ 359 | wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n) 360 | mux22(cc = t(wz), xmat = y - mu, M = M, as.matrix = TRUE) 361 | }), weight = expression({ 362 | wz 363 | }) 364 | ) 365 | } 366 | 367 | #' @title project2MST (updated from monocle v2.24.1 package for R 4.0+ compatibliity) 368 | #' @description project2MST 369 | #' @details Private method (not to be called by user directly). Adapted from monocle v2.24.1 (https://www.bioconductor.org/packages/release/bioc/manuals/monocle/man/monocle.pdf) 370 | #' @param cds the CellDataSet upon which to perform this operation 371 | #' @param Projection_Method the projection method 372 | #' @return CellDataSet with updatedtrajectory inference 373 | project2MSTUpdated <- function(cds, Projection_Method) 374 | { 375 | dp_mst <- minSpanningTree(cds) 376 | Z <- reducedDimS(cds) 377 | Y <- reducedDimK(cds) 378 | cds <- monocle:::findNearestPointOnMST(cds) 379 | closest_vertex <- cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex 380 | closest_vertex_names <- colnames(Y)[closest_vertex] 381 | closest_vertex_df <- as.matrix(closest_vertex) 382 | row.names(closest_vertex_df) <- row.names(closest_vertex) 383 | tip_leaves <- names(which(degree(dp_mst) == 1)) 384 | if (!is.function(Projection_Method)) { 385 | P <- Y[, closest_vertex] 386 | } 387 | else { 388 | P <- matrix(rep(0, length(Z)), nrow = nrow(Z)) 389 | for (i in 1:length(closest_vertex)) { 390 | neighbors <- names(V(dp_mst)[suppressWarnings(nei(closest_vertex_names[i], 391 | mode = "all"))]) 392 | projection <- NULL 393 | distance <- NULL 394 | Z_i <- Z[, i] 395 | for (neighbor in neighbors) { 396 | if (closest_vertex_names[i] %in% tip_leaves) { 397 | tmp <- monocle:::projPointOnLine(Z_i, Y[, c(closest_vertex_names[i], 398 | neighbor)]) 399 | } 400 | else { 401 | tmp <- Projection_Method(Z_i, Y[, c(closest_vertex_names[i], 402 | neighbor)]) 403 | } 404 | projection <- rbind(projection, tmp) 405 | distance <- c(distance, dist(rbind(Z_i, tmp))) 406 | } 407 | if (!inherits(projection, "matrix")) { #Updated from class(mydata) != "matrix" 408 | projection <- as.matrix(projection) 409 | } 410 | P[, i] <- projection[which(distance == min(distance))[1], 411 | ] 412 | } 413 | } 414 | colnames(P) <- colnames(Z) 415 | dp <- as.matrix(dist(t(P))) 416 | min_dist = min(dp[dp != 0]) 417 | dp <- dp + min_dist 418 | diag(dp) <- 0 419 | cellPairwiseDistances(cds) <- dp 420 | gp <- graph.adjacency(dp, mode = "undirected", weighted = TRUE) 421 | dp_mst <- minimum.spanning.tree(gp) 422 | cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_tree <- dp_mst 423 | cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_dist <- P 424 | cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex <- closest_vertex_df 425 | cds 426 | } 427 | 428 | #' @title newCellDataSetUpdated (updated from monocle v2.24.1 package for R 4.0+ compatibliity) 429 | #' @description Creates new CDS 430 | #' @details Private method (not to be called by user directly). Adapted from monocle v2.24.1 (https://www.bioconductor.org/packages/release/bioc/manuals/monocle/man/monocle.pdf) 431 | #' @param cellData the CellDataSet upon which to perform this operation 432 | #' @param phenoData phenoData 433 | #' @param featureData featureData 434 | #' @param lowerDetectionLimit lowerDetectionLimit 435 | #' @param expressionFamily expressionFamily 436 | #' @return CellDataSet 437 | newCellDataSetUpdated <- function (cellData, phenoData = NULL, featureData = NULL, lowerDetectionLimit = 0.1, 438 | expressionFamily = VGAM::negbinomial.size()) 439 | { 440 | if (!("gene_short_name" %in% colnames(featureData))) { 441 | warning("Warning: featureData must contain a column verbatim named 'gene_short_name' for certain functions") 442 | } 443 | if (!inherits(cellData, "matrix") && any(isSparseMatrix(cellData) == 444 | FALSE)) { 445 | stop("Error: argument cellData must be a matrix (either sparse from the Matrix package or dense)") 446 | } 447 | if (!("gene_short_name" %in% colnames(featureData))) { 448 | warning("Warning: featureData must contain a column verbatim named 'gene_short_name' for certain functions") 449 | } 450 | sizeFactors <- rep(NA_real_, ncol(cellData)) 451 | if (is.null(phenoData)) 452 | phenoData <- annotatedDataFrameFrom(cellData, byrow = FALSE) 453 | if (is.null(featureData)) 454 | featureData <- annotatedDataFrameFrom(cellData, byrow = TRUE) 455 | if (!("gene_short_name" %in% colnames(featureData))) { 456 | warning("Warning: featureData must contain a column verbatim named 'gene_short_name' for certain functions") 457 | } 458 | phenoData$Size_Factor <- sizeFactors 459 | cds <- new("CellDataSet", assayData = assayDataNew("environment", 460 | exprs = cellData), phenoData = phenoData, featureData = featureData, 461 | lowerDetectionLimit = lowerDetectionLimit, expressionFamily = expressionFamily, 462 | dispFitInfo = new.env(hash = TRUE)) 463 | validObject(cds) 464 | cds 465 | } 466 | 467 | #' @title orderCells (updated from monocle v2.24.1 package for R 4.0+ compatibliity) 468 | #' @description Orders cells according to pseudotime. 469 | #' @details Private method (not to be called by user directly). Adapted from monocle v2.24.1 (https://www.bioconductor.org/packages/release/bioc/manuals/monocle/man/monocle.pdf) 470 | #' @param cds the CellDataSet upon which to perform this operation 471 | #' @param root_state The state to use as the root of the trajectory. 472 | #' @param params num_paths the number of end-point cell states to allow in the biological process. 473 | #' @param reverse whether to reverse the beginning and end points of the learned biological process. 474 | #' @return CellDataSet with updatedtrajectory inference 475 | orderCellsUpdated <- function (cds, root_state = NULL, num_paths = NULL, reverse = NULL) 476 | { 477 | if (class(cds)[1] != "CellDataSet") { 478 | stop("Error cds is not of type 'CellDataSet'") 479 | } 480 | if (is.null(cds@dim_reduce_type)) { 481 | stop("Error: dimensionality not yet reduced. Please call reduceDimension() before calling this function.") 482 | } 483 | if (any(c(length(cds@reducedDimS) == 0, length(cds@reducedDimK) == 484 | 0))) { 485 | stop("Error: dimension reduction didn't prodvide correct results. Please check your reduceDimension() step and ensure correct dimension reduction are performed before calling this function.") 486 | } 487 | root_cell <- monocle:::select_root_cell(cds, root_state, reverse) 488 | cds@auxOrderingData <- new.env(hash = TRUE) 489 | if (cds@dim_reduce_type == "ICA") { 490 | if (is.null(num_paths)) { 491 | num_paths = 1 492 | } 493 | adjusted_S <- t(cds@reducedDimS) 494 | dp <- as.matrix(dist(adjusted_S)) 495 | cellPairwiseDistances(cds) <- as.matrix(dist(adjusted_S)) 496 | gp <- graph.adjacency(dp, mode = "undirected", weighted = TRUE) 497 | dp_mst <- minimum.spanning.tree(gp) 498 | minSpanningTree(cds) <- dp_mst 499 | next_node <<- 0 500 | res <- pq_helper(dp_mst, use_weights = FALSE, root_node = root_cell) 501 | cds@auxOrderingData[[cds@dim_reduce_type]]$root_cell <- root_cell 502 | order_list <- monocle:::extract_good_branched_ordering(res$subtree, 503 | res$root, cellPairwiseDistances(cds), num_paths, 504 | FALSE) 505 | cc_ordering <- order_list$ordering_df 506 | row.names(cc_ordering) <- cc_ordering$sample_name 507 | minSpanningTree(cds) <- as.undirected(order_list$cell_ordering_tree) 508 | pData(cds)$Pseudotime <- cc_ordering[row.names(pData(cds)), 509 | ]$pseudo_time 510 | pData(cds)$State <- cc_ordering[row.names(pData(cds)), 511 | ]$cell_state 512 | mst_branch_nodes <- V(minSpanningTree(cds))[which(degree(minSpanningTree(cds)) > 513 | 2)]$name 514 | minSpanningTree(cds) <- dp_mst 515 | cds@auxOrderingData[[cds@dim_reduce_type]]$cell_ordering_tree <- as.undirected(order_list$cell_ordering_tree) 516 | } 517 | else if (cds@dim_reduce_type == "DDRTree") { 518 | if (is.null(num_paths) == FALSE) { 519 | message("Warning: num_paths only valid for method 'ICA' in reduceDimension()") 520 | } 521 | cc_ordering <- monocle:::extract_ddrtree_ordering(cds, root_cell) 522 | pData(cds)$Pseudotime <- cc_ordering[row.names(pData(cds)), 523 | ]$pseudo_time 524 | K_old <- reducedDimK(cds) 525 | old_dp <- cellPairwiseDistances(cds) 526 | old_mst <- minSpanningTree(cds) 527 | old_A <- reducedDimA(cds) 528 | old_W <- reducedDimW(cds) 529 | cds <- project2MSTUpdated(cds, monocle:::project_point_to_line_segment) #Calling local updated version of this fx 530 | minSpanningTree(cds) <- cds@auxOrderingData[[cds@dim_reduce_type]]$pr_graph_cell_proj_tree 531 | root_cell_idx <- which(V(old_mst)$name == root_cell, 532 | arr.ind = T) 533 | cells_mapped_to_graph_root <- which(cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex == 534 | root_cell_idx) 535 | if (length(cells_mapped_to_graph_root) == 0) { 536 | cells_mapped_to_graph_root <- root_cell_idx 537 | } 538 | cells_mapped_to_graph_root <- V(minSpanningTree(cds))[cells_mapped_to_graph_root]$name 539 | tip_leaves <- names(which(degree(minSpanningTree(cds)) == 540 | 1)) 541 | root_cell <- cells_mapped_to_graph_root[cells_mapped_to_graph_root %in% 542 | tip_leaves][1] 543 | if (is.na(root_cell)) { 544 | root_cell <- select_root_cell(cds, root_state, reverse) 545 | } 546 | cds@auxOrderingData[[cds@dim_reduce_type]]$root_cell <- root_cell 547 | cc_ordering_new_pseudotime <- monocle:::extract_ddrtree_ordering(cds, 548 | root_cell) 549 | pData(cds)$Pseudotime <- cc_ordering_new_pseudotime[row.names(pData(cds)), 550 | ]$pseudo_time 551 | if (is.null(root_state) == TRUE) { 552 | closest_vertex <- cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex 553 | pData(cds)$State <- cc_ordering[closest_vertex[, 554 | 1], ]$cell_state 555 | } 556 | monocle:::reducedDimK(cds) <- K_old 557 | cellPairwiseDistances(cds) <- old_dp 558 | minSpanningTree(cds) <- old_mst 559 | monocle:::reducedDimA(cds) <- old_A 560 | monocle:::reducedDimW(cds) <- old_W 561 | mst_branch_nodes <- V(minSpanningTree(cds))[which(degree(minSpanningTree(cds)) > 562 | 2)]$name 563 | } 564 | else if (cds@dim_reduce_type == "SimplePPT") { 565 | if (is.null(num_paths) == FALSE) { 566 | message("Warning: num_paths only valid for method 'ICA' in reduceDimension()") 567 | } 568 | cc_ordering <- monocle:::extract_ddrtree_ordering(cds, root_cell) 569 | pData(cds)$Pseudotime <- cc_ordering[row.names(pData(cds)), 570 | ]$pseudo_time 571 | pData(cds)$State <- cc_ordering[row.names(pData(cds)), 572 | ]$cell_state 573 | mst_branch_nodes <- V(minSpanningTree(cds))[which(degree(minSpanningTree(cds)) > 574 | 2)]$name 575 | } 576 | cds@auxOrderingData[[cds@dim_reduce_type]]$branch_points <- mst_branch_nodes 577 | cds 578 | } 579 | 580 | ############################ 581 | ### Public methods below ### 582 | ############################ 583 | 584 | #' @title Create 'Phemd' object 585 | #' @description Wrapper function to create 'Phemd' object containing raw expression data and metadata 586 | #' @details Note that each element in list can have different number of rows (i.e. number of cells in each sample can vary). 587 | #' @param data List of length \var{num_samples} containing expression data; each element is of size \var{num_cells} x \var{num_markers}. Alternately a SingleCellExperiment object. 588 | #' @param markers Vector containing marker names (i.e. column names of \code{all_data}) 589 | #' @param snames Vector containing sample names (i.e. names of samples contained in \code{all_data}) 590 | #' @param datatype Either "list" or "sce" (SingleCellExperiment with genes x cells) 591 | #' @param valtype Type of assay data (i.e. "counts", "normcounts", "logcounts", "tpm", "cpm") if datatype is "sce" 592 | #' @return 'Phemd' object containing raw multi-sample expression data and associated metadata 593 | #' @examples 594 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 595 | #' 596 | createDataObj <- function(data, markers, snames, datatype='list', valtype='counts') { 597 | if(datatype == 'list') { 598 | all_data <- data 599 | } else if(datatype == 'sce') { 600 | if(valtype %in% names(assays(data))) { 601 | stop(sprintf('Error: %s not found in input SingleCellExperiment', valtype)) 602 | } 603 | all_data <- t(assay(data, valtype)) # generally SCE objects are genes x cells; transpose 604 | } else { 605 | stop('Error: Input datatype must be either "list" or "sce" (SingleCellExperiment)') 606 | } 607 | 608 | nsample <- length(all_data) 609 | if(nsample == 0) stop('all_data is empty (length=0)') 610 | stopifnot(nsample == length(snames)) 611 | if(nsample > 0) { 612 | stopifnot(ncol(all_data[[1]]) == length(markers)) 613 | } 614 | 615 | # Error-checking to ensure that all samples in data list have same number of markers 616 | nmarker_vec <- rep(0, nsample) 617 | for(i in seq_len(nsample)) { 618 | nmarker_vec[i] <- ncol(all_data[[i]]) 619 | } 620 | if(sum(nmarker_vec - nmarker_vec[1]) != 0) { 621 | stop(sprintf("Error: Sample %d has a different number of columns than Sample 1", which(nmarker_vec-nmarker_vec[1] != 0))) 622 | } 623 | data_obj <- new('Phemd', data = all_data, markers = markers, snames = snames, monocle_obj=NULL, seurat_obj=NULL, version=packageVersion(pkg = 'Seurat')) 624 | return(data_obj) 625 | } 626 | 627 | 628 | #' @title Attach 'Seurat' object to 'Phemd' object 629 | #' @description Allows user to attach batch-normalized reference cell data from Seurat into 'Phemd' object containing raw expression data and metadata 630 | #' @param phemd_obj Phemd object initialized using createDataObj 631 | #' @param seurat_obj S4 'seurat' object containing batch-normalized reference cell data 632 | #' @param batch.colname Name of column in Seurat object that denotes batch ID 633 | #' @return 'Phemd' object containing with attached Seurat object 634 | #' @examples 635 | #' 636 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 637 | #' my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A") 638 | #' my_seuratObj <- Seurat::FindVariableFeatures(object = my_seuratObj) 639 | #' my_seuratObj <- Seurat::ScaleData(object = my_seuratObj, do.scale=FALSE, do.center=FALSE) 640 | #' my_seuratObj <- Seurat::RunPCA(object = my_seuratObj, pc.genes = colnames(all_expn_data[[1]]), do.print = FALSE) 641 | #' my_seuratObj <- Seurat::FindNeighbors(my_seuratObj, reduction = "pca", dims.use = 1:10) 642 | #' my_seuratObj <- Seurat::FindClusters(my_seuratObj, resolution = 0.6, print.output = 0, save.SNN = TRUE) 643 | #' my_phemdObj <- bindSeuratObj(my_phemdObj, my_seuratObj) 644 | #' 645 | bindSeuratObj <- function(phemd_obj, seurat_obj, batch.colname='plt') { 646 | stopifnot(is(seurat_obj,'Seurat')) 647 | # ensure cluster names are 1-indexed 648 | if(min(as.numeric(as.character(Idents(seurat_obj)))) == 0) { 649 | label_names <- levels(Idents(seurat_obj)) 650 | labels_renumbered <- factor(as.numeric(as.character(Idents(seurat_obj))) +1) 651 | #levels(labels_renumbered) <- label_names 652 | Idents(seurat_obj) <- labels_renumbered 653 | } 654 | if(batch.colname != 'plt') { 655 | seurat_obj@meta.data$plt <- seurat_obj@meta.data[[batch.colname]] 656 | } 657 | # assign batch ID of each cell as project ID defined upon Seurat obj initialization 658 | if(!'plt' %in% colnames(seurat_obj@meta.data)) { 659 | seurat_obj@meta.data$plt <- as.character(seurat_obj@meta.data$orig.ident) 660 | } 661 | seuratInfo(phemd_obj) <- seurat_obj 662 | 663 | return(phemd_obj) 664 | } 665 | 666 | #' @title Remove samples with too few cells 667 | #' @description Removes samples from Phemd that have fewer cells than \code{min_sz} 668 | #' @details Note: If used, this function must be called before (and not after) the \code{aggregateSamples} function is called 669 | #' @param obj 'Phemd' object containing raw expression data and associated metadata 670 | #' @param min_sz Minimum number of cells in each sample to be retained 671 | #' @return 'Phemd' object containing raw multi-sample expression data and associated metadata (same as input minus removed samples) 672 | #' @examples 673 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 674 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) #removes samples with fewer than 10 cells 675 | #' 676 | removeTinySamples <- function(obj, min_sz=20) { 677 | stopifnot(is(obj,'Phemd')) 678 | stopifnot(mode(min_sz) == 'numeric') 679 | all_data <- rawExpn(obj) 680 | all_snames <- sNames(obj) 681 | all_sample_sz <- getSampleSizes(all_data) 682 | to_remove_idx <- which(all_sample_sz < min_sz) 683 | if(length(to_remove_idx) == 0) return(obj) 684 | 685 | to_remove_idx <- to_remove_idx[order(to_remove_idx, decreasing=TRUE)] #remove from end to front 686 | for(i in to_remove_idx) { 687 | print(sprintf('%s removed because only contains %d cells', all_snames[i], all_sample_sz[i])) 688 | all_data[[i]] <- NULL 689 | } 690 | all_snames <- all_snames[-to_remove_idx] 691 | 692 | obj@data <- all_data 693 | obj@snames <- all_snames 694 | validObject(obj) 695 | return(obj) 696 | } 697 | 698 | #' @title Aggregate expression data from all samples 699 | #' @description Takes initial Phemd object and returns object with additional data frame in slot @@data_aggregate containing cells aggregated from all samples (to be used for further analyses e.g. Monocle 2 trajectory building / pseudotime mapping / cell clustering) 700 | #' @details Subsamples cells as necessary based on \code{max_cells}. If subsampling is performed, an equal number of cells are subsampled from each sample 701 | #' @param obj 'Phemd' object containing raw expression data and associated metadata 702 | #' @param max_cells Maximum number of cells across all samples to be included in final matrix on which Monocle 2 will be run 703 | #' @return Same as input 'Phemd' object with additional slot 'data_aggregate' containing aggregated expression data (num_markers x num_cells) 704 | #' @examples 705 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 706 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 707 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 708 | #' 709 | aggregateSamples <- function(obj, max_cells=12000) { 710 | stopifnot(is(obj, 'Phemd')) 711 | stopifnot(mode(max_cells) == 'numeric') 712 | 713 | all_data <- rawExpn(obj) 714 | nsample <- length(all_data) 715 | if(nsample == 0) return(obj) 716 | 717 | subsample_sz <- floor(max_cells/nsample) 718 | all_aggregate_data <- matrix(nrow=0,ncol=ncol(all_data[[1]])) 719 | all_subsample_idx <- list() 720 | subsample_bool = FALSE 721 | 722 | all_sample_sz <- getSampleSizes(all_data) 723 | if(sum(all_sample_sz) > max_cells) subsample_bool = TRUE 724 | 725 | for(i in seq_len(nsample)) { 726 | cur_data <- all_data[[i]] 727 | # take all cells unless total cells across all samples > max_cells 728 | if(subsample_bool) { 729 | cur_subsample_idx <- sample(seq_len(nrow(cur_data)), min(subsample_sz, nrow(cur_data))) 730 | all_subsample_idx[[i]] <- cur_subsample_idx 731 | all_aggregate_data <- rbind(all_aggregate_data, cur_data[cur_subsample_idx,]) 732 | } else { 733 | all_aggregate_data <- rbind(all_aggregate_data, cur_data) 734 | } 735 | } 736 | all_aggregate_data <- t(all_aggregate_data) #rows = markers, cols = cells 737 | colnames(all_aggregate_data) <- seq_len(ncol(all_aggregate_data)) 738 | rownames(all_aggregate_data) <- selectMarkers(obj) 739 | 740 | pooledCells(obj) <- as.matrix(all_aggregate_data) 741 | subsampledIdx(obj) <- all_subsample_idx 742 | subsampledBool(obj) <- subsample_bool 743 | return(obj) 744 | } 745 | 746 | #' @title Perform feature selection on aggregated data 747 | #' @description Takes as input a Phemd object with aggregated data and returns updated object after performing feature selection on aggregated data 748 | #' @details \code{aggregateSamples} needs to be called before running this function 749 | #' @param obj 'Phemd' object containing aggregated data 750 | #' @param selected_genes Vector containing names of genes to use for downstream analyses 751 | #' @return Same as input 'Phemd' object after performing feature-selection based dimensionality reduction on aggregated expression data 752 | #' @examples 753 | #' 754 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 755 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 756 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 757 | #' my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes=c('TP53', 758 | #' 'EGFR', 'KRAS', 'FOXP3', 'LAG3')) 759 | #' 760 | selectFeatures <- function(obj, selected_genes) { 761 | if(isempty(pooledCells(obj))) stop('slot "data_aggregate" is empty; please call aggregateSamples() before running selectFeatures()') 762 | all_aggregate_data <- pooledCells(obj) 763 | all_genes <- selectMarkers(obj) 764 | selected_gene_map <- match(selected_genes, all_genes) 765 | #TODO: print genes in selected_genes that were unable to map to all_genes 766 | if(sum(!is.na(selected_gene_map)) == 0) { 767 | stop('None of the genes in "selected_genes" were found. Aborting feature selection.') 768 | } 769 | 770 | all_aggregate_data <- all_aggregate_data[selected_gene_map,] 771 | obj@data_aggregate <- all_aggregate_data #set slots manually due to validity constraints 772 | obj@markers <- all_genes[selected_gene_map] 773 | validObject(obj) 774 | return(obj) 775 | } 776 | 777 | #' @title Generate cell-state embedding 778 | #' @description Takes as input a Phemd object with aggregated data and returns updated object containing cell-state embedding 779 | #' @details \code{aggregateSamples} needs to be called before running this function. 780 | #' @param obj 'Phemd' object containing aggregated data 781 | #' @param cell_model Method to use to generate cell-state embedding. Currently supports "phate" and "monocle2". If using the Seurat to model the cell-state space, please identify cell subtypes as outlined in the Seurat software package and then use the \code{bindSeuratObj} function. 782 | #' @param data_model Only relevant if cell_model = "monocle2". One of the following: 'negbinomial_sz', 'negbinomial', 'tobit', 'uninormal', 'gaussianff'. See "Family Function" table at the following link for more details on selecting the proper one. \url{http://cole-trapnell-lab.github.io/monocle-release/docs/#getting-started-with-monocle} 783 | #' @param phate_ncluster Only relevant if cell_model = "phate". Number of cell state clusters to return when using PHATE 784 | #' @param phate_cluster_seed Only relevant if cell_model = "phate". Seed to use when performing cell state clustering (optional) 785 | #' @param ... Additional parameters to be passed to \code{reduceDimension} function for Monocle or \code{phate} function for PHATE 786 | #' @return Same as input 'Phemd' object containing additional cell-state embedding object 787 | #' @examples 788 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 789 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 790 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 791 | #' my_phemdObj_lg <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2) 792 | embedCells <- function(obj, cell_model=c('monocle2', 'seurat', 'phate'), data_model = 'negbinomial_sz', phate_ncluster=8, phate_cluster_seed=NULL, ...) { 793 | extra_args <- list(...) 794 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 795 | 796 | if(isempty(pooledCells(obj))) stop('slot "data_aggregate" is empty; please call aggregateSamples() before running embedCells()') 797 | mydata <- pooledCells(obj) 798 | if(is.null(mydata)) stop("Error: call 'aggregateSamples' function first ") 799 | 800 | if(cell_model == 'monocle2') { 801 | myFeatureData <- as.data.frame(selectMarkers(obj)) 802 | fd <- new("AnnotatedDataFrame", data=myFeatureData) 803 | rownames(fd) <- selectMarkers(obj) 804 | 805 | if(is.null(data_model)) { 806 | print('Assuming data fit negative binomial pattern of expression...') 807 | data_model <- 'negbinomial_sz' 808 | } 809 | 810 | if(data_model == 'negbinomial_sz') { 811 | expression_fam_fn <- VGAM::negbinomial.size() 812 | } else if(data_model == 'negbinomial') { 813 | expression_fam_fn <- VGAM::negbinomial() 814 | } else if(data_model == 'tobit') { 815 | expression_fam_fn <- VGAM::tobit() 816 | } else if(data_model == 'uninormal') { 817 | expression_fam_fn <- VGAM::uninormal() 818 | } else if(data_model == 'gaussianff') { 819 | #expression_fam_fn <- VGAM::gaussianff() 820 | expression_fam_fn <- gaussianffLocal() 821 | } else { 822 | stop("Error: Invalid data_model specified") 823 | } 824 | 825 | # Helpful to use ncenter and sigma that's higher than default in reduceDimension for datasets w/ relatively more cells 826 | if(!('ncenter' %in% names(extra_args)) && ncol(mydata) > 3000) { 827 | extra_args['ncenter'] <- 750 828 | if(!('sigma' %in% names(extra_args))) extra_args['sigma'] <- 0.03 829 | } 830 | 831 | if(!('maxIter' %in% names(extra_args))) { 832 | extra_args['maxIter'] <- 12 #set maximum number of iterations to 12 833 | } 834 | 835 | if(!('max_components' %in% names(extra_args))) { 836 | extra_args['max_components'] <- 2 #set number of dimensionality-reduced components to 2 837 | } 838 | 839 | monocle_obj <- newCellDataSetUpdated(mydata,phenoData=NULL,featureData=fd, 840 | expressionFamily=expression_fam_fn) 841 | varLabels(featureData(monocle_obj)) <- 'gene_short_name' #random formatting requirement for monocle 842 | 843 | if(data_model == 'negbinomial_sz') { 844 | monocle_obj <- estimateSizeFactors(monocle_obj) #Only needed for negbinomial data; also note that with R 4.0 class(mymatrix) returns ["matrix", "array"] so isSparseMatrix doesn't return scalar and hence is broken 845 | monocle_obj <- estimateDispersions(monocle_obj) 846 | } 847 | if(data_model == 'gaussianff') { 848 | extra_args['norm_method'] <- 'none' 849 | extra_args['scaling'] <- 'FALSE' 850 | } 851 | 852 | rd_args <- c(list(cds=monocle_obj, reduction_method='DDRTree'), 853 | extra_args[names(extra_args) %in% c("verbose", "ncenter", "norm_method", 'scaling', 'pseudo_expr', "initial_method", "maxIter", "sigma", "lambda", "param.gamma", "tol", "max_components")]) 854 | monocle_obj_red <- do.call(reduceDimension, rd_args) 855 | 856 | monocleInfo(obj) <- monocle_obj_red 857 | } else if(cell_model == 'phate') { 858 | # Perform phate embedding and generate cell state labels 859 | rd_args <- c(list(data=t(mydata)), 860 | extra_args) 861 | phate_obj <- do.call(phate, rd_args) 862 | state_labels <- cluster_phate(phate_obj, phate_ncluster, phate_cluster_seed) 863 | if(min(state_labels) <= 0) { 864 | # Ensure numeric cell state labels are 1-indexed 865 | state_labels <- state_labels - min(state_labels) + 1 866 | } 867 | phate_obj$cellstate.labels <- state_labels 868 | class(phate_obj) <- 'list' 869 | phateInfo(obj) <- phate_obj 870 | } 871 | return(obj) 872 | } 873 | 874 | #' @title Compute Monocle2 cell state and pseudotime assignments 875 | #' @description Takes as input a Phemd object with Monocle2 object and returns updated object with Monocle2 object containing cell state and pseudotime assignments 876 | #' @details Wrapper function for \code{orderCells} in Monocle 2 package. \code{embedCells} needs to be called before calling this function. 877 | #' @param obj 'Phemd' object containing Monocle2 object initialized using embedCells 878 | #' @param ... Additional parameters to be passed into \code{orderCells} function 879 | #' @return Same as input 'Phemd' object with updated cell-state embedding object containing cell state assignments 880 | #' @examples 881 | #' 882 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 883 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 884 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 885 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model='gaussianff', sigma=0.02, maxIter=2) 886 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 887 | orderCellsMonocle <- function(obj, ...) { 888 | monocle_obj <- monocleInfo(obj) 889 | if(ncol(monocle_obj) == 0) stop('slot "monocle_obj" is empty; please call embedCells() before running orderCellsMonocle()') 890 | 891 | extra_args <- list(...) 892 | oc_args <- c(list(cds=monocle_obj), 893 | extra_args[names(extra_args) %in% c("root_state", "reverse")]) 894 | 895 | monocle_obj_ordered <- do.call(orderCellsUpdated, oc_args) #updated from Monocle2 to be compatible with R4.2 896 | monocleInfo(obj) <- monocle_obj_ordered 897 | return(obj) 898 | } 899 | 900 | #' @title Computes cell subtype abundances for each sample 901 | #' @description Takes as input a Phemd object with all single-cell expression data of all single-cell samples in @@data slot and cell-state embedding generated by embedCells. Returns updated object with cell subtype frequencies of each sample that may be retrieved by the 'celltypeFreqs' accessor function. 902 | #' @details \code{embedCells} (and \code{orderCellsMonocle} if using the Monocle2 embedding technique) needs to be called before calling this function. 903 | #' @param obj 'Phemd' object containing single-cell expression data of all samples in @@data slot and cell-state embedding object generated and stored using the embedCells function. 904 | #' @param verbose Boolean that determines whether progress (sequential processing of samples) should be printed. FALSE by default 905 | #' @param cell_model Either "monocle2", "seurat", or "phate" depending on method used to model cell state space 906 | #' @return 'Phemd' object with cell subtype frequencies of each sample that can be retrieved using the 'celltypeFreqs' accessor function 907 | #' @examples 908 | #' 909 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 910 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 911 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 912 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 913 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 914 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 915 | #' 916 | clusterIndividualSamples <- function(obj, verbose=FALSE, cell_model=c('monocle2', 'seurat', 'phate')) { 917 | stopifnot(is(obj,'Phemd')) 918 | all_data <- rawExpn(obj) 919 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 920 | if(cell_model %in% c('monocle2', 'phate')) { 921 | if(cell_model == 'monocle2') { 922 | # Use Monocle2 embedding and clusters 923 | monocle_obj <- monocleInfo(obj) 924 | if(ncol(monocle_obj) == 0) stop('slot "monocle_obj" is empty; please call embedCells() and orderCellsMonocle() before calling this function') 925 | 926 | # Extract state labels from monocle data object 927 | labels <- pData(phenoData(monocle_obj)) 928 | if(!('State' %in% names(labels))) stop('monocleInfo(obj) does not have cell state assignments; please call embedCells() and orderCellsMonocle() before calling this function') 929 | state_labels <- as.numeric(labels$State) 930 | # retrieve reference clusters 931 | ref_clusters <- retrieveRefClusters(obj, cell_model='monocle2', expn_type='raw') 932 | refcells_expn <- t(exprs(monocle_obj)) 933 | } else { 934 | # Use PHATE embedding and clusters 935 | phate_obj <- phateInfo(obj) 936 | if(length(phate_obj) == 0) { 937 | stop('slot "phate_obj" is empty; please call embedCells() and clusterCellsPHATE() before calling this function') 938 | } 939 | if(!('cellstate.labels' %in% names(phate_obj))) { 940 | stop('phateInfo(obj) does not have cell state assignments; please call embedCells() and clusterCellsPHATE() before calling this function') 941 | } 942 | state_labels <- phate_obj$cellstate.labels 943 | # retrieve reference clusters 944 | ref_clusters <- retrieveRefClusters(obj, cell_model='phate', expn_type='raw') 945 | refcells_expn <- t(pooledCells(obj)) 946 | } 947 | nclusters <- length(ref_clusters) 948 | cluster.ids <- names(ref_clusters) 949 | 950 | if(!subsampledBool(obj)) { 951 | ## subsampling not performed; all cells assigned to clusters as-is 952 | 953 | cluster_weights <- matrix(0, nrow=length(all_data), ncol = nclusters) 954 | 955 | start_idx <- 1 956 | for(i in seq_len(length(all_data))) { 957 | cur_sample_sz <- nrow(all_data[[i]]) 958 | end_idx <- start_idx + cur_sample_sz - 1 959 | sample_labels <- state_labels[start_idx:end_idx] #cell subtype assignments for current sample 960 | cur_hist <- rep(0, nclusters) 961 | for(j in seq_len(nclusters)) { 962 | cur_hist[j] <- sum(sample_labels == cluster.ids[j]) 963 | } 964 | cur_hist <- cur_hist / sum(cur_hist) 965 | cluster_weights[i,] <- cur_hist 966 | start_idx <- end_idx + 1 967 | } 968 | colnames(cluster_weights) <- paste('C-', cluster.ids, sep='') 969 | celltypeFreqs(obj) <- cluster_weights 970 | } else { 971 | ## subsampling performed; need to assign cells to cluster of nearest cell in embedding 972 | refcluster_sizes <- rep(0, length(ref_clusters)) 973 | counter1 <- 0 974 | for(i in seq_len(nclusters)) { 975 | refcluster_sizes[i] <- nrow(ref_clusters[[i]]) 976 | counter1 <- counter1 + nrow(ref_clusters[[i]]) 977 | } 978 | print(refcluster_sizes / counter1) 979 | 980 | cluster_weights <- matrix(0, nrow=length(all_data), ncol=nclusters) 981 | for(i in seq_len(length(all_data))) { 982 | cur_data <- all_data[[i]] 983 | cur_ncells <- nrow(cur_data) 984 | cur_hist <- rep(0, nclusters) 985 | 986 | if(verbose && i %% 10 == 0) { 987 | print(sprintf('Processing sample number: %d', i)) 988 | } 989 | 990 | # Use nearest-cell mapping instead of nearest-centroid mapping 991 | cur_cell_labels <- assignCellClusterNearestNode(cur_data, 992 | refcells_expn, 993 | state_labels, 994 | cell_model=cell_model) 995 | 996 | for(j in seq_len(nclusters)) { 997 | cur_hist[j] <- sum(cur_cell_labels == cluster.ids[j]) 998 | } 999 | cur_hist <- cur_hist / sum(cur_hist) 1000 | cluster_weights[i,] <- cur_hist 1001 | } 1002 | print(colMeans(cluster_weights)) 1003 | colnames(cluster_weights) <- paste('C-', cluster.ids, sep='') 1004 | 1005 | celltypeFreqs(obj) <- cluster_weights 1006 | } 1007 | } else if(cell_model == 'seurat') { 1008 | seurat_obj <- seuratInfo(obj) 1009 | # retrieve reference clusters for starting estimate of cluster sizes 1010 | ref_clusters <- retrieveRefClusters(obj, cell_model='seurat', 1011 | expn_type = 'raw') 1012 | nclusters <- length(ref_clusters) 1013 | ## subsampling performed; assign cells to same cluster as nearest cell 1014 | # in embedding 1015 | refcluster_sizes <- rep(0, length(ref_clusters)) 1016 | counter1 <- 0 1017 | for(i in seq_len(nclusters)) { 1018 | refcluster_sizes[i] <- nrow(ref_clusters[[i]]) 1019 | counter1 <- counter1 + nrow(ref_clusters[[i]]) 1020 | } 1021 | print(refcluster_sizes / counter1) 1022 | 1023 | cluster_weights <- matrix(0, nrow=length(all_data), ncol = nclusters) 1024 | sample_batches <- batchIDs(obj) 1025 | for(i in seq_len(length(all_data))) { 1026 | cur_data <- all_data[[i]] 1027 | cur_plt <- sample_batches[i] 1028 | if(verbose && i %% 10 == 0) { 1029 | print(sprintf('Processing sample number: %d', i)) 1030 | } 1031 | 1032 | cur_hist <- rep(0, nclusters) 1033 | state_labels <- as.numeric(as.character(Idents(seurat_obj))) 1034 | names(state_labels) <- rownames(seurat_obj@meta.data) 1035 | ref_data <- t(as.matrix(GetAssayData(seurat_obj, assay='RNA', 1036 | slot='counts'))) 1037 | cell_idx_curplt <- which(seurat_obj@meta.data$plt == cur_plt) 1038 | if(length(cell_idx_curplt) == 0) { 1039 | stop(sprintf('Error: no cells in reference set match the experiment_id %s of sample %d', cur_plt, i)) 1040 | } 1041 | ref_data <- ref_data[cell_idx_curplt,] 1042 | state_labels <- state_labels[cell_idx_curplt] 1043 | 1044 | # Use nearest-cell mapping 1045 | cur_cell_labels <- assignCellClusterNearestNode(cur_data, 1046 | ref_data, 1047 | state_labels, 1048 | cell_model=cell_model) 1049 | 1050 | for(j in seq_len(nclusters)) { 1051 | cur_hist[j] <- sum(cur_cell_labels == j) 1052 | } 1053 | cur_hist <- cur_hist / sum(cur_hist); 1054 | cluster_weights[i,] <- cur_hist 1055 | } 1056 | print(colMeans(cluster_weights)) 1057 | 1058 | celltypeFreqs(obj) <- cluster_weights 1059 | } else { 1060 | stop('Error: cell_model must be either "monocle2", "seurat", or "phate"') 1061 | } 1062 | 1063 | return(obj) 1064 | } 1065 | 1066 | 1067 | #' @title Computes ground distance matrix based on cell embedding 1068 | #' @description Takes as input a Phemd object containing cell-state embedding object. Returns updated object with ground distance matrix representing pairwise distances between distinct cell subtypes based on cell state embedding. 1069 | #' @details \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Requires 'igraph' package 1070 | #' @param obj 'Phemd' object containing cell-state embedding object 1071 | #' @param cell_model Method by which cell state was modeled (either "monocle2", "seurat", or "phate") 1072 | #' @param expn_type Data type to use to determine cell-type dissimilarities 1073 | #' @param ndim Number of embedding dimensions to be used for computing cell-type dissimilarity (optional) 1074 | #' @return Phemd object with ground distance matrix (to be used in EMD computation) in @@data_cluster_weights slot 1075 | #' @examples 1076 | #' 1077 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 1078 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 1079 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 1080 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 1081 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 1082 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 1083 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 1084 | #' 1085 | generateGDM <- function(obj, cell_model=c('monocle2', 'seurat', 'phate'), expn_type='reduced', ndim=8) { 1086 | stopifnot(is(obj,'Phemd')) 1087 | 1088 | cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate')) 1089 | if(cell_model == 'monocle2') { 1090 | monocle_obj <- monocleInfo(obj) 1091 | # retrieve reference clusters 1092 | ref_clusters <- retrieveRefClusters(obj, cell_model='monocle2', expn_type=expn_type) 1093 | nclusters <- length(ref_clusters) 1094 | 1095 | # get graph underlying Monocle tree 1096 | mst_graph <- monocle_obj@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_tree 1097 | centroids <- identifyCentroids(ref_clusters) 1098 | pseudotimes <- pData(phenoData(monocle_obj))$Pseudotime 1099 | emd_dists <- matrix(0, nrow=nclusters, ncol=nclusters) 1100 | for(i in seq_len(nclusters)) { 1101 | for(j in seq_len(nclusters)) { 1102 | if(i == j) next 1103 | path_btwn_cells <- shortest_paths(mst_graph, centroids[[i]], centroids[[j]])$vpath[[1]] #cell1 and cell2 should be strings representing vertices in mst 1104 | pseudotime_curpath <- pseudotimes[path_btwn_cells] 1105 | pseudotime_dist <- abs(pseudotimes[as.numeric(centroids[[i]])] - min(pseudotime_curpath)) + abs(pseudotimes[as.numeric(centroids[[j]])] - min(pseudotime_curpath)) 1106 | emd_dists[i,j] <- pseudotime_dist 1107 | } 1108 | } 1109 | GDM(obj) <- emd_dists 1110 | } else if(cell_model %in% c('seurat', 'phate')) { 1111 | ref_clusters <- retrieveRefClusters(obj, cell_model=cell_model, expn_type=expn_type, ndim=ndim) 1112 | centroids <- getArithmeticCentroids(ref_clusters) 1113 | GDM(obj) <- as.matrix(dist(centroids)) 1114 | } else { 1115 | stop('Error: cell_model must be either "monocle2", "seurat", or "phate"') 1116 | } 1117 | return(obj) 1118 | } 1119 | 1120 | #' @title Computes EMD distance matrix representing pairwise dissimilarity between samples 1121 | #' @description Takes as input a Phemd object with cell subtype relative frequencies for each sample in @@data_cluster_weights slot and ground distance matrix (representing cell subtype pairwise dissimilarity) in @@emd_dist_mat slot. Returns distance matrix representing pairwise dissimilarity between samples 1122 | #' @details Requires 'transport' and 'pracma' packages 1123 | #' @param obj 'Phemd' object containing cell subtype relative frequencies for each sample in @@data_cluster_weights slot and ground distance matrix (representing cell subtype dissimilarity) in @@emd_dist_mat slot 1124 | #' @return Distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples 1125 | #' @examples 1126 | #' 1127 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 1128 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 1129 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 1130 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2) 1131 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 1132 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 1133 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 1134 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 1135 | #' 1136 | compareSamples <- function(obj) { 1137 | stopifnot(is(obj,'Phemd')) 1138 | cluster_weights <- celltypeFreqs(obj) 1139 | emd_dists <- GDM(obj) 1140 | # generate inhibitor distance matrix 1141 | Y <- rep(0, (nrow(cluster_weights)-1)*nrow(cluster_weights)/2) 1142 | counter <- 1 1143 | for(i in seq_len(nrow(cluster_weights))){ 1144 | cur_f1_weights <- cluster_weights[i,] 1145 | for(j in (i+1):nrow(cluster_weights)) { 1146 | if(j > nrow(cluster_weights)) break #this doesn't automatically happen in R 1147 | cur_f2_weights <- cluster_weights[j,] 1148 | # Compute EMD 1149 | flow <- transport(cur_f1_weights, cur_f2_weights, emd_dists, 1150 | method='primaldual') 1151 | curdist <- 0 1152 | for(k in seq_len(nrow(flow))) { 1153 | cur_penalty <- emd_dists[flow[k,1], flow[k,2]] 1154 | curdist <- curdist+cur_penalty*flow[k,3] 1155 | } 1156 | Y[counter] <- curdist 1157 | counter <- counter + 1 1158 | } 1159 | } 1160 | Y_sq <- squareform(Y) 1161 | return(Y_sq) 1162 | } 1163 | 1164 | #' @title Performs community detection on sample-sample distance matrix to identify groups of similar samples 1165 | #' @description Takes sample-sample distance matrix as input and returns group assignments for each sample 1166 | #' @details By default, uses 'kgs' (Kelley-Gardner-Sutcliffe) method for determining optimal number of groups. Alternatively, can take user-specified number of groups). Requires 'cluster' and 'maptree' packages. 1167 | #' @param distmat A distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples 1168 | #' @param distfun Method of partitioning network of samples (currently either 'hclust' or 'pam') 1169 | #' @param ncluster Optional parameter specifying total number of sample groups 1170 | #' @param method Optional parameter for hierarchical clustering (see "hclust" documentation) 1171 | #' @param ... Optional additional parameters to be passed to diffusionKmeans method 1172 | #' @return Vector containing group assignments for each sample (same order as row-order of distmat) based on user-specified partitioning method (e.g. hierarchical clustering) 1173 | #' @examples 1174 | #' 1175 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data)) 1176 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) 1177 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000) 1178 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model = 'monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2) 1179 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle) 1180 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle) 1181 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final) 1182 | #' my_EMD_mat <- compareSamples(my_phemdObj_final) 1183 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4) 1184 | #' 1185 | groupSamples <- function(distmat, distfun = 'hclust', ncluster=NULL, method='complete', ...) { 1186 | ## Clustering on distance matrix 1187 | if(nrow(distmat) != ncol(distmat)) { 1188 | stop('Error: distmat must be a square distance matrix of dimension num_samples x num_samples') 1189 | } 1190 | if(distfun == 'hclust') { 1191 | cluster_results <- hclust(as.dist(distmat), method=method) 1192 | if(is.null(ncluster)) { 1193 | # kgs method for determining optimal number of clusters 1194 | op_k <- kgs(cluster_results, as.dist(distmat), maxclust = 15) 1195 | ncluster <- op_k[which(op_k == min(op_k))] 1196 | } 1197 | cluster_assignments <- cutree(cluster_results, k=ncluster) 1198 | } else if(distfun == 'pam') { 1199 | if(is.null(ncluster)) ncluster <- 4 1200 | cluster_results <- pam(distmat, ncluster, diss=TRUE) 1201 | cluster_assignments <- cluster_results$clustering 1202 | } else { 1203 | stop("Error: Please specify distfun as either 'hclust' or 'pam'") 1204 | } 1205 | return(cluster_assignments) 1206 | } 1207 | 1208 | 1209 | 1210 | 1211 | 1212 | --------------------------------------------------------------------------------