├── .Rbuildignore
├── .gitignore
├── data
    ├── all_genes.RData
    ├── snames_data.RData
    ├── all_expn_data.RData
    ├── heatmap_genes.RData
    └── selected_genes.RData
├── phemd_1.15.0.tar.gz
├── vignettes
    ├── melanomaData.RData
    └── phemd.Rmd
├── R
    ├── zzz.R
    ├── functions-getResults.R
    ├── Phemd-methods.R
    ├── functions-plotting.R
    └── functions-core.R
├── phemd.Rproj
├── man
    ├── phateInfo.Rd
    ├── GDM.Rd
    ├── monocleInfo.Rd
    ├── seuratInfo.Rd
    ├── rawExpn.Rd
    ├── batchIDs.Rd
    ├── sNames.Rd
    ├── all_genes.Rd
    ├── selectMarkers.Rd
    ├── heatmap_genes.Rd
    ├── snames_data.Rd
    ├── celltypeFreqs.Rd
    ├── all_expn_data.Rd
    ├── pooledCells.Rd
    ├── subsampledIdx.Rd
    ├── subsampledBool.Rd
    ├── getSampleSizes.Rd
    ├── drawColnames45.Rd
    ├── selected_genes.Rd
    ├── identifyCentroids.Rd
    ├── getArithmeticCentroids.Rd
    ├── removeTinySamples.Rd
    ├── selectFeatures.Rd
    ├── gaussianffLocal.Rd
    ├── aggregateSamples.Rd
    ├── orderCellsMonocle.Rd
    ├── createDataObj.Rd
    ├── retrieveRefClusters.Rd
    ├── getCellYield.Rd
    ├── assignCellClusterNearestNode.Rd
    ├── bindSeuratObj.Rd
    ├── plotSummaryHistograms.Rd
    ├── compareSamples.Rd
    ├── getSampleCelltypeFreqs.Rd
    ├── plotCellYield.Rd
    ├── Phemd.Rd
    ├── generateGDM.Rd
    ├── printClusterAssignments.Rd
    ├── getSampleHistsByCluster.Rd
    ├── plotEmbeddings.Rd
    ├── clusterIndividualSamples.Rd
    ├── plotHeatmaps.Rd
    ├── groupSamples.Rd
    ├── embedCells.Rd
    ├── plotGroupedSamplesDmap.Rd
    └── Phemd-methods.Rd
├── README.md
├── DESCRIPTION
├── LICENSE.md
└── NAMESPACE


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | .DS_Store
7 | 


--------------------------------------------------------------------------------
/data/all_genes.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/all_genes.RData


--------------------------------------------------------------------------------
/phemd_1.15.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/phemd_1.15.0.tar.gz


--------------------------------------------------------------------------------
/data/snames_data.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/snames_data.RData


--------------------------------------------------------------------------------
/data/all_expn_data.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/all_expn_data.RData


--------------------------------------------------------------------------------
/data/heatmap_genes.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/heatmap_genes.RData


--------------------------------------------------------------------------------
/data/selected_genes.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/data/selected_genes.RData


--------------------------------------------------------------------------------
/vignettes/melanomaData.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KrishnaswamyLab/phemd/HEAD/vignettes/melanomaData.RData


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | .onAttach <- function(libname, pkgname) {
2 |     msg <- sprintf(
3 |         "Package '%s' is deprecated and will be removed from Bioconductor
4 |          version %s", pkgname, "3.18")
5 |     .Deprecated(msg=paste(strwrap(msg, exdent=2), collapse="\n"))
6 | }
7 | 
8 | 


--------------------------------------------------------------------------------
/phemd.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | 


--------------------------------------------------------------------------------
/man/phateInfo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{phateInfo}
 4 | \alias{phateInfo}
 5 | \title{Accessor function for stored phate object}
 6 | \usage{
 7 | phateInfo(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{A Phemd object.}
11 | }
12 | \value{
13 | An object of class 'phate' (from phateR)
14 | }
15 | \description{
16 | Accessor function for stored phate object
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | phateobj <- phateInfo(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/GDM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{GDM}
 4 | \alias{GDM}
 5 | \title{Accessor function for EMD ground distance matrix}
 6 | \usage{
 7 | GDM(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{A Phemd object}
11 | }
12 | \value{
13 | Sqaure matrix representing pairwise distances between cell subtypes
14 | }
15 | \description{
16 | Accessor function for EMD ground distance matrix
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | gdm <- GDM(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/monocleInfo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{monocleInfo}
 4 | \alias{monocleInfo}
 5 | \title{Accessor function for stored Monocle object}
 6 | \usage{
 7 | monocleInfo(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{A Phemd object.}
11 | }
12 | \value{
13 | An object of class 'CellDataSet' (from Monocle)
14 | }
15 | \description{
16 | Accessor function for stored Monocle object
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | monocle_obj <- monocleInfo(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/seuratInfo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{seuratInfo}
 4 | \alias{seuratInfo}
 5 | \title{Accessor function for stored Seurat object within Phemd object}
 6 | \usage{
 7 | seuratInfo(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{A Phemd object.}
11 | }
12 | \value{
13 | An object of class 'Seurat'
14 | }
15 | \description{
16 | Accessor function for stored Seurat object within Phemd object
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | seurat_obj <- seuratInfo(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/rawExpn.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{rawExpn}
 4 | \alias{rawExpn}
 5 | \title{Accessor function for stored multi-sample raw expression data}
 6 | \usage{
 7 | rawExpn(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{A Phemd object.}
11 | }
12 | \value{
13 | List of matrices, each of which represents a single-cell sample
14 | }
15 | \description{
16 | Accessor function for stored multi-sample raw expression data
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | raw_expn_data <- rawExpn(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/batchIDs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{batchIDs}
 4 | \alias{batchIDs}
 5 | \title{Accessor function for batch ID for each sample}
 6 | \usage{
 7 | batchIDs(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | Vector of length num_samples representing the experiment (batch) in which the sample was profiled
14 | }
15 | \description{
16 | Accessor function for batch ID for each sample
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | batch_metadata <- batchIDs(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/sNames.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{sNames}
 4 | \alias{sNames}
 5 | \title{Accessor function for identifiers of all single-cell samples in experiment}
 6 | \usage{
 7 | sNames(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | Vector representing sample names corresponding to expression matrices
14 | }
15 | \description{
16 | Accessor function for identifiers of all single-cell samples in experiment
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | sampleIDs <- sNames(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/all_genes.Rd:
--------------------------------------------------------------------------------
 1 | \name{all_genes}
 2 | \alias{all_genes}
 3 | \docType{data}
 4 | \title{All genes included in (subsampled) melanoma single-cell RNA-seq expression data}
 5 | \description{This object contains 100 genes measured in melanoma single-cell RNA-seq expression data.}
 6 | \usage{
 7 |   data(all_genes)
 8 | }
 9 | \format{
10 |   Vector of length 100 representing row names of each matrix in melanoma expression dataset
11 | }
12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056}
13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)}
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/selectMarkers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{selectMarkers}
 4 | \alias{selectMarkers}
 5 | \title{Accessor function for gene/protein markers measured in experiment}
 6 | \usage{
 7 | selectMarkers(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | Vector representing gene/protein markers corresponding to expression matrices
14 | }
15 | \description{
16 | Accessor function for gene/protein markers measured in experiment
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | genes <- selectMarkers(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/heatmap_genes.Rd:
--------------------------------------------------------------------------------
 1 | \name{heatmap_genes}
 2 | \alias{heatmap_genes}
 3 | \docType{data}
 4 | \title{Genes to be used when plotting heatmap for melanoma single-cell RNA-seq expression data}
 5 | \description{This object contains genes to be used when plotting heatmap for melanoma single-cell RNA-seq expression data.}
 6 | \usage{
 7 |   data(heatmap_genes)
 8 | }
 9 | \format{
10 |   Vector of length 42 representing selected genes for plotting heatmap.
11 | }
12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056}
13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)}
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/snames_data.Rd:
--------------------------------------------------------------------------------
 1 | \name{snames_data}
 2 | \alias{snames_data}
 3 | \docType{data}
 4 | \title{Sample names for melanoma single-cell RNA-seq expression data}
 5 | \description{This object contains sample names corresponding to samples contained in melanoma expression data.}
 6 | \usage{
 7 |   data("snames_data")
 8 |   }
 9 | \format{
10 |   Vector of length 19 representing sample names corresponding to order of samples in all_expn_data in melanomaData dataset.
11 | }
12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056}
13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)}
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/celltypeFreqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{celltypeFreqs}
 4 | \alias{celltypeFreqs}
 5 | \title{Accessor function for cell subtype distribution for each sample}
 6 | \usage{
 7 | celltypeFreqs(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes)
14 | }
15 | \description{
16 | Accessor function for cell subtype distribution for each sample
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | celltype_weights <- celltypeFreqs(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/all_expn_data.Rd:
--------------------------------------------------------------------------------
 1 | \name{all_expn_data}
 2 | \alias{all_expn_data}
 3 | \docType{data}
 4 | \title{Single-cell RNA-seq expression data for melanoma samples}
 5 | \description{This dataset contains normalized single-cell RNA-seq expression data for 19 melanoma samples (immune cells).}
 6 | \usage{
 7 | data(all_expn_data)
 8 | }
 9 | \format{
10 |   A list of length 19 with each element representing a distinct sample. Each list element (sample) is a matrix with dimension num_genes x num_cells.
11 | }
12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056}
13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)}
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/pooledCells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{pooledCells}
 4 | \alias{pooledCells}
 5 | \title{Accessor function for aggregated cells used for cell subtype definition}
 6 | \usage{
 7 | pooledCells(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells)
14 | }
15 | \description{
16 | Accessor function for aggregated cells used for cell subtype definition
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | cells_aggregated <- pooledCells(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/subsampledIdx.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{subsampledIdx}
 4 | \alias{subsampledIdx}
 5 | \title{Accessor function for aggregated cells used for cell subtype definition}
 6 | \usage{
 7 | subsampledIdx(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | List of vectors each representing the indices of elements in rawExpn(obj) that were subsampled and combined to form "data_aggregate"
14 | }
15 | \description{
16 | Accessor function for aggregated cells used for cell subtype definition
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | subsampled_idx_list <- subsampledIdx(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/subsampledBool.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \name{subsampledBool}
 4 | \alias{subsampledBool}
 5 | \title{Accessor function for whether or not cells were subsampled when aggregated for cell subtype analysis}
 6 | \usage{
 7 | subsampledBool(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{Phemd object}
11 | }
12 | \value{
13 | Boolean represent whether or not subsampling was performed in the data aggregation process
14 | }
15 | \description{
16 | Accessor function for whether or not cells were subsampled when aggregated for cell subtype analysis
17 | }
18 | \examples{
19 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
20 | subsampled <- subsampledBool(phemdObj)
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/getSampleSizes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{getSampleSizes}
 4 | \alias{getSampleSizes}
 5 | \title{Retrieve single-cell sample sizes}
 6 | \usage{
 7 | getSampleSizes(data_list)
 8 | }
 9 | \arguments{
10 | \item{data_list}{List of length num_samples (each element has dimension num_cells x num_markers)}
11 | }
12 | \value{
13 | Vector of length num_samples representing number of cells in each sample
14 | }
15 | \description{
16 | Takes initial list of single-cell samples and returns vector containing number of cells in each sample.
17 | }
18 | \details{
19 | Private method (not exported in namespace)
20 | }
21 | \examples{
22 | \dontrun{
23 | sample_sizes <- getSampleSizes(all_expn_data)
24 | }
25 | }
26 | 


--------------------------------------------------------------------------------
/man/drawColnames45.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-plotting.R
 3 | \name{drawColnames45}
 4 | \alias{drawColnames45}
 5 | \title{Rotates heatmap marker labels 45 degrees}
 6 | \usage{
 7 | drawColnames45(coln, gaps, ...)
 8 | }
 9 | \arguments{
10 | \item{coln}{Column names}
11 | 
12 | \item{gaps}{Spacing of labels}
13 | 
14 | \item{...}{Additional parameters to be passed to \code{gpar}}
15 | }
16 | \value{
17 | Formatted marker labels in heatmap
18 | }
19 | \description{
20 | Overwrites default draw_colnames in the pheatmap package
21 | }
22 | \details{
23 | To be used with pheatmap plotting function; not to be called directly. Thanks to Josh O'Brien at http://stackoverflow.com/questions/15505607
24 | }
25 | \examples{
26 | #Not to be called directly
27 | }
28 | 


--------------------------------------------------------------------------------
/man/selected_genes.Rd:
--------------------------------------------------------------------------------
 1 | \name{selected_genes}
 2 | \alias{selected_genes}
 3 | \docType{data}
 4 | \title{Genes to be used when performing clustering and trajectory analyses on melanoma single-cell RNA-seq expression data}
 5 | \description{This object contains genes to be used when performing clustering and trajectory analyses on melanoma single-cell RNA-seq expression data.}
 6 | \usage{
 7 |   data(selected_genes)
 8 | }
 9 | \format{
10 |   Vector of length 44 representing selected genes for performing computational analyses such as generating cell embeddings and clustering cell subtypes.
11 | }
12 | \source{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056}
13 | \references{Tirosh, I. et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196 (2016)}
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/identifyCentroids.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{identifyCentroids}
 4 | \alias{identifyCentroids}
 5 | \title{Identify cluster centroids (cell names)}
 6 | \usage{
 7 | identifyCentroids(ref_clusters)
 8 | }
 9 | \arguments{
10 | \item{ref_clusters}{list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers)}
11 | }
12 | \value{
13 | List of names; element \var{i} represents the name of the cell in cluster \var{i} that is closest to the centroid (arithmetic mean) of cluster \var{i}
14 | }
15 | \description{
16 | Takes initial list and returns list of cell names representing centroid of cluster
17 | }
18 | \details{
19 | Private method (not exported in namespace)
20 | }
21 | \examples{
22 | \dontrun{
23 | centroid_names <- identifyCentroids(ref_clusters)
24 | }
25 | }
26 | 


--------------------------------------------------------------------------------
/man/getArithmeticCentroids.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{getArithmeticCentroids}
 4 | \alias{getArithmeticCentroids}
 5 | \title{Get arithmetic centroids (coordinates)}
 6 | \usage{
 7 | getArithmeticCentroids(ref_clusters)
 8 | }
 9 | \arguments{
10 | \item{ref_clusters}{list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers)}
11 | }
12 | \value{
13 | Matrix of dimension num_cluster x num_markers; row \var{i} representing the arithmetic centroid of cluster \var{i}
14 | }
15 | \description{
16 | Takes initial list and returns a matrix with row \var{i} representing the arithmetic centroid of cluster \var{i}
17 | }
18 | \details{
19 | Private method (not exported in namespace)
20 | }
21 | \examples{
22 | \dontrun{
23 | cluster_centroids <- getArithmeticCentroids(ref_clusters)
24 | }
25 | }
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PhEMD - "Phenotypic Earth Mover's Distance"
 2 | PhEMD is a package for relating a network of single-cell samples. By first modeling the cell-state landscape of a model system and then computing pairwise distances between samples using EMD, PhEMD constructs a low-dimensional embedding that uncovers intrinsic manifold structure.
 3 | 
 4 | 
 5 | ## Installing PhEMD
 6 | PhEMD can currently be installed directly through Bioconductor as follows:
 7 | ```
 8 | BiocManager::install("phemd")
 9 | ```
10 | Alternatively, the package can be cloned and installed directly from Git as below (although installation via Bioconductor is preferred).
11 | ```
12 | git clone --recursive git://github.com/wschen/phemd.git
13 | cd phemd
14 | R CMD INSTALL .
15 | ```
16 | 
17 | ## Running PhEMD
18 | See https://bioconductor.org/packages/release/bioc/vignettes/phemd/inst/doc/phemd.html for a step-by-step tutorial on applying PhEMD to a multi-sample single-cell dataset.
19 | 


--------------------------------------------------------------------------------
/man/removeTinySamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{removeTinySamples}
 4 | \alias{removeTinySamples}
 5 | \title{Remove samples with too few cells}
 6 | \usage{
 7 | removeTinySamples(obj, min_sz = 20)
 8 | }
 9 | \arguments{
10 | \item{obj}{'Phemd' object containing raw expression data and associated metadata}
11 | 
12 | \item{min_sz}{Minimum number of cells in each sample to be retained}
13 | }
14 | \value{
15 | 'Phemd' object containing raw multi-sample expression data and associated metadata (same as input minus removed samples)
16 | }
17 | \description{
18 | Removes samples from Phemd that have fewer cells than \code{min_sz}
19 | }
20 | \details{
21 | Note: If used, this function must be called before (and not after) the \code{aggregateSamples} function is called
22 | }
23 | \examples{
24 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
25 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) #removes samples with fewer than 10 cells
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/selectFeatures.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{selectFeatures}
 4 | \alias{selectFeatures}
 5 | \title{Perform feature selection on aggregated data}
 6 | \usage{
 7 | selectFeatures(obj, selected_genes)
 8 | }
 9 | \arguments{
10 | \item{obj}{'Phemd' object containing aggregated data}
11 | 
12 | \item{selected_genes}{Vector containing names of genes to use for downstream analyses}
13 | }
14 | \value{
15 | Same as input 'Phemd' object after performing feature-selection based dimensionality reduction on aggregated expression data
16 | }
17 | \description{
18 | Takes as input a Phemd object with aggregated data and returns updated object after performing feature selection on aggregated data
19 | }
20 | \details{
21 | \code{aggregateSamples} needs to be called before running this function
22 | }
23 | \examples{
24 | 
25 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
26 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
27 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
28 | my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes=c('TP53',
29 | 'EGFR', 'KRAS', 'FOXP3', 'LAG3'))
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/man/gaussianffLocal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{gaussianffLocal}
 4 | \alias{gaussianffLocal}
 5 | \title{Models expression data using generalized linear model with Gaussian error}
 6 | \usage{
 7 | gaussianffLocal(dispersion = 0, parallel = FALSE, zero = NULL)
 8 | }
 9 | \arguments{
10 | \item{dispersion}{Dispersion parameter. If 0, then estimate as described in VGAM 1.0-5 documentation.}
11 | 
12 | \item{parallel}{A logical or formula. If a formula, the response of the formula should be a logical and the terms of the formula indicates whether or not those terms are parallel.}
13 | 
14 | \item{zero}{An integer-valued vector specifying which linear/additive predictors are modelled as intercepts only. The values must be from the set {1...M} where Mis the number of columns of the matrix response.}
15 | }
16 | \value{
17 | Generalized linear model with Gaussian error
18 | }
19 | \description{
20 | Useful for modeling pre-normalized single-cell expression data.
21 | }
22 | \details{
23 | Private method (not to be called by user directly). Requires VGAM package. Obtained from VGAM v1.0-5 (https://www.rdocumentation.org/packages/VGAM/versions/1.0-5/topics/gaussianff)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/aggregateSamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{aggregateSamples}
 4 | \alias{aggregateSamples}
 5 | \title{Aggregate expression data from all samples}
 6 | \usage{
 7 | aggregateSamples(obj, max_cells = 12000)
 8 | }
 9 | \arguments{
10 | \item{obj}{'Phemd' object containing raw expression data and associated metadata}
11 | 
12 | \item{max_cells}{Maximum number of cells across all samples to be included in final matrix on which Monocle 2 will be run}
13 | }
14 | \value{
15 | Same as input 'Phemd' object with additional slot 'data_aggregate' containing aggregated expression data (num_markers x num_cells)
16 | }
17 | \description{
18 | Takes initial Phemd object and returns object with additional data frame in slot @data_aggregate containing cells aggregated from all samples (to be used for further analyses e.g. Monocle 2 trajectory building / pseudotime mapping / cell clustering)
19 | }
20 | \details{
21 | Subsamples cells as necessary based on \code{max_cells}. If subsampling is performed, an equal number of cells are subsampled from each sample
22 | }
23 | \examples{
24 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
25 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
26 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/man/orderCellsMonocle.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{orderCellsMonocle}
 4 | \alias{orderCellsMonocle}
 5 | \title{Compute Monocle2 cell state and pseudotime assignments}
 6 | \usage{
 7 | orderCellsMonocle(obj, ...)
 8 | }
 9 | \arguments{
10 | \item{obj}{'Phemd' object containing Monocle2 object initialized using embedCells}
11 | 
12 | \item{...}{Additional parameters to be passed into \code{orderCells} function}
13 | }
14 | \value{
15 | Same as input 'Phemd' object with updated cell-state embedding object containing cell state assignments
16 | }
17 | \description{
18 | Takes as input a Phemd object with Monocle2 object and returns updated object with Monocle2 object containing cell state and pseudotime assignments
19 | }
20 | \details{
21 | Wrapper function for \code{orderCells} in Monocle 2 package. \code{embedCells} needs to be called before calling this function.
22 | }
23 | \examples{
24 | 
25 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
26 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
27 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
28 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model='gaussianff', sigma=0.02, maxIter=2)
29 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
30 | }
31 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: phemd
 2 | Type: Package
 3 | Title: Phenotypic EMD for comparison of single-cell samples
 4 | Version: 1.15.1
 5 | Authors@R: person("William S", "Chen", email="wil.yum.chen@gmail.com",role=c("aut","cre"))
 6 | Description: Package for comparing and generating a low-dimensional embedding
 7 |   of multiple single-cell samples.
 8 | License: GPL-2
 9 | Encoding: UTF-8
10 | LazyData: true
11 | Depends: R (>= 4.0), monocle, Seurat
12 | Imports: 
13 |   SingleCellExperiment, 
14 |   RColorBrewer, 
15 |   igraph, 
16 |   transport, 
17 |   pracma, 
18 |   cluster, 
19 |   Rtsne, 
20 |   destiny, 
21 |   RANN, 
22 |   ggplot2, 
23 |   maptree, 
24 |   pheatmap, 
25 |   scatterplot3d, 
26 |   VGAM, 
27 |   methods, 
28 |   grDevices, 
29 |   graphics, 
30 |   stats, 
31 |   utils, 
32 |   cowplot, 
33 |   S4Vectors,
34 |   BiocGenerics,
35 |   SummarizedExperiment,
36 |   Biobase,
37 |   phateR,
38 |   reticulate
39 | Config/reticulate:
40 |   list(
41 |     packages = list(
42 |       list(package = "phate")
43 |     )
44 |   )
45 | Suggests: knitr
46 | VignetteBuilder: knitr
47 | biocViews: 
48 |   Clustering, 
49 |   ComparativeGenomics, 
50 |   Proteomics, 
51 |   Transcriptomics, 
52 |   Sequencing, 
53 |   DimensionReduction, 
54 |   SingleCell, 
55 |   DataRepresentation, 
56 |   Visualization, 
57 |   MultipleComparison
58 | RoxygenNote: 7.0.2
59 | PackageStatus: Deprecated
60 | 


--------------------------------------------------------------------------------
/man/createDataObj.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{createDataObj}
 4 | \alias{createDataObj}
 5 | \title{Create 'Phemd' object}
 6 | \usage{
 7 | createDataObj(data, markers, snames, datatype = "list", valtype = "counts")
 8 | }
 9 | \arguments{
10 | \item{data}{List of length \var{num_samples} containing expression data; each element is of size \var{num_cells} x \var{num_markers}. Alternately a SingleCellExperiment object.}
11 | 
12 | \item{markers}{Vector containing marker names (i.e. column names of \code{all_data})}
13 | 
14 | \item{snames}{Vector containing sample names (i.e. names of samples contained in \code{all_data})}
15 | 
16 | \item{datatype}{Either "list" or "sce" (SingleCellExperiment with genes x cells)}
17 | 
18 | \item{valtype}{Type of assay data (i.e. "counts", "normcounts", "logcounts", "tpm", "cpm") if datatype is "sce"}
19 | }
20 | \value{
21 | 'Phemd' object containing raw multi-sample expression data and associated metadata
22 | }
23 | \description{
24 | Wrapper function to create 'Phemd' object containing raw expression data and metadata
25 | }
26 | \details{
27 | Note that each element in list can have different number of rows (i.e. number of cells in each sample can vary).
28 | }
29 | \examples{
30 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/man/retrieveRefClusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{retrieveRefClusters}
 4 | \alias{retrieveRefClusters}
 5 | \title{Retrieve reference cell clusters}
 6 | \usage{
 7 | retrieveRefClusters(
 8 |   obj,
 9 |   cell_model = c("monocle2", "seurat", "phate"),
10 |   expn_type = "reduced",
11 |   ndim = 10
12 | )
13 | }
14 | \arguments{
15 | \item{obj}{Phemd struct containing cell-state embedding object and underlying expression data}
16 | 
17 | \item{cell_model}{String representing data model for cell-state space ("seurat", "monocle2", or "phate")}
18 | 
19 | \item{expn_type}{String representing whether to return raw expression values or coordinates in dimensionality-reduced feature space}
20 | 
21 | \item{ndim}{Number of dimensions in reduced dimensionality space (e.g. PHATE / CCA) to use (only relevant in reduced dimensionality space)}
22 | }
23 | \value{
24 | List of data matrices; each list element is of size num_cells_in_cluster x num_markers and represents a distinct cell cluster
25 | }
26 | \description{
27 | Takes initial Phemd struct and returns cell clusters as assigned by clustering algorithm (e.g. PHATE or Monocle2)
28 | }
29 | \details{
30 | Private method (not exported in namespace)
31 | }
32 | \examples{
33 | \dontrun{
34 | cluster_expression_data <- retrieveRefClusters(my_phemdObj)
35 | }
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/man/getCellYield.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-getResults.R
 3 | \name{getCellYield}
 4 | \alias{getCellYield}
 5 | \title{Gets cell yield of each sample as a table}
 6 | \usage{
 7 | getCellYield(myobj, cluster_assignments = NULL)
 8 | }
 9 | \arguments{
10 | \item{myobj}{phemdObj object containing expression data for each sample in 'data' slot}
11 | 
12 | \item{cluster_assignments}{Vector of cluster assignments to be included as additional column in output table (optional)}
13 | }
14 | \value{
15 | Data frame representing cell yield of each sample
16 | }
17 | \description{
18 | Gets cell yield (number of viable cells) of each single-cell sample in decreasing order
19 | }
20 | \examples{
21 | 
22 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
23 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
24 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
25 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
26 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
27 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
28 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
29 | my_EMD_mat <- compareSamples(my_phemdObj_final)
30 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
31 | getCellYield(my_phemdObj_final, cluster_assignments)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/assignCellClusterNearestNode.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{assignCellClusterNearestNode}
 4 | \alias{assignCellClusterNearestNode}
 5 | \title{Assign cells to a reference cell subtype}
 6 | \usage{
 7 | assignCellClusterNearestNode(
 8 |   cur_cells,
 9 |   ref_cells,
10 |   ref_cell_labels,
11 |   cell_model = c("monocle2", "seurat", "phate")
12 | )
13 | }
14 | \arguments{
15 | \item{cur_cells}{Matrix of cells to be assigned to clusters (Dim: \var{num_cells} x \var{num_markers})}
16 | 
17 | \item{ref_cells}{Matrix of cells used to build reference Monocle 2 tree (Dim: \var{num_monocle_cells} x \var{num_markers})}
18 | 
19 | \item{ref_cell_labels}{Vector of length \var{num_monocle_cells} containing Monocle 2 cell branch assignments}
20 | 
21 | \item{cell_model}{Either "monocle2", "seurat", or "phate" depending on method used to model cell state space}
22 | }
23 | \value{
24 | Vector of length \var{num_cells} representing cluster assignments for each cell in \var{cur_cells}
25 | }
26 | \description{
27 | Assigns each cell in \code{cur_cells} to a cluster based on nearest cell in Monocle 2 tree
28 | }
29 | \details{
30 | Private method (not exported in namespace). Uses RANN package for fast knn search
31 | }
32 | \examples{
33 | \dontrun{
34 | cur_cells_cluster_labels <- assignCellClusterNearestNode(cur_cells_expn_data,
35 | clustered_cells_expn_data, clustered_cells_cluster_labels, cell_model='monocle2')
36 | }
37 | }
38 | 


--------------------------------------------------------------------------------
/man/bindSeuratObj.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{bindSeuratObj}
 4 | \alias{bindSeuratObj}
 5 | \title{Attach 'Seurat' object to 'Phemd' object}
 6 | \usage{
 7 | bindSeuratObj(phemd_obj, seurat_obj, batch.colname = "plt")
 8 | }
 9 | \arguments{
10 | \item{phemd_obj}{Phemd object initialized using createDataObj}
11 | 
12 | \item{seurat_obj}{S4 'seurat' object containing batch-normalized reference cell data}
13 | 
14 | \item{batch.colname}{Name of column in Seurat object that denotes batch ID}
15 | }
16 | \value{
17 | 'Phemd' object containing with attached Seurat object
18 | }
19 | \description{
20 | Allows user to attach batch-normalized reference cell data from Seurat into 'Phemd' object containing raw expression data and metadata
21 | }
22 | \examples{
23 | 
24 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
25 | my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A")
26 | my_seuratObj <- Seurat::FindVariableFeatures(object = my_seuratObj)
27 | my_seuratObj <- Seurat::ScaleData(object = my_seuratObj, do.scale=FALSE, do.center=FALSE)
28 | my_seuratObj <- Seurat::RunPCA(object = my_seuratObj, pc.genes = colnames(all_expn_data[[1]]), do.print = FALSE)
29 | my_seuratObj <- Seurat::FindNeighbors(my_seuratObj, reduction = "pca", dims.use = 1:10)
30 | my_seuratObj <- Seurat::FindClusters(my_seuratObj, resolution = 0.6, print.output = 0, save.SNN = TRUE)
31 | my_phemdObj <- bindSeuratObj(my_phemdObj, my_seuratObj)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | ----------------------------------
 2 | 
 3 | Non-Commercial License
 4 | Yale Copyright © 2024 Yale University. 
 5 |  
 6 | Permission is hereby granted to use, copy, modify, and distribute this Software for any non-commercial purpose. Any distribution or modification or derivations of the Software (together “Derivative Works”) must be made available on GitHub and shall include this copyright notice and this permission notice in all copies or substantial portions of the Software.  For the purposes of this license, "non-commercial" means not intended for or directed towards commercial advantage or monetary compensation either via the Software itself or Derivative Works or uses of either which lead to or generate any commercial products. In any event, the use and modification of the Software or Derivative Works shall remain governed by the terms and conditions of this Agreement; Any commercial use of the Software requires a separate commercial license from the copyright holder at Yale University. Direct any requests for commercial licenses to Yale Ventures at yaleventures@yale.edu.
 7 |  
 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 9 | 
10 | ----------------------------------
11 | 


--------------------------------------------------------------------------------
/man/plotSummaryHistograms.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-plotting.R
 3 | \name{plotSummaryHistograms}
 4 | \alias{plotSummaryHistograms}
 5 | \title{Plots cell subtype frequency histograms summarizing each group of samples}
 6 | \usage{
 7 | plotSummaryHistograms(
 8 |   myobj,
 9 |   cluster_assignments,
10 |   cell_model = c("monocle2", "seurat", "phate"),
11 |   cmap = NULL,
12 |   ncol.plot = 4,
13 |   ax.lab.sz = 2.5,
14 |   title.sz = 3
15 | )
16 | }
17 | \arguments{
18 | \item{myobj}{Phemd object containing cell subtype relative frequency in @data_cluster_weights slot}
19 | 
20 | \item{cluster_assignments}{Vector containing group assignments for each sample in myobj}
21 | 
22 | \item{cell_model}{Method by which cell state was modeled (either "monocle2", "seurat", or "phate")}
23 | 
24 | \item{cmap}{Vector containing colors by which histogram bars should be colored (optional)}
25 | 
26 | \item{ncol.plot}{Number of columns to use to plot multi-panel histogram plot}
27 | 
28 | \item{ax.lab.sz}{Scaling factor for axis labels (default 2.5)}
29 | 
30 | \item{title.sz}{Scaling factor for plot title (default 3)}
31 | }
32 | \value{
33 | None
34 | }
35 | \description{
36 | Visualizes plots of relative frequency ("weights") of cell subtypes ("bins" or "signatures") summarizing each group of single-cell samples. Each summary histogram is computed by taking the bin-wise mean of all samples in the group
37 | }
38 | \details{
39 | \code{groupSamples} must be called before calling this function. Saves plots in directory called "summary_inhibs"
40 | }
41 | 


--------------------------------------------------------------------------------
/man/compareSamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{compareSamples}
 4 | \alias{compareSamples}
 5 | \title{Computes EMD distance matrix representing pairwise dissimilarity between samples}
 6 | \usage{
 7 | compareSamples(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{'Phemd' object containing cell subtype relative frequencies for each sample in @data_cluster_weights slot and ground distance matrix (representing cell subtype dissimilarity) in @emd_dist_mat slot}
11 | }
12 | \value{
13 | Distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples
14 | }
15 | \description{
16 | Takes as input a Phemd object with cell subtype relative frequencies for each sample in @data_cluster_weights slot and ground distance matrix (representing cell subtype pairwise dissimilarity) in @emd_dist_mat slot. Returns distance matrix representing pairwise dissimilarity between samples
17 | }
18 | \details{
19 | Requires 'transport' and 'pracma' packages
20 | }
21 | \examples{
22 | 
23 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
24 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
25 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
26 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
27 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
28 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
29 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
30 | my_EMD_mat <- compareSamples(my_phemdObj_final)
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/man/getSampleCelltypeFreqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-getResults.R
 3 | \name{getSampleCelltypeFreqs}
 4 | \alias{getSampleCelltypeFreqs}
 5 | \title{Returns cell subtype distribution for each sample as a table}
 6 | \usage{
 7 | getSampleCelltypeFreqs(myobj, cluster_assignments = NULL)
 8 | }
 9 | \arguments{
10 | \item{myobj}{phemdObj object containing expression data for each sample in 'data' slot}
11 | 
12 | \item{cluster_assignments}{Vector of cluster assignments to be included as additional column in output table (optional)}
13 | }
14 | \value{
15 | Data frame representing relative frequencies of each cell subtype along with (optional) final inhibitor cluster assignment for each single-cell sample
16 | }
17 | \description{
18 | Returns cell subtype distribution for each single-cell sample along with (optional) final inhibitor cluster assignment
19 | }
20 | \examples{
21 | 
22 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
23 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
24 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
25 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
26 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
27 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
28 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
29 | my_EMD_mat <- compareSamples(my_phemdObj_final)
30 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
31 | getSampleCelltypeFreqs(my_phemdObj_final, cluster_assignments)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/plotCellYield.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-plotting.R
 3 | \name{plotCellYield}
 4 | \alias{plotCellYield}
 5 | \title{Plot cell yield of each sample as bar plot}
 6 | \usage{
 7 | plotCellYield(myobj, labels = NULL, cmap = NULL, font_sz = 0.6, w = 8, h = 9.5)
 8 | }
 9 | \arguments{
10 | \item{myobj}{Phmed object containing expression data for each sample in 'data' slot}
11 | 
12 | \item{labels}{Vector containing group labels for samples (optional). If not provided, bars will be of uniform color (blue)}
13 | 
14 | \item{cmap}{Vector containing colors by which histogram bars should be colored (optional)}
15 | 
16 | \item{font_sz}{Scaling factor for font size of sample names in barplot}
17 | 
18 | \item{w}{Width of plot in inches}
19 | 
20 | \item{h}{Height of plot in inches}
21 | }
22 | \value{
23 | None
24 | }
25 | \description{
26 | Plots cell yield (number of viable cells) of each single-cell sample in decreasing order as horizontal bar plot
27 | }
28 | \examples{
29 | 
30 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
31 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
32 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
33 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
34 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
35 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
36 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
37 | my_EMD_mat <- compareSamples(my_phemdObj_final)
38 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
39 | plotCellYield(my_phemdObj_final, labels=cluster_assignments, font_sz = 0.8)
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/man/Phemd.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Phemd-methods.R
 3 | \docType{class}
 4 | \name{Phemd}
 5 | \alias{Phemd}
 6 | \alias{Phemd-class}
 7 | \title{Phemd class}
 8 | \description{
 9 | The main PhEMD class to store single-cell expression data.
10 | }
11 | \section{Fields}{
12 | 
13 | \describe{
14 | \item{\code{data}}{List of matrices, each of which represents a single-cell sample (num_cells x num_genes)}
15 | 
16 | \item{\code{markers}}{Column names (e.g. genes) for each element (i.e. data matrix) in "data"}
17 | 
18 | \item{\code{snames}}{Sample ID for each element in "data"}
19 | 
20 | \item{\code{data_aggregate}}{Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells)}
21 | 
22 | \item{\code{data_subsample_idx}}{List of vectors each representing the indices of elements in "data" that were subsampled and combined to form "data_aggregate"}
23 | 
24 | \item{\code{subsampled_bool}}{Boolean represent whether or not subsampling was performed in the data aggregation process}
25 | 
26 | \item{\code{monocle_obj}}{Data object of type "CellDataSet" that is the core Monocle data structure}
27 | 
28 | \item{\code{data_cluster_weights}}{Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes)}
29 | 
30 | \item{\code{emd_dist_mat}}{Matrix representing pairwise distances between each pair of cell subtypes}
31 | 
32 | \item{\code{seurat_obj}}{Object of class "Seurat" that is the core Seurat data structure}
33 | 
34 | \item{\code{phate_obj}}{Object of class "phate" that is the core PHATE data structure}
35 | 
36 | \item{\code{experiment_ids}}{Vector of length num_samples representing the experiment (batch) in which the sample was profiled}
37 | }}
38 | 
39 | 


--------------------------------------------------------------------------------
/man/generateGDM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{generateGDM}
 4 | \alias{generateGDM}
 5 | \title{Computes ground distance matrix based on cell embedding}
 6 | \usage{
 7 | generateGDM(
 8 |   obj,
 9 |   cell_model = c("monocle2", "seurat", "phate"),
10 |   expn_type = "reduced",
11 |   ndim = 8
12 | )
13 | }
14 | \arguments{
15 | \item{obj}{'Phemd' object containing cell-state embedding object}
16 | 
17 | \item{cell_model}{Method by which cell state was modeled (either "monocle2", "seurat", or "phate")}
18 | 
19 | \item{expn_type}{Data type to use to determine cell-type dissimilarities}
20 | 
21 | \item{ndim}{Number of embedding dimensions to be used for computing cell-type dissimilarity (optional)}
22 | }
23 | \value{
24 | Phemd object with ground distance matrix (to be used in EMD computation) in @data_cluster_weights slot
25 | }
26 | \description{
27 | Takes as input a Phemd object containing cell-state embedding object. Returns updated object with ground distance matrix representing pairwise distances between distinct cell subtypes based on cell state embedding.
28 | }
29 | \details{
30 | \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Requires 'igraph' package
31 | }
32 | \examples{
33 | 
34 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
35 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
36 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
37 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
38 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
39 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
40 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/man/printClusterAssignments.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-getResults.R
 3 | \name{printClusterAssignments}
 4 | \alias{printClusterAssignments}
 5 | \title{Writes samples to file based on community detection group assignments}
 6 | \usage{
 7 | printClusterAssignments(cluster_assignments, obj, dest, overwrite = FALSE)
 8 | }
 9 | \arguments{
10 | \item{cluster_assignments}{Vector containing group assignments for each sample}
11 | 
12 | \item{obj}{phemdObj object containing sample names in @snames slot}
13 | 
14 | \item{dest}{Path to existing directory where output should be saved}
15 | 
16 | \item{overwrite}{Boolean representing whether or not to overwrite contents of "dest" with output of printClusterAssignments}
17 | }
18 | \value{
19 | None
20 | }
21 | \description{
22 | Takes vector of cluster assignments and phemdObj containing sample names and writes sample groups to file
23 | }
24 | \details{
25 | Order of samples in obj@snames is assumed to be the same as the order of group assignments in cluster_assignments
26 | }
27 | \examples{
28 | 
29 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
30 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
31 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
32 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
33 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
34 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
35 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
36 | my_EMD_mat <- compareSamples(my_phemdObj_final)
37 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
38 | printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE)
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/getSampleHistsByCluster.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-getResults.R
 3 | \name{getSampleHistsByCluster}
 4 | \alias{getSampleHistsByCluster}
 5 | \title{Gets cell subtype frequency histograms for each sample by cluster ID}
 6 | \usage{
 7 | getSampleHistsByCluster(
 8 |   myobj,
 9 |   cluster_assignments,
10 |   cell_model = c("monocle2", "seurat")
11 | )
12 | }
13 | \arguments{
14 | \item{myobj}{phemdObj object containing cell subtype relative frequency in @data_cluster_weights slot}
15 | 
16 | \item{cluster_assignments}{Vector containing group assignments for each sample in myobj}
17 | 
18 | \item{cell_model}{Method by which cell state was modeled (either "monocle2" or "seurat")}
19 | }
20 | \value{
21 | List of lists, with outer list representing sample cluster ID and inner list representing cell subtype frequencies of given sample
22 | }
23 | \description{
24 | Gets relative frequency ("weights") of cell subtypes ("bins" or "signatures") in each single-cell sample
25 | }
26 | \details{
27 | \code{groupSamples} must be called before calling this function. Saves plots in directory called "individual_inhibs"
28 | }
29 | \examples{
30 | 
31 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
32 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
33 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
34 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
35 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
36 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
37 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
38 | my_EMD_mat <- compareSamples(my_phemdObj_final)
39 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
40 | weights_by_cluster <- getSampleHistsByCluster(my_phemdObj_final, cluster_assignments)
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/man/plotEmbeddings.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-plotting.R
 3 | \name{plotEmbeddings}
 4 | \alias{plotEmbeddings}
 5 | \title{Plots Monocle2 cell embedding plots}
 6 | \usage{
 7 | plotEmbeddings(
 8 |   obj,
 9 |   cell_model = c("monocle2", "seurat", "phate"),
10 |   cmap = NULL,
11 |   w = 4,
12 |   h = 5,
13 |   pt_sz = 1,
14 |   ndims = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{obj}{'Phemd' object containing Monocle 2 object}
19 | 
20 | \item{cell_model}{Method by which cell state was modeled (either "monocle2", "seurat", or "phate)}
21 | 
22 | \item{cmap}{User-specified colormap to use to color cell state embedding (optional)}
23 | 
24 | \item{w}{Width of plot in inches}
25 | 
26 | \item{h}{Height of plot in inches}
27 | 
28 | \item{pt_sz}{Scalar factor for point size}
29 | 
30 | \item{ndims}{Number of dimensions to use for dimensionality reduction in case it hasn't been performed yet (only relevant when using Seurat data as input)}
31 | }
32 | \value{
33 | Colormap (vector of colors) used to color Monocle2 cell state embedding
34 | }
35 | \description{
36 | Takes as input a Phemd object containing either a Monocle2 object or Seurat object (already embedded and ordered) and plots cell embedding plots side by side. Optionally saves to specified folder.
37 | }
38 | \details{
39 | \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Required additional packages: 'RColorBrewer', 'cowplot'
40 | }
41 | \examples{
42 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
43 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
44 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
45 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model='gaussianff', sigma=0.02, maxIter=2)
46 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
47 | cmap <- plotEmbeddings(my_phemdObj_monocle)
48 | }
49 | 


--------------------------------------------------------------------------------
/man/clusterIndividualSamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{clusterIndividualSamples}
 4 | \alias{clusterIndividualSamples}
 5 | \title{Computes cell subtype abundances for each sample}
 6 | \usage{
 7 | clusterIndividualSamples(
 8 |   obj,
 9 |   verbose = FALSE,
10 |   cell_model = c("monocle2", "seurat", "phate")
11 | )
12 | }
13 | \arguments{
14 | \item{obj}{'Phemd' object containing single-cell expression data of all samples in @data slot and cell-state embedding object generated and stored using the embedCells function.}
15 | 
16 | \item{verbose}{Boolean that determines whether progress (sequential processing of samples) should be printed. FALSE by default}
17 | 
18 | \item{cell_model}{Either "monocle2", "seurat", or "phate" depending on method used to model cell state space}
19 | }
20 | \value{
21 | 'Phemd' object with cell subtype frequencies of each sample that can be retrieved using the 'celltypeFreqs' accessor function
22 | }
23 | \description{
24 | Takes as input a Phemd object with all single-cell expression data of all single-cell samples in @data slot and cell-state embedding generated by embedCells. Returns updated object with cell subtype frequencies of each sample that may be retrieved by the 'celltypeFreqs' accessor function.
25 | }
26 | \details{
27 | \code{embedCells} (and \code{orderCellsMonocle} if using the Monocle2 embedding technique) needs to be called before calling this function.
28 | }
29 | \examples{
30 | 
31 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
32 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
33 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
34 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
35 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
36 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/man/plotHeatmaps.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-plotting.R
 3 | \name{plotHeatmaps}
 4 | \alias{plotHeatmaps}
 5 | \title{Plot heatmap of cell subtypes}
 6 | \usage{
 7 | plotHeatmaps(
 8 |   obj,
 9 |   cell_model = c("monocle2", "seurat", "phate"),
10 |   selected_genes = NULL,
11 |   w = 8,
12 |   h = 5,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{obj}{'Phemd' object containing cell-state embedding object}
18 | 
19 | \item{cell_model}{Method by which cell state was modeled ("monocle2", "seurat", or "phate")}
20 | 
21 | \item{selected_genes}{Vector containing gene names to include in heatmap (optional)}
22 | 
23 | \item{w}{Width of plot in inches}
24 | 
25 | \item{h}{Height of plot in inches}
26 | 
27 | \item{...}{Additional parameters to be passed on to pheatmap function}
28 | }
29 | \value{
30 | Heatmap containing expression values for each cell subtype. If cell_model is 'seurat', then returns a list of heatmaps (1 for each batch) that may be subsequently plotted individually
31 | }
32 | \description{
33 | Takes as input a Phemd object containing either a Monocle2, Seurat, or PHATE object (already embedded and clustered) and plots heatmap characterizing cell subtypes
34 | }
35 | \details{
36 | \code{embedCells} (and \code{orderCellsMonocle} if using Monocle2) need to be called before calling this function. Required additional package: 'pheatmap'
37 | }
38 | \examples{
39 | 
40 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
41 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
42 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
43 | my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes)
44 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff',
45 | pseudo_expr=0, sigma=0.02, maxIter=2)
46 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
47 | myheatmap <- plotHeatmaps(my_phemdObj_monocle, cell_model='monocle2')
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/man/groupSamples.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{groupSamples}
 4 | \alias{groupSamples}
 5 | \title{Performs community detection on sample-sample distance matrix to identify groups of similar samples}
 6 | \usage{
 7 | groupSamples(
 8 |   distmat,
 9 |   distfun = "hclust",
10 |   ncluster = NULL,
11 |   method = "complete",
12 |   ...
13 | )
14 | }
15 | \arguments{
16 | \item{distmat}{A distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples}
17 | 
18 | \item{distfun}{Method of partitioning network of samples (currently either 'hclust' or 'pam')}
19 | 
20 | \item{ncluster}{Optional parameter specifying total number of sample groups}
21 | 
22 | \item{method}{Optional parameter for hierarchical clustering (see "hclust" documentation)}
23 | 
24 | \item{...}{Optional additional parameters to be passed to diffusionKmeans method}
25 | }
26 | \value{
27 | Vector containing group assignments for each sample (same order as row-order of distmat) based on user-specified partitioning method (e.g. hierarchical clustering)
28 | }
29 | \description{
30 | Takes sample-sample distance matrix as input and returns group assignments for each sample
31 | }
32 | \details{
33 | By default, uses 'kgs' (Kelley-Gardner-Sutcliffe) method for determining optimal number of groups. Alternatively, can take user-specified number of groups). Requires 'cluster' and 'maptree' packages.
34 | }
35 | \examples{
36 | 
37 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
38 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
39 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
40 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model = 'monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2)
41 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
42 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
43 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
44 | my_EMD_mat <- compareSamples(my_phemdObj_final)
45 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/man/embedCells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-core.R
 3 | \name{embedCells}
 4 | \alias{embedCells}
 5 | \title{Generate cell-state embedding}
 6 | \usage{
 7 | embedCells(
 8 |   obj,
 9 |   cell_model = c("monocle2", "seurat", "phate"),
10 |   data_model = "negbinomial_sz",
11 |   phate_ncluster = 8,
12 |   phate_cluster_seed = NULL,
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{obj}{'Phemd' object containing aggregated data}
18 | 
19 | \item{cell_model}{Method to use to generate cell-state embedding. Currently supports "phate" and "monocle2". If using the Seurat to model the cell-state space, please identify cell subtypes as outlined in the Seurat software package and then use the \code{bindSeuratObj} function.}
20 | 
21 | \item{data_model}{Only relevant if cell_model = "monocle2". One of the following: 'negbinomial_sz', 'negbinomial', 'tobit', 'uninormal', 'gaussianff'. See "Family Function" table at the following link for more details on selecting the proper one. \url{http://cole-trapnell-lab.github.io/monocle-release/docs/#getting-started-with-monocle}}
22 | 
23 | \item{phate_ncluster}{Only relevant if cell_model = "phate". Number of cell state clusters to return when using PHATE}
24 | 
25 | \item{phate_cluster_seed}{Only relevant if cell_model = "phate". Seed to use when performing cell state clustering (optional)}
26 | 
27 | \item{...}{Additional parameters to be passed to \code{reduceDimension} function for Monocle or \code{phate} function for PHATE}
28 | }
29 | \value{
30 | Same as input 'Phemd' object containing additional cell-state embedding object
31 | }
32 | \description{
33 | Takes as input a Phemd object with aggregated data and returns updated object containing cell-state embedding
34 | }
35 | \details{
36 | \code{aggregateSamples} needs to be called before running this function.
37 | }
38 | \examples{
39 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
40 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
41 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
42 | my_phemdObj_lg <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2)
43 | }
44 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | exportClasses(Phemd)
 2 | export(retrieveRefClusters)
 3 | export(createDataObj)
 4 | export(bindSeuratObj)
 5 | export(removeTinySamples)
 6 | export(aggregateSamples)
 7 | export(selectFeatures)
 8 | export(embedCells)
 9 | export(orderCellsMonocle)
10 | export(plotEmbeddings)
11 | export(plotHeatmaps)
12 | export(clusterIndividualSamples)
13 | export(generateGDM)
14 | export(compareSamples)
15 | export(groupSamples)
16 | export(printClusterAssignments)
17 | export(plotGroupedSamplesDmap)
18 | export(getSampleHistsByCluster)
19 | export(plotSummaryHistograms)
20 | export(plotCellYield)
21 | export(getCellYield)
22 | export(drawColnames45)
23 | export(getSampleCelltypeFreqs)
24 | export(rawExpn)
25 | export(monocleInfo)
26 | export(seuratInfo)
27 | export(phateInfo)
28 | export(GDM)
29 | export(selectMarkers)
30 | export(sNames)
31 | export(pooledCells)
32 | export(subsampledIdx)
33 | export(subsampledBool)
34 | export(celltypeFreqs)
35 | export(batchIDs)
36 | export("rawExpn<-")
37 | export("selectMarkers<-")
38 | export("pooledCells<-")
39 | export("subsampledIdx<-")
40 | export("subsampledBool<-")
41 | export("monocleInfo<-")
42 | export("seuratInfo<-")
43 | export("phateInfo<-")
44 | export("celltypeFreqs<-")
45 | export("batchIDs<-")
46 | export("GDM<-")
47 | 
48 | importFrom(pracma, squareform, isempty, strcat)
49 | import(reticulate)
50 | import(monocle)
51 | import(RColorBrewer)
52 | import(igraph)
53 | import(transport)
54 | import(cluster)
55 | import(Rtsne)
56 | import(destiny)
57 | import(ggplot2)
58 | import(RANN)
59 | import(scatterplot3d)
60 | import(maptree)
61 | import(pheatmap)
62 | import(Seurat)
63 | import(VGAM)
64 | import(methods)
65 | import(SingleCellExperiment)
66 | import(Biobase)
67 | import(phateR)
68 | importFrom("grDevices", "colorRampPalette", "dev.off", "palette", "png", 
69 |            "rainbow")
70 | importFrom("graphics", "barplot", "par", "text", "title")
71 | importFrom("stats", "as.dist", "cov", "cutree", "dist", "hclust", 
72 |            "mahalanobis")
73 | importFrom("utils", "assignInNamespace", "write.table")
74 | importFrom("cowplot", "plot_grid")
75 | importFrom("S4Vectors", "isEmpty")
76 | importFrom("BiocGenerics", "estimateSizeFactors", "estimateDispersions")
77 | importFrom("SummarizedExperiment", "assay", "assays")
78 | importFrom("utils", "packageVersion")


--------------------------------------------------------------------------------
/man/plotGroupedSamplesDmap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/functions-plotting.R
 3 | \name{plotGroupedSamplesDmap}
 4 | \alias{plotGroupedSamplesDmap}
 5 | \title{Plot diffusion map embedding of samples based on distance matrix}
 6 | \usage{
 7 | plotGroupedSamplesDmap(
 8 |   my_distmat,
 9 |   cluster_assignments = NULL,
10 |   pt_sz = 1,
11 |   n_dim = 3,
12 |   pt_label = NULL,
13 |   cmap = NULL,
14 |   w = 8,
15 |   h = 5,
16 |   scale.y = 1,
17 |   angle = 40,
18 |   autosave = FALSE,
19 |   ...
20 | )
21 | }
22 | \arguments{
23 | \item{my_distmat}{phemdObj object containing sample names in @snames slot}
24 | 
25 | \item{cluster_assignments}{Vector containing group assignments for each sample}
26 | 
27 | \item{pt_sz}{Size of points representing samples in plot (scaling factor)}
28 | 
29 | \item{n_dim}{Number of dimensions for embedding (either 2 or 3)}
30 | 
31 | \item{pt_label}{Vector of sample names corresponding to each point (same order as samples in \code{my_distmat} and \code{cluster_assignments})}
32 | 
33 | \item{cmap}{Vector containing colors by which points should be colored (corresponding to cluster_assignments)}
34 | 
35 | \item{w}{Width of plot in inches}
36 | 
37 | \item{h}{Height of plot in inches}
38 | 
39 | \item{scale.y}{Scaling factor for diffusion map y-axis}
40 | 
41 | \item{angle}{Rotation factor for diffusion map plot}
42 | 
43 | \item{autosave}{Boolean denoting whether or not to save output diffusion map}
44 | 
45 | \item{...}{Additional parameters to be passed to \code{DiffusionMap} function}
46 | }
47 | \value{
48 | DiffusionMap object containing biological sample embedding and associated metadata
49 | }
50 | \description{
51 | Visualizes diffusion map for network of samples based on square distance matrix (sample-sample pairwise dissimilarity)
52 | }
53 | \details{
54 | Requires 'destiny' package
55 | }
56 | \examples{
57 | 
58 | my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
59 | my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
60 | my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
61 | my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
62 | my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
63 | my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
64 | my_phemdObj_final <- generateGDM(my_phemdObj_final)
65 | my_EMD_mat <- compareSamples(my_phemdObj_final)
66 | cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
67 | printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE)
68 | dm <- plotGroupedSamplesDmap(my_EMD_mat, cluster_assignments, pt_sz=2)
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/man/Phemd-methods.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/Phemd-methods.R
  3 | \docType{methods}
  4 | \name{Phemd-methods}
  5 | \alias{Phemd-methods}
  6 | \alias{selectMarkers<-}
  7 | \alias{selectMarkers<-,Phemd-method}
  8 | \alias{Phemd,ANY,ANY-method}
  9 | \alias{rawExpn<-}
 10 | \alias{Phemd,character,ANY-method}
 11 | \alias{rawExpn<-,Phemd-method}
 12 | \alias{pooledCells<-}
 13 | \alias{pooledCells<-,Phemd-method}
 14 | \alias{subsampledIdx<-}
 15 | \alias{subsampledIdx<-,Phemd-method}
 16 | \alias{subsampledBool<-}
 17 | \alias{subsampledBool<-,Phemd-method}
 18 | \alias{monocleInfo<-}
 19 | \alias{monocleInfo<-,Phemd-method}
 20 | \alias{seuratInfo<-}
 21 | \alias{seuratInfo<-,Phemd-method}
 22 | \alias{phateInfo<-}
 23 | \alias{phateInfo<-,Phemd-method}
 24 | \alias{celltypeFreqs<-}
 25 | \alias{celltypeFreqs<-,Phemd-method}
 26 | \alias{batchIDs<-}
 27 | \alias{batchIDs<-,Phemd-method}
 28 | \alias{GDM<-}
 29 | \alias{GDM<-,Phemd-method}
 30 | \title{Setter function for protein / gene markers}
 31 | \usage{
 32 | selectMarkers(obj) <- value
 33 | 
 34 | \S4method{selectMarkers}{Phemd}(obj) <- value
 35 | 
 36 | rawExpn(obj) <- value
 37 | 
 38 | \S4method{rawExpn}{Phemd}(obj) <- value
 39 | 
 40 | pooledCells(obj) <- value
 41 | 
 42 | \S4method{pooledCells}{Phemd}(obj) <- value
 43 | 
 44 | subsampledIdx(obj) <- value
 45 | 
 46 | \S4method{subsampledIdx}{Phemd}(obj) <- value
 47 | 
 48 | subsampledBool(obj) <- value
 49 | 
 50 | \S4method{subsampledBool}{Phemd}(obj) <- value
 51 | 
 52 | monocleInfo(obj) <- value
 53 | 
 54 | \S4method{monocleInfo}{Phemd}(obj) <- value
 55 | 
 56 | seuratInfo(obj) <- value
 57 | 
 58 | \S4method{seuratInfo}{Phemd}(obj) <- value
 59 | 
 60 | phateInfo(obj) <- value
 61 | 
 62 | \S4method{phateInfo}{Phemd}(obj) <- value
 63 | 
 64 | celltypeFreqs(obj) <- value
 65 | 
 66 | \S4method{celltypeFreqs}{Phemd}(obj) <- value
 67 | 
 68 | batchIDs(obj) <- value
 69 | 
 70 | \S4method{batchIDs}{Phemd}(obj) <- value
 71 | 
 72 | GDM(obj) <- value
 73 | 
 74 | \S4method{GDM}{Phemd}(obj) <- value
 75 | }
 76 | \arguments{
 77 | \item{obj}{A Phemd object}
 78 | 
 79 | \item{value}{Assignment object}
 80 | }
 81 | \value{
 82 | Updated Phemd object
 83 | 
 84 | Updated Phemd object
 85 | 
 86 | Updated Phemd object
 87 | 
 88 | Updated Phemd object
 89 | 
 90 | Updated Phemd object
 91 | 
 92 | Updated Phemd object containing Seurat object
 93 | 
 94 | Updated Phemd object containing phate object
 95 | 
 96 | Updated Phemd object
 97 | 
 98 | Updated Phemd object
 99 | 
100 | Updated Phemd object
101 | }
102 | \description{
103 | Setter function for protein / gene markers
104 | 
105 | Setter function for stored expression data
106 | 
107 | Setter function for single-cell expression data aggregated from multiple samples
108 | 
109 | Setter function for indices of cells subsampled from each sample during aggregation
110 | 
111 | Setter function for boolean denoting whether cells were subsampled from each sample during aggregation
112 | 
113 | Setter function for Monocle2 CellDataSet object for experiment
114 | 
115 | Setter function for Seurat object for experiment
116 | 
117 | Setter function for phate object for experiment
118 | 
119 | Setter function for cell subtype frequencies of each single-cell sample
120 | 
121 | Setter function for batch IDs of each single-cell sample
122 | 
123 | Setter function for EMD ground distance matrix
124 | }
125 | \examples{
126 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
127 | new_genes <- all_genes
128 | new_genes[1] <- 'IL2R'
129 | selectMarkers(phemdObj) <- new_genes
130 | 
131 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
132 | new_expn_data <- all_expn_data
133 | new_expn_data <- lapply(new_expn_data, function(x) {log2(x+1)})
134 | rawExpn(phemdObj) <- new_expn_data
135 | 
136 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
137 | aggregated_data <- t(do.call(rbind,all_expn_data))
138 | pooledCells(phemdObj) <- aggregated_data
139 | 
140 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
141 | subsampledIdxList<- rep(list(1:10), length(all_expn_data)) #subsampled cells 1-10 from each sample
142 | subsampledIdx(phemdObj) <- subsampledIdxList
143 | 
144 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
145 | subsampledBool(phemdObj) <- TRUE
146 | 
147 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
148 | mydata <- pooledCells(phemdObj)
149 | myCellDataSet <- newCellDataSet(mydata,phenoData=NULL, expressionFamily=VGAM::negbinomial.size())
150 | monocleInfo(phemdObj) <- myCellDataSet
151 | 
152 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
153 | my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A")
154 | seuratInfo(phemdObj) <- my_seuratObj
155 | 
156 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
157 | #my_phateObj <- phateR::phate(all_expn_data[[1]])
158 | phateInfo(phemdObj) <- list()
159 | 
160 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
161 | myCellTypeFreqs <- matrix(rexp(length(all_expn_data)*10, rate=.1), ncol=10)
162 | myCellTypeFreqs <- apply(myCellTypeFreqs, 1, function(x) {x / sum(x)})
163 | celltypeFreqs(phemdObj) <- myCellTypeFreqs
164 | 
165 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
166 | my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A")
167 | seuratInfo(phemdObj) <- my_seuratObj
168 | batchIDs(phemdObj) <- rep('A', length(all_expn_data))
169 | 
170 | phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
171 | cluster_locs <- 1:10
172 | myGDM <- as.matrix(dist(cluster_locs))
173 | GDM(phemdObj) <- myGDM
174 | 
175 | }
176 | 


--------------------------------------------------------------------------------
/R/functions-getResults.R:
--------------------------------------------------------------------------------
  1 | #' @title Writes samples to file based on community detection group assignments
  2 | #' @description Takes vector of cluster assignments and phemdObj containing sample names and writes sample groups to file
  3 | #' @details Order of samples in obj@@snames is assumed to be the same as the order of group assignments in cluster_assignments
  4 | #' @param cluster_assignments Vector containing group assignments for each sample
  5 | #' @param obj phemdObj object containing sample names in @@snames slot
  6 | #' @param dest Path to existing directory where output should be saved
  7 | #' @param overwrite Boolean representing whether or not to overwrite contents of "dest" with output of printClusterAssignments
  8 | #' @return None
  9 | #' @examples
 10 | #' 
 11 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 12 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 13 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 14 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
 15 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
 16 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
 17 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
 18 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
 19 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
 20 | #' printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE)
 21 | #' 
 22 | printClusterAssignments <- function(cluster_assignments, obj, dest, overwrite=FALSE) {
 23 |   snames <- sNames(obj)
 24 |   if(dir.exists(paste(dest, 'sample_groups', sep='')) && overwrite==FALSE) {
 25 |     stop('Directory "sample_groups" already exists in specified path. Set "overwrite" parameter to TRUE if you want to overwrite existing directory')
 26 |   }
 27 |   unlink(paste(dest, 'sample_groups', sep=''), recursive=TRUE)
 28 |   dir.create(file.path(paste(dest, 'sample_groups', sep='')), showWarnings = FALSE) # create folder for output
 29 |   for(i in seq_len(max(cluster_assignments))) {
 30 |     cur_cluster_idx <- which(cluster_assignments == i)
 31 |     cur_file <- sprintf('sample_groups/scluster_%s.txt', intToUtf8(64+i))
 32 |     cur_file <- strcat(dest, cur_file)
 33 |     write(snames[cur_cluster_idx], file=cur_file, sep="\n")
 34 |   }
 35 | }
 36 | 
 37 | #' @title Gets cell subtype frequency histograms for each sample by cluster ID
 38 | #' @description Gets relative frequency ("weights") of cell subtypes ("bins" or "signatures") in each single-cell sample
 39 | #' @details \code{groupSamples} must be called before calling this function. Saves plots in directory called "individual_inhibs"
 40 | #' @param myobj phemdObj object containing cell subtype relative frequency in @@data_cluster_weights slot
 41 | #' @param cluster_assignments Vector containing group assignments for each sample in myobj
 42 | #' @param cell_model Method by which cell state was modeled (either "monocle2" or "seurat")
 43 | #' @return List of lists, with outer list representing sample cluster ID and inner list representing cell subtype frequencies of given sample
 44 | #' @examples
 45 | #' 
 46 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 47 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 48 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 49 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
 50 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
 51 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
 52 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
 53 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
 54 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
 55 | #' weights_by_cluster <- getSampleHistsByCluster(my_phemdObj_final, cluster_assignments)
 56 | #' 
 57 | getSampleHistsByCluster <- function(myobj, cluster_assignments, cell_model=c('monocle2', 'seurat')) {
 58 |   cell_model <- match.arg(cell_model, c('monocle2','seurat'))
 59 |   if(cell_model == 'monocle2') {
 60 |     monocle_obj <- monocleInfo(myobj)
 61 |     labels <- pData(phenoData(monocle_obj))
 62 |     state_labels <- as.numeric(labels$State)
 63 |   } else if(cell_model == 'seurat') {
 64 |     seurat_obj <- seuratInfo(myobj)
 65 |     state_labels <- as.numeric(as.character(Idents(seurat_obj)))
 66 |   } else {
 67 |     stop('Error: cell_model must either be "monocle2" or "seurat"')
 68 |   }
 69 |   
 70 |   cluster_weights <- celltypeFreqs(myobj)
 71 |   snames <- sNames(myobj)
 72 |   
 73 |   weights_by_cluster <- list()
 74 |   for(i in seq_len(max(cluster_assignments))) {
 75 |     cur_inhibs <- which(cluster_assignments == i)
 76 |     for(j in seq_len(length(cur_inhibs))) {
 77 |       cur_idx <- cur_inhibs[j]
 78 |       cur_sname <- snames[cur_idx]
 79 |       if(is.null(weights_by_cluster[[intToUtf8(64+i)]])) {
 80 |         weights_by_cluster[[intToUtf8(64+i)]] <- list()
 81 |       }
 82 |       weights_by_cluster[[intToUtf8(64+i)]][[cur_sname]] <- cluster_weights[cur_idx,]
 83 |     }
 84 |   }
 85 |   return(weights_by_cluster)
 86 | }
 87 | 
 88 | 
 89 | #' @title Gets cell yield of each sample as a table
 90 | #' @description Gets cell yield (number of viable cells) of each single-cell sample in decreasing order
 91 | #' @param myobj phemdObj object containing expression data for each sample in 'data' slot
 92 | #' @param cluster_assignments Vector of cluster assignments to be included as additional column in output table (optional)
 93 | #' @return Data frame representing cell yield of each sample
 94 | #' @examples
 95 | #' 
 96 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 97 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 98 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 99 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
100 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
101 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
102 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
103 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
104 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
105 | #' getCellYield(my_phemdObj_final, cluster_assignments)
106 | #' 
107 | getCellYield <- function(myobj, cluster_assignments=NULL) {
108 |   nsample <- length(rawExpn(myobj))
109 |   cell_yield <- vapply(rawExpn(myobj), nrow, integer(1L))
110 |   
111 |   order_idx <- order(cell_yield, decreasing=FALSE)
112 |   cell_yield_ordered <- cell_yield[order_idx]
113 |   snames_ordered <- sNames(myobj)[order_idx]
114 |   cell_yield_tab <- cbind.data.frame(snames_ordered, cell_yield_ordered)
115 |   colnames(cell_yield_tab) <- c('sample_ID', 'cell_yield')
116 |   if(!is.null(cluster_assignments)) {
117 |     cluster_assignments_reordered <- cluster_assignments[order_idx]
118 |     cell_yield_tab$cluster_ID <- vapply(cluster_assignments_reordered, function(x) intToUtf8(64+x), '')
119 |   }
120 |   return(cell_yield_tab)
121 | }
122 | 
123 | #' @title Returns cell subtype distribution for each sample as a table
124 | #' @description Returns cell subtype distribution for each single-cell sample along with (optional) final inhibitor cluster assignment
125 | #' @param myobj phemdObj object containing expression data for each sample in 'data' slot
126 | #' @param cluster_assignments Vector of cluster assignments to be included as additional column in output table (optional)
127 | #' @return Data frame representing relative frequencies of each cell subtype along with (optional) final inhibitor cluster assignment for each single-cell sample
128 | #' @examples
129 | #' 
130 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
131 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
132 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
133 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
134 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
135 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
136 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
137 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
138 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
139 | #' getSampleCelltypeFreqs(my_phemdObj_final, cluster_assignments)
140 | #' 
141 | getSampleCelltypeFreqs <- function(myobj, cluster_assignments=NULL) {
142 |   celltype_freqs <- as.data.frame(celltypeFreqs(myobj))
143 |   celltype_freqs <- round(celltype_freqs, digits=3)
144 |   colnames(celltype_freqs) <- paste('C-', seq_len(ncol(celltype_freqs)), sep='')
145 |   
146 |   if(!is.null(cluster_assignments)) {
147 |     celltype_freqs$Sample.Cluster.ID <- vapply(cluster_assignments, function(x) intToUtf8(64+x), '')
148 |   }
149 |   
150 |   weightsTab <- cbind.data.frame(Sample.Name=sNames(myobj), celltype_freqs)
151 |   return(weightsTab)
152 | }
153 | 


--------------------------------------------------------------------------------
/vignettes/phemd.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Tutorial on using PhEMD to analyze multi-sample single-cell experiments"
  3 | author: "William Chen"
  4 | date: "October 20, 2018"
  5 | output: BiocStyle::html_document
  6 | 
  7 | vignette: >
  8 |     %\VignetteEngine{knitr::rmarkdown}
  9 |     %\VignetteIndexEntry{PhEMD vignette}
 10 |     \usepackage[UTF-8]{inputenc}
 11 | ---
 12 | 
 13 | ```{r setup, include=FALSE}
 14 | knitr::opts_chunk$set(echo = TRUE)
 15 | ```
 16 | 
 17 | 
 18 | ## Overview
 19 | 
 20 | PhEMD is a package for comparing multiple heterogeneous single-cell samples to 
 21 | one another. It currently does so by first defining cell subtypes and relating 
 22 | them to one another using Monocle 2. It then computes Earth Mover's Distance 
 23 | (EMD) between each pair of samples, incorporating information about intrinsic 
 24 | cell subtype dissimilarity (i.e. manifold distance between cell subtypes) and 
 25 | differences between samples with respect to relative abundance of each cell 
 26 | subtype. PhEMD uses these pairwise distances to construct a network of 
 27 | single-cell samples that may be visually related in 2D or 3D using a diffusion 
 28 | map and partitioned to identify groups of similar samples.
 29 | 
 30 | ## 1. Installation
 31 | 
 32 | PhEMD requires R version >= 3.4.0 (recommended 3.5.0+), Bioconductor 
 33 | version >= 3.5 (recommended 3.7+), and Monocle 2 version >= 2.4.0 
 34 | (recommended 2.8.0)
 35 | 
 36 | ###Install from Bioconductor
 37 | 
 38 | ```{r echo=T, results = 'hide',message=F, warning=F, eval=F}
 39 | BiocManager::install("phemd")
 40 | ```
 41 | 
 42 | ###Install from Github (though direct installation from Bioconductor is preferred)
 43 |  
 44 | ```{r echo=T, results = 'hide',message=F, warning=F, eval=F}
 45 | library(devtools)
 46 | install_github("wschen/phemd")
 47 | ```
 48 | 
 49 | ###Load library after installation
 50 | 
 51 | ```{r echo=T, results = 'hide', message=F, warning=F}
 52 | library('phemd')
 53 | library('monocle')
 54 | ```
 55 | 
 56 | ## 2. Preparing data for cell state definition and embedding
 57 | 
 58 | PhEMD expects single-cell data to be represented as an R list of samples. Each 
 59 | sample (i.e. list element) is expected to be a matrix of dimension *num_cells* 
 60 | x *num_markers*, where markers may represent genes or cytometry protein markers.
 61 | For this vignette, we will be demonstrating our analysis pipeline on a melanoma 
 62 | dataset consisting of tumor-infiltrating immune cell scRNA-seq data (selected 
 63 | genes) that were log-transformed following TPM-normalization (first published 
 64 | by Tirosh et al., 2016).
 65 | 
 66 | We first start by creating a PhEMD data object, specifying the multi-sample 
 67 | expression data (R list), marker names (i.e. column names of the data matrices 
 68 | in the list of expression data), and sample names (in the order they appear in 
 69 | the list of expression data).
 70 | 
 71 | ```{r echo=T, results = 'hide', message=F, warning=F}
 72 | load('melanomaData.RData')
 73 | myobj <- createDataObj(all_expn_data, all_genes, as.character(snames))
 74 | ```
 75 | 
 76 | We can optionally remove samples in the PhEMD data object that have fewer than 
 77 | min_sz number of cells as follows:
 78 | 
 79 | ```{r echo=T}
 80 | myobj <- removeTinySamples(myobj, min_sz = 20)
 81 | ```
 82 | 
 83 | Note that samples that don't meet the meet the minimum cell yield criteria are 
 84 | removed from rawExpn(myobj) and from the list of sample names in 
 85 | sampleNames(myobj).
 86 | 
 87 | Next, aggregate data from all samples into a matrix that is stored in the PhEMD 
 88 | data object (in slot 'data_aggregate'). This aggregated data will then be used 
 89 | for initial cell subtype definition and embedding. If there are more cells 
 90 | collectively in all samples than max_cells, an equal number of cells from each 
 91 | sample will be subsampled and stored in pooledCells(myobj).
 92 | 
 93 | ```{r echo=T, results = 'hide'}
 94 | myobj <- aggregateSamples(myobj, max_cells=12000)
 95 | ```
 96 | 
 97 | ## 3. Generate Monocle 2 cell embedding with cell state definitions
 98 | 
 99 | Now that we have aggregated single-cell data from all samples, we are ready to 
100 | perform cell subtype definition and dimensionality reduction to visually and 
101 | spatially relate cells and cell subtypes. For this, we use Monocle 2. Before we 
102 | begin, we first perform feature selection by selecting 44 important genes. 
103 | Suggestions on how to choose important genes can be found here: http://cole-trapnell-lab.github.io/monocle-release/docs/#trajectory-step-1-choose-genes-that-define-a-cell-s-progress
104 | 
105 | ```{r echo=T, results = 'hide'}
106 | myobj <- selectFeatures(myobj, selected_genes)
107 | ```
108 | 
109 | We are now ready to generate a Monocle 2 embedding. Our *embedCells()* function
110 | is a wrapper function for the *reduceDimension()* function in Monocle 2. For our
111 | example dataset, we specify the expression distribution model as *'gaussianff'* 
112 | as is recommended in the Monocle 2 tutorial for log-transformed scRNA-seq TPM 
113 | values (http://cole-trapnell-lab.github.io/monocle-release/docs/#choosing-a-distribution-for-your-data-required). 
114 | *'negbinomial_sz'* is the recommended data type for most unnormalized scRNA-seq 
115 | data (raw read counts) and *'gaussianff'* is recommended for log-transformed 
116 | data or arcsin-transformed mass cytometry data. See above link for more details.
117 | 
118 | Additional parameters may be passed to Monocle 2 *reduceDimension()* as optional
119 | named parameters in *embed_cells()*. We found that Monocle 2 is robust to a 
120 | range of parameters. Sigma can be thought of as a "noise" parameter and we 
121 | empirically found that sigma in the range of [0.01, 0.1] often works well for
122 | log-transformed scRNA-seq data or normalized CyTOF data. Greater values of sigma
123 | generally result in fewer total number of clusters. See Monocle 2 publication 
124 | (Qiu et al., 2017) for additional details on parameter selection.
125 | 
126 | ```{r echo=T, results = 'hide', message=F, warning=F}
127 | # generate 2D cell embedding and cell subtype assignments
128 | myobj <- embedCells(myobj, data_model = 'gaussianff', pseudo_expr=0, sigma=0.02)
129 | # generate pseudotime ordering of cells
130 | myobj <- orderCellsMonocle(myobj)
131 | ```
132 | 
133 | The result of the code above is a Monocle 2 object stored in 
134 | *monocleInfo(myobj)*. This object contains cell subtype and pseudotime 
135 | assignments for each cell in the aggregated data matrix (stored in 
136 | *pooledCells(myobj)*). A 2D embedding of these cells has also been generated.
137 | We can visualize the embedding by writing them to file in this way:
138 | 
139 | ```{r echo=T, results = 'hide', fig.width=8, fig.height=4}
140 | cmap <- plotEmbeddings(myobj, cell_model='monocle2')
141 | ```
142 | 
143 | To visualize the expression profiles of the cell subtypes, we can plot a heatmap
144 | and save to file as such:
145 | 
146 | ```{r echo=T, results = 'hide', fig.width=8, fig.height=6}
147 | plotHeatmaps(myobj, cell_model='monocle2', selected_genes=heatmap_genes)
148 | ```
149 | 
150 | ## 4. Deconvolute single-cell samples and compare using Earth Mover's Distance
151 | 
152 | Now that we have identified a comprehensive set of cell subtypes across all 
153 | single-cell samples and related them in a low-dimensional embedding by 
154 | aggregating cells from all samples, we want to perform deconvolution to 
155 | determine the abundance of each cell subtype on a per sample basis. To do so, 
156 | we call this function:
157 | 
158 | ```{r echo=T, results = 'hide'}
159 | # Determine cell subtype breakdown of each sample
160 | myobj <- clusterIndividualSamples(myobj)
161 | ```
162 | 
163 | The results of this process are stored in *celltypeFreqs(myobj)*. Row 
164 | *i* column *j* represents the fraction of all cells in sample *i* assigned to 
165 | cell subtype *j*.
166 | 
167 | To compare single-cell samples, we use Earth Mover's Distance, which is a metric
168 | that takes into account both the difference in relative frequencies of matching
169 | cell subtypes (e.g. % of all cells in each sample that are CD8+ T-cells) and the
170 | dissimilarity of the cell subtypes themselves (e.g. intrinsic dissimilarity 
171 | between CD8+ and CD4+ T-cells). To compute the intrinsic dissimilarity between
172 | cell subtypes, we call the following function:
173 | 
174 | ```{r echo=T, results = 'hide'}
175 | # Determine (dis)similarity of different cell subtypes
176 | myobj <- generateGDM(myobj)
177 | ```
178 | 
179 | *generateGDM()* stores the pairwise dissimilarity (i.e. "ground-distance" or
180 | "tree-distance") between cell subtypes in *GDM(myobj)*.
181 | 
182 | We are now ready to compare single-cell samples using EMD. To do so, we simply
183 | call the function *compareSamples()*:
184 | 
185 | ```{r echo=T, results = 'hide'}
186 | # Perform inter-sample comparisons using EMD
187 | my_distmat <- compareSamples(myobj)
188 | ```
189 | 
190 | *compareSamples()* returns a distance matrix representing the pairwise EMD 
191 | between single-cell samples; *my_distmat[i,j]* represents the dissimilarity 
192 | between samples *i* and *j* (i.e. samples represented by rows *i* and *j* in 
193 | celltypeFreqs(myobj)). We can use this distance matrix to identify similar 
194 | groups of samples as such:
195 | 
196 | ```{r echo=T, results = 'hide'}
197 | ## Identify similar groups of inhibitors
198 | group_assignments <- groupSamples(my_distmat, distfun = 'hclust', ncluster=5)
199 | ```
200 | 
201 | ## 5. Visualize single-cell samples based on PhEMD-based similarity
202 | 
203 | We can also use the PhEMD-based distance matrix to generate an embedding of 
204 | single-cell samples, colored by group assignments.
205 | 
206 | ```{r echo=T, results = 'hide', fig.width=8, fig.height=7, fig.keep='none'}
207 | dmap_obj <- plotGroupedSamplesDmap(my_distmat, group_assignments, pt_sz = 1.5)
208 | ```
209 | 
210 | ```{r echo=F, results = 'hide', fig.keep='none'}
211 | plotGroupedSamplesDmap(my_distmat, group_assignments, pt_sz = 1.5)
212 | ```
213 | 
214 | ```{r echo=F, results = 'hide', fig.width=8, fig.height=7}
215 | plotGroupedSamplesDmap(my_distmat, group_assignments, pt_sz = 1.5)
216 | ```
217 | 
218 | To retrieve the cell subtype distribution on a per-sample basis, use the 
219 | following function. Histograms can be subsequently plotted for a given sample
220 | as desired.
221 | 
222 | ```{r echo=T, results = 'hide'}
223 | # Plot cell subtype distribution (histogram) for each sample
224 | sample.cellfreqs <- getSampleHistsByCluster(myobj, group_assignments, cell_model='monocle2')
225 | ```
226 | 
227 | To plot cell subtype histograms summarizing groups of similar samples (bin-wise
228 | mean of each cell subtype across all samples assigned to a particular group), 
229 | use the following plotting function:
230 | 
231 | ```{r echo=T, results = 'hide', fig.width=10, fig.height=2.5}
232 | # Plot representative cell subtype distribution for each group of samples
233 | plotSummaryHistograms(myobj, group_assignments, cell_model='monocle2', cmap, 
234 |                       ncol.plot = 5, ax.lab.sz=1.3, title.sz=2)
235 | ```
236 | 
237 | To plot cell yield of each samples as a barplot, use the following function:
238 | 
239 | ```{r echo=T, results='hide', fig.width=8, fig.height=5.5}
240 | # Plot cell yield of each experimental condition
241 | plotCellYield(myobj, group_assignments, font_sz = 0.7, w=8, h=5)
242 | ```
243 | 
244 | ```{r echo=T}
245 | sessionInfo()
246 | ```
247 | 


--------------------------------------------------------------------------------
/R/Phemd-methods.R:
--------------------------------------------------------------------------------
  1 | ##################
  2 | # Class constructor
  3 | #################
  4 | 
  5 | #' Phemd class
  6 | #'
  7 | #' The main PhEMD class to store single-cell expression data.
  8 | #' @field data List of matrices, each of which represents a single-cell sample (num_cells x num_genes)
  9 | #' @field markers Column names (e.g. genes) for each element (i.e. data matrix) in "data"
 10 | #' @field snames Sample ID for each element in "data"
 11 | #' @field data_aggregate Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells)
 12 | #' @field data_subsample_idx List of vectors each representing the indices of elements in "data" that were subsampled and combined to form "data_aggregate"
 13 | #' @field subsampled_bool Boolean represent whether or not subsampling was performed in the data aggregation process
 14 | #' @field monocle_obj Data object of type "CellDataSet" that is the core Monocle data structure
 15 | #' @field data_cluster_weights Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes)
 16 | #' @field emd_dist_mat Matrix representing pairwise distances between each pair of cell subtypes
 17 | #' @field seurat_obj Object of class "Seurat" that is the core Seurat data structure
 18 | #' @field phate_obj Object of class "phate" that is the core PHATE data structure
 19 | #' @field experiment_ids Vector of length num_samples representing the experiment (batch) in which the sample was profiled
 20 | #' @name Phemd
 21 | #' @rdname Phemd
 22 | #' @aliases Phemd-class
 23 | #' @exportClass Phemd
 24 | #' @importClassesFrom Seurat Seurat seurat
 25 | 
 26 | setClassUnion("CDSorNULL",members=c('CellDataSet', "NULL"))
 27 | setClassUnion("SeuratorNULL",members=c('Seurat', "NULL"))
 28 | setClass("Phemd",
 29 |          contains=c('CellDataSet', 'Seurat'),
 30 |          slots=c(data = "list",
 31 |                  markers = "character",
 32 |                  snames = "character",
 33 |                  data_aggregate = "matrix",
 34 |                  data_subsample_idx = "list",
 35 |                  subsampled_bool = "logical",
 36 |                  monocle_obj = "CDSorNULL",
 37 |                  data_cluster_weights = "matrix",
 38 |                  emd_dist_mat = "matrix",
 39 |                  seurat_obj = "SeuratorNULL",
 40 |                  phate_obj = "list",
 41 |                  cellstate_assignments = "list",
 42 |                  experiment_ids = "character", 
 43 |                  version='package_version'))
 44 | 
 45 | ###########################
 46 | # Methods for Phemd class
 47 | ##########################
 48 | #' @name Phemd-methods
 49 | #' @docType methods
 50 | #' @rdname Phemd-methods
 51 | #' 
 52 | setValidity("Phemd", function(object) {
 53 |   if(length(rawExpn(object)) < 1) {
 54 |     return('Phemd object must have at least 1 sample in rawExpn(object)')
 55 |   }
 56 |   if(length(sNames(object)) != length(rawExpn(object))) {
 57 |     return('sNames(object) must be the same length as rawExpn(object)')
 58 |   }
 59 |   if(sum(dim(pooledCells(object))) == 0 && ncol(rawExpn(object)[[1]]) != length(selectMarkers(object))) {
 60 |     return('Number of markers measured in rawExpn(object) must equal number of markers listed in selectMarkers(object)')
 61 |   }
 62 |   if(sum(dim(pooledCells(object))) != 0 && nrow(pooledCells(object)) != length(selectMarkers(object))) {
 63 |     return('Number of markers measured in pooledCells(object) must equal number of markers listed in selectMarkers(object)')
 64 |   }
 65 |   return(TRUE)  
 66 | })
 67 | 
 68 | 
 69 | ##################
 70 | # Accessor functions
 71 | ###################
 72 | 
 73 | #' Accessor function for stored multi-sample raw expression data
 74 | #' 
 75 | #' @param obj A Phemd object.
 76 | #' @return List of matrices, each of which represents a single-cell sample
 77 | #' @export
 78 | #' @examples
 79 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 80 | #' raw_expn_data <- rawExpn(phemdObj)
 81 | #' 
 82 | rawExpn <- function(obj) {
 83 |   stopifnot(is(obj,"Phemd"))
 84 |   obj@data
 85 | }
 86 | 
 87 | 
 88 | #' Accessor function for stored Monocle object
 89 | #' 
 90 | #' @param obj A Phemd object.
 91 | #' @return An object of class 'CellDataSet' (from Monocle)
 92 | #' @export
 93 | #' @examples
 94 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 95 | #' monocle_obj <- monocleInfo(phemdObj)
 96 | #' 
 97 | monocleInfo <- function(obj) {
 98 |   stopifnot(is(obj,"Phemd"))
 99 |   obj@monocle_obj
100 | }
101 | 
102 | #' Accessor function for stored Seurat object within Phemd object
103 | #' 
104 | #' @param obj A Phemd object.
105 | #' @return An object of class 'Seurat'
106 | #' @export
107 | #' @examples
108 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
109 | #' seurat_obj <- seuratInfo(phemdObj)
110 | #' 
111 | seuratInfo <- function(obj) {
112 |   stopifnot(is(obj,"Phemd"))
113 |   obj@seurat_obj
114 | }
115 | 
116 | #' Accessor function for stored phate object
117 | #' 
118 | #' @param obj A Phemd object.
119 | #' @return An object of class 'phate' (from phateR)
120 | #' @export
121 | #' @examples
122 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
123 | #' phateobj <- phateInfo(phemdObj)
124 | #' 
125 | phateInfo <- function(obj) {
126 |     stopifnot(is(obj,"Phemd"))
127 |     obj@phate_obj
128 | }
129 | 
130 | #' Accessor function for EMD ground distance matrix
131 | #' 
132 | #' @param obj A Phemd object
133 | #' @return Sqaure matrix representing pairwise distances between cell subtypes
134 | #' @export
135 | #' @examples
136 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
137 | #' gdm <- GDM(phemdObj)
138 | #' 
139 | GDM <- function(obj) {
140 |   stopifnot(is(obj,"Phemd"))
141 |   obj@emd_dist_mat
142 | }
143 | 
144 | #' Accessor function for gene/protein markers measured in experiment
145 | #' 
146 | #' @param obj Phemd object
147 | #' @return Vector representing gene/protein markers corresponding to expression matrices
148 | #' @export
149 | #' @examples
150 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
151 | #' genes <- selectMarkers(phemdObj)
152 | #' 
153 | selectMarkers <- function(obj) {
154 |   stopifnot(is(obj,"Phemd"))
155 |   obj@markers
156 | }
157 | 
158 | #' Accessor function for identifiers of all single-cell samples in experiment
159 | #' 
160 | #' @param obj Phemd object
161 | #' @return Vector representing sample names corresponding to expression matrices
162 | #' @export
163 | #' @examples
164 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
165 | #' sampleIDs <- sNames(phemdObj)
166 | #' 
167 | sNames <- function(obj) {
168 |   stopifnot(is(obj,"Phemd"))
169 |   obj@snames
170 | }
171 | 
172 | 
173 | #' Accessor function for aggregated cells used for cell subtype definition
174 | #' 
175 | #' @param obj Phemd object
176 | #' @return Numeric matrix representing expression data for cells from all experimental conditions (rows = markers, cols = cells)
177 | #' @export
178 | #' @examples
179 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
180 | #' cells_aggregated <- pooledCells(phemdObj)
181 | #' 
182 | pooledCells <- function(obj) {
183 |   stopifnot(is(obj,"Phemd"))
184 |   obj@data_aggregate
185 | }
186 | 
187 | #' Accessor function for aggregated cells used for cell subtype definition
188 | #' 
189 | #' @param obj Phemd object
190 | #' @return List of vectors each representing the indices of elements in rawExpn(obj) that were subsampled and combined to form "data_aggregate"
191 | #' @export
192 | #' @examples
193 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
194 | #' subsampled_idx_list <- subsampledIdx(phemdObj)
195 | #' 
196 | subsampledIdx <- function(obj) {
197 |   stopifnot(is(obj,"Phemd"))
198 |   obj@data_subsample_idx
199 | }
200 | 
201 | #' Accessor function for whether or not cells were subsampled when aggregated for cell subtype analysis
202 | #' 
203 | #' @param obj Phemd object
204 | #' @return Boolean represent whether or not subsampling was performed in the data aggregation process
205 | #' @export
206 | #' @examples
207 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
208 | #' subsampled <- subsampledBool(phemdObj)
209 | #' 
210 | subsampledBool <- function(obj) {
211 |   stopifnot(is(obj,"Phemd"))
212 |   obj@subsampled_bool
213 | }
214 | 
215 | #' Accessor function for cell subtype distribution for each sample
216 | #' 
217 | #' @param obj Phemd object
218 | #' @return Matrix representing cell subtype relative frequencies for each sample (num_samples x num_genes)
219 | #' @export
220 | #' @examples
221 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
222 | #' celltype_weights <- celltypeFreqs(phemdObj)
223 | #' 
224 | celltypeFreqs <- function(obj) {
225 |   stopifnot(is(obj,"Phemd"))
226 |   obj@data_cluster_weights
227 | }
228 | 
229 | #' Accessor function for batch ID for each sample
230 | #' 
231 | #' @param obj Phemd object
232 | #' @return Vector of length num_samples representing the experiment (batch) in which the sample was profiled
233 | #' @export
234 | #' @examples
235 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
236 | #' batch_metadata <- batchIDs(phemdObj)
237 | #' 
238 | batchIDs <- function(obj) {
239 |   stopifnot(is(obj,"Phemd"))
240 |   obj@experiment_ids
241 | }
242 | 
243 | ##################
244 | # Setter functions
245 | ###################
246 | 
247 | 
248 | #' Setter function for protein / gene markers
249 | #' 
250 | #' @rdname Phemd-methods
251 | #' @docType methods
252 | #' @return Updated Phemd object
253 | #' @export
254 | #' @examples
255 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
256 | #' new_genes <- all_genes
257 | #' new_genes[1] <- 'IL2R'
258 | #' selectMarkers(phemdObj) <- new_genes
259 | #' 
260 | setGeneric("selectMarkers<-", function(obj, value) standardGeneric("selectMarkers<-"))
261 | 
262 | #' @rdname Phemd-methods
263 | #' @aliases Phemd,ANY,ANY-method
264 | setMethod("selectMarkers<-", "Phemd", function(obj, value) {
265 |   obj@markers <- value
266 |   validObject(obj)
267 |   obj
268 | })
269 | 
270 | #' Setter function for stored expression data
271 | #' 
272 | #' @rdname Phemd-methods
273 | #' @aliases Phemd,character,ANY-method
274 | #' @export
275 | #' @examples
276 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
277 | #' new_expn_data <- all_expn_data
278 | #' new_expn_data <- lapply(new_expn_data, function(x) {log2(x+1)})
279 | #' rawExpn(phemdObj) <- new_expn_data
280 | #'
281 | setGeneric("rawExpn<-", function(obj, value) standardGeneric("rawExpn<-"))
282 | 
283 | #' @rdname Phemd-methods
284 | #' @aliases Phemd,ANY,ANY-method
285 | setMethod("rawExpn<-", "Phemd", function(obj, value) {
286 |   obj@data <- value
287 |   validObject(obj)
288 |   obj
289 | })
290 | 
291 | #' Setter function for single-cell expression data aggregated from multiple samples
292 | #' 
293 | #' @rdname Phemd-methods
294 | #' @docType methods
295 | #' @return Updated Phemd object
296 | #' @export
297 | #' @examples
298 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
299 | #' aggregated_data <- t(do.call(rbind,all_expn_data))
300 | #' pooledCells(phemdObj) <- aggregated_data
301 | #' 
302 | setGeneric("pooledCells<-", function(obj, value) standardGeneric("pooledCells<-"))
303 | 
304 | #' @rdname Phemd-methods
305 | #' @aliases Phemd,ANY,ANY-method
306 | setMethod("pooledCells<-", "Phemd", function(obj, value) {
307 |   obj@data_aggregate <- value
308 |   validObject(obj)
309 |   obj
310 | })
311 | 
312 | #' Setter function for indices of cells subsampled from each sample during aggregation
313 | #' 
314 | #' @rdname Phemd-methods
315 | #' @docType methods
316 | #' @return Updated Phemd object
317 | #' @export
318 | #' @examples
319 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
320 | #' subsampledIdxList<- rep(list(1:10), length(all_expn_data)) #subsampled cells 1-10 from each sample
321 | #' subsampledIdx(phemdObj) <- subsampledIdxList
322 | #' 
323 | setGeneric("subsampledIdx<-", function(obj, value) standardGeneric("subsampledIdx<-"))
324 | 
325 | #' @rdname Phemd-methods
326 | #' @aliases Phemd,ANY,ANY-method
327 | setMethod("subsampledIdx<-", "Phemd", function(obj, value) {
328 |   obj@data_subsample_idx <- value
329 |   validObject(obj)
330 |   obj
331 | })
332 | 
333 | #' Setter function for boolean denoting whether cells were subsampled from each sample during aggregation
334 | #' 
335 | #' @rdname Phemd-methods
336 | #' @docType methods
337 | #' @return Updated Phemd object
338 | #' @export
339 | #' @examples
340 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
341 | #' subsampledBool(phemdObj) <- TRUE
342 | #' 
343 | setGeneric("subsampledBool<-", function(obj, value) standardGeneric("subsampledBool<-"))
344 | 
345 | #' @rdname Phemd-methods
346 | #' @aliases Phemd,ANY,ANY-method
347 | setMethod("subsampledBool<-", "Phemd", function(obj, value) {
348 |   obj@subsampled_bool <- value
349 |   validObject(obj)
350 |   obj
351 | })
352 | 
353 | #' Setter function for Monocle2 CellDataSet object for experiment
354 | #' 
355 | #' @rdname Phemd-methods
356 | #' @param obj A Phemd object
357 | #' @param value Assignment object
358 | #' @return Updated Phemd object
359 | #' @export
360 | #' @examples
361 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
362 | #' mydata <- pooledCells(phemdObj)
363 | #' myCellDataSet <- newCellDataSet(mydata,phenoData=NULL, expressionFamily=VGAM::negbinomial.size())
364 | #' monocleInfo(phemdObj) <- myCellDataSet
365 | #' 
366 | setGeneric("monocleInfo<-", function(obj, value) standardGeneric("monocleInfo<-"))
367 | 
368 | #' @rdname Phemd-methods
369 | #' @aliases Phemd,ANY,ANY-method
370 | setMethod("monocleInfo<-", "Phemd", function(obj, value) {
371 |   obj@monocle_obj <- value
372 |   validObject(obj)
373 |   obj
374 | })
375 | 
376 | #' Setter function for Seurat object for experiment
377 | #' 
378 | #' @rdname Phemd-methods
379 | #' @docType methods
380 | #' @return Updated Phemd object containing Seurat object
381 | #' @export
382 | #' @examples
383 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
384 | #' my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A")
385 | #' seuratInfo(phemdObj) <- my_seuratObj
386 | #' 
387 | setGeneric("seuratInfo<-", function(obj, value) standardGeneric("seuratInfo<-"))
388 | 
389 | #' @rdname Phemd-methods
390 | #' @aliases Phemd,ANY,ANY-method
391 | setMethod("seuratInfo<-", "Phemd", function(obj, value) {
392 |   obj@seurat_obj <- value
393 |   validObject(obj)
394 |   obj
395 | })
396 | 
397 | 
398 | #' Setter function for phate object for experiment
399 | #' 
400 | #' @rdname Phemd-methods
401 | #' @docType methods
402 | #' @return Updated Phemd object containing phate object
403 | #' @export
404 | #' @examples
405 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
406 | #' #my_phateObj <- phateR::phate(all_expn_data[[1]])
407 | #' phateInfo(phemdObj) <- list()
408 | #' 
409 | setGeneric("phateInfo<-", function(obj, value) standardGeneric("phateInfo<-"))
410 | 
411 | #' @rdname Phemd-methods
412 | #' @aliases Phemd,ANY,ANY-method
413 | setMethod("phateInfo<-", "Phemd", function(obj, value) {
414 |     obj@phate_obj <- value
415 |     validObject(obj)
416 |     obj
417 | })
418 | 
419 | #' Setter function for cell subtype frequencies of each single-cell sample
420 | #' 
421 | #' @rdname Phemd-methods
422 | #' @docType methods
423 | #' @return Updated Phemd object
424 | #' @export
425 | #' @examples
426 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
427 | #' myCellTypeFreqs <- matrix(rexp(length(all_expn_data)*10, rate=.1), ncol=10)
428 | #' myCellTypeFreqs <- apply(myCellTypeFreqs, 1, function(x) {x / sum(x)})
429 | #' celltypeFreqs(phemdObj) <- myCellTypeFreqs
430 | #' 
431 | setGeneric("celltypeFreqs<-", function(obj, value) standardGeneric("celltypeFreqs<-"))
432 | 
433 | #' @rdname Phemd-methods
434 | #' @aliases Phemd,ANY,ANY-method
435 | setMethod("celltypeFreqs<-", "Phemd", function(obj, value) {
436 |   obj@data_cluster_weights <- value
437 |   validObject(obj)
438 |   obj
439 | })
440 | 
441 | #' Setter function for batch IDs of each single-cell sample
442 | #' 
443 | #' @rdname Phemd-methods
444 | #' @docType methods
445 | #' @return Updated Phemd object
446 | #' @export
447 | #' @examples
448 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
449 | #' my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A")
450 | #' seuratInfo(phemdObj) <- my_seuratObj
451 | #' batchIDs(phemdObj) <- rep('A', length(all_expn_data))
452 | #' 
453 | setGeneric("batchIDs<-", function(obj, value) standardGeneric("batchIDs<-"))
454 | 
455 | #' @rdname Phemd-methods
456 | #' @aliases Phemd,ANY,ANY-method
457 | setMethod("batchIDs<-", "Phemd", function(obj, value) {
458 |   obj@experiment_ids <- value
459 |   validObject(obj)
460 |   obj
461 | })
462 | 
463 | #' Setter function for EMD ground distance matrix
464 | #' 
465 | #' @rdname Phemd-methods
466 | #' @docType methods
467 | #' @return Updated Phemd object
468 | #' @export
469 | #' @examples
470 | #' phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
471 | #' cluster_locs <- 1:10
472 | #' myGDM <- as.matrix(dist(cluster_locs))
473 | #' GDM(phemdObj) <- myGDM
474 | #' 
475 | setGeneric("GDM<-", function(obj, value) standardGeneric("GDM<-"))
476 | 
477 | #' @rdname Phemd-methods
478 | #' @aliases Phemd,ANY,ANY-method
479 | setMethod("GDM<-", "Phemd", function(obj, value) {
480 |   obj@emd_dist_mat <- value
481 |   validObject(obj)
482 |   obj
483 | })
484 | 
485 | 


--------------------------------------------------------------------------------
/R/functions-plotting.R:
--------------------------------------------------------------------------------
  1 | #' @title Plots Monocle2 cell embedding plots
  2 | #' @description Takes as input a Phemd object containing either a Monocle2 object or Seurat object (already embedded and ordered) and plots cell embedding plots side by side. Optionally saves to specified folder.
  3 | #' @details \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Required additional packages: 'RColorBrewer', 'cowplot'
  4 | #' @param obj 'Phemd' object containing Monocle 2 object
  5 | #' @param cell_model Method by which cell state was modeled (either "monocle2", "seurat", or "phate)
  6 | #' @param cmap User-specified colormap to use to color cell state embedding (optional)
  7 | #' @param w Width of plot in inches
  8 | #' @param h Height of plot in inches
  9 | #' @param pt_sz Scalar factor for point size
 10 | #' @param ndims Number of dimensions to use for dimensionality reduction in case it hasn't been performed yet (only relevant when using Seurat data as input)
 11 | #' @return Colormap (vector of colors) used to color Monocle2 cell state embedding
 12 | #' @examples
 13 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 14 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 15 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 16 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model='gaussianff', sigma=0.02, maxIter=2)
 17 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
 18 | #' cmap <- plotEmbeddings(my_phemdObj_monocle)
 19 | plotEmbeddings <- function(obj, cell_model=c('monocle2', 'seurat', 'phate'), cmap=NULL, w=4, h=5, pt_sz=1, ndims=NULL) {
 20 |     saved_palette <- palette()
 21 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
 22 |     if(cell_model == 'monocle2') {
 23 |         monocle_obj <- monocleInfo(obj)
 24 |         cell_embedding <- reducedDimS(monocle_obj)
 25 |         cell_embedding_t <- as.data.frame(t(cell_embedding))
 26 |         mydata <- pooledCells(obj)
 27 |         
 28 |         # Extract state labels from monocle data object
 29 |         labels <- pData(phenoData(monocle_obj))
 30 |         state_labels <- as.numeric(labels$State)
 31 |         
 32 |         levels <- levels(factor(state_labels))
 33 |         levels_renamed <- vapply(levels, function(x) paste("C-", x, sep=""), "")
 34 |         
 35 |         if(is.null(cmap)) {
 36 |             getPalette <- colorRampPalette(brewer.pal(11, "Spectral"))
 37 |             cmap <- getPalette(max(state_labels))
 38 |             if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey
 39 |             cmap <- sample(cmap)
 40 |         }
 41 |         palette(cmap)
 42 |         
 43 |         # visualize traj colored by state
 44 |         myplot_state <- ggplot(cell_embedding_t, aes(x=cell_embedding_t[,1], y=cell_embedding_t[,2], color=factor(state_labels))) +
 45 |         geom_point(size=0.4) +
 46 |         scale_color_manual(labels = levels_renamed,
 47 |         values = cmap) +
 48 |         guides(colour = guide_legend(override.aes = list(size=2))) +
 49 |         labs(x="", y = "", color = "Cell subtype") +
 50 |         theme_classic() +
 51 |         theme(axis.title.x=element_blank(),
 52 |         axis.text.x=element_blank(),
 53 |         axis.ticks.x=element_blank(),
 54 |         axis.title.y=element_blank(),
 55 |         axis.text.y=element_blank(),
 56 |         axis.ticks.y=element_blank(),
 57 |         axis.line = element_line(colour = "black",
 58 |         size = 1, linetype = "solid"))
 59 |         
 60 |         # visualize traj colored by pseudotime
 61 |         ncolor <- 9
 62 |         palette(brewer.pal(ncolor, "Blues"))
 63 |         
 64 |         col.labels <- labels$Pseudotime
 65 |         
 66 |         # visualize traj colored by pseudotime
 67 |         myplot_pt <- ggplot(cell_embedding_t, aes(x=cell_embedding_t[,1], y=cell_embedding_t[,2], color=col.labels)) +
 68 |         geom_point(size=0.4) +
 69 |         labs(x="", y = "", color = "Pseudotime") +
 70 |         theme_classic() +
 71 |         theme(axis.title.x=element_blank(),
 72 |         axis.text.x=element_blank(),
 73 |         axis.ticks.x=element_blank(),
 74 |         axis.title.y=element_blank(),
 75 |         axis.text.y=element_blank(),
 76 |         axis.ticks.y=element_blank(),
 77 |         axis.line = element_line(colour = "black",
 78 |         size = 1, linetype = "solid"))
 79 |         
 80 |         print(plot_grid(myplot_state, myplot_pt, ncol=2))
 81 |         return(cmap)
 82 |     } else if(cell_model == 'seurat') {
 83 |         seurat_obj <- seuratInfo(obj)
 84 |         
 85 |         if(!'tsne' %in% names(seurat_obj@dr)) {
 86 |             print('Running t-SNE...')
 87 |             if(is.null(ndims)) ndims <- 10
 88 |             seurat_obj <- RunTSNE(seurat_obj, reduction.use="cca.aligned", dims.use=seq_len(ndims))
 89 |         }
 90 |         
 91 |         # define color map
 92 |         if(is.null(cmap)) {
 93 |             getPalette <- colorRampPalette(brewer.pal(11, "Spectral"))
 94 |             cmap <- getPalette(max(as.numeric(seurat_obj@ident)))
 95 |             if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey
 96 |             cmap <- sample(cmap)
 97 |         }
 98 |         
 99 |         TSNEPlot(seurat_obj, do.label=FALSE, pt.size=pt_sz, colors.use=cmap)
100 |         
101 |     } else if(cell_model == 'phate') {
102 |         # Extract PHATE embedding
103 |         phate_obj <- phateInfo(obj)
104 |         cell_embedding_t <- as.data.frame(phate_obj$embedding)
105 |         state_labels <- phate_obj$cellstate.labels
106 |         
107 |         levels <- levels(factor(state_labels))
108 |         levels_renamed <- vapply(levels, function(x) paste("C-", x, sep=""), "")
109 |         
110 |         if(is.null(cmap)) {
111 |             getPalette <- colorRampPalette(brewer.pal(11, "Spectral"))
112 |             cmap <- getPalette(max(state_labels))
113 |             if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey
114 |             cmap <- sample(cmap)
115 |         }
116 |         palette(cmap)
117 |         
118 |         # visualize traj colored by state
119 |         myplot_state <- ggplot(cell_embedding_t, aes(x=cell_embedding_t[,1], y=cell_embedding_t[,2], color=factor(state_labels))) +
120 |             geom_point(size=0.4) +
121 |             scale_color_manual(labels = levels_renamed,
122 |                                values = cmap) +
123 |             guides(colour = guide_legend(override.aes = list(size=2))) +
124 |             labs(x="", y = "", color = "Cell subtype") +
125 |             theme_classic() +
126 |             theme(axis.title.x=element_blank(),
127 |                   axis.text.x=element_blank(),
128 |                   axis.ticks.x=element_blank(),
129 |                   axis.title.y=element_blank(),
130 |                   axis.text.y=element_blank(),
131 |                   axis.ticks.y=element_blank(),
132 |                   axis.line = element_line(colour = "black",
133 |                                            size = 1, linetype = "solid"))
134 |         print(myplot_state)
135 |     } else {
136 |         stop('Error: cell_model must be either "monocle2", "seurat", or "phate"')
137 |     }
138 |     palette(saved_palette)
139 |     return(cmap)
140 | }
141 | 
142 | #' @title Plot heatmap of cell subtypes
143 | #' @description Takes as input a Phemd object containing either a Monocle2, Seurat, or PHATE object (already embedded and clustered) and plots heatmap characterizing cell subtypes
144 | #' @details \code{embedCells} (and \code{orderCellsMonocle} if using Monocle2) need to be called before calling this function. Required additional package: 'pheatmap'
145 | #' @param obj 'Phemd' object containing cell-state embedding object
146 | #' @param cell_model Method by which cell state was modeled ("monocle2", "seurat", or "phate")
147 | #' @param selected_genes Vector containing gene names to include in heatmap (optional)
148 | #' @param w Width of plot in inches
149 | #' @param h Height of plot in inches
150 | #' @param ... Additional parameters to be passed on to pheatmap function
151 | #' @return Heatmap containing expression values for each cell subtype. If cell_model is 'seurat', then returns a list of heatmaps (1 for each batch) that may be subsequently plotted individually
152 | #' @examples
153 | #'
154 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
155 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
156 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
157 | #' my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes)
158 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff',
159 | #' pseudo_expr=0, sigma=0.02, maxIter=2)
160 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
161 | #' myheatmap <- plotHeatmaps(my_phemdObj_monocle, cell_model='monocle2')
162 | #'
163 | plotHeatmaps <- function(obj, cell_model=c('monocle2','seurat', 'phate'), selected_genes=NULL, w=8, h=5, ...) {
164 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
165 |     if(cell_model %in% c('monocle2', 'phate')) {
166 |         # retrieve reference clusters
167 |         ref_clusters <- retrieveRefClusters(obj, cell_model=cell_model, expn_type='raw')
168 |         selected_clusters <- seq_len(length(ref_clusters))
169 |         myheatmap <- matrix(0, nrow=length(selected_clusters), ncol=ncol(ref_clusters[[1]]))
170 |         for(i in selected_clusters) {
171 |             cur_cluster <- ref_clusters[[i]]
172 |             if(!is.null(cur_cluster)) { #at least 1 cell
173 |                 if(nrow(cur_cluster) > 1) {
174 |                     myheatmap[i,] <- colMeans(cur_cluster)
175 |                 } else {
176 |                     myheatmap[i,] <- cur_cluster #only 1 cell
177 |                 }
178 |             }
179 |         }
180 |         
181 |         selected_clusters_renamed <- vapply(names(ref_clusters), function(x) paste("C-", x, sep=""), "")
182 |         
183 |         rownames(myheatmap) <- selected_clusters_renamed
184 |         colnames(myheatmap) <- selectMarkers(obj)
185 |         
186 |         if(!is.null(selected_genes)) {
187 |             col_tokeep <- match(selected_genes, selectMarkers(obj))
188 |             if(sum(is.na(col_tokeep)) > 0) {
189 |                 genes_not_found <- ''
190 |                 missing_idx <- which(is.na(col_tokeep))
191 |                 for(i in seq_len(length(missing_idx))) {
192 |                     if(i == 1) genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep='')
193 |                     else genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep=', ')
194 |                 }
195 |                 print(sprintf("Genes not found: %s", genes_not_found, sep=""))
196 |             }
197 |             col_tokeep <- col_tokeep[!is.na(col_tokeep)]
198 |             myheatmap <- myheatmap[,col_tokeep]
199 |         }
200 |         
201 |         myheatmap[is.nan(myheatmap)] <- 0 #this in the event of empty clusters
202 |         
203 |         myheatmap2 <- log2(myheatmap - min(myheatmap) + 1)
204 |         
205 |         pheatmap(myheatmap2,
206 |         cluster_rows=FALSE,
207 |         cluster_cols=TRUE,
208 |         border_color=NA,
209 |         show_colnames=TRUE,
210 |         show_rownames=TRUE,
211 |         fontsize_col=8,
212 |         fontsize_row=12,
213 |         cellwidth=10,
214 |         width=w,
215 |         height=h,
216 |         ...
217 |         )
218 |         return(myheatmap2)
219 |     } else if(cell_model == 'seurat') {
220 |         seurat_obj <- seuratInfo(obj)
221 |         state_labels <- as.numeric(as.character(Idents(seurat_obj)))
222 |         
223 |     
224 |         names(state_labels) <- rownames(seurat_obj@meta.data)
225 |         ref_data <- t(as.matrix(GetAssayData(seurat_obj, assay.type='RNA', slot='counts')))
226 |         
227 |         batches <- unique(batchIDs(obj))
228 |         myheatmaps_all <- list()
229 |         for(batch_id in batches) {
230 |             cell_idx_curplt <- which(seurat_obj@meta.data$plt == batch_id)
231 |             if(length(cell_idx_curplt) == 0) {
232 |                 stop(sprintf('Error: no cells in reference set match the experiment_id %s. Please check batchIDs(phemdObj).', batch_id))
233 |             }
234 |             cur_ref_data <- ref_data[cell_idx_curplt,]
235 |             cur_state_labels <- state_labels[cell_idx_curplt]
236 |             
237 |             myheatmap <- matrix(0, nrow=max(state_labels), ncol=ncol(cur_ref_data))
238 |             for(i in seq_len(max(state_labels))) {
239 |                 cur_idx <- which(cur_state_labels == i)
240 |                 cur_cluster <- cur_ref_data[cur_idx,]
241 |                 if(length(cur_idx) > 1) myheatmap[i,] <- colMeans(cur_cluster)
242 |                 if(length(cur_idx) == 1) myheatmap[i,] <- cur_cluster
243 |             }
244 |             
245 |             selected_clusters_renamed <- vapply(seq_len(max(state_labels)), function(x) paste("C-", x, sep=""), "")
246 |             
247 |             rownames(myheatmap) <- selected_clusters_renamed
248 |             colnames(myheatmap) <- selectMarkers(obj)
249 |             
250 |             if(!is.null(selected_genes)) {
251 |                 col_tokeep <- match(selected_genes, selectMarkers(obj))
252 |                 if(sum(is.na(col_tokeep)) > 0) {
253 |                     genes_not_found <- ''
254 |                     missing_idx <- which(is.na(col_tokeep))
255 |                     for(i in seq_len(length(missing_idx))) {
256 |                         if(i == 1) genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep='')
257 |                         else genes_not_found <- paste(genes_not_found, selected_genes[missing_idx[i]], sep=', ')
258 |                     }
259 |                     print(sprintf("Genes not found: %s", genes_not_found, sep=""))
260 |                 }
261 |                 col_tokeep <- col_tokeep[!is.na(col_tokeep)]
262 |                 myheatmap <- myheatmap[,col_tokeep]
263 |             }
264 |             
265 |             myheatmap[is.nan(myheatmap)] <- 0 #this in the event of empty clusters
266 |             
267 |             myheatmap2 <- log2(myheatmap - min(myheatmap) + 1)
268 |             myheatmaps_all[[batch_id]] <- myheatmap2
269 |         }
270 |         
271 |         for(i in seq_len(length(myheatmaps_all))) {
272 |             if(!exists('myheatmaps_avg')) myheatmaps_avg <- myheatmaps_all[[i]]
273 |             else myheatmaps_avg <- myheatmaps_avg + myheatmaps_all[[i]]
274 |         }
275 |         myheatmaps_avg <- myheatmaps_avg / length(myheatmaps_all)
276 |         myheatmaps_all[['average']] <- myheatmaps_avg
277 |         pheatmap(myheatmaps_avg,
278 |         cluster_rows=TRUE,
279 |         cluster_cols=FALSE,
280 |         border_color=NA,
281 |         show_colnames=TRUE,
282 |         show_rownames=TRUE,
283 |         fontsize_col=8,
284 |         fontsize_row=12,
285 |         cellwidth=10,
286 |         width=w,
287 |         height=h,
288 |         ...)
289 |         return(myheatmaps_all)
290 |     } else {
291 |         stop('Error: cell_model must be either "monocle2", "seurat", or "phate"')
292 |     }
293 | }
294 | 
295 | 
296 | #' @title Rotates heatmap marker labels 45 degrees
297 | #' @description Overwrites default draw_colnames in the pheatmap package
298 | #' @details To be used with pheatmap plotting function; not to be called directly. Thanks to Josh O'Brien at http://stackoverflow.com/questions/15505607
299 | #' @param coln Column names
300 | #' @param gaps Spacing of labels
301 | #' @param ... Additional parameters to be passed to \code{gpar}
302 | #' @return Formatted marker labels in heatmap
303 | #' @examples
304 | #' #Not to be called directly
305 | drawColnames45 <- function(coln, gaps, ...) {
306 |     coord <- pheatmap:::find_coordinates(length(coln), gaps)
307 |     x     <- coord$coord - 0.5 * coord$size
308 |     res   <- grid::textGrob(
309 |     coln, x = x, y = unit(1, "npc") - unit(3,"bigpts"),
310 |     vjust = 0.75, hjust = 1, rot = 45, gp = grid::gpar(...)
311 |     )
312 |     return(res)
313 | }
314 | 
315 | 
316 | #' @title Plot diffusion map embedding of samples based on distance matrix
317 | #' @description Visualizes diffusion map for network of samples based on square distance matrix (sample-sample pairwise dissimilarity)
318 | #' @details Requires 'destiny' package
319 | #' @param my_distmat phemdObj object containing sample names in @@snames slot
320 | #' @param cluster_assignments Vector containing group assignments for each sample
321 | #' @param pt_sz Size of points representing samples in plot (scaling factor)
322 | #' @param n_dim Number of dimensions for embedding (either 2 or 3)
323 | #' @param pt_label Vector of sample names corresponding to each point (same order as samples in \code{my_distmat} and \code{cluster_assignments})
324 | #' @param cmap Vector containing colors by which points should be colored (corresponding to cluster_assignments)
325 | #' @param w Width of plot in inches
326 | #' @param h Height of plot in inches
327 | #' @param scale.y Scaling factor for diffusion map y-axis
328 | #' @param angle Rotation factor for diffusion map plot
329 | #' @param autosave Boolean denoting whether or not to save output diffusion map
330 | #' @param ... Additional parameters to be passed to \code{DiffusionMap} function
331 | #' @return DiffusionMap object containing biological sample embedding and associated metadata
332 | #' @examples
333 | #'
334 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
335 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
336 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
337 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
338 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
339 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
340 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
341 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
342 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
343 | #' printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE)
344 | #' dm <- plotGroupedSamplesDmap(my_EMD_mat, cluster_assignments, pt_sz=2)
345 | #'
346 | plotGroupedSamplesDmap <- function(my_distmat, cluster_assignments=NULL, pt_sz=1, n_dim=3, pt_label = NULL, cmap = NULL, w=8, h=5, scale.y=1, angle=40, autosave=FALSE, ...) {
347 |     extra_args <- list(...)
348 |     if(nrow(my_distmat) != ncol(my_distmat)) {
349 |         stop('Error: my_distmat must be a square distance matrix')
350 |     }
351 |     
352 |     # Plot inhibitor groups using diffusion map
353 |     covars <- data.frame(covar1 = seq_len(nrow(my_distmat)))
354 |     if(nrow(my_distmat) < 30) {
355 |         extra_args['n_local'] <- 3
356 |     }
357 |     
358 |     dm_args <- c(list(data=covars, distance = as.dist(my_distmat)),
359 |                  extra_args[names(extra_args) %in% c("n_local", "density_norm", "rotate", "k", "sigma", "verbose")])
360 |     dm <- do.call(DiffusionMap, dm_args)
361 |     
362 |     # Check whether user supplied clustering info
363 |     if(is.null(cluster_assignments)) {
364 |         cluster_assignments <- rep(1, nrow(my_distmat))
365 |         cluster_assignments_named <- rep('A', nrow(my_distmat))
366 |         legend_bool <- FALSE
367 |     } else {
368 |         if(nrow(my_distmat) != length(cluster_assignments)) {
369 |             stop('Error: cluster_assignments must be the same length as the number of rows in my_distmat')
370 |         }
371 |         if(is.null(cmap)) {
372 |             getPalette <- colorRampPalette(brewer.pal(11, "Spectral"))
373 |             cmap <- getPalette(max(c(cluster_assignments),3)) # min palette size = 3
374 |             if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey
375 |         }
376 |         if(length(cmap) > 1) palette(cmap)
377 |   
378 |         cluster_assignments_named <- vapply(cluster_assignments, function(x) intToUtf8(64+x), "")
379 |         legend_bool <- TRUE
380 |     }
381 | 
382 |     if(n_dim >= 3) {
383 |         plot(dm, c(1,2,3), pch=20, col=factor(cluster_assignments_named), pal=cmap, cex.symbols = pt_sz, box=FALSE, xlab="", ylab="", zlab="", y.margin.add = -0.5, draw_legend=legend_bool, legend_opts = list(posx = c(0.85,0.88), posy = c(0.05, 0.7)), scale.y=scale.y, angle=angle)
384 |         
385 |     } else {
386 |         dm.embedding <- as.data.frame(eigenvectors(dm))
387 |         # visualize traj colored by state
388 |         myplot <- ggplot(dm.embedding, aes(x=dm.embedding[,1], y=dm.embedding[,2], color=factor(cluster_assignments_named))) +
389 |             geom_point(size=pt_sz) +
390 |             labs(x="", y = "", color = "Sample cluster") +
391 |             scale_color_manual(breaks = levels(factor(cluster_assignments_named)),
392 |                                values=cmap[1:length(levels(factor(cluster_assignments_named)))]) +
393 |             theme_classic() +
394 |             theme(axis.title.x=element_blank(),
395 |                   axis.text.x=element_blank(),
396 |                   axis.ticks.x=element_blank(),
397 |                   axis.title.y=element_blank(),
398 |                   axis.text.y=element_blank(),
399 |                   axis.ticks.y=element_blank(),
400 |                   axis.line = element_line(colour = "black",
401 |                                            size = 1, linetype = "solid"))
402 |         print(myplot)
403 |     }
404 |     
405 |     if(!is.null(pt_label)) {
406 |         cluster_assignments_named <- vapply(cluster_assignments, function(x) paste("G-", x, sep=""), "")
407 |         if(n_dim >= 3) {
408 |             s3d <- scatterplot3d(eigenvectors(dm)[,1], eigenvectors(dm)[,2], eigenvectors(dm)[,3], color=as.numeric(factor(cluster_assignments_named)), pch=20, grid=F, box=F)
409 |             s3d.coords <- s3d$xyz.convert(eigenvectors(dm)[,1], eigenvectors(dm)[,2], eigenvectors(dm)[,3])
410 |             text(s3d.coords$x, s3d.coords$y,             # x and y coordinates
411 |             labels=pt_label,               # text to plot
412 |             cex=.3, pos=2)
413 |         } else {
414 |             plot(eigenvectors(dm)[,1], eigenvectors(dm)[,2], main = '', xlab = '', ylab = '', xaxt = 'n', yaxt = 'n', pch=20, col=factor(cluster_assignments_named), cex = pt_sz)
415 |             text(eigenvectors(dm)[,c(1,2)],labels = pt_label, pos = 2, cex=0.4)
416 |         }
417 |     }
418 |     return(dm)
419 | }
420 | 
421 | #' @title Plots cell subtype frequency histograms summarizing each group of samples
422 | #' @description Visualizes plots of relative frequency ("weights") of cell subtypes ("bins" or "signatures") summarizing each group of single-cell samples. Each summary histogram is computed by taking the bin-wise mean of all samples in the group
423 | #' @details \code{groupSamples} must be called before calling this function. Saves plots in directory called "summary_inhibs"
424 | #' @param myobj Phemd object containing cell subtype relative frequency in @@data_cluster_weights slot
425 | #' @param cluster_assignments Vector containing group assignments for each sample in myobj
426 | #' @param cell_model Method by which cell state was modeled (either "monocle2", "seurat", or "phate")
427 | #' @param cmap Vector containing colors by which histogram bars should be colored (optional)
428 | #' @param ncol.plot Number of columns to use to plot multi-panel histogram plot
429 | #' @param ax.lab.sz Scaling factor for axis labels (default 2.5)
430 | #' @param title.sz Scaling factor for plot title (default 3)
431 | #' @return None
432 | #' @examples
433 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
434 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
435 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
436 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
437 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
438 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
439 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
440 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
441 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
442 | #' printClusterAssignments(cluster_assignments, my_phemdObj_final, '.', overwrite=TRUE)
443 | #' dm <- plotGroupedSamplesDmap(my_EMD_mat, cluster_assignments, '.', pt_sz=2, pt_label = sNames(my_phemdObj_final))
444 | #' plotSummaryHistograms(my_phemdObj_final, cluster_assignments, cell_model='monocle2)
445 | #'
446 | plotSummaryHistograms <- function(myobj, cluster_assignments, cell_model=c('monocle2','seurat', 'phate'), cmap=NULL, ncol.plot=4, ax.lab.sz=2.5, title.sz=3) {
447 |     cell_model <- match.arg(cell_model, c('monocle2','seurat'))
448 |     if(cell_model == 'monocle2') {
449 |         monocle_obj <- monocleInfo(myobj)
450 |         labels <- pData(phenoData(monocle_obj))
451 |         state_labels <- as.numeric(labels$State)
452 |         
453 |     } else if(cell_model == 'seurat') {
454 |         seurat_obj <- seuratInfo(myobj)
455 |         state_labels <- as.numeric(as.character(Idents(seurat_obj)))
456 |     } else if(cell_model == 'phate') {
457 |         state_labels <- as.numeric(phateInfo(myobj)$cellstate.labels)
458 |     } else {
459 |         stop('Error: cell_model must be either "monocle2" or "seurat"')
460 |     }
461 |     
462 |     
463 |     cluster_weights <- celltypeFreqs(myobj)
464 |     
465 |     if(is.null(cmap)) {
466 |         getPalette <- colorRampPalette(brewer.pal(11, "Spectral"))
467 |         cmap <- getPalette(max(state_labels))
468 |     }
469 |     
470 |     proto_inhibs <- matrix(0, max(cluster_assignments), ncol(cluster_weights))
471 |     for(i in seq_len(max(cluster_assignments))) {
472 |         if(sum(cluster_assignments == i) == 1) {
473 |             proto_inhibs[i,] <- cluster_weights[which(cluster_assignments == i),]
474 |         } else {
475 |             proto_inhibs[i,] <- colMeans(cluster_weights[which(cluster_assignments == i),])
476 |         }
477 |     }
478 |     
479 |     nrow.plot <- ceiling(max(cluster_assignments) / ncol.plot)
480 |     par(mfrow=c(nrow.plot,ncol.plot))
481 |     for(i in seq_len(max(cluster_assignments))) {
482 |         if(max(proto_inhibs[i,]) > 0.4) ymax <- max(proto_inhibs[i,])+0.1
483 |         else ymax <- 0.4
484 |         barplot(proto_inhibs[i,], col=cmap, main='', xlab='', ylab = "Frequency (%)", ylim = c(0, ymax), cex.axis=1.5, cex.names = 2, cex.lab = ax.lab.sz, names.arg = seq_len(ncol(proto_inhibs)))
485 |         
486 |         title(xlab="Cell subtype", line=3.5, cex.lab=ax.lab.sz)
487 |         title(main=sprintf("Group %s", intToUtf8(64+i)), line=0, cex.main=title.sz)
488 |     }
489 | }
490 | 
491 | #' @title Plot cell yield of each sample as bar plot
492 | #' @description Plots cell yield (number of viable cells) of each single-cell sample in decreasing order as horizontal bar plot
493 | #' @param myobj Phmed object containing expression data for each sample in 'data' slot
494 | #' @param labels Vector containing group labels for samples (optional). If not provided, bars will be of uniform color (blue)
495 | #' @param cmap Vector containing colors by which histogram bars should be colored (optional)
496 | #' @param font_sz Scaling factor for font size of sample names in barplot
497 | #' @param w Width of plot in inches
498 | #' @param h Height of plot in inches
499 | #' @return None
500 | #' @examples
501 | #'
502 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
503 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
504 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
505 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
506 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
507 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
508 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
509 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
510 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
511 | #' plotCellYield(my_phemdObj_final, labels=cluster_assignments, font_sz = 0.8)
512 | #'
513 | plotCellYield <- function(myobj, labels=NULL, cmap=NULL, font_sz = 0.6, w=8, h=9.5) {
514 |     nsample <- length(rawExpn(myobj))
515 |     cell_yield <- vapply(rawExpn(myobj), nrow, integer(1L))
516 |     
517 |     order_idx <- order(cell_yield, decreasing=FALSE)
518 |     cell_yield_ordered <- cell_yield[order_idx]
519 |     snames_ordered <- sNames(myobj)[order_idx]
520 |     
521 |     
522 |     par(mar=c(6,6,2,2))
523 |     
524 |     if(!is.null(labels)) {
525 |         if(length(labels) != nsample) {
526 |             stop('Error: length of "labels" vector must be equal to length of rawExpn(myobj)')
527 |         }
528 |         labels_ordered <- labels[order_idx]
529 |         if(is.null(cmap)) {
530 |             getPalette <- colorRampPalette(brewer.pal(11, "Spectral"))
531 |             cmap <- getPalette(max(labels))
532 |             if("#FFFFBF" %in% cmap) cmap[which(cmap == "#FFFFBF")] <- "#D3D3D3" #replace light yellow with grey
533 |         }
534 |         color_vec <- cmap[labels_ordered]
535 |         xx <- barplot(cell_yield_ordered, main='', horiz=TRUE, names.arg=snames_ordered, las=1, cex.names=font_sz, col=color_vec)
536 |     }  else {
537 |         xx <- barplot(cell_yield_ordered, main='', horiz=TRUE, names.arg=snames_ordered, las=1, cex.names=font_sz, col='blue')
538 |     }
539 |     title(xlab="Cell yield (number of cells)", line=3, cex.lab=1.5)
540 |     
541 | }
542 | 
543 | 
544 | 


--------------------------------------------------------------------------------
/R/functions-core.R:
--------------------------------------------------------------------------------
   1 | ################
   2 | ## FUNCTIONS ###
   3 | ################
   4 | 
   5 | #########################################################
   6 | ### Private methods below (not exported in namespace) ###
   7 | #########################################################
   8 | 
   9 | #' @title Retrieve single-cell sample sizes
  10 | #' @description Takes initial list of single-cell samples and returns vector containing number of cells in each sample.
  11 | #' @details Private method (not exported in namespace)
  12 | #' @param data_list List of length num_samples (each element has dimension num_cells x num_markers)
  13 | #' @return Vector of length num_samples representing number of cells in each sample
  14 | #' @examples
  15 | #' \dontrun{
  16 | #' sample_sizes <- getSampleSizes(all_expn_data)
  17 | #' }
  18 | 
  19 | getSampleSizes <- function(data_list) {
  20 |     return(vapply(data_list, nrow, integer(1L)))
  21 | }
  22 | 
  23 | #' @title Retrieve reference cell clusters
  24 | #' @description Takes initial Phemd struct and returns cell clusters as assigned by clustering algorithm (e.g. PHATE or Monocle2)
  25 | #' @details Private method (not exported in namespace)
  26 | #' @param obj Phemd struct containing cell-state embedding object and underlying expression data
  27 | #' @param cell_model String representing data model for cell-state space ("seurat", "monocle2", or "phate")
  28 | #' @param expn_type String representing whether to return raw expression values or coordinates in dimensionality-reduced feature space
  29 | #' @param ndim Number of dimensions in reduced dimensionality space (e.g. PHATE / CCA) to use (only relevant in reduced dimensionality space)
  30 | #' @return List of data matrices; each list element is of size num_cells_in_cluster x num_markers and represents a distinct cell cluster
  31 | #' @examples
  32 | #' \dontrun{
  33 | #' cluster_expression_data <- retrieveRefClusters(my_phemdObj)
  34 | #' }
  35 | #'
  36 | 
  37 | retrieveRefClusters <- function(obj, cell_model=c('monocle2','seurat', 'phate'), expn_type='reduced', ndim=10) {
  38 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
  39 |     if(cell_model == 'monocle2') {
  40 |         # Extract state labels from monocle data object
  41 |         monocle_obj <- monocleInfo(obj)
  42 |         labels <- pData(phenoData(monocle_obj))
  43 |         state_labels <- as.numeric(labels$State)
  44 |         
  45 |         # Split data frame based on cluster assignments
  46 |         if(expn_type == 'reduced') {
  47 |             mydata <- as.data.frame(t(reducedDimS(obj@monocle_obj)))
  48 |             if(ndim < ncol(mydata)) {
  49 |                 mydata <- mydata[,1:ndim]
  50 |             }
  51 |         } else if(expn_type == 'raw') {
  52 |             mydata <- as.data.frame(t(pooledCells(obj)))
  53 |         }
  54 |     } else if(cell_model == 'seurat') {
  55 |         seurat_obj <- seuratInfo(obj)
  56 |         state_labels <- as.numeric(as.character(Idents(seurat_obj)))
  57 |         if(min(state_labels) == 0) state_labels <- state_labels + 1 #ensure cluster labels are 1 indexed instead of zero indexed
  58 |         names(state_labels) <- names(Idents(seurat_obj)) # label cluster assignments with cell name
  59 |         if(expn_type == 'cca.aligned') {
  60 |             # aligned CCA expression data (num_cells x num_markers)
  61 |             mydata <- Embeddings(object = seurat_obj, reduction = 'cca.aligned')[,seq_len(ndim)]
  62 |         } else if(expn_type %in% c('pca', 'reduced')) {
  63 |             mydata <- Embeddings(object = seurat_obj, reduction = 'pca')[,seq_len(ndim)]
  64 |         } else if(expn_type == 'tsne') {
  65 |             mydata <- Embeddings(object = seurat_obj, reduction = 'tsne')[,seq_len(ndim)]
  66 |         } else if(expn_type == 'umap') {
  67 |             mydata <- Embeddings(object = seurat_obj, reduction = 'umap')[,seq_len(ndim)]
  68 |         } else if(expn_type == 'raw') {
  69 |             mydata <- t(as.matrix(GetAssayData(seurat_obj, assay='RNA', slot='counts')))
  70 |         } else {
  71 |             stop('Error: expn_type must be one of the following: "raw", "pca", "umap", "tsne", "cca.aligned", "reduced"')
  72 |         }
  73 |         
  74 |         # Split data frame based on cluster assignments
  75 |         mydata <- as.data.frame(mydata)
  76 |     } else if(cell_model == 'phate') {
  77 |         phate_obj <- phateInfo(obj)
  78 |         state_labels <- phate_obj$cellstate.labels
  79 |         
  80 |         # Split data frame based on cluster assignments
  81 |         if(expn_type == 'reduced') {
  82 |             mydata <- as.data.frame(phate_obj$embedding)
  83 |             if(ndim < ncol(mydata)) {
  84 |                 mydata <- mydata[,1:ndim]
  85 |             }
  86 |         } else if(expn_type == 'raw') {
  87 |             mydata <- as.data.frame(t(pooledCells(obj)))
  88 |         } else {
  89 |             stop('Error: expn_type must be either "raw" or "reduced"')
  90 |         }
  91 |     } else {
  92 |         stop('Error: cell_model must be either "monocle2", "seurat", or "phate"')
  93 |     }
  94 |     ref_clusters <- split(mydata, state_labels)
  95 |     return(ref_clusters)
  96 | }
  97 | 
  98 | 
  99 | #' @title Identify cluster centroids (cell names)
 100 | #' @description Takes initial list and returns list of cell names representing centroid of cluster
 101 | #' @details Private method (not exported in namespace)
 102 | #' @param ref_clusters list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers)
 103 | #' @return List of names; element \var{i} represents the name of the cell in cluster \var{i} that is closest to the centroid (arithmetic mean) of cluster \var{i}
 104 | #' @examples
 105 | #' \dontrun{
 106 | #' centroid_names <- identifyCentroids(ref_clusters)
 107 | #' }
 108 | identifyCentroids <- function(ref_clusters) {
 109 |     centroids <- lapply(ref_clusters, function(cur_cluster) {
 110 |         arith_centroid <- colMeans(cur_cluster)
 111 |         curdist <- t(as.matrix(apply(cur_cluster, 1, function(x) norm(x-arith_centroid, type="2"))))
 112 |         closest_cell <- colnames(curdist)[which.min(curdist)] #closest cell to arithmetic centroid
 113 |         return(closest_cell)
 114 |     })
 115 |     return(centroids)
 116 | }
 117 | 
 118 | 
 119 | #' @title Get arithmetic centroids (coordinates)
 120 | #' @description Takes initial list and returns a matrix with row \var{i} representing the arithmetic centroid of cluster \var{i}
 121 | #' @details Private method (not exported in namespace)
 122 | #' @param ref_clusters list containing each cluster of interest (each list element is a matrix of dimension num_cells x num_markers)
 123 | #' @return Matrix of dimension num_cluster x num_markers; row \var{i} representing the arithmetic centroid of cluster \var{i}
 124 | #' @examples
 125 | #' \dontrun{
 126 | #' cluster_centroids <- getArithmeticCentroids(ref_clusters)
 127 | #' }
 128 | getArithmeticCentroids <- function(ref_clusters) {
 129 |     if(length(ref_clusters) < 1) stop('Error: input requires at least 1 reference cluster')
 130 |     
 131 |     #centroids <- matrix(0, nrow=length(ref_clusters), ncol = ncol(ref_clusters[[1]]))
 132 |     #for(i in seq_len(length(ref_clusters))) {
 133 |     #  cur_cluster <- ref_clusters[[i]]
 134 |     #  centroids[i,] <- colMeans(cur_cluster)
 135 |     #}
 136 |     
 137 |     centroids_list <- lapply(ref_clusters, colMeans)
 138 |     centroids <- do.call(rbind, centroids_list)
 139 |     return(centroids)
 140 | }
 141 | 
 142 | #' @title Assign cells to a reference cell subtype
 143 | #' @description Assigns each cell in \code{cur_cells} to a cluster based on nearest cell in Monocle 2 tree
 144 | #' @details Private method (not exported in namespace). Uses RANN package for fast knn search
 145 | #' @param cur_cells Matrix of cells to be assigned to clusters (Dim: \var{num_cells} x \var{num_markers})
 146 | #' @param ref_cells Matrix of cells used to build reference Monocle 2 tree (Dim: \var{num_monocle_cells} x \var{num_markers})
 147 | #' @param ref_cell_labels Vector of length \var{num_monocle_cells} containing Monocle 2 cell branch assignments
 148 | #' @param cell_model Either "monocle2", "seurat", or "phate" depending on method used to model cell state space
 149 | #' @return Vector of length \var{num_cells} representing cluster assignments for each cell in \var{cur_cells}
 150 | #' @examples
 151 | #' \dontrun{
 152 | #' cur_cells_cluster_labels <- assignCellClusterNearestNode(cur_cells_expn_data,
 153 | #' clustered_cells_expn_data, clustered_cells_cluster_labels, cell_model='monocle2')
 154 | #' }
 155 | assignCellClusterNearestNode <- function(cur_cells, ref_cells, ref_cell_labels, cell_model=c('monocle2', 'seurat', 'phate')) {
 156 |     if(nrow(ref_cells) != length(ref_cell_labels)) stop("Error: number of cells and cell labels do not match")
 157 |     
 158 |     closest <- RANN::nn2(data = ref_cells, query = cur_cells, k = 1) #fast nearest neighbor search
 159 |     nearest_cell <- closest$nn.idx
 160 |     
 161 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
 162 |     if(cell_model %in% c('monocle2', 'phate')) {
 163 |         assigned <- ref_cell_labels[nearest_cell]
 164 |     } else if(cell_model == 'seurat') {
 165 |         nearest_cell_names <- rownames(ref_cells)[nearest_cell]
 166 |         assigned <- as.numeric(ref_cell_labels[nearest_cell_names])
 167 |     } else {
 168 |         stop('Error: cell_model must be either monocle2, seurat, or phate')
 169 |     }
 170 |     
 171 |     return(assigned)
 172 | }
 173 | 
 174 | #' @title Models expression data using generalized linear model with Gaussian error
 175 | #' @description Useful for modeling pre-normalized single-cell expression data.
 176 | #' @details Private method (not to be called by user directly). Requires VGAM package. Obtained from VGAM v1.0-5 (https://www.rdocumentation.org/packages/VGAM/versions/1.0-5/topics/gaussianff)
 177 | #' @param dispersion Dispersion parameter. If 0, then estimate as described in VGAM 1.0-5 documentation.
 178 | #' @param parallel A logical or formula. If a formula, the response of the formula should be a logical and the terms of the formula indicates whether or not those terms are parallel.
 179 | #' @param zero An integer-valued vector specifying which linear/additive predictors are modelled as intercepts only. The values must be from the set {1...M} where Mis the number of columns of the matrix response.
 180 | #' @return Generalized linear model with Gaussian error
 181 | #'
 182 | gaussianffLocal <- function(dispersion = 0, parallel = FALSE, zero = NULL) {
 183 |     if (!VGAM::is.Numeric(dispersion, length.arg = 1) || dispersion <
 184 |     0) {
 185 |         stop("bad input for argument 'dispersion'")
 186 |     }
 187 |     estimated.dispersion <- dispersion == 0
 188 |     cur_constraints <- expression({
 189 |         cur_constraints <- VGAM::cm.VGAM(matrix(1, M, 1),
 190 |         x = x, bool = parallel,
 191 |         constraints = constraints
 192 |         )
 193 |         cur_constraints <- VGAM::cm.zero.VGAM(constraints,
 194 |         x = x, zero,
 195 |         M = M, predictors.names = predictors.names, M1 = 1
 196 |         )
 197 |     })
 198 |     deviance_fn <- function(mu,y, w, residuals = FALSE, eta, extra = NULL) {
 199 |         M <- if (is.matrix(y)) {
 200 |             ncol(y)
 201 |         } else {
 202 |             1
 203 |         }
 204 |         n <- if (is.matrix(y)) {
 205 |             nrow(y)
 206 |         } else {
 207 |             length(y)
 208 |         }
 209 |         wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n)
 210 |         if (residuals) {
 211 |             if (M > 1) {
 212 |                 U <- vchol(wz, M = M, n = n)
 213 |                 temp <- mux22(U, y - mu,
 214 |                 M = M, upper = TRUE,
 215 |                 as.matrix = TRUE
 216 |                 )
 217 |                 dimnames(temp) <- dimnames(y)
 218 |                 temp
 219 |             }
 220 |             else {
 221 |                 (y - mu) * sqrt(wz)
 222 |             }
 223 |         }
 224 |         else {
 225 |             ResSS.vgam(y - mu, wz = wz, M = M)
 226 |         }
 227 |     }
 228 |     
 229 |     infos_fn <- function(...) {
 230 |         list(
 231 |         M1 = 1, Q1 = 1, expected = TRUE, multipleResponses = TRUE,
 232 |         quasi.type = TRUE, zero = zero
 233 |         )
 234 |     }
 235 |     
 236 |     init_expn <- expression({
 237 |         if (is.R()) assign("CQO.FastAlgorithm", TRUE, envir = VGAM::VGAMenv) else CQO.FastAlgorithm <<- TRUE
 238 |         if (any(function.name == c("cqo", "cao")) && (length(zero) ||
 239 |         (is.logical(parallel) && parallel))) {
 240 |             stop("cannot handle non-default arguments for cqo() and cao()")
 241 |         }
 242 |         temp5 <- w.y.check(
 243 |         w = w, y = y, ncol.w.max = Inf, ncol.y.max = Inf,
 244 |         out.wy = TRUE, colsyperw = 1, maximize = TRUE
 245 |         )
 246 |         w <- temp5$w
 247 |         y <- temp5$y
 248 |         M <- if (is.matrix(y)) ncol(y) else 1
 249 |         dy <- dimnames(y)
 250 |         predictors.names <- if (!is.null(dy[[2]])) {
 251 |             dy[[2]]
 252 |         } else {
 253 |             param.names(
 254 |             "Y",
 255 |             M
 256 |             )
 257 |         }
 258 |         if (!length(etastart)) etastart <- 0 * y
 259 |     })
 260 |     
 261 |     last_expn <- expression({
 262 |         dy <- dimnames(y)
 263 |         if (!is.null(dy[[2]])) dimnames(fit$fitted.values) <- dy
 264 |         dpar <- dispersion
 265 |         if (!dpar) {
 266 |             wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n)
 267 |             temp5 <- ResSS.vgam(y - mu, wz = wz, M = M)
 268 |             dpar <- temp5 / (length(y) - (if (is.numeric(ncol(X.vlm.save))) ncol(X.vlm.save) else 0))
 269 |         }
 270 |         misc$dispersion <- dpar
 271 |         misc$default.dispersion <- 0
 272 |         misc$estimated.dispersion <- estimated.dispersion
 273 |         misc$link <- rep_len("identitylink", M)
 274 |         names(misc$link) <- predictors.names
 275 |         misc$earg <- vector("list", M)
 276 |         for (ilocal in seq_len(M)) misc$earg[[ilocal]] <- list()
 277 |         names(misc$link) <- predictors.names
 278 |         if (is.R()) {
 279 |             if (exists("CQO.FastAlgorithm", envir = VGAM::VGAMenv)) {
 280 |                 rm("CQO.FastAlgorithm",
 281 |                 envir = VGAM::VGAMenv
 282 |                 )
 283 |             }
 284 |         } else {
 285 |             while (exists("CQO.FastAlgorithm")) remove("CQO.FastAlgorithm")
 286 |         }
 287 |         misc$expected <- TRUE
 288 |         misc$multipleResponses <- TRUE
 289 |     })
 290 |     
 291 |     loglikelihood_fn <- function(mu, y, w, residuals = FALSE,
 292 |     eta, extra = NULL, summation = TRUE) {
 293 |         M <- if (is.matrix(y)) {
 294 |             ncol(y)
 295 |         } else {
 296 |             1
 297 |         }
 298 |         n <- if (is.matrix(y)) {
 299 |             nrow(y)
 300 |         } else {
 301 |             length(y)
 302 |         }
 303 |         wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n)
 304 |         if (residuals) {
 305 |             stop("loglikelihood residuals not implemented yet")
 306 |         }
 307 |         else {
 308 |             temp1 <- ResSS.vgam(y - mu, wz = wz, M = M)
 309 |             ll.elts <- if (M == 1 || ncol(wz) == M) {
 310 |                 -0.5 * temp1 + 0.5 * (log(wz)) - n * (M / 2) *
 311 |                 log(2 * pi)
 312 |             }
 313 |             else {
 314 |                 if (all(wz[1, ] == apply(wz, 2, min)) && all(wz[1, ] == apply(wz, 2, max))) {
 315 |                     onewz <- m2a(wz[1, , drop = FALSE], M = M)
 316 |                     onewz <- onewz[, , 1]
 317 |                     logdet <- determinant(onewz)$modulus
 318 |                     logretval <- -0.5 * temp1 + 0.5 * n * logdet -
 319 |                     n * (M / 2) * log(2 * pi)
 320 |                     distval <- stop("variable 'distval' not computed yet")
 321 |                     logretval <- -(ncol(onewz) * log(2 * pi) +
 322 |                     logdet + distval) / 2
 323 |                     logretval
 324 |                 }
 325 |                 else {
 326 |                     logretval <- -0.5 * temp1 - n * (M / 2) * log(2 * pi)
 327 |                     for (ii in seq_len(n)) {
 328 |                         onewz <- m2a(wz[ii, , drop = FALSE], M = M)
 329 |                         onewz <- onewz[, , 1]
 330 |                         logdet <- determinant(onewz)$modulus
 331 |                         logretval <- logretval + 0.5 * logdet
 332 |                     }
 333 |                     logretval
 334 |                 }
 335 |             }
 336 |             if (summation) {
 337 |                 sum(ll.elts)
 338 |             }
 339 |             else {
 340 |                 ll.elts
 341 |             }
 342 |         }
 343 |     }
 344 |     validparams_fn <- function(eta, y, extra = NULL) {
 345 |         okay1 <- all(is.finite(eta))
 346 |         okay1
 347 |     }
 348 |     
 349 |     new("vglmff",
 350 |     blurb = c(
 351 |     "Vector linear/additive model\n",
 352 |     "Links:    identitylink for Y1,...,YM"
 353 |     ), constraints = cur_constraints, deviance = deviance_fn, infos = infos_fn,
 354 |     initialize = init_expn, linkinv = function(eta, extra = NULL) eta,
 355 |     last = last_expn,
 356 |     loglikelihood = loglikelihood_fn, linkfun = function(mu, extra = NULL) mu,
 357 |     vfamily = "gaussianff",
 358 |     validparams = validparams_fn, deriv = expression({
 359 |         wz <- VGAM:::VGAM.weights.function(w = w, M = M, n = n)
 360 |         mux22(cc = t(wz), xmat = y - mu, M = M, as.matrix = TRUE)
 361 |     }), weight = expression({
 362 |         wz
 363 |     })
 364 |     )
 365 | }
 366 | 
 367 | #' @title project2MST (updated from monocle v2.24.1 package for R 4.0+ compatibliity)
 368 | #' @description project2MST
 369 | #' @details Private method (not to be called by user directly). Adapted from monocle v2.24.1 (https://www.bioconductor.org/packages/release/bioc/manuals/monocle/man/monocle.pdf)
 370 | #' @param cds the CellDataSet upon which to perform this operation
 371 | #' @param Projection_Method the projection method
 372 | #' @return CellDataSet with updatedtrajectory inference
 373 | project2MSTUpdated <- function(cds, Projection_Method) 
 374 | {
 375 |     dp_mst <- minSpanningTree(cds)
 376 |     Z <- reducedDimS(cds)
 377 |     Y <- reducedDimK(cds)
 378 |     cds <- monocle:::findNearestPointOnMST(cds)
 379 |     closest_vertex <- cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex
 380 |     closest_vertex_names <- colnames(Y)[closest_vertex]
 381 |     closest_vertex_df <- as.matrix(closest_vertex)
 382 |     row.names(closest_vertex_df) <- row.names(closest_vertex)
 383 |     tip_leaves <- names(which(degree(dp_mst) == 1))
 384 |     if (!is.function(Projection_Method)) {
 385 |         P <- Y[, closest_vertex]
 386 |     }
 387 |     else {
 388 |         P <- matrix(rep(0, length(Z)), nrow = nrow(Z))
 389 |         for (i in 1:length(closest_vertex)) {
 390 |             neighbors <- names(V(dp_mst)[suppressWarnings(nei(closest_vertex_names[i], 
 391 |                                                               mode = "all"))])
 392 |             projection <- NULL
 393 |             distance <- NULL
 394 |             Z_i <- Z[, i]
 395 |             for (neighbor in neighbors) {
 396 |                 if (closest_vertex_names[i] %in% tip_leaves) {
 397 |                     tmp <- monocle:::projPointOnLine(Z_i, Y[, c(closest_vertex_names[i], 
 398 |                                                       neighbor)])
 399 |                 }
 400 |                 else {
 401 |                     tmp <- Projection_Method(Z_i, Y[, c(closest_vertex_names[i], 
 402 |                                                         neighbor)])
 403 |                 }
 404 |                 projection <- rbind(projection, tmp)
 405 |                 distance <- c(distance, dist(rbind(Z_i, tmp)))
 406 |             }
 407 |             if (!inherits(projection, "matrix")) { #Updated from class(mydata) != "matrix"
 408 |                 projection <- as.matrix(projection)
 409 |             }
 410 |             P[, i] <- projection[which(distance == min(distance))[1], 
 411 |             ]
 412 |         }
 413 |     }
 414 |     colnames(P) <- colnames(Z)
 415 |     dp <- as.matrix(dist(t(P)))
 416 |     min_dist = min(dp[dp != 0])
 417 |     dp <- dp + min_dist
 418 |     diag(dp) <- 0
 419 |     cellPairwiseDistances(cds) <- dp
 420 |     gp <- graph.adjacency(dp, mode = "undirected", weighted = TRUE)
 421 |     dp_mst <- minimum.spanning.tree(gp)
 422 |     cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_tree <- dp_mst
 423 |     cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_dist <- P
 424 |     cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex <- closest_vertex_df
 425 |     cds
 426 | }
 427 | 
 428 | #' @title newCellDataSetUpdated (updated from monocle v2.24.1 package for R 4.0+ compatibliity)
 429 | #' @description Creates new CDS
 430 | #' @details Private method (not to be called by user directly). Adapted from monocle v2.24.1 (https://www.bioconductor.org/packages/release/bioc/manuals/monocle/man/monocle.pdf)
 431 | #' @param cellData the CellDataSet upon which to perform this operation
 432 | #' @param phenoData phenoData
 433 | #' @param featureData featureData
 434 | #' @param lowerDetectionLimit lowerDetectionLimit
 435 | #' @param expressionFamily expressionFamily
 436 | #' @return CellDataSet
 437 | newCellDataSetUpdated <- function (cellData, phenoData = NULL, featureData = NULL, lowerDetectionLimit = 0.1, 
 438 |              expressionFamily = VGAM::negbinomial.size()) 
 439 | {
 440 |     if (!("gene_short_name" %in% colnames(featureData))) {
 441 |         warning("Warning: featureData must contain a column verbatim named 'gene_short_name' for certain functions")
 442 |     }
 443 |     if (!inherits(cellData, "matrix") && any(isSparseMatrix(cellData) == 
 444 |         FALSE)) {
 445 |         stop("Error: argument cellData must be a matrix (either sparse from the Matrix package or dense)")
 446 |     }
 447 |     if (!("gene_short_name" %in% colnames(featureData))) {
 448 |         warning("Warning: featureData must contain a column verbatim named 'gene_short_name' for certain functions")
 449 |     }
 450 |     sizeFactors <- rep(NA_real_, ncol(cellData))
 451 |     if (is.null(phenoData)) 
 452 |         phenoData <- annotatedDataFrameFrom(cellData, byrow = FALSE)
 453 |     if (is.null(featureData)) 
 454 |         featureData <- annotatedDataFrameFrom(cellData, byrow = TRUE)
 455 |     if (!("gene_short_name" %in% colnames(featureData))) {
 456 |         warning("Warning: featureData must contain a column verbatim named 'gene_short_name' for certain functions")
 457 |     }
 458 |     phenoData$Size_Factor <- sizeFactors
 459 |     cds <- new("CellDataSet", assayData = assayDataNew("environment", 
 460 |                                                        exprs = cellData), phenoData = phenoData, featureData = featureData, 
 461 |                lowerDetectionLimit = lowerDetectionLimit, expressionFamily = expressionFamily, 
 462 |                dispFitInfo = new.env(hash = TRUE))
 463 |     validObject(cds)
 464 |     cds
 465 | }
 466 |     
 467 | #' @title orderCells (updated from monocle v2.24.1 package for R 4.0+ compatibliity)
 468 | #' @description Orders cells according to pseudotime.
 469 | #' @details Private method (not to be called by user directly). Adapted from monocle v2.24.1 (https://www.bioconductor.org/packages/release/bioc/manuals/monocle/man/monocle.pdf)
 470 | #' @param cds the CellDataSet upon which to perform this operation
 471 | #' @param root_state The state to use as the root of the trajectory.
 472 | #' @param params num_paths the number of end-point cell states to allow in the biological process.
 473 | #' @param reverse whether to reverse the beginning and end points of the learned biological process.
 474 | #' @return CellDataSet with updatedtrajectory inference
 475 | orderCellsUpdated <- function (cds, root_state = NULL, num_paths = NULL, reverse = NULL) 
 476 | {
 477 |     if (class(cds)[1] != "CellDataSet") {
 478 |         stop("Error cds is not of type 'CellDataSet'")
 479 |     }
 480 |     if (is.null(cds@dim_reduce_type)) {
 481 |         stop("Error: dimensionality not yet reduced. Please call reduceDimension() before calling this function.")
 482 |     }
 483 |     if (any(c(length(cds@reducedDimS) == 0, length(cds@reducedDimK) == 
 484 |               0))) {
 485 |         stop("Error: dimension reduction didn't prodvide correct results. Please check your reduceDimension() step and ensure correct dimension reduction are performed before calling this function.")
 486 |     }
 487 |     root_cell <- monocle:::select_root_cell(cds, root_state, reverse)
 488 |     cds@auxOrderingData <- new.env(hash = TRUE)
 489 |     if (cds@dim_reduce_type == "ICA") {
 490 |         if (is.null(num_paths)) {
 491 |             num_paths = 1
 492 |         }
 493 |         adjusted_S <- t(cds@reducedDimS)
 494 |         dp <- as.matrix(dist(adjusted_S))
 495 |         cellPairwiseDistances(cds) <- as.matrix(dist(adjusted_S))
 496 |         gp <- graph.adjacency(dp, mode = "undirected", weighted = TRUE)
 497 |         dp_mst <- minimum.spanning.tree(gp)
 498 |         minSpanningTree(cds) <- dp_mst
 499 |         next_node <<- 0
 500 |         res <- pq_helper(dp_mst, use_weights = FALSE, root_node = root_cell)
 501 |         cds@auxOrderingData[[cds@dim_reduce_type]]$root_cell <- root_cell
 502 |         order_list <- monocle:::extract_good_branched_ordering(res$subtree, 
 503 |                                                      res$root, cellPairwiseDistances(cds), num_paths, 
 504 |                                                      FALSE)
 505 |         cc_ordering <- order_list$ordering_df
 506 |         row.names(cc_ordering) <- cc_ordering$sample_name
 507 |         minSpanningTree(cds) <- as.undirected(order_list$cell_ordering_tree)
 508 |         pData(cds)$Pseudotime <- cc_ordering[row.names(pData(cds)), 
 509 |         ]$pseudo_time
 510 |         pData(cds)$State <- cc_ordering[row.names(pData(cds)), 
 511 |         ]$cell_state
 512 |         mst_branch_nodes <- V(minSpanningTree(cds))[which(degree(minSpanningTree(cds)) > 
 513 |                                                               2)]$name
 514 |         minSpanningTree(cds) <- dp_mst
 515 |         cds@auxOrderingData[[cds@dim_reduce_type]]$cell_ordering_tree <- as.undirected(order_list$cell_ordering_tree)
 516 |     }
 517 |     else if (cds@dim_reduce_type == "DDRTree") {
 518 |         if (is.null(num_paths) == FALSE) {
 519 |             message("Warning: num_paths only valid for method 'ICA' in reduceDimension()")
 520 |         }
 521 |         cc_ordering <- monocle:::extract_ddrtree_ordering(cds, root_cell)
 522 |         pData(cds)$Pseudotime <- cc_ordering[row.names(pData(cds)), 
 523 |         ]$pseudo_time
 524 |         K_old <- reducedDimK(cds)
 525 |         old_dp <- cellPairwiseDistances(cds)
 526 |         old_mst <- minSpanningTree(cds)
 527 |         old_A <- reducedDimA(cds)
 528 |         old_W <- reducedDimW(cds)
 529 |         cds <- project2MSTUpdated(cds, monocle:::project_point_to_line_segment) #Calling local updated version of this fx
 530 |         minSpanningTree(cds) <- cds@auxOrderingData[[cds@dim_reduce_type]]$pr_graph_cell_proj_tree
 531 |         root_cell_idx <- which(V(old_mst)$name == root_cell, 
 532 |                                arr.ind = T)
 533 |         cells_mapped_to_graph_root <- which(cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex == 
 534 |                                                 root_cell_idx)
 535 |         if (length(cells_mapped_to_graph_root) == 0) {
 536 |             cells_mapped_to_graph_root <- root_cell_idx
 537 |         }
 538 |         cells_mapped_to_graph_root <- V(minSpanningTree(cds))[cells_mapped_to_graph_root]$name
 539 |         tip_leaves <- names(which(degree(minSpanningTree(cds)) == 
 540 |                                       1))
 541 |         root_cell <- cells_mapped_to_graph_root[cells_mapped_to_graph_root %in% 
 542 |                                                     tip_leaves][1]
 543 |         if (is.na(root_cell)) {
 544 |             root_cell <- select_root_cell(cds, root_state, reverse)
 545 |         }
 546 |         cds@auxOrderingData[[cds@dim_reduce_type]]$root_cell <- root_cell
 547 |         cc_ordering_new_pseudotime <- monocle:::extract_ddrtree_ordering(cds, 
 548 |                                                                root_cell)
 549 |         pData(cds)$Pseudotime <- cc_ordering_new_pseudotime[row.names(pData(cds)), 
 550 |         ]$pseudo_time
 551 |         if (is.null(root_state) == TRUE) {
 552 |             closest_vertex <- cds@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_closest_vertex
 553 |             pData(cds)$State <- cc_ordering[closest_vertex[, 
 554 |                                                            1], ]$cell_state
 555 |         }
 556 |         monocle:::reducedDimK(cds) <- K_old
 557 |         cellPairwiseDistances(cds) <- old_dp
 558 |         minSpanningTree(cds) <- old_mst
 559 |         monocle:::reducedDimA(cds) <- old_A
 560 |         monocle:::reducedDimW(cds) <- old_W
 561 |         mst_branch_nodes <- V(minSpanningTree(cds))[which(degree(minSpanningTree(cds)) > 
 562 |                                                               2)]$name
 563 |     }
 564 |     else if (cds@dim_reduce_type == "SimplePPT") {
 565 |         if (is.null(num_paths) == FALSE) {
 566 |             message("Warning: num_paths only valid for method 'ICA' in reduceDimension()")
 567 |         }
 568 |         cc_ordering <- monocle:::extract_ddrtree_ordering(cds, root_cell)
 569 |         pData(cds)$Pseudotime <- cc_ordering[row.names(pData(cds)), 
 570 |         ]$pseudo_time
 571 |         pData(cds)$State <- cc_ordering[row.names(pData(cds)), 
 572 |         ]$cell_state
 573 |         mst_branch_nodes <- V(minSpanningTree(cds))[which(degree(minSpanningTree(cds)) > 
 574 |                                                               2)]$name
 575 |     }
 576 |     cds@auxOrderingData[[cds@dim_reduce_type]]$branch_points <- mst_branch_nodes
 577 |     cds
 578 | }
 579 | 
 580 | ############################
 581 | ### Public methods below ###
 582 | ############################
 583 | 
 584 | #' @title Create 'Phemd' object
 585 | #' @description Wrapper function to create 'Phemd' object containing raw expression data and metadata
 586 | #' @details Note that each element in list can have different number of rows (i.e. number of cells in each sample can vary).
 587 | #' @param data List of length \var{num_samples} containing expression data; each element is of size \var{num_cells} x \var{num_markers}. Alternately a SingleCellExperiment object.
 588 | #' @param markers Vector containing marker names (i.e. column names of \code{all_data})
 589 | #' @param snames Vector containing sample names (i.e. names of samples contained in \code{all_data})
 590 | #' @param datatype Either "list" or "sce" (SingleCellExperiment with genes x cells)
 591 | #' @param valtype Type of assay data (i.e. "counts", "normcounts", "logcounts", "tpm", "cpm") if datatype is "sce"
 592 | #' @return 'Phemd' object containing raw multi-sample expression data and associated metadata
 593 | #' @examples
 594 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 595 | #'
 596 | createDataObj <- function(data, markers, snames, datatype='list', valtype='counts') {
 597 |     if(datatype == 'list') {
 598 |         all_data <- data
 599 |     } else if(datatype == 'sce') {
 600 |         if(valtype %in% names(assays(data))) {
 601 |             stop(sprintf('Error: %s not found in input SingleCellExperiment', valtype))
 602 |         }
 603 |         all_data <- t(assay(data, valtype)) # generally SCE objects are genes x cells; transpose
 604 |     } else {
 605 |         stop('Error: Input datatype must be either "list" or "sce" (SingleCellExperiment)')
 606 |     }
 607 |     
 608 |     nsample <- length(all_data)
 609 |     if(nsample == 0) stop('all_data is empty (length=0)')
 610 |     stopifnot(nsample == length(snames))
 611 |     if(nsample > 0) {
 612 |         stopifnot(ncol(all_data[[1]]) == length(markers))
 613 |     }
 614 |     
 615 |     # Error-checking to ensure that all samples in data list have same number of markers
 616 |     nmarker_vec <- rep(0, nsample)
 617 |     for(i in seq_len(nsample)) {
 618 |         nmarker_vec[i] <- ncol(all_data[[i]])
 619 |     }
 620 |     if(sum(nmarker_vec - nmarker_vec[1]) != 0) {
 621 |         stop(sprintf("Error: Sample %d has a different number of columns than Sample 1", which(nmarker_vec-nmarker_vec[1] != 0)))
 622 |     }
 623 |     data_obj <- new('Phemd', data = all_data, markers = markers, snames = snames, monocle_obj=NULL, seurat_obj=NULL, version=packageVersion(pkg = 'Seurat'))
 624 |     return(data_obj)
 625 | }
 626 | 
 627 | 
 628 | #' @title Attach 'Seurat' object to 'Phemd' object
 629 | #' @description Allows user to attach batch-normalized reference cell data from Seurat into 'Phemd' object containing raw expression data and metadata
 630 | #' @param phemd_obj Phemd object initialized using createDataObj
 631 | #' @param seurat_obj S4 'seurat' object containing batch-normalized reference cell data
 632 | #' @param batch.colname Name of column in Seurat object that denotes batch ID
 633 | #' @return 'Phemd' object containing with attached Seurat object
 634 | #' @examples
 635 | #' 
 636 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 637 | #' my_seuratObj <- Seurat::CreateSeuratObject(counts = t(all_expn_data[[1]]), project = "A")
 638 | #' my_seuratObj <- Seurat::FindVariableFeatures(object = my_seuratObj)
 639 | #' my_seuratObj <- Seurat::ScaleData(object = my_seuratObj, do.scale=FALSE, do.center=FALSE)
 640 | #' my_seuratObj <- Seurat::RunPCA(object = my_seuratObj, pc.genes = colnames(all_expn_data[[1]]), do.print = FALSE)
 641 | #' my_seuratObj <- Seurat::FindNeighbors(my_seuratObj, reduction = "pca", dims.use = 1:10)
 642 | #' my_seuratObj <- Seurat::FindClusters(my_seuratObj, resolution = 0.6, print.output = 0, save.SNN = TRUE)
 643 | #' my_phemdObj <- bindSeuratObj(my_phemdObj, my_seuratObj)
 644 | #' 
 645 | bindSeuratObj <- function(phemd_obj, seurat_obj, batch.colname='plt') {
 646 |     stopifnot(is(seurat_obj,'Seurat'))
 647 |     # ensure cluster names are 1-indexed
 648 |     if(min(as.numeric(as.character(Idents(seurat_obj)))) == 0) {
 649 |         label_names <- levels(Idents(seurat_obj))
 650 |         labels_renumbered <- factor(as.numeric(as.character(Idents(seurat_obj))) +1)
 651 |         #levels(labels_renumbered) <- label_names
 652 |         Idents(seurat_obj) <- labels_renumbered
 653 |     }
 654 |     if(batch.colname != 'plt') {
 655 |         seurat_obj@meta.data$plt <- seurat_obj@meta.data[[batch.colname]]
 656 |     }
 657 |     # assign batch ID of each cell as project ID defined upon Seurat obj initialization
 658 |     if(!'plt' %in% colnames(seurat_obj@meta.data)) {
 659 |         seurat_obj@meta.data$plt <- as.character(seurat_obj@meta.data$orig.ident)
 660 |     }
 661 |     seuratInfo(phemd_obj) <- seurat_obj
 662 |     
 663 |     return(phemd_obj)
 664 | }
 665 | 
 666 | #' @title Remove samples with too few cells
 667 | #' @description Removes samples from Phemd that have fewer cells than \code{min_sz}
 668 | #' @details Note: If used, this function must be called before (and not after) the \code{aggregateSamples} function is called
 669 | #' @param obj 'Phemd' object containing raw expression data and associated metadata
 670 | #' @param min_sz Minimum number of cells in each sample to be retained
 671 | #' @return 'Phemd' object containing raw multi-sample expression data and associated metadata (same as input minus removed samples)
 672 | #' @examples
 673 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 674 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10) #removes samples with fewer than 10 cells
 675 | #'
 676 | removeTinySamples <- function(obj, min_sz=20) {
 677 |     stopifnot(is(obj,'Phemd'))
 678 |     stopifnot(mode(min_sz) == 'numeric')
 679 |     all_data <- rawExpn(obj)
 680 |     all_snames <- sNames(obj)
 681 |     all_sample_sz <- getSampleSizes(all_data)
 682 |     to_remove_idx <- which(all_sample_sz < min_sz)
 683 |     if(length(to_remove_idx) == 0) return(obj)
 684 |     
 685 |     to_remove_idx <- to_remove_idx[order(to_remove_idx, decreasing=TRUE)] #remove from end to front
 686 |     for(i in to_remove_idx) {
 687 |         print(sprintf('%s removed because only contains %d cells', all_snames[i], all_sample_sz[i]))
 688 |         all_data[[i]] <- NULL
 689 |     }
 690 |     all_snames <- all_snames[-to_remove_idx]
 691 |     
 692 |     obj@data <- all_data
 693 |     obj@snames <- all_snames
 694 |     validObject(obj)
 695 |     return(obj)
 696 | }
 697 | 
 698 | #' @title Aggregate expression data from all samples
 699 | #' @description Takes initial Phemd object and returns object with additional data frame in slot @@data_aggregate containing cells aggregated from all samples (to be used for further analyses e.g. Monocle 2 trajectory building / pseudotime mapping / cell clustering)
 700 | #' @details Subsamples cells as necessary based on \code{max_cells}. If subsampling is performed, an equal number of cells are subsampled from each sample
 701 | #' @param obj 'Phemd' object containing raw expression data and associated metadata
 702 | #' @param max_cells Maximum number of cells across all samples to be included in final matrix on which Monocle 2 will be run
 703 | #' @return Same as input 'Phemd' object with additional slot 'data_aggregate' containing aggregated expression data (num_markers x num_cells)
 704 | #' @examples
 705 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 706 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 707 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 708 | #'
 709 | aggregateSamples <- function(obj, max_cells=12000) {
 710 |     stopifnot(is(obj, 'Phemd'))
 711 |     stopifnot(mode(max_cells) == 'numeric')
 712 |     
 713 |     all_data <- rawExpn(obj)
 714 |     nsample <- length(all_data)
 715 |     if(nsample == 0) return(obj)
 716 |     
 717 |     subsample_sz <- floor(max_cells/nsample)
 718 |     all_aggregate_data <- matrix(nrow=0,ncol=ncol(all_data[[1]]))
 719 |     all_subsample_idx <- list()
 720 |     subsample_bool = FALSE
 721 |     
 722 |     all_sample_sz <- getSampleSizes(all_data)
 723 |     if(sum(all_sample_sz) > max_cells) subsample_bool = TRUE
 724 |     
 725 |     for(i in seq_len(nsample)) {
 726 |         cur_data <- all_data[[i]]
 727 |         # take all cells unless total cells across all samples > max_cells
 728 |         if(subsample_bool) {
 729 |             cur_subsample_idx <- sample(seq_len(nrow(cur_data)), min(subsample_sz, nrow(cur_data)))
 730 |             all_subsample_idx[[i]] <- cur_subsample_idx
 731 |             all_aggregate_data <- rbind(all_aggregate_data, cur_data[cur_subsample_idx,])
 732 |         } else {
 733 |             all_aggregate_data <- rbind(all_aggregate_data, cur_data)
 734 |         }
 735 |     }
 736 |     all_aggregate_data <- t(all_aggregate_data) #rows = markers, cols = cells
 737 |     colnames(all_aggregate_data) <- seq_len(ncol(all_aggregate_data))
 738 |     rownames(all_aggregate_data) <- selectMarkers(obj)
 739 |     
 740 |     pooledCells(obj) <- as.matrix(all_aggregate_data)
 741 |     subsampledIdx(obj) <- all_subsample_idx
 742 |     subsampledBool(obj) <- subsample_bool
 743 |     return(obj)
 744 | }
 745 | 
 746 | #' @title Perform feature selection on aggregated data
 747 | #' @description Takes as input a Phemd object with aggregated data and returns updated object after performing feature selection on aggregated data
 748 | #' @details \code{aggregateSamples} needs to be called before running this function
 749 | #' @param obj 'Phemd' object containing aggregated data
 750 | #' @param selected_genes Vector containing names of genes to use for downstream analyses
 751 | #' @return Same as input 'Phemd' object after performing feature-selection based dimensionality reduction on aggregated expression data
 752 | #' @examples
 753 | #'
 754 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 755 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 756 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 757 | #' my_phemdObj_lg <- selectFeatures(my_phemdObj_lg, selected_genes=c('TP53',
 758 | #' 'EGFR', 'KRAS', 'FOXP3', 'LAG3'))
 759 | #'
 760 | selectFeatures <- function(obj, selected_genes) {
 761 |     if(isempty(pooledCells(obj))) stop('slot "data_aggregate" is empty; please call aggregateSamples() before running selectFeatures()')
 762 |     all_aggregate_data <- pooledCells(obj)
 763 |     all_genes <- selectMarkers(obj)
 764 |     selected_gene_map <- match(selected_genes, all_genes)
 765 |     #TODO: print genes in selected_genes that were unable to map to all_genes
 766 |     if(sum(!is.na(selected_gene_map)) == 0) {
 767 |         stop('None of the genes in "selected_genes" were found. Aborting feature selection.')
 768 |     }
 769 |     
 770 |     all_aggregate_data <- all_aggregate_data[selected_gene_map,]
 771 |     obj@data_aggregate <- all_aggregate_data #set slots manually due to validity constraints
 772 |     obj@markers <- all_genes[selected_gene_map]
 773 |     validObject(obj)
 774 |     return(obj)
 775 | }
 776 | 
 777 | #' @title Generate cell-state embedding
 778 | #' @description Takes as input a Phemd object with aggregated data and returns updated object containing cell-state embedding
 779 | #' @details \code{aggregateSamples} needs to be called before running this function.
 780 | #' @param obj 'Phemd' object containing aggregated data
 781 | #' @param cell_model Method to use to generate cell-state embedding. Currently supports "phate" and "monocle2". If using the Seurat to model the cell-state space, please identify cell subtypes as outlined in the Seurat software package and then use the \code{bindSeuratObj} function.
 782 | #' @param data_model Only relevant if cell_model = "monocle2". One of the following: 'negbinomial_sz', 'negbinomial', 'tobit', 'uninormal', 'gaussianff'. See "Family Function" table at the following link for more details on selecting the proper one. \url{http://cole-trapnell-lab.github.io/monocle-release/docs/#getting-started-with-monocle}
 783 | #' @param phate_ncluster Only relevant if cell_model = "phate". Number of cell state clusters to return when using PHATE
 784 | #' @param phate_cluster_seed Only relevant if cell_model = "phate". Seed to use when performing cell state clustering (optional)
 785 | #' @param ... Additional parameters to be passed to \code{reduceDimension} function for Monocle or \code{phate} function for PHATE
 786 | #' @return Same as input 'Phemd' object containing additional cell-state embedding object
 787 | #' @examples
 788 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 789 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 790 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 791 | #' my_phemdObj_lg <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2)
 792 | embedCells <- function(obj, cell_model=c('monocle2', 'seurat', 'phate'), data_model = 'negbinomial_sz', phate_ncluster=8, phate_cluster_seed=NULL, ...) {
 793 |     extra_args <- list(...)
 794 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
 795 |     
 796 |     if(isempty(pooledCells(obj))) stop('slot "data_aggregate" is empty; please call aggregateSamples() before running embedCells()')
 797 |     mydata <- pooledCells(obj)
 798 |     if(is.null(mydata)) stop("Error: call 'aggregateSamples' function first ")
 799 |     
 800 |     if(cell_model == 'monocle2') {
 801 |         myFeatureData <- as.data.frame(selectMarkers(obj))
 802 |         fd <- new("AnnotatedDataFrame", data=myFeatureData)
 803 |         rownames(fd) <- selectMarkers(obj)
 804 |         
 805 |         if(is.null(data_model)) {
 806 |             print('Assuming data fit negative binomial pattern of expression...')
 807 |             data_model <- 'negbinomial_sz'
 808 |         }
 809 |         
 810 |         if(data_model == 'negbinomial_sz') {
 811 |             expression_fam_fn <- VGAM::negbinomial.size()
 812 |         } else if(data_model == 'negbinomial') {
 813 |             expression_fam_fn <- VGAM::negbinomial()
 814 |         } else if(data_model == 'tobit') {
 815 |             expression_fam_fn <- VGAM::tobit()
 816 |         } else if(data_model == 'uninormal') {
 817 |             expression_fam_fn <- VGAM::uninormal()
 818 |         } else if(data_model == 'gaussianff') {
 819 |             #expression_fam_fn <- VGAM::gaussianff()
 820 |             expression_fam_fn <- gaussianffLocal()
 821 |         } else {
 822 |             stop("Error: Invalid data_model specified")
 823 |         }
 824 |         
 825 |         # Helpful to use ncenter and sigma that's higher than default in reduceDimension for datasets w/ relatively more cells
 826 |         if(!('ncenter' %in% names(extra_args)) && ncol(mydata) > 3000) {
 827 |             extra_args['ncenter'] <- 750
 828 |             if(!('sigma' %in% names(extra_args))) extra_args['sigma'] <- 0.03
 829 |         }
 830 |         
 831 |         if(!('maxIter' %in% names(extra_args))) {
 832 |             extra_args['maxIter'] <- 12 #set maximum number of iterations to 12
 833 |         }
 834 |         
 835 |         if(!('max_components' %in% names(extra_args))) {
 836 |             extra_args['max_components'] <- 2 #set number of dimensionality-reduced components to 2
 837 |         }
 838 |         
 839 |         monocle_obj <- newCellDataSetUpdated(mydata,phenoData=NULL,featureData=fd,
 840 |                                       expressionFamily=expression_fam_fn)
 841 |         varLabels(featureData(monocle_obj)) <- 'gene_short_name' #random formatting requirement for monocle
 842 |         
 843 |         if(data_model == 'negbinomial_sz') {
 844 |             monocle_obj <- estimateSizeFactors(monocle_obj) #Only needed for negbinomial data; also note that with R 4.0 class(mymatrix) returns ["matrix", "array"] so isSparseMatrix doesn't return scalar and hence is broken
 845 |             monocle_obj <- estimateDispersions(monocle_obj)
 846 |         }
 847 |         if(data_model == 'gaussianff') {
 848 |             extra_args['norm_method'] <- 'none'
 849 |             extra_args['scaling'] <- 'FALSE'
 850 |         }
 851 |         
 852 |         rd_args <- c(list(cds=monocle_obj, reduction_method='DDRTree'),
 853 |                      extra_args[names(extra_args) %in% c("verbose", "ncenter", "norm_method", 'scaling', 'pseudo_expr', "initial_method", "maxIter", "sigma", "lambda", "param.gamma", "tol", "max_components")])
 854 |         monocle_obj_red <- do.call(reduceDimension, rd_args)
 855 |         
 856 |         monocleInfo(obj) <- monocle_obj_red
 857 |     } else if(cell_model == 'phate') {
 858 |         # Perform phate embedding and generate cell state labels
 859 |         rd_args <- c(list(data=t(mydata)),
 860 |                      extra_args)
 861 |         phate_obj <- do.call(phate, rd_args)
 862 |         state_labels <- cluster_phate(phate_obj, phate_ncluster, phate_cluster_seed)
 863 |         if(min(state_labels) <= 0) {
 864 |             # Ensure numeric cell state labels are 1-indexed
 865 |             state_labels <- state_labels - min(state_labels) + 1 
 866 |         }
 867 |         phate_obj$cellstate.labels <- state_labels
 868 |         class(phate_obj) <- 'list'
 869 |         phateInfo(obj) <- phate_obj
 870 |     } 
 871 |     return(obj)
 872 | }
 873 | 
 874 | #' @title Compute Monocle2 cell state and pseudotime assignments
 875 | #' @description Takes as input a Phemd object with Monocle2 object and returns updated object with Monocle2 object containing cell state and pseudotime assignments
 876 | #' @details Wrapper function for \code{orderCells} in Monocle 2 package. \code{embedCells} needs to be called before calling this function.
 877 | #' @param obj 'Phemd' object containing Monocle2 object initialized using embedCells
 878 | #' @param ... Additional parameters to be passed into \code{orderCells} function
 879 | #' @return Same as input 'Phemd' object with updated cell-state embedding object containing cell state assignments
 880 | #' @examples
 881 | #'
 882 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 883 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 884 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 885 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model='monocle2', data_model='gaussianff', sigma=0.02, maxIter=2)
 886 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
 887 | orderCellsMonocle <- function(obj, ...) {
 888 |     monocle_obj <- monocleInfo(obj)
 889 |     if(ncol(monocle_obj) == 0) stop('slot "monocle_obj" is empty; please call embedCells() before running orderCellsMonocle()')
 890 |     
 891 |     extra_args <- list(...)
 892 |     oc_args <- c(list(cds=monocle_obj),
 893 |     extra_args[names(extra_args) %in% c("root_state", "reverse")])
 894 |     
 895 |     monocle_obj_ordered <- do.call(orderCellsUpdated, oc_args) #updated from Monocle2 to be compatible with R4.2
 896 |     monocleInfo(obj) <- monocle_obj_ordered
 897 |     return(obj)
 898 | }
 899 | 
 900 | #' @title Computes cell subtype abundances for each sample
 901 | #' @description Takes as input a Phemd object with all single-cell expression data of all single-cell samples in @@data slot and cell-state embedding generated by embedCells. Returns updated object with cell subtype frequencies of each sample that may be retrieved by the 'celltypeFreqs' accessor function.
 902 | #' @details \code{embedCells} (and \code{orderCellsMonocle} if using the Monocle2 embedding technique) needs to be called before calling this function.
 903 | #' @param obj 'Phemd' object containing single-cell expression data of all samples in @@data slot and cell-state embedding object generated and stored using the embedCells function.
 904 | #' @param verbose Boolean that determines whether progress (sequential processing of samples) should be printed. FALSE by default
 905 | #' @param cell_model Either "monocle2", "seurat", or "phate" depending on method used to model cell state space
 906 | #' @return 'Phemd' object with cell subtype frequencies of each sample that can be retrieved using the 'celltypeFreqs' accessor function
 907 | #' @examples
 908 | #'
 909 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
 910 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
 911 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
 912 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
 913 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
 914 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
 915 | #'
 916 | clusterIndividualSamples <- function(obj, verbose=FALSE, cell_model=c('monocle2', 'seurat', 'phate')) {
 917 |     stopifnot(is(obj,'Phemd'))
 918 |     all_data <- rawExpn(obj)
 919 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
 920 |     if(cell_model %in% c('monocle2', 'phate')) {
 921 |         if(cell_model == 'monocle2') {
 922 |             # Use Monocle2 embedding and clusters
 923 |             monocle_obj <- monocleInfo(obj)
 924 |             if(ncol(monocle_obj) == 0) stop('slot "monocle_obj" is empty; please call embedCells() and orderCellsMonocle() before calling this function')
 925 |             
 926 |             # Extract state labels from monocle data object
 927 |             labels <- pData(phenoData(monocle_obj))
 928 |             if(!('State' %in% names(labels))) stop('monocleInfo(obj) does not have cell state assignments; please call embedCells() and orderCellsMonocle() before calling this function')
 929 |             state_labels <- as.numeric(labels$State)
 930 |             # retrieve reference clusters
 931 |             ref_clusters <- retrieveRefClusters(obj, cell_model='monocle2', expn_type='raw')
 932 |             refcells_expn <- t(exprs(monocle_obj))
 933 |         } else {
 934 |             # Use PHATE embedding and clusters
 935 |             phate_obj <- phateInfo(obj)
 936 |             if(length(phate_obj) == 0) {
 937 |                 stop('slot "phate_obj" is empty; please call embedCells() and clusterCellsPHATE() before calling this function')
 938 |             }
 939 |             if(!('cellstate.labels' %in% names(phate_obj))) {
 940 |                 stop('phateInfo(obj) does not have cell state assignments; please call embedCells() and clusterCellsPHATE() before calling this function')
 941 |             }
 942 |             state_labels <- phate_obj$cellstate.labels
 943 |             # retrieve reference clusters
 944 |             ref_clusters <- retrieveRefClusters(obj, cell_model='phate', expn_type='raw')
 945 |             refcells_expn <- t(pooledCells(obj))
 946 |         }
 947 |         nclusters <- length(ref_clusters)
 948 |         cluster.ids <- names(ref_clusters)
 949 |         
 950 |         if(!subsampledBool(obj)) {
 951 |             ## subsampling not performed; all cells assigned to clusters as-is
 952 |             
 953 |             cluster_weights <- matrix(0, nrow=length(all_data), ncol = nclusters)
 954 |             
 955 |             start_idx <- 1
 956 |             for(i in seq_len(length(all_data))) {
 957 |                 cur_sample_sz <- nrow(all_data[[i]])
 958 |                 end_idx <- start_idx + cur_sample_sz - 1
 959 |                 sample_labels <- state_labels[start_idx:end_idx] #cell subtype assignments for current sample
 960 |                 cur_hist <- rep(0, nclusters)
 961 |                 for(j in seq_len(nclusters)) {
 962 |                     cur_hist[j] <- sum(sample_labels == cluster.ids[j])
 963 |                 }
 964 |                 cur_hist <- cur_hist / sum(cur_hist)
 965 |                 cluster_weights[i,] <- cur_hist
 966 |                 start_idx <- end_idx + 1
 967 |             }
 968 |             colnames(cluster_weights) <- paste('C-', cluster.ids, sep='')
 969 |             celltypeFreqs(obj) <- cluster_weights
 970 |         } else {
 971 |             ## subsampling performed; need to assign cells to cluster of nearest cell in embedding
 972 |             refcluster_sizes <- rep(0, length(ref_clusters))
 973 |             counter1 <- 0
 974 |             for(i in seq_len(nclusters)) {
 975 |                 refcluster_sizes[i] <- nrow(ref_clusters[[i]])
 976 |                 counter1 <- counter1 + nrow(ref_clusters[[i]])
 977 |             }
 978 |             print(refcluster_sizes / counter1)
 979 |             
 980 |             cluster_weights <- matrix(0, nrow=length(all_data), ncol=nclusters)
 981 |             for(i in seq_len(length(all_data))) {
 982 |                 cur_data <- all_data[[i]]
 983 |                 cur_ncells <- nrow(cur_data)
 984 |                 cur_hist <- rep(0, nclusters)
 985 |                 
 986 |                 if(verbose && i %% 10 == 0) {
 987 |                     print(sprintf('Processing sample number: %d', i))
 988 |                 }
 989 |                 
 990 |                 # Use nearest-cell mapping instead of nearest-centroid mapping
 991 |                 cur_cell_labels <- assignCellClusterNearestNode(cur_data,
 992 |                                                                 refcells_expn,
 993 |                                                                 state_labels,
 994 |                                                                 cell_model=cell_model)
 995 |          
 996 |                 for(j in seq_len(nclusters)) {
 997 |                     cur_hist[j] <- sum(cur_cell_labels == cluster.ids[j])
 998 |                 }
 999 |                 cur_hist <- cur_hist / sum(cur_hist)
1000 |                 cluster_weights[i,] <- cur_hist
1001 |             }
1002 |             print(colMeans(cluster_weights))
1003 |             colnames(cluster_weights) <- paste('C-', cluster.ids, sep='')
1004 |             
1005 |             celltypeFreqs(obj) <- cluster_weights
1006 |         }
1007 |     } else if(cell_model == 'seurat') {
1008 |         seurat_obj <- seuratInfo(obj)
1009 |         # retrieve reference clusters for starting estimate of cluster sizes
1010 |         ref_clusters <- retrieveRefClusters(obj, cell_model='seurat',
1011 |         expn_type = 'raw')
1012 |         nclusters <- length(ref_clusters)
1013 |         ## subsampling performed; assign cells to same cluster as nearest cell
1014 |         # in embedding
1015 |         refcluster_sizes <- rep(0, length(ref_clusters))
1016 |         counter1 <- 0
1017 |         for(i in seq_len(nclusters)) {
1018 |             refcluster_sizes[i] <- nrow(ref_clusters[[i]])
1019 |             counter1 <- counter1 + nrow(ref_clusters[[i]])
1020 |         }
1021 |         print(refcluster_sizes / counter1)
1022 |         
1023 |         cluster_weights <- matrix(0, nrow=length(all_data), ncol = nclusters)
1024 |         sample_batches <- batchIDs(obj)
1025 |         for(i in seq_len(length(all_data))) {
1026 |             cur_data <- all_data[[i]]
1027 |             cur_plt <- sample_batches[i]
1028 |             if(verbose && i %% 10 == 0) {
1029 |                 print(sprintf('Processing sample number: %d', i))
1030 |             }
1031 |             
1032 |             cur_hist <- rep(0, nclusters)
1033 |             state_labels <- as.numeric(as.character(Idents(seurat_obj)))
1034 |             names(state_labels) <- rownames(seurat_obj@meta.data)
1035 |             ref_data <- t(as.matrix(GetAssayData(seurat_obj, assay='RNA', 
1036 |                                                  slot='counts')))
1037 |             cell_idx_curplt <- which(seurat_obj@meta.data$plt == cur_plt)
1038 |             if(length(cell_idx_curplt) == 0) {
1039 |                 stop(sprintf('Error: no cells in reference set match the experiment_id %s of sample %d', cur_plt, i))
1040 |             }
1041 |             ref_data <- ref_data[cell_idx_curplt,]
1042 |             state_labels <- state_labels[cell_idx_curplt]
1043 |             
1044 |             # Use nearest-cell mapping
1045 |             cur_cell_labels <- assignCellClusterNearestNode(cur_data,
1046 |                                                             ref_data,
1047 |                                                             state_labels,
1048 |                                                             cell_model=cell_model)
1049 |             
1050 |             for(j in seq_len(nclusters)) {
1051 |                 cur_hist[j] <- sum(cur_cell_labels == j)
1052 |             }
1053 |             cur_hist <- cur_hist / sum(cur_hist);
1054 |             cluster_weights[i,] <- cur_hist
1055 |         }
1056 |         print(colMeans(cluster_weights))
1057 |         
1058 |         celltypeFreqs(obj) <- cluster_weights
1059 |     } else {
1060 |         stop('Error: cell_model must be either "monocle2", "seurat", or "phate"')
1061 |     }
1062 |     
1063 |     return(obj)
1064 | }
1065 | 
1066 | 
1067 | #' @title Computes ground distance matrix based on cell embedding
1068 | #' @description Takes as input a Phemd object containing cell-state embedding object. Returns updated object with ground distance matrix representing pairwise distances between distinct cell subtypes based on cell state embedding.
1069 | #' @details \code{embedCells} and \code{orderCellsMonocle} need to be called before calling this function. Requires 'igraph' package
1070 | #' @param obj 'Phemd' object containing cell-state embedding object
1071 | #' @param cell_model Method by which cell state was modeled (either "monocle2", "seurat", or "phate")
1072 | #' @param expn_type Data type to use to determine cell-type dissimilarities
1073 | #' @param ndim Number of embedding dimensions to be used for computing cell-type dissimilarity (optional)
1074 | #' @return Phemd object with ground distance matrix (to be used in EMD computation) in @@data_cluster_weights slot
1075 | #' @examples
1076 | #'
1077 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
1078 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
1079 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
1080 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
1081 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
1082 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
1083 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
1084 | #'
1085 | generateGDM <- function(obj, cell_model=c('monocle2', 'seurat', 'phate'), expn_type='reduced', ndim=8) {
1086 |     stopifnot(is(obj,'Phemd'))
1087 |     
1088 |     cell_model <- match.arg(cell_model, c('monocle2','seurat', 'phate'))
1089 |     if(cell_model == 'monocle2') {
1090 |         monocle_obj <- monocleInfo(obj)
1091 |         # retrieve reference clusters
1092 |         ref_clusters <- retrieveRefClusters(obj, cell_model='monocle2', expn_type=expn_type)
1093 |         nclusters <- length(ref_clusters)
1094 |         
1095 |         # get graph underlying Monocle tree
1096 |         mst_graph <- monocle_obj@auxOrderingData[["DDRTree"]]$pr_graph_cell_proj_tree
1097 |         centroids <- identifyCentroids(ref_clusters)
1098 |         pseudotimes <- pData(phenoData(monocle_obj))$Pseudotime
1099 |         emd_dists <- matrix(0, nrow=nclusters, ncol=nclusters)
1100 |         for(i in seq_len(nclusters)) {
1101 |             for(j in seq_len(nclusters)) {
1102 |                 if(i == j) next
1103 |                 path_btwn_cells <- shortest_paths(mst_graph, centroids[[i]], centroids[[j]])$vpath[[1]] #cell1 and cell2 should be strings representing vertices in mst
1104 |                 pseudotime_curpath <- pseudotimes[path_btwn_cells]
1105 |                 pseudotime_dist <- abs(pseudotimes[as.numeric(centroids[[i]])] - min(pseudotime_curpath)) + abs(pseudotimes[as.numeric(centroids[[j]])] - min(pseudotime_curpath))
1106 |                 emd_dists[i,j] <- pseudotime_dist
1107 |             }
1108 |         }
1109 |         GDM(obj) <- emd_dists
1110 |     } else if(cell_model %in% c('seurat', 'phate')) {
1111 |         ref_clusters <- retrieveRefClusters(obj, cell_model=cell_model, expn_type=expn_type, ndim=ndim)
1112 |         centroids <- getArithmeticCentroids(ref_clusters)
1113 |         GDM(obj) <- as.matrix(dist(centroids))
1114 |     } else {
1115 |         stop('Error: cell_model must be either "monocle2", "seurat", or "phate"')
1116 |     }
1117 |     return(obj)
1118 | }
1119 | 
1120 | #' @title Computes EMD distance matrix representing pairwise dissimilarity between samples
1121 | #' @description Takes as input a Phemd object with cell subtype relative frequencies for each sample in @@data_cluster_weights slot and ground distance matrix (representing cell subtype pairwise dissimilarity) in @@emd_dist_mat slot. Returns distance matrix representing pairwise dissimilarity between samples
1122 | #' @details Requires 'transport' and 'pracma' packages
1123 | #' @param obj 'Phemd' object containing cell subtype relative frequencies for each sample in @@data_cluster_weights slot and ground distance matrix (representing cell subtype dissimilarity) in @@emd_dist_mat slot
1124 | #' @return Distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples
1125 | #' @examples
1126 | #'
1127 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
1128 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
1129 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
1130 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, data_model = 'gaussianff', sigma=0.02, maxIter=2)
1131 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
1132 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
1133 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
1134 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
1135 | #'
1136 | compareSamples <- function(obj) {
1137 |     stopifnot(is(obj,'Phemd'))
1138 |     cluster_weights <- celltypeFreqs(obj)
1139 |     emd_dists <- GDM(obj)
1140 |     # generate inhibitor distance matrix
1141 |     Y <- rep(0, (nrow(cluster_weights)-1)*nrow(cluster_weights)/2)
1142 |     counter <- 1
1143 |     for(i in seq_len(nrow(cluster_weights))){
1144 |         cur_f1_weights <- cluster_weights[i,]
1145 |         for(j in (i+1):nrow(cluster_weights)) {
1146 |             if(j > nrow(cluster_weights)) break #this doesn't automatically happen in R
1147 |             cur_f2_weights <- cluster_weights[j,]
1148 |             # Compute EMD
1149 |             flow <- transport(cur_f1_weights, cur_f2_weights, emd_dists,
1150 |             method='primaldual')
1151 |             curdist <- 0
1152 |             for(k in seq_len(nrow(flow))) {
1153 |                 cur_penalty <- emd_dists[flow[k,1], flow[k,2]]
1154 |                 curdist <- curdist+cur_penalty*flow[k,3]
1155 |             }
1156 |             Y[counter] <- curdist
1157 |             counter <- counter + 1
1158 |         }
1159 |     }
1160 |     Y_sq <- squareform(Y)
1161 |     return(Y_sq)
1162 | }
1163 | 
1164 | #' @title Performs community detection on sample-sample distance matrix to identify groups of similar samples 
1165 | #' @description Takes sample-sample distance matrix as input and returns group assignments for each sample
1166 | #' @details By default, uses 'kgs' (Kelley-Gardner-Sutcliffe) method for determining optimal number of groups. Alternatively, can take user-specified number of groups). Requires 'cluster' and 'maptree' packages.
1167 | #' @param distmat A distance matrix of dimension num_samples x num_samples representing pairwise dissimilarity between samples
1168 | #' @param distfun Method of partitioning network of samples (currently either 'hclust' or 'pam')
1169 | #' @param ncluster Optional parameter specifying total number of sample groups
1170 | #' @param method Optional parameter for hierarchical clustering (see "hclust" documentation)
1171 | #' @param ... Optional additional parameters to be passed to diffusionKmeans method
1172 | #' @return Vector containing group assignments for each sample (same order as row-order of distmat) based on user-specified partitioning method (e.g. hierarchical clustering)
1173 | #' @examples
1174 | #' 
1175 | #' my_phemdObj <- createDataObj(all_expn_data, all_genes, as.character(snames_data))
1176 | #' my_phemdObj_lg <- removeTinySamples(my_phemdObj, 10)
1177 | #' my_phemdObj_lg <- aggregateSamples(my_phemdObj_lg, max_cells=1000)
1178 | #' my_phemdObj_monocle <- embedCells(my_phemdObj_lg, cell_model = 'monocle2', data_model = 'gaussianff', sigma=0.02, maxIter=2)
1179 | #' my_phemdObj_monocle <- orderCellsMonocle(my_phemdObj_monocle)
1180 | #' my_phemdObj_final <- clusterIndividualSamples(my_phemdObj_monocle)
1181 | #' my_phemdObj_final <- generateGDM(my_phemdObj_final)
1182 | #' my_EMD_mat <- compareSamples(my_phemdObj_final)
1183 | #' cluster_assignments <- groupSamples(my_EMD_mat, distfun = 'hclust', ncluster=4)
1184 | #' 
1185 | groupSamples <- function(distmat, distfun = 'hclust', ncluster=NULL, method='complete', ...) {
1186 |     ## Clustering on distance matrix
1187 |     if(nrow(distmat) != ncol(distmat)) {
1188 |         stop('Error: distmat must be a square distance matrix of dimension num_samples x num_samples')
1189 |     }
1190 |     if(distfun == 'hclust') {
1191 |         cluster_results <- hclust(as.dist(distmat), method=method)
1192 |         if(is.null(ncluster)) {
1193 |             # kgs method for determining optimal number of clusters
1194 |             op_k <- kgs(cluster_results, as.dist(distmat), maxclust = 15)
1195 |             ncluster <- op_k[which(op_k == min(op_k))]
1196 |         }
1197 |         cluster_assignments <- cutree(cluster_results, k=ncluster)
1198 |     } else if(distfun == 'pam') {
1199 |         if(is.null(ncluster)) ncluster <- 4
1200 |         cluster_results <- pam(distmat, ncluster, diss=TRUE)
1201 |         cluster_assignments <- cluster_results$clustering
1202 |     } else {
1203 |         stop("Error: Please specify distfun as either 'hclust' or 'pam'")
1204 |     }
1205 |     return(cluster_assignments)
1206 | }
1207 | 
1208 | 
1209 | 
1210 | 
1211 | 
1212 | 


--------------------------------------------------------------------------------