├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── calculatesilhouette.R ├── clusterviz.R ├── geomflatviolin.R ├── idents.R ├── mergemultipleseuratobjects.R ├── preprocesssubsetdata.R ├── preprocesssubsetdatav2.R ├── randomsubsetdata.R ├── scclusterboot.R ├── scclusteval-package.R ├── snncbi.R └── utils-pipe.R ├── README.Rmd ├── README.md ├── data └── idents.rda ├── man ├── AssignHighestJaccard.Rd ├── AssignStableCluster.Rd ├── CalculatePercentCellInStable.Rd ├── CalculateSilhouette.Rd ├── ClusterIdentityChordPlot.Rd ├── ClusterSizeBarplot.Rd ├── JaccardRainCloudPlot.Rd ├── JaccardSets.Rd ├── MatchClusters.Rd ├── MergeMultipleSeuratObjects.Rd ├── PairWiseJaccardSets.Rd ├── PairWiseJaccardSetsHeatmap.Rd ├── PairWiseOverlappingIdents.Rd ├── ParameterSetScatterPlot.Rd ├── PreprocessSubsetData.Rd ├── PreprocessSubsetDataV2.Rd ├── RandomSubsetData.Rd ├── SilhouetteRainCloudPlot.Rd ├── figures │ ├── .DS_Store │ ├── README-pressure-1.png │ ├── README-unnamed-chunk-1-1.png │ ├── README-unnamed-chunk-2-1.png │ ├── README-unnamed-chunk-2-2.png │ ├── README-unnamed-chunk-2-3.png │ ├── README-unnamed-chunk-2-4.png │ ├── README-unnamed-chunk-2-5.png │ ├── README-unnamed-chunk-3-1.png │ ├── README-unnamed-chunk-3-2.png │ ├── README-unnamed-chunk-3-3.png │ ├── README-unnamed-chunk-3-4.png │ ├── README-unnamed-chunk-4-1.png │ ├── README-unnamed-chunk-4-10.png │ ├── README-unnamed-chunk-4-2.png │ ├── README-unnamed-chunk-4-3.png │ ├── README-unnamed-chunk-4-4.png │ ├── README-unnamed-chunk-4-5.png │ ├── README-unnamed-chunk-4-6.png │ ├── README-unnamed-chunk-4-7.png │ ├── README-unnamed-chunk-4-8.png │ ├── README-unnamed-chunk-4-9.png │ ├── README-unnamed-chunk-5-1.png │ ├── jaccard_raincloud.png │ ├── scclusteval.png │ └── workflow.png ├── geom_flat_violin.Rd ├── idents.Rd ├── pipe.Rd ├── scClusterBoot.Rd └── scclusteval-package.Rd ├── scclusteval.Rproj ├── tests └── spelling.R └── vignettes ├── .gitignore └── pbmc_example.Rmd /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # Use R 2 | language: r 3 | sudo: true 4 | cache: packages 5 | warnings_are_errors: false 6 | 7 | # environment variables set for all builds 8 | env: 9 | global: 10 | - BIOC_USE_DEVEL="FALSE" ## Use the current release version 11 | - R_BUILD_ARGS="--no-build-vignettes --no-manual" 12 | - R_CHECK_ARGS="--no-build-vignettes --no-manual --timings" ## do not build vignettes or manual 13 | - _R_CHECK_TIMINGS_="0" ## get the timing information for the examples for all of your functions 14 | 15 | r: 16 | - release 17 | 18 | # do not build vignettes...takes too long and times out on travis 19 | r_build_args: --no-build-vignettes --no-manual 20 | r_check_args: --no-build-vignettes --no-manual --timings 21 | 22 | # for codecov 23 | r_packages: 24 | - covr 25 | 26 | # we need to install BiocInstaller for testing Bioconductor packages 27 | bioc_required: true 28 | 29 | # only report coverage for the release version 30 | after_success: 31 | - test $TRAVIS_R_VERSION_STRING = 'release' && Rscript -e 'covr::codecov()' 32 | 33 | notifications: 34 | email: 35 | on_success: change 36 | on_failure: change -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: scclusteval 2 | Title: Evaluate the single cell clustering 3 | Version: 0.0.0.9000 4 | Authors@R: 5 | person(given = "Ming", 6 | family = "Tang", 7 | role = c("aut", "cre"), 8 | email = "tangming2005@gmail.com") 9 | Description: Tools for evaluating the stability of single-cell clusters produced with Seurat: randomly subsample cells, re-cluster the subsets, compare cluster identities with the Jaccard index, compute silhouette widths, and visualize the results. 
10 | License: MIT + file LICENSE 11 | Encoding: UTF-8 12 | LazyData: true 13 | Roxygen: list(markdown = TRUE) 14 | RoxygenNote: 7.1.0 15 | biocViews: 16 | Imports: 17 | dplyr, 18 | purrr, 19 | tidyr, 20 | magrittr, 21 | BiocParallel, 22 | future.apply, 23 | ComplexHeatmap, 24 | ggplot2, 25 | cluster 26 | Suggests: 27 | spelling, 28 | knitr, 29 | rmarkdown 30 | Language: en-US 31 | URL: https://github.com/crazyhottommy/scclusteval 32 | BugReports: https://github.com/crazyhottommy/scclusteval/issues 33 | VignetteBuilder: knitr 34 | Depends: 35 | Seurat (>= 4.0.0) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Ming Tang 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2018 Ming Tang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(AssignHighestJaccard) 5 | export(AssignStableCluster) 6 | export(CalculatePercentCellInStable) 7 | export(CalculateSilhouette) 8 | export(ClusterIdentityChordPlot) 9 | export(ClusterSizeBarplot) 10 | export(JaccardRainCloudPlot) 11 | export(JaccardSets) 12 | export(MatchClusters) 13 | export(MergeMultipleSeuratObjects) 14 | export(PairWiseJaccardSets) 15 | export(PairWiseJaccardSetsHeatmap) 16 | export(PairWiseOverlappingIdents) 17 | export(ParameterSetScatterPlot) 18 | export(PreprocessSubsetData) 19 | export(PreprocessSubsetDataV2) 20 | export(RandomSubsetData) 21 | export(SilhouetteRainCloudPlot) 22 | export(geom_flat_violin) 23 | export(scClusterBoot) 24 | importFrom(magrittr,"%>%") 25 | -------------------------------------------------------------------------------- /R/calculatesilhouette.R: -------------------------------------------------------------------------------- 1 | 2 | #' Calculate Silhouette width from PCA space for each cell after clustering 3 | #' This is calculated from Seurat object 4 | #' @param object A Seurat object with Idents set to cluster ids (factors) 5 | #' @param dims default 1:50 dimension to use in the PCA space to calculate 6 | #' eucledian distance 7 | #' 8 | #' @return a dataframe with silhouette width for each cell. 
#' see also \code{\link[cluster]{silhouette}}
#' @details Every requested dimension must exist in the PCA embedding.
#' Cluster ids that cannot be converted to numbers (e.g. cell-type names) are
#' mapped to integer codes internally and reported under their original labels.
#' @export
#'
#' @examples
#' CalculateSilhouette(pbmc_small, dims = 1:15)
#'
CalculateSilhouette <- function(object, dims = 1:50) {
  pca_embeddings <- object@reductions$pca@cell.embeddings
  n_pcs <- ncol(pca_embeddings)
  # Check the requested indices themselves, not only how many there are:
  # dims = 40:60 has length 21 but still runs past a 50-PC embedding.
  if (length(dims) > n_pcs || any(dims > n_pcs)) {
    stop("please specify PCA dims smaller than calculated")
  }
  # drop = FALSE keeps a one-column matrix when a single dimension is requested
  cell_distance <- stats::dist(pca_embeddings[, dims, drop = FALSE])

  ident_labels <- as.character(Idents(object))
  cluster_ids <- suppressWarnings(as.numeric(ident_labels))
  if (anyNA(cluster_ids)) {
    # Non-numeric identities: silhouette() needs integer codes, so encode the
    # labels and translate the codes back afterwards.
    ident_levels <- sort(unique(ident_labels))
    silhouette_score <- cluster::silhouette(match(ident_labels, ident_levels),
                                            cell_distance)
    cluster <- factor(ident_levels[silhouette_score[, 1]], levels = ident_levels)
  } else {
    silhouette_score <- cluster::silhouette(cluster_ids, cell_distance)
    cluster <- as.factor(silhouette_score[, 1])
  }

  # column 1 of the silhouette matrix is the cluster, column 3 the width
  tibble::tibble(cluster = cluster,
                 width = silhouette_score[, 3],
                 cell = colnames(object))
}

# --------------------------------------------------------------------------------
# /R/clusterviz.R:
# --------------------------------------------------------------------------------

#' Make a Barplot for cluster size
#'
#' @param ident a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param bar_col color for the bar. Default is blue.
#' @param label_number whether or not to print the number of cells in each
#' cluster on top of its bar
#'
#' @return a ggplot2 bar graph object
#' @export
#'
#' @examples
#' data(pbmc_small)
#' ClusterSizeBarplot(Idents(pbmc_small))
#'
ClusterSizeBarplot <- function(ident, bar_col = "blue", label_number = TRUE) {
  # table() names its column after the argument, hence `ident` in rename()
  cluster_sizes <- dplyr::rename(as.data.frame(table(ident)),
                                 cluster = ident, size = Freq)

  size_barplot <- ggplot2::ggplot(cluster_sizes,
                                  ggplot2::aes(x = cluster, y = size)) +
    ggplot2::geom_bar(stat = "identity", fill = bar_col) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

  if (label_number) {
    # annotate each bar with its cell count
    size_barplot <- size_barplot +
      ggplot2::geom_text(ggplot2::aes(label = size), vjust = -1.5, angle = 45)
  }
  return(size_barplot)
}

#' Make a Heatmap of the pairwise Jaccard distance between cluster ident of two
#' Seurat object
#'
#'
#' @param ident1 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param ident2 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param col_low Color for low Jaccard index.
#' @param col_high Color for high Jaccard index.
#' @param title The title of the heatmap
#' @param cluster_rows cluster row or not, default FALSE
#' @param cluster_columns cluster columns or not, default FALSE
#' @param show_column_dend Whether or not show column dendrogram
#' @param show_row_dend Whether or not show row dendrogram
#' @param best_match Whether or not only show the best match of ident1 from ident2.
#' if set to TRUE, the Jaccard index matrix will be subsetted using the ident2 column
#' from the output of \code{\link{MatchClusters}}, the row order will be in order from cluster
#' 0 to the total number of clusters, the columns will be the best match of ident1 from ident2,
#' and the columns idents could be duplicated. e.g. single cluster from ident2 matches multiple
#' clusters in ident1. When TRUE, row/column clustering and dendrograms are
#' always switched off, whatever the corresponding arguments say.
#' @param ... other parameters pass to \code{\link[ComplexHeatmap]{Heatmap}}
#'
#' @return A Heatmap representing the pair-wise Jaccard correlation, rows are ident1,
#' columns are ident2
#' @export
#'
PairWiseJaccardSetsHeatmap <- function(ident1, ident2, best_match = FALSE,
                                       title = NULL, col_low = "white", col_high = "red",
                                       cluster_rows = FALSE, cluster_columns = FALSE,
                                       show_row_dend = FALSE, show_column_dend = FALSE, ...) {
  # draw each cell slightly smaller than its slot so a thin grey grid shows
  cell_fun <- function(j, i, x, y, width, height, fill) {
    grid::grid.rect(x = x, y = y, width = width * 0.99, height = height * 0.99,
                    gp = grid::gpar(col = "grey", fill = fill, lty = 1, lwd = 0.5))
  }
  mat <- PairWiseJaccardSets(ident1, ident2)
  col_fun <- circlize::colorRamp2(c(0, 1), c(col_low, col_high))

  if (best_match) {
    # the best-match column order is meaningful, so never re-cluster it
    cluster_rows <- FALSE
    cluster_columns <- FALSE
    show_row_dend <- FALSE
    show_column_dend <- FALSE
    match_idx <- MatchClusters(ident1, ident2)
    # drop = FALSE keeps a matrix even when only one column is selected
    mat <- mat[, match_idx$ident2, drop = FALSE]
  }

  ComplexHeatmap::Heatmap(mat,
                          cluster_rows = cluster_rows, cluster_columns = cluster_columns,
                          show_row_names = TRUE, show_column_names = TRUE,
                          show_row_dend = show_row_dend,
                          show_column_dend = show_column_dend,
                          col = col_fun, rect_gp = grid::gpar(type = "none"),
                          cell_fun = cell_fun,
                          name = "Jaccard index",
                          column_title = title,
                          heatmap_legend_param = list(color_bar = "discrete"),
                          ...)
}


#' Plot the Jaccard index distribution using raincloud plot
#'
#' @param idents1 A list of cluster identity from the subsampled data set
#' before reclustering. (cluster id copied from the original full data set)
#' @param idents2 A list of cluster identity from the subsampled data sets after
#' reclustering.
#' @param title Title of the plot
#'
#' @return A ggplot2 object
#' @export
#'
#' @examples
#'
#'\dontrun{
#'data(idents)
#'## the pbmc here need to be fully processed.
#'JaccardRainCloudPlot(idents, idents)
#'}
#'
JaccardRainCloudPlot <- function(idents1, idents2, title = NULL) {
  # one row per bootstrap, one column per cluster
  mats <- AssignHighestJaccard(idents1, idents2)
  g <- mats %>%
    tibble::as_tibble() %>%
    tibble::rownames_to_column(var = "bootstrap") %>%
    tidyr::gather(-bootstrap, key = "cluster", value = "jaccard") %>%
    dplyr::mutate(cluster = as.factor(as.numeric(.$cluster))) %>%
    ggplot2::ggplot(ggplot2::aes(x = cluster, y = jaccard, fill = cluster)) +
    # position_* must be namespace-qualified: the package does not import
    # ggplot2, so the bare names only resolve if the user attached ggplot2
    geom_flat_violin(position = ggplot2::position_nudge(x = .2, y = 0), alpha = .8) +
    ggplot2::geom_point(ggplot2::aes(y = jaccard, color = cluster),
                        position = ggplot2::position_jitter(width = .15),
                        size = .5, alpha = 0.8) +
    ggplot2::geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
    ggplot2::theme_classic() +
    ggplot2::theme(legend.position = "none") +
    ggplot2::ggtitle(title)
  return(g)
}


#' Plot a scatter plot for different clustering parameters
#'
#' x-axis is the parameters tested (e.g. many different k.param)
#' y-axis is the total number of clusters and total number of stable clusters based
#' on the jaccard cutoff as determined by AssignStableClusters, or percentage of cells
#' in stable clusters.
#'
#' @param stable_clusters a dataframe with list-columns for data, stable_cluster determined by
#' \code{\link{AssignStableCluster}} and the rest of the columns are pc, resolution and k_param.
#' @param fullsample_idents a dataframe with the list-column contain the original ident for
#' the full dataset. This is the direct output from the Snakemake workflow.
#' @param x_var one of "pc", "resolution" and "k_param".
#' @param y_var one of "number" or "percentage". If it is "number",
#' y-axis is the total number of clusters and total number of stable clusters.
#' @param facet_rows one of "pc", "resolution" and "k_param" for ggplot2 to facet.
#' @param facet_cols one of "pc", "resolution" and "k_param" for ggplot2 to facet.
#'
#' @return a ggplot2 object
#' @export
#'
ParameterSetScatterPlot <- function(stable_clusters,
                                    fullsample_idents,
                                    x_var,
                                    y_var,
                                    facet_rows,
                                    facet_cols) {
  # Fail early with a clear message; otherwise an unsupported y_var that
  # happens to be a column name would surface later as "object 'p' not found".
  if (!y_var %in% c("number", "percentage")) {
    stop("y_var must be 'number' or 'percentage'.")
  }

  df <- dplyr::left_join(stable_clusters, fullsample_idents) %>%
    dplyr::ungroup() %>%
    # purrr is not imported into the namespace, so qualify its helpers
    dplyr::mutate(total = purrr::map_dbl(stable_cluster, ~ length(.x$stable_cluster))) %>%
    dplyr::mutate(stable = purrr::map_dbl(stable_cluster, ~ .x$number_of_stable_cluster)) %>%
    dplyr::mutate(percentage = purrr::map2_dbl(
      original_ident_full, stable_cluster,
      function(x, y) CalculatePercentCellInStable(x, y$stable_cluster)
    )) %>%
    dplyr::select(-data, -stable_cluster, -original_ident_full) %>%
    # parameter columns arrive as character; turn them into ordered factors
    dplyr::mutate_if(is.character, function(x) as.factor(as.numeric(x))) %>%
    tidyr::gather(total:stable, key = "category", value = "number")

  if (!all(c(x_var, y_var, facet_rows, facet_cols) %in% colnames(df))) {
    stop("x_var, facet_rows and facet_cols must be one of the parameter columns in the dataframe.")
  }

  p <- ggplot2::ggplot(df, ggplot2::aes(x = .data[[x_var]], y = .data[[y_var]]))
  if (y_var == "percentage") {
    p <- p +
      ggplot2::geom_point(color = "blue") +
      ggplot2::geom_line(ggplot2::aes(group = 1), color = "red") +
      ggplot2::scale_y_continuous(labels = scales::percent)
  } else {
    # y_var == "number": one line each for total and stable cluster counts
    p <- p +
      ggplot2::geom_point() +
      ggplot2::geom_line(ggplot2::aes(group = category, color = category))
  }

  p <- p +
    ggplot2::facet_grid(rows = ggplot2::vars(.data[[facet_rows]]),
                        cols = ggplot2::vars(.data[[facet_cols]])) +
    ggplot2::xlab(x_var) +
    ggplot2::ylab(y_var)

  return(p)
}

## see https://jokergoo.github.io/circlize_book/book/the-chorddiagram-function.html

#' Plot ChordDiagram of cell identity changes between two runs of clusters.
#'
#' @param ident1 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param ident2 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param clusters_to_show_ident1 A character vector of cluster ids to show for ident1.
#' default is NULL, all clusters will be shown.
#' @param big.gap Gap between sectors of two cluster runs.
#' @param transparency Transparency of link colors, 0 means no transparency and 1 means full transparency.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @param grid.col Grid colors which correspond to matrix rows/columns (or sectors).
#' The length of the vector should be either 1 or length(union(rownames(mat), colnames(mat))).
#' It's preferred that grid.col is a named vector of which names correspond to sectors.
#' If it is not a named vector, the order of grid.col corresponds to order of sectors.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @param link.sort whether sort links on every sector based on the width of the links on it.
#' If it is set to "overall", all links are sorted regardless whether they are from rows or columns.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @param link.decreasing for link.sort
#' @param directional Whether links have directions. 1 means the direction is from the first column
#' in df to the second column, -1 is the reverse, 0 is no direction, and 2 for two directional.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#'
#' @return Invisibly, a data frame which contains positions of links. see
#' \code{\link[circlize]{chordDiagramFromMatrix}}
#' @export
#'
ClusterIdentityChordPlot <- function(ident1, ident2,
                                     clusters_to_show_ident1 = NULL,
                                     big.gap = 10, transparency = 0.5,
                                     grid.col = NULL,
                                     link.sort = TRUE, link.decreasing = TRUE,
                                     directional = -1) {
  mat <- PairWiseOverlappingIdents(ident1, ident2)
  if (!is.null(clusters_to_show_ident1)) {
    # drop = FALSE: selecting a single cluster must still give a matrix,
    # otherwise the rownames<- call below fails
    mat <- mat[clusters_to_show_ident1, , drop = FALSE]
  }
  # prefix the ids so sectors of the two runs cannot collide
  rownames(mat) <- paste0("1_", rownames(mat))
  colnames(mat) <- paste0("2_", colnames(mat))

  # reset the global circos state even if plotting fails
  on.exit(circlize::circos.clear(), add = TRUE)
  circlize::circos.par(start.degree = 90, clock.wise = FALSE)
  links <- circlize::chordDiagram(mat, big.gap = big.gap, transparency = transparency,
                                  grid.col = grid.col,
                                  link.sort = link.sort, link.decreasing = link.decreasing,
                                  directional = directional)
  invisible(links)
}


#' Plot raincloud plot for silhouette score
#'
#' @param silhouette_score a dataframe returned by \code{\link{CalculateSilhouette}}
#'
#' @return a ggplot2 object
#' @export
#'
#' @examples
#'
#' SilhouetteRainCloudPlot(CalculateSilhouette(pbmc_small, dims = 1:15))
SilhouetteRainCloudPlot <- function(silhouette_score) {
  # half violin (nudged right) + jittered points + narrow boxplot per cluster
  g <- ggplot2::ggplot(silhouette_score,
                       ggplot2::aes(x = cluster, y = width, fill = cluster)) +
    geom_flat_violin(position = ggplot2::position_nudge(x = .2, y = 0), alpha = .8) +
    ggplot2::geom_point(ggplot2::aes(y = width, color = cluster),
                        position = ggplot2::position_jitter(width = .15),
                        size = .5, alpha = 0.8) +
    ggplot2::geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
    ggplot2::ylab("silhouette width") +
    ggplot2::theme_classic(base_size = 14) +
    ggplot2::theme(legend.position = "none")
  return(g)
}

# --------------------------------------------------------------------------------
# /R/geomflatviolin.R:
# --------------------------------------------------------------------------------
# somewhat hackish solution to:
# https://twitter.com/EamonCaddigan/status/646759751242620928
# based mostly on copy/pasting from ggplot2 geom_violin source:
# https://github.com/hadley/ggplot2/blob/master/R/geom-violin.r
# credit goes to David Robinson https://twitter.com/drob



# Null-default operator: return `a` unless it is NULL, in which case `b`.
"%||%" <- function(a, b) {
  if (!is.null(a)) a else b
}

#' A Flat Violin plot
#'
#' @param mapping See \code{\link[ggplot2]{geom_violin}}
#' @param data See \code{\link[ggplot2]{geom_violin}}
#' @param stat See \code{\link[ggplot2]{geom_violin}}
#' @param position See \code{\link[ggplot2]{geom_violin}}
#' @param trim See \code{\link[ggplot2]{geom_violin}}
#' @param scale See \code{\link[ggplot2]{geom_violin}}
#' @param show.legend See \code{\link[ggplot2]{geom_violin}}
#' @param inherit.aes See \code{\link[ggplot2]{geom_violin}}
#' @param ... Other arguments, forwarded in the \code{params} list of
#' \code{\link[ggplot2]{layer}}
#'
#' @return a ggplot2 layer
#' @export
#'
#' @examples
#' library(ggplot2)
#' ggplot(diamonds, aes(cut, carat)) +
#'   geom_flat_violin() +
#'   coord_flip()
#'
geom_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
                             position = "dodge", trim = TRUE, scale = "area",
                             show.legend = NA, inherit.aes = TRUE, ...) {
  ggplot2::layer(
    data = data,
    mapping = mapping,
    stat = stat,
    geom = GeomFlatViolin,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(
      trim = trim,
      scale = scale,
      ...
    )
  )
}


GeomFlatViolin <-
  ggplot2::ggproto("GeomFlatViolin", ggplot2::Geom,
    setup_data = function(data, params) {
      data$width <- data$width %||%
        params$width %||% (ggplot2::resolution(data$x, FALSE) * 0.9)

      # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
      data %>%
        dplyr::group_by(group) %>%
        dplyr::mutate(ymin = min(y),
                      ymax = max(y),
                      xmin = x,
                      xmax = x + width / 2)
    },

    draw_group = function(data, panel_scales, coord) {
      # Find the points for the line to go all the way around
      data <- transform(data, xminv = x,
                        xmaxv = x + violinwidth * (xmax - x))

      # Make sure it's sorted properly to draw the outline: up the flat edge,
      # then back down the density edge. Base order() replaces plyr::arrange(),
      # which was an undeclared dependency.
      flat_edge <- transform(data, x = xminv)
      density_edge <- transform(data, x = xmaxv)
      newdata <- rbind(flat_edge[order(flat_edge$y), ],
                       density_edge[order(-density_edge$y), ])

      # Close the polygon: set first and last point the same
      # Needed for coord_polar and such
      newdata <- rbind(newdata, newdata[1, ])

      # Name the grob directly instead of calling the internal ggplot2:::ggname
      violin_grob <- ggplot2::GeomPolygon$draw_panel(newdata, panel_scales, coord)
      violin_grob$name <- grid::grobName(violin_grob, "geom_flat_violin")
      violin_grob
    },

    draw_key = ggplot2::draw_key_polygon,

    default_aes = ggplot2::aes(weight = 1, colour = "grey20", fill = "white", size = 0.5,
                               alpha = NA, linetype = "solid"),

    required_aes = c("x", "y")
  )


# --------------------------------------------------------------------------------
# /R/idents.R:
# --------------------------------------------------------------------------------
#' cluster identity of subsetted pbmc data
#'
#' The 2700 cell pbmc data were subsetted to 80 percent of the cells for 100 times.
#' Each time, we fully re-processed the subsetted data from FindVariableGenes to
#' FindClusters using k =30 and resolution = 0.6, and record the cluster identity
#' from the processed seurat@@ident
#' slot and saved in to a list of factor.
8 | #' 9 | #' @docType data 10 | #' 11 | #' @usage data(idents) 12 | #' 13 | #' @format A list of factors 14 | #' @source \url{https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz} 15 | #' 16 | "idents" 17 | -------------------------------------------------------------------------------- /R/mergemultipleseuratobjects.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Read multiple 10x run into Seurat objects and merge into a single Seurat object 4 | #' 5 | #' Read multiple 10x run into Seurat objects and merge into a single Seurat object. 6 | #' The names of the list of paths will be prepended to the cell name. 7 | #' 8 | #' @param input_folders A named list of folder path for each run. 9 | #' @param do.normalize Whether or not normalize the data after merging, default is FALSE 10 | #' @param ... Other parameters for CreateSeuratObject in the Seurat package 11 | #' 12 | #' @return A single merged Seurat object from multiple 10x runs. 
#' @export
#'
#' @examples
#' \dontrun{
#' library(fs)
#' library(here)
#' library(stringr)
#' input_folders<- dir_ls( path = here("data"), recursive = TRUE) %>% path_dir() %>%
#'   unique() %>% str_subset("mm10-1.2.0_premrna")
#' # the names become the sample prefix of every cell name
#' names(input_folders)<- basename(input_folders)
#' merged_seurat<- MergeMultipleSeuratObjects(input_folders)
#' }
MergeMultipleSeuratObjects <- function(input_folders, do.normalize = FALSE, ...) {
  sample_names <- names(input_folders)
  if (is.null(sample_names) || any(is.na(sample_names) | sample_names == "")) {
    stop("input_folders must be named; the names are prepended to the cell names",
         call. = FALSE)
  }

  seurat_data <- purrr::map(input_folders, Read10X)
  # prefix the sample name to the cell name, otherwise merging the objects
  # fails on duplicated cell barcodes
  add_sample_name_to_cell <- function(x, y) {
    colnames(x) <- paste(y, colnames(x), sep = "_")
    return(x)
  }
  seurat_data <- purrr::map2(seurat_data, sample_names, add_sample_name_to_cell)

  # Seurat >= 3 takes the count matrix as `counts` (formerly `raw.data`)
  seurat_objects <- purrr::map2(seurat_data, sample_names,
                                function(x, y) CreateSeuratObject(counts = x,
                                                                  project = y,
                                                                  ...))
  # merge to a single seurat object; MergeSeurat() no longer exists in the
  # Seurat versions this package depends on, merge() is its replacement
  merged_seurat <- purrr::reduce(seurat_objects, function(x, y) merge(x, y))

  if (do.normalize) {
    merged_seurat <- NormalizeData(merged_seurat)
  }
  merged_seurat
}

# --------------------------------------------------------------------------------
# /R/preprocesssubsetdata.R:
# --------------------------------------------------------------------------------

#' A wrapper for preprocessing subsetted Seurat object
#'
#' The wrapper does FindVariableGenes, ScaleData, RunPCA, JackStraw to
#' determine how many PCs to use, ProjectPCA and FindClusters and return
#' a fully processed Seurat object. The input subsetted seurat object is
#' supposed to be fully processed as well. So the NormalizeData step is not
#' necessary.
#'
#' @param object A subsetted Seurat object created by RandomSubsetData
#' @param num.pc number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot
#' step.
#' The JackStraw procedure is not run by this SCTransform-based wrapper:
#' when pc.use is not set, all num.pc components are used for FindNeighbors.
#' @param pc.use number of PCs used for FindNeighbors/FindClusters. Must not
#' exceed num.pc. If NULL (default) it is set to num.pc.
#' @param n.start Number of random start.
#' @param nn.eps Error bound when performing nearest neighbor search using RANN;
#' default of 0.0 implies exact nearest neighbor search. See FindNeighbors.
#' @param resolution Value of the resolution parameter, use a value above (below)
#' 1.0 if you want to obtain a larger (smaller) number of communities. see FindClusters.
#' @param k.param Defines k for the k-nearest neighbor algorithm.
#' @param score.thresh Currently unused by this function (kept for interface
#' compatibility with \code{\link{PreprocessSubsetDataV2}}).
#' @param sig.pc.thresh Currently unused by this function (kept for interface
#' compatibility with \code{\link{PreprocessSubsetDataV2}}).
#' @param useSCTransform Must be TRUE; any other value stops with an error
#' pointing to \code{\link{PreprocessSubsetDataV2}}.
#' @param ... any other parameters
#' @param variable.features.n number of variable features for \code{\link[Seurat]{SCTransform}}
#' @param workers number of CPUs to use for \code{\link[future]{plan}} parallel processing
#'
#' @return a fully processed Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
#' pbmc_small_subset_processed<- PreprocessSubsetData(pbmc_small_subset)
#' pbmc_small_subset_processed@@meta.data
#' }
PreprocessSubsetData <- function(object,
                                 variable.features.n = 3000,
                                 num.pc = 20,
                                 pc.use = NULL,
                                 workers = 2,
                                 score.thresh = 1e-5,
                                 sig.pc.thresh = 0.05,
                                 n.start = 100,
                                 nn.eps = 0,
                                 resolution = 0.8,
                                 k.param = 30,
                                 useSCTransform = TRUE,
                                 ...) {
  # validate the arguments before touching any global state
  if (!is.null(pc.use) && pc.use > num.pc) {
    stop("Specify the maximum pc.use number as less than or equal to the total num.pc")
  }
  if (!isTRUE(as.logical(useSCTransform))) {
    stop("The SCTransform method for normalization is the only method supported by this function. If you wish to use the approach that involves NormalizeData, ScaleData, and FindVariableFeatures and enables use of the Jackstraw procedure for determining which PCs to use please use the PreprocessSubsetDataV2 function.")
  }

  ## use future for parallelization. "multiprocess" is defunct in current
  ## releases of future, so use "multisession"; restore the caller's plan on exit.
  old_plan <- future::plan("multisession", workers = workers)
  on.exit(future::plan(old_plan), add = TRUE)

  # regress out only the covariates that are present in the metadata
  meta.data.colnames <- colnames(object@meta.data)
  vars.to.regress <- c("percent.mt", "nFeature_RNA")
  vars.to.regress <- vars.to.regress[vars.to.regress %in% meta.data.colnames]

  # SCTransform replaces NormalizeData, ScaleData and FindVariableFeatures
  object <- SCTransform(object, vars.to.regress = vars.to.regress,
                        variable.features.n = variable.features.n, verbose = FALSE)

  object <- RunPCA(object = object, features = VariableFeatures(object = object),
                   npcs = num.pc)

  if (is.null(pc.use)) {
    pc.use <- num.pc
    message("SCTransform is being used and the Jackstraw procedure for determining which PCs to use is not compatable with this procedure. Since pc.use was not specified it is being automatically set to num.pc")
  }

  # add significant pc number to metadata, need to have names same as the cells
  pc.use.meta <- rep(pc.use, length(colnames(object)))
  names(pc.use.meta) <- colnames(object)
  object <- AddMetaData(object = object, metadata = pc.use.meta, col.name = "pc.use")
  object <- FindNeighbors(object, dims = seq_len(pc.use), k.param = k.param, nn.eps = nn.eps,
                          verbose = FALSE, reduction = "pca", force.recalc = TRUE)
  object <- FindClusters(object = object,
                         n.start = n.start,
                         resolution = resolution,
                         verbose = FALSE)
  return(object)
}

# --------------------------------------------------------------------------------
# /R/preprocesssubsetdatav2.R:
# --------------------------------------------------------------------------------
#' A wrapper for preprocessing subsetted Seurat object using ScaleData
#'
#' The wrapper does FindVariableGenes, ScaleData, RunPCA, JackStraw to
#' determine how many PCs to use, ProjectPCA and FindClusters and return
#' a fully processed Seurat object. The input subsetted seurat object is
#' supposed to be fully processed as well. So the NormalizeData step is not
#' necessary.
#'
#' @param object A subsetted Seurat object created by RandomSubsetData
#' @param num.pc number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot
#' step. The optimal PCs for FindClusters will be determined by only significant PCs
#' from JackStrawPlot or if pc.use is set, JackStraw step will be skipped and use pc.use
#' for FindClusters.
#' @param pc.use number of PCs used for FindClusters. if pc.use is set, JackStraw step
#' will be skipped and use pc.use for FindClusters. score.thresh and sig.pc.thresh will be ignored.
#' @param n.start Number of random start.
#' @param nn.eps Error bound when performing nearest neighbor search using RANN;
#' default of 0.0 implies exact nearest neighbor search. Passed to FindNeighbors.
#' @param resolution Value of the resolution parameter, use a value above (below)
#' 1.0 if you want to obtain a larger (smaller) number of communities. See FindClusters.
#' @param k.param Defines k for the k-nearest neighbor algorithm.
#' @param score.thresh Threshold to use for the proportion test of PC significance.
#' @param sig.pc.thresh Threshold for the significance of a particular PC.
#' @param ... any other parameters (currently not forwarded to any call)
#' @param nfeatures number of variable features. Currently unused because the
#' FindVariableFeatures step is commented out.
#' @param workers number of CPUs to use for \code{\link[future]{plan}} parallel
#' processing. Currently unused inside this function.
#'
#' @return a fully processed Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
#' pbmc_small_subset_processed<- PreprocessSubsetDataV2(pbmc_small_subset)
#' pbmc_small_subset_processed@@meta.data
#' }


PreprocessSubsetDataV2<- function(object,
                                  nfeatures = 2000,
                                  num.pc = 20,
                                  pc.use = NULL,
                                  workers = 2,
                                  score.thresh = 1e-5,
                                  sig.pc.thresh = 0.05,
                                  n.start = 100,
                                  nn.eps = 0,
                                  resolution = 0.8,
                                  k.param = 30,
                                  ...){

  if(!is.null(pc.use)){
    if(pc.use > num.pc){
      stop("Specify the maximum pc.use number as less than or equal to the total num.pc")
    }
  }

  # no need to use this for the integrated data
  #object<- FindVariableFeatures(object, selection.method = "vst", nfeatures = nfeatures)
  # NOTE: ScaleData is run with its defaults; no variables are regressed out here.
  object<- ScaleData(object)

  object<- RunPCA(object = object, features = VariableFeatures(object = object),
                  npcs = num.pc)

  if (is.null(pc.use)){

    # Inspect the assays of the object being processed (the original referenced
    # an undefined global, SObjFiltered, which made this branch fail).
    if("SCT" %in% names(object@assays)){
      pc.use <- num.pc
      message("The SCTransform assay was detected in the object, and the Jackstraw procedure for determining which PCs to use is not compatable with this procedure. Since pc.use was not specified it is being automatically set to num.pc")
    }else{
      object<- JackStraw(object = object, num.replicate = 100, dims = num.pc)

      object <- ScoreJackStraw(object = object, dims = seq_len(num.pc), score.thresh = score.thresh)

      PC_pvalues<- object@reductions$pca@jackstraw@overall.p.values

      ## determine how many PCs to use: everything before the first
      ## non-significant PC.
      nonsig.pcs<- which(PC_pvalues[,"Score"] > sig.pc.thresh)
      if (length(nonsig.pcs) == 0){
        # every PC is significant: min(integer(0)) would be Inf, so use them all
        pc.use<- num.pc
      } else {
        pc.use<- min(nonsig.pcs) - 1
      }
      if (pc.use < 1){
        # dims = 1:0 would silently expand to c(1, 0); fall back to one PC
        warning("No significant PC was found by JackStraw; using the first PC only.",
                call. = FALSE)
        pc.use<- 1
      }
    }

  }

  # add significant pc number to metadata, need to have names same as the cells
  pc.use.meta<- rep(pc.use, length(colnames(object)))
  names(pc.use.meta)<- colnames(object)
  object<- AddMetaData(object = object, metadata = pc.use.meta, col.name = "pc.use")
  object<- FindNeighbors(object, dims = seq_len(pc.use), k.param = k.param, nn.eps = nn.eps,
                         verbose = FALSE, reduction = "pca", force.recalc = TRUE)
  object <- FindClusters(object = object,
                         n.start = n.start,
                         resolution = resolution,
                         verbose = FALSE)
  return(object)
}

--------------------------------------------------------------------------------
/R/randomsubsetdata.R:
--------------------------------------------------------------------------------
#' Randomly subset (cells) seurat object by a rate
#'
#' @param object Seurat object
#' @param rate a number between 0-1 for subsetting
#' @param random.subset.seed set a random seed for sampling, default is NULL.
#' @param ...
#'   any other parameters to \code{\link[Seurat]{subset}}
#'
#' @return Returns a randomly subsetted seurat object
#' @export
#'
#' @examples
#' pbmc_small
#' pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
#' dim(pbmc_small_subset@@meta.data)
#'
#'
# read this issue https://github.com/satijalab/seurat/issues/243
# Seurat V3 does not have do.clean =T any more
# see https://github.com/satijalab/seurat/issues/1792 use DietSeurat
RandomSubsetData<- function(object, rate, random.subset.seed = NULL, ...){
  # a rate outside [0, 1] cannot be sampled without replacement
  stopifnot(is.numeric(rate), length(rate) == 1, rate >= 0, rate <= 1)

  ncells<- nrow(object@meta.data)
  ncells.subsample<- round(ncells * rate)

  # Only touch the RNG when a seed is supplied: set.seed(NULL) would
  # re-initialise the generator and discard a seed the caller set earlier.
  if (!is.null(random.subset.seed)){
    set.seed(random.subset.seed)
  }

  selected.cells<- sample(colnames(object), ncells.subsample)
  object<- subset(object, cells = selected.cells,
                  ...)
  return(object)
}



--------------------------------------------------------------------------------
/R/scclusterboot.R:
--------------------------------------------------------------------------------


#' Calculate the Jaccard index for two sets of character vectors
#'
#' The value is the size of the intersection divided by the size of the union,
#' i.e. a similarity (1 = identical sets, 0 = disjoint sets). Two empty sets
#' give NaN (0/0).
#'
#' @param set1 character vector 1
#' @param set2 character vector 2
#'
#' @return Jaccard index (similarity) of the two sets
#' @export
#'
#' @examples
#' JaccardSets(sample(LETTERS, 10), sample(LETTERS, 10))
JaccardSets<- function(set1, set2){
  length(intersect(set1, set2))/length(unique(c(set1, set2)))
}


#' Calculate pair-wise Jaccard distance for @@ident slots from two Seurat objects
#'
#' Calculate pair-wise Jaccard distance for two named factor vector. e.g.
#' seurat_obj1@ident and seurat_obj2@ident
#'
#' @param ident1 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param ident2 a named factor vector. names are the cell names, the values are
#' the cluster id.
#'
#' @return a matrix of pair-wise Jaccard distance.
#'   Rows are clusters from ident1,
#' columns are clusters from ident2
#' @export
#'
#' @examples
#' \dontrun{
#' PairWiseJaccardSets(pbmc@@ident, pbmc_small@@ident)
#'}
#'
PairWiseJaccardSets<- function(ident1, ident2){
  # cell names grouped by cluster id
  ident1.list<- split(names(ident1), ident1)
  ident2.list<- split(names(ident2), ident2)
  res<- matrix(nrow = length(ident1.list), ncol = length(ident2.list),
               dimnames = list(names(ident1.list), names(ident2.list)))
  for (i in seq_along(ident1.list)){
    res[i, ]<- purrr::map_dbl(ident2.list, ~JaccardSets(ident1.list[[i]], .x))
  }
  return(res)
}


#' Calculate pair-wise overlapping cluster identities for @@ident slots from two Seurat objects
#'
#' Calculate pair-wise overlapping cluster identities for two named factor vector. e.g.
#' seurat_obj1@ident and seurat_obj2@ident
#' @param ident1 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param ident2 a named factor vector. names are the cell names, the values are
#' the cluster id.
#'
#' @return A matrix of pairwise number of common cell identities for each cluster.
#' Rows are clusters from ident1, columns are clusters from ident2.
#' @export
#'
#' @examples
#' \dontrun{
#' PairWiseOverlappingIdents(pbmc@@ident, pbmc_small@@ident)
#' }
PairWiseOverlappingIdents<- function(ident1, ident2){
  ident1.list<- split(names(ident1), ident1)
  ident2.list<- split(names(ident2), ident2)
  # Preallocate the result instead of growing it with rbind(); this also
  # returns a well-formed 0-row matrix when ident1 has no clusters.
  res<- matrix(0, nrow = length(ident1.list), ncol = length(ident2.list),
               dimnames = list(names(ident1.list), names(ident2.list)))
  for (i in seq_along(ident1.list)){
    res[i, ]<- vapply(ident2.list,
                      function(cells) length(intersect(ident1.list[[i]], cells)),
                      numeric(1))
  }
  return(res)

}


#' Match two run of cluster ids with highest Jaccard index
#'
#' @param ident1 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param ident2 a named factor vector.
#'   names are the cell names, the values are
#' the cluster id.
#'
#' @return A tibble with two columns, column 1 is the cluster ids from ident1, column2
#' is the cluster ids from ident2.
#' @export
#'
#' @examples
#' \dontrun{
#' MatchClusters(pbmc@@ident, pbmc_small@@ident)
#' }
MatchClusters<- function(ident1, ident2){
  # rows: clusters of ident1, columns: clusters of ident2
  similarity<- PairWiseJaccardSets(ident1, ident2)
  candidate_ids<- colnames(similarity)

  # for every ident1 cluster keep the ident2 cluster with the largest score
  best_match<- apply(similarity, 1,
                     function(row_scores) candidate_ids[which.max(row_scores)])

  tibble::tibble(ident1 = names(best_match), ident2 = best_match)
}



#' Assign highest Jaccard index for each cluster of the subsampled data set before
#' reclustering with the cluster identities of subsampled data set after reclustering
#'
#' @param idents1 A list of cluster identity copied from the original data sets.
#' idents1 is a list of the cluster identity from the subsampled data sets before reclustering.
#' @param idents2 A list of cluster identity from the subsampled data sets.
#' idents2 is a list of the cluster identity from the subsampled data sets after reclustering.
#' The order of identities in idents1 and idents2 should correspond to each other.
#'
#' @return A matrix with dimension of #number of subsampling * #number of clusters in the
#' original data set.
#' @export
#'
AssignHighestJaccard<- function(idents1, idents2){
  # one Jaccard matrix per subsample: rows = original clusters
  mat_list<- purrr::map2(idents1, idents2, ~PairWiseJaccardSets(ident1 = .x, ident2 = .y))
  # best score reached by each original cluster in that subsample
  mat_max<- purrr::map(mat_list, ~apply(.x, 1, max))
  # Bind all rows in one call: reducing pairwise returned the bare vector
  # (not a table) when there was only a single subsample.
  mats<- dplyr::bind_rows(mat_max)
  return(mats)
}

#' Assign stable cluster
#'
#' @param idents1 A list of cluster identity copied from the original data sets.
#' idents1 is a list of the cluster identity from the subsampled data sets before reclustering.
#' @param idents2 A list of cluster identity from the subsampled data sets.
#' idents2 is a list of the cluster identity from the subsampled data sets after reclustering.
#' The order of identities in idents1 and idents2 should correspond to each other.
#' @param method what way to summarize the jaccard index across all simulations
#' to determine a cluster is stable or not. options are "jaccard_mean", "jaccard_median" and "jaccard_percent"
#' @param jaccard_cutoff Cutoff of the jaccard index to determine a cluster is stable or not.
#' it is the mean or median cutoff when the method is "jaccard_mean" or "jaccard_median" and it is
#' the cutoff for every subsampling when the method is "jaccard_percent"
#' @param percent_cutoff The percentage of jaccard index greater than jaccard_cutoff. Used
#' when method is "jaccard_percent". specify 0.6 when you mean 60%.
#'
#' @return A list containing the raw data for jaccard index for all simulations,
#' TRUE or FALSE of stable cluster for each cluster and a number of stable clusters.
#' A cluster is deemed as stable if the median (or mean) jaccard index is > cutoff.
#' in addition, a stable_index is calculated, which is the percentage of jaccard index >
#' cutoff for all the subsampling. e.g. for 100 times subsampling, 0.8 means 80% of the
#' time, the jaccard index is > cutoff. Sometimes, we see bimodal distribution of the
#' 100 jaccard index, the percentage is a better measurement than the mean or median of the
#' 100 jaccard index.
#'
#' @export
#'
#' @examples
#'
#' data(idents)
#'
#' AssignStableCluster(idents, idents)
#'
AssignStableCluster<- function(idents1, idents2,
                               method = "jaccard_median",
                               jaccard_cutoff = 0.6,
                               percent_cutoff = 0.6){
  # reject an unknown method before the expensive pairwise Jaccard computation
  if (length(method) != 1 ||
      !method %in% c("jaccard_mean", "jaccard_median", "jaccard_percent")){
    stop("please specify jaccard_mean, jaccard_median or jaccard_percent
         for method")
  }

  # one row per subsample, one column per original cluster
  mats<- AssignHighestJaccard(idents1, idents2)

  # fraction of subsamples in which each cluster exceeds the cutoff
  stable_index<- vapply(mats, function(x) mean(x > jaccard_cutoff), numeric(1))

  if (method == "jaccard_percent"){
    stable_cluster<- stable_index > percent_cutoff
  } else {
    summarise_fun<- if (method == "jaccard_mean") mean else stats::median
    stable_cluster<- vapply(mats, summarise_fun, numeric(1)) > jaccard_cutoff
  }
  number_of_stable_cluster<- sum(stable_cluster)

  return(list(jaccardIndex = mats, stable_cluster = stable_cluster,
              number_of_stable_cluster = number_of_stable_cluster,
              stable_index = stable_index))
}


#' Calculate the percentage of cells in stable clusters in the full data set
#'
#' @param ident
#'   A named factor vector. names are the cell names, the values are
#' the cluster id from the full data set.
#' @param stable_cluster A logical vector for each of the original cluster indicating
#' it is stable or not, calculated from \code{\link{AssignStableCluster}}. When it is
#' named with every cluster id it is matched to the clusters by name, otherwise by position.
#'
#' @return A percentage of cells in stable cluster
#' @export
#'
#' @examples
#' ident<- factor(c(cell1 = "0", cell2 = "0", cell3 = "1"))
#' CalculatePercentCellInStable(ident, c("0" = TRUE, "1" = FALSE))

CalculatePercentCellInStable<- function(ident, stable_cluster){
  ident.list<- split(names(ident), ident)
  number_of_cells_each_cluster<- lengths(ident.list)

  # Align the flags with the cluster order of `ident` when names allow it;
  # positional use of a differently ordered vector would count the wrong clusters.
  cluster_ids<- names(number_of_cells_each_cluster)
  if (!is.null(names(stable_cluster)) && all(cluster_ids %in% names(stable_cluster))){
    stable_cluster<- stable_cluster[cluster_ids]
  }

  percent_cell_in_stable<- sum(number_of_cells_each_cluster[stable_cluster])/sum(number_of_cells_each_cluster)
  return(percent_cell_in_stable)

}

#' Bootstrap for a fully processed Seurat object
#'
#' @param object A fully processed Seurat object.
#' @param n Number of times you want to bootstrap.
#' @param rate A number between 0 and 1 for subsampling the cells.
#' @param ... Other parameters passed to \code{\link{PreprocessSubsetData}}
#'
#' @return A list of lists containing the ident from the subsetted reclustered
#' seurat objects.
#' @export
#'
#' @examples
#'

# # see https://github.com/satijalab/seurat/issues/457
# # parallelize Seurat functions. The authors decided to go with the future framework.
# scClusterBoot<- function(object, n = 4, workers = 4, rate = 0.8, ...){
#   multicoreParam <- BiocParallel::MulticoreParam(workers = workers)
#   BiocParallel::register(multicoreParam)
#   # the parameter n is not used inside the function
#   GetProcessedSubsetDataCluster<- function(n, ...){
#     object<- RandomSubsetData(object, rate = rate)
#     object<- PreprocessSubsetData(object, ...)
#     return(list(ident = object@ident, pc.sig = object@meta.data$pc.sig))
#   }
#   boot_clusters<- BiocParallel::bplapply(1:n, GetProcessedSubsetDataCluster)
#   return(boot_clusters)
# }



# scClusterBoot<- function(object, n = 4, workers = 4, rate = 0.8, ...){
#   future::plan(multiprocess)
#   # the parameter n is not used inside the function
#   GetProcessedSubsetDataCluster<- function(n, ...){
#     object<- RandomSubsetData(object, rate = rate)
#     object<- PreprocessSubsetData(object, ...)
#     return(list(ident = object@ident, pc.sig = object@meta.data$pc.sig))
#   }
#   boot_clusters<- future.apply::future_lapply(1:n, GetProcessedSubsetDataCluster)
#   return(boot_clusters)
# }

# Subsample `object` n times at the given rate, re-run PreprocessSubsetData on
# every subsample and collect, per run, the cluster identities (`ident`) and
# the number of PCs that was used (`pc.sig`).
scClusterBoot<- function(object, n = 4, rate = 0.8, ...){
  # the iteration index is not used inside the function
  GetProcessedSubsetDataCluster<- function(n, ...){
    subset.object<- RandomSubsetData(object, rate = rate)
    subset.object<- PreprocessSubsetData(subset.object, ...)
    # Seurat v3 objects have no @ident slot; identities are read with Idents().
    # PreprocessSubsetData stores the PC count in the "pc.use" metadata column;
    # the list element keeps its historical name pc.sig for existing callers.
    return(list(ident = Idents(subset.object),
                pc.sig = subset.object@meta.data$pc.use))
  }
  # forward ... so user-supplied preprocessing options actually take effect;
  # seq_len() keeps n = 0 from iterating over c(1, 0)
  boot_clusters<- lapply(seq_len(n), GetProcessedSubsetDataCluster, ...)
  return(boot_clusters)
}





--------------------------------------------------------------------------------
/R/scclusteval-package.R:
--------------------------------------------------------------------------------
#' @keywords internal
"_PACKAGE"

--------------------------------------------------------------------------------
/R/snncbi.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/R/snncbi.R
--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | [![Build Status](https://travis-ci.com/crazyhottommy/scclusteval.svg?branch=master)](https://travis-ci.com/crazyhottommy/scclusteval) 8 | 9 | ```{r setup, include = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "man/figures/README-", 14 | out.width = "60%", 15 | out.height = "60%" 16 | ) 17 | ``` 18 | # scclusteval 19 | 20 | ```{r pressure, echo=FALSE, fig.cap="hex sticker", out.width = '10%'} 21 | knitr::include_graphics("man/figures/scclusteval.png") 22 | ``` 23 | 24 | The goal of scclusteval(Single Cell Cluster Evaluation) is to evaluate the single cell clustering stability by subsampling the cells and provide many visualization methods for comparing clusters. 25 | 26 | For Theory behind the method, see Christian Henning, “Cluster-wise assessment of 27 | cluster stability,” Research Report 271, Dept. of Statistical Science, University 28 | College London, December 2006) 29 | 30 | ### Citation 31 | 32 | Ming Tang, Yasin Kaymaz,Brandon L. Logeman, Stephen Eichhorn, Zhengzheng S. Liang, Catherine Dulac and Timothy B. Sackton. Evaluating single-cell cluster stability using the Jaccard similarity index (2020) *Bioinformatics* In Press. 33 | 34 | ### Parameters that affect the clustering 35 | 36 | The most popular clustering method for single cell RNAseq is shared nearest neighbor (SNN) 37 | which is implemented in `{Seurat::FindClusters}`. 
See a paper by Mark Robinson group for comparing 38 | single cell RNAseq clustering methods:[A systematic performance evaluation of clustering methods for single-cell RNA-seq data](https://f1000research.com/articles/7-1141/v1) 39 | SNN in Seurat is the most accurate and fast one. 40 | 41 | The parameter `k.param` which specifies the number of nearest neighbors has a great effect on the number of clusters. Other Parameters such as the number of PCs and the resolution can affect the number of clusters as well. 42 | 43 | The process is as follows. 44 | 45 | To assess which k is best to use by subsampling the original data: 46 | 47 | 1. Performing the clustering at many different K values on the full data set. 48 | 49 | 2. We then sample without replacement a subset of the data set (e.g. 80% of the 50 | cells in the full data set), and then repeat the clustering procedure on just 51 | this subset of data (so repeating all aspects of clustering, including calling 52 | variable genes, calculating PCs, building the neighbor graph, etc), and we do 53 | this n times. 54 | 55 | 3. So for each K value, we have 1 clustering outcome for the full data set, 56 | and 20 clustering outcomes for subsampled portions of the data set. From this we 57 | identify the cluster in the first subsample clustering that is most similar to 58 | the full cluster 1 cells (the one that gives the maximum Jaccard coefficient) and 59 | record that value. If this maximum Jaccard coefficient is less than 0.6 (this is 60 | quite subjective), the original cluster is considered to be dissolved-it didn’t 61 | show up in the new clustering. A cluster that’s dissolved too often is probably 62 | not a “real” cluster. 63 | 64 | >As a rule of thumb, clusters with a stability value less than 0.6 should be considered 65 | unstable. Values between 0.6 and 0.75 indicate that the cluster is measuring a pattern 66 | in the data, but there isn’t high certainty about which points should be clustered 67 | together. 
Clusters with stability values above about 0.85 can be considered highly 68 | stable (they’re likely to be real clusters). 69 | 70 | 71 | 4. Repeat this for all subsample clustering outcomes, and then the 72 | stability value of a cluster is the median or mean Jaccard coefficient. If it's 73 | greater than 0.6 (or a cutoff you set) we say it's stable, otherwise it's unstable. 74 | So for a given K value this gives you a stable/unstable assignment for each cluster. 75 | We choose the k value to select for clustering the data by looking at which k value 76 | yielded the largest number of stable clusters while still having most of the cells from the 77 | data set in a stable cluster. 78 | 79 | 80 | We can repeat the 1-4 for different resolution and number of PCs and the combination of all different parameters. 81 | 82 | The workflow is: 83 | 84 | ![](man/figures/workflow.png) 85 | 86 | ## The subsampling process is implemented in a Snakemake workflow 87 | 88 | Because for each subsampling, one has to re-run the whole process of `FindVariableGenes`, 89 | `ScaleData`, `RunPCA`, `JackStraw` and `FindClusters` and for large data set, it can 90 | take very long time to run. 91 | 92 | E.g. if you test 5 different K, and for each K you subsample the full dataset 100 times. that's 93 | 500 runs. 94 | 95 | Snakemake will take advantage of the HPC cluster with large number of CPUs avaiable. 96 | 97 | The R package works with the output from the Snakemake workflow: [pyflow_seuratv3_parameter](https://github.com/crazyhottommy/pyflow_seuratv3_parameter). 
98 | 99 | ## The scclusteval R package is for downstream analysis 100 | 101 | ### Installation 102 | 103 | You can install the scclusteval from github: 104 | 105 | ``` r 106 | devtools::install_github("crazyhottommy/scclusteval") 107 | ``` 108 | 109 | 110 | ## Useful functions 111 | 112 | ```{r} 113 | library(scclusteval) 114 | ?RandomSubsetData 115 | ?MergeMultipleSeuratObjects 116 | ?PreprocessSubsetData 117 | ?PairWiseJaccardSets 118 | 119 | ## in Rstudio type below and tab to see all available functions 120 | ## scclusteval:: 121 | ``` 122 | 123 | 124 | ## Examples 125 | 126 | Examples to use the `scclusteval` package can be found at https://crazyhottommy.github.io/EvaluateSingleCellClustering/ 127 | 128 | ![](man/figures/README-unnamed-chunk-1-1.png) 129 | 130 | ![](man/figures/jaccard_raincloud.png) 131 | 132 | ## Acknowledgements 133 | 134 | Thanks to Tim Sackton and Catherine Dulac for their supervision and support. 135 | Thanks to Yasin Kaymaz in Sackton group for fruitful discussion. 136 | Thanks to Stephen Eichhorn in Xiaowei Zhuang lab for the idea and sharing the python code working on [Scanpy](https://github.com/theislab/scanpy) object. 137 | Thanks to Sophia(Zhengzheng) Liang and Brandon Logeman in Dulac lab for sharing data and giving feedback. 138 | Thanks to [David Robinson](https://twitter.com/drob)'s `geomflatviolin` function which was used in the `raincloudplot`. 139 | 140 | ## Why this package? 141 | 142 | I saw `{fpc}` package has a function `clusterboot`. However, this function does not support 143 | SNN clustering. Although one can write a custom clustering function to feed into clusterboot, 144 | I need to build things upon `Seurat` package and those two can not be easily integrated. In 145 | addition, `clusterboot` is not parallelized, I have to implement the `snakemake` workflow for faster 146 | processing. 
147 | 148 | read this blog post http://www.win-vector.com/blog/2015/09/bootstrap-evaluation-of-clusters/ 149 | and https://www.czasopisma.uni.lodz.pl/foe/article/view/983 150 | 151 | 152 | ## To do list 153 | 154 | - [x] implement more visualization functions. 155 | - [ ] plot number of cells subsampled for each cluster in each iteration in raincloudplot. 156 | - [ ] impurity metric for assessing cluster stability. 157 | - [ ] read this post from Jean Fan from Xiaowei Zhuang's lab https://jef.works/blog/2018/02/28/stability-testing/ 158 | `getComMembership` function works on raw data matrix. It can be used independent of Seurat's `FindClusters`. chat with Jean for more details. 159 | - [ ] gene sets enrichment for each cluster. 160 | 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [![Build 5 | Status](https://travis-ci.com/crazyhottommy/scclusteval.svg?branch=master)](https://travis-ci.com/crazyhottommy/scclusteval) 6 | 7 | # scclusteval 8 | 9 |
10 | 11 | hex sticker 12 | 13 |

14 | 15 | hex sticker 16 | 17 |

18 | 19 |
20 | 21 | The goal of scclusteval(Single Cell Cluster Evaluation) is to evaluate 22 | the single cell clustering stability by subsampling the cells and 23 | provide many visualization methods for comparing clusters. 24 | 25 | For Theory behind the method, see Christian Henning, “Cluster-wise 26 | assessment of cluster stability,” Research Report 271, Dept. of 27 | Statistical Science, University College London, December 2006) 28 | 29 | ### Citation 30 | 31 | Ming Tang, Yasin Kaymaz,Brandon L. Logeman, Stephen Eichhorn, Zhengzheng 32 | S. Liang, Catherine Dulac and Timothy B. Sackton. Evaluating single-cell 33 | cluster stability using the Jaccard similarity index (2020) 34 | *Bioinformatics* btaa956, https://doi.org/10.1093/bioinformatics/btaa956. 35 | 36 | ### Parameters that affect the clustering 37 | 38 | The most popular clustering method for single cell RNAseq is shared 39 | nearest neighbor (SNN) followed by Louvain community detection algorithm 40 | which is implemented in `{Seurat::FindClusters}`. 41 | See a paper by Mark Robinson group for comparing single cell RNAseq 42 | clustering methods:[A systematic performance evaluation of clustering 43 | methods for single-cell RNA-seq 44 | data](https://f1000research.com/articles/7-1141/v1) SNN in Seurat is the 45 | most accurate and fast one. 46 | 47 | The parameter `k.param` which specifies the number of nearest neighbors 48 | has a great effect on the number of clusters. Other Parameters such as 49 | the number of PCs and the resolution can affect the number of clusters 50 | as well. 51 | 52 | The process is as follows. 53 | 54 | To assess which k is best to use by subsampling the original data: 55 | 56 | 1. Performing the clustering at many different K values on the full 57 | data set. 58 | 59 | 2. We then sample without replacement a subset of the data set 60 | (e.g. 
80% of the cells in the full data set), and then repeat the 61 | clustering procedure on just this subset of data (so repeating all 62 | aspects of clustering, including calling variable genes, calculating 63 | PCs, building the neighbor graph, etc), and we do this n times. 64 | 65 | 3. So for each K value, we have 1 clustering outcome for the full data 66 | set, and 20 clustering outcomes for subsampled portions of the data 67 | set. From this we identify the cluster in the first subsample 68 | clustering that is most similar to the full cluster 1 cells (the one 69 | that gives the maximum Jaccard coefficient) and record that value. 70 | If this maximum Jaccard coefficient is less than 0.6 (this is quite 71 | subjective), the original cluster is considered to be dissolved-it 72 | didn’t show up in the new clustering. A cluster that’s dissolved too 73 | often is probably not a “real” cluster. 74 | 75 | > As a rule of thumb, clusters with a stability value less than 0.6 76 | > should be considered unstable. Values between 0.6 and 0.75 indicate 77 | > that the cluster is measuring a pattern in the data, but there isn’t 78 | > high certainty about which points should be clustered together. 79 | > Clusters with stability values above about 0.85 can be considered 80 | > highly stable (they’re likely to be real clusters). 81 | 82 | 4. Repeat this for all subsample clustering outcomes, and then the 83 | stability value of a cluster is the median or mean Jaccard 84 | coefficient. If it’s greater than 0.6 (or a cutoff you set) we say 85 | it’s stable, otherwise it’s unstable. So for a given K value this 86 | gives you a stable/unstable assignment for each cluster. We choose 87 | the k value to select for clustering the data by looking at which k 88 | value yielded the largest number of stable clusters while still 89 | having most of the cells from the data set in a stable cluster. 
90 | 91 | We can repeat the 1-4 for different resolution and number of PCs and the 92 | combination of all different parameters. 93 | 94 | The workflow is: 95 | 96 | ![](man/figures/workflow.png) 97 | 98 | ## The subsampling process is implemented in a Snakemake workflow 99 | 100 | Because for each subsampling, one has to re-run the whole process of 101 | `FindVariableGenes`, `ScaleData`, `RunPCA`, `JackStraw` and 102 | `FindClusters` and for large data set, it can take very long time to 103 | run. 104 | 105 | E.g. if you test 5 different K, and for each K you subsample the full 106 | dataset 100 times. that’s 500 runs. 107 | 108 | Snakemake will take advantage of the HPC cluster with large number of 109 | CPUs avaiable. 110 | 111 | The R package works with the output from the Snakemake workflow: 112 | [pyflow\_seuratv3\_parameter](https://github.com/crazyhottommy/pyflow_seuratv3_parameter). 113 | 114 | ## The scclusteval R package is for downstream analysis 115 | 116 | ### Installation 117 | 118 | You can install the scclusteval from github: 119 | 120 | ``` r 121 | devtools::install_github("crazyhottommy/scclusteval") 122 | ``` 123 | 124 | ## Useful functions 125 | 126 | ``` r 127 | library(scclusteval) 128 | #> Loading required package: Seurat 129 | ?RandomSubsetData 130 | ?MergeMultipleSeuratObjects 131 | ?PreprocessSubsetData 132 | ?PairWiseJaccardSets 133 | 134 | ## in Rstudio type below and tab to see all avaiable functions 135 | ## scclusteval:: 136 | ``` 137 | 138 | ## Examples 139 | 140 | Examples to use the `scclusteval` package can be found at 141 | 142 | 143 | ![](man/figures/README-unnamed-chunk-1-1.png) 144 | 145 | ![](man/figures/jaccard_raincloud.png) 146 | 147 | ## Acknowledgements 148 | 149 | Thanks to Tim Sackton and Catherine Dulac for their supervision and 150 | support. 151 | Thanks to Yasin Kaymaz in Sackton group for fruitful discussion. 
152 | Thanks to Stephen Eichhorn in Xiaowei Zhuang lab for the idea and 153 | sharing the python code working on 154 | [Scanpy](https://github.com/theislab/scanpy) object. 155 | Thanks to Sophia(Zhengzheng) Liang and Brandon Logeman in Dulac lab for 156 | sharing data and giving feedback. 157 | Thanks to [David Robinson](https://twitter.com/drob)’s `geomflatviolin` 158 | function which was used in the `raincloudplot`. 159 | 160 | ## Why this package? 161 | 162 | I saw `{fpc}` package has a function `clusterboot`. However, this 163 | function does not support SNN clustering. Although one can write a 164 | custom clustering function to feed into clusterboot, I need to build 165 | things upon `Seurat` package and those two can not be easily 166 | integrated. In addition, `clusterboot` is not parallelized, I have to 167 | implement the `snakemake` workflow for faster processing. 168 | 169 | read this blog post 170 | <http://www.win-vector.com/blog/2015/09/bootstrap-evaluation-of-clusters/> 171 | and <https://www.czasopisma.uni.lodz.pl/foe/article/view/983> 172 | 173 | ## To do list 174 | 175 | - \[x\] implement more visualization functions. 176 | - \[ \] plot number of cells subsampled for each cluster in each 177 | iteration in raincloudplot. 178 | - \[ \] impurity metric for assessing cluster stability. 179 | - \[ \] read this post from Jean Fan from Xiaowei Zhuang’s lab 180 | <https://jef.works/blog/2018/02/28/stability-testing/> 181 | `getComMembership` function works on raw data matrix. It can be used 182 | independent of Seurat’s `FindClusters`. chat with Jean for more 183 | details. 184 | - \[ \] gene sets enrichment for each cluster. 
185 | -------------------------------------------------------------------------------- /data/idents.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/data/idents.rda -------------------------------------------------------------------------------- /man/AssignHighestJaccard.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{AssignHighestJaccard} 4 | \alias{AssignHighestJaccard} 5 | \title{Assign highest Jaccard index for each cluster of the subsampled data set before 6 | reclustering with the cluster identities of subsampled data set after reclustering} 7 | \usage{ 8 | AssignHighestJaccard(idents1, idents2) 9 | } 10 | \arguments{ 11 | \item{idents1}{A list of cluster identity copied from the original data sets. 12 | idents1 is a list of the cluster identity from the subsampled data sets before reclustering.} 13 | 14 | \item{idents2}{A list of cluster identity from the subsampled data sets. 15 | idents2 is a list of the cluster identity from the subsampled data sets after reclustering. 16 | The order of identities in idents1 and idents2 should correspond to each other.} 17 | } 18 | \value{ 19 | A matrix with dimension of #number of subsampling * #number of clusters in the 20 | original data set. 
21 | } 22 | \description{ 23 | Assign highest Jaccard index for each cluster of the subsampled data set before 24 | reclustering with the cluster identities of subsampled data set after reclustering 25 | } 26 | -------------------------------------------------------------------------------- /man/AssignStableCluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{AssignStableCluster} 4 | \alias{AssignStableCluster} 5 | \title{Assign stable cluster} 6 | \usage{ 7 | AssignStableCluster( 8 | idents1, 9 | idents2, 10 | method = "jaccard_median", 11 | jaccard_cutoff = 0.6, 12 | percent_cutoff = 0.6 13 | ) 14 | } 15 | \arguments{ 16 | \item{idents1}{A list of cluster identity copied from the original data sets. 17 | idents1 is a list of the cluster identity from the subsampled data sets before reclustering.} 18 | 19 | \item{idents2}{A list of cluster identity from the subsampled data sets. 20 | idents2 is a list of the cluster identity from the subsampled data sets after reclustering. 21 | The order of identities in idents1 and idents2 should correspond to each other.} 22 | 23 | \item{method}{How to summarize the jaccard index across all simulations 24 | to determine whether a cluster is stable or not. Options are "jaccard_mean", "jaccard_median" and "jaccard_percent"} 25 | 26 | \item{jaccard_cutoff}{Cutoff of the jaccard index to determine whether a cluster is stable or not. 27 | it is the mean or median cutoff when the method is "jaccard_mean" or "jaccard_median" and it is 28 | the cutoff for every subsampling when the method is "jaccard_percent"} 29 | 30 | \item{percent_cutoff}{The percentage of jaccard index greater than jaccard_cutoff. Used 31 | when method is "jaccard_percent". 
Specify 0.6 when you mean 60\%.} 32 | } 33 | \value{ 34 | A list containing the raw data for jaccard index for all simulations, 35 | TRUE or FALSE of stable cluster for each cluster and a number of stable clusters. 36 | A cluster is deemed as stable if the median (or mean) jaccard index is > cutoff. 37 | In addition, a stable_index is calculated, which is the percentage of jaccard index > 38 | cutoff for all the subsampling. e.g. for 100 times subsampling, 0.8 means 80\% of the 39 | time, the jaccard index is > cutoff. Sometimes, we see a bimodal distribution of the 40 | 100 jaccard index; in that case the percentage is a better measurement than the mean or median of the 41 | 100 jaccard index. 42 | } 43 | \description{ 44 | Assign stable cluster 45 | } 46 | \examples{ 47 | 48 | data(idents) 49 | 50 | AssignStableCluster(idents, idents) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /man/CalculatePercentCellInStable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{CalculatePercentCellInStable} 4 | \alias{CalculatePercentCellInStable} 5 | \title{Calculate the percentage of cells in stable clusters in the full data set} 6 | \usage{ 7 | CalculatePercentCellInStable(ident, stable_cluster) 8 | } 9 | \arguments{ 10 | \item{ident}{A named factor vector. 
names are the cell names, the values are 11 | the cluster id from the full data set.} 12 | 13 | \item{stable_cluster}{A logical vector for each of the original clusters indicating whether 14 | it is stable or not, calculated from \code{\link{AssignStableCluster}}} 15 | } 16 | \value{ 17 | A percentage of cells in stable cluster 18 | } 19 | \description{ 20 | Calculate the percentage of cells in stable clusters in the full data set 21 | } 22 | -------------------------------------------------------------------------------- /man/CalculateSilhouette.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calculatesilhouette.R 3 | \name{CalculateSilhouette} 4 | \alias{CalculateSilhouette} 5 | \title{Calculate Silhouette width from PCA space for each cell after clustering 6 | This is calculated from Seurat object} 7 | \usage{ 8 | CalculateSilhouette(object, dims = 1:50) 9 | } 10 | \arguments{ 11 | \item{object}{A Seurat object with Idents set to cluster ids (factors)} 12 | 13 | \item{dims}{default 1:50 dimension to use in the PCA space to calculate 14 | Euclidean distance} 15 | } 16 | \value{ 17 | a dataframe with silhouette width for each cell. 
see also \code{\link[cluster]{silhouette}} 18 | } 19 | \description{ 20 | Calculate Silhouette width from PCA space for each cell after clustering 21 | This is calculated from Seurat object 22 | } 23 | \examples{ 24 | CalculateSilhouette(pbmc_small, dims = 1:15) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/ClusterIdentityChordPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterviz.R 3 | \name{ClusterIdentityChordPlot} 4 | \alias{ClusterIdentityChordPlot} 5 | \title{Plot ChordDiagram of cell identity changes between two runs of clusters.} 6 | \usage{ 7 | ClusterIdentityChordPlot( 8 | ident1, 9 | ident2, 10 | clusters_to_show_ident1 = NULL, 11 | big.gap = 10, 12 | transparency = 0.5, 13 | grid.col = NULL, 14 | link.sort = TRUE, 15 | link.decreasing = TRUE, 16 | directional = -1 17 | ) 18 | } 19 | \arguments{ 20 | \item{ident1}{a named factor vector. names are the cell names, the values are 21 | the cluster id.} 22 | 23 | \item{ident2}{a named factor vector. names are the cell names, the values are 24 | the cluster id.} 25 | 26 | \item{clusters_to_show_ident1}{A character vector of cluster ids to show for ident1. 27 | default is NULL, all clusters will be shown.} 28 | 29 | \item{big.gap}{Gap between sectors of two cluster runs.} 30 | 31 | \item{transparency}{Transparency of link colors, 0 means no transparency and 1 means full transparency. 32 | see \code{\link[circlize]{chordDiagramFromMatrix}}} 33 | 34 | \item{grid.col}{Grid colors which correspond to matrix rows/columns (or sectors). 35 | The length of the vector should be either 1 or length(union(rownames(mat), colnames(mat))). 36 | It's preferred that grid.col is a named vector of which names correspond to sectors. 37 | If it is not a named vector, the order of grid.col corresponds to order of sectors. 
38 | see \code{\link[circlize]{chordDiagramFromMatrix}}} 39 | 40 | \item{link.sort}{whether sort links on every sector based on the width of the links on it. 41 | If it is set to "overall", all links are sorted regardless whether they are from rows or columns. 42 | see \code{\link[circlize]{chordDiagramFromMatrix}}} 43 | 44 | \item{link.decreasing}{for link.sort} 45 | 46 | \item{directional}{Whether links have directions. 1 means the direction is from the first column 47 | in df to the second column, -1 is the reverse, 0 is no direction, and 2 for two directional. 48 | see \code{\link[circlize]{chordDiagramFromMatrix}}} 49 | } 50 | \value{ 51 | A data frame which contains positions of links. see \code{\link[circlize]{chordDiagramFromMatrix}} 52 | } 53 | \description{ 54 | Plot ChordDiagram of cell identity changes between two runs of clusters. 55 | } 56 | -------------------------------------------------------------------------------- /man/ClusterSizeBarplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterviz.R 3 | \name{ClusterSizeBarplot} 4 | \alias{ClusterSizeBarplot} 5 | \title{Make a Barplot for cluster size} 6 | \usage{ 7 | ClusterSizeBarplot(ident, bar_col = "blue", label_number = TRUE) 8 | } 9 | \arguments{ 10 | \item{ident}{a named factor vector. names are the cell names, the values are 11 | the cluster id.} 12 | 13 | \item{bar_col}{color for the bar. 
Default is blue.} 14 | 15 | \item{label_number}{whether or not put cell number in each cluster on top of the bar} 16 | } 17 | \value{ 18 | a ggplot2 bar graph object 19 | } 20 | \description{ 21 | Make a Barplot for cluster size 22 | } 23 | \examples{ 24 | data(pbmc_small) 25 | ClusterSizeBarplot(Idents(pbmc_small)) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /man/JaccardRainCloudPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterviz.R 3 | \name{JaccardRainCloudPlot} 4 | \alias{JaccardRainCloudPlot} 5 | \title{Plot the Jaccard index distribution using raincloud plot} 6 | \usage{ 7 | JaccardRainCloudPlot(idents1, idents2, title = NULL) 8 | } 9 | \arguments{ 10 | \item{idents1}{A list of cluster identity from the subsampled data set 11 | before reclustering. (cluster id copied from the original full data set)} 12 | 13 | \item{idents2}{A list of cluster identity from the subsampled data sets after 14 | reclustering.} 15 | 16 | \item{title}{Title of the plot} 17 | } 18 | \value{ 19 | A ggplot2 object 20 | } 21 | \description{ 22 | Plot the Jaccard index distribution using raincloud plot 23 | } 24 | \examples{ 25 | 26 | \dontrun{ 27 | data(idents) 28 | ## the pbmc here need to be fully processed. 
29 | JaccardRainCloudPlot(idents, idents) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/JaccardSets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{JaccardSets} 4 | \alias{JaccardSets} 5 | \title{Calculate jaccard distance for two sets of character vectors} 6 | \usage{ 7 | JaccardSets(set1, set2) 8 | } 9 | \arguments{ 10 | \item{set1}{character vector 1} 11 | 12 | \item{set2}{character vector 2} 13 | } 14 | \value{ 15 | jaccard distance 16 | } 17 | \description{ 18 | Calculate jaccard distance for two sets of character vectors 19 | } 20 | \examples{ 21 | JaccardSets(sample(LETTERS, 10), sample(LETTERS, 10)) 22 | } 23 | -------------------------------------------------------------------------------- /man/MatchClusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{MatchClusters} 4 | \alias{MatchClusters} 5 | \title{Match two run of cluster ids with highest Jaccard index} 6 | \usage{ 7 | MatchClusters(ident1, ident2) 8 | } 9 | \arguments{ 10 | \item{ident1}{a named factor vector. names are the cell names, the values are 11 | the cluster id.} 12 | 13 | \item{ident2}{a named factor vector. names are the cell names, the values are 14 | the cluster id.} 15 | } 16 | \value{ 17 | A tibble with two columns, column 1 is the cluster ids from ident1, column2 18 | is the cluster ids from ident2. 
19 | } 20 | \description{ 21 | Match two run of cluster ids with highest Jaccard index 22 | } 23 | \examples{ 24 | \dontrun{ 25 | MatchClusters(pbmc@ident, pbmc_small@ident) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/MergeMultipleSeuratObjects.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mergemultipleseuratobjects.R 3 | \name{MergeMultipleSeuratObjects} 4 | \alias{MergeMultipleSeuratObjects} 5 | \title{Read multiple 10x runs into Seurat objects and merge into a single Seurat object} 6 | \usage{ 7 | MergeMultipleSeuratObjects(input_folders, do.normalize = FALSE, ...) 8 | } 9 | \arguments{ 10 | \item{input_folders}{A named list of folder path for each run.} 11 | 12 | \item{do.normalize}{Whether or not to normalize the data after merging, default is FALSE} 13 | 14 | \item{...}{Other parameters for CreateSeuratObject in the Seurat package} 15 | } 16 | \value{ 17 | A single merged Seurat object from multiple 10x runs. 18 | } 19 | \description{ 20 | Read multiple 10x runs into Seurat objects and merge into a single Seurat object. 21 | The names of the list of paths will be prepended to the cell name. 
22 | } 23 | \examples{ 24 | \dontrun{ 25 | library(fs) 26 | library(here) 27 | library(stringr) 28 | input_folders<- dir_ls( path = here("data"), recursive = T) \%>\% path_dir() \%>\% 29 | unique() \%>\% str_subset("mm10-1.2.0_premrna") 30 | merged_seurat<- MergeMultipleSeuratObjects(input_folders) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/PairWiseJaccardSets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{PairWiseJaccardSets} 4 | \alias{PairWiseJaccardSets} 5 | \title{Calculate pair-wise Jaccard distance for @ident slots from two Seurat objects} 6 | \usage{ 7 | PairWiseJaccardSets(ident1, ident2) 8 | } 9 | \arguments{ 10 | \item{ident1}{a named factor vector. names are the cell names, the values are 11 | the cluster id.} 12 | 13 | \item{ident2}{a named factor vector. names are the cell names, the values are 14 | the cluster id.} 15 | } 16 | \value{ 17 | a matrix of pair-wise Jaccard distance. Rows are clusters from ident1, 18 | columns are clusters from ident2 19 | } 20 | \description{ 21 | Calculate pair-wise Jaccard distance for two named factor vector. e.g. 
22 | seurat_obj1@ident and seurat_obj2@ident 23 | } 24 | \examples{ 25 | \dontrun{ 26 | PairWiseJaccardSets(pbmc@ident, pbmc_small@ident) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/PairWiseJaccardSetsHeatmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterviz.R 3 | \name{PairWiseJaccardSetsHeatmap} 4 | \alias{PairWiseJaccardSetsHeatmap} 5 | \title{Make a Heatmap of the pairwise Jaccard distance between cluster ident of two 6 | Seurat object} 7 | \usage{ 8 | PairWiseJaccardSetsHeatmap( 9 | ident1, 10 | ident2, 11 | best_match = FALSE, 12 | title = NULL, 13 | col_low = "white", 14 | col_high = "red", 15 | cluster_rows = F, 16 | cluster_columns = F, 17 | show_row_dend = F, 18 | show_column_dend = F, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{ident1}{a named factor vector. names are the cell names, the values are 24 | the cluster id.} 25 | 26 | \item{ident2}{a named factor vector. names are the cell names, the values are 27 | the cluster id.} 28 | 29 | \item{best_match}{Whether or not only show the best match of ident1 from ident2. 30 | if set to TRUE, the Jaccard index matrix will be subsetted using the ident2 column 31 | from the output of \code{\link{MatchClusters}}, the row order will be in order from cluster 32 | 0 to the total number of clusters, the columns will be the best match of ident1 from ident2, 33 | and the columns idents could be duplicated. e.g. 
single cluster from ident2 matches multiple 34 | clusters in ident1.} 35 | 36 | \item{title}{The title of the heatmap} 37 | 38 | \item{col_low}{Color for low Jaccard index.} 39 | 40 | \item{col_high}{Color for high Jaccard index.} 41 | 42 | \item{cluster_rows}{cluster rows or not, default FALSE} 43 | 44 | \item{cluster_columns}{cluster columns or not, default FALSE} 45 | 46 | \item{show_row_dend}{Whether or not show row dendrogram} 47 | 48 | \item{show_column_dend}{Whether or not show column dendrogram} 49 | 50 | \item{...}{other parameters passed to \code{\link[ComplexHeatmap]{Heatmap}}} 51 | } 52 | \value{ 53 | A Heatmap representing the pair-wise Jaccard correlation, rows are ident1, 54 | columns are ident2 55 | } 56 | \description{ 57 | Make a Heatmap of the pairwise Jaccard distance between cluster ident of two 58 | Seurat object 59 | } 60 | \examples{ 61 | 62 | } 63 | -------------------------------------------------------------------------------- /man/PairWiseOverlappingIdents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{PairWiseOverlappingIdents} 4 | \alias{PairWiseOverlappingIdents} 5 | \title{Calculate pair-wise overlapping cluster identities for @ident slots from two Seurat objects} 6 | \usage{ 7 | PairWiseOverlappingIdents(ident1, ident2) 8 | } 9 | \arguments{ 10 | \item{ident1}{a named factor vector. names are the cell names, the values are 11 | the cluster id.} 12 | 13 | \item{ident2}{a named factor vector. names are the cell names, the values are 14 | the cluster id.} 15 | } 16 | \value{ 17 | A matrix of pairwise number of common cell identities for each cluster. 18 | } 19 | \description{ 20 | Calculate pair-wise overlapping cluster identities for two named factor vectors. e.g. 
21 | seurat_obj1@ident and seurat_obj2@ident 22 | } 23 | \examples{ 24 | \dontrun{ 25 | PairWiseOverlappingIdents(pbmc@ident, pbmc_small@ident) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/ParameterSetScatterPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterviz.R 3 | \name{ParameterSetScatterPlot} 4 | \alias{ParameterSetScatterPlot} 5 | \title{Plot a scatter plot for different clustering parameters} 6 | \usage{ 7 | ParameterSetScatterPlot( 8 | stable_clusters, 9 | fullsample_idents, 10 | x_var, 11 | y_var, 12 | facet_rows, 13 | facet_cols 14 | ) 15 | } 16 | \arguments{ 17 | \item{stable_clusters}{a dataframe with list-columns for data, stable_cluster determined by 18 | \code{\link{AssignStableCluster}} and the rest of the columns are pc, resolution and k_param.} 19 | 20 | \item{fullsample_idents}{a dataframe with the list-column containing the original ident for 21 | the full dataset. This is the direct output from the Snakemake workflow.} 22 | 23 | \item{x_var}{one of "pc", "resolution" and "k_param".} 24 | 25 | \item{y_var}{one of "number" or "percentage". If it is "number", 26 | the y-axis is the total number of clusters and total number of stable clusters.} 27 | 28 | \item{facet_rows}{one of "pc", "resolution" and "k_param" for ggplot2 to facet.} 29 | 30 | \item{facet_cols}{one of "pc", "resolution" and "k_param" for ggplot2 to facet.} 31 | } 32 | \value{ 33 | a ggplot2 object 34 | } 35 | \description{ 36 | x-axis is the parameters tested (e.g. many different k.param) 37 | y-axis is the total number of clusters and total number of stable clusters based 38 | on the jaccard cutoff as determined by AssignStableCluster, or percentage of cells 39 | in stable clusters. 
40 | } 41 | -------------------------------------------------------------------------------- /man/PreprocessSubsetData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preprocesssubsetdata.R 3 | \name{PreprocessSubsetData} 4 | \alias{PreprocessSubsetData} 5 | \title{A wrapper for preprocessing subsetted Seurat object} 6 | \usage{ 7 | PreprocessSubsetData( 8 | object, 9 | variable.features.n = 3000, 10 | num.pc = 20, 11 | pc.use = NULL, 12 | workers = 2, 13 | score.thresh = 1e-05, 14 | sig.pc.thresh = 0.05, 15 | n.start = 100, 16 | nn.eps = 0, 17 | resolution = 0.8, 18 | k.param = 30, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{object}{A subsetted Seurat object created by RandomSubsetData} 24 | 25 | \item{variable.features.n}{number of variable features for \code{\link[Seurat]{SCTransform}}} 26 | 27 | \item{num.pc}{number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot 28 | step. The optimal PCs for FindClusters will be determined by only significant PCs 29 | from JackStrawPlot or if pc.use is set, the JackStraw step will be skipped and pc.use will be used 30 | for FindClusters.} 31 | 32 | \item{pc.use}{number of PCs used for FindClusters. if pc.use is set, the JackStraw step 33 | will be skipped and pc.use will be used for FindClusters. score.thresh and sig.pc.thresh will be ignored.} 34 | 35 | \item{workers}{number of CPUs to use for \code{\link[future]{plan}} parallel processing} 36 | 37 | \item{score.thresh}{Threshold to use for the proportion test of PC significance.} 38 | 39 | \item{sig.pc.thresh}{Threshold for the significance of a particular PC.} 40 | 41 | \item{n.start}{Number of random starts.} 42 | 43 | \item{nn.eps}{Error bound when performing nearest neighbor search using RANN; 44 | default of 0.0 implies exact nearest neighbor search. 
See FindClusters.} 45 | 46 | \item{resolution}{Value of the resolution parameter, use a value above (below) 47 | 1.0 if you want to obtain a larger (smaller) number of communities. see FindClusters.} 48 | 49 | \item{k.param}{Defines k for the k-nearest neighbor algorithm.} 50 | 51 | \item{...}{any other parameters} 52 | } 53 | \value{ 54 | a fully processed Seurat object 55 | } 56 | \description{ 57 | The wrapper does FindVariableGenes, ScaleData, RunPCA, JackStraw to 58 | determine how many PCs to use, ProjectPCA and FindClusters and returns 59 | a fully processed Seurat object. The input subsetted seurat object is 60 | supposed to be fully processed as well. So the NormalizeData step is not 61 | necessary. 62 | } 63 | \examples{ 64 | \dontrun{ 65 | pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8) 66 | pbmc_small_subset_processed<- PreprocessSubsetData(pbmc_small_subset) 67 | pbmc_small_subset_processed@meta.data 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /man/PreprocessSubsetDataV2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/preprocesssubsetdatav2.R 3 | \name{PreprocessSubsetDataV2} 4 | \alias{PreprocessSubsetDataV2} 5 | \title{A wrapper for preprocessing subsetted Seurat object using ScaleData} 6 | \usage{ 7 | PreprocessSubsetDataV2( 8 | object, 9 | nfeatures = 2000, 10 | num.pc = 20, 11 | pc.use = NULL, 12 | workers = 2, 13 | score.thresh = 1e-05, 14 | sig.pc.thresh = 0.05, 15 | n.start = 100, 16 | nn.eps = 0, 17 | resolution = 0.8, 18 | k.param = 30, 19 | ... 20 | ) 21 | } 22 | \arguments{ 23 | \item{object}{A subsetted Seurat object created by RandomSubsetData} 24 | 25 | \item{num.pc}{number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot 26 | step. 
The optimal PCs for FindClusters will be determined by only significant PCs 27 | from JackStrawPlot or if pc.use is set, the JackStraw step will be skipped and pc.use will be used 28 | for FindClusters.} 29 | 30 | \item{pc.use}{number of PCs used for FindClusters. if pc.use is set, the JackStraw step 31 | will be skipped and pc.use will be used for FindClusters. score.thresh and sig.pc.thresh will be ignored.} 32 | 33 | \item{workers}{number of CPUs to use for \code{\link[future]{plan}} parallel processing} 34 | 35 | \item{score.thresh}{Threshold to use for the proportion test of PC significance.} 36 | 37 | \item{sig.pc.thresh}{Threshold for the significance of a particular PC.} 38 | 39 | \item{n.start}{Number of random starts.} 40 | 41 | \item{nn.eps}{Error bound when performing nearest neighbor search using RANN; 42 | default of 0.0 implies exact nearest neighbor search. See FindClusters.} 43 | 44 | \item{resolution}{Value of the resolution parameter, use a value above (below) 45 | 1.0 if you want to obtain a larger (smaller) number of communities. see FindClusters.} 46 | 47 | \item{k.param}{Defines k for the k-nearest neighbor algorithm.} 48 | 49 | \item{...}{any other parameters} 50 | 51 | \item{variable.features.n}{number of variable features for \code{\link[Seurat]{SCTransform}}} 52 | } 53 | \value{ 54 | a fully processed Seurat object 55 | } 56 | \description{ 57 | The wrapper does FindVariableGenes, ScaleData, RunPCA, JackStraw to 58 | determine how many PCs to use, ProjectPCA and FindClusters and returns 59 | a fully processed Seurat object. The input subsetted seurat object is 60 | supposed to be fully processed as well. So the NormalizeData step is not 61 | necessary. 
62 | } 63 | \examples{ 64 | \dontrun{ 65 | pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8) 66 | pbmc_small_subset_processed<- PreprocessSubsetData(pbmc_small_subset) 67 | pbmc_small_subset_processed@meta.data 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /man/RandomSubsetData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/randomsubsetdata.R 3 | \name{RandomSubsetData} 4 | \alias{RandomSubsetData} 5 | \title{Randomly subset (cells) seurat object by a rate} 6 | \usage{ 7 | RandomSubsetData(object, rate, random.subset.seed = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{object}{Seurat object} 11 | 12 | \item{rate}{a number between 0 and 1 for subsetting} 13 | 14 | \item{random.subset.seed}{set a random seed for sampling, default is NULL.} 15 | 16 | \item{...}{any other parameters to \code{\link[Seurat]{subset}}} 17 | } 18 | \value{ 19 | Returns a randomly subsetted seurat object 20 | } 21 | \description{ 22 | Randomly subset (cells) seurat object by a rate 23 | } 24 | \examples{ 25 | pbmc_small 26 | pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8) 27 | dim(pbmc_small_subset@meta.data) 28 | 29 | 30 | } 31 | -------------------------------------------------------------------------------- /man/SilhouetteRainCloudPlot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusterviz.R 3 | \name{SilhouetteRainCloudPlot} 4 | \alias{SilhouetteRainCloudPlot} 5 | \title{Plot raincloud plot for silhouette score} 6 | \usage{ 7 | SilhouetteRainCloudPlot(silhouette_score) 8 | } 9 | \arguments{ 10 | \item{silhouette_score}{a dataframe returned by \code{\link{CalculateSilhouette}}} 11 | } 12 | \value{ 13 | a ggplot2 object 14 | } 15 | \description{ 16 | Plot raincloud plot for silhouette 
score 17 | } 18 | \examples{ 19 | 20 | SilhouetteRainCloudPlot(CalculateSilhouette(pbmc_small, dims = 1:15)) 21 | } 22 | -------------------------------------------------------------------------------- /man/figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/.DS_Store -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-3.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-4.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-5.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-3.png 
-------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-4.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-10.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-3.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-4.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-5.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-6.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-7.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-8.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-9.png -------------------------------------------------------------------------------- 
/man/figures/README-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /man/figures/jaccard_raincloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/jaccard_raincloud.png -------------------------------------------------------------------------------- /man/figures/scclusteval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/scclusteval.png -------------------------------------------------------------------------------- /man/figures/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/workflow.png -------------------------------------------------------------------------------- /man/geom_flat_violin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geomflatviolin.R 3 | \name{geom_flat_violin} 4 | \alias{geom_flat_violin} 5 | \title{A Flat Violin plot} 6 | \usage{ 7 | geom_flat_violin( 8 | mapping = NULL, 9 | data = NULL, 10 | stat = "ydensity", 11 | position = "dodge", 12 | trim = TRUE, 13 | scale = "area", 14 | show.legend = NA, 15 | inherit.aes = TRUE, 16 | ... 
17 | ) 18 | } 19 | \arguments{ 20 | \item{mapping}{See \code{\link[ggplot2]{geom_violin}}} 21 | 22 | \item{data}{See \code{\link[ggplot2]{geom_violin}}} 23 | 24 | \item{position}{See \code{\link[ggplot2]{geom_violin}}} 25 | 26 | \item{trim}{See \code{\link[ggplot2]{geom_violin}}} 27 | 28 | \item{scale}{See \code{\link[ggplot2]{geom_violin}}} 29 | 30 | \item{show.legend}{See \code{\link[ggplot2]{geom_violin}}} 31 | 32 | \item{inherit.aes}{See \code{\link[ggplot2]{geom_violin}}} 33 | 34 | \item{...}{} 35 | } 36 | \description{ 37 | A Flat Violin plot 38 | } 39 | \examples{ 40 | library(ggplot2) 41 | ggplot(diamonds, aes(cut, carat)) + 42 | geom_flat_violin() + 43 | coord_flip() 44 | 45 | } 46 | -------------------------------------------------------------------------------- /man/idents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/idents.R 3 | \docType{data} 4 | \name{idents} 5 | \alias{idents} 6 | \title{cluster identity of subsetted pbmc data} 7 | \format{ 8 | A list of factors 9 | } 10 | \source{ 11 | \url{https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz} 12 | } 13 | \usage{ 14 | data(idents) 15 | } 16 | \description{ 17 | The 2700-cell pbmc data were subsetted to 80 percent of the cells 100 times. 18 | Each time, we fully re-processed the subsetted data from FindVariableGenes to 19 | FindClusters using k = 30 and resolution = 0.6, and recorded the cluster identity 20 | from the processed seurat@ident 21 | slot, saving the results into a list of factors.
22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/scClusterBoot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusterboot.R 3 | \name{scClusterBoot} 4 | \alias{scClusterBoot} 5 | \title{Bootstrap for a fully processed Seurat object} 6 | \usage{ 7 | scClusterBoot(object, n = 4, rate = 0.8, ...) 8 | } 9 | \arguments{ 10 | \item{object}{A fully processed Seurat object.} 11 | 12 | \item{n}{Number of times you want to bootstrap.} 13 | 14 | \item{rate}{A number between 0 and 1 for subsampling the cells.} 15 | 16 | \item{...}{Other parameters passed to \code{\link{PreprocessSubsetData}}} 17 | } 18 | \value{ 19 | A list of lists containing the ident from the subsetted reclustered 20 | seurat objects. 
21 | } 22 | \description{ 23 | Bootstrap for a fully processed Seurat object 24 | } 25 | \examples{ 26 | 27 | } 28 | -------------------------------------------------------------------------------- /man/scclusteval-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scclusteval-package.R 3 | \docType{package} 4 | \name{scclusteval-package} 5 | \alias{scclusteval} 6 | \alias{scclusteval-package} 7 | \title{scclusteval: Evaluate the single cell clustering} 8 | \description{ 9 | What the package does (one paragraph). 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/crazyhottommy/scclusteval} 15 | \item Report bugs at \url{https://github.com/crazyhottommy/scclusteval/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Ming Tang \email{tangming2005@gmail.com} 21 | 22 | } 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /scclusteval.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 8 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly=TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, skip_on_cran = TRUE) 3 | 
-------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/pbmc_example.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "walk through scclusteval using pbmc data" 3 | author: "Ming Tang" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{walk through scclusteval using pbmc data} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 13 | ```{r}
library(Seurat)
library(dplyr)
# Attach the package this vignette walks through: RandomSubsetData(),
# PreprocessSubsetData(), JaccardSets() and scClusterBoot() used below are
# scclusteval functions and were previously called without attaching it.
library(scclusteval)

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "~/Downloads/filtered_gene_bc_matrices/hg19/")

pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200,
                           project = "10X_PBMC")

# Second, untouched copy of the raw object (not used again in this vignette).
pbmc2 <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200,
                            project = "10X_PBMC")

mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ]) /
  Matrix::colSums(pbmc@raw.data)

# AddMetaData adds columns to object@meta.data, and is a great place to
# stash QC stats
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
                      scale.factor = 10000)

pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean,
                          dispersion.function = LogVMR,
                          x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))

pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE,
               pcs.print = 1:5, genes.print = 5, pcs.compute = 100)

# this step takes long time.
# the Seurat tutorial uses 20 PCs. for large data sets, we sometimes use 85 PCs,
# I set 100 here for example.
# TRUE instead of T: T is an ordinary variable and can be reassigned.
pbmc <- JackStraw(object = pbmc, num.pc = 100, num.replicate = 100,
                  display.progress = TRUE, do.par = TRUE, num.cores = 6)

pbmc@dr$pca@jackstraw@overall.p.values
## default threshold is 1e-5, keep that.keep the first 100 PCs's pvalue.
pbmc <- JackStrawPlot(object = pbmc, PCs = 1:100, score.thresh = 1e-5)

JackStrawPlot(pbmc, PCs = 1:10)
pc.use <- 10

pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:pc.use,
                     resolution = 0.6, print.output = FALSE, save.SNN = TRUE)

pbmc@ident


pbmc <- RunTSNE(object = pbmc, dims.use = 1:pc.use, do.fast = TRUE)

# Draw one random subset of the cells (rate = 0.8) and re-run the
# preprocessing/clustering on it with the cutoffs used for the full data set.
# Replaces three copy-pasted RandomSubsetData()/PreprocessSubsetData() pairs.
SubsetAndRecluster <- function(object) {
  subsetted <- RandomSubsetData(object, rate = 0.8)
  PreprocessSubsetData(subsetted, x.low.cutoff = 0.0125, x.high.cutoff = 3,
                       y.cutoff = 0.5, resolution = 0.6, num.pc = 20)
}

pbmc_sub1 <- SubsetAndRecluster(pbmc)
pbmc_sub2 <- SubsetAndRecluster(pbmc)
pbmc_sub3 <- SubsetAndRecluster(pbmc)


(pbmc@ident == 1) %>% table()

# Cell ids of cluster 0 in the full data and of clusters 0-3 in the first subset.
original_cluster0 <- names(pbmc@ident[pbmc@ident == 0])

sub1_cluster0 <- names(pbmc_sub1@ident[pbmc_sub1@ident == 0])
sub1_cluster1 <- names(pbmc_sub1@ident[pbmc_sub1@ident == 1])
sub1_cluster2 <- names(pbmc_sub1@ident[pbmc_sub1@ident == 2])
sub1_cluster3 <- names(pbmc_sub1@ident[pbmc_sub1@ident == 3])

# Jaccard index of the two cell-id sets, computed by hand and cross-checked
# against bayesbio::jaccardSets().
jaccard_cluster0 <- length(intersect(original_cluster0, sub1_cluster0)) /
  length(union(original_cluster0, sub1_cluster0))
jaccard_cluster0
bayesbio::jaccardSets(original_cluster0, sub1_cluster0)

# Jaccard ("binary") distance between the two sets. stats::dist() cannot be
# used for this: it takes a single numeric matrix, so passing two character
# vectors of cell ids does not compare them.
1 - jaccard_cluster0


total_cluster_ids <- length(unique(pbmc_sub1@ident))
levels(pbmc_sub1@ident)

pbmc_sub1@ident == 1
pbmc_sub1@meta.data %>%
  tibble::rownames_to_column(var = "cell_id") %>%
  select(cell_id, res.0.6)

# One row per subset cluster, with that cluster's cell ids nested in `data`.
dat <- tibble(cell_id = names(pbmc_sub1@ident), cluster = pbmc_sub1@ident) %>%
  tidyr::nest(-cluster) %>%
  arrange(cluster)

# Highest Jaccard index between original cluster 0 and any subset cluster.
dat %>%
  mutate(jaccard = purrr::map_dbl(data, ~ JaccardSets(original_cluster0, .x$cell_id))) %>%
  pull(jaccard) %>%
  max()

pbmc@ident == 3
levels(pbmc@ident)
pbmc@meta.data$res.0.6 %>% head()
pbmc_sub1@ident

boot_test <- scClusterBoot(object = pbmc, n = 3, rate = 0.8, x.low.cutoff = 0.0125,
                           x.high.cutoff = 3, y.cutoff = 0.5, resolution = 0.6,
                           num.pc = 20, num.cores = 8)

# Pull the "ident" element out of every bootstrap result. `::` is used as
# everywhere else in this chunk; `:::` was unnecessary.
boot_clusters <- purrr::map(boot_test, "ident")

## total 8 clusters in the original data set
pbmc@ident %>% unique() %>% length()

# for loops are slow.

# Turn a named cluster-identity vector into a nested tibble: one row per
# cluster (sorted by cluster), with that cluster's cell ids in the list-column
# `data` (column `cell.id`).
TurnIdentToDf <- function(ident) {
  tibble(cell.id = names(ident), cluster = ident) %>%
    tidyr::nest(-cluster) %>%
    arrange(cluster)
}

TurnIdentToDf(boot_clusters[[1]])
# Computed once and reused below instead of being rebuilt for every comparison.
original_clusters_df <- TurnIdentToDf(pbmc@ident)
original_clusters_df$data[[1]]

boot_clusters_df <- purrr::map(boot_clusters, TurnIdentToDf)


# For each of the first six original clusters, the Jaccard index against every
# cluster of the third bootstrap run (one tibble per original cluster).
lapply(1:6, function(k) {
  reference_cells <- original_clusters_df$data[[k]]$cell.id
  boot_clusters_df[[3]] %>%
    mutate(jaccard = purrr::map_dbl(data, ~ JaccardSets(reference_cells, .x$cell.id)))
})


split(names(boot_clusters[[1]]), boot_clusters[[1]]) %>% lapply(length)

## split the cells by cluster
# Returns a list with one character vector of cell names per level of `ident`.
SplitIdentByCluster <- function(ident) {
  split(names(ident), ident)
}

# Was SplitIndentByCluster(), a name that is never defined.
SplitIdentByCluster(pbmc@ident)

# Prototype: pairwise Jaccard index between two lists of cell-id vectors.
# Rows follow ident1.list, columns follow ident2.list. Note that this takes
# LISTS (see SplitIdentByCluster), unlike the later calls in this chunk that
# pass identity factors and are therefore namespace-qualified.
PairWiseJaccardSets <- function(ident1.list, ident2.list) {
  # Nothing to compare: return an empty matrix instead of failing when
  # assigning rownames to NULL.
  if (length(ident1.list) == 0) {
    return(matrix(numeric(0), nrow = 0, ncol = length(ident2.list),
                  dimnames = list(NULL, names(ident2.list))))
  }
  # Build all rows first and bind once, rather than growing with rbind().
  rows <- lapply(ident1.list, function(cells1) {
    purrr::map_dbl(ident2.list, ~ JaccardSets(cells1, .x))
  })
  res <- do.call(rbind, rows)
  rownames(res) <- names(ident1.list)
  res
}

mat <- PairWiseJaccardSets(SplitIdentByCluster(pbmc@ident),
                           SplitIdentByCluster(boot_clusters[[2]]))

# These lists must exist before they are used; they were previously assigned
# only after the PairWiseJaccardSets(ident1.list, ident2.list) call.
ident1.list <- SplitIdentByCluster(pbmc@ident)
ident2.list <- SplitIdentByCluster(boot_clusters[[3]])

mat <- PairWiseJaccardSets(ident1.list, ident2.list)
library(ComplexHeatmap)

Heatmap(mat, cluster_rows = FALSE, cluster_columns = FALSE)


id1 <- purrr::map_dbl(ident2.list, ~ JaccardSets(ident1.list[[1]], .x))
id2 <- purrr::map_dbl(ident2.list, ~ JaccardSets(ident1.list[[2]], .x))

sum(ident1.list$`7` %in% unlist(ident2.list))

lapply(ident2.list, function(x) sum(ident1.list$`6` %in% x))
lapply(ident2.list, length)


# presumably provides the `idents` list used below - confirm the contents of
# the .rda file.
load("~/gather_bootstrap_cluster.rda")

# From here on the identity factors are passed directly, so the packaged
# function is called explicitly: the list-based prototype defined above would
# otherwise mask it (and `ident1 =` would partial-match `ident1.list`).
scclusteval::PairWiseJaccardSets(pbmc@ident, idents[[3]])


### put in function
# These three statements were wrapped in stray backticks, which made R parse
# them as one (undefined) symbol, so `mats` was never created.
mat_list <- purrr::map(
  idents,
  ~ scclusteval::PairWiseJaccardSets(ident1 = pbmc@ident, ident2 = .x)
)

# NOTE(review): only AssignHighestJaccard has a man page in this package;
# confirm SelectHighestJaccard still exists under this name.
mat_max <- purrr::map(mat_list, SelectHighestJaccard)

mats <- purrr::reduce(mat_max, bind_rows)

# Long format shared by the two plots below: one row per bootstrap x cluster.
mats_long <- mats %>%
  as_tibble() %>%
  tibble::rownames_to_column(var = "bootstrap") %>%
  tidyr::gather(-bootstrap, key = "cluster", value = "jaccard")

mats_long %>%
  ggplot(aes(x = cluster, y = jaccard)) +
  geom_point() +
  geom_boxplot(aes(col = cluster))

# NOTE(review): this executes code fetched from a remote gist at knit time.
# The package documents its own geom_flat_violin(); consider using that instead.
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")

mats_long %>%
  ggplot(aes(x = cluster, y = jaccard, fill = cluster)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
  geom_point(aes(y = jaccard, color = cluster),
             position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  # `guides = FALSE` is not a geom_boxplot() parameter; show.legend is.
  geom_boxplot(width = .1, show.legend = FALSE, outlier.shape = NA, alpha = 0.5) +
  theme(legend.position = "none")



# A cluster counts as stable when its median (or mean) Jaccard index over the
# bootstraps exceeds 0.4. The comparison already yields TRUE/FALSE.
stable_by_median <- mats %>%
  dplyr::summarise_all(median) %>%
  dplyr::mutate_all(~ .x > 0.4)
stable_by_median

mats %>%
  dplyr::summarise_all(mean) %>%
  dplyr::mutate_all(~ .x > 0.4) %>%
  unlist() %>%
  str()

## how many stable clusters?
stable_by_median %>% unlist() %>% sum()


ks_idents <- readRDS("~/gather_bootstrap_k.rds")

ks_stable <- purrr::map(ks_idents, ~ AssignStableCluster(pbmc@ident, .x))

k_20_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_20.rds")
k_25_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_25.rds")
k_30_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_30.rds")
k_35_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_35.rds")

pbmc <- k_30_seurat
ks_idents_original <- list(k_20_seurat@ident, k_25_seurat@ident,
                           k_30_seurat@ident, k_35_seurat@ident)
names(ks_idents_original) <- c("k20", "k25", "k30", "k35")

JaccardRainCloudPlot(k_20_seurat@ident, ks_idents$`20`) +
  geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
JaccardRainCloudPlot(k_25_seurat@ident, ks_idents$`25`) +
  geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
JaccardRainCloudPlot(k_30_seurat@ident, ks_idents$`30`) +
  geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
JaccardRainCloudPlot(k_35_seurat@ident, ks_idents$`35`) +
  geom_hline(yintercept = c(0.4, 0.8), linetype = 2)

## cluster7 and cluster 8 from k20 is the same cluster7 from k25
PairWiseJaccardSetsHeatmap(
  scclusteval::PairWiseJaccardSets(k_20_seurat@ident, k_25_seurat@ident)
)

# Overwrites the ks_stable computed above: each k is now compared against its
# own full-data clustering rather than against pbmc@ident.
ks_stable <- purrr::map2(ks_idents_original, ks_idents,
                         ~ AssignStableCluster(ident1 = .x, idents = .y))

ks_stable$k20
ks_stable$k25
ks_stable$k30
ks_stable$k35

# purrr is never attached in this chunk, so map() must be namespace-qualified.
# NOTE(review): a character vector extractor indexes level by level
# (x[["a"]][["b"]]), and "perdent_cell_in_cluster" looks misspelt - confirm
# against the names AssignStableCluster() actually returns.
purrr::map(ks_stable, c("perdent_cell_in_cluster", "number_of_stable_cluster"))

table(k_20_seurat@ident, k_30_seurat@ident)

jaccard_mat <- scclusteval::PairWiseJaccardSets(k_20_seurat@ident, k_25_seurat@ident)

# Column name of the largest entry of `x`, where `x` is one row of `mat`.
# `mat` defaults to the global jaccard_mat so existing one-argument calls
# (e.g. via apply()) keep working.
get_colname <- function(x, mat = jaccard_mat) {
  colnames(mat)[which.max(x)]
}

# For every row cluster, the best-matching column cluster.
ids <- apply(jaccard_mat, 1, get_colname)
tibble::tibble(ident1 = names(ids), ident2 = ids)

MatchClusters(k_20_seurat@ident, k_25_seurat@ident)

mat <- PairWiseOverlappingIdents(k_20_seurat@ident, k_25_seurat@ident)
# Prefix row/column labels so both sides stay distinct in the chord diagram.
rownames(mat) <- paste0("1_", rownames(mat))
colnames(mat) <- paste0("2_", colnames(mat))
circlize::circos.par(start.degree = 90, clock.wise = FALSE)
#grid.col<- c("")
circlize::chordDiagram(mat, big.gap = 20, transparency = 0.5, link.sort = TRUE,
                       link.decreasing = FALSE, directional = -1)
circlize::circos.clear()


pbmc@ident %>% table() %>% as.data.frame()
pbmc@ident %>% table() %>% rbind() %>% as.data.frame()

# Number of cells per cluster, drawn as a labelled bar plot. The table is now
# computed once and reused (it was previously assigned and then recomputed).
cluster_size <- as.data.frame(table(pbmc@ident))
cluster_size %>%
  dplyr::rename(cluster = Var1, size = Freq) %>%
  ggplot(aes(x = cluster, y = size)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = size), vjust = -1.5, angle = 45) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
295 | ``` 296 | 297 | 298 | 299 | --------------------------------------------------------------------------------