├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
    ├── calculatesilhouette.R
    ├── clusterviz.R
    ├── geomflatviolin.R
    ├── idents.R
    ├── mergemultipleseuratobjects.R
    ├── preprocesssubsetdata.R
    ├── preprocesssubsetdatav2.R
    ├── randomsubsetdata.R
    ├── scclusterboot.R
    ├── scclusteval-package.R
    ├── snncbi.R
    └── utils-pipe.R
├── README.Rmd
├── README.md
├── data
    └── idents.rda
├── man
    ├── AssignHighestJaccard.Rd
    ├── AssignStableCluster.Rd
    ├── CalculatePercentCellInStable.Rd
    ├── CalculateSilhouette.Rd
    ├── ClusterIdentityChordPlot.Rd
    ├── ClusterSizeBarplot.Rd
    ├── JaccardRainCloudPlot.Rd
    ├── JaccardSets.Rd
    ├── MatchClusters.Rd
    ├── MergeMultipleSeuratObjects.Rd
    ├── PairWiseJaccardSets.Rd
    ├── PairWiseJaccardSetsHeatmap.Rd
    ├── PairWiseOverlappingIdents.Rd
    ├── ParameterSetScatterPlot.Rd
    ├── PreprocessSubsetData.Rd
    ├── PreprocessSubsetDataV2.Rd
    ├── RandomSubsetData.Rd
    ├── SilhouetteRainCloudPlot.Rd
    ├── figures
    │   ├── .DS_Store
    │   ├── README-pressure-1.png
    │   ├── README-unnamed-chunk-1-1.png
    │   ├── README-unnamed-chunk-2-1.png
    │   ├── README-unnamed-chunk-2-2.png
    │   ├── README-unnamed-chunk-2-3.png
    │   ├── README-unnamed-chunk-2-4.png
    │   ├── README-unnamed-chunk-2-5.png
    │   ├── README-unnamed-chunk-3-1.png
    │   ├── README-unnamed-chunk-3-2.png
    │   ├── README-unnamed-chunk-3-3.png
    │   ├── README-unnamed-chunk-3-4.png
    │   ├── README-unnamed-chunk-4-1.png
    │   ├── README-unnamed-chunk-4-10.png
    │   ├── README-unnamed-chunk-4-2.png
    │   ├── README-unnamed-chunk-4-3.png
    │   ├── README-unnamed-chunk-4-4.png
    │   ├── README-unnamed-chunk-4-5.png
    │   ├── README-unnamed-chunk-4-6.png
    │   ├── README-unnamed-chunk-4-7.png
    │   ├── README-unnamed-chunk-4-8.png
    │   ├── README-unnamed-chunk-4-9.png
    │   ├── README-unnamed-chunk-5-1.png
    │   ├── jaccard_raincloud.png
    │   ├── scclusteval.png
    │   └── workflow.png
    ├── geom_flat_violin.Rd
    ├── idents.Rd
    ├── pipe.Rd
    ├── scClusterBoot.Rd
    └── scclusteval-package.Rd
├── scclusteval.Rproj
├── tests
    └── spelling.R
└── vignettes
    ├── .gitignore
    └── pbmc_example.Rmd


/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Use R
 2 | language: r
 3 | sudo: true
 4 | cache: packages
 5 | warnings_are_errors: false
 6 | 
 7 | # environment variables set for all builds
 8 | env:
 9 |   global:
10 |     - BIOC_USE_DEVEL="FALSE"  ## Use the current release version
11 |     - R_BUILD_ARGS="--no-build-vignettes --no-manual"
12 |     - R_CHECK_ARGS="--no-build-vignettes --no-manual --timings"  ## do not build vignettes or manual
13 |     - _R_CHECK_TIMINGS_="0"  ## get the timing information for the examples for all of your functions
14 | 
15 | r:
16 |  - release
17 | 
18 | # do not build vignettes...takes too long and times out on travis
19 | r_build_args: --no-build-vignettes --no-manual
20 | r_check_args: --no-build-vignettes --no-manual --timings
21 | 
22 | # for codecov
23 | r_packages:
24 |   - covr
25 |   
26 | # we need to install BiocInstaller for testing Bioconductor packages
27 | bioc_required: true
28 | 
29 | # only report coverage for the release version
30 | after_success:
31 |   - test $TRAVIS_R_VERSION_STRING = 'release' && Rscript -e 'covr::codecov()'
32 | 
33 | notifications:
34 |   email:
35 |     on_success: change
36 |     on_failure: change


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: scclusteval
 2 | Title: Evaluate the single cell clustering 
 3 | Version: 0.0.0.9000
 4 | Authors@R:
 5 |     person(given = "Ming",
 6 |            family = "Tang",
 7 |            role = c("aut", "cre"),
 8 |            email = "tangming2005@gmail.com")
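 9 | Description: Evaluates the stability of single cell clusters found with 'Seurat'. The data are repeatedly subsampled and re-clustered, and the clusters from each subsample are compared with the original clusters using the Jaccard index. Silhouette widths and plotting helpers (heatmaps, raincloud plots, chord diagrams) are also provided.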
10 | License: MIT + file LICENSE
11 | Encoding: UTF-8
12 | LazyData: true
13 | Roxygen: list(markdown = TRUE)
14 | RoxygenNote: 7.1.0
15 | biocViews:
16 | Imports:
17 |     BiocParallel,
18 |     circlize,
19 |     cluster,
20 |     ComplexHeatmap,
21 |     dplyr,
22 |     future.apply,
23 |     ggplot2,
24 |     grid,
25 |     magrittr,
26 |     plyr,
27 |     purrr,
28 |     scales,
29 |     stats,
30 |     tibble,
31 |     tidyr
32 | Suggests:
33 |     spelling,
34 |     knitr,
35 |     rmarkdown
36 | Language: en-US
37 | URL: https://github.com/crazyhottommy/scclusteval
38 | BugReports: https://github.com/crazyhottommy/scclusteval/issues
39 | VignetteBuilder: knitr
40 | Depends:
41 |     Seurat (>= 4.0.0)
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Ming Tang
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2018 Ming Tang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export("%>%")
 4 | export(AssignHighestJaccard)
 5 | export(AssignStableCluster)
 6 | export(CalculatePercentCellInStable)
 7 | export(CalculateSilhouette)
 8 | export(ClusterIdentityChordPlot)
 9 | export(ClusterSizeBarplot)
10 | export(JaccardRainCloudPlot)
11 | export(JaccardSets)
12 | export(MatchClusters)
13 | export(MergeMultipleSeuratObjects)
14 | export(PairWiseJaccardSets)
15 | export(PairWiseJaccardSetsHeatmap)
16 | export(PairWiseOverlappingIdents)
17 | export(ParameterSetScatterPlot)
18 | export(PreprocessSubsetData)
19 | export(PreprocessSubsetDataV2)
20 | export(RandomSubsetData)
21 | export(SilhouetteRainCloudPlot)
22 | export(geom_flat_violin)
23 | export(scClusterBoot)
24 | importFrom(magrittr,"%>%")
25 | 


--------------------------------------------------------------------------------
/R/calculatesilhouette.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Calculate Silhouette width from PCA space for each cell after clustering
 3 | #' This is calculated from Seurat object
 4 | #' @param object A Seurat object with Idents set to cluster ids (factors)
 5 | #' @param dims default 1:50  dimension to use in the PCA space to calculate
 6 | #' eucledian distance
 7 | #'
 8 | #' @return a dataframe with silhouette width for each cell. see also \code{\link[cluster]{silhouette}}
 9 | #' @export
10 | #'
11 | #' @examples
12 | #' CalculateSilhouette(pbmc_small, dims = 1:15)
13 | #'
14 | CalculateSilhouette<- function(object, dims = 1:50){
15 |         if (length(dims) > ncol(object@reductions$pca@cell.embeddings)) {
16 |                 stop("please specify PCA dims smaller than calculated")
17 |         }
18 |         cell_distance<- dist(object@reductions$pca@cell.embeddings[, dims])
19 |         # or as.integer
20 |         cell_cluster<- as.numeric(as.character(Idents(object)))
21 |         silhouette_score<- cluster::silhouette(cell_cluster, cell_distance)
22 |         silhouette_score<- tibble::tibble(cluster = silhouette_score[,1],
23 |                                           width = silhouette_score[,3],
24 |                                           cell = colnames(object)) %>%
25 |                 dplyr::mutate(cluster = as.factor(cluster))
26 |         return(silhouette_score)
27 | }
28 | 


--------------------------------------------------------------------------------
/R/clusterviz.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Make a Barplot for cluster size
  3 | #'
  4 | #' @param ident a named factor vector. names are the cell names, the values are
  5 | #' the cluster id.
  6 | #' @param bar_col color for the bar. Default is blue.
  7 | #' @param label_number whether or not put cell number in each cluster on top of the bar
  8 | #'
  9 | #' @return a ggplot2 bar graph object
 10 | #' @export
 11 | #'
 12 | #' @examples
 13 | #' data(pbmc_small)
 14 | #' ClusterSizeBarplot(Idents(pbmc_small))
 15 | #'
 16 | ClusterSizeBarplot<- function(ident, bar_col = "blue", label_number = TRUE){
 17 |         g<- as.data.frame(table(ident)) %>%
 18 |                 dplyr::rename(cluster = ident, size = Freq) %>%
 19 |                 ggplot2::ggplot(ggplot2::aes(x = cluster, y = size)) +
 20 |                 ggplot2::geom_bar(stat = "identity", fill = bar_col) +
 21 |                 ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
 22 |         if (!label_number){
 23 |                 return (g)
 24 | 
 25 |         } else {
 26 |                 g<- g + ggplot2::geom_text(ggplot2::aes(label=size), vjust= -1.5, angle = 45)
 27 |                 return (g)
 28 | 
 29 |         }
 30 | }
 31 | 
 32 | #' Make a Heatmap of the pairwise Jaccard distance between cluster ident of two
 33 | #' Seurat object
 34 | #'
 35 | #'
 36 | #' @param ident1 a named factor vector. names are the cell names, the values are
 37 | #' the cluster id.
 38 | #' @param ident2 a named factor vector. names are the cell names, the values are
 39 | #' the cluster id.
 40 | #' @param col_low Color for low Jaccard index.
 41 | #' @param col_high Color for high Jaccard index.
 42 | #' @param title  The title of the heatmap
 43 | #' @param cluster_rows  cluster row or not, default FALSE
 44 | #' @param cluster_columns cluster columns or not, default FASLE
 45 | #' @param show_column_dend Whether or not show column dendrogram
 46 | #' @param show_row_dend Whether or not show row dendrogram
 47 | #' @param best_match Whether or not only show the best match of ident1 from ident2.
 48 | #' if set to TRUE, the Jaccard index matrix will be subsetted using the ident2 column
 49 | #' from the output of \code{\link{MatchClusters}}, the row order will be in order from cluster
 50 | #' 0 to the total number of clusters, the columns will be the best match of ident1 from ident2,
 51 | #' and the columns idents could be duplicated. e.g. single cluster from ident2 matches multiple
 52 | #' clusters in ident1.
 53 | #' @param ... other parameters pass to \code{\link[ComplexHeatmap]{Heatmap}}
 54 | #'
 55 | #' @return A Heatmap representing the pair-wise Jaccard correlation, rows are ident1,
 56 | #' columns are ident2
 57 | #' @export
 58 | #'
 59 | #' @examples
 60 | #'
 61 | PairWiseJaccardSetsHeatmap<- function(ident1, ident2, best_match = FALSE,
 62 |                                       title = NULL, col_low = "white", col_high= "red",
 63 |                                       cluster_rows = F, cluster_columns =F,
 64 |                                       show_row_dend = F, show_column_dend = F, ...){
 65 |         cell_fun = function(j, i, x, y, width, height, fill) {
 66 |                 grid::grid.rect(x = x, y = y, width = width *0.99, height = height *0.99,
 67 |                           gp = grid::gpar(col = "grey", fill = fill, lty = 1, lwd = 0.5))
 68 |         }
 69 |         mat<- PairWiseJaccardSets(ident1, ident2)
 70 |         col_fun<- circlize::colorRamp2(c(0, 1), c(col_low, col_high))
 71 |         if (best_match){
 72 |                 cluster_rows = F
 73 |                 cluster_columns =F
 74 |                 show_row_dend = F
 75 |                 show_column_dend = F
 76 |                 match_idx<- MatchClusters(ident1, ident2)
 77 |                 ComplexHeatmap::Heatmap(mat[, match_idx$ident2],
 78 |                                         cluster_rows = cluster_rows, cluster_columns = cluster_columns,
 79 |                                         show_row_names = T, show_column_names = T,
 80 |                                         show_row_dend = show_row_dend,
 81 |                                         show_column_dend = show_column_dend,
 82 |                                         col = col_fun, rect_gp = grid::gpar(type = "none"),
 83 |                                         cell_fun = cell_fun,
 84 |                                         name = "Jaccard index",
 85 |                                         column_title = title,
 86 |                                         heatmap_legend_param = list(color_bar = "discrete"),
 87 |                                         ...)
 88 |         }
 89 |         else{
 90 |                 ComplexHeatmap::Heatmap(mat,
 91 |                                         cluster_rows = cluster_rows, cluster_columns = cluster_columns,
 92 |                                         show_row_names = T, show_column_names = T,
 93 |                                         show_row_dend = show_row_dend,
 94 |                                         show_column_dend = show_column_dend,
 95 |                                         col = col_fun, rect_gp = grid::gpar(type = "none"),
 96 |                                         cell_fun = cell_fun,
 97 |                                         name = "Jaccard index",
 98 |                                         column_title = title,
 99 |                                         heatmap_legend_param = list(color_bar = "discrete"),
100 |                                         ...)
101 | 
102 |         }
103 | 
104 | }
105 | 
106 | 
#' Plot the Jaccard index distribution using raincloud plot
#'
#' The highest Jaccard index of every cluster is taken from
#' \code{\link{AssignHighestJaccard}} for each subsampled data set and shown
#' per cluster as a half violin, jittered points and a box plot.
#'
#' @param idents1 A list of cluster identity from the subsampled data set
#' before reclustering. (cluster id copied from the original full data set)
#' @param idents2 A list of cluster identity from the subsampled data sets after
#' reclustering.
#' @param title Title of the plot
#'
#' @return A ggplot2 object
#' @export
#'
#' @examples
#'
#'\dontrun{
#'data(idents)
#'## the pbmc here need to be fully processed.
#'JaccardRainCloudPlot(idents, idents)
#'}
#'
JaccardRainCloudPlot <- function(idents1, idents2, title = NULL) {
        mats <- AssignHighestJaccard(idents1, idents2)
        # long format: one row per (bootstrap, cluster) pair
        g <- mats %>% tibble::as_tibble() %>% tibble::rownames_to_column(var = "bootstrap") %>%
                tidyr::gather(-bootstrap, key = "cluster", value = "jaccard") %>%
                dplyr::mutate(cluster = as.factor(as.numeric(.$cluster))) %>%
                ggplot2::ggplot(ggplot2::aes(x = cluster, y = jaccard, fill = cluster)) +
                # position helpers are namespace-qualified: the package only imports
                # %>%, so bare position_nudge()/position_jitter() are not found
                # unless the user has attached ggplot2
                geom_flat_violin(position = ggplot2::position_nudge(x = .2, y = 0), alpha = .8) +
                ggplot2::geom_point(ggplot2::aes(y = jaccard, color = cluster),
                                    position = ggplot2::position_jitter(width = .15),
                                    size = .5, alpha = 0.8) +
                ggplot2::geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
                ggplot2::theme_classic() +
                ggplot2::theme(legend.position = "none") +
                ggplot2::ggtitle(title)
        return(g)
}
140 | 
141 | 
#' Plot a scatter plot for different clustering parameters
#'
#' x-axis is the parameters tested (e.g. many different k.param)
#' y-axis is the total number of clusters and total number of stable clusters based
#' on the jaccard cutoff as determined by \code{\link{AssignStableCluster}}, or the
#' percentage of cells in stable clusters.
#'
#' @param stable_clusters a dataframe with list-columns for data, stable_cluster determined by
#' \code{\link{AssignStableCluster}} and the rest of the columns are pc, resolution and k_param.
#' @param fullsample_idents a dataframe with the list-column contain the original ident for
#' the full dataset. This is the direct output from the Snakemake workflow.
#' @param x_var one of "pc", "resolution" and "k_param".
#' @param y_var one of "number" or "percentage". If it is "number",
#' y-axis is the total number of clusters and total number of stable clusters.
#' If it is "percentage", y-axis is the percentage of cells in stable clusters.
#' @param facet_rows one of "pc", "resolution" and "k_param" for ggplot2 to facet.
#' @param facet_cols one of "pc", "resolution" and "k_param" for ggplot2 to facet.
#'
#' @return a ggplot2 object
#' @export
#'
#' @examples
#' \dontrun{
#' ParameterSetScatterPlot(stable_clusters, fullsample_idents,
#'                         x_var = "k_param", y_var = "number",
#'                         facet_rows = "resolution", facet_cols = "pc")
#' }
ParameterSetScatterPlot <- function(stable_clusters,
                                    fullsample_idents,
                                    x_var,
                                    y_var,
                                    facet_rows,
                                    facet_cols) {

        # purrr helpers are namespace-qualified because the package only imports %>%
        df <- dplyr::left_join(stable_clusters, fullsample_idents) %>%
                dplyr::ungroup() %>%
                dplyr::mutate(total = purrr::map_dbl(stable_cluster, ~ length(.x$stable_cluster))) %>%
                dplyr::mutate(stable = purrr::map_dbl(stable_cluster, ~ .x$number_of_stable_cluster)) %>%
                dplyr::mutate(percentage = purrr::map2_dbl(original_ident_full, stable_cluster,
                                                           function(x, y) CalculatePercentCellInStable(x, y$stable_cluster))) %>%
                dplyr::select(-data, -stable_cluster, -original_ident_full) %>%
                dplyr::mutate_if(is.character, function(x) as.factor(as.numeric(x))) %>%
                tidyr::gather(total:stable, key = "category", value = "number")

        # y_var is checked against the two supported values explicitly: any other
        # column name used to pass validation and then fail with "object 'p' not found"
        y_ok <- is.character(y_var) && length(y_var) == 1 &&
                y_var %in% c("number", "percentage")
        cols_ok <- all(c(x_var, facet_rows, facet_cols) %in% colnames(df))
        if (!(y_ok && cols_ok)) {
                stop("x_var, facet_rows and facet_cols must be one of the parameter columns in the dataframe,\n",
                     "y_var must be 'number' or 'percentage'.")
        }

        ## plotting
        p <- ggplot2::ggplot(df, ggplot2::aes(x = .data[[x_var]], y = .data[[y_var]]))
        if (y_var == "percentage") {
                p <- p +
                        ggplot2::geom_point(color = "blue") +
                        ggplot2::geom_line(ggplot2::aes(group = 1), color = "red") +
                        ggplot2::scale_y_continuous(labels = scales::percent)
        } else {
                # one line per category (total vs stable number of clusters)
                p <- p +
                        ggplot2::geom_point() +
                        ggplot2::geom_line(ggplot2::aes(group = category, color = category))
        }

        p +
                ggplot2::facet_grid(rows = ggplot2::vars(.data[[facet_rows]]),
                                    cols = ggplot2::vars(.data[[facet_cols]])) +
                ggplot2::xlab(x_var) +
                ggplot2::ylab(y_var)
}
207 | 
208 | ## see https://jokergoo.github.io/circlize_book/book/the-chorddiagram-function.html
209 | 
#' Plot ChordDiagram of cell identity changes between two runs of clusters.
#'
#' Sectors of the first run are prefixed with "1_" and sectors of the second
#' run with "2_"; the links are built from the output of
#' \code{\link{PairWiseOverlappingIdents}}.
#'
#' @param ident1 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param ident2 a named factor vector. names are the cell names, the values are
#' the cluster id.
#' @param clusters_to_show_ident1 A character vector of cluster ids to show for ident1.
#' default is NULL, all clusters will be shown.
#' @param big.gap Gap between sectors of two cluster runs.
#' @param transparency Transparency of link colors, 0 means no transparency and 1 means full transparency.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @param grid.col Grid colors which correspond to matrix rows/columns (or sectors).
#' The length of the vector should be either 1 or length(union(rownames(mat), colnames(mat))).
#' It's preferred that grid.col is a named vector of which names correspond to sectors.
#' If it is not a named vector, the order of grid.col corresponds to order of sectors.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @param link.sort whether sort links on every sector based on the width of the links on it.
#' If it is set to "overall", all links are sorted regardless whether they are from rows or columns.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @param link.decreasing for link.sort
#' @param directional Whether links have directions. 1 means the direction is from the first column
#' in df to the second column, -1 is the reverse, 0 is no direction, and 2 for two directional.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#'
#' @return (invisibly) A data frame which contains positions of links.
#' see \code{\link[circlize]{chordDiagramFromMatrix}}
#' @export
#'
#' @examples
#' \dontrun{
#' data(idents)
#' ClusterIdentityChordPlot(idents[[1]], idents[[2]])
#' }
ClusterIdentityChordPlot <- function(ident1, ident2,
                                     clusters_to_show_ident1 = NULL,
                                     big.gap = 10, transparency = 0.5,
                                     grid.col = NULL,
                                     link.sort = TRUE, link.decreasing = TRUE,
                                     directional = -1) {
        mat <- PairWiseOverlappingIdents(ident1, ident2)
        if (!is.null(clusters_to_show_ident1)) {
                # drop = FALSE: selecting one cluster must still yield a matrix,
                # otherwise the rownames<- call below fails
                mat <- mat[clusters_to_show_ident1, , drop = FALSE]
        }
        rownames(mat) <- paste0("1_", rownames(mat))
        colnames(mat) <- paste0("2_", colnames(mat))

        # reset the circos layout even when drawing fails part-way
        on.exit(circlize::circos.clear(), add = TRUE)
        circlize::circos.par(start.degree = 90, clock.wise = FALSE)
        links <- circlize::chordDiagram(mat, big.gap = big.gap, transparency = transparency,
                                        grid.col = grid.col,
                                        link.sort = link.sort, link.decreasing = link.decreasing,
                                        directional = directional)
        # hand back the link positions promised by @return
        invisible(links)
}
257 | 
258 | 
#' Plot raincloud plot for silhouette score
#'
#' Shows the silhouette width of every cell per cluster as a half violin,
#' jittered points and a narrow box plot.
#'
#' @param silhouette_score a dataframe returned by \code{\link{CalculateSilhouette}}
#'
#' @return a ggplot2 object
#' @export
#'
#' @examples
#'
#' SilhouetteRainCloudPlot(CalculateSilhouette(pbmc_small, dims = 1:15))
SilhouetteRainCloudPlot <- function(silhouette_score) {
        # the three layers of the raincloud, in drawing order
        half_violin <- geom_flat_violin(position = ggplot2::position_nudge(x = .2, y = 0),
                                        alpha = .8)
        jittered_cells <- ggplot2::geom_point(ggplot2::aes(y = width, color = cluster),
                                              position = ggplot2::position_jitter(width = .15),
                                              size = .5, alpha = 0.8)
        narrow_box <- ggplot2::geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5)

        ggplot2::ggplot(silhouette_score,
                        ggplot2::aes(x = cluster, y = width, fill = cluster)) +
                half_violin +
                jittered_cells +
                narrow_box +
                ggplot2::ylab("silhouette width") +
                ggplot2::theme_classic(base_size = 14) +
                ggplot2::theme(legend.position = "none")
}
280 | 


--------------------------------------------------------------------------------
/R/geomflatviolin.R:
--------------------------------------------------------------------------------
 1 | # somewhat hackish solution to:
 2 | # https://twitter.com/EamonCaddigan/status/646759751242620928
 3 | # based mostly on copy/pasting from ggplot2 geom_violin source:
 4 | # https://github.com/hadley/ggplot2/blob/master/R/geom-violin.r
 5 | # credit goes to David Robinson https://twitter.com/drob
 6 | 
 7 | 
 8 | 
 9 | "%||%" <- function(a, b) {
10 |         if (!is.null(a)) a else b
11 | }
12 | 
#' A Flat Violin plot
#'
#' Adds a layer drawn with the \code{GeomFlatViolin} geom, i.e. a violin
#' that is only drawn on one side of its x position.
#'
#' @param mapping See \code{\link[ggplot2]{geom_violin}}
#' @param data See \code{\link[ggplot2]{geom_violin}}
#' @param stat See \code{\link[ggplot2]{geom_violin}}
#' @param position See \code{\link[ggplot2]{geom_violin}}
#' @param trim See \code{\link[ggplot2]{geom_violin}}
#' @param scale See \code{\link[ggplot2]{geom_violin}}
#' @param show.legend See \code{\link[ggplot2]{geom_violin}}
#' @param inherit.aes See \code{\link[ggplot2]{geom_violin}}
#' @param ... other parameters passed on to \code{\link[ggplot2]{layer}} as
#' layer params
#'
#' @return a ggplot2 layer
#' @export
#'
#' @examples
#' library(ggplot2)
#' ggplot(diamonds, aes(cut, carat)) +
#' geom_flat_violin() +
#' coord_flip()
#'
geom_flat_violin <- function(mapping = NULL, data = NULL, stat = "ydensity",
                             position = "dodge", trim = TRUE, scale = "area",
                             show.legend = NA, inherit.aes = TRUE, ...) {
        # everything that is not a layer argument travels as a layer parameter
        layer_params <- list(trim = trim, scale = scale, ...)
        ggplot2::layer(
                geom = GeomFlatViolin,
                stat = stat,
                data = data,
                mapping = mapping,
                position = position,
                params = layer_params,
                inherit.aes = inherit.aes,
                show.legend = show.legend
        )
}
50 | 
51 | 
# ggproto geom behind geom_flat_violin(): draws only one half of a violin as
# a closed polygon.
GeomFlatViolin <-
        ggplot2::ggproto("GeomFlatViolin", ggplot2::Geom,
                setup_data = function(data, params) {
                        # width: per-row value, else the layer parameter, else 90%
                        # of the resolution of x
                        data$width <- data$width %||%
                                params$width %||% (ggplot2::resolution(data$x, FALSE) * 0.9)

                        # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
                        data %>%
                                dplyr::group_by(group) %>%
                                dplyr::mutate(ymin = min(y),
                                       ymax = max(y),
                                       xmin = x,
                                       xmax = x + width / 2) %>%
                                # do not leak the grouping into later layer steps
                                dplyr::ungroup()

                },

                draw_group = function(data, panel_scales, coord) {
                        # Find the points for the line to go all the way around
                        data <- transform(data, xminv = x,
                                          xmaxv = x + violinwidth * (xmax - x))

                        # Make sure it's sorted properly to draw the outline:
                        # up the flat side, then back down the curved side
                        # (base order() replaces plyr::arrange, which was never declared)
                        flat_side <- transform(data, x = xminv)
                        curved_side <- transform(data, x = xmaxv)
                        newdata <- rbind(flat_side[order(flat_side$y), ],
                                         curved_side[order(-curved_side$y), ])

                        # Close the polygon: set first and last point the same
                        # Needed for coord_polar and such
                        newdata <- rbind(newdata, newdata[1,])

                        # name the grob directly instead of calling the unexported
                        # ggplot2:::ggname helper
                        grob <- ggplot2::GeomPolygon$draw_panel(newdata, panel_scales, coord)
                        grob$name <- grid::grobName(grob, "geom_flat_violin")
                        grob
                },

                draw_key = ggplot2::draw_key_polygon,

                # linewidth mirrors size: ggplot2 >= 3.4.0 uses the linewidth
                # aesthetic for polygon outlines, older versions use size
                default_aes = ggplot2::aes(weight = 1, colour = "grey20", fill = "white", size = 0.5,
                                  linewidth = 0.5, alpha = NA, linetype = "solid"),

                required_aes = c("x", "y")
        )
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/R/idents.R:
--------------------------------------------------------------------------------
 1 | #' cluster identity of subsetted pbmc data
 2 | #'
 3 | #' The 2700 cell pbmc data were subsetted to 80 percent of the cells 100 times.
 4 | #' Each time, we fully re-processed the subsetted data from FindVariableGenes to
 5 | #' FindClusters using k = 30 and resolution = 0.6, and recorded the cluster identity
 6 | #' from the processed seurat@@ident
 7 | #' slot and saved it into a list of factors.
 8 | #'
 9 | #' @docType data
10 | #'
11 | #' @usage data(idents)
12 | #'
13 | #' @format A list of factors
14 | #' @source \url{https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz}
15 | #'
16 | "idents"
17 | 


--------------------------------------------------------------------------------
/R/mergemultipleseuratobjects.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #' Read multiple 10x run into Seurat objects and merge into a single Seurat object
 4 | #'
 5 | #' Read multiple 10x run into Seurat objects and merge into a single Seurat object.
 6 | #' The names of the list of paths will be prepended to the cell name.
 7 | #'
 8 | #' @param input_folders A named list of folder path for each run.
 9 | #' @param do.normalize Whether or not normalize the data after mergeing, default is FALSE
10 | #' @param ... Other parameters for CreatSeuratObject in the Seurat package
11 | #'
12 | #' @return A single merged Seurat object from mulitple 10x runs.
13 | #' @export
14 | #'
15 | #' @examples
16 | #' \dontrun{
17 | #' library(fs)
18 | #' library(here)
19 | #' library(stringr)
20 | #' input_folders<- dir_ls( path = here("data"), recursive = T) %>% path_dir() %>%
21 | #' unique() %>% str_subset("mm10-1.2.0_premrna")
22 | #' merged_seurat<- MergeMultipleSeuratObjects(input_folders)
23 | #' }
24 | 
25 | 
26 | MergeMultipleSeuratObjects<- function(input_folders, do.normalize = FALSE, ...){
27 |         seurat_data<- purrr::map(input_folders, Read10X)
28 |         #prefix the sample name to the cell name, otherwise merge seurat objects gives error
29 |         add_sample_name_to_cell<- function(x, y){
30 |                 colnames(x)<- paste(y, colnames(x), sep = "_")
31 |                 return(x)
32 |         }
33 |         sample_names<- names(input_folders)
34 |         seurat_data<- purrr::map2(seurat_data, sample_names, add_sample_name_to_cell)
35 |         seurat_objects<- purrr::map2(seurat_data, sample_names,
36 |                                      function(x,y) CreateSeuratObject(raw.data = x,
37 |                                                                       project = y,
38 |                                                                       ...))
39 |         #merge to a single seurat object
40 |         merged_seurat<- purrr::reduce(seurat_objects,
41 |                                       function(x,y) {MergeSeurat(x,y,
42 |                                                                  do.normalize = do.normalize)})
43 | }
44 | 


--------------------------------------------------------------------------------
/R/preprocesssubsetdata.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' A wrapper for preprocessing subsetted Seurat object
  3 | #'
  4 | #' The wrapper does FindVeriableGenes, ScaleData, RunPCA, JackStraw to
  5 | #' determine how many PCs to use, ProjectPCA and FindClusters and retrun
  6 | #' a fully processed Seurat object. The input subsetted seurat object is
  7 | #' supposed to be fully processed as well. So the NormalizeData step is not
  8 | #' necessary.
  9 | #'
 10 | #' @param object A subsetted Seurat object created by RandomSubsetData
 11 | #' @param num.pc number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot
 12 | #' step. The optimal PCs for FindClusters will be determined by only significant PCs
 13 | #' from JackStrawPlot or if pc.use is set, JackStraw step will be skipped and use pc.use
 14 | #' for FindClusters.
 15 | #' @param pc.use number of PCs used for FindClusters. if pc.use is set, JackStraw step
 16 | #' will be skipped and use pc.use for FindClusters. score.thresh and sig.pc.thresh will be ignored.
 17 | #' @param n.start Number of random start.
 18 | #' @param nn.eps Error bound when performing nearest neighbor seach using RANN;
 19 | #' default of 0.0 implies exact nearest neighbor search. See FindClusters.
 20 | #' @param resolution Value of the resolution parameter, use a value above (below)
 21 | #' 1.0 if you want to obtain a larger (smaller) number of communities. see FIndClusters.
 22 | #' @param k.param Defines k for the k-nearest neighbor algorithm.
 23 | #' @param score.thresh Threshold to use for the proportion test of PC significance.
 24 | #' @param sig.pc.thresh Threshold for the significance of a particular PC.
 25 | #' @param ... any other parameters
 26 | #' @param variable.features.n number of variable features for \code{\link[Seurat]{SCTransform}}
 27 | #' @param workers number of CPUs to use for \code{\link[future]{plan}} parallel processing
 28 | #'
 29 | #' @return a fully processed Seurat object
 30 | #' @export
 31 | #'
 32 | #' @examples
 33 | #' \dontrun{
 34 | #' pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
 35 | #' pbmc_small_subset_processed<- PreprocessSubsetData(pbmc_small_subset)
 36 | #' pbmc_small_subset_processed@meta.data
 37 | #' }
 38 | 
 39 | 
 40 | PreprocessSubsetData<- function(object,
 41 |                                 variable.features.n = 3000,
 42 |                                 num.pc = 20,
 43 |                                 pc.use = NULL,
 44 |                                 workers = 2,
 45 |                                 score.thresh = 1e-5,
 46 |                                 sig.pc.thresh = 0.05,
 47 |                                 n.start = 100,
 48 |                                 nn.eps = 0,
 49 |                                 resolution = 0.8,
 50 |                                 k.param = 30,
 51 |                                 useSCTransform = TRUE,
 52 |                                 ...){
 53 |         ## use future for parallelization
 54 |         future::plan("multiprocess", workers = workers)
 55 |         meta.data.colnames<- object@meta.data %>% colnames()
 56 |         vars.to.regress<- c("percent.mt","nFeature_RNA")
 57 |         # in case the seurat object does not have percent.mito in metadata
 58 |         vars.to.regress<- vars.to.regress[vars.to.regress %in% meta.data.colnames]
 59 |         # default is on variable features only, omit the features argument
 60 |         # SCTransform replaces NormalizeData, ScaleData and FindVariableFeatures
 61 |         
 62 |         if(!is.null(pc.use)){
 63 |                 if(pc.use > num.pc){
 64 |                         stop("Specify the maximum pc.use number as less than or equal to the total num.pc")
 65 |                 }
 66 |         }
 67 |         
 68 |         if(useSCTransform==TRUE){
 69 |                 object<- SCTransform(object, vars.to.regress = vars.to.regress,
 70 |                              variable.features.n = variable.features.n, verbose = FALSE)  
 71 |         }else{
 72 |               stop("The SCTransform method for normalization is the only method supported by this function.  If you wish to use the approach that involves NormalizeData, ScaleData, and FindVariableFeatures and enables use of the Jackstraw procedure for determining which PCs to use please use the PreprocessSubsetDataV2 function.")
 73 |                 
 74 |         }
 75 | 
 76 | 
 77 | 
 78 |         object<- RunPCA(object = object, features = VariableFeatures(object = object),
 79 |                         npcs = num.pc)
 80 |         
 81 |         
 82 | 
 83 |         # if (is.null(pc.use) & useSCTransform==FALSE){
 84 |         #         object<- JackStraw(object = object, num.replicate = 100, dims = num.pc)
 85 |         # 
 86 |         #         object <- ScoreJackStraw(object = object, dims = 1:num.pc, score.thresh = score.thresh)
 87 |         # 
 88 |         #         PC_pvalues<- object@reductions$pca@jackstraw@overall.p.values
 89 |         # 
 90 |         #         ## determin how many PCs to use.
 91 |         #         pc.use<- min(which(PC_pvalues[,"Score"] > sig.pc.thresh)) -1
 92 |         # 
 93 |         # }
 94 |         
 95 |         
 96 |         if(is.null(pc.use)){
 97 |                 pc.use <- num.pc
 98 |                 message("SCTransform is being used and the Jackstraw procedure for determining which PCs to use is not compatable with this procedure. Since pc.use was not specified it is being automatically set to num.pc")
 99 |         }
100 | 
101 |         # add significant pc number to metadata, need to have names same as the cells
102 |         pc.use.meta<- rep(pc.use, length(colnames(object)))
103 |         names(pc.use.meta)<- colnames(object)
104 |         object<- AddMetaData(object = object, metadata = pc.use.meta, col.name = "pc.use")
105 |         object<- FindNeighbors(object, dims = 1:pc.use, k.param = k.param, nn.eps = nn.eps,
106 |                                verbose = FALSE, reduction = "pca", force.recalc = TRUE)
107 |         object <- FindClusters(object = object,
108 |                                 n.start = n.start,
109 |                                 resolution = resolution,
110 |                                 verbose = FALSE)
111 |         return(object)
112 | }
113 | 


--------------------------------------------------------------------------------
/R/preprocesssubsetdatav2.R:
--------------------------------------------------------------------------------
 1 | #' A wrapper for preprocessing subsetted Seurat object using ScaleData
 2 | #'
 3 | #' The wrapper does FindVeriableGenes, ScaleData, RunPCA, JackStraw to
 4 | #' determine how many PCs to use, ProjectPCA and FindClusters and retrun
 5 | #' a fully processed Seurat object. The input subsetted seurat object is
 6 | #' supposed to be fully processed as well. So the NormalizeData step is not
 7 | #' necessary.
 8 | #'
 9 | #' @param object A subsetted Seurat object created by RandomSubsetData
10 | #' @param num.pc number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot
11 | #' step. The optimal PCs for FindClusters will be determined by only significant PCs
12 | #' from JackStrawPlot or if pc.use is set, JackStraw step will be skipped and use pc.use
13 | #' for FindClusters.
14 | #' @param pc.use number of PCs used for FindClusters. if pc.use is set, JackStraw step
15 | #' will be skipped and use pc.use for FindClusters. score.thresh and sig.pc.thresh will be ignored.
16 | #' @param n.start Number of random start.
17 | #' @param nn.eps Error bound when performing nearest neighbor seach using RANN;
18 | #' default of 0.0 implies exact nearest neighbor search. See FindClusters.
19 | #' @param resolution Value of the resolution parameter, use a value above (below)
20 | #' 1.0 if you want to obtain a larger (smaller) number of communities. see FIndClusters.
21 | #' @param k.param Defines k for the k-nearest neighbor algorithm.
22 | #' @param score.thresh Threshold to use for the proportion test of PC significance.
23 | #' @param sig.pc.thresh Threshold for the significance of a particular PC.
24 | #' @param ... any other parameters
25 | #' @param variable.features.n number of variable features for \code{\link[Seurat]{SCTransform}}
26 | #' @param workers number of CPUs to use for \code{\link[future]{plan}} parallel processing
27 | #'
28 | #' @return a fully processed Seurat object
29 | #' @export
30 | #'
31 | #' @examples
32 | #' \dontrun{
33 | #' pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
34 | #' pbmc_small_subset_processed<- PreprocessSubsetData(pbmc_small_subset)
35 | #' pbmc_small_subset_processed@@meta.data
36 | #' }
37 | 
38 | 
39 | PreprocessSubsetDataV2<- function(object,
40 |                                 nfeatures = 2000,
41 |                                 num.pc = 20,
42 |                                 pc.use = NULL,
43 |                                 workers = 2,
44 |                                 score.thresh = 1e-5,
45 |                                 sig.pc.thresh = 0.05,
46 |                                 n.start = 100,
47 |                                 nn.eps = 0,
48 |                                 resolution = 0.8,
49 |                                 k.param = 30,
50 |                                 ...){
51 |         
52 |         if(!is.null(pc.use)){
53 |                 if(pc.use > num.pc){
54 |                         stop("Specify the maximum pc.use number as less than or equal to the total num.pc")
55 |                 }
56 |         }
57 |         
58 |         meta.data.colnames<- object@meta.data %>% colnames()
59 |         vars.to.regress<- c("percent.mt","nFeature_RNA")
60 |         # in case the seurat object does not have percent.mito in metadata
61 |         vars.to.regress<- vars.to.regress[vars.to.regress %in% meta.data.colnames]
62 |         # no need to use this for the integrated data
63 |         #object<- FindVariableFeatures(object, selection.method = "vst", nfeatures = nfeatures)
64 |         object<- ScaleData(object)
65 | 
66 |         object<- RunPCA(object = object, features = VariableFeatures(object = object),
67 |                         npcs = num.pc)
68 | 
69 |         if (is.null(pc.use)){
70 |                 
71 |                 if("SCT"%in%names(SObjFiltered@assays)){
72 |                         pc.use <- num.pc
73 |                         message("The SCTransform assay was detected in the object, and the Jackstraw procedure for determining which PCs to use is not compatable with this procedure. Since pc.use was not specified it is being automatically set to num.pc")  
74 |                 }else{
75 |                         object<- JackStraw( object = object, num.replicate = 100, dims = num.pc)
76 |                         
77 |                         object <- ScoreJackStraw(object = object, dims = 1:num.pc, score.thresh = score.thresh)
78 |                         
79 |                         PC_pvalues<- object@reductions$pca@jackstraw@overall.p.values
80 |                         
81 |                         ## determine how many PCs to use.
82 |                         pc.use<- min(which(PC_pvalues[,"Score"] > sig.pc.thresh)) -1  
83 |                 }
84 | 
85 |         }
86 | 
87 |         # add significant pc number to metadata, need to have names same as the cells
88 |         pc.use.meta<- rep(pc.use, length(colnames(object)))
89 |         names(pc.use.meta)<- colnames(object)
90 |         object<- AddMetaData(object = object, metadata = pc.use.meta, col.name = "pc.use")
91 |         object<- FindNeighbors(object, dims = 1:pc.use, k.param = k.param, nn.eps = nn.eps,
92 |                                verbose = FALSE, reduction = "pca", force.recalc = TRUE)
93 |         object <- FindClusters(object = object,
94 |                                n.start = n.start,
95 |                                resolution = resolution,
96 |                                verbose = FALSE)
97 |         return(object)
98 | }
99 | 


--------------------------------------------------------------------------------
/R/randomsubsetdata.R:
--------------------------------------------------------------------------------
 1 | #' Randomly subset (cells) seurat object by a rate
 2 | #'
 3 | #' @param object Seurat object
 4 | #' @param rate a number betwee 0-1 for subsetting
 5 | #' @param random.subset.seed set a random seed for sampling, default is NULL.
 6 | #' @param ... any other parameters to \code{\link[Seurat]{subset}}
 7 | #'
 8 | #' @return Returns a randomly subsetted seurat object
 9 | #' @export
10 | #'
11 | #' @examples
12 | #' pbmc_small
13 | #' pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
14 | #' dim(pbmc_small_subset@@meta.data)
15 | #'
16 | #'
17 | # read this issue https://github.com/satijalab/seurat/issues/243
18 | # Seurat V3 does not have do.clean =T any more
19 | # see https://github.com/satijalab/seurat/issues/1792 use DietSeurat
20 | RandomSubsetData<- function(object, rate, random.subset.seed = NULL, ...){
21 |         ncells<- nrow(object@meta.data)
22 |         ncells.subsample<- round(ncells * rate)
23 | 
24 |         set.seed(random.subset.seed)
25 | 
26 |         selected.cells<- sample(colnames(object), ncells.subsample)
27 |         object<- subset(object, cells =  selected.cells,
28 |                             ...)
29 |         return(object)
30 | }
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/R/scclusterboot.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Calculate jaccard distance for two sets of character vectors
  4 | #'
  5 | #' @param set1 character vector 1
  6 | #' @param set2 character vector 2
  7 | #'
  8 | #' @return jaccard distance
  9 | #' @export
 10 | #'
 11 | #' @examples
 12 | #' JaccardSets(sample(LETTERS, 10), sample(LETTERS, 10))
 13 | JaccardSets<- function(set1, set2){
 14 |         length(intersect(set1, set2))/length(unique(c(set1, set2)))
 15 | }
 16 | 
 17 | 
 18 | #' Calculate pair-wise Jaccard distance for @@ident slots from two Seurat objects
 19 | #'
 20 | #' Calculate pair-wise Jaccard distance for two named factor vector. e.g.
 21 | #' seurat_obj1@ident and seurat_obj2@ident
 22 | #'
 23 | #' @param ident1 a named factor vector. names are the cell names, the values are
 24 | #' the cluster id.
 25 | #' @param ident2  a named factor vector. names are the cell names, the values are
 26 | #' the cluster id.
 27 | #'
 28 | #' @return a matrix of pair-wise Jaccard distance. Rows are clusters from ident1,
 29 | #' columns are clusters from ident2
 30 | #' @export
 31 | #'
 32 | #' @examples
 33 | #' \dontrun{
 34 | #' PairWiseJaccardSets(pbmc@@ident, pbmc_small@@ident)
 35 | #'}
 36 | #'
 37 | PairWiseJaccardSets<- function(ident1, ident2){
 38 |         ident1.list<- split(names(ident1), ident1)
 39 |         ident2.list<- split(names(ident2), ident2)
 40 |         res<- matrix(nrow = length(ident1.list), ncol = length(ident2.list),
 41 |                      dimnames = list(names(ident1.list), names(ident2.list)))
 42 |         for (i in seq_along(ident1.list)){
 43 |                 res[i, ]<- purrr::map_dbl(ident2.list, ~JaccardSets(ident1.list[[i]], .x))
 44 |         }
 45 |         return(res)
 46 | }
 47 | 
 48 | 
 49 | #' Calculate pair-wise overlapping cluster identities for @@ident slots from two Seurat objects
 50 | #'
 51 | #'Calculate pair-wise overlapping cluster identities for two named factor vector. e.g.
 52 | #' seurat_obj1@ident and seurat_obj2@ident
 53 | #' @param ident1 a named factor vector. names are the cell names, the values are
 54 | #' the cluster id.
 55 | #' @param ident2 a named factor vector. names are the cell names, the values are
 56 | #' the cluster id.
 57 | #'
 58 | #' @return A matrix of pairwise number of common cell identities for each cluster.
 59 | #' @export
 60 | #'
 61 | #' @examples
 62 | #' \dontrun{
 63 | #' PairWiseOverlappingIdents(pbmc@@ident, pbmc_small@@ident)
 64 | #' }
 65 | PairWiseOverlappingIdents<- function(ident1, ident2){
 66 |         ident1.list<- split(names(ident1), ident1)
 67 |         ident2.list<- split(names(ident2), ident2)
 68 |         res<- c()
 69 |         for (i in seq_along(ident1.list)){
 70 |                 ind<- purrr::map_dbl(ident2.list, ~length(intersect(ident1.list[[i]], .x)))
 71 |                 res<- rbind(res, ind)
 72 |         }
 73 |         rownames(res)<- names(ident1.list)
 74 |         return(res)
 75 | 
 76 | }
 77 | 
 78 | 
 79 | #' Match two run of cluster ids with highest Jaccard index
 80 | #'
 81 | #' @param ident1 a named factor vector. names are the cell names, the values are
 82 | #' the cluster id.
 83 | #' @param ident2 a named factor vector. names are the cell names, the values are
 84 | #' the cluster id.
 85 | #'
 86 | #' @return A tibble with two columns, column 1 is the cluster ids from ident1, column2
 87 | #' is the cluster ids from ident2.
 88 | #' @export
 89 | #'
 90 | #' @examples
 91 | #'  \dontrun{
 92 | #'  MatchClusters(pbmc@@ident, pbmc_small@@ident)
 93 | #'  }
 94 | MatchClusters<- function(ident1, ident2){
 95 |         jaccard_mat<- PairWiseJaccardSets(ident1, ident2)
 96 | 
 97 |         get_corresponding_cluster<- function(x){
 98 |                 id<- which.max(x)
 99 |                 return(colnames(jaccard_mat)[id])
100 |         }
101 |         matching_ids<- apply(jaccard_mat, 1, get_corresponding_cluster)
102 |         return(tibble::tibble(ident1 = names(matching_ids), ident2 = matching_ids))
103 | }
104 | 
105 | 
106 | 
#' Assign highest Jaccard index for each cluster of the subsampled data set before
#' reclustering with the cluster identites of subsampled data set after reclustering
#'
#' @param idents1 A list of cluster identity copied from the orginal data sets.
#' idents1 is a list of the cluster identity from the subsampled data sets before reclustering.
#' @param idents2 A list of cluster identity from the subsampled data sets.
#' idents2 is a list of the cluster identity from the subsampled data sets after reclustering.
#' The order of identities in idents1 and idents2 should correspond to each other.
#'
#' @return A tibble with one row per subsampling run and one column per
#' cluster of the original data set, holding the highest Jaccard index found
#' for that cluster in that run. A single run still gives a one-row tibble.
#' @export
#'
AssignHighestJaccard <- function(idents1, idents2) {
        mat_list <- purrr::map2(idents1, idents2,
                                ~PairWiseJaccardSets(ident1 = .x, ident2 = .y))

        # best match (row maximum) for every original cluster, one named
        # numeric vector per run
        mat_max <- purrr::map(mat_list, function(mat) apply(mat, 1, max))

        # Bind all runs in one call; each named vector becomes one row. Unlike
        # a pairwise reduce(), this returns a tibble even for a single run and
        # does not re-copy the accumulated rows at every step.
        mats <- dplyr::bind_rows(unname(mat_max))
        return(mats)
}
132 | 
#' Decide which clusters are stable across subsampling runs
#'
#' Takes the highest Jaccard index of every original cluster in every
#' subsampling run (see \code{\link{AssignHighestJaccard}}) and summarises
#' them per cluster to label each cluster as stable or not.
#'
#' @param idents1 A list of cluster identities copied from the original data
#' set, i.e. the identities of the subsampled cells before reclustering.
#' @param idents2 A list of cluster identities of the subsampled data sets
#' after reclustering, in the same order as idents1.
#' @param method How the Jaccard indices of a cluster are summarised across
#' runs: one of "jaccard_mean", "jaccard_median" or "jaccard_percent".
#' @param jaccard_cutoff Cutoff on the Jaccard index. It is compared with the
#' mean or median when method is "jaccard_mean" or "jaccard_median", and with
#' the index of every single run when method is "jaccard_percent".
#' @param percent_cutoff Used when method is "jaccard_percent": the fraction
#' of runs whose Jaccard index must exceed jaccard_cutoff. Specify 0.6 when
#' you mean 60%.
#'
#' @return A list with four elements: \code{jaccardIndex}, the Jaccard indices
#' of all runs; \code{stable_cluster}, TRUE or FALSE for each cluster;
#' \code{number_of_stable_cluster}, how many clusters are stable; and
#' \code{stable_index}, the fraction of runs in which the Jaccard index of a
#' cluster is > jaccard_cutoff. E.g. for 100 subsamplings, 0.8 means the index
#' was above the cutoff 80% of the time. When the Jaccard indices of a cluster
#' are bimodal, this fraction is a better measurement than their mean or median.
#'
#' @export
#'
#' @examples
#'
#' data(idents)
#'
#' AssignStableCluster(idents, idents)
#'
AssignStableCluster <- function(idents1, idents2,
                                method = "jaccard_median",
                                jaccard_cutoff = 0.6,
                                percent_cutoff = 0.6) {
        jaccard_by_run <- AssignHighestJaccard(idents1, idents2)

        # per cluster: share of runs whose Jaccard index beats the cutoff
        exceeds_cutoff <- as.data.frame(jaccard_by_run > jaccard_cutoff)
        stable_index <- unlist(dplyr::summarise_all(exceeds_cutoff, mean))

        # turns a one-row per-cluster summary into a named logical vector
        flag_stable <- function(summary_row) {
                flagged <- dplyr::mutate_all(summary_row,
                                             ~ifelse(.x > jaccard_cutoff, TRUE, FALSE))
                unlist(flagged)
        }

        if (method == "jaccard_mean") {
                stable_cluster <- flag_stable(dplyr::summarise_all(jaccard_by_run, mean))
        } else if (method == "jaccard_median") {
                stable_cluster <- flag_stable(dplyr::summarise_all(jaccard_by_run, median))
        } else if (method == "jaccard_percent") {
                stable_cluster <- stable_index > percent_cutoff
        } else {
                stop("please specify jaccard_mean, jaccard_median or jaccard_percent
                     for method")
        }

        list(jaccardIndex = jaccard_by_run,
             stable_cluster = stable_cluster,
             number_of_stable_cluster = sum(stable_cluster),
             stable_index = stable_index)
}
202 | 
203 | 
#' Calculate the percentage of cells in stable clusters in the full data set
#'
#' @param ident A named factor vector. names are the cell names, the values are
#' the cluster id from the full data set.
#' @param stable_cluster A logical vector with one value per cluster of
#' \code{ident}, in the order of the factor levels, indicating whether the
#' cluster is stable, calculated from \code{\link{AssignStableCluster}}
#'
#' @return The fraction (between 0 and 1) of cells that belong to a stable cluster
#' @export
#'
CalculatePercentCellInStable <- function(ident, stable_cluster) {
        ident.list <- split(names(ident), ident)
        number_of_cells_each_cluster <- lengths(ident.list)

        # a logical index of the wrong length would be recycled silently (or
        # produce NA) and give a wrong fraction, so reject it
        if (is.logical(stable_cluster) &&
            length(stable_cluster) != length(number_of_cells_each_cluster)) {
                stop("stable_cluster must have one value per cluster in ident")
        }

        percent_cell_in_stable <- sum(number_of_cells_each_cluster[stable_cluster]) /
                sum(number_of_cells_each_cluster)
        return(percent_cell_in_stable)
}
223 | 
224 | #' Bootstrap for a fully processed Seurat object
225 | #'
226 | #' @param object A fully processed Seurat object.
227 | #' @param n  Number of times you want to bootstrap.
228 | #' @param rate A number between 0 and 1 for subsampling the cells.
229 | #' @param ... Other parameters passed to \code{\link{PreprocessSubsetData}}
230 | #'
231 | #' @return A list of lists containing the ident from the subsetted reclustered
232 | #' seurat objects.
233 | #' @export
234 | #'
235 | #' @examples
236 | #'
237 | 
238 | # # see https://github.com/satijalab/seurat/issues/457
239 | # # parallelize Seurat functions. The authors decided to go with the future framework.
240 | # scClusterBoot<- function(object, n = 4, workers = 4, rate = 0.8, ...){
241 | #         multicoreParam <- BiocParallel::MulticoreParam(workers = workers)
242 | #         BiocParallel::register(multicoreParam)
243 | #         # the parameter n is not used inside the function
244 | #         GetProcessedSubsetDataCluster<- function(n, ...){
245 | #                 object<- RandomSubsetData(object, rate = rate)
246 | #                 object<- PreprocessSubsetData(object, ...)
247 | #                 return(list(ident = object@ident, pc.sig = object@meta.data$pc.sig))
248 | #         }
249 | #         boot_clusters<- BiocParallel::bplapply(1:n, GetProcessedSubsetDataCluster)
250 | #         return(boot_clusters)
251 | # }
252 | 
253 | 
254 | 
255 | # scClusterBoot<- function(object, n = 4, workers = 4, rate = 0.8, ...){
256 | #         future::plan(multiprocess)
257 | #         # the parameter n is not used inside the function
258 | #         GetProcessedSubsetDataCluster<- function(n, ...){
259 | #                 object<- RandomSubsetData(object, rate = rate)
260 | #                 object<- PreprocessSubsetData(object, ...)
261 | #                 return(list(ident = object@ident, pc.sig = object@meta.data$pc.sig))
262 | #         }
263 | #         boot_clusters<- future.apply::future_lapply(1:n, GetProcessedSubsetDataCluster)
264 | #         return(boot_clusters)
265 | # }
266 | 
# Repeat "subsample the cells, then re-cluster" n times.
#
# Each round draws a random subset of `object` with RandomSubsetData() at the
# given `rate`, reprocesses it with PreprocessSubsetData() (which receives
# `...`) and keeps the resulting cluster identities. Returns a list of length
# n; every element is a list with
#   ident  - the cluster identities of the re-clustered subset
#   pc.sig - the `pc.use` meta data column written by PreprocessSubsetData()
#            (element name kept as `pc.sig` for existing callers)
scClusterBoot <- function(object, n = 4, rate = 0.8, ...) {
        # the round index is not used inside the worker; `...` is resolved
        # from the enclosing call so the extra arguments really reach
        # PreprocessSubsetData()
        GetProcessedSubsetDataCluster <- function(i) {
                subset_object <- RandomSubsetData(object, rate = rate)
                subset_object <- PreprocessSubsetData(subset_object, ...)
                list(ident = Seurat::Idents(subset_object),
                     pc.sig = subset_object@meta.data$pc.use)
        }
        # seq_len() so that n = 0 yields an empty list instead of two rounds
        boot_clusters <- lapply(seq_len(n), GetProcessedSubsetDataCluster)
        return(boot_clusters)
}
277 | 
278 | 
279 | 
280 | 
281 | 


--------------------------------------------------------------------------------
/R/scclusteval-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 | 


--------------------------------------------------------------------------------
/R/snncbi.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/R/snncbi.R


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | [![Build Status](https://travis-ci.com/crazyhottommy/scclusteval.svg?branch=master)](https://travis-ci.com/crazyhottommy/scclusteval)
  8 | 
  9 | ```{r setup, include = FALSE}
 10 | knitr::opts_chunk$set(
 11 |   collapse = TRUE,
 12 |   comment = "#>",
 13 |   fig.path = "man/figures/README-",
 14 |   out.width = "60%",
 15 |   out.height = "60%"
 16 | )
 17 | ```
 18 | # scclusteval
 19 | 
 20 | ```{r pressure, echo=FALSE, fig.cap="hex sticker", out.width = '10%'}
 21 | knitr::include_graphics("man/figures/scclusteval.png")
 22 | ```
 23 | 
 24 | The goal of scclusteval(Single Cell Cluster Evaluation) is to evaluate the single cell clustering stability by subsampling the cells and provide many visualization methods for comparing clusters.
 25 | 
 26 | For the theory behind the method, see Christian Hennig, “Cluster-wise assessment of
 27 | cluster stability,” Research Report 271, Dept. of Statistical Science, University 
 28 | College London, December 2006.
 29 | 
 30 | ### Citation
 31 | 
 32 | Ming Tang, Yasin Kaymaz, Brandon L. Logeman, Stephen Eichhorn, Zhengzheng S. Liang, Catherine Dulac and Timothy B. Sackton. Evaluating single-cell cluster stability using the Jaccard similarity index (2020) *Bioinformatics* In Press.
 33 | 
 34 | ### Parameters that affect the clustering
 35 | 
 36 | The most popular clustering method for single cell RNAseq is shared nearest neighbor (SNN)
 37 | which is implemented in `{Seurat::FindClusters}`. See a paper by Mark Robinson group for comparing
 38 | single cell RNAseq clustering methods:[A systematic performance evaluation of clustering methods for single-cell RNA-seq data](https://f1000research.com/articles/7-1141/v1)
 39 | SNN in Seurat is the most accurate and fast one.
 40 | 
 41 | The parameter `k.param` which specifies the number of nearest neighbors has a great effect on the number of clusters. Other Parameters such as the number of PCs and the resolution can affect the number of clusters as well.
 42 | 
 43 | The process is as follows.
 44 | 
 45 | To assess which k is best to use by subsampling the original data:
 46 | 
 47 | 1. Performing the clustering at many different K values on the full data set. 
 48 | 
 49 | 2. We then sample without replacement a subset of the data set (e.g. 80% of the 
 50 | cells in the full data set), and then repeat the clustering procedure on just 
 51 | this subset of data (so repeating all aspects of clustering, including calling 
 52 | variable genes, calculating PCs, building the neighbor graph, etc), and we do 
 53 | this n times. 
 54 | 
 55 | 3. So for each K value, we have 1 clustering outcome for the full data set, 
 56 | and n (e.g. 20) clustering outcomes for subsampled portions of the data set. From this we
 57 | identify the cluster in the first subsample clustering that is most similar to 
 58 | the full cluster 1 cells (the one that gives the maximum Jaccard coefficient) and 
 59 | record that value. If this maximum Jaccard coefficient is less than 0.6 (this is 
 60 | quite subjective), the original cluster is considered to be dissolved-it didn’t 
 61 | show up in the new clustering.  A cluster that’s dissolved too often is probably 
 62 | not a “real” cluster.
 63 | 
 64 | >As a rule of thumb, clusters with a stability value less than 0.6 should be considered 
 65 | unstable. Values between 0.6 and 0.75 indicate that the cluster is measuring a pattern 
 66 | in the data, but there isn’t high certainty about which points should be clustered 
 67 | together. Clusters with stability values above about 0.85 can be considered highly 
 68 | stable (they’re likely to be real clusters).
 69 | 
 70 | 
 71 | 4. Repeat this for all subsample clustering outcomes, and then the 
 72 | stability value of a cluster is the median or mean Jaccard coefficient. If it's 
 73 | greater than 0.6 (or a cutoff you set) we say it's stable, otherwise it's unstable. 
 74 | So for a given K value this gives you a stable/unstable assignment for each cluster.
 75 | We choose the  k value to select for clustering the data by looking at which k value 
 76 | yielded the largest number of stable clusters while still having most of the cells from the 
 77 | data set in a stable cluster.
 78 | 
 79 | 
 80 | We can repeat the 1-4 for different resolution and number of PCs and the combination of all different parameters.
 81 | 
 82 | The workflow is:
 83 | 
 84 | ![](man/figures/workflow.png)
 85 | 
 86 | ## The subsampling process is implemented in a Snakemake workflow
 87 | 
 88 | For each subsampling, one has to re-run the whole process of `FindVariableGenes`,
 89 | `ScaleData`, `RunPCA`, `JackStraw` and `FindClusters`; for a large data set, this can
 90 | take a very long time to run.
 91 | 
 92 | E.g. if you test 5 different K values, and for each K you subsample the full dataset 100 times, that's
 93 | 500 runs.
 94 | 
 95 | Snakemake will take advantage of an HPC cluster with a large number of CPUs available.
 96 | 
 97 | The R package works with the output from the Snakemake workflow: [pyflow_seuratv3_parameter](https://github.com/crazyhottommy/pyflow_seuratv3_parameter).
 98 | 
 99 | ## The scclusteval R package is for downstream analysis
100 | 
101 | ### Installation
102 | 
103 | You can install the scclusteval from github:  
104 | 
105 | ``` r
106 | devtools::install_github("crazyhottommy/scclusteval")
107 | ```
108 | 
109 | 
110 | ## Useful functions
111 | 
112 | ```{r}
113 | library(scclusteval)
114 | ?RandomSubsetData
115 | ?MergeMultipleSeuratObjects
116 | ?PreprocessSubsetData
117 | ?PairWiseJaccardSets
118 | 
119 | ## in RStudio, type below and press tab to see all available functions
120 | ## scclusteval::
121 | ```
122 | 
123 | 
124 | ## Examples
125 | 
126 | Examples to use the `scclusteval` package can be found at https://crazyhottommy.github.io/EvaluateSingleCellClustering/
127 | 
128 | ![](man/figures/README-unnamed-chunk-1-1.png)
129 | 
130 | ![](man/figures/jaccard_raincloud.png)
131 | 
132 | ## Acknowledgements
133 | 
134 | Thanks to Tim Sackton and Catherine Dulac for their supervision and support.  
135 | Thanks to Yasin Kaymaz in Sackton group for fruitful discussion.  
136 | Thanks to Stephen Eichhorn in Xiaowei Zhuang lab for the idea and sharing the python code working on [Scanpy](https://github.com/theislab/scanpy) object.  
137 | Thanks to Sophia (Zhengzheng) Liang and Brandon Logeman in Dulac lab for sharing data and giving feedback.  
138 | Thanks to [David Robinson](https://twitter.com/drob) for the `geomflatviolin` function, which was used in the `raincloudplot`.
139 | 
140 | ## Why this package?
141 | 
142 | I saw `{fpc}` package has a function `clusterboot`. However, this function does not support
143 | SNN clustering. Although one can write a custom clustering function to feed into `clusterboot`, 
144 | I need to build things upon the `Seurat` package and those two cannot be easily integrated. In
145 | addition, `clusterboot` is not parallelized, so I had to implement the `snakemake` workflow for faster
146 | processing.
147 | 
148 | read this blog post http://www.win-vector.com/blog/2015/09/bootstrap-evaluation-of-clusters/
149 | and https://www.czasopisma.uni.lodz.pl/foe/article/view/983
150 | 
151 | 
152 | ## To do list
153 | 
154 | - [x] implement more visualization functions. 
155 | - [ ] plot number of cells subsampled for each cluster in each iteration in raincloudplot.
156 | - [ ] impurity metric for assessing cluster stability.
157 | - [ ] read this post from Jean Fan from Xiaowei Zhuang's lab https://jef.works/blog/2018/02/28/stability-testing/
158 | `getComMembership` function works on raw data matrix. It can be used independent of Seurat's `FindClusters`. chat with Jean for more details.
159 | - [ ] gene sets enrichment for each cluster.
160 | 
161 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | 
  4 | [![Build
  5 | Status](https://travis-ci.com/crazyhottommy/scclusteval.svg?branch=master)](https://travis-ci.com/crazyhottommy/scclusteval)
  6 | 
  7 | # scclusteval
  8 | 
  9 | <div class="figure">
 10 | 
 11 | <img src="man/figures/scclusteval.png" alt="hex sticker" width="10%" height="60%" />
 12 | 
 13 | <p class="caption">
 14 | 
 15 | hex sticker
 16 | 
 17 | </p>
 18 | 
 19 | </div>
 20 | 
 21 | The goal of scclusteval(Single Cell Cluster Evaluation) is to evaluate
 22 | the single cell clustering stability by subsampling the cells and
 23 | provide many visualization methods for comparing clusters.
 24 | 
 25 | For the theory behind the method, see Christian Hennig, “Cluster-wise
 26 | assessment of cluster stability,” Research Report 271, Dept. of
 27 | Statistical Science, University College London, December 2006.
 28 | 
 29 | ### Citation
 30 | 
 31 | Ming Tang, Yasin Kaymaz, Brandon L. Logeman, Stephen Eichhorn, Zhengzheng
 32 | S. Liang, Catherine Dulac and Timothy B. Sackton. Evaluating single-cell
 33 | cluster stability using the Jaccard similarity index (2020)
 34 | *Bioinformatics*  btaa956, https://doi.org/10.1093/bioinformatics/btaa956.
 35 | 
 36 | ### Parameters that affect the clustering
 37 | 
 38 | The most popular clustering method for single cell RNAseq is shared
 39 | nearest neighbor (SNN) followed by Louvain community detection algorithm 
 40 | which is implemented in `{Seurat::FindClusters}`.
 41 | See a paper by Mark Robinson's group comparing single cell RNAseq
 42 | clustering methods: [A systematic performance evaluation of clustering
 43 | methods for single-cell RNA-seq
 44 | data](https://f1000research.com/articles/7-1141/v1). SNN in Seurat is the
 45 | most accurate and fast one.
 46 | 
 47 | The parameter `k.param` which specifies the number of nearest neighbors
 48 | has a great effect on the number of clusters. Other Parameters such as
 49 | the number of PCs and the resolution can affect the number of clusters
 50 | as well.
 51 | 
 52 | The process is as follows.
 53 | 
 54 | To assess which k is best to use by subsampling the original data:
 55 | 
 56 | 1.  Performing the clustering at many different K values on the full
 57 |     data set.
 58 | 
 59 | 2.  We then sample without replacement a subset of the data set
 60 |     (e.g. 80% of the cells in the full data set), and then repeat the
 61 |     clustering procedure on just this subset of data (so repeating all
 62 |     aspects of clustering, including calling variable genes, calculating
 63 |     PCs, building the neighbor graph, etc), and we do this n times.
 64 | 
 65 | 3.  So for each K value, we have 1 clustering outcome for the full data
 66 |     set, and n (e.g. 20) clustering outcomes for subsampled portions of the data
 67 |     set. From this we identify the cluster in the first subsample
 68 |     clustering that is most similar to the full cluster 1 cells (the one
 69 |     that gives the maximum Jaccard coefficient) and record that value.
 70 |     If this maximum Jaccard coefficient is less than 0.6 (this is quite
 71 |     subjective), the original cluster is considered to be dissolved — it
 72 |     didn’t show up in the new clustering. A cluster that’s dissolved too
 73 |     often is probably not a “real” cluster.
 74 | 
 75 | > As a rule of thumb, clusters with a stability value less than 0.6
 76 | > should be considered unstable. Values between 0.6 and 0.75 indicate
 77 | > that the cluster is measuring a pattern in the data, but there isn’t
 78 | > high certainty about which points should be clustered together.
 79 | > Clusters with stability values above about 0.85 can be considered
 80 | > highly stable (they’re likely to be real clusters).
 81 | 
 82 | 4.  Repeat this for all subsample clustering outcomes, and then the
 83 |     stability value of a cluster is the median or mean Jaccard
 84 |     coefficient. If it’s greater than 0.6 (or a cutoff you set) we say
 85 |     it’s stable, otherwise it’s unstable. So for a given K value this
 86 |     gives you a stable/unstable assignment for each cluster. We choose
 87 |     the k value to select for clustering the data by looking at which k
 88 |     value yielded the largest number of stable clusters while still
 89 |     having most of the cells from the data set in a stable cluster.
 90 | 
 91 | We can repeat the 1-4 for different resolution and number of PCs and the
 92 | combination of all different parameters.
 93 | 
 94 | The workflow is:
 95 | 
 96 | ![](man/figures/workflow.png)
 97 | 
 98 | ## The subsampling process is implemented in a Snakemake workflow
 99 | 
100 | For each subsampling, one has to re-run the whole process of
101 | `FindVariableGenes`, `ScaleData`, `RunPCA`, `JackStraw` and
102 | `FindClusters`; for a large data set, this can take a very long time to
103 | run.
104 | 
105 | E.g. if you test 5 different K values, and for each K you subsample the full
106 | dataset 100 times, that’s 500 runs.
107 | 
108 | Snakemake will take advantage of an HPC cluster with a large number of
109 | CPUs available.
110 | 
111 | The R package works with the output from the Snakemake workflow:
112 | [pyflow\_seuratv3\_parameter](https://github.com/crazyhottommy/pyflow_seuratv3_parameter).
113 | 
114 | ## The scclusteval R package is for downstream analysis
115 | 
116 | ### Installation
117 | 
118 | You can install the scclusteval from github:
119 | 
120 | ``` r
121 | devtools::install_github("crazyhottommy/scclusteval")
122 | ```
123 | 
124 | ## Useful functions
125 | 
126 | ``` r
127 | library(scclusteval)
128 | #> Loading required package: Seurat
129 | ?RandomSubsetData
130 | ?MergeMultipleSeuratObjects
131 | ?PreprocessSubsetData
132 | ?PairWiseJaccardSets
133 | 
134 | ## in RStudio, type below and press tab to see all available functions
135 | ## scclusteval::
136 | ```
137 | 
138 | ## Examples
139 | 
140 | Examples to use the `scclusteval` package can be found at
141 | <https://crazyhottommy.github.io/EvaluateSingleCellClustering/>
142 | 
143 | ![](man/figures/README-unnamed-chunk-1-1.png)
144 | 
145 | ![](man/figures/jaccard_raincloud.png)
146 | 
147 | ## Acknowledgements
148 | 
149 | Thanks to Tim Sackton and Catherine Dulac for their supervision and
150 | support.  
151 | Thanks to Yasin Kaymaz in Sackton group for fruitful discussion.  
152 | Thanks to Stephen Eichhorn in Xiaowei Zhuang lab for the idea and
153 | sharing the python code working on
154 | [Scanpy](https://github.com/theislab/scanpy) object.  
155 | Thanks to Sophia (Zhengzheng) Liang and Brandon Logeman in Dulac lab for
156 | sharing data and giving feedback.  
157 | Thanks to [David Robinson](https://twitter.com/drob) for the `geomflatviolin`
158 | function, which was used in the `raincloudplot`.
159 | 
160 | ## Why this package?
161 | 
162 | I saw `{fpc}` package has a function `clusterboot`. However, this
163 | function does not support SNN clustering. Although one can write a
164 | custom clustering function to feed into `clusterboot`, I need to build
165 | things upon the `Seurat` package and those two cannot be easily
166 | integrated. In addition, `clusterboot` is not parallelized, so I had to
167 | implement the `snakemake` workflow for faster processing.
168 | 
169 | read this blog post
170 | <http://www.win-vector.com/blog/2015/09/bootstrap-evaluation-of-clusters/>
171 | and <https://www.czasopisma.uni.lodz.pl/foe/article/view/983>
172 | 
173 | ## To do list
174 | 
175 |   - \[x\] implement more visualization functions.
176 |   - \[ \] plot number of cells subsampled for each cluster in each
177 |     iteration in raincloudplot.
178 |   - \[ \] impurity metric for assessing cluster stability.
179 |   - \[ \] read this post from Jean Fan from Xiaowei Zhuang’s lab
180 |     <https://jef.works/blog/2018/02/28/stability-testing/>
181 |     `getComMembership` function works on raw data matrix. It can be used
182 |     independent of Seurat’s `FindClusters`. chat with Jean for more
183 |     details.
184 |   - \[ \] gene sets enrichment for each cluster.
185 | 


--------------------------------------------------------------------------------
/data/idents.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/data/idents.rda


--------------------------------------------------------------------------------
/man/AssignHighestJaccard.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{AssignHighestJaccard}
 4 | \alias{AssignHighestJaccard}
 5 | \title{Assign highest Jaccard index for each cluster of the subsampled data set before
 6 | reclustering with the cluster identities of subsampled data set after reclustering}
 7 | \usage{
 8 | AssignHighestJaccard(idents1, idents2)
 9 | }
10 | \arguments{
11 | \item{idents1}{A list of cluster identity copied from the original data sets.
12 | idents1 is a list of the cluster identity from the subsampled data sets before reclustering.}
13 | 
14 | \item{idents2}{A list of cluster identity from the subsampled data sets.
15 | idents2 is a list of the cluster identity from the subsampled data sets after reclustering.
16 | The order of identities in idents1 and idents2 should correspond to each other.}
17 | }
18 | \value{
19 | A matrix with dimension of #number of subsampling * #number of clusters in the
20 | original data set.
21 | }
22 | \description{
23 | Assign highest Jaccard index for each cluster of the subsampled data set before
24 | reclustering with the cluster identities of subsampled data set after reclustering
25 | }
26 | 


--------------------------------------------------------------------------------
/man/AssignStableCluster.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{AssignStableCluster}
 4 | \alias{AssignStableCluster}
 5 | \title{Assign stable cluster}
 6 | \usage{
 7 | AssignStableCluster(
 8 |   idents1,
 9 |   idents2,
10 |   method = "jaccard_median",
11 |   jaccard_cutoff = 0.6,
12 |   percent_cutoff = 0.6
13 | )
14 | }
15 | \arguments{
16 | \item{idents1}{A list of cluster identity copied from the original data sets.
17 | idents1 is a list of the cluster identity from the subsampled data sets before reclustering.}
18 | 
19 | \item{idents2}{A list of cluster identity from the subsampled data sets.
20 | idents2 is a list of the cluster identity from the subsampled data sets after reclustering.
21 | The order of identities in idents1 and idents2 should correspond to each other.}
22 | 
23 | \item{method}{How to summarize the jaccard index across all simulations
24 | to determine whether a cluster is stable or not. Options are "jaccard_mean", "jaccard_median" and "jaccard_percent".}
25 | 
26 | \item{jaccard_cutoff}{Cutoff of the jaccard index to determine whether a cluster is stable or not.
27 | it is the mean or median cutoff when the method is "jaccard_mean" or "jaccard_median" and it is
28 | the cutoff for every subsampling when the method is "jaccard_percent"}
29 | 
30 | \item{percent_cutoff}{The percentage of jaccard index greater than jaccard_cutoff. Used
31 | when method is "jaccard_percent". specify 0.6 when you mean 60\%.}
32 | }
33 | \value{
34 | A list containing the raw data for jaccard index for all simulations,
35 | TRUE or FALSE of stable cluster for each cluster and a number of stable clusters.
36 | A cluster is deemed as stable if the median (or mean) jaccard index is > cutoff.
37 | In addition, a stable_index is calculated, which is the percentage of jaccard indices >
38 | cutoff across all the subsamplings. E.g. for 100 subsamplings, 0.8 means 80\% of the
39 | time, the jaccard index is > cutoff. Sometimes we see a bimodal distribution of the
40 | 100 jaccard indices; in that case the percentage is a better measurement than the mean or median of the
41 | 100 jaccard indices.
42 | }
43 | \description{
44 | Assign stable cluster
45 | }
46 | \examples{
47 | 
48 | data(idents)
49 | 
50 | AssignStableCluster(idents, idents)
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/man/CalculatePercentCellInStable.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{CalculatePercentCellInStable}
 4 | \alias{CalculatePercentCellInStable}
 5 | \title{Calculate the percentage of cells in stable clusters in the full data set}
 6 | \usage{
 7 | CalculatePercentCellInStable(ident, stable_cluster)
 8 | }
 9 | \arguments{
10 | \item{ident}{A named factor vector. names are the cell names, the values are
11 | the cluster id from the full data set.}
12 | 
13 | \item{stable_cluster}{A logical vector for each of the original cluster indicating
14 | it is stable or not, calculated from \code{\link{AssignStableCluster}}}
15 | }
16 | \value{
17 | A percentage of cells in stable cluster
18 | }
19 | \description{
20 | Calculate the percentage of cells in stable clusters in the full data set
21 | }
22 | 


--------------------------------------------------------------------------------
/man/CalculateSilhouette.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/calculatesilhouette.R
 3 | \name{CalculateSilhouette}
 4 | \alias{CalculateSilhouette}
 5 | \title{Calculate Silhouette width from PCA space for each cell after clustering
 6 | This is calculated from Seurat object}
 7 | \usage{
 8 | CalculateSilhouette(object, dims = 1:50)
 9 | }
10 | \arguments{
11 | \item{object}{A Seurat object with Idents set to cluster ids (factors)}
12 | 
13 | \item{dims}{Dimensions to use in the PCA space to calculate
14 | Euclidean distance. Default is 1:50.}
15 | }
16 | \value{
17 | a dataframe with silhouette width for each cell. see also \code{\link[cluster]{silhouette}}
18 | }
19 | \description{
20 | Calculate Silhouette width from PCA space for each cell after clustering
21 | This is calculated from Seurat object
22 | }
23 | \examples{
24 | CalculateSilhouette(pbmc_small, dims = 1:15)
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/man/ClusterIdentityChordPlot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusterviz.R
 3 | \name{ClusterIdentityChordPlot}
 4 | \alias{ClusterIdentityChordPlot}
 5 | \title{Plot ChordDiagram of cell identity changes between two runs of clusters.}
 6 | \usage{
 7 | ClusterIdentityChordPlot(
 8 |   ident1,
 9 |   ident2,
10 |   clusters_to_show_ident1 = NULL,
11 |   big.gap = 10,
12 |   transparency = 0.5,
13 |   grid.col = NULL,
14 |   link.sort = TRUE,
15 |   link.decreasing = TRUE,
16 |   directional = -1
17 | )
18 | }
19 | \arguments{
20 | \item{ident1}{a named factor vector. names are the cell names, the values are
21 | the cluster id.}
22 | 
23 | \item{ident2}{a named factor vector. names are the cell names, the values are
24 | the cluster id.}
25 | 
26 | \item{clusters_to_show_ident1}{A character vector of cluster ids to show for ident1.
27 | default is NULL, all clusters will be shown.}
28 | 
29 | \item{big.gap}{Gap between sectors of two cluster runs.}
30 | 
31 | \item{transparency}{Transparency of link colors, 0 means no transparency and 1 means full transparency.
32 | see \code{\link[circlize]{chordDiagramFromMatrix}}}
33 | 
34 | \item{grid.col}{Grid colors which correspond to matrix rows/columns (or sectors).
35 | The length of the vector should be either 1 or length(union(rownames(mat), colnames(mat))).
36 | It's preferred that grid.col is a named vector of which names correspond to sectors.
37 | If it is not a named vector, the order of grid.col corresponds to order of sectors.
38 | see \code{\link[circlize]{chordDiagramFromMatrix}}}
39 | 
40 | \item{link.sort}{whether sort links on every sector based on the width of the links on it.
41 | If it is set to "overall", all links are sorted regardless whether they are from rows or columns.
42 | see \code{\link[circlize]{chordDiagramFromMatrix}}}
43 | 
44 | \item{link.decreasing}{for link.sort}
45 | 
46 | \item{directional}{Whether links have directions. 1 means the direction is from the first column
47 | in df to the second column, -1 is the reverse, 0 is no direction, and 2 for two directional.
48 | see \code{\link[circlize]{chordDiagramFromMatrix}}}
49 | }
50 | \value{
51 | A data frame which contains positions of links. see \code{\link[circlize]{chordDiagramFromMatrix}}
52 | }
53 | \description{
54 | Plot ChordDiagram of cell identity changes between two runs of clusters.
55 | }
56 | 


--------------------------------------------------------------------------------
/man/ClusterSizeBarplot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusterviz.R
 3 | \name{ClusterSizeBarplot}
 4 | \alias{ClusterSizeBarplot}
 5 | \title{Make a Barplot for cluster size}
 6 | \usage{
 7 | ClusterSizeBarplot(ident, bar_col = "blue", label_number = TRUE)
 8 | }
 9 | \arguments{
10 | \item{ident}{a named factor vector. names are the cell names, the values are
11 | the cluster id.}
12 | 
13 | \item{bar_col}{color for the bar. Default is blue.}
14 | 
15 | \item{label_number}{whether or not put cell number in each cluster on top of the bar}
16 | }
17 | \value{
18 | a ggplot2 bar graph object
19 | }
20 | \description{
21 | Make a Barplot for cluster size
22 | }
23 | \examples{
24 | data(pbmc_small)
25 | ClusterSizeBarplot(Idents(pbmc_small))
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/JaccardRainCloudPlot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusterviz.R
 3 | \name{JaccardRainCloudPlot}
 4 | \alias{JaccardRainCloudPlot}
 5 | \title{Plot the Jaccard index distribution using raincloud plot}
 6 | \usage{
 7 | JaccardRainCloudPlot(idents1, idents2, title = NULL)
 8 | }
 9 | \arguments{
10 | \item{idents1}{A list of cluster identity from the subsampled data set
11 | before reclustering. (cluster id copied from the original full data set)}
12 | 
13 | \item{idents2}{A list of cluster identity from the subsampled data sets after
14 | reclustering.}
15 | 
16 | \item{title}{Title of the plot}
17 | }
18 | \value{
19 | A ggplot2 object
20 | }
21 | \description{
22 | Plot the Jaccard index distribution using raincloud plot
23 | }
24 | \examples{
25 | 
26 | \dontrun{
27 | data(idents)
28 | ## the pbmc here need to be fully processed.
29 | JaccardRainCloudPlot(idents, idents)
30 | }
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/man/JaccardSets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{JaccardSets}
 4 | \alias{JaccardSets}
 5 | \title{Calculate jaccard distance for two sets of character vectors}
 6 | \usage{
 7 | JaccardSets(set1, set2)
 8 | }
 9 | \arguments{
10 | \item{set1}{character vector 1}
11 | 
12 | \item{set2}{character vector 2}
13 | }
14 | \value{
15 | jaccard distance
16 | }
17 | \description{
18 | Calculate jaccard distance for two sets of character vectors
19 | }
20 | \examples{
21 | JaccardSets(sample(LETTERS, 10), sample(LETTERS, 10))
22 | }
23 | 


--------------------------------------------------------------------------------
/man/MatchClusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{MatchClusters}
 4 | \alias{MatchClusters}
 5 | \title{Match two runs of cluster ids with highest Jaccard index}
 6 | \usage{
 7 | MatchClusters(ident1, ident2)
 8 | }
 9 | \arguments{
10 | \item{ident1}{a named factor vector. names are the cell names, the values are
11 | the cluster id.}
12 | 
13 | \item{ident2}{a named factor vector. names are the cell names, the values are
14 | the cluster id.}
15 | }
16 | \value{
17 | A tibble with two columns, column 1 is the cluster ids from ident1, column2
18 | is the cluster ids from ident2.
19 | }
20 | \description{
21 | Match two runs of cluster ids with highest Jaccard index
22 | }
23 | \examples{
24 |  \dontrun{
25 |  MatchClusters(pbmc@ident, pbmc_small@ident)
26 |  }
27 | }
28 | 


--------------------------------------------------------------------------------
/man/MergeMultipleSeuratObjects.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/mergemultipleseuratobjects.R
 3 | \name{MergeMultipleSeuratObjects}
 4 | \alias{MergeMultipleSeuratObjects}
 5 | \title{Read multiple 10x runs into Seurat objects and merge into a single Seurat object}
 6 | \usage{
 7 | MergeMultipleSeuratObjects(input_folders, do.normalize = FALSE, ...)
 8 | }
 9 | \arguments{
10 | \item{input_folders}{A named list of folder path for each run.}
11 | 
12 | \item{do.normalize}{Whether or not to normalize the data after merging, default is FALSE}
13 | 
14 | \item{...}{Other parameters for CreateSeuratObject in the Seurat package}
15 | }
16 | \value{
17 | A single merged Seurat object from multiple 10x runs.
18 | }
19 | \description{
20 | Read multiple 10x runs into Seurat objects and merge into a single Seurat object.
21 | The names of the list of paths will be prepended to the cell name.
22 | }
23 | \examples{
24 | \dontrun{
25 | library(fs)
26 | library(here)
27 | library(stringr)
28 | input_folders<- dir_ls( path = here("data"), recursive = T) \%>\% path_dir() \%>\%
29 | unique() \%>\% str_subset("mm10-1.2.0_premrna")
30 | merged_seurat<- MergeMultipleSeuratObjects(input_folders)
31 | }
32 | }
33 | 


--------------------------------------------------------------------------------
/man/PairWiseJaccardSets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{PairWiseJaccardSets}
 4 | \alias{PairWiseJaccardSets}
 5 | \title{Calculate pair-wise Jaccard distance for @ident slots from two Seurat objects}
 6 | \usage{
 7 | PairWiseJaccardSets(ident1, ident2)
 8 | }
 9 | \arguments{
10 | \item{ident1}{a named factor vector. names are the cell names, the values are
11 | the cluster id.}
12 | 
13 | \item{ident2}{a named factor vector. names are the cell names, the values are
14 | the cluster id.}
15 | }
16 | \value{
17 | a matrix of pair-wise Jaccard distance. Rows are clusters from ident1,
18 | columns are clusters from ident2
19 | }
20 | \description{
21 | Calculate pair-wise Jaccard distance for two named factor vector. e.g.
22 | seurat_obj1@ident and seurat_obj2@ident
23 | }
24 | \examples{
25 | \dontrun{
26 | PairWiseJaccardSets(pbmc@ident, pbmc_small@ident)
27 | }
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/man/PairWiseJaccardSetsHeatmap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusterviz.R
 3 | \name{PairWiseJaccardSetsHeatmap}
 4 | \alias{PairWiseJaccardSetsHeatmap}
 5 | \title{Make a Heatmap of the pairwise Jaccard distance between cluster ident of two
 6 | Seurat object}
 7 | \usage{
 8 | PairWiseJaccardSetsHeatmap(
 9 |   ident1,
10 |   ident2,
11 |   best_match = FALSE,
12 |   title = NULL,
13 |   col_low = "white",
14 |   col_high = "red",
15 |   cluster_rows = F,
16 |   cluster_columns = F,
17 |   show_row_dend = F,
18 |   show_column_dend = F,
19 |   ...
20 | )
21 | }
22 | \arguments{
23 | \item{ident1}{a named factor vector. names are the cell names, the values are
24 | the cluster id.}
25 | 
26 | \item{ident2}{a named factor vector. names are the cell names, the values are
27 | the cluster id.}
28 | 
29 | \item{best_match}{Whether or not only show the best match of ident1 from ident2.
30 | if set to TRUE, the Jaccard index matrix will be subsetted using the ident2 column
31 | from the output of \code{\link{MatchClusters}}, the row order will be in order from cluster
32 | 0 to the total number of clusters, the columns will be the best match of ident1 from ident2,
33 | and the columns idents could be duplicated. e.g. single cluster from ident2 matches multiple
34 | clusters in ident1.}
35 | 
36 | \item{title}{The title of the heatmap}
37 | 
38 | \item{col_low}{Color for low Jaccard index.}
39 | 
40 | \item{col_high}{Color for high Jaccard index.}
41 | 
42 | \item{cluster_rows}{cluster row or not, default FALSE}
43 | 
44 | \item{cluster_columns}{cluster columns or not, default FALSE}
45 | 
46 | \item{show_row_dend}{Whether or not show row dendrogram}
47 | 
48 | \item{show_column_dend}{Whether or not show column dendrogram}
49 | 
50 | \item{...}{other parameters pass to \code{\link[ComplexHeatmap]{Heatmap}}}
51 | }
52 | \value{
53 | A Heatmap representing the pair-wise Jaccard correlation, rows are ident1,
54 | columns are ident2
55 | }
56 | \description{
57 | Make a Heatmap of the pairwise Jaccard distance between cluster ident of two
58 | Seurat object
59 | }
60 | \examples{
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/man/PairWiseOverlappingIdents.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{PairWiseOverlappingIdents}
 4 | \alias{PairWiseOverlappingIdents}
 5 | \title{Calculate pair-wise overlapping cluster identities for @ident slots from two Seurat objects}
 6 | \usage{
 7 | PairWiseOverlappingIdents(ident1, ident2)
 8 | }
 9 | \arguments{
10 | \item{ident1}{a named factor vector. names are the cell names, the values are
11 | the cluster id.}
12 | 
13 | \item{ident2}{a named factor vector. names are the cell names, the values are
14 | the cluster id.}
15 | }
16 | \value{
17 | A matrix of pairwise number of common cell identities for each cluster.
18 | }
19 | \description{
20 | Calculate pair-wise overlapping cluster identities for two named factor vector. e.g.
21 | seurat_obj1@ident and seurat_obj2@ident
22 | }
23 | \examples{
24 | \dontrun{
25 | PairWiseOverlappingIdents(pbmc@ident, pbmc_small@ident)
26 | }
27 | }
28 | 


--------------------------------------------------------------------------------
/man/ParameterSetScatterPlot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusterviz.R
 3 | \name{ParameterSetScatterPlot}
 4 | \alias{ParameterSetScatterPlot}
 5 | \title{Plot a scatter plot for different clustering parameters}
 6 | \usage{
 7 | ParameterSetScatterPlot(
 8 |   stable_clusters,
 9 |   fullsample_idents,
10 |   x_var,
11 |   y_var,
12 |   facet_rows,
13 |   facet_cols
14 | )
15 | }
16 | \arguments{
17 | \item{stable_clusters}{a dataframe with list-columns for data, stable_cluster determined by
18 | \code{\link{AssignStableCluster}} and the rest of the columns are pc, resolution and k_param.}
19 | 
20 | \item{fullsample_idents}{a dataframe with the list-column contain the original ident for
21 | the full dataset. This is the direct output from the Snakemake workflow.}
22 | 
23 | \item{x_var}{one of "pc", "resolution" and "k_param".}
24 | 
25 | \item{y_var}{one of "number" or "percentage". If it is "number",
26 | the y-axis is the total number of clusters and total number of stable clusters.}
27 | 
28 | \item{facet_rows}{one of "pc", "resolution" and "k_param" for ggplot2 to facet.}
29 | 
30 | \item{facet_cols}{one of "pc", "resolution" and "k_param" for ggplot2 to facet.}
31 | }
32 | \value{
33 | a ggplot2 object
34 | }
35 | \description{
36 | x-axis is the parameters tested (e.g. many different k.param)
37 | y-axis is the total number of clusters and total number of stable clusters based
38 | on the jaccard cutoff as determined by AssignStableCluster, or percentage of cells
39 | in stable clusters.
40 | }
41 | 


--------------------------------------------------------------------------------
/man/PreprocessSubsetData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/preprocesssubsetdata.R
 3 | \name{PreprocessSubsetData}
 4 | \alias{PreprocessSubsetData}
 5 | \title{A wrapper for preprocessing subsetted Seurat object}
 6 | \usage{
 7 | PreprocessSubsetData(
 8 |   object,
 9 |   variable.features.n = 3000,
10 |   num.pc = 20,
11 |   pc.use = NULL,
12 |   workers = 2,
13 |   score.thresh = 1e-05,
14 |   sig.pc.thresh = 0.05,
15 |   n.start = 100,
16 |   nn.eps = 0,
17 |   resolution = 0.8,
18 |   k.param = 30,
19 |   ...
20 | )
21 | }
22 | \arguments{
23 | \item{object}{A subsetted Seurat object created by RandomSubsetData}
24 | 
25 | \item{variable.features.n}{number of variable features for \code{\link[Seurat]{SCTransform}}}
26 | 
27 | \item{num.pc}{number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot
28 | step. The optimal PCs for FindClusters will be determined by only significant PCs
29 | from JackStrawPlot or if pc.use is set, JackStraw step will be skipped and use pc.use
30 | for FindClusters.}
31 | 
32 | \item{pc.use}{number of PCs used for FindClusters. if pc.use is set, JackStraw step
33 | will be skipped and use pc.use for FindClusters. score.thresh and sig.pc.thresh will be ignored.}
34 | 
35 | \item{workers}{number of CPUs to use for \code{\link[future]{plan}} parallel processing}
36 | 
37 | \item{score.thresh}{Threshold to use for the proportion test of PC significance.}
38 | 
39 | \item{sig.pc.thresh}{Threshold for the significance of a particular PC.}
40 | 
41 | \item{n.start}{Number of random start.}
42 | 
43 | \item{nn.eps}{Error bound when performing nearest neighbor search using RANN;
44 | default of 0.0 implies exact nearest neighbor search. See FindClusters.}
45 | 
46 | \item{resolution}{Value of the resolution parameter, use a value above (below)
47 | 1.0 if you want to obtain a larger (smaller) number of communities. See FindClusters.}
48 | 
49 | \item{k.param}{Defines k for the k-nearest neighbor algorithm.}
50 | 
51 | \item{...}{any other parameters}
52 | }
53 | \value{
54 | a fully processed Seurat object
55 | }
56 | \description{
57 | The wrapper does FindVariableGenes, ScaleData, RunPCA, JackStraw to
58 | determine how many PCs to use, ProjectPCA and FindClusters and returns
59 | a fully processed Seurat object. The input subsetted seurat object is
60 | supposed to be fully processed as well. So the NormalizeData step is not
61 | necessary.
62 | }
63 | \examples{
64 | \dontrun{
65 | pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
66 | pbmc_small_subset_processed<- PreprocessSubsetData(pbmc_small_subset)
67 | pbmc_small_subset_processed@meta.data
68 | }
69 | }
70 | 


--------------------------------------------------------------------------------
/man/PreprocessSubsetDataV2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/preprocesssubsetdatav2.R
 3 | \name{PreprocessSubsetDataV2}
 4 | \alias{PreprocessSubsetDataV2}
 5 | \title{A wrapper for preprocessing subsetted Seurat object using ScaleData}
 6 | \usage{
 7 | PreprocessSubsetDataV2(
 8 |   object,
 9 |   nfeatures = 2000,
10 |   num.pc = 20,
11 |   pc.use = NULL,
12 |   workers = 2,
13 |   score.thresh = 1e-05,
14 |   sig.pc.thresh = 0.05,
15 |   n.start = 100,
16 |   nn.eps = 0,
17 |   resolution = 0.8,
18 |   k.param = 30,
19 |   ...
20 | )
21 | }
22 | \arguments{
23 | \item{object}{A subsetted Seurat object created by RandomSubsetData}
24 | 
25 | \item{num.pc}{number of PCs to calculate in RunPCA, JackStraw and JackStrawPlot
26 | step. The optimal PCs for FindClusters will be determined by only significant PCs
27 | from JackStrawPlot or if pc.use is set, JackStraw step will be skipped and use pc.use
28 | for FindClusters.}
29 | 
30 | \item{pc.use}{number of PCs used for FindClusters. if pc.use is set, JackStraw step
31 | will be skipped and use pc.use for FindClusters. score.thresh and sig.pc.thresh will be ignored.}
32 | 
33 | \item{workers}{number of CPUs to use for \code{\link[future]{plan}} parallel processing}
34 | 
35 | \item{score.thresh}{Threshold to use for the proportion test of PC significance.}
36 | 
37 | \item{sig.pc.thresh}{Threshold for the significance of a particular PC.}
38 | 
39 | \item{n.start}{Number of random start.}
40 | 
41 | \item{nn.eps}{Error bound when performing nearest neighbor search using RANN;
42 | default of 0.0 implies exact nearest neighbor search. See FindClusters.}
43 | 
44 | \item{resolution}{Value of the resolution parameter, use a value above (below)
45 | 1.0 if you want to obtain a larger (smaller) number of communities. See FindClusters.}
46 | 
47 | \item{k.param}{Defines k for the k-nearest neighbor algorithm.}
48 | 
49 | \item{...}{any other parameters}
50 | 
51 | \item{nfeatures}{number of variable features to select (default 2000)}
52 | }
53 | \value{
54 | a fully processed Seurat object
55 | }
56 | \description{
57 | The wrapper does FindVariableGenes, ScaleData, RunPCA, JackStraw to
58 | determine how many PCs to use, ProjectPCA and FindClusters and returns
59 | a fully processed Seurat object. The input subsetted seurat object is
60 | supposed to be fully processed as well. So the NormalizeData step is not
61 | necessary.
62 | }
63 | \examples{
64 | \dontrun{
65 | pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
66 | pbmc_small_subset_processed<- PreprocessSubsetDataV2(pbmc_small_subset)
67 | pbmc_small_subset_processed@meta.data
68 | }
69 | }
70 | 


--------------------------------------------------------------------------------
/man/RandomSubsetData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/randomsubsetdata.R
 3 | \name{RandomSubsetData}
 4 | \alias{RandomSubsetData}
 5 | \title{Randomly subset (cells) seurat object by a rate}
 6 | \usage{
 7 | RandomSubsetData(object, rate, random.subset.seed = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{Seurat object}
11 | 
12 | \item{rate}{a number between 0 and 1 for subsetting}
13 | 
14 | \item{random.subset.seed}{set a random seed for sampling, default is NULL.}
15 | 
16 | \item{...}{any other parameters to \code{\link[Seurat]{subset}}}
17 | }
18 | \value{
19 | Returns a randomly subsetted seurat object
20 | }
21 | \description{
22 | Randomly subset (cells) seurat object by a rate
23 | }
24 | \examples{
25 | pbmc_small
26 | pbmc_small_subset<- RandomSubsetData(pbmc_small, 0.8)
27 | dim(pbmc_small_subset@meta.data)
28 | 
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/SilhouetteRainCloudPlot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusterviz.R
 3 | \name{SilhouetteRainCloudPlot}
 4 | \alias{SilhouetteRainCloudPlot}
 5 | \title{Plot raincloud plot for silhouette score}
 6 | \usage{
 7 | SilhouetteRainCloudPlot(silhouette_score)
 8 | }
 9 | \arguments{
10 | \item{silhouette_score}{a dataframe returned by \code{\link{CalculateSilhouette}}}
11 | }
12 | \value{
13 | a ggplot2 object
14 | }
15 | \description{
16 | Plot raincloud plot for silhouette score
17 | }
18 | \examples{
19 | 
20 | SilhouetteRainCloudPlot(CalculateSilhouette(pbmc_small, dims = 1:15))
21 | }
22 | 


--------------------------------------------------------------------------------
/man/figures/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/.DS_Store


--------------------------------------------------------------------------------
/man/figures/README-pressure-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-pressure-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-2.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-3.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-4.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-2-5.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-3-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-3.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-3-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-3-4.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-10.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-2.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-3.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-4.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-5.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-6.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-7.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-8.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-4-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-4-9.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/README-unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/man/figures/jaccard_raincloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/jaccard_raincloud.png


--------------------------------------------------------------------------------
/man/figures/scclusteval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/scclusteval.png


--------------------------------------------------------------------------------
/man/figures/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crazyhottommy/scclusteval/b1b22c794c6603aaa4469d94fb6ee4d81f445d4d/man/figures/workflow.png


--------------------------------------------------------------------------------
/man/geom_flat_violin.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geomflatviolin.R
 3 | \name{geom_flat_violin}
 4 | \alias{geom_flat_violin}
 5 | \title{A Flat Violin plot}
 6 | \usage{
 7 | geom_flat_violin(
 8 |   mapping = NULL,
 9 |   data = NULL,
10 |   stat = "ydensity",
11 |   position = "dodge",
12 |   trim = TRUE,
13 |   scale = "area",
14 |   show.legend = NA,
15 |   inherit.aes = TRUE,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{mapping}{See \code{\link[ggplot2]{geom_violin}}}
21 | 
22 | \item{data}{See \code{\link[ggplot2]{geom_violin}}}
23 | 
24 | \item{position}{See \code{\link[ggplot2]{geom_violin}}}
25 | 
26 | \item{trim}{See \code{\link[ggplot2]{geom_violin}}}
27 | 
28 | \item{scale}{See \code{\link[ggplot2]{geom_violin}}}
29 | 
30 | \item{show.legend}{See \code{\link[ggplot2]{geom_violin}}}
31 | 
32 | \item{inherit.aes}{See \code{\link[ggplot2]{geom_violin}}}
33 | 
34 | \item{...}{}
35 | }
36 | \description{
37 | A Flat Violin plot
38 | }
39 | \examples{
40 | library(ggplot2)
41 | ggplot(diamonds, aes(cut, carat)) +
42 | geom_flat_violin() +
43 | coord_flip()
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/man/idents.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/idents.R
 3 | \docType{data}
 4 | \name{idents}
 5 | \alias{idents}
 6 | \title{cluster identity of subsetted pbmc data}
 7 | \format{
 8 | A list of factors
 9 | }
10 | \source{
11 | \url{https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz}
12 | }
13 | \usage{
14 | data(idents)
15 | }
16 | \description{
17 | The 2700 cell pbmc data were subsetted to 80 percent of the cells 100 times.
18 | Each time, we fully re-processed the subsetted data from FindVariableGenes to
19 | FindClusters using k = 30 and resolution = 0.6, recorded the cluster identity
20 | from the processed seurat@ident
21 | slot, and saved the identities into a list of factors.
22 | }
23 | \keyword{datasets}
24 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/man/scClusterBoot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusterboot.R
 3 | \name{scClusterBoot}
 4 | \alias{scClusterBoot}
 5 | \title{Bootstrap for a fully processed Seurat object}
 6 | \usage{
 7 | scClusterBoot(object, n = 4, rate = 0.8, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{A fully processed Seurat object.}
11 | 
12 | \item{n}{Number of times you want to bootstrap.}
13 | 
14 | \item{rate}{A number between 0 and 1 for subsampling the cells.}
15 | 
16 | \item{...}{Other parameters passed to \code{\link{PreprocessSubsetData}}}
17 | }
18 | \value{
19 | A list of lists containing the ident from the subsetted reclustered
20 | seurat objects.
21 | }
22 | \description{
23 | Bootstrap for a fully processed Seurat object
24 | }
25 | \examples{
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/scclusteval-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scclusteval-package.R
 3 | \docType{package}
 4 | \name{scclusteval-package}
 5 | \alias{scclusteval}
 6 | \alias{scclusteval-package}
 7 | \title{scclusteval: Evaluate the single cell clustering}
 8 | \description{
 9 | What the package does (one paragraph).
10 | }
11 | \seealso{
12 | Useful links:
13 | \itemize{
14 |   \item \url{https://github.com/crazyhottommy/scclusteval}
15 |   \item Report bugs at \url{https://github.com/crazyhottommy/scclusteval/issues}
16 | }
17 | 
18 | }
19 | \author{
20 | \strong{Maintainer}: Ming Tang \email{tangming2005@gmail.com}
21 | 
22 | }
23 | \keyword{internal}
24 | 


--------------------------------------------------------------------------------
/scclusteval.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 8
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/tests/spelling.R:
--------------------------------------------------------------------------------
1 | if(requireNamespace('spelling', quietly=TRUE))
2 |   spelling::spell_check_test(vignettes = TRUE, error = FALSE, skip_on_cran = TRUE)
3 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/pbmc_example.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "walk through scclusteval using pbmc data"
  3 | author: "Ming Tang"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{walk through scclusteval using pbmc data}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | 
 13 | ```{r}
 14 | library(Seurat)
 15 | library(dplyr)
 16 | 
 17 | # Load the PBMC dataset
 18 | pbmc.data <- Read10X(data.dir = "~/Downloads/filtered_gene_bc_matrices/hg19/")
 19 | 
 20 | pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, 
 21 |     project = "10X_PBMC")
 22 | 
 23 | pbmc2 <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, 
 24 |     project = "10X_PBMC")
 25 | 
 26 | mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
 27 | percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
 28 | 
 29 | # AddMetaData adds columns to object@meta.data, and is a great place to
 30 | # stash QC stats
 31 | pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
 32 | 
 33 | pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
 34 |     scale.factor = 10000)
 35 | 
 36 | pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, 
 37 |     x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
 38 | 
 39 | pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
 40 | 
 41 | pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5, 
 42 |     genes.print = 5, pcs.compute = 100)
 43 | 
 44 | # this step takes long time.
 45 | # the Seurat tutorial uses 20 PCs. for large data sets, we sometimes use 85 PCs, 
 46 | # I set 100 here for example.
 47 | pbmc <- JackStraw(object = pbmc, num.pc = 100,  num.replicate = 100, display.progress = T, 
 48 |                   do.par = T, num.cores = 6)
 49 | 
 50 | pbmc@dr$pca@jackstraw@overall.p.values
 51 | ## default threshold is 1e-5; keep that. Keep the p-values of the first 100 PCs.
 52 | pbmc<- JackStrawPlot(object = pbmc, PCs = 1:100, score.thresh = 1e-5)
 53 | 
 54 | JackStrawPlot(pbmc, PCs=1:10)
 55 | pc.use<- 10
 56 | 
 57 | pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:pc.use, 
 58 |     resolution = 0.6, print.output = FALSE, save.SNN = TRUE)
 59 | 
 60 | pbmc@ident
 61 | 
 62 | 
 63 | pbmc <- RunTSNE(object = pbmc, dims.use = 1:pc.use, do.fast = TRUE)
 64 | 
 65 | pbmc_sub1<- RandomSubsetData(pbmc, rate = 0.8)
 66 | pbmc_sub1<- PreprocessSubsetData(pbmc_sub1, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5,
 67 |                              resolution = 0.6, num.pc = 20)
 68 | 
 69 | pbmc_sub2<- RandomSubsetData(pbmc, rate = 0.8)
 70 | pbmc_sub2<- PreprocessSubsetData(pbmc_sub2, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5,
 71 |                              resolution = 0.6, num.pc = 20)
 72 | 
 73 | 
 74 | pbmc_sub3<- RandomSubsetData(pbmc, rate = 0.8)
 75 | pbmc_sub3<- PreprocessSubsetData(pbmc_sub3, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5,
 76 |                              resolution = 0.6, num.pc = 20)
 77 | 
 78 | 
 79 | (pbmc@ident == 1) %>% table()
 80 | 
 81 | orignal_cluster0<- names(pbmc@ident[pbmc@ident == 0])
 82 | 
 83 | sub1_cluster0<- names(pbmc_sub1@ident[pbmc_sub1@ident == 0])
 84 | sub1_cluster1<- names(pbmc_sub1@ident[pbmc_sub1@ident == 1])
 85 | sub1_cluster2<- names(pbmc_sub1@ident[pbmc_sub1@ident == 2])
 86 | sub1_cluster3<- names(pbmc_sub1@ident[pbmc_sub1@ident == 3])
 87 | 
 88 | dist(orignal_cluster0, sub1_cluster0, method = "binary")
 89 | 
 90 | bayesbio::jaccardSets(orignal_cluster0, sub1_cluster0)
 91 | length(intersect(orignal_cluster0, sub1_cluster0))/length(unique(c(orignal_cluster0, sub1_cluster0)))
 92 | 
 93 | 
 94 | total_cluster_ids<- length(unique(pbmc_sub1@ident))
 95 | levels(pbmc_sub1@ident)
 96 | 
 97 | pbmc_sub1@ident == 1
 98 | pbmc_sub1@meta.data %>% tibble::rownames_to_column(var = "cell_id") %>% select(cell_id, res.0.6)
 99 | dat<- tibble(cell_id = names(pbmc_sub1@ident) , cluster = pbmc_sub1@ident) %>%
100 |   tidyr::nest(-cluster) %>% 
101 |   arrange(cluster)
102 | 
103 | dat %>% 
104 |   mutate(jaccard = purrr::map(data, ~JaccardSets(orignal_cluster0, .x$cell_id))) %>% 
105 |   pull(jaccard) %>% unlist() %>% max()
106 | 
107 | pbmc@ident == 3
108 | levels(pbmc@ident)
109 | pbmc@meta.data$res.0.6 %>% head()
110 | pbmc_sub1@ident
111 | 
112 | boot_test<- scClusterBoot(object = pbmc, n =3, rate = 0.8,  x.low.cutoff = 0.0125, 
113 |                           x.high.cutoff = 3, y.cutoff = 0.5,resolution = 0.6, 
114 |                           num.pc = 20, num.cores = 8)
115 | 
116 | boot_clusters<- purrr:::map(boot_test, "ident")
117 | 
118 | ## total 8 clusters in the original data set
119 | pbmc@ident %>% unique() %>% length()
120 | 
121 | # for loops are slow.
122 | 
123 | TurnIdentToDf<- function(ident){
124 |   # ident: named vector of cluster ids (names are cell ids); returns one row per
125 |   # cluster, sorted by cluster, with that cluster's cell.id values nested in a list-column
126 |   cell_table<- tibble(cell.id = names(ident), cluster = ident)
127 |   arrange(tidyr::nest(cell_table, -cluster), cluster)
128 | }
129 | 
130 | TurnIdentToDf(boot_clusters[[1]])
131 | TurnIdentToDf(pbmc@ident)$data[[1]]
132 | 
133 | boot_clusters_df<- purrr::map(boot_clusters, TurnIdentToDf)
134 | 
135 | 
136 | boot_clusters_df[[3]] %>%
137 |   mutate(jaccard = purrr::map_dbl(data, ~JaccardSets(TurnIdentToDf(pbmc@ident)$data[[1]]$cell.id, .x$cell.id))) 
138 | 
139 | boot_clusters_df[[3]] %>%
140 |   mutate(jaccard = purrr::map_dbl(data, ~JaccardSets(TurnIdentToDf(pbmc@ident)$data[[2]]$cell.id, .x$cell.id))) 
141 | 
142 | boot_clusters_df[[3]] %>%
143 |   mutate(jaccard = purrr::map_dbl(data, ~JaccardSets(TurnIdentToDf(pbmc@ident)$data[[3]]$cell.id, .x$cell.id)))
144 | 
145 | boot_clusters_df[[3]] %>%
146 |   mutate(jaccard = purrr::map_dbl(data, ~JaccardSets(TurnIdentToDf(pbmc@ident)$data[[4]]$cell.id, .x$cell.id)))
147 | 
148 | boot_clusters_df[[3]] %>%
149 |   mutate(jaccard = purrr::map_dbl(data, ~JaccardSets(TurnIdentToDf(pbmc@ident)$data[[5]]$cell.id, .x$cell.id)))
150 | 
151 | 
152 | boot_clusters_df[[3]] %>%
153 |   mutate(jaccard = purrr::map_dbl(data, ~JaccardSets(TurnIdentToDf(pbmc@ident)$data[[6]]$cell.id, .x$cell.id)))
154 | 
155 | 
156 | split(names(boot_clusters[[1]]), boot_clusters[[1]]) %>% lapply(length)
157 | 
158 | ## split the cell names by cluster: one list element (vector of cell names) per cluster in ident
159 | SplitIdentByCluster<- function(ident){
160 |   split(names(ident), ident)
161 | }
162 | 
163 | SplitIdentByCluster(pbmc@ident)
164 | 
165 | PairWiseJaccardSets<- function(ident1.list, ident2.list){
166 |   # ident1.list, ident2.list: lists of cell-name vectors (e.g. from SplitIdentByCluster()).
167 |   # Returns a matrix: row i = JaccardSets() of ident1.list[[i]] vs each ident2.list set.
168 |   # Rows are collected with lapply and bound once, instead of rbind() inside a loop.
169 |   rows<- lapply(ident1.list, function(s) purrr::map_dbl(ident2.list, ~JaccardSets(s, .x)))
170 |   res<- do.call(rbind, rows)
171 |   rownames(res)<- names(ident1.list)
172 |   res
173 | }
174 | 
175 | mat<- PairWiseJaccardSets(SplitIdentByCluster(pbmc@ident), SplitIdentByCluster(boot_clusters[[2]])) 
176 | mat<- PairWiseJaccardSets(ident1.list, ident2.list)
177 | library(ComplexHeatmap)
178 | 
179 | Heatmap(mat, cluster_rows = F, cluster_columns = F)
180 | 
181 | 
182 | ident1.list<- SplitIdentByCluster(pbmc@ident)
183 | ident2.list<- SplitIdentByCluster(boot_clusters[[3]])
184 | 
185 | id1<- purrr::map_dbl(ident2.list, ~JaccardSets(ident1.list[[1]], .x)) 
186 | id2<-  purrr::map_dbl(ident2.list, ~JaccardSets(ident1.list[[2]], .x))
187 | 
188 | sum(ident1.list$`7` %in% unlist(ident2.list))
189 | 
190 | lapply( ident2.list, function(x) sum(ident1.list$`6` %in% x))
191 | lapply(ident2.list, length)
192 | 
193 | 
194 | load("~/gather_bootstrap_cluster.rda")
195 | PairWiseJaccardSets(pbmc@ident, idents[[3]])
196 | 
197 | 
198 | ### put in function: Jaccard matrix per bootstrap ident, reduce each with SelectHighestJaccard, row-bind
199 | mat_list<- purrr::map(idents, ~PairWiseJaccardSets(ident1 = pbmc@ident, ident2 = .x))
200 | 
201 | mat_max<- purrr::map(mat_list, SelectHighestJaccard)
202 | 
203 | mats<- purrr::reduce(mat_max, bind_rows)
204 | 
205 | mats %>% as_tibble() %>% tibble::rownames_to_column(var = "bootstrap") %>%
206 |   tidyr::gather(-bootstrap, key = "cluster", value = "jaccard") %>%
207 |   ggplot(aes(x = cluster, y = jaccard)) +
208 |   geom_point() +
209 |   geom_boxplot(aes(col = cluster))
210 | # Presumably defines geom_flat_violin(); the URL pins a specific gist revision.
211 | source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
212 | # Same long-format data as above, drawn as flat violin + jittered points + narrow boxplot.
213 | mats %>% as_tibble() %>% tibble::rownames_to_column(var = "bootstrap") %>%
214 |   tidyr::gather(-bootstrap, key = "cluster", value = "jaccard") %>%
215 |   ggplot(aes(x = cluster, y = jaccard, fill = cluster)) +
216 |   geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
217 |   geom_point(aes(y = jaccard, color = cluster), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
218 |   geom_boxplot(width = .1, show.legend = FALSE, outlier.shape = NA, alpha = 0.5) +
219 |   theme(legend.position = "none")
220 | 
221 | 
222 | 
223 | mats %>% dplyr::summarise_all(median) %>% dplyr::mutate_all(~ .x > 0.4)  # per-column median, flagged TRUE when above 0.4
224 | # Same flag computed from the per-column mean, displayed compactly with str().
225 | mats %>% dplyr::summarise_all(mean) %>% dplyr::mutate_all(~ .x > 0.4) %>% unlist() %>% str()
226 | 
227 | ## how many stable clusters?
228 | mats %>% dplyr::summarise_all(median) %>% dplyr::mutate_all(~ .x > 0.4) %>%
229 |   unlist() %>% sum()
230 | 
231 | 
232 | ks_idents <- readRDS("~/gather_bootstrap_k.rds")
233 | # NOTE(review): ks_stable is assigned again at line 253 with per-k original idents — confirm this first pass is still wanted.
234 | ks_stable <- purrr::map(ks_idents, function(boot_idents) AssignStableCluster(pbmc@ident, boot_idents))
235 | # One saved object per k (20, 25, 30, 35); each exposes an @ident slot used below.
236 | k_20_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_20.rds")
237 | k_25_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_25.rds")
238 | k_30_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_30.rds")
239 | k_35_seurat <- readRDS("~/bootstrap_k_preprocess/bootstrap_k_35.rds")
240 | 
241 | pbmc <- k_30_seurat
242 | ks_idents_original <- list(k20 = k_20_seurat@ident, k25 = k_25_seurat@ident,
243 |                            k30 = k_30_seurat@ident, k35 = k_35_seurat@ident)
244 | 
245 | JaccardRainCloudPlot(k_20_seurat@ident, ks_idents$`20`) + geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
246 | JaccardRainCloudPlot(k_25_seurat@ident, ks_idents$`25`) + geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
247 | JaccardRainCloudPlot(k_30_seurat@ident, ks_idents$`30`) + geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
248 | JaccardRainCloudPlot(k_35_seurat@ident, ks_idents$`35`) + geom_hline(yintercept = c(0.4, 0.8), linetype = 2)
249 | 
250 | ## clusters 7 and 8 from k20 correspond to the same cluster 7 from k25
251 | PairWiseJaccardSetsHeatmap(PairWiseJaccardSets(k_20_seurat@ident, k_25_seurat@ident))
252 | # Pair each k's original idents with its bootstrap idents, element by element.
253 | ks_stable <- purrr::map2(ks_idents_original, ks_idents, function(orig, boots) AssignStableCluster(ident1 = orig, idents = boots))
254 | 
255 | ks_stable[["k20"]]
256 | ks_stable[["k25"]]
257 | ks_stable[["k30"]]
258 | ks_stable[["k35"]]
259 | # NOTE(review): a character vector passed to map() is a nested extractor (x[[a]][[b]]), and "perdent" looks like a typo for "percent" — confirm intent.
260 | map(ks_stable, c("perdent_cell_in_cluster", "number_of_stable_cluster"))
261 | 
262 | table(k_20_seurat@ident, k_30_seurat@ident)
263 | 
264 | jaccard_mat <- PairWiseJaccardSets(k_20_seurat@ident, k_25_seurat@ident)
265 | # For one row of jaccard_mat, give the name of the column holding that row's maximum.
266 | get_colname <- function(x) {
267 |   # Reads the global jaccard_mat; which.max() picks the first maximum on ties.
268 |   colnames(jaccard_mat)[which.max(x)]
269 | }
270 | # Apply row-wise, then tabulate row name -> best-matching column name.
271 | ids <- apply(jaccard_mat, 1, get_colname)
272 | tibble::tibble(ident1 = names(ids), ident2 = ids)
273 | 
274 | MatchClusters(k_20_seurat@ident, k_25_seurat@ident)
275 | 
276 | mat<- PairWiseOverlappingIdents(k_20_seurat@ident, k_25_seurat@ident)
277 | rownames(mat)<- paste0("1_", rownames(mat))  # tag row labels with "1_" (first ident set)
278 | colnames(mat)<- paste0("2_", colnames(mat))  # tag column labels with "2_"; presumably keeps the two label sets distinct in the plot
279 | circlize::circos.par(start.degree = 90, clock.wise = FALSE)  # set layout parameters before drawing
280 | #grid.col<- c("")
281 | circlize::chordDiagram(mat, big.gap = 20, transparency = 0.5, link.sort = TRUE, link.decreasing = FALSE, directional = -1)
282 | circlize::circos.clear()  # reset circlize state so the parameters set above do not carry over to later plots
283 | 
284 | 
285 | pbmc@ident %>% table() %>% as.data.frame()
286 | pbmc@ident %>% table() %>% rbind() %>% as.data.frame()
287 | # Bar chart of cluster sizes, with the count printed above each bar.
288 | cluster_size <- as.data.frame(table(pbmc@ident))
289 | cluster_size %>%
290 |   dplyr::rename(cluster = Var1, size = Freq) %>%
291 |   ggplot(aes(x = cluster, y = size)) +
292 |   geom_col(fill = "blue") +
293 |   geom_text(aes(label = size), vjust = -1.5, angle = 45) +
294 |   theme(axis.text.x = element_text(angle = 45, hjust = 1))
295 | ```
296 | 
297 | 
298 | 
299 | 


--------------------------------------------------------------------------------