├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CRAN-SUBMISSION ├── DESCRIPTION ├── NAMESPACE ├── NEWS ├── R ├── RcppExports.R ├── RunHarmony.R ├── data.R ├── harmony-package.r ├── harmony_option.R ├── ui.R └── utils.R ├── README.md ├── appveyor.yml ├── cran-comments.md ├── data ├── cell_lines.rda ├── cell_lines_small.RData └── pbmc_stim.RData ├── doc ├── Seurat.R ├── Seurat.Rmd ├── Seurat.html ├── detailedWalkthrough.R ├── detailedWalkthrough.Rmd ├── detailedWalkthrough.html ├── parameters.R ├── parameters.Rmd ├── parameters.html ├── quickstart.R ├── quickstart.Rmd └── quickstart.html ├── man ├── HarmonyMatrix.Rd ├── RunHarmony.Rd ├── RunHarmony.Seurat.Rd ├── RunHarmony.SingleCellExperiment.Rd ├── RunHarmony.default.Rd ├── cell_lines.Rd ├── cell_lines_small.Rd ├── figures │ └── logo.png ├── harmony.Rd ├── harmony_options.Rd ├── moe_ridge_get_betas.Rd ├── pbmc.ctrl.Rd ├── pbmc.stim.Rd └── pipe.Rd ├── src ├── .gitignore ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── harmony.cpp ├── harmony.h ├── harmony_types.h ├── types.h ├── utils.cpp └── utils.h ├── tests ├── testthat.R └── testthat │ └── test_integration.R └── vignettes ├── .gitignore ├── Seurat.Rmd ├── detailedWalkthrough.Rmd ├── main.jpg └── quickstart.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | src/*\.so 2 | src/*\.o 3 | src/.cache 4 | src/Makefile 5 | src/compile_commands.json 6 | data/pbmc_stim_original.RData 7 | ^.*\.Rproj 8 | ^\.Rproj\.user$ 9 | ^.travis.yml$ 10 | appveyor\.yml 11 | ^CRAN-SUBMISSION$ 12 | ^cran-comments\.md$ 13 | ^doc$ 14 | ^\.Rproj\.user$ 15 | ^.*\.Rproj$ 16 | ^docs$ 17 | ^README.*\.md$ 18 | ^codecov\.yml$ 19 | ^NEWS\.md$ 20 | ^Meta$ 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | inst/doc 2 | *.ipynb 3 | .DS_Store 4 | .ipynb_checkpoints 5 | **/.ipynb_checkpoints 6 | src/*.o 7 | 
src/*.so 8 | config.log 9 | .Rproj.user 10 | config.status 11 | *.Rproj 12 | *.swp 13 | ..Rcheck 14 | /Meta/ 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | os: linux 3 | cache: packages 4 | warnings_are_errors: true 5 | r_check_args: "--no-manual --timings" 6 | 7 | bioc_packages: 8 | - BiocStyle 9 | - SingleCellExperiment 10 | 11 | jobs: 12 | include: 13 | - r: release 14 | os: osx 15 | - r: release 16 | os: linux 17 | 18 | env: 19 | global: 20 | - _R_CHECK_FORCE_SUGGESTS_: false 21 | - _R_CHECK_LENGTH_1_CONDITION_: verbose 22 | - _R_CHECK_LENGTH_1_LOGIC2_: verbose 23 | 24 | notifications: 25 | email: 26 | on_success: change 27 | on_failure: change 28 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.1.0 2 | Date: 2023-10-20 11:26:27 UTC 3 | SHA: b1a43609415cbe30d56c6530c4e08b7182fa1885 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: harmony 2 | Title: Fast, Sensitive, and Accurate Integration of Single Cell Data 3 | Version: 1.2.3 4 | Authors@R: c( 5 | person("Ilya", "Korsunsky", email = "ilya.korsunsky@gmail.com", 6 | role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4848-3948")), 7 | person("Martin", "Hemberg", email = "mhemberg@bwh.harvard.edu", 8 | role = c("aut"), comment = c(ORCID = "0000-0001-8895-5239")), 9 | person("Nikolaos", "Patikas", email = "nik.patik@gmail.com", 10 | role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-3978-0134")), 11 | person("Hongcheng", "Yao", email = "hongchengyaonk@gmail.com", 12 | role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-0743-4835")), 13 | person("Nghia", "Millard", 
email = "nmillard@g.harvard.edu", 14 | role = "aut", comment = c(ORCID = "0000-0002-0518-7674")), 15 | person("Jean", "Fan", email = "jeanfan@fas.harvard.edu", 16 | role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-0212-5451")), 17 | person("Kamil", "Slowikowski", email = "kslowikowski@gmail.com", 18 | role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-2843-6370")), 19 | person("Miles", "Smith", 20 | role = c("ctb")), 21 | person("Soumya", "Raychaudhuri", 22 | role = c("aut"), comment = c(ORCID = "0000-0002-1901-8265")) 23 | ) 24 | Description: Implementation of the Harmony algorithm for single cell integration, described in Korsunsky et al . Package includes a standalone Harmony function and interfaces to external frameworks. 25 | URL: https://github.com/immunogenomics/harmony 26 | License: GPL-3 27 | Encoding: UTF-8 28 | RoxygenNote: 7.2.3 29 | Depends: R(>= 3.5.0), Rcpp 30 | LazyData: true 31 | LazyDataCompression: gzip 32 | LinkingTo: Rcpp, RcppArmadillo, RcppProgress 33 | Imports: 34 | dplyr, 35 | cowplot, 36 | ggplot2, 37 | Matrix, 38 | methods, 39 | tibble, 40 | rlang, 41 | RhpcBLASctl 42 | Suggests: 43 | SingleCellExperiment, 44 | Seurat (>= 4.1.1), 45 | testthat, 46 | knitr, 47 | rmarkdown, 48 | ggthemes, 49 | ggrepel, 50 | patchwork, 51 | tidyverse, 52 | tidyr, 53 | data.table 54 | VignetteBuilder: knitr 55 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(RunHarmony,Seurat) 4 | S3method(RunHarmony,SingleCellExperiment) 5 | S3method(RunHarmony,default) 6 | export("%>%") 7 | export(HarmonyMatrix) 8 | export(RunHarmony) 9 | export(harmony_options) 10 | export(moe_ridge_get_betas) 11 | importFrom(Rcpp,loadModule) 12 | importFrom(Rcpp,sourceCpp) 13 | importFrom(cowplot,plot_grid) 14 | importFrom(dplyr,"%>%") 15 | importFrom(methods,as) 16 | 
importFrom(methods,hasArg) 17 | importFrom(methods,is) 18 | importFrom(methods,new) 19 | importFrom(rlang,.data) 20 | importFrom(rlang,`%||%`) 21 | importFrom(stats,model.matrix) 22 | useDynLib(harmony) 23 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | # harmony v1.2.0 - Oct 12 2023 2 | - Major performance enhancements, using indexes for the regression 3 | - update_R - Generate blocks correctly #214 4 | - lambda optimization - Lambda as a function of E. 5 | - New alpha parameter to estimate lambda during runtime 6 | - fail-safe for < 40 cells dataset . Setting block_size=0.2. Refuse to run with < 6 cells. 7 | - added progress bar for the integration step. 8 | # harmony v1.1.0 - Oct 12 2023 9 | - update_R bug - All cells are corrected exactly once per invocation 10 | - Improved documentation RunHarmony generc 11 | - Fix lambda failing on multiple covariates 12 | - verbose option suppresses all messages 13 | # harmony v1.0.0 - Jul 27 2023 14 | * API changes 15 | - removed do_pca functionality 16 | - removed reference_values functionality 17 | - removed cluster_prior functionality 18 | - beta feature: automatic parameterization of lambda when it is set to NULL 19 | - ncore parameter controls the use of multiple processors when parallelized BLAS exists. 20 | - Moved several parameters to the .options. Now they are accessible through harmony_options() 21 | * Documentation 22 | - Updated seurat vignette 23 | - Removed mudan Seurat2 and Seurat3 vignettes 24 | * Name changes 25 | - Integrated HarmonyMatrix function to the RunHarmony generic 26 | - HarmonyMatrix is deprecated 27 | * Backend changes 28 | - Sparse matrix coercion to yield performance enhancements 29 | - L2-normalization using armadillo routines 30 | - Supports parallel versions of BLAS. 
31 | * Fixes 32 | - RunHarmony() for Seurat considers dimension set 33 | - RunHarmony() for SingleCellExperiment works 34 | - custom ceiling function to avoid conflicts for some block_size 35 | - Coercing covariate to factor when levels are numbers 36 | 37 | * New features 38 | - Automatic parameterization of lambda 39 | - Updated convergence plot 40 | 41 | 42 | # harmony v0.1.0 43 | * Initial release to CRAN 44 | 45 | # harmony v0.1.1 46 | * Updates pow to harmony_pow to avoid collision with new Armadillo pow function 47 | 48 | 49 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | kmeans_centers <- function(X, K) { 5 | .Call('_harmony_kmeans_centers', PACKAGE = 'harmony', X, K) 6 | } 7 | 8 | scaleRows_dgc <- function(x, p, i, ncol, nrow, thresh) { 9 | .Call('_harmony_scaleRows_dgc', PACKAGE = 'harmony', x, p, i, ncol, nrow, thresh) 10 | } 11 | 12 | find_lambda_cpp <- function(alpha, cluster_E) { 13 | .Call('_harmony_find_lambda_cpp', PACKAGE = 'harmony', alpha, cluster_E) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /R/RunHarmony.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Generic function that runs the harmony algorithm on single-cell 4 | #' genomics cell embeddings. 5 | #' 6 | #' RunHarmony is generic function that runs the main Harmony 7 | #' algorithm. If working with single cell R objects, please refer to 8 | #' the documentation of the appropriate generic API: 9 | #' ([RunHarmony.Seurat()] or [RunHarmony.SingleCellExperiment()]). If 10 | #' users work with other forms of cell embeddings, the can pass them 11 | #' directly to harmony using [RunHarmony.default()] API. 
#' All the function arguments listed here are common in all RunHarmony
#' interfaces.
#'
#' @family RunHarmony
#' @rdname RunHarmony
#' @inheritDotParams RunHarmony.default -data_mat -meta_data -vars_use -return_object
#'
#'
#' @return If used with single-cell objects, it will return the
#'     updated single-cell object. For standalone operation, it
#'     returns the corrected cell embeddings or the R6 harmony object
#'     (see [RunHarmony.default()]).
#'
#' @export
#' @md
RunHarmony <- function(...) {
    ## S3 dispatch on the class of the first argument
    UseMethod("RunHarmony")
}



#' Applies harmony on a Seurat object cell embedding.
#'
#' @rdname RunHarmony.Seurat
#' @family RunHarmony
#' @inheritDotParams RunHarmony.default -data_mat -meta_data -vars_use -return_object
#'
#' @param object the Seurat object. It needs to have the appropriate slot
#'     of cell embeddings precomputed.
#' @param group.by.vars the name(s) of covariates that harmony will remove
#'     its effect on the data.
#' @param reduction.use Name of dimension reduction to use. Default is pca.
#' @param dims.use indices of the cell embedding features to be used.
#'     At least two dimensions must be selected. Default (NULL) uses all
#'     dimensions of `reduction.use`.
#' @param reduction.save the name of the new slot that is going to be created by
#'     harmony. By default, harmony.
#' @param project.dim Project dimension reduction loadings. Default TRUE.
#'
#' @return Seurat object. Harmony dimensions placed into a new slot in the Seurat
#'     object according to the reduction.save. For downstream Seurat analyses,
#'     use reduction='harmony'.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' ## seu is a Seurat single-Cell R object
#' seu <- RunHarmony(seu, "donor_id")
#' }
RunHarmony.Seurat <- function(
    object,
    group.by.vars,
    reduction.use = 'pca',
    dims.use = NULL,
    reduction.save = "harmony",
    project.dim = TRUE,
    ...
) {
    ## Seurat is only a suggested dependency: fail early if it is missing
    if (!requireNamespace('Seurat', quietly = TRUE)) {
        stop("Running Harmony on a Seurat object requires Seurat")
    }
    if (!reduction.use %in% Seurat::Reductions(object = object)) {
        stop(paste(reduction.use, "cell embeddings not found in Seurat object.",
                   "For a Seurat preprocessing walkthrough, please refer to the vignette"))
    }

    embedding <- Seurat::Embeddings(object, reduction = reduction.use)
    dims_avail <- seq_len(ncol(embedding))
    if (is.null(dims.use)) {
        dims.use <- dims_avail
    }
    if (!all(dims.use %in% dims_avail)) {
        stop(paste("trying to use more dimensions than computed.",
                   "Rerun dimension reduction with more dimensions",
                   "or run Harmony with fewer dimensions"))
    }
    if (length(dims.use) == 1) {
        stop("only specified one dimension in dims.use")
    }

    ## Covariates are fetched for exactly the cells present in the reduction
    metavars_df <- Seurat::FetchData(
        object,
        group.by.vars,
        cells = Seurat::Cells(x = object[[reduction.use]])
    )

    harmonyEmbed <- RunHarmony(
        ## drop = FALSE keeps a matrix whatever the number of selected columns
        data_mat = embedding[, dims.use, drop = FALSE],
        meta_data = metavars_df,
        vars_use = group.by.vars,
        return_object = FALSE,
        ...
    )

    reduction.key <- Seurat::Key(reduction.save, quiet = TRUE)
    rownames(harmonyEmbed) <- rownames(embedding)
    colnames(harmonyEmbed) <- paste0(reduction.key, seq_len(ncol(harmonyEmbed)))

    object[[reduction.save]] <- Seurat::CreateDimReducObject(
        embeddings = harmonyEmbed,
        stdev = as.numeric(apply(harmonyEmbed, 2, stats::sd)),
        assay = Seurat::DefaultAssay(object = object[[reduction.use]]),
        key = reduction.key
    )
    if (project.dim) {
        object <- Seurat::ProjectDim(
            object,
            reduction = reduction.save,
            overwrite = TRUE,
            verbose = FALSE
        )
    }
    return(object)
}



#' Applies harmony on PCA cell embeddings of a SingleCellExperiment.
#'
#' @rdname RunHarmony.SingleCellExperiment
#' @inheritDotParams RunHarmony.default -data_mat -meta_data -vars_use -return_object
#' @family RunHarmony
#'
#' @param object SingleCellExperiment with the PCA reducedDim cell embeddings populated
#' @param group.by.vars the name(s) of covariates that harmony will remove
#'     its effect on the data.
#' @param dims.use a vector of indices that allows only selected cell embeddings
#'     features to be used. At least two dimensions must be selected.
#'     Default (NULL) uses all the PCA dimensions.
#' @param verbose enable verbosity
#' @param reduction.save the name of the new slot that is going to be created by
#'     harmony. By default, HARMONY.
#'
#'
#' @return SingleCellExperiment object. After running RunHarmony, the corrected
#'     cell embeddings can be accessed with reducedDim(object, reduction.save),
#'     i.e. reducedDim(object, "HARMONY") by default.
#' @export
#'
#' @examples
#' \dontrun{
#' ## sce is a SingleCellExperiment R object
#' sce <- RunHarmony(sce, "donor_id")
#' }
RunHarmony.SingleCellExperiment <- function(
    object,
    group.by.vars,
    dims.use = NULL,
    verbose = TRUE,
    reduction.save = "HARMONY",
    ...
) {

    ## Get PCA embeddings
    if (!"PCA" %in% SingleCellExperiment::reducedDimNames(object)) {
        stop("PCA must be computed before running Harmony.")
    }
    pca_embedding <- SingleCellExperiment::reducedDim(object, "PCA")

    dims_avail <- seq_len(ncol(pca_embedding))
    if (is.null(dims.use)) {
        dims.use <- dims_avail
    }
    if (!all(dims.use %in% dims_avail)) {
        stop(paste("trying to use more dimensions than computed with PCA.",
                   "Rerun PCA with more dimensions or use fewer PCs"))
    }
    ## Same guard as the Seurat method: a single column would otherwise be
    ## dropped to a plain vector and fail later with an obscure error
    if (length(dims.use) == 1) {
        stop("only specified one dimension in dims.use")
    }

    metavars_df <- SingleCellExperiment::colData(object)
    if (!all(group.by.vars %in% colnames(metavars_df))) {
        stop('Trying to integrate over variables missing in colData')
    }

    harmonyEmbed <- RunHarmony(
        data_mat = pca_embedding[, dims.use, drop = FALSE],
        meta_data = metavars_df,
        vars_use = group.by.vars,
        return_object = FALSE,
        verbose = verbose,
        ...
    )


    rownames(harmonyEmbed) <- row.names(metavars_df)
    colnames(harmonyEmbed) <- paste0(reduction.save, "_", seq_len(ncol(harmonyEmbed)))
    SingleCellExperiment::reducedDim(object, reduction.save) <- harmonyEmbed

    return(object)
}

# --------------------------------------------------------------------------------
# /R/data.R:
# --------------------------------------------------------------------------------
#' List of metadata table and scaled PCs matrix
#'
#' @format:
#'   meta_data: data.table of 9478 rows with defining dataset and cell_type
#'   scaled_pcs: data.table of 9478 rows (cells) and 20 columns (PCs)
#'
#' @source \url{https://www.10xgenomics.com}
"cell_lines"

#' Same as cell_lines but smaller (300 cells).
11 | #' 12 | #' @source \url{https://www.10xgenomics.com} 13 | "cell_lines_small" 14 | 15 | 16 | #' Gene expression data of control PBMC from Kang et al. 2017. This 17 | #' contains a sample of 1000 cells from that condition and is used for 18 | #' the Seurat Vignette. 19 | #' 20 | #' @source \doi{10.1038/nbt.4042} 21 | "pbmc.ctrl" 22 | 23 | 24 | #' Gene expression data of stimulated PBMC from Kang et al. 2017. This 25 | #' contains a sample of 1000 cells from that condition and is used for 26 | #' the Seurat Vignette. 27 | #' 28 | #' @source \doi{10.1038/nbt.4042} 29 | "pbmc.stim" 30 | 31 | 32 | -------------------------------------------------------------------------------- /R/harmony-package.r: -------------------------------------------------------------------------------- 1 | #' Harmony: fast, accurate, and robust single cell integration. 2 | #' 3 | #' Algorithm for single cell integration. 4 | #' 5 | #' @section Usage: 6 | #' 7 | #' 8 | #' ?RunHarmony to run Harmony on cell embeddings matrix, Seurat or 9 | #' SingleCellExperiment objects. 
10 | #' 11 | #' @section Useful links: 12 | #' 13 | #' \enumerate{ 14 | #' \item Report bugs at \url{https://github.com/immunogenomics/harmony/issues} 15 | #' \item Read the manuscript 16 | #' \doi{10.1038/s41592-019-0619-0} 17 | #' } 18 | #' 19 | #' 20 | #' @name harmony 21 | #' @docType package 22 | #' @useDynLib harmony 23 | #' @importFrom Rcpp sourceCpp 24 | #' @importFrom Rcpp loadModule 25 | #' @importFrom methods new 26 | #' @importFrom methods as 27 | #' @importFrom methods is 28 | #' @importFrom cowplot plot_grid 29 | #' @importFrom rlang .data 30 | #' @importFrom rlang `%||%` 31 | #' @importFrom stats model.matrix 32 | loadModule("harmony_module", TRUE) 33 | NULL 34 | -------------------------------------------------------------------------------- /R/harmony_option.R: -------------------------------------------------------------------------------- 1 | #' Set advanced parameters for RunHarmony 2 | #' @param alpha When setting lambda = NULL and use lambda estimation mode, 3 | #' lambda would be determined by the expected number of cells assuming 4 | #' idependece between batches and clusters. i.e., lambda = alpha * expected 5 | #' number of cells, default 0.2 and alpha should be 0 < alpha < 1 6 | #' @param tau Protection against overclustering small datasets with 7 | #' large ones. `tau` is the expected number of cells per cluster. 8 | #' @param block.size What proportion of cells to update during clustering. 9 | #' Between 0 to 1, default 0.05. Larger values may be faster but less 10 | #' accurate. 11 | #' @param max.iter.cluster Maximum number of rounds to run clustering 12 | #' at each round of Harmony. 13 | #' @param epsilon.cluster Convergence tolerance for clustering round 14 | #' of Harmony. Set to -Inf to never stop early. 15 | #' @param epsilon.harmony Convergence tolerance for Harmony. Set to -Inf to 16 | #' never stop early. When `epsilon.harmony` is set to not NULL, then 17 | #' user-supplied values of `early_stop` is ignored. 
#' @returns Return a list for `.options` argument of `RunHarmony`
#' @export
#' @examples
#' ## If want to set max.iter.cluster to be 100, do
#' \dontrun{
#' RunHarmony(data_mat, meta_data, vars_use,
#'                   .options = harmony_options(max.iter.cluster = 100))
#' }
#'
harmony_options <- function(
  alpha = 0.2,
  tau = 0,
  block.size = 0.05,
  max.iter.cluster = 20,
  epsilon.cluster = 1e-3,
  epsilon.harmony = 1e-2) {

    ## block.size is the only option validated here; invalid values stop()
    block.size <- validate_block.size(block.size)

    out <- list(
        alpha = alpha,
        tau = tau,
        block.size = block.size,
        max.iter.cluster = max.iter.cluster,
        epsilon.cluster = epsilon.cluster,
        epsilon.harmony = epsilon.harmony
    )
    ## The class tag is what RunHarmony.default() checks with inherits()
    out <- structure(out, class = "harmony_options")
    return(out)
}

## Validate functions -----------------------------------------------------------

## Check that block.size is a single, non-missing number in (0, 1].
## Returns block.size unchanged when valid and stops otherwise.
validate_block.size <- function(block.size) {
    ## Scalar `&&` short-circuits, so NA, non-numeric or length != 1 input is
    ## rejected with the documented message rather than an unrelated `if` error
    is_valid <- is.numeric(block.size) &&
        length(block.size) == 1L &&
        !is.na(block.size) &&
        block.size > 0 &&
        block.size <= 1
    if (!is_valid) {
        stop('Error: block.size should be set between 0 and 1 (0 < block.size <= 1)')
    }
    return(block.size)
}


## Emit a deprecation warning for every legacy argument found in `...`.
## The arguments themselves are not used, only their presence is tested.
#' @importFrom methods hasArg
check_legacy_args <- function(...) {
    if (hasArg("do_pca") || hasArg("npcs")) legacy_warning("do_pca_npcs")
    if (hasArg("tau")) legacy_warning("tau")
    if (hasArg("block.size")) legacy_warning("block.size")
    if (hasArg("max.iter.harmony")) legacy_warning("max.iter.harmony")
    if (hasArg("max.iter.cluster")) legacy_warning("max.iter.cluster")
    if (hasArg("epsilon.cluster")) legacy_warning("epsilon.cluster")
    if (hasArg("epsilon.harmony")) legacy_warning("epsilon.harmony")

}




## Warn (once per session and per parameter) that `param` is deprecated.
## `param` is the legacy argument name, or "do_pca_npcs" for the do_pca/npcs pair.
legacy_warning <- function(param) {
    ## switch() only builds the message that is needed. The unnamed last
    ## branch is the generic message used for tau, block.size,
    ## max.iter.cluster, epsilon.cluster and any other name, so warn_str is
    ## always defined.
    warn_str <- switch(
        param,
        do_pca_npcs = paste0(
            "Warning: The parameters ", "do_pca and npcs", " are deprecated. ",
            "They will be ignored for this function call ",
            "and please remove parameters ", "do_pca and npcs",
            " and pass to harmony cell_embeddings directly."
        ),
        max.iter.harmony = paste0(
            "Warning: The parameter ", "max.iter.harmony ",
            "is replaced with parameter ", "max_iter. ",
            "It will be ignored for this function call ",
            "and please use parameter ", "max_iter ", "in future function calls."
        ),
        epsilon.harmony = paste0(
            "Warning: The parameter ", "epsilon.harmony", " is deprecated. ",
            "It will be ignored for this function call ",
            "and please remove parameter ", "epsilon.harmony",
            " in future function calls. ",
            "If users want to control if harmony would stop early or not, ",
            "use parameter ", "early_stop. ",
            "Advanced users can set value of parameter ", "epsilon.harmony",
            " by using parameter .options and function harmony_options()."
        ),
        paste0(
            "Warning: The parameter ", param, " is deprecated. ",
            "It will be ignored for this function call ",
            "and please remove parameter ", param, " in future function calls. ",
            "Advanced users can set value of parameter ", param,
            " by using parameter .options and function harmony_options()."
        )
    )

    rlang::warn(warn_str, .frequency = "once", .frequency_id = param)
}

# --------------------------------------------------------------------------------
# /R/ui.R:
# --------------------------------------------------------------------------------
#' This is the primary harmony interface.
#'
#' Use this generic with a cell embeddings matrix, a metadata table
#' and a categorical covariate to run the Harmony algorithm directly
#' on cell embedding matrix.
#'
#' @rdname RunHarmony.default
#' @family RunHarmony
#'
#' @param data_mat Matrix of cell embeddings. Cells can be rows or
#'     columns and will be inferred by the rows of meta_data.
#' @param meta_data Either (1) Dataframe with variables to integrate
#'     or (2) vector with labels.
#' @param vars_use If meta_data is dataframe, this defined which
#'     variable(s) to remove (character vector). The selected
#'     variables must not contain missing values.
#' @param theta Diversity clustering penalty parameter. Specify for
#'     each variable in vars_use Default theta=2. theta=0 does not
#'     encourage any diversity. Larger values of theta result in more
#'     diverse clusters.
#' @param sigma Width of soft kmeans clusters. Default
#'     sigma=0.1. Sigma scales the distance from a cell to cluster
#'     centroids. Larger values of sigma result in cells assigned to
#'     more clusters. Smaller values of sigma make soft kmeans cluster
#'     approach hard clustering.
#' @param lambda Ridge regression penalty. Default lambda=1. Bigger
#'     values protect against over correction. If several covariates
#'     are specified, then lambda can also be a vector which needs to
#'     be equal length with the number of variables to be
#'     corrected. In this scenario, each covariate level group will be
#'     assigned the scalars specified by the user. If set to NULL,
#'     harmony will start lambda estimation mode to determine lambdas
#'     automatically and try to minimize overcorrection (Use with caution still
#'     in beta testing).
#' @param nclust Number of clusters in model. nclust=1 equivalent to
#'     simple linear regression. Default (NULL) uses
#'     min(round(N / 30), 100) and at least 1, where N is the number
#'     of cells.
#' @param max_iter Maximum number of rounds to run Harmony. One round
#'     of Harmony involves one clustering and one correction step.
#' @param early_stop Enable early stopping for harmony. The
#'     harmonization process will stop when the change of objective
#'     function between corrections drops below `epsilon.harmony`
#'     (set through `harmony_options()`, default 1e-2).
#' @param ncores Number of processors to be used for math operations
#'     when optimized BLAS is available. If BLAS is not supporting
#'     multithreaded then this option has no effect. By default,
#'     ncores=1 which runs as a single-threaded process. Although
#'     Harmony supports multiple cores, it is not optimized for
#'     multithreading. Increase this number for large datasets iff
#'     single-core performance is not adequate.
#' @param plot_convergence Whether to print the convergence plot of
#'     the clustering objective function. TRUE to plot, FALSE to
#'     suppress. This can be useful for debugging.
#' @param return_object (Advanced Usage) Whether to return the Harmony
#'     object or only the corrected PCA embeddings.
#' @param verbose Whether to print progress messages. TRUE to print,
#'     FALSE to suppress.
#' @param .options Setting advanced parameters of RunHarmony. This must be the
#'     result from a call to `harmony_options`. See ?`harmony_options` for
#'     parameters not listed above and more details.
#' @param ... other parameters that are not part of the API
#'
#' @return By default, matrix with corrected PCA embeddings. If
#'     return_object is TRUE, returns the full Harmony object (R6
#'     reference class type).
#'
#' @export
#'
#' @examples
#'
#'
#' ## By default, Harmony inputs a cell embedding matrix
#' \dontrun{
#' harmony_embeddings <- RunHarmony(cell_embeddings, meta_data, 'dataset')
#' }
#'
#' ## If PCA is the input, the PCs need to be scaled
#' data(cell_lines_small)
#' pca_matrix <- cell_lines_small$scaled_pcs
#' meta_data <- cell_lines_small$meta_data
#' harmony_embeddings <- RunHarmony(pca_matrix, meta_data, 'dataset')
#'
#' ## Output is a matrix of corrected PC embeddings
#' dim(harmony_embeddings)
#' harmony_embeddings[seq_len(5), seq_len(5)]
#'
#' ## Finally, we can return an object with all the underlying data structures
#' harmony_object <- RunHarmony(pca_matrix, meta_data, 'dataset', return_object=TRUE)
#' dim(harmony_object$Y) ## cluster centroids
#' dim(harmony_object$R) ## soft cluster assignment
#' dim(harmony_object$Z_corr) ## corrected PCA embeddings
#' head(harmony_object$O) ## batch by cluster co-occurence matrix
#'
RunHarmony.default <- function(
  data_mat,
  meta_data,
  vars_use,
  theta = NULL,
  sigma = 0.1,
  lambda = 1,
  nclust = NULL,
  max_iter = 10,
  early_stop = TRUE,
  ncores = 1,
  plot_convergence = FALSE,
  return_object = FALSE,
  verbose = TRUE,
  .options = harmony_options(),
  ...
) {


    ## Try to set number of OPENBLAS cores for harmony.
    ## the function tries to set OpenMP threads
    ## In case OpenMP is not supported it returns FALSE so we don't
    ## set threads and harmony runs in single-thread mode
    set.cores <- setOMPthreads(ncores)

    ## Set threads if BLAS threads are set/detected properly. The previous
    ## values are captured before anything else can fail and are restored by
    ## on.exit() on every exit path (normal return, error or interrupt).
    if (set.cores) {
        ## NOTE(review): *_get_num_procs() is what the original code restores
        ## from; confirm against the RhpcBLASctl docs that this is the
        ## previous thread setting and not the processor count.
        prev.ncores.blas <- RhpcBLASctl::blas_get_num_procs()
        prev.ncores.omp <- RhpcBLASctl::omp_get_num_procs()
        on.exit({
            RhpcBLASctl::blas_set_num_threads(prev.ncores.blas)
            RhpcBLASctl::omp_set_num_threads(prev.ncores.omp)
        }, add = TRUE)
        RhpcBLASctl::blas_set_num_threads(ncores)
        RhpcBLASctl::omp_set_num_threads(ncores)
    }

    ## Check legacy arguments
    check_legacy_args(...)


    ## Parameter setting --------------------------------------------------------

    if (!inherits(.options, "harmony_options")) {
        stop("Error: .options must be created from harmony_options()!")
    }

    ## -Inf tolerance disables early stopping
    if (early_stop) {
        epsilon.harmony <- .options$epsilon.harmony
    } else {
        epsilon.harmony <- -Inf
    }
    max.iter.harmony <- max_iter
    alpha <- .options$alpha
    tau <- .options$tau
    block.size <- .options$block.size
    max.iter.cluster <- .options$max.iter.cluster
    epsilon.cluster <- .options$epsilon.cluster



    ## TODO: check for
    ##    batch variables with only 1 level (WARNING)
    ##    very small batch size and tau=0: suggest tau>0
    ##    is PCA correct?
    if (!(is(meta_data, 'data.frame') || is(meta_data, 'DataFrame'))) {
        ## A bare vector of labels is wrapped into a one-column data.frame
        if (length(meta_data) %in% dim(data_mat)) {
            meta_data <- data.frame(batch_variable = meta_data)
            vars_use <- 'batch_variable'
        } else {
            stop(paste('meta_data must be either a data.frame or a vector',
                       'with batch values for each cell'))
        }
    }

    if (is.null(vars_use) || any(!vars_use %in% colnames(meta_data))) {
        msg <- gettextf('must provide variables names (e.g. vars_use=%s)',
                        sQuote('stim'))
        stop(msg)
    }

    ## Number of cells
    N <- nrow(meta_data)

    ## Check if we need to transpose our data
    ## (the backend expects features x cells)
    if (nrow(data_mat) == N) {
        if (verbose) {
            message("Transposing data matrix")
        }
        data_mat <- Matrix::t(data_mat)
    }

    if (ncol(data_mat) != N) {
        stop(paste("number of labels do not correspond to number of",
                   "samples in data matrix"))
    }


    # determine K if null
    if (is.null(nclust)) {
        ## max(1, .) keeps at least one cluster: round(N / 30) is 0 for N < 15
        nclust <- max(1, min(round(N / 30), 100))
    }

    # determine theta if null
    if (is.null(theta)) {
        theta <- rep(2, length(vars_use))
    } else if (length(theta) != length(vars_use)) {
        stop('Please specify theta for each variable')
    }

    ## determine sigma if it is a scalar
    if (length(sigma) == 1 && nclust > 1) {
        sigma <- rep(sigma, nclust)
    }


    ## One factor per covariate. Unused levels are dropped because they would
    ## add all-zero rows to phi (N_b == 0), which turns the theta scaling
    ## below into NaN when tau = 0.
    batch_factors <- lapply(vars_use, function(var_use) {
        values <- meta_data[[var_use]]
        ## Missing labels would be silently dropped by the model frame and
        ## phi would no longer have one column per cell
        if (anyNA(values)) {
            stop(paste0("Variable ", sQuote(var_use),
                        " contains missing values. ",
                        "Every cell needs a batch label."))
        }
        droplevels(as.factor(values))
    })

    ## Pre-compute some useful statistics
    ## phi: (total number of levels) x (cells) one-hot design matrix
    phi <- Reduce(rbind, lapply(batch_factors, function(batch) {
        Matrix::t(Matrix::sparse.model.matrix(~0 + batch))
    }))

    ## ## number of cells per batch
    N_b <- Matrix::rowSums(phi)

    ## Number of factors per covariate
    B_vec <- vapply(batch_factors, nlevels, integer(1))

    ## lambda=NULL means we have automatic estimation
    lambda.auto <- is.null(lambda)
    if (lambda.auto) {
        if (verbose) {
            message("Using automatic lambda estimation")
        }
        lambda_vec <- -1 ## Magic value for the backend
    } else {
        ## We use fixed lambdas
        if (!all(lambda > 0)) {
            stop("Provided lambdas must be positive")
        }
        if (length(lambda) == 1) {
            ## Single lambda is being used for all covariates
            ## (leading 0: no penalty on the first, intercept term)
            lambda_vec <- c(0, rep(lambda, sum(B_vec)))
        } else {
            ## Several lambdas, one for each covariate
            if (length(lambda) != length(vars_use)) {
                stop(paste0("You specified a lambda value for each ",
                            "covariate but the number of lambdas specified (",
                            length(lambda), ") and the number of covariates (",
                            length(vars_use), ") mismatch."))
            }
            lambda_vec <- unlist(lapply(seq_along(B_vec), function(b) {
                rep(lambda[b], B_vec[b])
            }))
            lambda_vec <- c(0, unname(lambda_vec))
        }
    }



    ## Calculate theta (#covariates) x (#levels)
    theta <- Reduce(c, lapply(seq_along(B_vec), function(b) {
        rep(theta[b], B_vec[b])
    }))

    ## Theta scaling (with tau = 0 the factor is exactly 1)
    theta <- theta * (1 - exp(-(N_b / (nclust * tau))^2))

    ## RUN HARMONY
    harmonyObj <- new(harmony)

    harmonyObj$setup(
        data_mat, phi, sigma, theta, lambda_vec, alpha,
        max.iter.cluster, epsilon.cluster,
        epsilon.harmony, nclust, block.size,
        B_vec, verbose
    )


    if (verbose) {
        message("Initializing state using k-means centroids initialization")
    }
    harmonyObj$init_cluster_cpp()

    harmonize(harmonyObj, max.iter.harmony, verbose)

    if (plot_convergence) graphics::plot(HarmonyConvergencePlot(harmonyObj))


    ## Return either the R6 Harmony object or the corrected PCA matrix
    if (return_object) {
        return(harmonyObj)
    }

    ## Z_corr is features x cells; hand back cells x features
    res <- as.matrix(harmonyObj$Z_corr)
    row.names(res) <- row.names(data_mat)
    colnames(res) <- colnames(data_mat)
    return(t(res))
}

#' A proxy call to [RunHarmony()]. Deprecated.
#'
#' Maintain name backwards compatibility with version 0 of
#' harmony.
#' However, API is not backwards compatible with version
#' 0. This function will be deprecated in later versions of Harmony.
#'
#' @inheritDotParams RunHarmony.default
#'
#' @export
#' @md
HarmonyMatrix <- function(...) {
    ## The deprecation text previously repeated "in the future" twice.
    .Deprecated(
        "RunHarmony",
        msg = "HarmonyMatrix is deprecated and will be removed from the API in the future"
    )
    RunHarmony(...)
}

# --------------------------------------------------------------------------------
# /R/utils.R:
# --------------------------------------------------------------------------------

#' Pipe operator
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom dplyr %>%
#' @examples
#' x <- 5 %>% sum(10)
#'
#' @usage lhs \%>\% rhs
#' @return return value of rhs function.
NULL

## Drive the outer Harmony loop.
##
## For at most `iter_harmony` rounds this calls, in order, the object's
## `cluster_cpp()`, `moe_correct_ridge_cpp()` and `check_convergence(1)`
## methods, stopping early as soon as `check_convergence(1)` is TRUE.
##
## Arguments:
##   harmonyObj    object exposing the three methods above via `$`.
##   iter_harmony  maximum number of rounds; values < 1 do nothing.
##   verbose       emit progress messages when TRUE.
##
## Value: 0 when `iter_harmony < 1` or when convergence was reached. If
## every round runs without convergence the `for` loop is the last
## expression evaluated, so the result is an invisible NULL.
##
## Errors: stops with 'terminated by user' when `cluster_cpp()` returns -1
## and with a generic message for any other non-zero status.
harmonize <- function(harmonyObj, iter_harmony, verbose=TRUE) {
    if (iter_harmony < 1) {
        return(0)
    }

    for (iter in seq_len(iter_harmony)) {
        if (verbose) {
            message(gettextf('Harmony %d/%d', iter, iter_harmony))
        }

        # STEP 1: do clustering
        err_status <- harmonyObj$cluster_cpp()
        if (err_status == -1) {
            stop('terminated by user')
        } else if (err_status != 0) {
            stop(gettextf('Harmony exited with non-zero exit status: %d',
                          err_status))
        }

        # STEP 2: regress out covariates
        harmonyObj$moe_correct_ridge_cpp()

        # STEP 3: check for convergence
        if (harmonyObj$check_convergence(1)) {
            if (verbose) {
                message(gettextf("Harmony converged after %d iterations",
                                 iter))
            }
            return(0)
        }
    }
}



## Build a ggplot of the clustering objective recorded on `harmonyObj`.
##
## `harmonyObj$kmeans_rounds` is used as a vector of per-round step counts
## (presumably the number of k-means steps in each Harmony round -- confirm
## against the C++ backend) and `harmonyObj$objective_kmeans` as the
## objective values, whose first (initial) entry is dropped.
##
## Arguments:
##   harmonyObj   object exposing `kmeans_rounds` and `objective_kmeans`.
##   round_start  first round (inclusive) to keep in the plot.
##   round_end    last round (inclusive) to keep in the plot.
##   do_wrap      when TRUE, facet the plot by round.
##
## Value: a ggplot object (one point per retained clustering step,
## coloured by round).
HarmonyConvergencePlot <- function(
    harmonyObj, round_start=1, round_end=Inf, do_wrap=FALSE
) {
    ## ignore initial value
    ## break down kmeans objective into rounds
    obj_fxn <- data.frame(
        ## step index within each round: 1..rounds[1], 1..rounds[2], ...
        kmeans_idx = Reduce(c, lapply(harmonyObj$kmeans_rounds,
                                      function(rounds) {
                                          seq_len(rounds)
                                      })),
        ## round index, repeated once per step of that round
        harmony_idx = Reduce(c, lapply(
            seq_len(length(harmonyObj$kmeans_rounds)),
            function(i) {rep(i, harmonyObj$kmeans_rounds[i])})
        ),
        val = utils::tail(harmonyObj$objective_kmeans, -1)
    ) %>%
        dplyr::filter(.data$harmony_idx >= round_start) %>%
        dplyr::filter(.data$harmony_idx <= round_end) %>%
        tibble::rowid_to_column("idx")


    plt <- obj_fxn %>% ggplot2::ggplot(ggplot2::aes(.data$idx, .data$val,
                                                    col = as.factor(.data$harmony_idx))) +
        ggplot2::geom_point() +
        ggplot2::labs(y = "Objective Function", x = "Clustering Step #", color = "Integration #")

    if (do_wrap) {
        plt <- plt + ggplot2::facet_grid(.~.data$harmony_idx, scales = 'free',
                                         space = 'free_x')
    }
    return(plt)
}





## Scale a sparse matrix through the compiled `scaleRows_dgc()` routine.
##
## Arguments:
##   A       matrix; coerced to "dgCMatrix" when it is not one already.
##   margin  1 passes `A` as is; any other value transposes `A` before the
##           call and transposes the result back afterwards.
##   thresh  forwarded unchanged to `scaleRows_dgc()` (presumably a
##           clipping threshold -- confirm against the C++ source).
##
## Value: the result of `scaleRows_dgc()` in the orientation of the input,
## carrying the input's row and column names.
scaleData <- function(A, margin = 1, thresh = 10) {
    if (!inherits(A, "dgCMatrix")) {
        A <- methods::as(A, "dgCMatrix")
    }

    ## Capture the dimnames before any transposition. Reading them from the
    ## transposed `A` afterwards (as was done before) yields the original
    ## column names as row names, which mislabels the result or fails
    ## outright for non-square input when margin != 1.
    out_rownames <- row.names(A)
    out_colnames <- colnames(A)

    if (margin != 1) A <- t(A)

    res <- scaleRows_dgc(A@x, A@p, A@i, ncol(A), nrow(A), thresh)
    if (margin != 1) res <- t(res)
    row.names(res) <- out_rownames
    colnames(res) <- out_colnames
    return(res)
}


#' Get beta Utility
#'
#' Utility function to get ridge regression coefficients from trained
#' Harmony object
#'
#' @param harmonyObj Trained harmony object. Get this by running
#'     RunHarmony function with return_object=TRUE.
#' @return The value returned by the object's
#'     `moe_ridge_get_betas_cpp()` method.
#' @export
moe_ridge_get_betas <- function(harmonyObj) {
    harmonyObj$moe_ridge_get_betas_cpp()
}


## Validate the requested number of threads against the OpenMP runtime.
##
## Argument:
##   ncores  requested number of threads.
##
## Value: TRUE when `ncores` is a whole number between 1 and
## `RhpcBLASctl::omp_get_max_threads()`; FALSE when that check could not be
## carried out (the query or the comparison raised an error), in which case
## a warning is issued unless `ncores` is 1.
##
## Errors: stops with "Invalid number of ncores provided" when `ncores` is
## fractional, below 1 or above the reported maximum.
setOMPthreads <- function(ncores) {
    ## Both are initialised up front so the error handler can always read
    ## them, even when the query below fails on its first statement.
    invalid.number.of.cores <- FALSE
    max.cores <- NA

    ## The value of tryCatch() is the value of this function. Previously
    ## the handler's `return(FALSE)` only left the handler, and the
    ## function then unconditionally returned TRUE.
    tryCatch({
        ## The following block may fail in some build environments (if
        ## OpenMP is not available). In that case we catch the condition
        ## and warn the user instead of failing.

        ## If OpenMP is not supported, this may return NA
        max.cores <- RhpcBLASctl::omp_get_max_threads()

        ## Sanity check for number of cores
        ## NOTE: when max.cores is NA the condition evaluates to NA and
        ## `if` raises an error, which is handled below as "OpenMP not
        ## supported".
        if ((ncores != as.integer(ncores)) || (ncores < 1) || (ncores > max.cores)) {
            invalid.number.of.cores <- TRUE
            stop("")## Throw exception
        }
        TRUE
    },
    error = function(e) {
        if(invalid.number.of.cores) {
            stop(paste0(
                "Invalid number of ncores provided: ", ncores, ". \n",
                "Maximum available cores: ", max.cores))

        } else if(ncores != 1) {
            warning(paste(
                "Harmony was unable to set number of cores for BLAS.",
                "Running in single-thread mode instead"
            ))
        }
        FALSE
    })
}

## NOTE: the lines below are the non-R remainder of this digest line (file
## separator and the beginning of README.md), kept verbatim as comments.
# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# Harmony
# ===========
#
# [![Travis-CI Build Status](https://travis-ci.org/immunogenomics/harmony.svg?branch=master)](https://travis-ci.org/immunogenomics/harmony)
# [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/immunogenomics/harmony?branch=master&svg=true)](https://ci.appveyor.com/project/immunogenomics/harmony)
# [![DOI](https://zenodo.org/badge/doi/10.1038/s41592-019-0619-0.svg)](https://doi.org/10.1038/s41592-019-0619-0)
#
# *Fast, sensitive and accurate integration of single-cell data with Harmony*
#
# Check out the manuscript in Nature Methods:
# - [nature website](https://www.nature.com/articles/s41592-019-0619-0)
# - [read
link](https://www.nature.com/articles/s41592-019-0619-0.epdf?shared_access_token=rDg_Rd07lrFXExt_ySj7V9RgN0jAjWel9jnR3ZoTv0NfDJkKCfDV_X9Mq3lweQmKiXEXxhrebQRjJEZdc-xNv6-7ZN1XotlD_mo5TSS4Z4eWn-kUo6mBwA5dEAKlTfR8OT6E10MZY_E-906ajbzvgg%3D%3D) 13 | 14 | For Python users, check out the [harmonypy package](https://github.com/slowkow/harmonypy) by Kamil Slowikowski. 15 | 16 | # System requirements 17 | 18 | Harmony has been tested on R versions >= 3.4. Please consult the DESCRIPTION file for more details on required R packages. Harmony has been tested on Linux, OS X, and Windows platforms. 19 | 20 | # Installation 21 | 22 | To run Harmony, open R and install harmony from CRAN: 23 | 24 | ```r 25 | install.packages("harmony") 26 | ``` 27 | 28 | If you'd like the latest development version, install from this github directly: 29 | 30 | ```r 31 | devtools::install_github("immunogenomics/harmony", build_vignettes=TRUE) 32 | ``` 33 | 34 | 35 | # Usage 36 | 37 | Harmony is designed to be user-friendly and supports some SingleCellExperiment and Seurat R analysis pipelines. Alternatively, it can be used in standalone mode. 38 | 39 | ## Quick Start 40 | 41 | ### Standalone Mode 42 | Check out this [vignette](http://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/quickstart.html) for a quick start tutorial which demonstrates the usage of the tool in standalone mode. 43 | 44 | At minimum, the following parameters need to be specified to achieve an integration. 45 | 46 | ```r 47 | library(harmony) 48 | my_harmony_embeddings <- RunHarmony(my_pca_embeddings, meta_data, "dataset") 49 | ``` 50 | 51 | 52 | ## Seurat Objects 53 | 54 | By default, the harmony API works on Seurat's PCA cell embeddings and corrects them. You can run Harmony within your Seurat workflow with `RunHarmony()`. Prior to `RunHarmony()`, the PCA cell embeddings need to be precomputed through Seurat's API. For downstream analyses, use the `harmony` embeddings instead of `pca`.
55 | 56 | For example, the following snippet runs Harmony and then calculates UMAP of the corrected input embeddings: 57 | 58 | ```r 59 | seuratObj <- RunHarmony(seuratObj, "dataset") 60 | seuratObj <- RunUMAP(seuratObj, reduction = "harmony") 61 | ``` 62 | 63 | For a more detailed overview of the `RunHarmony()` Seurat interface, check the [Seurat vignette](http://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/Seurat.html) 64 | 65 | ## Harmony with two or more covariates 66 | 67 | Harmony can integrate over multiple covariates. To do this, specify a vector of covariates to integrate. 68 | 69 | ```r 70 | my_harmony_embeddings <- RunHarmony( 71 | my_pca_embeddings, meta_data, c("dataset", "donor", "batch_id") 72 | ) 73 | ``` 74 | 75 | Do the same with your Seurat object: 76 | 77 | ```r 78 | seuratObject <- RunHarmony(seuratObject, c("dataset", "donor", "batch_id")) 79 | ``` 80 | 81 | ## Advanced tutorial 82 | 83 | The examples above all return integrated PCA embeddings. We created a [detailed walkthrough](http://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/detailedWalkthrough.html) that explores the internal data structures and mechanics of the Harmony algorithm. 84 | 85 | 86 | # Performance Notes 87 | 88 | ## BLAS vs. OPENBLAS 89 | 90 | R distributions can be bundled with different scientific computing libraries. This can drastically impact harmony's performance. Rstudio comes by default with BLAS. In contrast, conda distributions of R are bundled with OPENBLAS. Overall, our benchmarks show that **harmony+OPENBLAS is substantially faster compared to harmony+BLAS**. Therefore users with large datasets will benefit from using OPENBLAS. 91 | 92 | ## Multithreading in OPENBLAS 93 | 94 | One caveat is that OPENBLAS uses OPENMP to parallelize operations. By default, OPENBLAS will utilize all cores for these operations.
While in theory this accelerates runtimes, in practice harmony is not optimized for multi-threaded performance and the unoptimized parallelization granularity may result in significantly slower run times and inefficient resource utilization (wasted CPU cycles). Therefore, by default harmony turns off multi-threading. However, very large datasets >1M may benefit from parallelization. This behavior can be controlled by the `ncores` parameter, which expects the number of threads that harmony will use for its math operations. Users are advised to gradually increase `ncores` and assess potential performance benefits. 95 | 96 | 97 | # Reproducing results from manuscript 98 | 99 | Code to reproduce Harmony results from the Korsunsky et al 2019 manuscript will be made available on github.com/immunogenomics/harmony2019. 100 | 101 | 102 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | # This is a good reference: 14 | # https://github.com/Bioconductor/BiocManager/blob/master/appveyor.yml 15 | 16 | cache: 17 | - C:\RLibrary 18 | 19 | environment: 20 | global: 21 | _R_CHECK_FORCE_SUGGESTS_: false 22 | R_ARCH: x64 23 | USE_RTOOLS: true 24 | R_CHECK_ARGS: "--no-manual --timings" 25 | 26 | matrix: 27 | - R_VERSION: release 28 | BIOC_USE_DEVEL: FALSE 29 | 30 | build_script: 31 | - echo Current directory=%CD% 32 | - travis-tool.sh install_deps 33 | - travis-tool.sh install_bioc_deps 34 | - travis-tool.sh install_r Seurat 35 | 36 | test_script: 37 | - travis-tool.sh run_tests 38 | 39 |
on_failure: 40 | - 7z a failure.zip *.Rcheck\* 41 | - appveyor PushArtifact failure.zip 42 | 43 | artifacts: 44 | - path: '*.Rcheck\**\*.log' 45 | name: Logs 46 | 47 | - path: '*.Rcheck\**\*.out' 48 | name: Logs 49 | 50 | - path: '*.Rcheck\**\*.fail' 51 | name: Logs 52 | 53 | - path: '*.Rcheck\**\*.Rout' 54 | name: Logs 55 | 56 | - path: '\*_*.tar.gz' 57 | name: Bits 58 | 59 | - path: '\*_*.zip' 60 | name: Bits 61 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | This package was archived on 2022-10-30 4 | 5 | There was a conflict with the log() function from RcppArmadillo. 6 | 7 | All errors, warnings, and notes have been addressed. 8 | 9 | * This is a resubmission. 10 | 11 | -------------------------------------------------------------------------------- /data/cell_lines.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/data/cell_lines.rda -------------------------------------------------------------------------------- /data/cell_lines_small.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/data/cell_lines_small.RData -------------------------------------------------------------------------------- /data/pbmc_stim.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/data/pbmc_stim.RData -------------------------------------------------------------------------------- /doc/Seurat.R: -------------------------------------------------------------------------------- 1 | ## ---- include = 
FALSE--------------------------------------------------------- 2 | knitr::opts_chunk$set( 3 | collapse = TRUE, 4 | comment = "#>" 5 | ) 6 | 7 | ## ----setup, message=FALSE, warning=FALSE-------------------------------------- 8 | library(harmony) 9 | library(Seurat) 10 | library(dplyr) 11 | library(cowplot) 12 | 13 | 14 | ## ----eval=FALSE--------------------------------------------------------------- 15 | # install.packages('harmony') 16 | 17 | ## ----------------------------------------------------------------------------- 18 | ## Source required data 19 | data("pbmc_stim") 20 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim, pbmc.ctrl), project = "PBMC", min.cells = 5) 21 | 22 | ## Separate conditions 23 | 24 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim)), rep("CTRL", ncol(pbmc.ctrl))) 25 | 26 | ## ----eval = FALSE, class.source='fold-hide'----------------------------------- 27 | # library(Matrix) 28 | # ## Download and extract files from GEO 29 | # ##setwd("/path/to/downloaded/files") 30 | # genes = read.table("GSE96583_batch2.genes.tsv.gz", header = FALSE, sep = "\t") 31 | # 32 | # pbmc.ctrl.full = as.readMM("GSM2560248_2.1.mtx.gz") 33 | # colnames(pbmc.ctrl.full) = paste0(read.table("GSM2560248_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-1") 34 | # rownames(pbmc.ctrl.full) = genes$V1 35 | # 36 | # pbmc.stim.full = readMM("GSM2560249_2.2.mtx.gz") 37 | # colnames(pbmc.stim.full) = paste0(read.table("GSM2560249_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-2") 38 | # rownames(pbmc.stim.full) = genes$V1 39 | # 40 | # library(Seurat) 41 | # 42 | # pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim.full, pbmc.ctrl.full), project = "PBMC", min.cells = 5) 43 | # pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim.full)), rep("CTRL", ncol(pbmc.ctrl.full))) 44 | # 45 | # 46 | # 47 | # 48 | # # Running Harmony 49 | # 50 | # Harmony works on an existing matrix with cell embeddings and outputs its transformed version with the datasets aligned 
according to some user-defined experimental conditions. By default, harmony will look up the `pca` cell embeddings and use these to run harmony. Therefore, it assumes that the Seurat object has these embeddings already precomputed. 51 | # 52 | # ## Calculate PCA cell embeddings 53 | # 54 | # Here, using `Seurat::NormalizeData()`, we will be generating a union of highly variable genes using each condition (the control and stimulated cells). These features are going to be subsequently used to generate the 20 PCs with `Seurat::RunPCA()`. 55 | # 56 | 57 | ## ----------------------------------------------------------------------------- 58 | pbmc <- pbmc %>% 59 | NormalizeData(verbose = FALSE) 60 | 61 | VariableFeatures(pbmc) <- split(row.names(pbmc@meta.data), pbmc@meta.data$stim) %>% lapply(function(cells_use) { 62 | pbmc[,cells_use] %>% 63 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 64 | VariableFeatures() 65 | }) %>% unlist %>% unique 66 | 67 | pbmc <- pbmc %>% 68 | ScaleData(verbose = FALSE) %>% 69 | RunPCA(features = VariableFeatures(pbmc), npcs = 20, verbose = FALSE) 70 | 71 | ## ---- eval=FALSE-------------------------------------------------------------- 72 | # ## run harmony with default parameters 73 | # pbmc <- pbmc %>% RunHarmony("stim") 74 | # ## is equivalent to: 75 | # pbmc <- RunHarmony(pbmc, "stim") 76 | 77 | ## ---- fig.width = 4, fig.height = 3, fig.align = "center", out.width="50%", fig.cap="By setting `plot_converge=TRUE`, harmony will generate a plot with its objective showing the flow of the integration. Each point represents the cost measured after a clustering round. Different colors represent different Harmony iterations which is controlled by `max_iter` (assuming that early_stop=FALSE). Here `max_iter=10` and up to 10 correction steps are expected. 
However, `early_stop=TRUE` so harmony will stop after the cost plateaus."---- 78 | 79 | pbmc <- pbmc %>% 80 | RunHarmony("stim", plot_convergence = TRUE, nclust = 50, max_iter = 10, early_stop = T) 81 | 82 | ## ----------------------------------------------------------------------------- 83 | harmony.embeddings <- Embeddings(pbmc, reduction = "harmony") 84 | 85 | ## ---- fig.width=7, fig.height=3, out.width="100%", fig.align="center", fig.cap="Evaluate harmonization of stim parameter in the harmony generated cell embeddings"---- 86 | 87 | p1 <- DimPlot(object = pbmc, reduction = "harmony", pt.size = .1, group.by = "stim") 88 | p2 <- VlnPlot(object = pbmc, features = "harmony_1", group.by = "stim", pt.size = .1) 89 | plot_grid(p1,p2) 90 | 91 | ## ---- fig.width = 6, fig.height=3, out.width="100%"--------------------------- 92 | 93 | DimHeatmap(object = pbmc, reduction = "harmony", cells = 500, dims = 1:3) 94 | 95 | ## ----------------------------------------------------------------------------- 96 | pbmc <- pbmc %>% 97 | FindNeighbors(reduction = "harmony") %>% 98 | FindClusters(resolution = 0.5) 99 | 100 | ## ---- fig.width=5, fig.height=2.5, fig.align="center", fig.cap="t-SNE Visualization of harmony embeddings"---- 101 | pbmc <- pbmc %>% 102 | RunTSNE(reduction = "harmony") 103 | 104 | 105 | p1 <- DimPlot(pbmc, reduction = "tsne", group.by = "stim", pt.size = .1) 106 | p2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = .1) 107 | plot_grid(p1, p2) 108 | 109 | 110 | ## ---- fig.width = 7, fig.height = 7, out.width="100%", fig.cap="Expression of gene panel heatmap in the harmonized PBMC dataset"---- 111 | FeaturePlot(object = pbmc, features= c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), 112 | min.cutoff = "q9", cols = c("lightgrey", "blue"), pt.size = 0.5) 113 | 114 | 115 | ## ---- fig.width=5, fig.height=2.5, fig.align="center", fig.cap="UMAP Visualization of harmony embeddings"---- 116 | pbmc <- pbmc %>% 117 | 
RunUMAP(reduction = "harmony", dims = 1:20) 118 | 119 | p1 <- DimPlot(pbmc, reduction = "umap", group.by = "stim", pt.size = .1) 120 | p2 <- DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = .1) 121 | plot_grid(p1, p2) 122 | 123 | 124 | ## ----------------------------------------------------------------------------- 125 | sessionInfo() 126 | 127 | -------------------------------------------------------------------------------- /doc/Seurat.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using harmony in Seurat" 3 | output: 4 | rmarkdown::html_vignette: 5 | code_folding: show 6 | vignette: > 7 | %\VignetteIndexEntry{Using harmony in Seurat} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | ```{r setup, message=FALSE, warning=FALSE} 20 | library(harmony) 21 | library(Seurat) 22 | library(dplyr) 23 | library(cowplot) 24 | 25 | ``` 26 | # Introduction 27 | 28 | This tutorial describes how to use harmony in Seurat v5 single-cell analysis workflows. `RunHarmony()` is a generic function designed to interact with Seurat objects. This vignette will walk through the basic workflow of Harmony with Seurat objects. Also, it will provide some basic downstream analyses demonstrating the properties of harmonized cell embeddings and a brief explanation of the exposed algorithm parameters. 29 | 30 | Install Harmony from CRAN with standard commands. 31 | 32 | ```{r eval=FALSE} 33 | install.packages('harmony') 34 | ``` 35 | 36 | # Generating the dataset 37 | 38 | For this demo, we will be aligning two groups of PBMCs [Kang et al., 2017](https://doi.org/10.1038/nbt.4042). In this experiment, PBMCs are in stimulated and control conditions. The stimulated PBMC group was treated with interferon beta.
39 | 40 | 41 | 42 | 43 | 44 | ## Generate SeuratObject 45 | 46 | ```{r} 47 | ## Source required data 48 | data("pbmc_stim") 49 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim, pbmc.ctrl), project = "PBMC", min.cells = 5) 50 | 51 | ## Separate conditions 52 | 53 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim)), rep("CTRL", ncol(pbmc.ctrl))) 54 | ``` 55 | 56 | 57 | ## (Optional) Download original data 58 | The example above contains only two thousand cells. The full [Kang et al., 2017](https://doi.org/10.1038/nbt.4042) dataset is deposited in the [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE96583). This analysis uses GSM2560248 and GSM2560249 samples from [GSE96583_RAW.tar](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file) file and the [GSE96583_batch2.genes.tsv.gz](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file&file=GSE96583%5Fbatch2%2Egenes%2Etsv%2Egz) gene file. 59 | 60 | ```{r eval = FALSE, class.source='fold-hide'} 61 | library(Matrix) 62 | ## Download and extract files from GEO 63 | ##setwd("/path/to/downloaded/files") 64 | genes = read.table("GSE96583_batch2.genes.tsv.gz", header = FALSE, sep = "\t") 65 | 66 | pbmc.ctrl.full = readMM("GSM2560248_2.1.mtx.gz") 67 | colnames(pbmc.ctrl.full) = paste0(read.table("GSM2560248_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-1") 68 | rownames(pbmc.ctrl.full) = genes$V1 69 | 70 | pbmc.stim.full = readMM("GSM2560249_2.2.mtx.gz") 71 | colnames(pbmc.stim.full) = paste0(read.table("GSM2560249_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-2") 72 | rownames(pbmc.stim.full) = genes$V1 73 | 74 | library(Seurat) 75 | 76 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim.full, pbmc.ctrl.full), project = "PBMC", min.cells = 5) 77 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim.full)), rep("CTRL", ncol(pbmc.ctrl.full))) 78 | ``` 79 | 80 | 81 | 82 | # Running Harmony 83 | 84 | Harmony works on an existing matrix with cell embeddings and
outputs its transformed version with the datasets aligned according to some user-defined experimental conditions. By default, harmony will look up the `pca` cell embeddings and use these to run harmony. Therefore, it assumes that the Seurat object has these embeddings already precomputed. 85 | 86 | ## Calculate PCA cell embeddings 87 | 88 | Here, using `Seurat::NormalizeData()`, we will be generating a union of highly variable genes using each condition (the control and stimulated cells). These features are going to be subsequently used to generate the 20 PCs with `Seurat::RunPCA()`. 89 | 90 | ```{r} 91 | pbmc <- pbmc %>% 92 | NormalizeData(verbose = FALSE) 93 | 94 | VariableFeatures(pbmc) <- split(row.names(pbmc@meta.data), pbmc@meta.data$stim) %>% lapply(function(cells_use) { 95 | pbmc[,cells_use] %>% 96 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 97 | VariableFeatures() 98 | }) %>% unlist %>% unique 99 | 100 | pbmc <- pbmc %>% 101 | ScaleData(verbose = FALSE) %>% 102 | RunPCA(features = VariableFeatures(pbmc), npcs = 20, verbose = FALSE) 103 | ``` 104 | 105 | ## Perform an integrated analysis 106 | 107 | To run harmony on Seurat object after it has been normalized, only one argument needs to be specified which contains the batch covariate located in the metadata. For this vignette, further parameters are specified to align the dataset but the minimum parameters are shown in the snippet below: 108 | 109 | ```{r, eval=FALSE} 110 | ## run harmony with default parameters 111 | pbmc <- pbmc %>% RunHarmony("stim") 112 | ## is equivalent to: 113 | pbmc <- RunHarmony(pbmc, "stim") 114 | ``` 115 | 116 | Here, we will be running harmony with some indicative parameters and plotting the convergence plot to illustrate some of the under the hood functionality. 
117 | 118 | ```{r, fig.width = 4, fig.height = 3, fig.align = "center", out.width="50%", fig.cap="By setting `plot_convergence=TRUE`, harmony will generate a plot with its objective showing the flow of the integration. Each point represents the cost measured after a clustering round. Different colors represent different Harmony iterations, the number of which is controlled by `max_iter` (assuming that early_stop=FALSE). Here `max_iter=10` and up to 10 correction steps are expected. However, `early_stop=TRUE` so harmony will stop after the cost plateaus."} 119 | 120 | pbmc <- pbmc %>% 121 | RunHarmony("stim", plot_convergence = TRUE, nclust = 50, max_iter = 10, early_stop = T) 122 | ``` 123 | 124 | 125 | 126 | ### Harmony API parameters on Seurat objects 127 | 128 | `RunHarmony` has several parameters accessible to users which are outlined below. 129 | 130 | #### `object` (required) 131 | 132 | The Seurat object. This vignette assumes Seurat objects are version 5. 133 | 134 | #### `group.by.vars` (required) 135 | 136 | A character vector that specifies all the experimental covariates to be corrected/harmonized by the algorithm. 137 | 138 | When using `RunHarmony()` with Seurat, harmony will look up the `group.by.vars` metadata fields in the Seurat Object metadata. 139 | 140 | For example, given that `pbmc[["stim"]]` exists as the stim condition, setting `group.by.vars="stim"` will perform integration of these samples accordingly. If you want to integrate on another variable, it needs to be present in the Seurat object's meta.data. 141 | 142 | To correct for several covariates, specify them in a vector: `group.by.vars = c("stim", "new_covariate")`. 143 | 144 | #### `reduction.use` 145 | 146 | The cell embeddings to be used for the batch alignment. This parameter assumes that a reduced dimension already exists in the reduction slot of the Seurat object. By default, the `pca` reduction is used.
147 | 148 | 149 | #### `dims.use` 150 | 151 | Optional parameter which can use a name vector to select specific dimensions to be harmonized. 152 | 153 | 154 | ### Algorithm parameters 155 | ![Harmony Algorithm Overview](main.jpg){width=100%} 156 | 157 | #### `nclust` 158 | 159 | is a positive integer. Under the hood, harmony applies k-means soft-clustering. For this task, `k` needs to be determined. `nclust` corresponds to `k`. The harmonization results and performance are not particularly sensitive to this parameter within a reasonable range of values. If this parameter is not set, harmony will autodetermine this based on the dataset size with a maximum cap of 100. For datasets with a large number of different cell types and batches this parameter may need to be determined manually. 160 | 161 | #### `sigma` 162 | 163 | a positive scalar that controls the soft clustering probability assignment of single-cells to different clusters. Larger values will assign a larger probability to distant clusters of cells resulting in a different correction profile. Single-cells are assigned to clusters by their Euclidean distance $d$ to some cluster center $Y$ after cosine normalization which is defined in the range [0,4]. The clustering probability of each cell is calculated as $e^{-\frac{d}{\sigma}}$ where $\sigma$ is controlled by the `sigma` parameter. Default value of `sigma` is 0.1 and it generally works well since it defines probability assignment of a cell in the range $[e^{-40}, e^0]$. Larger values of `sigma` restrict the dynamic range of probabilities that can be assigned to cells. For example, `sigma=1` will yield probabilities in the range of $[e^{-4}, e^0]$. 164 | 165 | 166 | #### `theta` 167 | 168 | `theta` is a positive scalar vector that determines the coefficient of harmony's diversity penalty for each corrected experimental covariate. In challenging experimental conditions, increasing theta may result in better integration results.
Theta is an exponential parameter of the diversity penalty, thus setting `theta=0` disables this penalty while increasing it to values greater than 1 will perform more aggressive corrections in an exponential manner. By default, it will set `theta=2` for each experimental covariate. 169 | 170 | #### `max_iter` 171 | 172 | The number of correction steps harmony will perform before completing the data set integration. In general, more iterations than necessary increase computational runtime, which becomes especially evident in bigger datasets. Setting `early_stop=TRUE` may reduce the actual number of correction steps which will be smaller than `max_iter`. 173 | 174 | #### `early_stop` 175 | 176 | Under the hood, harmony minimizes its objective function through a series of clustering and integration tests. By setting `early_stop=TRUE`, when the objective function is less than `1e-4` after a correction step harmony exits before reaching the `max_iter` correction steps. This parameter can drastically reduce run-time in bigger datasets. 177 | 178 | #### `.options` 179 | A set of internal algorithm parameters that can be overridden. For advanced users only. 180 | 181 | 182 | 183 | ### Seurat specific parameters 184 | 185 | These parameters are Seurat-specific and do not affect the flow of the algorithm. 186 | 187 | #### `project_dim` 188 | 189 | Toggle-like parameter, by default `project_dim=TRUE`. When enabled, `RunHarmony()` calculates genomic feature loadings using Seurat's `ProjectDim()` that correspond to the harmonized cell embeddings. 190 | 191 | #### `reduction.save` 192 | 193 | The new Reduced Dimension slot identifier. By default, `reduction.save=TRUE`. This option allows several independent runs of harmony to be retained in the appropriate slots in the SeuratObjects. It is useful if you want to try Harmony with multiple parameters and save them as e.g. 'harmony_theta0', 'harmony_theta1', 'harmony_theta2'.
194 | 195 | ### Miscellaneous parameters 196 | 197 | These parameters help users troubleshoot harmony. 198 | 199 | #### `plot_convergence` 200 | 201 | Option that plots the convergence plot after the execution of the algorithm. By default `FALSE`. Setting it to `TRUE` will collect harmony's objective value and plot it allowing the user to troubleshoot the flow of the algorithm and fine-tune the parameters of the dataset integration procedure. 202 | 203 | 204 | 205 | ### Accessing the data 206 | 207 | `RunHarmony()` returns the Seurat object which contains the harmonized cell embeddings in a slot named **harmony**. This entry can be accessed via `pbmc@reductions$harmony`. To access the values of the cell embeddings we can also use: 208 | 209 | ```{r} 210 | harmony.embeddings <- Embeddings(pbmc, reduction = "harmony") 211 | ``` 212 | 213 | ### Inspection of the modalities 214 | 215 | After Harmony integration, we should inspect the quality of the harmonization and contrast it with the unharmonized algorithm input. Ideally, cells from different conditions will align along the Harmonized PCs. If they are not, you could increase the *theta* value above to force a more aggressive fit of the dataset and rerun the workflow. 
216 | 217 | ```{r, fig.width=7, fig.height=3, out.width="100%", fig.align="center", fig.cap="Evaluate harmonization of stim parameter in the harmony generated cell embeddings"} 218 | 219 | p1 <- DimPlot(object = pbmc, reduction = "harmony", pt.size = .1, group.by = "stim") 220 | p2 <- VlnPlot(object = pbmc, features = "harmony_1", group.by = "stim", pt.size = .1) 221 | plot_grid(p1,p2) 222 | ``` 223 | 224 | Plot Genes correlated with the Harmonized PCs 225 | 226 | ```{r, fig.width = 6, fig.height=3, out.width="100%"} 227 | 228 | DimHeatmap(object = pbmc, reduction = "harmony", cells = 500, dims = 1:3) 229 | ``` 230 | 231 | # Using harmony embeddings for dimensionality reduction in Seurat 232 | 233 | The harmonized cell embeddings generated by harmony can be used for further integrated analyses. In this workflow, the Seurat object contains the harmony `reduction` modality name in the method that requires it. 234 | 235 | ## Perform clustering using the harmonized vectors of cells 236 | ```{r} 237 | pbmc <- pbmc %>% 238 | FindNeighbors(reduction = "harmony") %>% 239 | FindClusters(resolution = 0.5) 240 | ``` 241 | ## TSNE dimensionality reduction 242 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="t-SNE Visualization of harmony embeddings"} 243 | pbmc <- pbmc %>% 244 | RunTSNE(reduction = "harmony") 245 | 246 | 247 | p1 <- DimPlot(pbmc, reduction = "tsne", group.by = "stim", pt.size = .1) 248 | p2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = .1) 249 | plot_grid(p1, p2) 250 | 251 | ``` 252 | 253 | One important observation is to assess that the harmonized data contain biological states of the cells. Therefore by checking the following genes we can see that biological cell states are preserved after harmonization. 
254 | 255 | ```{r, fig.width = 7, fig.height = 7, out.width="100%", fig.cap="Expression of gene panel heatmap in the harmonized PBMC dataset"} 256 | FeaturePlot(object = pbmc, features= c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), 257 | min.cutoff = "q9", cols = c("lightgrey", "blue"), pt.size = 0.5) 258 | 259 | ``` 260 | 261 | ## UMAP 262 | 263 | Very similarly with TSNE we can run UMAP by passing the harmony reduction in the function. 264 | 265 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="UMAP Visualization of harmony embeddings"} 266 | pbmc <- pbmc %>% 267 | RunUMAP(reduction = "harmony", dims = 1:20) 268 | 269 | p1 <- DimPlot(pbmc, reduction = "umap", group.by = "stim", pt.size = .1) 270 | p2 <- DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = .1) 271 | plot_grid(p1, p2) 272 | 273 | ``` 274 | 275 | 276 | ```{r} 277 | sessionInfo() 278 | ``` 279 | 280 | -------------------------------------------------------------------------------- /doc/detailedWalkthrough.R: -------------------------------------------------------------------------------- 1 | ## ---- message=FALSE, warning=FALSE, class.source = 'fold-hide'---------------- 2 | 3 | ## Source required libraries 4 | library(data.table) 5 | library(tidyverse) 6 | library(ggthemes) 7 | library(ggrepel) 8 | library(harmony) 9 | library(patchwork) 10 | library(tidyr) 11 | 12 | ## Useful util functions 13 | 14 | cosine_normalize <- function(X, margin) { 15 | if (margin == 1) { 16 | res <- sweep(as.matrix(X), 1, sqrt(rowSums(X ^ 2)), '/') 17 | row.names(res) <- row.names(X) 18 | colnames(res) <- colnames(X) 19 | } else { 20 | res <- sweep(as.matrix(X), 2, sqrt(colSums(X ^ 2)), '/') 21 | row.names(res) <- row.names(X) 22 | colnames(res) <- colnames(X) 23 | } 24 | return(res) 25 | } 26 | 27 | onehot <- function(vals) { 28 | t(model.matrix(~0 + as.factor(vals))) 29 | } 30 | 31 | 32 | colors_use <- c(`jurkat` = rgb(129, 15, 124, maxColorValue=255), 33 | `t293` = 
rgb(208, 158, 45, maxColorValue=255), 34 | `half` = rgb(0, 109, 44, maxColorValue=255)) 35 | 36 | ## Scatter plot of the first two columns of umap_use, colored by meta_data[[label_name]]; returns a ggplot object 37 | do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names, 38 | palette_use = colors_use, 39 | pt_size = 4, point_size = .5, base_size = 10, do_points = TRUE, do_density = FALSE, h = 4, w = 8) { 40 | umap_use <- umap_use[, 1:2] 41 | colnames(umap_use) <- c('X1', 'X2') 42 | plt_df <- umap_use %>% data.frame() %>% 43 | cbind(meta_data) %>% 44 | dplyr::sample_frac(1L) 45 | plt_df$given_name <- plt_df[[label_name]] 46 | 47 | if (!missing(nice_names)) { 48 | plt_df %<>% 49 | dplyr::inner_join(nice_names, by = "given_name") %>% 50 | subset(nice_name != "" & !is.na(nice_name)) 51 | 52 | plt_df[[label_name]] <- plt_df$nice_name 53 | } 54 | 55 | plt <- plt_df %>% 56 | ggplot(aes(X1, X2, colour = .data[[label_name]], fill = .data[[label_name]])) + 57 | theme_tufte(base_size = base_size) + 58 | theme(panel.background = element_rect(fill = NA, color = "black")) + 59 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = "none") + 60 | scale_color_manual(values = palette_use) + 61 | scale_fill_manual(values = palette_use) + 62 | theme(plot.title = element_text(hjust = .5)) + 63 | labs(x = "UMAP 1", y = "UMAP 2") 64 | 65 | if (do_points) 66 | plt <- plt + geom_point(size = 0.2) 67 | if (do_density) 68 | plt <- plt + geom_density_2d() 69 | 70 | 71 | if (no_guides) 72 | plt <- plt + theme(legend.position = "none") ## hide every legend; the previous unnamed guides("none") call did not target any scale 73 | 74 | if (do_labels) 75 | plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA, 76 | aes(label = .data[[label_name]]), color = "white", size = pt_size, alpha = 1, segment.size = 0) + 77 | guides(color = "none", fill = "none") 78 | return(plt) 79 | } 80 | 81 | 82 | ## ----------------------------------------------------------------------------- 83 | data(cell_lines) 84 | V <- cell_lines$scaled_pcs 85 | V_cos <- 
cosine_normalize(V, 1) 86 | meta_data <- cell_lines$meta_data 87 | 88 | ## ---- warning=FALSE, fig.width=5, fig.height=3, fig.align="center"------------ 89 | do_scatter(V, meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 90 | labs(title = 'Colored by dataset', x = 'PC1', y = 'PC2') + 91 | do_scatter(V, meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 92 | labs(title = 'Colored by cell type', x = 'PC1', y = 'PC2') + 93 | NULL 94 | 95 | ## ----------------------------------------------------------------------------- 96 | 97 | set.seed(1) 98 | harmonyObj <- harmony::RunHarmony( 99 | data_mat = V, ## PCA embedding matrix of cells 100 | meta_data = meta_data, ## dataframe with cell labels 101 | theta = 1, ## cluster diversity enforcement 102 | vars_use = 'dataset', ## variable to integrate out 103 | nclust = 5, ## number of clusters in Harmony model 104 | max_iter = 0, ## stop after initialization 105 | return_object = TRUE ## return the full Harmony model object 106 | ) 107 | 108 | 109 | 110 | ## ---- fig.width=5, fig.height=3, fig.align="center"--------------------------- 111 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 112 | labs(title = 'Z_orig', subtitle = 'Euclidean distance', x = 'PC1', y = 'PC2') + 113 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 114 | labs(title = 'Z_cos', subtitle = 'Induced Cosine distance', x = 'PC1', y = 'PC2') 115 | 116 | 117 | ## ---- fig.width=8, fig.height=3, out.width="100%"----------------------------- 118 | 119 | harmonyObj$Z_cos %>% t %>% data.frame() %>% 120 | cbind(meta_data) %>% 121 | tidyr::gather(key, val, X1:X20) %>% 122 | ggplot(aes(reorder(gsub('X', 'PC', key), as.integer(gsub('X', '', key))), val)) + 123 | geom_boxplot(aes(color = dataset)) + 124 | scale_color_manual(values = colors_use) + 125 | labs(x = 'PC number', y = 'PC embedding value', title = 'Z_cos (unit scaled PCA embeddings) for all 20 PCs') 
+ 126 | theme_tufte(base_size = 10) + geom_rangeframe() + 127 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 128 | 129 | ## ---- fig.width=4, fig.height=3, fig.align="center"--------------------------- 130 | 131 | cluster_centroids <- harmonyObj$Y 132 | 133 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = FALSE, do_labels = FALSE) + 134 | labs(title = 'Initial kmeans cluster centroids', subtitle = '', x = 'PC1', y = 'PC2') + 135 | geom_point( 136 | data = data.frame(t(cluster_centroids)), 137 | color = 'black', fill = 'black', alpha = .8, 138 | shape = 21, size = 6 139 | ) + 140 | NULL 141 | 142 | 143 | ## ----------------------------------------------------------------------------- 144 | cluster_assignment_matrix <- harmonyObj$R 145 | 146 | 147 | ## ---- fig.height=5, fig.width=5----------------------------------------------- 148 | t(harmonyObj$Z_cos) %>% data.frame() %>% 149 | cbind(meta_data) %>% 150 | tibble::rowid_to_column('id') %>% 151 | dplyr::inner_join( 152 | cluster_assignment_matrix %>% t() %>% data.table() %>% 153 | tibble::rowid_to_column('id') %>% 154 | tidyr::gather(cluster, r, -id) %>% 155 | dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 156 | by = 'id' 157 | ) %>% 158 | dplyr::sample_frac(1L) %>% 159 | ggplot(aes(X1, X2, color = r)) + 160 | geom_point(size=0.2) + 161 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 162 | facet_grid(cluster ~ dataset) + 163 | scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 164 | labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'Initial probabilistic cluster assignments') 165 | 166 | ## ----------------------------------------------------------------------------- 167 | observed_counts <- with(harmonyObj, R %*% t(as.matrix(Phi))) 168 | round(observed_counts) 169 | 170 | 171 | 172 | ## ----------------------------------------------------------------------------- 173 | ## observed counts 174 | round(harmonyObj$O) 175 | 176 | ## 
expected counts 177 | round(harmonyObj$E) 178 | 179 | 180 | ## ----------------------------------------------------------------------------- 181 | phi_celltype <- onehot(meta_data$cell_type) 182 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype) 183 | round(observed_cell_counts) 184 | 185 | 186 | ## ----------------------------------------------------------------------------- 187 | harmonyObj$max_iter_kmeans 188 | 189 | ## ----------------------------------------------------------------------------- 190 | ## we can specify how many rounds of clustering to do 191 | harmonyObj$max_iter_kmeans <- 10 192 | harmonyObj$cluster_cpp() 193 | 194 | ## ----------------------------------------------------------------------------- 195 | round(harmonyObj$O) 196 | 197 | ## ---- fig.height=5, fig.width=5----------------------------------------------- 198 | new_cluster_assignment_matrix <- harmonyObj$R 199 | 200 | t(harmonyObj$Z_cos) %>% data.frame() %>% 201 | cbind(meta_data) %>% 202 | tibble::rowid_to_column('id') %>% 203 | dplyr::inner_join( 204 | new_cluster_assignment_matrix %>% t() %>% data.table() %>% 205 | tibble::rowid_to_column('id') %>% 206 | tidyr::gather(cluster, r, -id) %>% 207 | dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 208 | by = 'id' 209 | ) %>% 210 | dplyr::sample_frac(1L) %>% 211 | ggplot(aes(X1, X2, color = r)) + 212 | geom_point(shape = '.') + 213 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 214 | facet_grid(cluster ~ dataset) + 215 | scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 216 | labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'New probabilistic cluster assignments') 217 | 218 | ## ----------------------------------------------------------------------------- 219 | phi_celltype <- onehot(meta_data$cell_type) 220 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype) 221 | round(observed_cell_counts) 222 | 223 | ## 
----------------------------------------------------------------------------- 224 | round(apply(prop.table(observed_cell_counts, 1), 1, min) * 100, 3) 225 | 226 | ## ----------------------------------------------------------------------------- 227 | 228 | with(harmonyObj, { 229 | distance_matrix <- 2 * (1 - t(Y) %*% Z_cos) 230 | distance_score <- exp(-distance_matrix / as.numeric(sigma)) 231 | diversity_score <- sweep(E / O, 2, theta, '/') %*% as.matrix(Phi) 232 | ## new assignments are based on distance and diversity 233 | R_new <- distance_score * diversity_score 234 | ## normalize R so each cell sums to 1 235 | R_new <- prop.table(R_new, 2) 236 | }) 237 | 238 | 239 | ## ----------------------------------------------------------------------------- 240 | ## with theta = 0 241 | with(harmonyObj, { 242 | (E / O) ^ 0 243 | }) 244 | 245 | ## ----------------------------------------------------------------------------- 246 | ## with theta = 1 247 | with(harmonyObj, { 248 | round((E / O) ^ 1, 2) 249 | }) 250 | 251 | 252 | ## ----------------------------------------------------------------------------- 253 | ## as theta approach infinity 254 | with(harmonyObj, { 255 | round((E / O) ^ 1e6, 2) 256 | }) 257 | 258 | 259 | ## ----------------------------------------------------------------------------- 260 | Y_unscaled <- with(harmonyObj, Z_cos %*% t(R)) 261 | 262 | ## ----------------------------------------------------------------------------- 263 | Y_new <- cosine_normalize(Y_unscaled, 2) 264 | 265 | ## ----------------------------------------------------------------------------- 266 | harmonyObj$moe_correct_ridge_cpp() 267 | 268 | ## ---- fig.width=5, fig.height=3, fig.align="center"--------------------------- 269 | 270 | do_scatter(cosine_normalize(t(harmonyObj$Z_orig), 1), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 271 | labs(title = 'Z_cos before MoE', x = 'PC1', y = 'PC2') + 272 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, 
do_labels = TRUE) + 273 | labs(title = 'Z_cos after MoE', x = 'PC1', y = 'PC2') 274 | 275 | ## ---- fig.width=8, fig.height=3, fig.align="center", out.width="100%"--------- 276 | 277 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 278 | labs(title = 'Z_orig', subtitle = 'Original PCA embeddings', x = 'PC1', y = 'PC2') + 279 | do_scatter(t(harmonyObj$Z_corr), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 280 | labs(title = 'Z_corr', subtitle = '= Z_orig - correction_factors', x = 'PC1', y = 'PC2') + 281 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 282 | labs(title = 'Z_cos', subtitle = '= Unit_scaled(Z_corr)', x = 'Scaled PC1', y = 'Scaled PC2') + 283 | NULL 284 | 285 | ## ---- fig.width=5, fig.height=3, fig.align="center"--------------------------- 286 | 287 | plt <- data.table(PC1_After = harmonyObj$Z_corr[1, ], PC1_Before = harmonyObj$Z_orig[1, ]) %>% 288 | cbind(meta_data) %>% 289 | dplyr::sample_frac(1L) %>% 290 | ggplot(aes(PC1_Before, PC1_After)) + 291 | geom_abline(slope = 1, intercept = 0) + 292 | theme_tufte(base_size = 10) + geom_rangeframe() + 293 | scale_color_tableau() + 294 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) + 295 | NULL 296 | 297 | plt + geom_point(shape = '.', aes(color = dataset)) + 298 | labs(x = 'PC1 before correction', y = 'PC1 after correction', 299 | title = 'PC1 correction for each cell', subtitle = 'Colored by Dataset') + 300 | plt + geom_point(shape = '.', aes(color = cell_type)) + 301 | labs(x = 'PC1 before correction', y = 'PC1 after correction', 302 | title = 'PC1 correction for each cell', subtitle = 'Colored by Cell Type') + 303 | NULL 304 | 305 | 306 | ## ---- echo=TRUE--------------------------------------------------------------- 307 | 308 | W <- list() 309 | ## Convert sparse data structures to dense matrix 310 | Phi.moe <- as.matrix(harmonyObj$Phi_moe) 311 | 
lambda <- diag(c(harmonyObj$lambda)) 312 | ## Get beta coefficients for all the clusters (one ridge-regularized solve per cluster k) 313 | for (k in seq_len(harmonyObj$K)) { 314 | W[[k]] <- solve(Phi.moe %*% diag(harmonyObj$R[k, ]) %*% t(Phi.moe) + lambda) %*% (Phi.moe %*% diag(harmonyObj$R[k, ])) %*% t(harmonyObj$Z_orig) 315 | } 316 | 317 | 318 | 319 | ## ---- fig.width=5, fig.height=5----------------------------------------------- 320 | 321 | cluster_assignment_matrix <- harmonyObj$R 322 | 323 | t(harmonyObj$Z_orig) %>% data.frame() %>% 324 | cbind(meta_data) %>% 325 | tibble::rowid_to_column('id') %>% 326 | dplyr::inner_join( 327 | cluster_assignment_matrix %>% t() %>% data.table() %>% 328 | tibble::rowid_to_column('id') %>% 329 | tidyr::gather(cluster, r, -id) %>% 330 | dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 331 | by = 'id' 332 | ) %>% 333 | dplyr::sample_frac(1L) %>% 334 | ggplot(aes(X1, X2, color = r)) + 335 | geom_point(size = 0.2) + 336 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 337 | facet_grid(cluster ~ dataset) + 338 | scale_color_gradient(low = 'grey', breaks = seq(0, 1, .2)) + 339 | labs(x = 'PC1', y = 'PC2', title = 'Cluster assigned in original PCA space (Z_orig)') 340 | 341 | 342 | ## ----------------------------------------------------------------------------- 343 | plt_list <- lapply(1:harmonyObj$K, function(k) { 344 | plt_df <- W[[k]] %>% data.frame() %>% 345 | dplyr::select(X1, X2) 346 | ## Append n 347 | plt_df <- plt_df %>% 348 | cbind( 349 | data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 350 | dplyr::rename(x0 = X1, y0 = X2) 351 | ) %>% 352 | cbind(type = c('intercept', unique(meta_data$dataset))) 353 | plt <- plt_df %>% 354 | ggplot() + 355 | geom_point(aes(X1, X2), 356 | data = t(harmonyObj$Z_orig) %>% data.frame(), 357 | size = 0.5, 358 | color = 'grey' 359 | ) + 360 | geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 361 | scale_color_manual(values = c('intercept' 
= 'black', colors_use)) + 362 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 363 | labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k)) 364 | plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16))) 365 | # if (k == harmonyObj$K) { 366 | # } else { 367 | # plt <- plt + guides(color = FALSE) 368 | # } 369 | plt 370 | }) 371 | 372 | 373 | 374 | ## ---- fig.height=6, fig.width=6----------------------------------------------- 375 | Reduce(`+`, plt_list) + 376 | patchwork::plot_annotation(title = 'Mixture of experts beta terms before correction (Z_orig)') + 377 | plot_layout(ncol = 2) 378 | 379 | ## ---- fig.width=4, fig.height=3, fig.align="center"--------------------------- 380 | 381 | plt_list <- lapply(1:harmonyObj$K, function(k) { 382 | plt_df <- W[[k]] %>% data.frame() %>% 383 | dplyr::select(X1, X2) 384 | 385 | plt_df <- plt_df %>% 386 | cbind( 387 | data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 388 | dplyr::rename(x0 = X1, y0 = X2) 389 | ) %>% 390 | cbind(type = c('intercept', unique(meta_data$dataset))) 391 | 392 | plt <- plt_df %>% 393 | ggplot() + 394 | geom_point(aes(X1, X2), 395 | data = t(harmonyObj$Z_corr) %>% data.frame(), 396 | shape = '.', 397 | color = 'grey' 398 | ) + 399 | geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 400 | scale_color_manual(values = c('intercept' = 'black', colors_use)) + 401 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 402 | labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k)) 403 | plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16))) 404 | plt 405 | }) 406 | 407 | 408 | 409 | ## ---- fig.height=6, fig.width=6----------------------------------------------- 410 | Reduce(`+`, plt_list) + 411 | patchwork::plot_annotation(title = 'Mixture of experts beta terms after correction (Z_corr)') + 412 
| plot_layout(ncol = 2) 413 | 414 | ## ---- echo=TRUE--------------------------------------------------------------- 415 | 416 | Z_i <- harmonyObj$Z_orig[, 5] 417 | Z_i_pred <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) { 418 | W[[k]] * harmonyObj$Phi_moe[, 5] * harmonyObj$R[k, 5] 419 | })) %>% colSums 420 | 421 | 422 | 423 | ## ---- fig.width=4, fig.height=3, fig.align="center"--------------------------- 424 | data.table(obs = Z_i, pred = Z_i_pred) %>% 425 | tibble::rowid_to_column('PC') %>% 426 | ggplot(aes(obs, pred)) + 427 | geom_point(shape = 21) + 428 | geom_label_repel(aes(label = PC)) + 429 | geom_abline(slope = 1, intercept = 0) + 430 | theme_tufte() + geom_rangeframe() + 431 | labs(x = 'Observed PC score', y = 'Predicted PC score', title = 'Observed and predicted values of PC scores\nfor cell 5') + 432 | NULL 433 | 434 | ## ----------------------------------------------------------------------------- 435 | delta <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) { 436 | W[[k]][2:4, ] * harmonyObj$Phi[, 5] * harmonyObj$R[k, 5] 437 | })) %>% colSums 438 | 439 | Z_corrected <- harmonyObj$Z_orig[, 5] - delta 440 | 441 | 442 | ## ---- fig.width=3, fig.height=3, fig.align="center"--------------------------- 443 | 444 | 445 | harmonyObj$Z_orig %>% t %>% data.frame() %>% 446 | ggplot(aes(X1, X2)) + 447 | geom_point(shape = '.') + 448 | geom_point( 449 | data = data.frame(t(harmonyObj$Z_orig[, 5, drop = FALSE])), 450 | color = 'red' 451 | ) + 452 | geom_segment( 453 | data = data.table(x0 = harmonyObj$Z_orig[1, 5], 454 | y0 = harmonyObj$Z_orig[2, 5], 455 | x1 = Z_corrected[1], 456 | y1 = Z_corrected[2]), 457 | aes(x = x0, y = y0, xend = x1, yend = y1), 458 | linewidth = 1, 459 | color = 'red', 460 | arrow = arrow(length = unit(0.05, "npc"), type = 'closed') 461 | ) + 462 | theme_tufte(base_size = 10) + geom_rangeframe() + 463 | labs(x = 'PC1', y = 'PC2', title = 'Correction of cell #5') 464 | 465 | 466 | ## 
----------------------------------------------------------------------------- 467 | 468 | harmonyObj <- RunHarmony( 469 | data_mat = V, ## PCA embedding matrix of cells 470 | meta_data = meta_data, ## dataframe with cell labels 471 | theta = 1, ## cluster diversity enforcement 472 | vars_use = 'dataset', ## (list of) variable(s) we'd like to Harmonize out 473 | nclust = 50, ## number of clusters in Harmony model 474 | max_iter = 0, ## don't actually run Harmony, stop after initialization 475 | return_object = TRUE ## return the full Harmony model object, not just the corrected PCA matrix 476 | ) 477 | 478 | 479 | ## ---- message=FALSE, fig.width=5, fig.height=3, fig.align="center"------------ 480 | 481 | i <- 0 482 | 483 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 484 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') + 485 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 486 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') + 487 | NULL 488 | 489 | ## ---- fig.width=5, fig.height=3, fig.align="center", message=FALSE------------ 490 | 491 | for (i in 1:2) { 492 | harmony:::harmonize(harmonyObj, 1) 493 | plt <- do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 494 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') + 495 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 496 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') + 497 | NULL 498 | plot(plt) 499 | } 500 | 501 | 502 | ## ----------------------------------------------------------------------------- 503 | sessionInfo() 504 | 505 | 
-------------------------------------------------------------------------------- /doc/detailedWalkthrough.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Detailed Walkthrough of Harmony Algorithm" 3 | author: "Korsunsky et al.: Fast, sensitive, and accurate integration of single cell data with Harmony" 4 | vignette: > 5 | %\VignetteIndexEntry{Detailed Walkthrough of Harmony Algorithm} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | output: rmarkdown::html_vignette 9 | --- 10 | 11 | # Motivation 12 | 13 | This notebook breaks down the Harmony algorithm and model in the context of a simple real-world dataset. 14 | 15 | After reading this, the user should have a better understanding of how 16 | 17 | 1. the equations connect to the algorithm 18 | 2. the algorithm works on real data 19 | 3. to access the different parts of the Harmony model from R 20 | 21 | 22 | # Prerequisites 23 | 24 | For this vignette we are going to use harmony among other tools that will help with the visualization and inspection of the algorithm intermediate states. Also, we provide a few helper functions that can be found in the source block below. 
25 | 26 | ```{r, message=FALSE, warning=FALSE, class.source = 'fold-hide'} 27 | 28 | ## Source required libraries 29 | library(data.table) 30 | library(tidyverse) 31 | library(ggthemes) 32 | library(ggrepel) 33 | library(harmony) 34 | library(patchwork) 35 | library(tidyr) 36 | 37 | ## Useful util functions 38 | 39 | cosine_normalize <- function(X, margin) { 40 | if (margin == 1) { 41 | res <- sweep(as.matrix(X), 1, sqrt(rowSums(X ^ 2)), '/') 42 | row.names(res) <- row.names(X) 43 | colnames(res) <- colnames(X) 44 | } else { 45 | res <- sweep(as.matrix(X), 2, sqrt(colSums(X ^ 2)), '/') 46 | row.names(res) <- row.names(X) 47 | colnames(res) <- colnames(X) 48 | } 49 | return(res) 50 | } 51 | 52 | onehot <- function(vals) { 53 | t(model.matrix(~0 + as.factor(vals))) 54 | } 55 | 56 | 57 | colors_use <- c(`jurkat` = rgb(129, 15, 124, maxColorValue=255), 58 | `t293` = rgb(208, 158, 45, maxColorValue=255), 59 | `half` = rgb(0, 109, 44, maxColorValue=255)) 60 | 61 | 62 | do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names, 63 | palette_use = colors_use, 64 | pt_size = 4, point_size = .5, base_size = 10, do_points = TRUE, do_density = FALSE, h = 4, w = 8) { 65 | umap_use <- umap_use[, 1:2] 66 | colnames(umap_use) <- c('X1', 'X2') 67 | plt_df <- umap_use %>% data.frame() %>% 68 | cbind(meta_data) %>% 69 | dplyr::sample_frac(1L) 70 | plt_df$given_name <- plt_df[[label_name]] 71 | 72 | if (!missing(nice_names)) { 73 | plt_df %<>% 74 | dplyr::inner_join(nice_names, by = "given_name") %>% 75 | subset(nice_name != "" & !is.na(nice_name)) 76 | 77 | plt_df[[label_name]] <- plt_df$nice_name 78 | } 79 | 80 | plt <- plt_df %>% 81 | ggplot(aes(X1, X2, colour = .data[[label_name]], fill = .data[[label_name]])) + 82 | theme_tufte(base_size = base_size) + 83 | theme(panel.background = element_rect(fill = NA, color = "black")) + 84 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = 
FALSE) + 85 | scale_color_manual(values = palette_use) + 86 | scale_fill_manual(values = palette_use) + 87 | theme(plot.title = element_text(hjust = .5)) + 88 | labs(x = "UMAP 1", y = "UMAP 2") 89 | 90 | if (do_points) 91 | plt <- plt + geom_point(size = 0.2) 92 | if (do_density) 93 | plt <- plt + geom_density_2d() 94 | 95 | 96 | if (no_guides) 97 | plt <- plt + guides("none") 98 | 99 | if (do_labels) 100 | plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA, 101 | aes(label = .data[[label_name]]), color = "white", size = pt_size, alpha = 1, segment.size = 0) + 102 | guides(col = FALSE, fill = FALSE) 103 | return(plt) 104 | } 105 | 106 | ``` 107 | 108 | 109 | 110 | # Cell line data 111 | 112 | This dataset is described in figure 2 of the Harmony manuscript. We downloaded 3 cell line datasets from the 10X website. The first two (jurkat and 293t) come from pure cell lines while the *half* dataset is a 50:50 mixture of Jurkat and HEK293T cells. We inferred cell type with the canonical marker XIST, since the two cell lines come from 1 male and 1 female donor. 113 | 114 | * https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat 115 | * https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t 116 | * https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat:293t_50:50 117 | 118 | We library normalized the cells, log transformed the counts, and scaled the genes. Then we performed PCA and kept the top 20 PCs. We begin the analysis in this notebook from here. 119 | 120 | 121 | ```{r} 122 | data(cell_lines) 123 | V <- cell_lines$scaled_pcs 124 | V_cos <- cosine_normalize(V, 1) 125 | meta_data <- cell_lines$meta_data 126 | ``` 127 | 128 | To get a feel for the data, let's visualize the cells in PCA space. The plots below show the cells' PC1 and PC2 embeddings. We color the cells by dataset of origin (left) and cell type (right). 
129 | 130 | 131 | ```{r, warning=FALSE, fig.width=5, fig.height=3, fig.align="center"} 132 | do_scatter(V, meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 133 | labs(title = 'Colored by dataset', x = 'PC1', y = 'PC2') + 134 | do_scatter(V, meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 135 | labs(title = 'Colored by cell type', x = 'PC1', y = 'PC2') + 136 | NULL 137 | ``` 138 | 139 | 140 | 141 | # Initialize a Harmony object 142 | 143 | The first thing we do is initialize a Harmony object. We pass 2 data structures: 144 | 145 | 1. V: the PCA embedding matrix of cells. 146 | 2. meta_data: a dataframe object containing the variables we'd like to Harmonize over. 147 | 148 | The rest of the parameters are described below. A few notes: 149 | 150 | * *nclust* in the R code below corresponds to the parameter *K* in the manuscript. 151 | * we set *max_iter* to 0 because in this tutorial, we don't want to actually run Harmony just yet. 152 | * setting *return_object* to *TRUE* means that *harmonyObj* below is not a corrected PCA embeddings matrix. Instead, it is the full Harmony model object. We'll have a closer look into the different pieces of this object as we go! 153 | 154 | 155 | ```{r} 156 | 157 | set.seed(1) 158 | harmonyObj <- harmony::RunHarmony( 159 | data_mat = V, ## PCA embedding matrix of cells 160 | meta_data = meta_data, ## dataframe with cell labels 161 | theta = 1, ## cluster diversity enforcement 162 | vars_use = 'dataset', ## variable to integrate out 163 | nclust = 5, ## number of clusters in Harmony model 164 | max_iter = 0, ## stop after initialization 165 | return_object = TRUE ## return the full Harmony model object 166 | ) 167 | 168 | 169 | ``` 170 | 171 | By initializing the object, we have prepared the data in 2 ways. First, we've scaled the PCA matrix to give each cell unit length. Second, we've initialized cluster centroids with regular kmeans clustering on these scaled data. We'll dig into these two steps below. 
172 | 173 | ## L_2 scaling to induce cosine distance 174 | 175 | A key preprocessing step of Harmony clustering is L2 normalization. As shown in Haghverdi et al 2018, scaling each cell to have L2 norm equal to 1 induces a special property: Euclidean distance of the scaled cells is equivalent to cosine distance of the unscaled cells. Cosine distance is a considerably more robust measure of cell-to-cell similarity (CITE Martin and Vlad). Moreover, it has been used in clustering analysis of high dimensional text datasets (CITE NLP spherical kmeans). 176 | 177 | $L_2$ Normalization of cell $i$: 178 | 179 |
180 | $\hat{Z}_{\cdot, i} \leftarrow \frac{\hat{Z}_{\cdot, i}}{||{\hat{Z}_{\cdot, i}}||_{2}}$ 181 |
182 | 183 | 184 | TL;DR Harmony clustering uses cosine distance. By normalizing each cell to have unit length, we can directly visualize the cosine distances between cells (right). These relationships are not obvious in Euclidean space (left). 185 | 186 | 187 | In the Harmony object, we now have 2 copies of the cell embeddings. The first, $Z_{orig}$ is the original PCA matrix (PCs by cells). The second, $Z_{cos}$ is the new $L_2$ scaled matrix. Since this scaling projects cells into a unit hypersphere, cells appear pushed away from the origin (0,0). 188 | 189 | 190 | ```{r, fig.width=5, fig.height=3, fig.align="center"} 191 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 192 | labs(title = 'Z_orig', subtitle = 'Euclidean distance', x = 'PC1', y = 'PC2') + 193 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 194 | labs(title = 'Z_cos', subtitle = 'Induced Cosine distance', x = 'PC1', y = 'PC2') 195 | 196 | ``` 197 | 198 | 199 | In the $Z_{cos}$ scatterplot (right), cells that are nearby have a high cosine similarity. Although it is not obvious in this example, cells closeby in Euclidean space do not always have a high cosine similarity! 200 | 201 | Above, we only visualize the first two PCs. In this simple example with cell lines, this is sufficient to visualize most of the variation. Note, however, that all clustering and correction in Harmony uses all the PCs. For completeness, we can visualize the quantiles of PCA embeddings for all 20 PCs, colored by original dataset. 
202 | 203 | 204 | ```{r, fig.width=8, fig.height=3, out.width="100%"} 205 | 206 | harmonyObj$Z_cos %>% t %>% data.frame() %>% 207 | cbind(meta_data) %>% 208 | tidyr::gather(key, val, X1:X20) %>% 209 | ggplot(aes(reorder(gsub('X', 'PC', key), as.integer(gsub('X', '', key))), val)) + 210 | geom_boxplot(aes(color = dataset)) + 211 | scale_color_manual(values = colors_use) + 212 | labs(x = 'PC number', y = 'PC embedding value', title = 'Z_cos (unit scaled PCA embeddings) for all 20 PCs') + 213 | theme_tufte(base_size = 10) + geom_rangeframe() + 214 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 215 | ``` 216 | 217 | 218 | 219 | ## Initial clustering 220 | 221 | Initializing the Harmony object also triggered initialization of all the clustering data structures. Harmony currently uses regular kmeans, with 10 random restarts, to find initial locations for the cluster centroids. Let's visualize these centroids directly! We can do this by accessing the *Y* matrix in the Harmony object. This is a matrix with $d=20$ rows and $K=5$ columns, so each column represents one 20-dimensional centroid. 222 | 223 | Remember that we set the number of clusters to 5 above, so there are now 5 clusters below. 224 | 225 | ```{r, fig.width=4, fig.height=3, fig.align="center"} 226 | 227 | cluster_centroids <- harmonyObj$Y 228 | 229 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = FALSE, do_labels = FALSE) + 230 | labs(title = 'Initial kmeans cluster centroids', subtitle = '', x = 'PC1', y = 'PC2') + 231 | geom_point( 232 | data = data.frame(t(cluster_centroids)), 233 | color = 'black', fill = 'black', alpha = .8, 234 | shape = 21, size = 6 235 | ) + 236 | NULL 237 | 238 | ``` 239 | 240 | 241 | 242 | Based on these cluster centroids, we also assigned probabilistic cluster memberships to each cell. In the algorithm, this is done using the formula below. 243 | 244 |
245 | $R_{ki} \propto \exp(\frac{-||Z_i - Y_k||^2_2}{\sigma})$ 246 |
247 | 248 | Above, $R_{ki}$ is a value from $0$ to $1$ and denotes the probability that cell $i$ is assigned to cluster $k$. Accordingly, the squared distance $||Z_i - Y_k||^2_2$ is the distance between cell $i$ and the centroid of cluster $k$. Because we're using cosine distance (i.e. cells and centroids have unit length), we can simplify the distance computation: 249 | 250 |
251 | $R_{ki} \propto \exp(\frac{-(2(1 - Y^TZ))}{\sigma})$ 252 |
253 | 254 | Finally, the $\propto$ symbol means that we will normalize R to form a proper probability distribution for each cell: 255 | 256 |
257 | $\sum_k R_{ki} = 1$ 258 |
259 | 260 | Let's take a look at these initial cluster assignments. We can find these assignments in the $K$ row by $N$ column matrix $R$. 261 | 262 | 263 | ```{r} 264 | cluster_assignment_matrix <- harmonyObj$R 265 | 266 | ``` 267 | 268 | The plots below color each cell by cluster membership, from 0 (grey) to 1 (blue). For clarity, each column is a different dataset. Each row is one of the 5 clusters. 269 | 270 | 271 | ```{r, fig.height=5, fig.width=5} 272 | t(harmonyObj$Z_cos) %>% data.frame() %>% 273 | cbind(meta_data) %>% 274 | tibble::rowid_to_column('id') %>% 275 | dplyr::inner_join( 276 | cluster_assignment_matrix %>% t() %>% data.table() %>% 277 | tibble::rowid_to_column('id') %>% 278 | tidyr::gather(cluster, r, -id) %>% 279 | dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 280 | by = 'id' 281 | ) %>% 282 | dplyr::sample_frac(1L) %>% 283 | ggplot(aes(X1, X2, color = r)) + 284 | geom_point(size=0.2) + 285 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 286 | facet_grid(cluster ~ dataset) + 287 | scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 288 | labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'Initial probabilistic cluster assignments') 289 | ``` 290 | 291 | 292 | 293 | 294 | 295 | ## Evaluating initial cluster diversity 296 | 297 | A key part of clustering in Harmony is diversity. We can evaluate the initial diversity of clustering by aggregating the number of cells from each batch assigned to each cluster. For this, we need two data structures: 298 | 299 | 1) $\phi$ (B rows, N columns): the one-hot encoded design matrix. 300 | 301 | 2) $R$ (K rows, N columns): the cluster assignment matrix. 302 | 303 | The cross product $R\phi^T$ gives us a matrix of the number of cells from batch b (columns) that are in cluster k (rows). Note that since cluster assignment is probabilistic, the observed counts don't have to be integer valued. For simplicity, we round the values to their closest integers. 
304 | 305 | 306 | ```{r} 307 | observed_counts <- with(harmonyObj, R %*% t(as.matrix(Phi))) 308 | round(observed_counts) 309 | 310 | 311 | ``` 312 | 313 | 314 | In fact, this information is already stored in the Harmony model object! The observed cluster by batch counts are stored in the $O$ matrix. The expected counts are in the $E$ matrix. We can check that the observed counts matrix has exactly the same values we computed above. 315 | 316 | 317 | ```{r} 318 | ## observed counts 319 | round(harmonyObj$O) 320 | 321 | ## expected counts 322 | round(harmonyObj$E) 323 | 324 | ``` 325 | 326 | 327 | It looks like clusters 2, 4, and 5 are not very diverse, with most cells coming from a single dataset. However, clusters 1 and 3 look pretty well mixed already! Cluster 1 has 900 cells from batch $b=1$ (*half* dataset) and 1574 cells from batch $b=3$ (*t293* dataset). As we move into the maximum diversity clustering, we should see the clusters getting more and more mixed! 328 | 329 | In this benchmark, we also have some ground truth cell types. In the same way that we evaluated the cluster diversity, we can evaluate the cluster accuracy. Since we didn't tell Harmony what the ground truth cell types are, we need to first construct a cell-type design matrix (shown below). We want these columns to be as mutually exclusive as possible. It looks like the initial clustering is fairly accurate. The only mistakes are the $n=2$ *jurkat* cells clustered with the $n=2472$ *293t* cells in cluster $k=1$ and $n=12$ *jurkat* cells clustered with $n=1952$ *t293* cells in cluster $k=3$. 330 | 331 | 332 | ```{r} 333 | phi_celltype <- onehot(meta_data$cell_type) 334 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype) 335 | round(observed_cell_counts) 336 | 337 | ``` 338 | 339 | # Maximum-diversity soft-clustering 340 | 341 | In the previous section, we initialized the Harmony object. 
At this point, we have some initial cluster assignments ($R$, $Y$), scaled PC embeddings ($Z_{cos}$), and statistics about cluster diversity ($O$, $E$). Now we're going to do some Harmony clustering to find more diverse clusters! 342 | 343 | We do this by calling the *cluster()* function defined in the Harmony package. This will perform a few rounds of clustering, defined by the parameter *max_iter_kmeans*. In each round, we iterate between two steps: centroid estimation and cluster assignment. We dig into both of these in more detail in the subsections below. 344 | 345 | ```{r} 346 | harmonyObj$max_iter_kmeans 347 | ``` 348 | 349 | ```{r} 350 | ## we can specify how many rounds of clustering to do 351 | harmonyObj$max_iter_kmeans <- 10 352 | harmonyObj$cluster_cpp() 353 | ``` 354 | 355 | 356 | Now that we've done some maximum diversity clustering, how have the clusters changed? Let's first look at the observed counts matrix $O$. 357 | 358 | In contrast to the $O$ matrix we started with above, this one looks much more diverse! 359 | 360 | ```{r} 361 | round(harmonyObj$O) 362 | ``` 363 | 364 | While clusters 1 and 3 were already diverse in the initial clustering, it seems that clusters 2, 4, and 5 are now considerably more mixed as well. Let's see how these assignments have changed in space. 
365 | 366 | 367 | ```{r, fig.height=5, fig.width=5} 368 | new_cluster_assignment_matrix <- harmonyObj$R 369 | 370 | t(harmonyObj$Z_cos) %>% data.frame() %>% 371 | cbind(meta_data) %>% 372 | tibble::rowid_to_column('id') %>% 373 | dplyr::inner_join( 374 | new_cluster_assignment_matrix %>% t() %>% data.table() %>% 375 | tibble::rowid_to_column('id') %>% 376 | tidyr::gather(cluster, r, -id) %>% 377 | dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 378 | by = 'id' 379 | ) %>% 380 | dplyr::sample_frac(1L) %>% 381 | ggplot(aes(X1, X2, color = r)) + 382 | geom_point(shape = '.') + 383 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 384 | facet_grid(cluster ~ dataset) + 385 | scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 386 | labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'New probabilistic cluster assignments') 387 | ``` 388 | 389 | 390 | Of course, it is equally important to make sure that our clusters do not mix up different cell types. Recall that in this benchmark, we have access to these ground truth labels. 391 | 392 | 393 | ```{r} 394 | phi_celltype <- onehot(meta_data$cell_type) 395 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype) 396 | round(observed_cell_counts) 397 | ``` 398 | 399 | 400 | 401 | 402 | 403 | Initially, the largest error we had was in cluster 3, with 12 out of 1952 cells misclustered. So our initial error rate was at most 0.6%. Let's take a look at the error rates in our maximum diversity clustering (shown below). Applying the same kind of error analysis, we see that we have <0.6% error across all the clusters. 404 | 405 | It is worth noting that in the original clustering, clusters 2, 4, and 5 had 0% error. But they also had almost no diversity. These clusters have incurred a non-zero error but gained substantial diversity. This trade-off between accuracy and diversity is present in all integration settings. 
406 | 407 | 408 | ```{r} 409 | round(apply(prop.table(observed_cell_counts, 1), 1, min) * 100, 3) 410 | ``` 411 | 412 | 413 | ## Diverse cluster assignment 414 | 415 | Now let's re-assign cells to cluster centroids. We did this above, when we assigned cells during the Harmony initialization step. The difference is that we want to assign cells to clusters that are both nearby and will increase diversity. 416 | 417 | In the algorithm, this assignment is defined by 418 | 419 |
420 | $R_{ki} \propto \exp(\frac{-(2(1 - Y^TZ))}{\sigma}) (\frac{E}{O})^\theta \phi$ 421 |
422 | 423 | Let's see what this looks like in code. Then we'll break down the formula to see what it does. 424 | 425 | 426 | ```{r} 427 | 428 | with(harmonyObj, { 429 | distance_matrix <- 2 * (1 - t(Y) %*% Z_cos) 430 | distance_score <- exp(-distance_matrix / as.numeric(sigma)) 431 | diversity_score <- sweep(E / O, 2, theta, '^') %*% as.matrix(Phi) ## (E / O)^theta, one theta per batch column 432 | ## new assignments are based on distance and diversity 433 | R_new <- distance_score * diversity_score 434 | ## normalize R so each cell sums to 1 435 | R_new <- prop.table(R_new, 2) 436 | }) 437 | 438 | ``` 439 | 440 | So how does the formula we used above help to create more diverse cluster assignments? 441 | 442 | The diversity penalty is encoded in the new term: $(\frac{E}{O})^\theta \phi$. This has some familiar data structures: $O$ for observed counts, $E$ for expected counts, and $\phi$ for the design matrix. $\theta$ is a new term. $\theta$ decides how much weight to give diversity versus distance to cluster centroids. 443 | 444 | With $\theta=0$, there is no penalty and each cluster gets a score of 1. 445 | 446 | 447 | ```{r} 448 | ## with theta = 0 449 | with(harmonyObj, { 450 | (E / O) ^ 0 451 | }) 452 | ``` 453 | 454 | 455 | 456 | As we increase $\theta$, let's see what happens (shown below). Recall that in cluster $k=1$, batches 1 and 3 were well represented. Below, note that in that cluster ($k=1$), the penalties for batches 1 and 3 are relatively low (0.98 and 0.47). On the other hand, batch 2 gets a penalty score of 30914. This means that cells from batches 1 and 3 will be encouraged to move into cluster $k=1$. On the other hand, cluster $k=2$ is replete with batch 2. The penalty for batch 2 in cluster $k=2$ is relatively low, and noticeably smaller than the penalty score for batch 2 in cluster $k=1$. Thus, cells from batch 2 will be discouraged from moving into cluster $k=1$, as this cluster has a higher penalty score for cells from batch 2 compared to other clusters (such as $k=2$). 
457 | 458 | 459 | ```{r} 460 | ## with theta = 1 461 | with(harmonyObj, { 462 | round((E / O) ^ 1, 2) 463 | }) 464 | 465 | ``` 466 | 467 | 468 | 469 | We should always be wary of setting $\theta$ too high, since the diversity scores can go to $\infty$. Below, we set $\theta$ to 1 million. We do not recommend setting $\theta$ to 1 million! 470 | 471 | 472 | ```{r} 473 | ## as theta approach infinity 474 | with(harmonyObj, { 475 | round((E / O) ^ 1e6, 2) 476 | }) 477 | 478 | ``` 479 | 480 | 481 | Finally, it is important to note that we cannot re-assign cells independently as we did above. Why not? As soon as we re-assign one cell, the diversity counts in the $O$ and $E$ matrices change. Thus, the assignment formula for all other cells is different! For this reason, we need to assign one cell at a time and update the $O$ and $E$ as we go. In practice, we can update some chunk of cells (e.g. 5%), update the $O$ and $E$ matrices, and update another chunk of cells. 482 | 483 | ## Cluster centroid estimation 484 | 485 | In the previous step, we re-assigned cells to maximize diversity within the clusters. With this new assignment, we need to update the cluster centroids. In this step, we'll use the cell positions $Z_{cos}$ and the cluster assignments ($R$) to re-position cluster centroids to be close to their assigned cells. 486 | 487 |
488 | $Y \leftarrow Z_{cos}R^T$ 489 |
490 | 491 | ```{r} 492 | Y_unscaled <- with(harmonyObj, Z_cos %*% t(R)) 493 | ``` 494 | 495 | 496 | We then scale Y to make each centroid unit length: 497 | 498 |
499 | $Y_{\cdot, k} \leftarrow \frac{Y_{\cdot, k}}{||{Y_{\cdot, k}}||_{2}}$ 500 | 
501 | 502 | ```{r} 503 | Y_new <- cosine_normalize(Y_unscaled, 2) 504 | ``` 505 | 506 | # Correction 507 | 508 | In the previous section, we performed clustering in order to identify shared groups of cells between batches. Now we make use of these groups in order to correct the data in a sensitive way. To run the correction step, we call the function *moe_correct_ridge()* from the Harmony package. First, let's see what happens to the cells. In the subsections that follow, we'll look deeper into how we got there. 509 | 510 | 511 | ```{r} 512 | harmonyObj$moe_correct_ridge_cpp() 513 | ``` 514 | 515 | 516 | ```{r, fig.width=5, fig.height=3, fig.align="center"} 517 | 518 | do_scatter(cosine_normalize(t(harmonyObj$Z_orig), 1), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 519 | labs(title = 'Z_cos before MoE', x = 'PC1', y = 'PC2') + 520 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 521 | labs(title = 'Z_cos after MoE', x = 'PC1', y = 'PC2') 522 | ``` 523 | 524 | 525 | 526 | 527 | We can see the the jurkat cells are starting to come together on the right (purple and green). There is also more local mixing of the 293T cells on the left (yellow and green). What happened to actually get them there? 528 | 529 | For each cell, we estimate how much its batch identity contributes to its PCA scores. We then subtract this contribution from that cell's PCA scores. That's it! 530 | 531 | Very importantly, this correction factor is not in the unit scaled space (i.e. $Z_{cos}$)! The data in $Z_{cos}$ have been projected onto a hypersphere. This makes the cells easier to cluster but the space is no longer closed under linear transformations! In other words, if we push a cell over a bit by adding 10 to PC1, that cell is no longer on the hypersphere. 532 | 533 | To query the Harmony model object, we need to introduce another variable: $Z_{corr}$. $Z_{corr}$ contains the cells' PCA embeddings post correction. 
However, we never scale cells in $Z_{corr}$ to have unit length. After we compute $Z_{corr}$, we immediately update $Z_{cos}$ as the unit scaled version of $Z_{corr}$. The plot below shows all three of Harmony's data structures that contain PCA embeddings. To summarize: 534 | 535 | - $Z_{orig}$: original PCA embeddings 536 | - $Z_{corr}$: corrected PCA embeddings, not scaled 537 | - $Z_{cos}$: corrected PCA embeddings, scaled to unit length 538 | 539 | 540 | ```{r, fig.width=8, fig.height=3, fig.align="center", out.width="100%"} 541 | 542 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 543 | labs(title = 'Z_orig', subtitle = 'Original PCA embeddings', x = 'PC1', y = 'PC2') + 544 | do_scatter(t(harmonyObj$Z_corr), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 545 | labs(title = 'Z_corr', subtitle = '= Z_orig - correction_factors', x = 'PC1', y = 'PC2') + 546 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 547 | labs(title = 'Z_cos', subtitle = '= Unit_scaled(Z_corr)', x = 'Scaled PC1', y = 'Scaled PC2') + 548 | NULL 549 | ``` 550 | 551 | 552 | 553 | Let's take a closer look at these cell-specific correction factors. For exposition, let's focus on PC1 and compare each cell's position before (from $Z_{orig}$) and after (from $Z_{corr}$) correction. 554 | 555 | 556 | The plots below show the PC1 value before (x-axis) and after (y-axis) correction for each cell. The black line is drawn at $y=x$ to represent the level curve of no change. 
557 | 558 | 559 | ```{r, fig.width=5, fig.height=3, fig.align="center"} 560 | 561 | plt <- data.table(PC1_After = harmonyObj$Z_corr[1, ], PC1_Before = harmonyObj$Z_orig[1, ]) %>% 562 | cbind(meta_data) %>% 563 | dplyr::sample_frac(1L) %>% 564 | ggplot(aes(PC1_Before, PC1_After)) + 565 | geom_abline(slope = 1, intercept = 0) + 566 | theme_tufte(base_size = 10) + geom_rangeframe() + 567 | scale_color_tableau() + 568 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) + 569 | NULL 570 | 571 | plt + geom_point(shape = '.', aes(color = dataset)) + 572 | labs(x = 'PC1 before correction', y = 'PC1 after correction', 573 | title = 'PC1 correction for each cell', subtitle = 'Colored by Dataset') + 574 | plt + geom_point(shape = '.', aes(color = cell_type)) + 575 | labs(x = 'PC1 before correction', y = 'PC1 after correction', 576 | title = 'PC1 correction for each cell', subtitle = 'Colored by Cell Type') + 577 | NULL 578 | 579 | ``` 580 | 581 | 582 | 583 | We can see a few interesting things from these plots. 584 | 585 | - The 293T cells from the 293T and half datasets have pretty much the same correction factor. Since these cells were already well mixed, this is expected. 586 | - There is a salient cloud of wandering Jurkat cells from the half dataset. Many of these itinerants find themselves with the same correction factor as 293T cells! What's going on with these erroneously corrected cells? These cells are located in the middle and have a small length (L2 norm). Thus, when these cells are unit length scaled, their location is unstable. These cells should have been filtered out as outliers. 587 | 588 | ## Mixture of Experts model 589 | 590 | The theory behind this algorithm is based on the Mixture of Experts model. This is a natural extension of linear modeling, in which each cluster is deemed an expert and is assigned its own linear model. 591 | 592 | We model each PC coordinate with a combination of linear factors. 
593 | 594 |
595 | $Z_{di} = \sum_k \left( \beta_{0,k} + \beta_{1,k} \mathbb{1}_{(dataset = half)} + \beta_{2,k} \mathbb{1}_{(dataset = jurkat)} + \beta_{3,k} \mathbb{1}_{(dataset = 293T)} \right)$ 596 | 
597 | 598 | In the model above, each cluster gets 4 $\beta$ terms: $\beta_{0,k}$ is the intercept term. This term is independent of which dataset a cell comes from. Therefore, it represents the contribution of cell type or cell state to the PC score. The other three $\beta$ terms are each accompanied by an indicator variable. This means that a cell from dataset *half* will have $\mathbb{1}_{(dataset = half)}$ equal to 1 and the other two indicators equal to 0. 599 | 600 | For such a cell from the *half* dataset, we can rewrite the MoE equation above as 601 | 602 | 
603 | $Z_{di} = \sum_k \left( \beta_{0,k} + \beta_{1,k} \mathbb{1}_{(dataset = half)} \right)$ 604 | 
605 | 606 | ## Estimate MoE model parameters 607 | 608 | We estimate the matrix of linear regression terms using the formula described in the manuscript: 609 | 610 |
611 | $W_k \leftarrow (\phi^* diag(R_k) \phi^{*T} + \lambda I)^{-1} \phi^* diag(R_k)Z_{orig}^T$ 612 |
613 | 614 | The matrix above contains linear regression terms for the intercept $W_k[0] = \beta_{0,k}$ and the batch terms: 615 | 616 | $W_k[1] = \beta_{1, k} \mbox{ (for dataset half)}$ 617 | 618 | $W_k[2] = \beta_{2, k} \mbox{ (for dataset jurkat)}$ 619 | 620 | $W_k[3] = \beta_{3, k} \mbox{ (for dataset 293T)}$ 621 | 622 | 623 | ```{r, echo=TRUE} 624 | 625 | W <- list() 626 | ## Convert sparse data structures to dense matrix 627 | Phi.moe <- as.matrix(harmonyObj$Phi_moe) 628 | lambda <- diag(c(harmonyObj$lambda)) 629 | ## Get beta coefficients for all the clusters 630 | for (k in 1:harmonyObj$K) { 631 | W[[k]] <- solve(Phi.moe %*% diag(harmonyObj$R[k, ]) %*% t(Phi.moe) + lambda) %*% (Phi.moe %*% diag(harmonyObj$R[k, ])) %*% t(harmonyObj$Z_orig) 632 | } 633 | 634 | 635 | ``` 636 | 637 | Let's take a look at how these regression terms relate to the data. Recall that the mixture of experts model is trying to estimate the contribution of intercept and batch to cells' positions in space. So first we'll take a look at the positions of each batch and each cluster in the original PCA embeddings. The color below represents soft cluster membership learned using the maximum diversity clustering above. 
638 | 639 | 640 | ```{r, fig.width=5, fig.height=5} 641 | 642 | cluster_assignment_matrix <- harmonyObj$R ## K x N soft cluster memberships 643 | 644 | t(harmonyObj$Z_orig) %>% data.frame() %>% 645 | cbind(meta_data) %>% 646 | tibble::rowid_to_column('id') %>% 647 | dplyr::inner_join( 648 | cluster_assignment_matrix %>% t() %>% data.table() %>% 649 | tibble::rowid_to_column('id') %>% 650 | tidyr::gather(cluster, r, -id) %>% 651 | dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 652 | by = 'id' 653 | ) %>% 654 | dplyr::sample_frac(1L) %>% 655 | ggplot(aes(X1, X2, color = r)) + 656 | geom_point(size = 0.2) + 657 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 658 | facet_grid(cluster ~ dataset) + 659 | scale_color_gradient(low = 'grey', breaks = seq(0, 1, .2)) + 660 | labs(x = 'PC1', y = 'PC2', title = 'Cluster assigned in original PCA space (Z_orig)') 661 | 662 | ``` 663 | 664 | 665 | 666 | 667 | 668 | Now let's draw the $\beta$ terms into this space. For each cluster, we expect the sum of the intercept plus the batch terms to land squarely in the center of each batch:cluster. The arrows below represent the intercept (in black) and batch (colored) offsets. 
669 | 670 | 671 | ```{r} 672 | plt_list <- lapply(1:harmonyObj$K, function(k) { 673 | plt_df <- W[[k]] %>% data.frame() %>% 674 | dplyr::select(X1, X2) 675 | ## Append n 676 | plt_df <- plt_df %>% 677 | cbind( 678 | data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 679 | dplyr::rename(x0 = X1, y0 = X2) 680 | ) %>% 681 | cbind(type = c('intercept', unique(meta_data$dataset))) 682 | plt <- plt_df %>% 683 | ggplot() + 684 | geom_point(aes(X1, X2), 685 | data = t(harmonyObj$Z_orig) %>% data.frame(), 686 | size = 0.5, 687 | color = 'grey' 688 | ) + 689 | geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 690 | scale_color_manual(values = c('intercept' = 'black', colors_use)) + 691 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 692 | labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k)) 693 | plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16))) 694 | # if (k == harmonyObj$K) { 695 | # } else { 696 | # plt <- plt + guides(color = FALSE) 697 | # } 698 | plt 699 | }) 700 | 701 | 702 | ``` 703 | 704 | 705 | ```{r, fig.height=6, fig.width=6} 706 | Reduce(`+`, plt_list) + 707 | patchwork::plot_annotation(title = 'Mixture of experts beta terms before correction (Z_orig)') + 708 | plot_layout(ncol = 2) 709 | ``` 710 | 711 | 712 | 713 | 714 | After correction, we remove the batch specific terms (colored arrows above). We can see the result in the corrected linear space ($Z_{corr}$). Notice that now, the cells are centered around the tips of the black arrows, which represent the intercept term. This is because we've removed the effect of the batch terms (colored arrows). 
715 | 716 | 717 | ```{r, fig.width=4, fig.height=3, fig.align="center"} 718 | 719 | plt_list <- lapply(1:harmonyObj$K, function(k) { 720 | plt_df <- W[[k]] %>% data.frame() %>% 721 | dplyr::select(X1, X2) 722 | 723 | plt_df <- plt_df %>% 724 | cbind( 725 | data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 726 | dplyr::rename(x0 = X1, y0 = X2) 727 | ) %>% 728 | cbind(type = c('intercept', unique(meta_data$dataset))) 729 | 730 | plt <- plt_df %>% 731 | ggplot() + 732 | geom_point(aes(X1, X2), 733 | data = t(harmonyObj$Z_corr) %>% data.frame(), 734 | shape = '.', 735 | color = 'grey' 736 | ) + 737 | geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 738 | scale_color_manual(values = c('intercept' = 'black', colors_use)) + 739 | theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 740 | labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k)) 741 | plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16))) 742 | plt 743 | }) 744 | 745 | 746 | ``` 747 | 748 | 749 | 750 | ```{r, fig.height=6, fig.width=6} 751 | Reduce(`+`, plt_list) + 752 | patchwork::plot_annotation(title = 'Mixture of experts beta terms after correction (Z_corr)') + 753 | plot_layout(ncol = 2) 754 | ``` 755 | 756 | 757 | 758 | ## Cell specific corrections 759 | 760 | How does one cell get its correction factor? 761 | 762 | Recall from above that each cell $i$ is now modeled with intercept and batch specific terms: 763 | 764 | 765 | ```{r, echo=TRUE} 766 | 767 | Z_i <- harmonyObj$Z_orig[, 5] 768 | Z_i_pred <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) { 769 | W[[k]] * harmonyObj$Phi_moe[, 5] * harmonyObj$R[k, 5] 770 | })) %>% colSums 771 | 772 | 773 | ``` 774 | 775 | The plot below shows the observed and predicted values of all 20 PCs for cell 5. 
776 | 777 | 778 | ```{r, fig.width=4, fig.height=3, fig.align="center"} 779 | data.table(obs = Z_i, pred = Z_i_pred) %>% ## observed vs. MoE-predicted PC scores for cell 5 780 | tibble::rowid_to_column('PC') %>% 781 | ggplot(aes(obs, pred)) + 782 | geom_point(shape = 21) + 783 | geom_label_repel(aes(label = PC)) + 784 | geom_abline(slope = 1, intercept = 0) + 785 | theme_tufte() + geom_rangeframe() + 786 | labs(x = 'Observed PC score', y = 'Predicted PC score', title = 'Observed and predicted values of PC scores\nfor cell 5') + 787 | NULL 788 | ``` 789 | 790 | 791 | 792 | 793 | Now that we've modeled all these contributions to PCs, we can remove the batch specific terms from cell $i$ to get its corrected position ($\hat{Z}_{i}$) in $Z_{corr}$: 794 | 795 | 
796 | $\hat{Z}_i \leftarrow Z_i - \sum_k R_{ki} \sum_{b=1}^{B} W_k[b] \, \phi_{bi}$ 797 | 
798 | 799 | 800 | ```{r} 801 | delta <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) { 802 | W[[k]][2:4, ] * harmonyObj$Phi[, 5] * harmonyObj$R[k, 5] 803 | })) %>% colSums 804 | 805 | Z_corrected <- harmonyObj$Z_orig[, 5] - delta 806 | 807 | ``` 808 | 809 | Let's see where this one cell moves in the original embeddings. Cell 5 is highlighted in red. It's individual correction factor is shown with the red arrow. 810 | 811 | 812 | ```{r, fig.width=3, fig.height=3, fig.align="center"} 813 | 814 | 815 | harmonyObj$Z_orig %>% t %>% data.frame() %>% 816 | ggplot(aes(X1, X2)) + 817 | geom_point(shape = '.') + 818 | geom_point( 819 | data = data.frame(t(harmonyObj$Z_orig[, 5, drop = FALSE])), 820 | color = 'red' 821 | ) + 822 | geom_segment( 823 | data = data.table(x0 = harmonyObj$Z_orig[1, 5], 824 | y0 = harmonyObj$Z_orig[2, 5], 825 | x1 = Z_corrected[1], 826 | y1 = Z_corrected[2]), 827 | aes(x = x0, y = y0, xend = x1, yend = y1), 828 | linewidth = 1, 829 | color = 'red', 830 | arrow = arrow(length = unit(0.05, "npc"), type = 'closed') 831 | ) + 832 | theme_tufte(base_size = 10) + geom_rangeframe() + 833 | labs(x = 'PC1', y = 'PC2', title = 'Correction of cell #5') 834 | 835 | ``` 836 | 837 | 838 | # Multiple iterations of Harmony 839 | 840 | The sections above broke down the Harmony algorithm. Now's let's take a more holistic look. In the code below, let's look at the corrected PC values ($Z_{cos}$) after each round of Harmony (clustering + correction). Since we're not visualizing the clusters in this section, let's increase nclust to 50. After the 1st and 2nd rounds, we can see considerably more mixing. By round 3 though, the cells are pretty well mixed and we stop. 
841 | 842 | 843 | 844 | 845 | ```{r} 846 | 847 | harmonyObj <- RunHarmony( 848 | data_mat = V, ## PCA embedding matrix of cells 849 | meta_data = meta_data, ## dataframe with cell labels 850 | theta = 1, ## cluster diversity enforcement 851 | vars_use = 'dataset', ## (list of) variable(s) we'd like to Harmonize out 852 | nclust = 50, ## number of clusters in Harmony model 853 | max_iter = 0, ## don't actually run Harmony, stop after initialization 854 | return_object = TRUE ## return the full Harmony model object, not just the corrected PCA matrix 855 | ) 856 | 857 | ``` 858 | 859 | 860 | ```{r, message=FALSE, fig.width=5, fig.height=3, fig.align="center"} 861 | 862 | i <- 0 863 | 864 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 865 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') + 866 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 867 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') + 868 | NULL 869 | ``` 870 | 871 | ```{r, fig.width=5, fig.height=3, fig.align="center", message=FALSE} 872 | 873 | for (i in 1:2) { 874 | harmony:::harmonize(harmonyObj, 1) 875 | plt <- do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 876 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') + 877 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 878 | labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') + 879 | NULL 880 | plot(plt) 881 | } 882 | 883 | ``` 884 | 885 | 886 | 887 | # Session info 888 | 889 | 890 | ```{r} 891 | sessionInfo() 892 | ``` 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 
| 915 | 916 | -------------------------------------------------------------------------------- /doc/parameters.R: -------------------------------------------------------------------------------- 1 | ## ---- include = FALSE--------------------------------------------------------- 2 | knitr::opts_chunk$set( 3 | collapse = TRUE, 4 | comment = "#>" 5 | ) 6 | 7 | ## ----setup-------------------------------------------------------------------- 8 | library(harmony) 9 | library(ggplot2) 10 | 11 | 12 | 13 | 14 | ## ----------------------------------------------------------------------------- 15 | 16 | ## Old 17 | 18 | ## 19 | ## HarmonyMatrix(bos, opt.args = list(lambda = c(0,1))) 20 | 21 | 22 | 23 | 24 | 25 | 26 | ## ----------------------------------------------------------------------------- 27 | ## Source required data 28 | ## data("celllines") 29 | 30 | ## cell_lines <- zeros() 31 | ## pbmc <- CreateSeuratObject(counts = , project = "jurkat", min.cells = 5) 32 | 33 | ## Separate conditions 34 | 35 | ## pbmc@meta.data$stim <- c(rep("STIM", ncol(stim.sparse)), rep("CTRL", ncol(ctrl.sparse))) 36 | 37 | -------------------------------------------------------------------------------- /doc/parameters.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Advanced tutorial" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Advanced tutorial} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | # Introduction 18 | 19 | Harmony uses a set of parameters to ensure the different components of the algorithm work in harmony! By default, several of these parameters are set by the algorithm using heuristics or empirical values. Most of the time, the end-user does not need to find optimal values to run Harmony. 
In this vignette, we will be going through some use cases where the user needs to intervene and optimize the data or the parameters of harmony. 20 | 21 | There are two reasons that a user may need to change the parameters. The first one is to increase the quality of the data integration. The second one is to improve the performance of harmony. 22 | 23 | 24 | # Harmony algorithm objective diverges after a number of correction steps 25 | 26 | For some datasets, the objective function may diverge after a few steps. Here we are going to be looking into the Jurkat dataset that is bundled with harmony. 27 | 28 | 29 | ```{r setup} 30 | library(harmony) 31 | library(ggplot2) 32 | 33 | 34 | 35 | ``` 36 | 37 | 38 | ```{r} 39 | 40 | ## Old 41 | 42 | ## 43 | ## HarmonyMatrix(bos, opt.args = list(lambda = c(0,1))) 44 | 45 | 46 | 47 | 48 | 49 | ``` 50 | 51 | # An example of a problematic dataset 52 | 53 | ```{r} 54 | ## Source required data 55 | ## data("celllines") 56 | 57 | ## cell_lines <- zeros() 58 | ## pbmc <- CreateSeuratObject(counts = , project = "jurkat", min.cells = 5) 59 | 60 | ## Separate conditions 61 | 62 | ## pbmc@meta.data$stim <- c(rep("STIM", ncol(stim.sparse)), rep("CTRL", ncol(ctrl.sparse))) 63 | ``` 64 | 65 | 66 | ## Input data 67 | ## Number of PCs 68 | Using the correct number of components can become very important in certain scenarios. 69 | 70 | 71 | 72 | 73 | 74 | ## Nested data 75 | 76 | # Harmony Algorithm parameters 77 | ## theta 78 | ## lambda 79 | ## sigma 80 | ## nclust 81 | 82 | # Controlling harmony flow 83 | ## 84 | ## 85 | -------------------------------------------------------------------------------- /doc/parameters.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Advanced tutorial 16 | 17 | 30 | 31 | 39 | 40 | 41 | 42 | 50 | 115 | 143 | 144 | 145 | 146 | 147 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 |

Advanced tutorial

340 | 341 | 342 | 343 |
344 |

Introduction

345 |

Harmony uses a set of parameters to ensure the different components 346 | of the algorithm work in harmony! By default, several of these 347 | parameters are set by the algorithm using heuristics or empirical 348 | values. Most of the time, the end-user does not need to find optimal 349 | values to run Harmony. In this vignette, we will be going through some 350 | use cases where the user needs to intervene and optimize the data or 351 | the parameters of harmony.

352 |

There are two reasons that a user may need to change the 353 | parameters. The first one is to increase the quality of the data 354 | integration. The second one is to improve the performance of 355 | harmony.

356 |
357 |
358 |

Harmony algorithm objective diverges after a number of correction 359 | steps

360 |

For some datasets, the objective function may diverge after a few 361 | steps. Here we are going to be looking into the Jurkat dataset that is 362 | bundled with harmony.

363 |
library(harmony)
364 | #> Loading required package: Rcpp
365 | library(ggplot2)
366 |

367 | ## Old
368 | 
369 | ## 
370 | ## HarmonyMatrix(bos, opt.args = list(lambda = c(0,1)))
371 |
372 |
373 |

An example of a problematic dataset

374 |
## Source required data
375 | ## data("celllines")
376 | 
377 | ## cell_lines <- zeros()
378 | ## pbmc <- CreateSeuratObject(counts = , project = "jurkat", min.cells = 5)
379 | 
380 | ## Separate conditions
381 | 
382 | ## pbmc@meta.data$stim <- c(rep("STIM", ncol(stim.sparse)), rep("CTRL", ncol(ctrl.sparse)))
383 |
384 |

Input data

385 |
386 |
387 |

Number of PCs

388 |

Using the correct number of components can become very important in 389 | certain scenarios.

390 |
391 |
392 |

Nested data

393 |
394 |
395 |
396 |

Harmony Algorithm parameters

397 |
398 |

theta

399 |
400 |
401 |

lambda

402 |
403 |
404 |

sigma

405 |
406 |
407 |

nclust

408 |
409 |
410 |
411 |

Controlling harmony flow

412 |
413 |

414 |
415 |
416 |

417 |
418 |
419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 434 | 435 | 436 | 437 | -------------------------------------------------------------------------------- /doc/quickstart.R: -------------------------------------------------------------------------------- 1 | ## ----eval=FALSE--------------------------------------------------------------- 2 | # install.packages('harmony') 3 | 4 | ## ----------------------------------------------------------------------------- 5 | library(harmony) 6 | 7 | ## ----------------------------------------------------------------------------- 8 | data(cell_lines) 9 | V <- cell_lines$scaled_pcs 10 | meta_data <- cell_lines$meta_data 11 | 12 | 13 | ## ----class.source='fold-hide', fig.width=5, fig.height=3, fig.align="center"---- 14 | 15 | library(ggplot2) 16 | 17 | do_scatter <- function(xy, meta_data, label_name, base_size = 12) { 18 | palette_use <- c(`jurkat` = '#810F7C', `t293` = '#D09E2D',`half` = '#006D2C') 19 | xy <- xy[, 1:2] 20 | colnames(xy) <- c('X1', 'X2') 21 | plt_df <- xy %>% data.frame() %>% cbind(meta_data) 22 | plt <- ggplot(plt_df, aes(X1, X2, col = !!rlang::sym(label_name), fill = !!rlang::sym(label_name))) + 23 | theme_test(base_size = base_size) + 24 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, 25 | shape = 16, size = 4))) + 26 | scale_color_manual(values = palette_use) + 27 | scale_fill_manual(values = palette_use) + 28 | theme(plot.title = element_text(hjust = .5)) + 29 | labs(x = "PC 1", y = "PC 2") + 30 | theme(legend.position = "none") + 31 | geom_point(shape = '.') 32 | 33 | ## Add labels 34 | data_labels <- plt_df %>% 35 | dplyr::group_by(!!rlang::sym(label_name)) %>% 36 | dplyr::summarise(X1 = mean(X1), X2 = mean(X2)) %>% 37 | dplyr::ungroup() 38 | plt + geom_label(data = data_labels, aes(label = !!rlang::sym(label_name)), 39 | color = "white", size = 4) 40 | } 41 | p1 <- do_scatter(V, meta_data, 'dataset') + 42 | labs(title = 'Colored by dataset') 43 | p2 <- do_scatter(V, meta_data, 
'cell_type') + 44 | labs(title = 'Colored by cell type') 45 | 46 | cowplot::plot_grid(p1, p2) 47 | 48 | 49 | ## ----------------------------------------------------------------------------- 50 | harmony_embeddings <- harmony::RunHarmony( 51 | V, meta_data, 'dataset', verbose=FALSE 52 | ) 53 | 54 | 55 | ## ---- fig.width=5, fig.height=3, fig.align="center"--------------------------- 56 | p1 <- do_scatter(harmony_embeddings, meta_data, 'dataset') + 57 | labs(title = 'Colored by dataset') 58 | p2 <- do_scatter(harmony_embeddings, meta_data, 'cell_type') + 59 | labs(title = 'Colored by cell type') 60 | cowplot::plot_grid(p1, p2, nrow = 1) 61 | 62 | 63 | ## ----------------------------------------------------------------------------- 64 | sessionInfo() 65 | 66 | 67 | -------------------------------------------------------------------------------- /doc/quickstart.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quick start to Harmony" 3 | author: "Korsunsky et al.: Fast, sensitive, and accurate integration of single 4 | cell data with Harmony" 5 | output: 6 | rmarkdown::html_vignette: 7 | code_folding: show 8 | vignette: > 9 | %\VignetteIndexEntry{Quick start to Harmony} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | 15 | # Introduction 16 | 17 | Harmony is an algorithm for performing integration of single cell genomics 18 | datasets. Please check out our latest 19 | [manuscript on Nature Methods](https://www.nature.com/articles/s41592-019-0619-0). 20 | 21 | ![](main.jpg){width=100%} 22 | 23 | 24 | # Installation 25 | 26 | Install Harmony from CRAN with standard commands. 27 | 28 | ```{r eval=FALSE} 29 | install.packages('harmony') 30 | ``` 31 | 32 | Once Harmony is installed, load it up! 33 | 34 | ```{r} 35 | library(harmony) 36 | ``` 37 | 38 | 39 | # Integrating cell line datasets from 10X 40 | 41 | The example below follows Figure 2 in the manuscript. 
42 | 43 | We downloaded 3 cell line datasets from the 10X website. The first two (jurkat 44 | and 293t) come from pure cell lines while the *half* dataset is a 50:50 45 | mixture of Jurkat and HEK293T cells. We inferred cell type with the canonical 46 | marker XIST, since the two cell lines come from 1 male and 1 female donor. 47 | 48 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat 49 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t 50 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat:293t_50:50 51 | 52 | We library normalized the cells, log transformed the counts, and scaled the 53 | genes. Then we performed PCA and kept the top 20 PCs. The PCA embeddings and 54 | meta data are available as part of this package. 55 | 56 | ```{r} 57 | data(cell_lines) 58 | V <- cell_lines$scaled_pcs 59 | meta_data <- cell_lines$meta_data 60 | 61 | ``` 62 | 63 | 64 | Initially, the cells cluster by both dataset (left) and cell type (right). 
65 | 66 | ```{r class.source='fold-hide', fig.width=5, fig.height=3, fig.align="center"} 67 | 68 | library(ggplot2) 69 | 70 | do_scatter <- function(xy, meta_data, label_name, base_size = 12) { 71 | palette_use <- c(`jurkat` = '#810F7C', `t293` = '#D09E2D',`half` = '#006D2C') 72 | xy <- xy[, 1:2] 73 | colnames(xy) <- c('X1', 'X2') 74 | plt_df <- xy %>% data.frame() %>% cbind(meta_data) 75 | plt <- ggplot(plt_df, aes(X1, X2, col = !!rlang::sym(label_name), fill = !!rlang::sym(label_name))) + 76 | theme_test(base_size = base_size) + 77 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, 78 | shape = 16, size = 4))) + 79 | scale_color_manual(values = palette_use) + 80 | scale_fill_manual(values = palette_use) + 81 | theme(plot.title = element_text(hjust = .5)) + 82 | labs(x = "PC 1", y = "PC 2") + 83 | theme(legend.position = "none") + 84 | geom_point(shape = '.') 85 | 86 | ## Add labels 87 | data_labels <- plt_df %>% 88 | dplyr::group_by(!!rlang::sym(label_name)) %>% 89 | dplyr::summarise(X1 = mean(X1), X2 = mean(X2)) %>% 90 | dplyr::ungroup() 91 | plt + geom_label(data = data_labels, aes(label = !!rlang::sym(label_name)), 92 | color = "white", size = 4) 93 | } 94 | p1 <- do_scatter(V, meta_data, 'dataset') + 95 | labs(title = 'Colored by dataset') 96 | p2 <- do_scatter(V, meta_data, 'cell_type') + 97 | labs(title = 'Colored by cell type') 98 | 99 | cowplot::plot_grid(p1, p2) 100 | 101 | ``` 102 | 103 | Let's run Harmony to remove the influence of dataset-of-origin from the cell 104 | embeddings. 105 | 106 | ```{r} 107 | harmony_embeddings <- harmony::RunHarmony( 108 | V, meta_data, 'dataset', verbose=FALSE 109 | ) 110 | 111 | ``` 112 | 113 | After Harmony, the datasets are now mixed (left) and the cell types are still 114 | separate (right). 
115 | 116 | ```{r, fig.width=5, fig.height=3, fig.align="center"} 117 | p1 <- do_scatter(harmony_embeddings, meta_data, 'dataset') + 118 | labs(title = 'Colored by dataset') 119 | p2 <- do_scatter(harmony_embeddings, meta_data, 'cell_type') + 120 | labs(title = 'Colored by cell type') 121 | cowplot::plot_grid(p1, p2, nrow = 1) 122 | 123 | ``` 124 | 125 | # Next Steps 126 | 127 | ## Interfacing to software packages 128 | 129 | You can also run Harmony as part of an established pipeline in several packages, such as Seurat. For these vignettes, please [visit our github page](https://github.com/immunogenomics/harmony/). 130 | 131 | 132 | ## Detailed breakdown of the Harmony algorithm 133 | 134 | For more details on how each part of Harmony works, consult our more detailed 135 | [vignette](https://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/detailedWalkthrough.html) 136 | "Detailed Walkthrough of Harmony Algorithm". 137 | 138 | # Session Info 139 | 140 | ```{r} 141 | sessionInfo() 142 | 143 | ``` 144 | -------------------------------------------------------------------------------- /man/HarmonyMatrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ui.R 3 | \name{HarmonyMatrix} 4 | \alias{HarmonyMatrix} 5 | \title{A proxy call to \code{\link[=RunHarmony]{RunHarmony()}}. Deprecated.} 6 | \usage{ 7 | HarmonyMatrix(...) 8 | } 9 | \arguments{ 10 | \item{...}{ 11 | Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}} 12 | \describe{ 13 | \item{\code{data_mat}}{Matrix of cell embeddings. 
Cells can be rows or 14 | columns and will be inferred by the rows of meta_data.} 15 | \item{\code{meta_data}}{Either (1) Dataframe with variables to integrate 16 | or (2) vector with labels.} 17 | \item{\code{vars_use}}{If meta_data is dataframe, this defined which 18 | variable(s) to remove (character vector).} 19 | \item{\code{theta}}{Diversity clustering penalty parameter. Specify for 20 | each variable in vars_use Default theta=2. theta=0 does not 21 | encourage any diversity. Larger values of theta result in more 22 | diverse clusters.} 23 | \item{\code{sigma}}{Width of soft kmeans clusters. Default 24 | sigma=0.1. Sigma scales the distance from a cell to cluster 25 | centroids. Larger values of sigma result in cells assigned to 26 | more clusters. Smaller values of sigma make soft kmeans cluster 27 | approach hard clustering.} 28 | \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger 29 | values protect against over correction. If several covariates 30 | are specified, then lambda can also be a vector which needs to 31 | be equal length with the number of variables to be 32 | corrected. In this scenario, each covariate level group will be 33 | assigned the scalars specified by the user. If set to NULL, 34 | harmony will start lambda estimation mode to determine lambdas 35 | automatically and try to minimize overcorrection (Use with caution still 36 | in beta testing).} 37 | \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to 38 | simple linear regression.} 39 | \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round 40 | of Harmony involves one clustering and one correction step.} 41 | \item{\code{early_stop}}{Enable early stopping for harmony. The 42 | harmonization process will stop when the change of objective 43 | function between corrections drops below 1e-4} 44 | \item{\code{ncores}}{Number of processors to be used for math operations 45 | when optimized BLAS is available. 
If BLAS is not supporting 46 | multithreaded then this option has no effect. By default, 47 | ncore=1 which runs as a single-threaded process. Although 48 | Harmony supports multiple cores, it is not optimized for 49 | multithreading. Increase this number for large datasets iff 50 | single-core performance is not adequate.} 51 | \item{\code{plot_convergence}}{Whether to print the convergence plot of 52 | the clustering objective function. TRUE to plot, FALSE to 53 | suppress. This can be useful for debugging.} 54 | \item{\code{return_object}}{(Advanced Usage) Whether to return the Harmony 55 | object or only the corrected PCA embeddings.} 56 | \item{\code{verbose}}{Whether to print progress messages. TRUE to print, 57 | FALSE to suppress.} 58 | \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the 59 | result from a call to `harmony_options`. See ?`harmony_options` for 60 | parameters not listed above and more details.} 61 | }} 62 | } 63 | \description{ 64 | Maintain name backwards compatibility with version 0 of 65 | harmony. However, API is not backwards compatible with version 66 | 0. This function will be deprecated in later versions of Harmony. 67 | } 68 | -------------------------------------------------------------------------------- /man/RunHarmony.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RunHarmony.R 3 | \name{RunHarmony} 4 | \alias{RunHarmony} 5 | \title{Generic function that runs the harmony algorithm on single-cell 6 | genomics cell embeddings.} 7 | \usage{ 8 | RunHarmony(...) 9 | } 10 | \arguments{ 11 | \item{...}{ 12 | Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}} 13 | \describe{ 14 | \item{\code{theta}}{Diversity clustering penalty parameter. Specify for 15 | each variable in vars_use Default theta=2. theta=0 does not 16 | encourage any diversity. 
Larger values of theta result in more 17 | diverse clusters.} 18 | \item{\code{sigma}}{Width of soft kmeans clusters. Default 19 | sigma=0.1. Sigma scales the distance from a cell to cluster 20 | centroids. Larger values of sigma result in cells assigned to 21 | more clusters. Smaller values of sigma make soft kmeans cluster 22 | approach hard clustering.} 23 | \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger 24 | values protect against over correction. If several covariates 25 | are specified, then lambda can also be a vector which needs to 26 | be equal length with the number of variables to be 27 | corrected. In this scenario, each covariate level group will be 28 | assigned the scalars specified by the user. If set to NULL, 29 | harmony will start lambda estimation mode to determine lambdas 30 | automatically and try to minimize overcorrection (Use with caution still 31 | in beta testing).} 32 | \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to 33 | simple linear regression.} 34 | \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round 35 | of Harmony involves one clustering and one correction step.} 36 | \item{\code{early_stop}}{Enable early stopping for harmony. The 37 | harmonization process will stop when the change of objective 38 | function between corrections drops below 1e-4} 39 | \item{\code{ncores}}{Number of processors to be used for math operations 40 | when optimized BLAS is available. If BLAS is not supporting 41 | multithreaded then this option has no effect. By default, 42 | ncore=1 which runs as a single-threaded process. Although 43 | Harmony supports multiple cores, it is not optimized for 44 | multithreading. Increase this number for large datasets iff 45 | single-core performance is not adequate.} 46 | \item{\code{plot_convergence}}{Whether to print the convergence plot of 47 | the clustering objective function. TRUE to plot, FALSE to 48 | suppress. 
This can be useful for debugging.} 49 | \item{\code{verbose}}{Whether to print progress messages. TRUE to print, 50 | FALSE to suppress.} 51 | \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the 52 | result from a call to `harmony_options`. See ?`harmony_options` for 53 | parameters not listed above and more details.} 54 | }} 55 | } 56 | \value{ 57 | If used with single-cell objects, it will return the 58 | updated single-cell object. For standalone operation, it 59 | returns the corrected cell embeddings or the R6 harmony object 60 | (see \code{\link[=RunHarmony.default]{RunHarmony.default()}}). 61 | } 62 | \description{ 63 | RunHarmony is a generic function that runs the main Harmony 64 | algorithm. If working with single cell R objects, please refer to 65 | the documentation of the appropriate generic API: 66 | (\code{\link[=RunHarmony.Seurat]{RunHarmony.Seurat()}} or \code{\link[=RunHarmony.SingleCellExperiment]{RunHarmony.SingleCellExperiment()}}). If 67 | users work with other forms of cell embeddings, they can pass them 68 | directly to harmony using the \code{\link[=RunHarmony.default]{RunHarmony.default()}} API. All the 69 | function arguments listed here are common in all RunHarmony 70 | interfaces. 
71 | } 72 | \seealso{ 73 | Other RunHarmony: 74 | \code{\link{RunHarmony.Seurat}()}, 75 | \code{\link{RunHarmony.SingleCellExperiment}()}, 76 | \code{\link{RunHarmony.default}()} 77 | } 78 | \concept{RunHarmony} 79 | -------------------------------------------------------------------------------- /man/RunHarmony.Seurat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RunHarmony.R 3 | \name{RunHarmony.Seurat} 4 | \alias{RunHarmony.Seurat} 5 | \title{Applies harmony on a Seurat object cell embedding.} 6 | \usage{ 7 | \method{RunHarmony}{Seurat}( 8 | object, 9 | group.by.vars, 10 | reduction.use = "pca", 11 | dims.use = NULL, 12 | reduction.save = "harmony", 13 | project.dim = TRUE, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{the Seurat object. It needs to have the appropriate slot 19 | of cell embeddings precomputed.} 20 | 21 | \item{group.by.vars}{the name(s) of covariates that harmony will remove 22 | its effect on the data.} 23 | 24 | \item{reduction.use}{Name of dimension reduction to use. Default is pca.} 25 | 26 | \item{dims.use}{indices of the cell embedding features to be used} 27 | 28 | \item{reduction.save}{the name of the new slot that is going to be created by 29 | harmony. By default, harmony.} 30 | 31 | \item{project.dim}{Project dimension reduction loadings. Default TRUE.} 32 | 33 | \item{...}{ 34 | Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}} 35 | \describe{ 36 | \item{\code{theta}}{Diversity clustering penalty parameter. Specify for 37 | each variable in vars_use Default theta=2. theta=0 does not 38 | encourage any diversity. Larger values of theta result in more 39 | diverse clusters.} 40 | \item{\code{sigma}}{Width of soft kmeans clusters. Default 41 | sigma=0.1. Sigma scales the distance from a cell to cluster 42 | centroids. 
Larger values of sigma result in cells assigned to 43 | more clusters. Smaller values of sigma make soft kmeans cluster 44 | approach hard clustering.} 45 | \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger 46 | values protect against over correction. If several covariates 47 | are specified, then lambda can also be a vector which needs to 48 | be equal length with the number of variables to be 49 | corrected. In this scenario, each covariate level group will be 50 | assigned the scalars specified by the user. If set to NULL, 51 | harmony will start lambda estimation mode to determine lambdas 52 | automatically and try to minimize overcorrection (Use with caution still 53 | in beta testing).} 54 | \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to 55 | simple linear regression.} 56 | \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round 57 | of Harmony involves one clustering and one correction step.} 58 | \item{\code{early_stop}}{Enable early stopping for harmony. The 59 | harmonization process will stop when the change of objective 60 | function between corrections drops below 1e-4} 61 | \item{\code{ncores}}{Number of processors to be used for math operations 62 | when optimized BLAS is available. If BLAS is not supporting 63 | multithreaded then this option has no effect. By default, 64 | ncore=1 which runs as a single-threaded process. Although 65 | Harmony supports multiple cores, it is not optimized for 66 | multithreading. Increase this number for large datasets iff 67 | single-core performance is not adequate.} 68 | \item{\code{plot_convergence}}{Whether to print the convergence plot of 69 | the clustering objective function. TRUE to plot, FALSE to 70 | suppress. This can be useful for debugging.} 71 | \item{\code{verbose}}{Whether to print progress messages. TRUE to print, 72 | FALSE to suppress.} 73 | \item{\code{.options}}{Setting advanced parameters of RunHarmony. 
This must be the 74 | result from a call to `harmony_options`. See ?`harmony_options` for 75 | parameters not listed above and more details.} 76 | }} 77 | } 78 | \value{ 79 | Seurat object. Harmony dimensions placed into a new slot in the Seurat 80 | object according to the reduction.save. For downstream Seurat analyses, 81 | use reduction='harmony'. 82 | } 83 | \description{ 84 | Applies harmony on a Seurat object cell embedding. 85 | } 86 | \examples{ 87 | \dontrun{ 88 | ## seu is a Seurat single-Cell R object 89 | seu <- RunHarmony(seu, "donor_id") 90 | } 91 | } 92 | \seealso{ 93 | Other RunHarmony: 94 | \code{\link{RunHarmony.SingleCellExperiment}()}, 95 | \code{\link{RunHarmony.default}()}, 96 | \code{\link{RunHarmony}()} 97 | } 98 | \concept{RunHarmony} 99 | -------------------------------------------------------------------------------- /man/RunHarmony.SingleCellExperiment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RunHarmony.R 3 | \name{RunHarmony.SingleCellExperiment} 4 | \alias{RunHarmony.SingleCellExperiment} 5 | \title{Applies harmony on PCA cell embeddings of a SingleCellExperiment.} 6 | \usage{ 7 | \method{RunHarmony}{SingleCellExperiment}( 8 | object, 9 | group.by.vars, 10 | dims.use = NULL, 11 | verbose = TRUE, 12 | reduction.save = "HARMONY", 13 | ... 14 | ) 15 | } 16 | \arguments{ 17 | \item{object}{SingleCellExperiment with the PCA reducedDim cell embeddings populated} 18 | 19 | \item{group.by.vars}{the name(s) of covariates that harmony will remove 20 | its effect on the data.} 21 | 22 | \item{dims.use}{a vector of indices that allows only selected cell embeddings 23 | features to be used.} 24 | 25 | \item{verbose}{enable verbosity} 26 | 27 | \item{reduction.save}{the name of the new slot that is going to be created by 28 | harmony. 
By default, HARMONY.} 29 | 30 | \item{...}{ 31 | Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}} 32 | \describe{ 33 | \item{\code{theta}}{Diversity clustering penalty parameter. Specify for 34 | each variable in vars_use Default theta=2. theta=0 does not 35 | encourage any diversity. Larger values of theta result in more 36 | diverse clusters.} 37 | \item{\code{sigma}}{Width of soft kmeans clusters. Default 38 | sigma=0.1. Sigma scales the distance from a cell to cluster 39 | centroids. Larger values of sigma result in cells assigned to 40 | more clusters. Smaller values of sigma make soft kmeans cluster 41 | approach hard clustering.} 42 | \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger 43 | values protect against over correction. If several covariates 44 | are specified, then lambda can also be a vector which needs to 45 | be equal length with the number of variables to be 46 | corrected. In this scenario, each covariate level group will be 47 | assigned the scalars specified by the user. If set to NULL, 48 | harmony will start lambda estimation mode to determine lambdas 49 | automatically and try to minimize overcorrection (Use with caution still 50 | in beta testing).} 51 | \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to 52 | simple linear regression.} 53 | \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round 54 | of Harmony involves one clustering and one correction step.} 55 | \item{\code{early_stop}}{Enable early stopping for harmony. The 56 | harmonization process will stop when the change of objective 57 | function between corrections drops below 1e-4} 58 | \item{\code{ncores}}{Number of processors to be used for math operations 59 | when optimized BLAS is available. If BLAS is not supporting 60 | multithreaded then this option has no effect. By default, 61 | ncore=1 which runs as a single-threaded process. 
Although 62 | Harmony supports multiple cores, it is not optimized for 63 | multithreading. Increase this number for large datasets iff 64 | single-core performance is not adequate.} 65 | \item{\code{plot_convergence}}{Whether to print the convergence plot of 66 | the clustering objective function. TRUE to plot, FALSE to 67 | suppress. This can be useful for debugging.} 68 | \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the 69 | result from a call to `harmony_options`. See ?`harmony_options` for 70 | parameters not listed above and more details.} 71 | }} 72 | } 73 | \value{ 74 | SingleCellExperiment object. After running RunHarmony, the corrected 75 | cell embeddings can be accessed with reducedDim(object, "HARMONY"), or with the name passed to reduction.save. 76 | } 77 | \description{ 78 | Applies harmony on PCA cell embeddings of a SingleCellExperiment. 79 | } 80 | \examples{ 81 | \dontrun{ 82 | ## sce is a SingleCellExperiment R object 83 | sce <- RunHarmony(sce, "donor_id") 84 | } 85 | } 86 | \seealso{ 87 | Other RunHarmony: 88 | \code{\link{RunHarmony.Seurat}()}, 89 | \code{\link{RunHarmony.default}()}, 90 | \code{\link{RunHarmony}()} 91 | } 92 | \concept{RunHarmony} 93 | -------------------------------------------------------------------------------- /man/RunHarmony.default.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ui.R 3 | \name{RunHarmony.default} 4 | \alias{RunHarmony.default} 5 | \title{This is the primary harmony interface.} 6 | \usage{ 7 | \method{RunHarmony}{default}( 8 | data_mat, 9 | meta_data, 10 | vars_use, 11 | theta = NULL, 12 | sigma = 0.1, 13 | lambda = 1, 14 | nclust = NULL, 15 | max_iter = 10, 16 | early_stop = TRUE, 17 | ncores = 1, 18 | plot_convergence = FALSE, 19 | return_object = FALSE, 20 | verbose = TRUE, 21 | .options = harmony_options(), 22 | ... 
23 | ) 24 | } 25 | \arguments{ 26 | \item{data_mat}{Matrix of cell embeddings. Cells can be rows or 27 | columns and will be inferred by the rows of meta_data.} 28 | 29 | \item{meta_data}{Either (1) Dataframe with variables to integrate 30 | or (2) vector with labels.} 31 | 32 | \item{vars_use}{If meta_data is dataframe, this defined which 33 | variable(s) to remove (character vector).} 34 | 35 | \item{theta}{Diversity clustering penalty parameter. Specify for 36 | each variable in vars_use Default theta=2. theta=0 does not 37 | encourage any diversity. Larger values of theta result in more 38 | diverse clusters.} 39 | 40 | \item{sigma}{Width of soft kmeans clusters. Default 41 | sigma=0.1. Sigma scales the distance from a cell to cluster 42 | centroids. Larger values of sigma result in cells assigned to 43 | more clusters. Smaller values of sigma make soft kmeans cluster 44 | approach hard clustering.} 45 | 46 | \item{lambda}{Ridge regression penalty. Default lambda=1. Bigger 47 | values protect against over correction. If several covariates 48 | are specified, then lambda can also be a vector which needs to 49 | be equal length with the number of variables to be 50 | corrected. In this scenario, each covariate level group will be 51 | assigned the scalars specified by the user. If set to NULL, 52 | harmony will start lambda estimation mode to determine lambdas 53 | automatically and try to minimize overcorrection (Use with caution still 54 | in beta testing).} 55 | 56 | \item{nclust}{Number of clusters in model. nclust=1 equivalent to 57 | simple linear regression.} 58 | 59 | \item{max_iter}{Maximum number of rounds to run Harmony. One round 60 | of Harmony involves one clustering and one correction step.} 61 | 62 | \item{early_stop}{Enable early stopping for harmony. 
The 63 | harmonization process will stop when the change of objective 64 | function between corrections drops below 1e-4} 65 | 66 | \item{ncores}{Number of processors to be used for math operations 67 | when optimized BLAS is available. If BLAS does not support 68 | multithreading, this option has no effect. By default, 69 | ncores=1, which runs as a single-threaded process. Although 70 | Harmony supports multiple cores, it is not optimized for 71 | multithreading. Increase this number for large datasets iff 72 | single-core performance is not adequate.} 73 | 74 | \item{plot_convergence}{Whether to print the convergence plot of 75 | the clustering objective function. TRUE to plot, FALSE to 76 | suppress. This can be useful for debugging.} 77 | 78 | \item{return_object}{(Advanced Usage) Whether to return the Harmony 79 | object or only the corrected PCA embeddings.} 80 | 81 | \item{verbose}{Whether to print progress messages. TRUE to print, 82 | FALSE to suppress.} 83 | 84 | \item{.options}{Setting advanced parameters of RunHarmony. This must be the 85 | result from a call to `harmony_options`. See ?`harmony_options` for 86 | parameters not listed above and more details.} 87 | 88 | \item{...}{other parameters that are not part of the API} 89 | } 90 | \value{ 91 | By default, matrix with corrected PCA embeddings. If 92 | return_object is TRUE, returns the full Harmony object (R6 93 | reference class type). 94 | } 95 | \description{ 96 | Use this generic with a cell embeddings matrix, a metadata table 97 | and a categorical covariate to run the Harmony algorithm directly 98 | on the cell embedding matrix. 
99 | } 100 | \examples{ 101 | 102 | 103 | ## By default, Harmony takes a cell embedding matrix as input 104 | \dontrun{ 105 | harmony_embeddings <- RunHarmony(cell_embeddings, meta_data, 'dataset') 106 | } 107 | 108 | ## If PCA is the input, the PCs need to be scaled 109 | data(cell_lines_small) 110 | pca_matrix <- cell_lines_small$scaled_pcs 111 | meta_data <- cell_lines_small$meta_data 112 | harmony_embeddings <- RunHarmony(pca_matrix, meta_data, 'dataset') 113 | 114 | ## Output is a matrix of corrected PC embeddings 115 | dim(harmony_embeddings) 116 | harmony_embeddings[seq_len(5), seq_len(5)] 117 | 118 | ## Finally, we can return an object with all the underlying data structures 119 | harmony_object <- RunHarmony(pca_matrix, meta_data, 'dataset', return_object=TRUE) 120 | dim(harmony_object$Y) ## cluster centroids 121 | dim(harmony_object$R) ## soft cluster assignment 122 | dim(harmony_object$Z_corr) ## corrected PCA embeddings 123 | head(harmony_object$O) ## batch by cluster co-occurrence matrix 124 | 125 | } 126 | \seealso{ 127 | Other RunHarmony: 128 | \code{\link{RunHarmony.Seurat}()}, 129 | \code{\link{RunHarmony.SingleCellExperiment}()}, 130 | \code{\link{RunHarmony}()} 131 | } 132 | \concept{RunHarmony} 133 | -------------------------------------------------------------------------------- /man/cell_lines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cell_lines} 5 | \alias{cell_lines} 6 | \title{List of metadata table and scaled PCs matrix} 7 | \format{ 8 | : 9 | meta_data: data.table of 9478 rows defining dataset and cell_type 10 | scaled_pcs: data.table of 9478 rows (cells) and 20 columns (PCs) 11 | } 12 | \source{ 13 | \url{https://www.10xgenomics.com} 14 | } 15 | \usage{ 16 | cell_lines 17 | } 18 | \description{ 19 | List of metadata table and scaled PCs matrix 20 | } 21 | \keyword{datasets} 22 | 
-------------------------------------------------------------------------------- /man/cell_lines_small.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cell_lines_small} 5 | \alias{cell_lines_small} 6 | \title{Same as cell_lines but smaller (300 cells).} 7 | \format{ 8 | An object of class \code{list} of length 2. 9 | } 10 | \source{ 11 | \url{https://www.10xgenomics.com} 12 | } 13 | \usage{ 14 | cell_lines_small 15 | } 16 | \description{ 17 | Same as cell_lines but smaller (300 cells). 18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/man/figures/logo.png -------------------------------------------------------------------------------- /man/harmony.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/harmony-package.r 3 | \docType{package} 4 | \name{harmony} 5 | \alias{harmony} 6 | \title{Harmony: fast, accurate, and robust single cell integration.} 7 | \description{ 8 | Algorithm for single cell integration. 9 | } 10 | \section{Usage}{ 11 | 12 | 13 | 14 | ?RunHarmony to run Harmony on cell embeddings matrix, Seurat or 15 | SingleCellExperiment objects. 
16 | } 17 | 18 | \section{Useful links}{ 19 | 20 | 21 | \enumerate{ 22 | \item Report bugs at \url{https://github.com/immunogenomics/harmony/issues} 23 | \item Read the manuscript 24 | \doi{10.1038/s41592-019-0619-0} 25 | } 26 | } 27 | 28 | -------------------------------------------------------------------------------- /man/harmony_options.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/harmony_option.R 3 | \name{harmony_options} 4 | \alias{harmony_options} 5 | \title{Set advanced parameters for RunHarmony} 6 | \usage{ 7 | harmony_options( 8 | alpha = 0.2, 9 | tau = 0, 10 | block.size = 0.05, 11 | max.iter.cluster = 20, 12 | epsilon.cluster = 0.001, 13 | epsilon.harmony = 0.01 14 | ) 15 | } 16 | \arguments{ 17 | \item{alpha}{When lambda = NULL (lambda estimation mode), 18 | lambda is determined by the expected number of cells assuming 19 | independence between batches and clusters, i.e., lambda = alpha * expected 20 | number of cells. Default 0.2; alpha should satisfy 0 < alpha < 1.} 21 | 22 | \item{tau}{Protection against overclustering small datasets with 23 | large ones. `tau` is the expected number of cells per cluster.} 24 | 25 | \item{block.size}{What proportion of cells to update during clustering. 26 | Between 0 and 1, default 0.05. Larger values may be faster but less 27 | accurate.} 28 | 29 | \item{max.iter.cluster}{Maximum number of rounds to run clustering 30 | at each round of Harmony.} 31 | 32 | \item{epsilon.cluster}{Convergence tolerance for clustering round 33 | of Harmony. Set to -Inf to never stop early.} 34 | 35 | \item{epsilon.harmony}{Convergence tolerance for Harmony. Set to -Inf to 36 | never stop early. 
When `epsilon.harmony` is not NULL, the 37 | user-supplied value of `early_stop` is ignored.} 38 | } 39 | \value{ 40 | Returns a list for the `.options` argument of `RunHarmony` 41 | } 42 | \description{ 43 | Set advanced parameters for RunHarmony 44 | } 45 | \examples{ 46 | ## To set max.iter.cluster to 100, do 47 | \dontrun{ 48 | RunHarmony(data_meta, meta_data, vars_use, 49 | .options = harmony_options(max.iter.cluster = 100)) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /man/moe_ridge_get_betas.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{moe_ridge_get_betas} 4 | \alias{moe_ridge_get_betas} 5 | \title{Get beta Utility} 6 | \usage{ 7 | moe_ridge_get_betas(harmonyObj) 8 | } 9 | \arguments{ 10 | \item{harmonyObj}{Trained harmony object. Get this by running 11 | RunHarmony function with return_object=TRUE.} 12 | } 13 | \value{ 14 | Returns nothing, modifies object in place. 15 | } 16 | \description{ 17 | Utility function to get ridge regression coefficients from trained 18 | Harmony object 19 | } 20 | -------------------------------------------------------------------------------- /man/pbmc.ctrl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{pbmc.ctrl} 5 | \alias{pbmc.ctrl} 6 | \title{Gene expression data of control PBMC from Kang et al. 2017. This 7 | contains a sample of 1000 cells from that condition and is used for 8 | the Seurat Vignette.} 9 | \format{ 10 | An object of class \code{dgCMatrix} with 9015 rows and 1000 columns. 
11 | } 12 | \source{ 13 | \doi{10.1038/nbt.4042} 14 | } 15 | \usage{ 16 | pbmc.ctrl 17 | } 18 | \description{ 19 | Gene expression data of control PBMC from Kang et al. 2017. This 20 | contains a sample of 1000 cells from that condition and is used for 21 | the Seurat Vignette. 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/pbmc.stim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{pbmc.stim} 5 | \alias{pbmc.stim} 6 | \title{Gene expression data of stimulated PBMC from Kang et al. 2017. This 7 | contains a sample of 1000 cells from that condition and is used for 8 | the Seurat Vignette.} 9 | \format{ 10 | An object of class \code{dgCMatrix} with 9015 rows and 1000 columns. 11 | } 12 | \source{ 13 | \doi{10.1038/nbt.4042} 14 | } 15 | \usage{ 16 | pbmc.stim 17 | } 18 | \description{ 19 | Gene expression data of stimulated PBMC from Kang et al. 2017. This 20 | contains a sample of 1000 cells from that condition and is used for 21 | the Seurat Vignette. 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \value{ 10 | return value of rhs function. 
11 | } 12 | \description{ 13 | Pipe operator 14 | } 15 | \examples{ 16 | x <- 5 \%>\% sum(10) 17 | 18 | } 19 | \keyword{internal} 20 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | .cache 5 | compile_commands.json 6 | MakefileBear -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 2 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_LIBS = $(shell $(R_HOME)/bin/Rscript.exe -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 2 | 3 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include "harmony_types.h" 5 | #include 6 | #include 7 | 8 | using namespace Rcpp; 9 | 10 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 11 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 12 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 13 | #endif 14 | 15 | // kmeans_centers 16 | arma::mat kmeans_centers(const arma::mat& X, const int K); 17 | RcppExport SEXP _harmony_kmeans_centers(SEXP XSEXP, SEXP KSEXP) { 18 | BEGIN_RCPP 19 | Rcpp::RObject rcpp_result_gen; 20 | Rcpp::RNGScope rcpp_rngScope_gen; 21 | Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP); 22 | Rcpp::traits::input_parameter< const int >::type K(KSEXP); 23 | rcpp_result_gen = Rcpp::wrap(kmeans_centers(X, K)); 24 | return rcpp_result_gen; 25 | END_RCPP 26 | 
} 27 | // scaleRows_dgc 28 | MATTYPE scaleRows_dgc(const VECTYPE& x, const VECTYPE& p, const VECTYPE& i, int ncol, int nrow, float thresh); 29 | RcppExport SEXP _harmony_scaleRows_dgc(SEXP xSEXP, SEXP pSEXP, SEXP iSEXP, SEXP ncolSEXP, SEXP nrowSEXP, SEXP threshSEXP) { 30 | BEGIN_RCPP 31 | Rcpp::RObject rcpp_result_gen; 32 | Rcpp::RNGScope rcpp_rngScope_gen; 33 | Rcpp::traits::input_parameter< const VECTYPE& >::type x(xSEXP); 34 | Rcpp::traits::input_parameter< const VECTYPE& >::type p(pSEXP); 35 | Rcpp::traits::input_parameter< const VECTYPE& >::type i(iSEXP); 36 | Rcpp::traits::input_parameter< int >::type ncol(ncolSEXP); 37 | Rcpp::traits::input_parameter< int >::type nrow(nrowSEXP); 38 | Rcpp::traits::input_parameter< float >::type thresh(threshSEXP); 39 | rcpp_result_gen = Rcpp::wrap(scaleRows_dgc(x, p, i, ncol, nrow, thresh)); 40 | return rcpp_result_gen; 41 | END_RCPP 42 | } 43 | // find_lambda_cpp 44 | arma::vec find_lambda_cpp(const float alpha, const arma::vec& cluster_E); 45 | RcppExport SEXP _harmony_find_lambda_cpp(SEXP alphaSEXP, SEXP cluster_ESEXP) { 46 | BEGIN_RCPP 47 | Rcpp::RObject rcpp_result_gen; 48 | Rcpp::RNGScope rcpp_rngScope_gen; 49 | Rcpp::traits::input_parameter< const float >::type alpha(alphaSEXP); 50 | Rcpp::traits::input_parameter< const arma::vec& >::type cluster_E(cluster_ESEXP); 51 | rcpp_result_gen = Rcpp::wrap(find_lambda_cpp(alpha, cluster_E)); 52 | return rcpp_result_gen; 53 | END_RCPP 54 | } 55 | 56 | RcppExport SEXP _rcpp_module_boot_harmony_module(); 57 | 58 | static const R_CallMethodDef CallEntries[] = { 59 | {"_harmony_kmeans_centers", (DL_FUNC) &_harmony_kmeans_centers, 2}, 60 | {"_harmony_scaleRows_dgc", (DL_FUNC) &_harmony_scaleRows_dgc, 6}, 61 | {"_harmony_find_lambda_cpp", (DL_FUNC) &_harmony_find_lambda_cpp, 2}, 62 | {"_rcpp_module_boot_harmony_module", (DL_FUNC) &_rcpp_module_boot_harmony_module, 0}, 63 | {NULL, NULL, 0} 64 | }; 65 | 66 | RcppExport void R_init_harmony(DllInfo *dll) { 67 | R_registerRoutines(dll, 
NULL, CallEntries, NULL, NULL); 68 | R_useDynamicSymbols(dll, FALSE); 69 | } 70 | -------------------------------------------------------------------------------- /src/harmony.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "harmony.h" 5 | #include "types.h" 6 | #include "utils.h" 7 | 8 | 9 | 10 | 11 | harmony::harmony() : 12 | window_size(3), 13 | ran_setup(false), 14 | ran_init(false), 15 | lambda_estimation(false), 16 | verbose(false) 17 | 18 | {} 19 | 20 | 21 | 22 | void harmony::setup(const MATTYPE& __Z, const arma::sp_mat& __Phi, 23 | const VECTYPE __sigma, const VECTYPE __theta, const VECTYPE __lambda, const float __alpha, const int __max_iter_kmeans, 24 | const float __epsilon_kmeans, const float __epsilon_harmony, 25 | const int __K, const float __block_size, 26 | const std::vector& __B_vec, const bool __verbose) { 27 | 28 | // Algorithm constants 29 | N = __Z.n_cols; 30 | B = __Phi.n_rows; 31 | d = __Z.n_rows; 32 | 33 | Z_orig = __Z; 34 | Z_cos = arma::normalise(__Z, 2, 0); 35 | Z_corr = zeros(size(Z_orig)); 36 | 37 | 38 | Phi = __Phi; 39 | Phi_t = Phi.t(); 40 | 41 | // Create index 42 | std::vectorcounters; 43 | arma::vec sizes(sum(Phi, 1)); 44 | // std::cout << sizes << std::endl; 45 | for (unsigned i = 0; i < sizes.n_elem; i++) { 46 | arma::uvec a(int(sizes(i))); 47 | index.push_back(a); 48 | counters.push_back(0); 49 | } 50 | 51 | arma::sp_mat::const_iterator it = Phi.begin(); 52 | arma::sp_mat::const_iterator it_end = Phi.end(); 53 | for(; it != it_end; ++it) 54 | { 55 | unsigned int row_idx = it.row(); 56 | unsigned int col_idx = it.col(); 57 | index[row_idx](counters[row_idx]++) = col_idx; 58 | } 59 | 60 | Pr_b = sum(Phi, 1) / N; 61 | 62 | 63 | epsilon_kmeans = __epsilon_kmeans; 64 | epsilon_harmony = __epsilon_harmony; 65 | 66 | // Hyperparameters 67 | K = __K; 68 | if (__lambda(0) == -1) { 69 | lambda_estimation = true; 70 | } else { 71 | lambda = __lambda; 72 | } 73 
| B_vec = __B_vec; 74 | sigma = __sigma; 75 | 76 | if(__Z.n_cols < 6) { 77 | std::string error_message = "Refusing to run with less than 6 cells"; 78 | Rcpp::stop(error_message); 79 | } else if (__Z.n_cols < 40) { 80 | Rcpp::warning("Too few cells. Setting block_size to 0.2"); 81 | block_size = 0.2; 82 | } else { 83 | block_size = __block_size; 84 | } 85 | theta = __theta; 86 | max_iter_kmeans = __max_iter_kmeans; 87 | 88 | verbose = __verbose; 89 | 90 | allocate_buffers(); 91 | ran_setup = true; 92 | 93 | alpha = __alpha; 94 | 95 | 96 | } 97 | 98 | 99 | void harmony::allocate_buffers() { 100 | // Zero-initialise the work matrices and build Phi_moe (a row of ones stacked on top of Phi) 101 | _scale_dist = zeros(K, N); 102 | dist_mat = zeros(K, N); 103 | O = E = zeros(K, B); 104 | 105 | // Hack: create matrix of ones by creating zeros and then add one! 106 | arma::sp_mat intcpt = zeros(1, N); 107 | intcpt = intcpt+1; 108 | 109 | Phi_moe = join_cols(intcpt, Phi); 110 | Phi_moe_t = Phi_moe.t(); 111 | 112 | 113 | W = zeros(B + 1, d); 114 | } 115 | 116 | 117 | void harmony::init_cluster_cpp() { 118 | // (1) INITIALIZE CENTROIDS from kmeans_centers on Z_cos 119 | Y = kmeans_centers(Z_cos, K).t(); 120 | 121 | // Cosine normalization of data centroids 122 | Y = arma::normalise(Y, 2, 0); 123 | 124 | // (2) ASSIGN CLUSTER PROBABILITIES 125 | // using a nice property of cosine distance, 126 | // compute squared distance directly with cross product 127 | dist_mat = 2 * (1 - Y.t() * Z_cos); 128 | 129 | R = -dist_mat; 130 | R.each_col() /= sigma; 131 | R = exp(R); 132 | R.each_row() /= sum(R, 0); 133 | 134 | 135 | // (3) BATCH DIVERSITY STATISTICS 136 | E = sum(R, 1) * Pr_b.t(); 137 | O = R * Phi_t; 138 | 139 | compute_objective(); 140 | objective_harmony.push_back(objective_kmeans.back()); 141 | 142 | dist_mat = 2 * (1 - Y.t() * Z_cos); // Z_cos was changed 143 | 144 | ran_init = true; 145 | 146 | } 147 | 148 | void harmony::compute_objective() { 149 | const float norm_const = 2000/((float)N); 150 | float kmeans_error = as_scalar(accu(R % dist_mat)); 151 | float _entropy = as_scalar(accu(safe_entropy(R).each_col() % sigma)); 
// NEW: vector sigma 152 | float _cross_entropy = as_scalar( 153 | accu((R.each_col() % sigma) % ((arma::repmat(theta.t(), K, 1) % log((O + E) / E)) * Phi))); 154 | 155 | // Push back the data 156 | objective_kmeans.push_back((kmeans_error + _entropy + _cross_entropy) * norm_const); 157 | objective_kmeans_dist.push_back(kmeans_error * norm_const); 158 | objective_kmeans_entropy.push_back(_entropy * norm_const); 159 | objective_kmeans_cross.push_back(_cross_entropy * norm_const); 160 | } 161 | 162 | // Returns true once the relative decrease of the objective drops below the tolerance (type 0: clustering rounds, type 1: harmony iterations). 163 | bool harmony::check_convergence(int type) { 164 | float obj_new, obj_old; 165 | switch (type) { 166 | case 0: 167 | // Clustering 168 | // sum the objective over the two trailing windows (the old window is shifted back by one round) 169 | obj_old = 0; 170 | obj_new = 0; 171 | for (unsigned i = 0; i < window_size; i++) { 172 | obj_old += objective_kmeans[objective_kmeans.size() - 2 - i]; 173 | obj_new += objective_kmeans[objective_kmeans.size() - 1 - i]; 174 | } 175 | if ((obj_old - obj_new) / abs(obj_old) < epsilon_kmeans) { 176 | return(true); 177 | } else { 178 | return(false); 179 | } 180 | case 1: 181 | // Harmony 182 | obj_old = objective_harmony[objective_harmony.size() - 2]; 183 | obj_new = objective_harmony[objective_harmony.size() - 1]; 184 | if ((obj_old - obj_new) / abs(obj_old) < epsilon_harmony) { 185 | return(true); 186 | } else { 187 | return(false); 188 | } 189 | } 190 | 191 | // gives warning if we don't give default return value 192 | return(true); 193 | } 194 | 195 | // Run up to max_iter_kmeans clustering rounds; returns 0 on completion, -1 when Progress::check_abort() fires, or the status of update_R(). 196 | int harmony::cluster_cpp() { 197 | int err_status = 0; 198 | Progress p(max_iter_kmeans, verbose); 199 | unsigned iter; 200 | 201 | // Z_cos has changed 202 | // R is assumed not to have changed 203 | // so update Y to match new integrated data 204 | for (iter = 0; iter < max_iter_kmeans; iter++) { 205 | 206 | p.increment(); 207 | if (Progress::check_abort()) 208 | return(-1); 209 | 210 | // STEP 1: Update Y (cluster centroids) 211 | Y = arma::normalise(Z_cos * R.t(), 2, 0); 212 | 213 | dist_mat = 2 * (1 - Y.t() * Z_cos); // Y was changed 214 | 215 | 
216 | // STEP 3: Update R 217 | err_status = update_R(); 218 | if (err_status != 0) { 219 | // Rcout << "Compute R failed. Exiting from clustering." << endl; 220 | return err_status; 221 | } 222 | 223 | // STEP 4: Check for convergence 224 | compute_objective(); 225 | 226 | if (iter > window_size) { 227 | bool convergence_status = check_convergence(0); 228 | if (convergence_status) { 229 | iter++; 230 | break; 231 | } 232 | } 233 | } 234 | // Record how many rounds ran and carry the last clustering objective into the harmony history 235 | kmeans_rounds.push_back(iter); 236 | objective_harmony.push_back(objective_kmeans.back()); 237 | return 0; 238 | } 239 | 240 | 241 | 242 | 243 | 244 | // Re-estimate R block by block in a random cell order, keeping the batch statistics O and E in sync; returns 0. 245 | int harmony::update_R() { 246 | 247 | // Generate the 0,N-1 indices 248 | uvec indices = linspace<uvec>(0, N - 1, N); 249 | update_order = shuffle(indices); 250 | 251 | // Inverse index 252 | uvec reverse_index(N, arma::fill::zeros); 253 | reverse_index.rows(update_order) = indices; 254 | 255 | _scale_dist = -dist_mat; // K x N 256 | _scale_dist.each_col() /= sigma; // NEW: vector sigma 257 | _scale_dist = exp(_scale_dist); 258 | _scale_dist = arma::normalise(_scale_dist, 1, 0); 259 | 260 | // GENERAL CASE: online updates, in blocks of size (N * block_size) 261 | unsigned n_blocks = (int)(my_ceil(1.0 / block_size)); 262 | unsigned cells_per_block = unsigned(N * block_size); 263 | // NOTE(review): cells_per_block is 0 when N * block_size < 1, and idx_max below then wraps around for every non-final block - confirm block_size is bounded upstream 264 | // Allocate new matrices 265 | MATTYPE R_randomized = R.cols(update_order); 266 | arma::sp_mat Phi_randomized(Phi.cols(update_order)); 267 | arma::sp_mat Phi_t_randomized(Phi_randomized.t()); 268 | MATTYPE _scale_dist_randomized = _scale_dist.cols(update_order); 269 | 270 | for (unsigned i = 0; i < n_blocks; i++) { 271 | unsigned idx_min = i*cells_per_block; 272 | unsigned idx_max = ((i+1) * cells_per_block) - 1; // - 1 because of submat 273 | if (i == n_blocks-1) { 274 | // we are in the last block, so include everything. Up to 19 275 | // extra cells. 
276 | idx_max = N - 1; 277 | } 278 | // auto keeps the submat() results as views, so the assignments to Rcells below write into R_randomized 279 | auto Rcells = R_randomized.submat(0, idx_min, R_randomized.n_rows - 1, idx_max); 280 | auto Phicells = Phi_randomized.submat(0, idx_min, Phi_randomized.n_rows - 1, idx_max); 281 | auto Phi_tcells = Phi_t_randomized.submat(idx_min, 0, idx_max, Phi_t_randomized.n_cols - 1); 282 | auto _scale_distcells = _scale_dist_randomized.submat(0, idx_min, _scale_dist_randomized.n_rows - 1, idx_max); 283 | 284 | // Step 1: remove cells 285 | E -= sum(Rcells, 1) * Pr_b.t(); 286 | O -= Rcells * Phi_tcells; 287 | 288 | // Step 2: recompute R for removed cells 289 | Rcells = _scale_distcells; 290 | Rcells = Rcells % (harmony_pow(E/(O + E), theta) * Phicells); 291 | Rcells = normalise(Rcells, 1, 0); // L1 norm columns 292 | 293 | // Step 3: put cells back 294 | E += sum(Rcells, 1) * Pr_b.t(); 295 | O += Rcells * Phi_tcells; 296 | } 297 | this->R = R_randomized.cols(reverse_index); 298 | return 0; 299 | } 300 | 301 | // Mixture-of-experts ridge correction: for each cluster k, fit ridge coefficients W on Z_orig weighted by R.row(k) and subtract the batch terms from Z_corr. 302 | void harmony::moe_correct_ridge_cpp() { 303 | 304 | arma::sp_mat _Rk(N, N); 305 | arma::sp_mat lambda_mat(B + 1, B + 1); 306 | 307 | if(!lambda_estimation) { 308 | // Set lambda if we have to 309 | lambda_mat.diag() = lambda; 310 | } 311 | Z_corr = Z_orig; 312 | Progress p(K, verbose); 313 | for (unsigned k = 0; k < K; k++) { 314 | p.increment(); 315 | if (Progress::check_abort()) 316 | return; 317 | if (lambda_estimation) { 318 | lambda_mat.diag() = find_lambda_cpp(alpha, E.row(k).t()); 319 | } 320 | _Rk.diag() = R.row(k); 321 | arma::sp_mat Phi_Rk = Phi_moe * _Rk; 322 | 323 | arma::mat inv_cov(arma::inv(arma::mat(Phi_Rk * Phi_moe_t + lambda_mat))); 324 | 325 | // Calculate R-scaled PCs once 326 | arma::mat Z_tmp = Z_orig.each_row() % R.row(k); 327 | 328 | // Generate the betas contribution of the intercept using the data 329 | // This erases whatever was written before in W 330 | W = inv_cov.unsafe_col(0) * sum(Z_tmp, 1).t(); 331 | 332 | // Calculate betas by calculating each batch contribution 333 | for(unsigned b=0; b < 
B; b++) { 334 | // inv_cov is B+1xB+1 whereas index is B long 335 | W += inv_cov.unsafe_col(b+1) * sum(Z_tmp.cols(index[b]), 1).t(); 336 | } 337 | 338 | W.row(0).zeros(); // do not remove the intercept 339 | Z_corr -= W.t() * Phi_Rk; 340 | } 341 | Z_cos = arma::normalise(Z_corr, 2, 0); 342 | } 343 | // Returns the ridge coefficients of every cluster as a (B+1) x d x K cube; Z_corr is left untouched. 344 | CUBETYPE harmony::moe_ridge_get_betas_cpp() { 345 | CUBETYPE W_cube(B+1, d, K); // rows, cols, slices 346 | 347 | arma::sp_mat _Rk(N, N); 348 | arma::sp_mat lambda_mat(B + 1, B + 1); 349 | 350 | if (!lambda_estimation) { 351 | // Set lambda if we have to 352 | lambda_mat.diag() = lambda; 353 | } 354 | 355 | for (unsigned k = 0; k < K; k++) { 356 | _Rk.diag() = R.row(k); 357 | if (lambda_estimation){ 358 | lambda_mat.diag() = find_lambda_cpp(alpha, E.row(k).t()); 359 | } 360 | arma::sp_mat Phi_Rk = Phi_moe * _Rk; 361 | W_cube.slice(k) = arma::inv(arma::mat(Phi_Rk * Phi_moe_t + lambda_mat)) * Phi_Rk * Z_orig.t(); 362 | } 363 | 364 | return W_cube; 365 | } 366 | // Expose the harmony class, its fields and its methods to R through an Rcpp module. 367 | RCPP_MODULE(harmony_module) { 368 | class_<harmony>("harmony") 369 | .constructor() 370 | .field("Z_corr", &harmony::Z_corr) 371 | .field("Z_cos", &harmony::Z_cos) 372 | .field("Z_orig", &harmony::Z_orig) 373 | .field("Phi", &harmony::Phi) 374 | .field("Phi_moe", &harmony::Phi_moe) 375 | .field("N", &harmony::N) 376 | .field("B", &harmony::B) 377 | .field("K", &harmony::K) 378 | .field("d", &harmony::d) 379 | .field("O", &harmony::O) 380 | .field("E", &harmony::E) 381 | .field("Y", &harmony::Y) 382 | .field("Pr_b", &harmony::Pr_b) 383 | .field("W", &harmony::W) 384 | .field("R", &harmony::R) 385 | .field("theta", &harmony::theta) 386 | .field("sigma", &harmony::sigma) 387 | .field("lambda", &harmony::lambda) 388 | .field("kmeans_rounds", &harmony::kmeans_rounds) 389 | .field("objective_kmeans", &harmony::objective_kmeans) 390 | .field("objective_kmeans_dist", &harmony::objective_kmeans_dist) 391 | .field("objective_kmeans_entropy", &harmony::objective_kmeans_entropy) 392 | .field("objective_kmeans_cross", 
&harmony::objective_kmeans_cross) 393 | .field("objective_harmony", &harmony::objective_harmony) 394 | .field("max_iter_kmeans", &harmony::max_iter_kmeans) 395 | .method("check_convergence", &harmony::check_convergence) 396 | .method("setup", &harmony::setup) 397 | .method("compute_objective", &harmony::compute_objective) 398 | .method("init_cluster_cpp", &harmony::init_cluster_cpp) 399 | .method("cluster_cpp", &harmony::cluster_cpp) 400 | .method("moe_correct_ridge_cpp", &harmony::moe_correct_ridge_cpp) 401 | .method("moe_ridge_get_betas_cpp", &harmony::moe_ridge_get_betas_cpp) 402 | .field("B_vec", &harmony::B_vec) 403 | .field("alpha", &harmony::alpha) 404 | ; 405 | } 406 | -------------------------------------------------------------------------------- /src/harmony.h: -------------------------------------------------------------------------------- 1 | #include "types.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace Rcpp; 9 | using namespace arma; 10 | // [[Rcpp::depends(RcppArmadillo)]] 11 | // [[Rcpp::depends(RcppProgress)]] 12 | 13 | using namespace std; 14 | 15 | class harmony; 16 | RCPP_EXPOSED_CLASS(harmony) 17 | 18 | 19 | #include "harmony_types.h" 20 | // Holds the inputs, the soft-clustering state and the work buffers of one Harmony run. 21 | class harmony { 22 | public: 23 | 24 | harmony(); 25 | // Store the inputs and options, then allocate the work buffers. 26 | void setup(const MATTYPE& __Z, const arma::sp_mat& __Phi, 27 | const VECTYPE __sigma, const VECTYPE __theta, 28 | const VECTYPE __lambda, const float __alpha, const int __max_iter_kmeans, 29 | const float __epsilon_kmeans, const float __epsilon_harmony, 30 | const int __K, const float __block_size, 31 | const vector& __B_vec, const bool __verbose); 32 | 33 | /* METHODS */ 34 | void moe_correct_ridge_cpp(); 35 | CUBETYPE moe_ridge_get_betas_cpp(); 36 | int cluster_cpp(); 37 | 38 | void init_cluster_cpp(); 39 | void allocate_buffers(); 40 | void compute_objective(); 41 | int update_R(); 42 | bool check_convergence(int type); 43 | void setY(const MATTYPE& Z); 44 | 45 | /* FIELDS */ 46 | MATTYPE R, 
Z_orig, Z_corr, Z_cos, Y; 47 | arma::sp_mat Phi, Phi_moe, Phi_moe_t, Phi_t, Rk; 48 | VECTYPE Pr_b, theta, N_b, sigma, lambda; 49 | 50 | // auxiliary data structures 51 | vector objective_kmeans, objective_kmeans_dist, objective_kmeans_entropy, objective_kmeans_cross, objective_harmony; 52 | vector kmeans_rounds, B_vec; // OLD: Kb 53 | std::vectorindex; 54 | 55 | float block_size, epsilon_kmeans, epsilon_harmony, alpha; 56 | unsigned int N, K, B, d, max_iter_kmeans, window_size; 57 | 58 | // buffers 59 | MATTYPE W, _scale_dist, dist_mat, O, E, dir_prior; // N_k, N_kb, N_b, numerator, denominator, C; 60 | uvec update_order, cells_update; 61 | 62 | 63 | // flags 64 | bool ran_setup, ran_init, lambda_estimation, verbose; // do_merge_R; 65 | 66 | }; 67 | 68 | 69 | -------------------------------------------------------------------------------- /src/harmony_types.h: -------------------------------------------------------------------------------- 1 | #include "types.h" 2 | 3 | -------------------------------------------------------------------------------- /src/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #define ARMA_64BIT_WORD 3 | #include 4 | 5 | typedef arma::mat MATTYPE; 6 | typedef arma::vec VECTYPE; 7 | typedef arma::rowvec ROWVECTYPE; 8 | typedef arma::cube CUBETYPE; 9 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "types.h" 3 | // Calls R's stats::kmeans on the transposed X (25 iterations max, 10 starts) and returns the "centers" element of its result. 4 | //[[Rcpp::export]] 5 | arma::mat kmeans_centers(const arma::mat& X, const int K) { 6 | 7 | // Environment 8 | Rcpp::Environment stats_env("package:stats"); 9 | // Cast function as callable from C++ 10 | Rcpp::Function kmeans = stats_env["kmeans"]; 11 | // Call the function and receive its list output 12 | Rcpp::List res = kmeans(Rcpp::_["x"] = X.t(), 13 | Rcpp::_["centers"] = K, 14 | Rcpp::_["iter.max"] = 
25, 15 | Rcpp::_["nstart"] = 10 16 | ); 17 | return res["centers"]; 18 | } 19 | 20 | // Element-wise X * log(X), with non-finite results (e.g. from X == 0) replaced by 0. 21 | MATTYPE safe_entropy(const MATTYPE& X) { 22 | MATTYPE A = X % log(X); 23 | A.elem(find_nonfinite(A)).zeros(); 24 | return(A); 25 | } 26 | 27 | // Overload pow to work on a matrix and vector: column c of A is raised to the power T(c) 28 | MATTYPE harmony_pow(MATTYPE A, const VECTYPE& T) { 29 | 30 | for (unsigned c = 0; c < A.n_cols; c++) { 31 | A.unsafe_col(c) = pow(A.unsafe_col(c), as_scalar(T.row(c))); 32 | } 33 | return(A); 34 | } 35 | // norm() of every column of M. 36 | VECTYPE calculate_norm(const MATTYPE& M) { 37 | VECTYPE x(M.n_cols); 38 | for(unsigned i = 0; i < M.n_cols; i++){ 39 | x(i) = norm(M.col(i)); 40 | } 41 | return x; 42 | } 43 | 44 | // Integer ceiling: (int) truncates toward zero, which is already the ceiling for negative or whole inputs. 45 | //https://stackoverflow.com/questions/8377412/ceil-function-how-can-we-implement-it-ourselves 46 | int my_ceil(float num) { 47 | int inum = (int)num; 48 | if (num == (float)inum || num < 0) { 49 | return inum; 50 | } 51 | return inum + 1; 52 | } 53 | 54 | // Row-wise z-scoring of a column-compressed sparse matrix given as (x, p, i); scaled values are clipped to [-thresh, thresh]. 55 | // [[Rcpp::export]] 56 | MATTYPE scaleRows_dgc(const VECTYPE& x, const VECTYPE& p, const VECTYPE& i, int ncol, int nrow, float thresh) { 57 | 58 | // (0) fill in non-zero elements 59 | MATTYPE res = arma::zeros(nrow, ncol); 60 | for (int c = 0; c < ncol; c++) { 61 | for (int j = p[c]; j < p[c + 1]; j++) { 62 | res(i[j], c) = x(j); 63 | } 64 | } 65 | 66 | // (1) compute means 67 | VECTYPE mean_vec = arma::zeros(nrow); 68 | for (int c = 0; c < ncol; c++) { 69 | for (int j = p[c]; j < p[c + 1]; j++) { 70 | mean_vec(i[j]) += x[j]; 71 | } 72 | } 73 | mean_vec /= ncol; 74 | 75 | // (2) compute SDs 76 | VECTYPE sd_vec = arma::zeros(nrow); 77 | arma::uvec nz = arma::zeros<arma::uvec>(nrow); 78 | nz.fill(ncol); 79 | for (int c = 0; c < ncol; c++) { 80 | for (int j = p[c]; j < p[c + 1]; j++) { 81 | sd_vec(i[j]) += (x[j] - mean_vec(i[j])) * (x[j] - mean_vec(i[j])); // (x - mu)^2 82 | nz(i[j])--; 83 | } 84 | } 85 | 86 | // account for the zeros: each implicit zero contributes (0 - mu)^2 87 | for (int r = 0; r < nrow; r++) { 88 | sd_vec(r) += nz(r) * mean_vec(r) * mean_vec(r); 89 | } 90 | 91 | sd_vec = 
arma::sqrt(sd_vec / (ncol - 1)); 92 | 93 | // (3) scale values 94 | res.each_col() -= mean_vec; 95 | res.each_col() /= sd_vec; 96 | res.elem(find(res > thresh)).fill(thresh); 97 | res.elem(find(res < -thresh)).fill(-thresh); 98 | return res; 99 | } 100 | 101 | // Ridge penalty diagonal for one cluster: 0 for the intercept, followed by alpha * cluster_E for each batch. 102 | // [[Rcpp::export]] 103 | arma::vec find_lambda_cpp(const float alpha, const arma::vec& cluster_E) { 104 | arma::vec lambda_dym_vec(cluster_E.n_rows + 1, arma::fill::zeros); 105 | lambda_dym_vec.subvec(1, lambda_dym_vec.n_rows - 1) = cluster_E * alpha; 106 | return lambda_dym_vec; 107 | } 108 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "types.h" 3 | #include 4 | 5 | arma::mat kmeans_centers(const arma::mat& X, const int K); 6 | 7 | MATTYPE safe_entropy(const MATTYPE& X); 8 | 9 | MATTYPE harmony_pow(MATTYPE A, const VECTYPE& T); 10 | 11 | VECTYPE calculate_norm(const MATTYPE& M); 12 | 13 | 14 | int my_ceil(float num); 15 | 16 | 17 | arma::vec find_lambda_cpp(const float alpha, const arma::vec& cluster_E); 18 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(harmony) 3 | 4 | test_check("harmony") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_integration.R: -------------------------------------------------------------------------------- 1 | context('Test main Harmony integration function: RunHarmony') 2 | library(harmony) 3 | data(cell_lines_small) 4 | # Shared fixture: one short Harmony run whose returned object is inspected by the tests in this file. 5 | obj <- RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'dataset', 6 | theta = 1, nclust = 50, lambda = .1, max_iter = 5, return_object = TRUE, 7 | verbose = FALSE, .options = harmony_options(max.iter.cluster = 10)) 8 | 9 | test_that('dimensions match 
in Harmony object data structures', { 10 | expect_equal(dim(obj$Y), c(obj$d, obj$K)) 11 | expect_equal(dim(obj$Z_corr), c(obj$d, obj$N)) 12 | expect_equal(dim(obj$Z_cos), c(obj$d, obj$N)) 13 | expect_equal(dim(obj$R), c(obj$K, obj$N)) 14 | }) 15 | # Every column of R (one per cell) must be a probability vector over the K clusters. 16 | test_that('R defines proper probability distributions', { 17 | expect_gte(min(obj$R), 0) 18 | expect_lte(max(obj$R), 1) 19 | expect_equal(colSums(obj$R), rep(1, obj$N)) 20 | }) 21 | # Neither NA/NaN nor +/-Inf may appear in the corrected or the cosine-normalized embedding. 22 | test_that('there are no null values in the corrected embedding', { 23 | expect_true(all(!is.infinite(obj$Z_corr))) 24 | expect_true(all(!is.na(obj$Z_corr))) 25 | expect_true(all(!is.infinite(obj$Z_cos))) 26 | expect_true(all(!is.na(obj$Z_cos))) 27 | }) 28 | 29 | # Compare an otherwise identical run with theta = 0 against one with theta = 1. 30 | test_that('increasing theta decreases chi2 between Cluster and Batch assign', { 31 | obj0 <- RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'dataset', 32 | theta = 0, nclust = 20, lambda = .1, max_iter = 2, return_object = TRUE, 33 | verbose = FALSE, .options = harmony_options(max.iter.cluster = 5)) 34 | obj1 <- RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'dataset', 35 | theta = 1, nclust = 20, lambda = .1, max_iter = 2, return_object = TRUE, 36 | verbose = FALSE, .options = harmony_options(max.iter.cluster = 5)) 37 | # chi-squared statistic of the observed (O) versus expected (E) cluster-by-batch totals 38 | expect_gt( 39 | sum(((obj0$O - obj0$E) ^ 2) / obj0$E), 40 | sum(((obj1$O - obj1$E) ^ 2) / obj1$E) 41 | ) 42 | }) 43 | 44 | test_that('error messages work', { 45 | expect_error( 46 | RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'fake_variable') 47 | ) 48 | # presumably lambda must be a scalar or match the number of batches - verify against RunHarmony 49 | expect_error( 50 | RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'dataset', lambda = c(1,2)) 51 | ) 52 | # meta_data with its last row dropped, so it no longer has one row per cell 53 | expect_error( 54 | RunHarmony(cell_lines_small$scaled_pcs, head(cell_lines_small$meta_data, -1), 'dataset') 55 | ) 56 | 57 | }) 58 | -------------------------------------------------------------------------------- /vignettes/.gitignore: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /vignettes/Seurat.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using harmony in Seurat" 3 | output: 4 | rmarkdown::html_vignette: 5 | code_folding: show 6 | vignette: > 7 | %\VignetteIndexEntry{Using harmony in Seurat} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | ```{r setup, message=FALSE, warning=FALSE} 20 | library(harmony) 21 | library(Seurat) 22 | library(dplyr) 23 | library(cowplot) 24 | 25 | ``` 26 | # Introduction 27 | 28 | This tutorial describes how to use harmony in Seurat v5 single-cell analysis workflows. `RunHarmony()` is a generic function designed to interact with Seurat objects. This vignette will walk through the basic workflow of Harmony with Seurat objects. Also, it will provide some basic downstream analyses demonstrating the properties of harmonized cell embeddings and a brief explanation of the exposed algorithm parameters. 29 | 30 | Install Harmony from CRAN with standard commands. 31 | 32 | ```{r eval=FALSE} 33 | install.packages('harmony') 34 | ``` 35 | 36 | # Generating the dataset 37 | 38 | For this demo, we will be aligning two groups of PBMCs [Kang et al., 2017](https://doi.org/10.1038/nbt.4042). In this experiment, PBMCs are in stimulated and control conditions. The stimulated PBMC group was treated with interferon beta. 
39 | 40 | 41 | ``` 42 | 43 | 44 | ## Generate SeuratObject 45 | 46 | ```{r} 47 | ## Source required data 48 | data("pbmc_stim") 49 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim, pbmc.ctrl), project = "PBMC", min.cells = 5) 50 | 51 | ## Separate conditions 52 | 53 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim)), rep("CTRL", ncol(pbmc.ctrl))) 54 | ``` 55 | 56 | 57 | ## (Optional) Download original data 58 | The example above contains only two thousand cells. The full [Kang et al., 2017](https://doi.org/10.1038/nbt.4042) dataset is deposited in the [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE96583). This analysis uses GSM2560248 and GSM2560249 samples from [GSE96583_RAW.tar](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file) file and the [GSE96583_batch2.genes.tsv.gz](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file&file=GSE96583%5Fbatch2%2Egenes%2Etsv%2Egz) gene file. 59 | 60 | ```{r eval = FALSE, class.source='fold-hide'} 61 | library(Matrix) 62 | ## Download and extract files from GEO 63 | ##setwd("/path/to/downloaded/files") 64 | genes <- read.table("GSE96583_batch2.genes.tsv.gz", header = FALSE, sep = "\t") 65 | ## Control sample: count matrix, barcodes (suffix "-1") and gene names 66 | pbmc.ctrl.full <- readMM("GSM2560248_2.1.mtx.gz") 67 | colnames(pbmc.ctrl.full) <- paste0(read.table("GSM2560248_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-1") 68 | rownames(pbmc.ctrl.full) <- genes$V1 69 | ## Stimulated sample: count matrix, barcodes (suffix "-2") and gene names 70 | pbmc.stim.full <- readMM("GSM2560249_2.2.mtx.gz") 71 | colnames(pbmc.stim.full) <- paste0(read.table("GSM2560249_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-2") 72 | rownames(pbmc.stim.full) <- genes$V1 73 | 74 | library(Seurat) 75 | 76 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim.full, pbmc.ctrl.full), project = "PBMC", min.cells = 5) 77 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim.full)), rep("CTRL", ncol(pbmc.ctrl.full))) 78 | ``` 79 | 80 | 81 | 82 | # Running Harmony 83 | 84 | Harmony works on an existing matrix with cell embeddings and 
outputs its transformed version with the datasets aligned according to some user-defined experimental conditions. By default, harmony will look up the `pca` cell embeddings and use these to run harmony. Therefore, it assumes that the Seurat object has these embeddings already precomputed. 85 | 86 | ## Calculate PCA cell embeddings 87 | 88 | Here, after normalizing with `Seurat::NormalizeData()`, we will be generating a union of the highly variable genes found in each condition (the control and stimulated cells). These features are going to be subsequently used to generate the 20 PCs with `Seurat::RunPCA()`. 89 | 90 | ```{r} 91 | pbmc <- pbmc %>% 92 | NormalizeData(verbose = FALSE) 93 | 94 | VariableFeatures(pbmc) <- split(row.names(pbmc@meta.data), pbmc@meta.data$stim) %>% lapply(function(cells_use) { 95 | pbmc[,cells_use] %>% 96 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 97 | VariableFeatures() 98 | }) %>% unlist %>% unique 99 | 100 | pbmc <- pbmc %>% 101 | ScaleData(verbose = FALSE) %>% 102 | RunPCA(features = VariableFeatures(pbmc), npcs = 20, verbose = FALSE) 103 | ``` 104 | 105 | ## Perform an integrated analysis 106 | 107 | To run harmony on a Seurat object after it has been normalized, only one argument needs to be specified: the name of the batch covariate located in the metadata. For this vignette, further parameters are specified to align the dataset but the minimum parameters are shown in the snippet below: 108 | 109 | ```{r, eval=FALSE} 110 | ## run harmony with default parameters 111 | pbmc <- pbmc %>% RunHarmony("stim") 112 | ## is equivalent to: 113 | pbmc <- RunHarmony(pbmc, "stim") 114 | ``` 115 | 116 | Here, we will be running harmony with some indicative parameters and plotting the convergence plot to illustrate some of the under-the-hood functionality. 
117 | 118 | ```{r, fig.width = 4, fig.height = 3, fig.align = "center", out.width="50%", fig.cap="By setting `plot_convergence=TRUE`, harmony will generate a plot with its objective showing the flow of the integration. Each point represents the cost measured after a clustering round. Different colors represent different Harmony iterations, whose number is controlled by `max_iter` (assuming that early_stop=FALSE). Here `max_iter=10` and up to 10 correction steps are expected. However, `early_stop=TRUE` so harmony will stop after the cost plateaus."} 119 | 120 | pbmc <- pbmc %>% 121 | RunHarmony("stim", plot_convergence = TRUE, nclust = 50, max_iter = 10, early_stop = T) 122 | ``` 123 | 124 | 125 | 126 | ### Harmony API parameters on Seurat objects 127 | 128 | `RunHarmony` has several parameters accessible to users which are outlined below. 129 | 130 | #### `object` (required) 131 | 132 | The Seurat object. This vignette assumes Seurat objects are version 5. 133 | 134 | #### `group.by.vars` (required) 135 | 136 | A character vector that specifies all the experimental covariates to be corrected/harmonized by the algorithm. 137 | 138 | When using `RunHarmony()` with Seurat, harmony will look up the `group.by.vars` metadata fields in the Seurat Object metadata. 139 | 140 | For example, given that `pbmc[["stim"]]` exists as the stim condition, setting `group.by.vars="stim"` will perform integration of these samples accordingly. If you want to integrate on another variable, it needs to be present in the Seurat object's meta.data. 141 | 142 | To correct for several covariates, specify them in a vector: `group.by.vars = c("stim", "new_covariate")`. 143 | 144 | #### `reduction.use` 145 | 146 | The cell embeddings to be used for the batch alignment. This parameter assumes that a reduced dimension already exists in the reduction slot of the Seurat object. By default, the `pca` reduction is used. 
147 | 148 | 149 | #### `dims.use` 150 | 151 | Optional parameter which can use a name vector to select specific dimensions to be harmonized. 152 | 153 | 154 | ### Algorithm parameters 155 | ![Harmony Algorithm Overview](main.jpg){width=100%} 156 | 157 | #### `nclust` 158 | 159 | `nclust` is a positive integer. Under the hood, harmony applies k-means soft-clustering. For this task, `k` needs to be determined. `nclust` corresponds to `k`. The harmonization results and performance are not particularly sensitive to this parameter within a reasonable range of values. If this parameter is not set, harmony will autodetermine this based on the dataset size with a maximum cap of 200. For datasets with a large number of different cell types and batches this parameter may need to be determined manually. 160 | 161 | #### `sigma` 162 | 163 | `sigma` is a positive scalar that controls the soft clustering probability assignment of single-cells to different clusters. Larger values will assign a larger probability to distant clusters of cells resulting in a different correction profile. Single-cells are assigned to clusters by their squared Euclidean distance $d$ to some cluster center $Y$ after cosine normalization, which is defined in the range [0,4]. The clustering probability of each cell is proportional to $e^{-\frac{d}{\sigma}}$ where $\sigma$ is controlled by the `sigma` parameter. Default value of `sigma` is 0.1 and it generally works well since it defines probability assignment of a cell in the range $[e^{-40}, e^0]$. Larger values of `sigma` restrict the dynamic range of probabilities that can be assigned to cells. For example, `sigma=1` will yield probabilities in the range of $[e^{-4}, e^0]$. 164 | 165 | 166 | #### `theta` 167 | 168 | `theta` is a non-negative scalar (or a vector with one value per covariate) that determines the coefficient of harmony's diversity penalty for each corrected experimental covariate. In challenging experimental conditions, increasing theta may result in better integration results. 
Theta is an exponential parameter of the diversity penalty, thus setting `theta=0` disables this penalty while increasing it to values greater than 1 will perform more aggressive corrections in an exponential manner. By default, it will set `theta=2` for each experimental covariate. 169 | 170 | #### `max_iter` 171 | 172 | The number of correction steps harmony will perform before completing the data set integration. In general, more iterations than necessary increase computational runtime, which becomes especially evident in bigger datasets. Setting `early_stop=TRUE` may reduce the actual number of correction steps, which will then be smaller than `max_iter`. 173 | 174 | #### `early_stop` 175 | 176 | Under the hood, harmony minimizes its objective function through a series of clustering and integration steps. By setting `early_stop=TRUE`, harmony exits before reaching the `max_iter` correction steps when the relative decrease of the objective function after a correction step is less than `1e-4`. This parameter can drastically reduce run-time in bigger datasets. 177 | 178 | #### `.options` 179 | A set of internal algorithm parameters that can be overridden. For advanced users only. 180 | 181 | 182 | 183 | ### Seurat specific parameters 184 | 185 | These parameters are Seurat-specific and do not affect the flow of the algorithm. 186 | 187 | #### `project_dim` 188 | 189 | Toggle-like parameter, by default `project_dim=TRUE`. When enabled, `RunHarmony()` calculates genomic feature loadings using Seurat's `ProjectDim()` that correspond to the harmonized cell embeddings. 190 | 191 | #### `reduction.save` 192 | 193 | The new Reduced Dimension slot identifier. By default, `reduction.save="harmony"`. This option allows several independent runs of harmony to be retained in the appropriate slots in the SeuratObjects. It is useful if you want to try Harmony with multiple parameters and save them as e.g. 'harmony_theta0', 'harmony_theta1', 'harmony_theta2'. 
194 | 195 | ### Miscellaneous parameters 196 | 197 | These parameters help users troubleshoot harmony. 198 | 199 | #### `plot_convergence` 200 | 201 | Option that draws the convergence plot after the execution of the algorithm. By default `FALSE`. Setting it to `TRUE` will collect harmony's objective value and plot it, allowing the user to troubleshoot the flow of the algorithm and fine-tune the parameters of the dataset integration procedure. 202 | 203 | 204 | 205 | ### Accessing the data 206 | 207 | `RunHarmony()` returns the Seurat object which contains the harmonized cell embeddings in a slot named **harmony**. This entry can be accessed via `pbmc@reductions$harmony`. To access the values of the cell embeddings we can also use: 208 | 209 | ```{r} 210 | harmony.embeddings <- Embeddings(pbmc, reduction = "harmony") 211 | ``` 212 | 213 | ### Inspection of the modalities 214 | 215 | After Harmony integration, we should inspect the quality of the harmonization and contrast it with the unharmonized algorithm input. Ideally, cells from different conditions will align along the Harmonized PCs. If they do not, you could increase the *theta* value above to force a more aggressive fit of the dataset and rerun the workflow. 
216 | 217 | ```{r, fig.width=7, fig.height=3, out.width="100%", fig.align="center", fig.cap="Evaluate harmonization of stim parameter in the harmony generated cell embeddings"} 218 | 219 | p1 <- DimPlot(object = pbmc, reduction = "harmony", pt.size = .1, group.by = "stim") 220 | p2 <- VlnPlot(object = pbmc, features = "harmony_1", group.by = "stim", pt.size = .1) 221 | plot_grid(p1,p2) 222 | ``` 223 | 224 | Plot Genes correlated with the Harmonized PCs 225 | 226 | ```{r, fig.width = 6, fig.height=3, out.width="100%"} 227 | 228 | DimHeatmap(object = pbmc, reduction = "harmony", cells = 500, dims = 1:3) 229 | ``` 230 | 231 | # Using harmony embeddings for dimensionality reduction in Seurat 232 | 233 | The harmonized cell embeddings generated by harmony can be used for further integrated analyses. In this workflow, every Seurat method that needs a dimensional reduction is given the harmony embeddings by name (`reduction = "harmony"`). 234 | 235 | ## Perform clustering using the harmonized vectors of cells 236 | ```{r} 237 | pbmc <- pbmc %>% 238 | FindNeighbors(reduction = "harmony") %>% 239 | FindClusters(resolution = 0.5) 240 | ``` 241 | ## TSNE dimensionality reduction 242 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="t-SNE Visualization of harmony embeddings"} 243 | pbmc <- pbmc %>% 244 | RunTSNE(reduction = "harmony") 245 | 246 | 247 | p1 <- DimPlot(pbmc, reduction = "tsne", group.by = "stim", pt.size = .1) 248 | p2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = .1) 249 | plot_grid(p1, p2) 250 | 251 | ``` 252 | 253 | It is important to check that the harmonized data still capture the biological states of the cells. By plotting the following genes we can see that biological cell states are preserved after harmonization. 
254 | 255 | ```{r, fig.width = 7, fig.height = 7, out.width="100%", fig.cap="Expression of gene panel heatmap in the harmonized PBMC dataset"} 256 | FeaturePlot(object = pbmc, features= c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), 257 | min.cutoff = "q9", cols = c("lightgrey", "blue"), pt.size = 0.5) 258 | 259 | ``` 260 | 261 | ## UMAP 262 | 263 | As with TSNE, we can run UMAP by passing the harmony reduction to the function. 264 | 265 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="UMAP Visualization of harmony embeddings"} 266 | pbmc <- pbmc %>% 267 | RunUMAP(reduction = "harmony", dims = 1:20) 268 | 269 | p1 <- DimPlot(pbmc, reduction = "umap", group.by = "stim", pt.size = .1) 270 | p2 <- DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = .1) 271 | plot_grid(p1, p2) 272 | 273 | ``` 274 | 275 | 276 | ```{r} 277 | sessionInfo() 278 | ``` 279 | 280 | -------------------------------------------------------------------------------- /vignettes/main.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/vignettes/main.jpg -------------------------------------------------------------------------------- /vignettes/quickstart.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quick start to Harmony" 3 | author: "Korsunsky et al.: Fast, sensitive, and accurate integration of single 4 | cell data with Harmony" 5 | output: 6 | rmarkdown::html_vignette: 7 | code_folding: show 8 | vignette: > 9 | %\VignetteIndexEntry{Quick start to Harmony} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | 15 | # Introduction 16 | 17 | Harmony is an algorithm for performing integration of single cell genomics 18 | datasets. 
Please check out our latest 19 | [manuscript on Nature Methods](https://www.nature.com/articles/s41592-019-0619-0). 20 | 21 | ![](main.jpg){width=100%} 22 | 23 | 24 | # Installation 25 | 26 | Install Harmony from CRAN with standard commands. 27 | 28 | ```{r eval=FALSE} 29 | install.packages('harmony') 30 | ``` 31 | 32 | Once Harmony is installed, load it up! 33 | 34 | ```{r} 35 | library(harmony) 36 | ``` 37 | 38 | 39 | # Integrating cell line datasets from 10X 40 | 41 | The example below follows Figure 2 in the manuscript. 42 | 43 | We downloaded 3 cell line datasets from the 10X website. The first two (jurkat 44 | and 293t) come from pure cell lines while the *half* dataset is a 50:50 45 | mixture of Jurkat and HEK293T cells. We inferred cell type with the canonical 46 | marker XIST, since the two cell lines come from 1 male and 1 female donor. 47 | 48 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat 49 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t 50 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat:293t_50:50 51 | 52 | We library normalized the cells, log transformed the counts, and scaled the 53 | genes. Then we performed PCA and kept the top 20 PCs. The PCA embeddings and 54 | meta data are available as part of this package. 55 | 56 | ```{r} 57 | data(cell_lines) 58 | V <- cell_lines$scaled_pcs 59 | meta_data <- cell_lines$meta_data 60 | 61 | ``` 62 | 63 | 64 | Initially, the cells cluster by both dataset (left) and cell type (right). 
65 | 66 | ```{r class.source='fold-hide', fig.width=5, fig.height=3, fig.align="center"} 67 | 68 | library(ggplot2) 69 | 70 | do_scatter <- function(xy, meta_data, label_name, base_size = 12) { 71 | palette_use <- c(`jurkat` = '#810F7C', `t293` = '#D09E2D',`half` = '#006D2C') 72 | xy <- xy[, 1:2] 73 | colnames(xy) <- c('X1', 'X2') 74 | plt_df <- xy %>% data.frame() %>% cbind(meta_data) 75 | plt <- ggplot(plt_df, aes(X1, X2, col = !!rlang::sym(label_name), fill = !!rlang::sym(label_name))) + 76 | theme_test(base_size = base_size) + 77 | guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, 78 | shape = 16, size = 4))) + 79 | scale_color_manual(values = palette_use) + 80 | scale_fill_manual(values = palette_use) + 81 | theme(plot.title = element_text(hjust = .5)) + 82 | labs(x = "PC 1", y = "PC 2") + 83 | theme(legend.position = "none") + 84 | geom_point(shape = '.') 85 | 86 | ## Add labels: one text label per label_name group, drawn at the mean X1/X2 of that group's points 87 | data_labels <- plt_df %>% 88 | dplyr::group_by(!!rlang::sym(label_name)) %>% 89 | dplyr::summarise(X1 = mean(X1), X2 = mean(X2)) %>% 90 | dplyr::ungroup() 91 | plt + geom_label(data = data_labels, aes(label = !!rlang::sym(label_name)), 92 | color = "white", size = 4) 93 | } 94 | p1 <- do_scatter(V, meta_data, 'dataset') + 95 | labs(title = 'Colored by dataset') 96 | p2 <- do_scatter(V, meta_data, 'cell_type') + 97 | labs(title = 'Colored by cell type') 98 | 99 | cowplot::plot_grid(p1, p2) 100 | 101 | ``` 102 | 103 | Let's run Harmony to remove the influence of dataset-of-origin from the cell 104 | embeddings. 105 | 106 | ```{r} 107 | harmony_embeddings <- harmony::RunHarmony( 108 | V, meta_data, 'dataset', verbose=FALSE 109 | ) 110 | 111 | ``` 112 | 113 | After Harmony, the datasets are now mixed (left) and the cell types are still 114 | separate (right). 
115 | 116 | ```{r, fig.width=5, fig.height=3, fig.align="center"} 117 | p1 <- do_scatter(harmony_embeddings, meta_data, 'dataset') + 118 | labs(title = 'Colored by dataset') 119 | p2 <- do_scatter(harmony_embeddings, meta_data, 'cell_type') + 120 | labs(title = 'Colored by cell type') 121 | cowplot::plot_grid(p1, p2, nrow = 1) 122 | 123 | ``` 124 | 125 | # Next Steps 126 | 127 | ## Interfacing to software packages 128 | 129 | You can also run Harmony as part of an established pipeline in several packages, such as Seurat. For these vignettes, please [visit our github page](https://github.com/immunogenomics/harmony/). 130 | 131 | 132 | ## Detailed breakdown of the Harmony algorithm 133 | 134 | For more details on how each part of Harmony works, consult our more detailed 135 | [vignette](https://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/detailedWalkthrough.html) 136 | "Detailed Walkthrough of Harmony Algorithm". 137 | 138 | # Session Info 139 | 140 | ```{r} 141 | sessionInfo() 142 | 143 | ``` 144 | --------------------------------------------------------------------------------