├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CRAN-SUBMISSION
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── R
    ├── RcppExports.R
    ├── RunHarmony.R
    ├── data.R
    ├── harmony-package.r
    ├── harmony_option.R
    ├── ui.R
    └── utils.R
├── README.md
├── appveyor.yml
├── cran-comments.md
├── data
    ├── cell_lines.rda
    ├── cell_lines_small.RData
    └── pbmc_stim.RData
├── doc
    ├── Seurat.R
    ├── Seurat.Rmd
    ├── Seurat.html
    ├── detailedWalkthrough.R
    ├── detailedWalkthrough.Rmd
    ├── detailedWalkthrough.html
    ├── parameters.R
    ├── parameters.Rmd
    ├── parameters.html
    ├── quickstart.R
    ├── quickstart.Rmd
    └── quickstart.html
├── man
    ├── HarmonyMatrix.Rd
    ├── RunHarmony.Rd
    ├── RunHarmony.Seurat.Rd
    ├── RunHarmony.SingleCellExperiment.Rd
    ├── RunHarmony.default.Rd
    ├── cell_lines.Rd
    ├── cell_lines_small.Rd
    ├── figures
    │   └── logo.png
    ├── harmony.Rd
    ├── harmony_options.Rd
    ├── moe_ridge_get_betas.Rd
    ├── pbmc.ctrl.Rd
    ├── pbmc.stim.Rd
    └── pipe.Rd
├── src
    ├── .gitignore
    ├── Makevars
    ├── Makevars.win
    ├── RcppExports.cpp
    ├── harmony.cpp
    ├── harmony.h
    ├── harmony_types.h
    ├── types.h
    ├── utils.cpp
    └── utils.h
├── tests
    ├── testthat.R
    └── testthat
    │   └── test_integration.R
└── vignettes
    ├── .gitignore
    ├── Seurat.Rmd
    ├── detailedWalkthrough.Rmd
    ├── main.jpg
    └── quickstart.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | src/*\.so
 2 | src/*\.o
 3 | src/.cache
 4 | src/Makefile
 5 | src/compile_commands.json
 6 | data/pbmc_stim_original.RData
 7 | ^.*\.Rproj
 8 | ^\.Rproj\.user$
 9 | ^.travis.yml$
10 | appveyor\.yml
11 | ^CRAN-SUBMISSION$
12 | ^cran-comments\.md$
13 | ^doc$
14 | ^\.Rproj\.user$
15 | ^.*\.Rproj$
16 | ^docs$
17 | ^README.*\.md$
18 | ^codecov\.yml$
19 | ^NEWS\.md$
20 | ^Meta$
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | inst/doc
 2 | *.ipynb
 3 | .DS_Store
 4 | .ipynb_checkpoints
 5 | **/.ipynb_checkpoints
 6 | src/*.o
 7 | src/*.so
 8 | config.log
 9 | .Rproj.user
10 | config.status
11 | *.Rproj
12 | *.swp
13 | ..Rcheck
14 | /Meta/
15 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | os: linux
 3 | cache: packages
 4 | warnings_are_errors: true
 5 | r_check_args: "--no-manual --timings"
 6 | 
 7 | bioc_packages:
 8 |     - BiocStyle
 9 |     - SingleCellExperiment
10 | 
11 | jobs:
12 |   include:
13 |   - r: release
14 |     os: osx
15 |   - r: release
16 |     os: linux
17 | 
18 | env:
19 |  global:
20 |    - _R_CHECK_FORCE_SUGGESTS_: false
21 |    - _R_CHECK_LENGTH_1_CONDITION_: verbose
22 |    - _R_CHECK_LENGTH_1_LOGIC2_: verbose
23 | 
24 | notifications:
25 |   email:
26 |     on_success: change
27 |     on_failure: change
28 | 


--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
1 | Version: 1.1.0
2 | Date: 2023-10-20 11:26:27 UTC
3 | SHA: b1a43609415cbe30d56c6530c4e08b7182fa1885
4 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: harmony
 2 | Title: Fast, Sensitive, and Accurate Integration of Single Cell Data
 3 | Version: 1.2.3
 4 | Authors@R: c(
 5 |     person("Ilya", "Korsunsky", email = "ilya.korsunsky@gmail.com",
 6 |         role = c("cre", "aut"), comment = c(ORCID = "0000-0003-4848-3948")),
 7 |     person("Martin", "Hemberg", email = "mhemberg@bwh.harvard.edu",
 8 |         role = c("aut"), comment = c(ORCID = "0000-0001-8895-5239")),
 9 |     person("Nikolaos", "Patikas", email = "nik.patik@gmail.com",
10 |         role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-3978-0134")),
11 |     person("Hongcheng", "Yao", email = "hongchengyaonk@gmail.com",
12 |         role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-0743-4835")),
13 |     person("Nghia", "Millard", email = "nmillard@g.harvard.edu",
14 |         role = "aut", comment = c(ORCID = "0000-0002-0518-7674")),
15 |     person("Jean", "Fan", email = "jeanfan@fas.harvard.edu",
16 |         role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-0212-5451")),
17 |     person("Kamil", "Slowikowski", email = "kslowikowski@gmail.com",
18 |         role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-2843-6370")),
19 |     person("Miles", "Smith",
20 |         role = c("ctb")),
21 |     person("Soumya", "Raychaudhuri",
22 |         role = c("aut"), comment = c(ORCID = "0000-0002-1901-8265"))
23 |     )
24 | Description: Implementation of the Harmony algorithm for single cell integration, described in Korsunsky et al <doi:10.1038/s41592-019-0619-0>. Package includes a standalone Harmony function and interfaces to external frameworks.
25 | URL: https://github.com/immunogenomics/harmony
26 | License: GPL-3
27 | Encoding: UTF-8
28 | RoxygenNote: 7.2.3
29 | Depends: R(>= 3.5.0), Rcpp
30 | LazyData: true
31 | LazyDataCompression: gzip
32 | LinkingTo: Rcpp, RcppArmadillo, RcppProgress
33 | Imports:
34 |     dplyr,
35 |     cowplot,
36 |     ggplot2,
37 |     Matrix,
38 |     methods,
39 |     tibble,
40 |     rlang,
41 |     RhpcBLASctl
42 | Suggests:
43 |     SingleCellExperiment,
44 |     Seurat (>= 4.1.1),
45 |     testthat,
46 |     knitr,
47 |     rmarkdown,
48 |     ggthemes,
49 |     ggrepel,
50 |     patchwork,
51 |     tidyverse,
52 |     tidyr,
53 |     data.table
54 | VignetteBuilder: knitr
55 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(RunHarmony,Seurat)
 4 | S3method(RunHarmony,SingleCellExperiment)
 5 | S3method(RunHarmony,default)
 6 | export("%>%")
 7 | export(HarmonyMatrix)
 8 | export(RunHarmony)
 9 | export(harmony_options)
10 | export(moe_ridge_get_betas)
11 | importFrom(Rcpp,loadModule)
12 | importFrom(Rcpp,sourceCpp)
13 | importFrom(cowplot,plot_grid)
14 | importFrom(dplyr,"%>%")
15 | importFrom(methods,as)
16 | importFrom(methods,hasArg)
17 | importFrom(methods,is)
18 | importFrom(methods,new)
19 | importFrom(rlang,.data)
20 | importFrom(rlang,`%||%`)
21 | importFrom(stats,model.matrix)
22 | useDynLib(harmony)
23 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | # harmony v1.2.0 - Oct 12 2023
 2 |   - Major performance enhancements, using indexes for the regression
 3 |   - update_R - Generate blocks correctly #214
 4 |   - lambda optimization - Lambda as a function of E.
 5 |   - New alpha parameter to estimate lambda during runtime
 6 |   - fail-safe for datasets with < 40 cells: block_size is set to 0.2. Harmony refuses to run with < 6 cells.
 7 |   - added progress bar for the integration step.
 8 | # harmony v1.1.0 - Oct 12 2023
 9 |   - update_R bug - All cells are corrected exactly once per invocation
10 |   - Improved documentation of the RunHarmony generic
11 |   - Fix lambda failing on multiple covariates
12 |   - verbose option suppresses all messages  
13 | # harmony v1.0.0 - Jul 27 2023
14 | * API changes
15 |   - removed do_pca functionality
16 |   - removed reference_values functionality
17 |   - removed cluster_prior functionality
18 |   - beta feature: automatic parameterization of lambda when it is set to NULL
19 |   - ncore parameter controls the use of multiple processors when parallelized BLAS exists.
20 |   - Moved several parameters to the .options. Now they are accessible through harmony_options()
21 | * Documentation
22 |   - Updated seurat vignette
23 |   - Removed mudan Seurat2 and Seurat3 vignettes
24 | * Name changes
25 |   - Integrated HarmonyMatrix function to the RunHarmony generic
26 |   - HarmonyMatrix is deprecated
27 | * Backend changes
28 |   - Sparse matrix coercion to yield performance enhancements
29 |   - L2-normalization using armadillo routines
30 |   - Supports parallel versions of BLAS.
31 | * Fixes
32 |   - RunHarmony() for Seurat considers dimension set
33 |   - RunHarmony() for SingleCellExperiment works
34 |   - custom ceiling function to avoid conflicts for some block_size
35 |   - Coercing covariate to factor when levels are numbers
36 |   
37 | * New features
38 |   - Automatic parameterization of lambda
39 |   - Updated convergence plot
40 | 
41 | 
42 | # harmony v0.1.0
43 | * Initial release to CRAN
44 | 
45 | # harmony v0.1.1
46 | * Updates pow to harmony_pow to avoid collision with new Armadillo pow function 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
# Thin R wrapper: forwards X and K unchanged to the compiled routine
# '_harmony_kmeans_centers' of the harmony shared library and returns
# whatever that routine returns.
# NOTE(review): expected types/shapes of X and K are defined on the C++ side
# (not visible here) -- confirm against src/ before relying on them.
kmeans_centers <- function(X, K) {
    .Call('_harmony_kmeans_centers', PACKAGE = 'harmony', X, K)
}
 7 | 
# Thin R wrapper: forwards x, p, i, ncol, nrow and thresh unchanged to the
# compiled routine '_harmony_scaleRows_dgc' of the harmony shared library and
# returns whatever that routine returns.
# NOTE(review): the names x/p/i look like the slots of a dgCMatrix (values,
# column pointers, row indices) -- presumably that is what is expected;
# confirm against the C++ implementation.
scaleRows_dgc <- function(x, p, i, ncol, nrow, thresh) {
    .Call('_harmony_scaleRows_dgc', PACKAGE = 'harmony', x, p, i, ncol, nrow, thresh)
}
11 | 
# Thin R wrapper: forwards alpha and cluster_E unchanged to the compiled
# routine '_harmony_find_lambda_cpp' of the harmony shared library and returns
# whatever that routine returns.
# NOTE(review): presumably the backend of the lambda estimation mode driven by
# the `alpha` option -- confirm against the C++ implementation.
find_lambda_cpp <- function(alpha, cluster_E) {
    .Call('_harmony_find_lambda_cpp', PACKAGE = 'harmony', alpha, cluster_E)
}
15 | 
16 | 


--------------------------------------------------------------------------------
/R/RunHarmony.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Generic function that runs the harmony algorithm on single-cell
  4 | #' genomics cell embeddings.
  5 | #'
  6 | #' RunHarmony is a generic function that runs the main Harmony
  7 | #' algorithm. If working with single cell R objects, please refer to
  8 | #' the documentation of the appropriate generic API:
  9 | #' ([RunHarmony.Seurat()] or [RunHarmony.SingleCellExperiment()]). If
 10 | #' users work with other forms of cell embeddings, they can pass them
 11 | #' directly to harmony using the [RunHarmony.default()] API. All the
 12 | #' function arguments listed here are common to all RunHarmony
 13 | #' interfaces.
 14 | #' 
 15 | #' @family RunHarmony
 16 | #' @rdname RunHarmony
 17 | #' @inheritDotParams RunHarmony.default -data_mat -meta_data -vars_use -return_object
 18 | #' 
 19 | #' 
 20 | #' @return If used with single-cell objects, it will return the
 21 | #'     updated single-cell object. For standalone operation, it
 22 | #'     returns the corrected cell embeddings or the R6 harmony object
 23 | #'     (see [RunHarmony.default()]).
 24 | #' 
 25 | #' @export
 26 | #' @md
 27 | RunHarmony <- function(...) {
 28 |     UseMethod("RunHarmony")
 29 | }
 30 | 
 31 | 
 32 | 
 33 | #' Applies harmony on a Seurat object cell embedding.
 34 | #'
 35 | #' @rdname RunHarmony.Seurat
 36 | #' @family RunHarmony
 37 | #' @inheritDotParams RunHarmony.default -data_mat -meta_data -vars_use -return_object
 38 | #' 
 39 | #' @param object the Seurat object. It needs to have the appropriate slot
 40 | #'     of cell embeddings precomputed.
 41 | #' @param group.by.vars the name(s) of covariates that harmony will remove
 42 | #'     its effect on the data.
 43 | #' @param reduction.use Name of dimension reduction to use. Default is pca.
 44 | #' @param dims.use indices of the cell embedding features to be used
 45 | #' @param reduction.save the name of the new slot that is going to be created by
 46 | #'     harmony. By default, harmony.
 47 | #' @param project.dim Project dimension reduction loadings. Default TRUE.
 48 | #' 
 49 | #' @return Seurat object. Harmony dimensions placed into a new slot in the Seurat
 50 | #' object according to the reduction.save. For downstream Seurat analyses,
 51 | #' use reduction='harmony'.
 52 | #' 
 53 | #' @export
 54 | #'
 55 | #' @examples
 56 | #' \dontrun{
 57 | #' ## seu is a Seurat single-Cell R object
 58 | #' seu <- RunHarmony(seu, "donor_id")
 59 | #' }
 60 | RunHarmony.Seurat <- function(
 61 |   object,
 62 |   group.by.vars,
 63 |   reduction.use = 'pca',
 64 |   dims.use = NULL,
 65 |   reduction.save = "harmony",
 66 |   project.dim = TRUE,
 67 |   ...
 68 | ) {
 69 |   if (!requireNamespace('Seurat', quietly = TRUE)) {
 70 |     stop("Running Harmony on a Seurat object requires Seurat")
 71 |   }
 72 |   if (!reduction.use %in% Seurat::Reductions(object = object)) {
 73 |       stop(paste(reduction.use, "cell embeddings not found in Seurat object.",
 74 |                  "For a Seurat preprocessing walkthrough, please refer to the vignette"))
 75 |   }
 76 |   embedding <- Seurat::Embeddings(object, reduction = reduction.use)
 77 |   if (is.null(dims.use)) {
 78 |     dims.use <- seq_len(ncol(embedding))
 79 |   }
 80 |   dims_avail <- seq_len(ncol(embedding))
 81 |   if (!all(dims.use %in% dims_avail)) {
 82 |     stop("trying to use more dimensions than computed. Rerun dimension reduction
 83 |          with more dimensions or run Harmony with fewer dimensions")
 84 |   }
 85 |   if (length(dims.use) == 1) {
 86 |     stop("only specified one dimension in dims.use")
 87 |   }
 88 |   metavars_df <- Seurat::FetchData(
 89 |     object,
 90 |     group.by.vars,
 91 |     cells = Seurat::Cells(x = object[[reduction.use]])
 92 |   )
 93 | 
 94 |   harmonyEmbed <- RunHarmony(
 95 |     data_mat = embedding[, dims.use],
 96 |     meta_data = metavars_df,
 97 |     vars_use = group.by.vars,
 98 |     return_object = FALSE,
 99 |     ...
100 |   )
101 | 
102 |   reduction.key <- Seurat::Key(reduction.save, quiet = TRUE)
103 |   rownames(harmonyEmbed) <- rownames(embedding)
104 |   colnames(harmonyEmbed) <- paste0(reduction.key, seq_len(ncol(harmonyEmbed)))
105 | 
106 |   object[[reduction.save]] <- Seurat::CreateDimReducObject(
107 |     embeddings = harmonyEmbed,
108 |     stdev = as.numeric(apply(harmonyEmbed, 2, stats::sd)),
109 |     assay = Seurat::DefaultAssay(object = object[[reduction.use]]),
110 |     key = reduction.key
111 |   )
112 |   if (project.dim) {
113 |     object <- Seurat::ProjectDim(
114 |       object,
115 |       reduction = reduction.save,
116 |       overwrite = TRUE,
117 |       verbose = FALSE
118 |     )
119 |   }
120 |   return(object)
121 | }
122 | 
123 | 
124 | 
125 | #' Applies harmony on PCA cell embeddings of a SingleCellExperiment.
126 | #'
127 | #' @rdname RunHarmony.SingleCellExperiment
128 | #' @inheritDotParams RunHarmony.default -data_mat -meta_data -vars_use -return_object
129 | #' @family RunHarmony
130 | #' 
131 | #' @param object SingleCellExperiment with the PCA reducedDim cell embeddings populated 
132 | #' @param group.by.vars the name(s) of covariates that harmony will remove
133 | #'     its effect on the data.
134 | #' @param dims.use a vector of indices that allows only selected cell embeddings
135 | #'     features to be used.
136 | #' @param verbose enable verbosity 
137 | #' @param reduction.save the name of the new slot that is going to be created by
138 | #'     harmony. By default, HARMONY.
139 | #'
140 | #' 
141 | #' @return SingleCellExperiment object. After running RunHarmony, the corrected
142 | #' cell embeddings can be accessed with reducedDim(object, reduction.save), i.e. reducedDim(object, "HARMONY") by default.
143 | #' @export
144 | #'
145 | #' @examples
146 | #' \dontrun{
147 | #' ## sce is a SingleCellExperiment R object
148 | #' sce <- RunHarmony(sce, "donor_id")
149 | #' }
RunHarmony.SingleCellExperiment <- function(
    object,
    group.by.vars,
    dims.use = NULL,
    verbose = TRUE,
    reduction.save = "HARMONY",
    ...
) {
    ## Runs Harmony on the "PCA" reducedDim of `object` and stores the
    ## corrected embedding under the name `reduction.save`; returns the
    ## updated object. Extra arguments in `...` are forwarded to RunHarmony().

    ## SingleCellExperiment is an optional dependency, so fail early with a
    ## clear message when it is not installed (mirrors the Seurat method).
    if (!requireNamespace("SingleCellExperiment", quietly = TRUE)) {
        stop("Running Harmony on a SingleCellExperiment object requires SingleCellExperiment")
    }

    ## Get PCA embeddings
    if (!"PCA" %in% SingleCellExperiment::reducedDimNames(object)) {
        stop("PCA must be computed before running Harmony.")
    }
    pca_embedding <- SingleCellExperiment::reducedDim(object, "PCA")

    ## Resolve the embedding columns to use (all of them when dims.use is
    ## NULL) and reject indices outside the computed range.
    dims_avail <- seq_len(ncol(pca_embedding))
    if (is.null(dims.use)) {
        dims.use <- dims_avail
    }
    if (!all(dims.use %in% dims_avail)) {
        stop(paste(
            "trying to use more dimensions than computed with PCA.",
            "Rerun PCA with more dimensions or use fewer PCs"
        ))
    }
    ## A single index used to collapse the matrix to a plain vector and fail
    ## later with an unrelated error; report it explicitly, as the Seurat
    ## method does.
    if (length(dims.use) == 1) {
        stop("only specified one dimension in dims.use")
    }

    metavars_df <- SingleCellExperiment::colData(object)
    if (!all(group.by.vars %in% colnames(metavars_df))) {
        stop('Trying to integrate over variables missing in colData')
    }

    ## drop = FALSE guarantees a matrix reaches the matrix interface.
    harmonyEmbed <- RunHarmony(
        data_mat = pca_embedding[, dims.use, drop = FALSE],
        meta_data = metavars_df,
        vars_use = group.by.vars,
        return_object = FALSE,
        verbose = verbose,
        ...
    )

    ## Label the corrected embedding and store it as a new reducedDim.
    rownames(harmonyEmbed) <- row.names(metavars_df)
    colnames(harmonyEmbed) <- paste0(reduction.save, "_", seq_len(ncol(harmonyEmbed)))
    SingleCellExperiment::reducedDim(object, reduction.save) <- harmonyEmbed

    return(object)
}
198 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' List of metadata table and scaled PCs matrix
 2 | #' 
 3 | #' @format:
 4 | #'   meta_data: data.table of 9478 rows with defining dataset and cell_type
 5 | #'   scaled_pcs: data.table of 9478 rows (cells) and 20 columns (PCs)
 6 | #' 
 7 | #' @source \url{https://www.10xgenomics.com}
 8 | "cell_lines"
 9 | 
10 | #' Same as cell_lines but smaller (300 cells).
11 | #' 
12 | #' @source \url{https://www.10xgenomics.com}
13 | "cell_lines_small"
14 | 
15 | 
16 | #' Gene expression data of control PBMC from Kang et al. 2017. This
17 | #' contains a sample of 1000 cells from that condition and is used for
18 | #' the Seurat Vignette.
19 | #' 
20 | #' @source \doi{10.1038/nbt.4042}
21 | "pbmc.ctrl"
22 | 
23 | 
24 | #' Gene expression data of stimulated PBMC from Kang et al. 2017. This
25 | #' contains a sample of 1000 cells from that condition and is used for
26 | #' the Seurat Vignette.
27 | #' 
28 | #' @source \doi{10.1038/nbt.4042}
29 | "pbmc.stim"
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/R/harmony-package.r:
--------------------------------------------------------------------------------
 1 | #' Harmony: fast, accurate, and robust single cell integration.
 2 | #'
 3 | #' Algorithm for single cell integration.
 4 | #'
 5 | #' @section Usage:
 6 | #'
 7 | #' 
 8 | #' ?RunHarmony to run Harmony on cell embeddings matrix, Seurat or
 9 | #' SingleCellExperiment objects.
10 | #' 
11 | #' @section Useful links:
12 | #'
13 | #' \enumerate{
14 | #' \item Report bugs at \url{https://github.com/immunogenomics/harmony/issues}
15 | #' \item Read the manuscript
16 | #' \doi{10.1038/s41592-019-0619-0}
17 | #' }
18 | #'
19 | #'
20 | #' @name harmony
21 | #' @docType package
22 | #' @useDynLib harmony
23 | #' @importFrom Rcpp sourceCpp
24 | #' @importFrom Rcpp loadModule
25 | #' @importFrom methods new
26 | #' @importFrom methods as
27 | #' @importFrom methods is
28 | #' @importFrom cowplot plot_grid
29 | #' @importFrom rlang .data
30 | #' @importFrom rlang `%||%`
31 | #' @importFrom stats model.matrix
32 | loadModule("harmony_module", TRUE)
33 | NULL
34 | 


--------------------------------------------------------------------------------
/R/harmony_option.R:
--------------------------------------------------------------------------------
  1 | #' Set advanced parameters for RunHarmony
  2 | #' @param alpha When lambda = NULL (lambda estimation mode),
  3 | #'     lambda is determined by the expected number of cells assuming
  4 | #'     independence between batches and clusters, i.e., lambda = alpha * expected
  5 | #'     number of cells. Default 0.2; alpha should satisfy 0 < alpha < 1.
  6 | #' @param tau Protection against overclustering small datasets with 
  7 | #'     large ones. `tau` is the expected number of cells per cluster.
  8 | #' @param block.size What proportion of cells to update during clustering. 
  9 | #'     Between 0 to 1, default 0.05. Larger values may be faster but less 
 10 | #'     accurate.
 11 | #' @param max.iter.cluster Maximum number of rounds to run clustering 
 12 | #'     at each round of Harmony.
 13 | #' @param epsilon.cluster Convergence tolerance for clustering round 
 14 | #'     of Harmony. Set to -Inf to never stop early.
 15 | #' @param epsilon.harmony Convergence tolerance for Harmony. Set to -Inf to
 16 | #'     never stop early. This tolerance is only applied when `RunHarmony` is
 17 | #'     called with `early_stop = TRUE`; with `early_stop = FALSE` it is replaced by -Inf.
 18 | #' @returns Return a list for `.options` argument of `RunHarmony`
 19 | #' @export
 20 | #' @examples
 21 | #' ## If want to set max.iter.cluster to be 100, do
 22 | #' \dontrun{
 23 | #' RunHarmony(data_meta, meta_data, vars_use,
 24 | #'               .options = harmony_options(max.iter.cluster = 100))
 25 | #' }
 26 | #' 
 27 | harmony_options <- function(
 28 |   alpha = 0.2,
 29 |   tau = 0,
 30 |   block.size = 0.05,
 31 |   max.iter.cluster = 20,
 32 |   epsilon.cluster = 1e-3,
 33 |   epsilon.harmony = 1e-2) {
 34 |     
 35 |     block.size <- validate_block.size(block.size)
 36 |     
 37 |     out <- list(
 38 |         alpha = alpha,
 39 |         tau = tau,
 40 |         block.size = block.size,
 41 |         max.iter.cluster = max.iter.cluster,
 42 |         epsilon.cluster = epsilon.cluster,
 43 |         epsilon.harmony = epsilon.harmony
 44 |     )
 45 |     out <- structure(out, class = "harmony_options")
 46 |     return(out)
 47 | }
 48 | 
 49 | ## Validate functions -----------------------------------------------------------
 50 | validate_block.size <- function(block.size) {
 51 |     if(block.size <= 0 | block.size > 1){
 52 |         stop('Error: block.size should be set between 0 and 1 (0 < block.size <= 1)')
 53 |     }
 54 |     return(block.size)
 55 | }
 56 | 
 57 | 
 58 | #' @importFrom methods hasArg
check_legacy_args <- function(...) {
    ## Raise a deprecation warning (via legacy_warning()) for every legacy
    ## argument name that was supplied through `...`. RunHarmony.default()
    ## calls this only for those warnings; the return value is not used there.
    ##
    ## NOTE(review): methods::hasArg() inspects the call of the function it is
    ## invoked from, so these checks presumably have to stay directly in this
    ## body (not inside a helper or an apply-style closure) -- confirm before
    ## refactoring.

    ## do_pca and npcs share a single combined message key.
    if (hasArg("do_pca") || hasArg("npcs")) legacy_warning("do_pca_npcs")
    ## Each remaining legacy name maps to the message key of the same name.
    if (hasArg("tau")) legacy_warning("tau")
    if (hasArg("block.size")) legacy_warning("block.size")
    if (hasArg("max.iter.harmony")) legacy_warning("max.iter.harmony")
    if (hasArg("max.iter.cluster")) legacy_warning("max.iter.cluster")
    if (hasArg("epsilon.cluster")) legacy_warning("epsilon.cluster")
    if (hasArg("epsilon.harmony")) legacy_warning("epsilon.harmony")
    
}
 69 | 
 70 | 
 71 | 
 72 | 
 73 | legacy_warning <- function(param) {
 74 |     common_warn <- paste0(
 75 |         "Warning: The parameter ", param, " is deprecated. ",
 76 |         "It will be ignored for this function call ",
 77 |         "and please remove parameter ", param, " in future function calls. ",
 78 |         "Advanced users can set value of parameter ", param,
 79 |         " by using parameter .options and function harmony_options()."
 80 |     )
 81 |     do_pca_npcs_warn <- paste0(
 82 |         "Warning: The parameters ", "do_pca and npcs", " are deprecated. ",
 83 |         "They will be ignored for this function call ",
 84 |         "and please remove parameters ", "do_pca and npcs",
 85 |         " and pass to harmony cell_embeddings directly."
 86 |     )
 87 |     max.iter.harmony_warn <- paste0(
 88 |         "Warning: The parameter ", "max.iter.harmony ",
 89 |         "is replaced with parameter ", "max_iter. ",
 90 |         "It will be ignored for this function call ",
 91 |         "and please use parameter ", "max_iter ", "in future function calls."
 92 |     )
 93 |     epsilon.harmony_warn <- paste0(
 94 |         "Warning: The parameter ", "epsilon.harmony", " is deprecated. ",
 95 |         "It will be ignored for this function call ",
 96 |         "and please remove parameter ", "epsilon.harmony",
 97 |         " in future function calls. ",
 98 |         "If users want to control if harmony would stop early or not, ",
 99 |         "use parameter ", "early_stop. ",
100 |         "Advanced users can set value of parameter ", "epsilon.harmony",
101 |         " by using parameter .options and function harmony_options()."
102 |     )
103 | 
104 | 
105 |     if (param %in% c("tau", "block.size", "max.iter.cluster",
106 |                      "epsilon.cluster")) {
107 |         warn_str <- common_warn
108 |     }
109 |     if (param == "do_pca_npcs") {
110 |         warn_str <- do_pca_npcs_warn
111 |     }
112 |     if (param == "max.iter.harmony") {
113 |         warn_str <- max.iter.harmony_warn
114 |     }
115 |     if (param == "epsilon.harmony") {
116 |         warn_str <- epsilon.harmony_warn
117 |     }
118 | 
119 |     rlang::warn(warn_str, .frequency = "once", .frequency_id = param)
120 | }
121 | 


--------------------------------------------------------------------------------
/R/ui.R:
--------------------------------------------------------------------------------
  1 | #' This is the primary harmony interface.
  2 | #' 
  3 | #' Use this generic with a cell embeddings matrix, a metadata table
  4 | #' and a categorical covariate to run the Harmony algorithm directly
  5 | #' on cell embedding matrix.
  6 | #'
  7 | #' @rdname RunHarmony.default
  8 | #' @family RunHarmony
  9 | #' 
 10 | #' @param data_mat Matrix of cell embeddings. Cells can be rows or
 11 | #'     columns and will be inferred by the rows of meta_data.
 12 | #' @param meta_data Either (1) Dataframe with variables to integrate
 13 | #'     or (2) vector with labels.
 14 | #' @param vars_use If meta_data is a dataframe, this defines which
 15 | #'     variable(s) to remove (character vector).
 16 | #' @param theta Diversity clustering penalty parameter. Specify for
 17 | #'     each variable in vars_use. Default theta=2. theta=0 does not
 18 | #'     encourage any diversity. Larger values of theta result in more
 19 | #'     diverse clusters.
 20 | #' @param sigma Width of soft kmeans clusters. Default
 21 | #'     sigma=0.1. Sigma scales the distance from a cell to cluster
 22 | #'     centroids. Larger values of sigma result in cells assigned to
 23 | #'     more clusters. Smaller values of sigma make soft kmeans cluster
 24 | #'     approach hard clustering.
 25 | #' @param lambda Ridge regression penalty. Default lambda=1. Bigger
 26 | #'     values protect against over correction. If several covariates
 27 | #'     are specified, then lambda can also be a vector which needs to
 28 | #'     be equal length with the number of variables to be
 29 | #'     corrected. In this scenario, each covariate level group will be
 30 | #'     assigned the scalars specified by the user. If set to NULL,
 31 | #'     harmony will start lambda estimation mode to determine lambdas
 32 | #'     automatically and try to minimize overcorrection (Use with caution still
 33 | #'     in beta testing).
 34 | #' @param nclust Number of clusters in model. nclust=1 equivalent to
 35 | #'     simple linear regression.
 36 | #' @param max_iter Maximum number of rounds to run Harmony. One round
 37 | #'     of Harmony involves one clustering and one correction step.
 38 | #' @param early_stop Enable early stopping for harmony. The
 39 | #'     harmonization process will stop when the change of objective
 40 | #'     function between corrections drops below the `epsilon.harmony` tolerance of `.options` (default 1e-2).
 41 | #' @param ncores Number of processors to be used for math operations
 42 | #'     when optimized BLAS is available. If BLAS does not support
 43 | #'     multithreading, this option has no effect. By default,
 44 | #'     ncores=1 which runs as a single-threaded process. Although
 45 | #'     Harmony supports multiple cores, it is not optimized for
 46 | #'     multithreading. Increase this number for large datasets iff
 47 | #'     single-core performance is not adequate.
 48 | #' @param plot_convergence Whether to print the convergence plot of
 49 | #'     the clustering objective function. TRUE to plot, FALSE to
 50 | #'     suppress. This can be useful for debugging.
 51 | #' @param return_object (Advanced Usage) Whether to return the Harmony
 52 | #'     object or only the corrected PCA embeddings.
 53 | #' @param verbose Whether to print progress messages. TRUE to print,
 54 | #'     FALSE to suppress.
 55 | #' @param .options Setting advanced parameters of RunHarmony. This must be the
 56 | #'     result from a call to `harmony_options`. See ?`harmony_options` for 
 57 | #'     parameters not listed above and more details.
 58 | #' @param ... other parameters that are not part of the API
 59 | #' 
 60 | #' @return By default, matrix with corrected PCA embeddings. If
 61 | #'     return_object is TRUE, returns the full Harmony object (R6
 62 | #'     reference class type).
 63 | #'
 64 | #' @export 
 65 | #' 
 66 | #' @examples
 67 | #' 
 68 | #' 
 69 | #' ## By default, Harmony inputs a cell embedding matrix
 70 | #' \dontrun{
 71 | #' harmony_embeddings <- RunHarmony(cell_embeddings, meta_data, 'dataset')
 72 | #' }
 73 | #' 
 74 | #' ## If PCA is the input, the PCs need to be scaled
 75 | #' data(cell_lines_small)
 76 | #' pca_matrix <- cell_lines_small$scaled_pcs
 77 | #' meta_data <- cell_lines_small$meta_data
 78 | #' harmony_embeddings <- RunHarmony(pca_matrix, meta_data, 'dataset')
 79 | #' 
 80 | #' ## Output is a matrix of corrected PC embeddings
 81 | #' dim(harmony_embeddings)
 82 | #' harmony_embeddings[seq_len(5), seq_len(5)]
 83 | #' 
 84 | #' ## Finally, we can return an object with all the underlying data structures
 85 | #' harmony_object <- RunHarmony(pca_matrix, meta_data, 'dataset', return_object=TRUE)
 86 | #' dim(harmony_object$Y) ## cluster centroids
 87 | #' dim(harmony_object$R) ## soft cluster assignment
 88 | #' dim(harmony_object$Z_corr) ## corrected PCA embeddings
 89 | #' head(harmony_object$O) ## batch by cluster co-occurence matrix
 90 | #' 
 91 | RunHarmony.default <- function(
 92 |   data_mat,
 93 |   meta_data,
 94 |   vars_use,
 95 |   theta = NULL,
 96 |   sigma = 0.1,
 97 |   lambda = 1,
 98 |   nclust = NULL,
 99 |   max_iter = 10,
100 |   early_stop = TRUE,
101 |   ncores = 1,
102 |   plot_convergence = FALSE,
103 |   return_object = FALSE,
104 |   verbose = TRUE,
105 |   .options = harmony_options(),
106 |   ...
107 |   ) {
108 |     
109 | 
110 |     ## Try to set number of OPENBLAS cores for harmony.
111 |     ## the function tries to set OpenMP threads
112 |     ## In case OpenMP is not supported it returns FALSE so we don't
113 |     ## set threads and harmony runs in single-thread mode
114 |     set.cores <- setOMPthreads(ncores)
115 |     
116 |     
117 |     tryCatch({
118 |         ## Check legacy arguments
119 |         check_legacy_args(...)
120 | 
121 |         ## Set threads if BLAS threas are set/detected properly
122 |         if (set.cores) {
123 |             prev.ncores.blas <- RhpcBLASctl::blas_get_num_procs()
124 |             prev.ncores.omp <- RhpcBLASctl::omp_get_num_procs()
125 |             RhpcBLASctl::blas_set_num_threads(ncores)
126 |             RhpcBLASctl::omp_set_num_threads(ncores)
127 |         }
128 |         
129 |         
130 |         ## Parameter setting --------------------------------------------------------
131 | 
132 |         if (!inherits(.options, "harmony_options")) {
133 |             stop("Error: .options must be created from harmony_options()!")
134 |         }
135 | 
136 |         if (early_stop) {
137 |             epsilon.harmony <- .options$epsilon.harmony
138 |         } else {
139 |             epsilon.harmony = -Inf
140 |         }
141 |         max.iter.harmony <- max_iter
142 |         alpha <- .options$alpha
143 |         tau <- .options$tau
144 |         block.size <- .options$block.size
145 |         max.iter.cluster <- .options$max.iter.cluster
146 |         epsilon.cluster <- .options$epsilon.cluster   
147 |         
148 |         
149 | 
150 |         ## TODO: check for 
151 |         ##    partially observed batch variables (WARNING)
152 |         ##    batch variables with only 1 level (WARNING)
153 |         ##    if lambda given, check correct length
154 |         ##    if theta given, check correct length
155 |         ##    very small batch size and tau=0: suggest tau>0
156 |         ##    is PCA correct? 
157 |         if (!(is(meta_data, 'data.frame') | is(meta_data, 'DataFrame'))) {
158 |             if (length(meta_data) %in% dim(data_mat)) {
159 |                 meta_data <- data.frame(batch_variable = meta_data)
160 |                 vars_use <- 'batch_variable'
161 |             } else {
162 |                 stop('meta_data must be either a data.frame or a vector with batch 
163 |                 values for each cell')
164 |             }
165 |         }
166 |         
167 |         if (is.null(vars_use) | any(!vars_use %in% colnames(meta_data))) {
168 |             msg <- gettextf('must provide variables names (e.g. vars_use=%s)', 
169 |                             sQuote('stim'))
170 |             stop(msg)
171 |         }
172 | 
173 |         ## Number of cells
174 |         N <- nrow(meta_data)
175 |         
176 |         ## Check if we need to transpose our data
177 |         if (nrow(data_mat) == N) {
178 |             if (verbose) {
179 |                 message("Transposing data matrix")
180 |             }
181 |             data_mat <- Matrix::t(data_mat)
182 |         }
183 | 
184 |         if (ncol(data_mat) != N) {
185 |             stop("number of labels do not correspond to number of 
186 |                 samples in data matrix")
187 |         }
188 |         
189 | 
190 |                                         # determine K if null
191 |         if (is.null(nclust)) {
192 |             nclust <- min(round(N / 30), 100)
193 |         }
194 |         
195 |                                         # determine theta if null
196 |         if (is.null(theta)) {
197 |             theta <- rep(2, length(vars_use))
198 |         } else if (length(theta) != length(vars_use)) {
199 |             stop('Please specify theta for each variable')
200 |         }
201 |         
202 |         ## determine sigma if it is a scalar
203 |         if (length(sigma) == 1 & nclust > 1) {
204 |             sigma <- rep(sigma, nclust)
205 |         }
206 |         
207 | 
208 |         ## Pre-compute some useful statistics
209 |         phi <- Reduce(rbind, lapply(vars_use, function(var_use) {
210 |             res <- Matrix::sparse.model.matrix(~0 + as.factor(meta_data[[var_use]]))
211 |             Matrix::t(res)
212 |         }))
213 | 
214 |         ## ## number of cells per batch
215 |         N_b <- Matrix::rowSums(phi)
216 | 
217 |         ## Number of factors per covariate
218 |         B_vec <- Reduce(c, lapply(vars_use, function(var_use) {
219 |             nlevels(as.factor(meta_data[[var_use]]))
220 |         }))
221 | 
222 |         ## lambda=NULL means we have automatic estimation
223 |         lambda.auto <- is.null(lambda)
224 |         if (lambda.auto) {
225 |             if(verbose){
226 |                 message("Using automatic lambda estimation")
227 |             }
228 |             lambda_vec <- -1 ## Magic value for the backend
229 |         } else {
230 |             ## We use fixed lambdas
231 |             if(!all(lambda > 0)) {
232 |                 stop("Provided lambdas must be positive")
233 |             }
234 |             if (length(lambda) == 1) {
235 |                 ## Single lambda is being used for all covariates
236 |                 lambda_vec <- c(0, rep(lambda, sum(B_vec)))
237 |             } else {
238 |                 ## Several lambdas, one for each covariate
239 |                 if (length(lambda) != length(vars_use)) {
240 |                     stop(paste0("You specified a lambda value for each ",
241 |                                 "covariate but the number of lambdas specified (",
242 |                                 length(lambda), ") and the number of covariates (",
243 |                                 length(vars_use),") mismatch."))
244 |                 }
245 |                 lambda_vec <- unlist(lapply(seq_len(length(B_vec)), function(b) rep(lambda[b], B_vec[b])))
246 |                 lambda_vec <- c(0, unname(lambda_vec))
247 |             }
248 |         }
249 | 
250 | 
251 | 
252 |         ## Calculate theta (#covariates) x (#levels)
253 |         theta <- Reduce(c, lapply(seq_len(length(B_vec)), function(b)
254 |             rep(theta[b], B_vec[b])))
255 | 
256 |         ## Theta scaling
257 |         theta <- theta * (1 - exp(-(N_b / (nclust * tau))^2))
258 |         
259 |         ## RUN HARMONY
260 |         harmonyObj <- new(harmony)
261 |         
262 |         harmonyObj$setup(
263 |                        data_mat, phi, sigma, theta, lambda_vec, alpha,
264 |                        max.iter.cluster, epsilon.cluster,
265 |                        epsilon.harmony, nclust, block.size,
266 |                        B_vec, verbose
267 |                    )
268 | 
269 |         
270 |         if (verbose) {
271 |             message("Initializing state using k-means centroids initialization")
272 |         }
273 |         harmonyObj$init_cluster_cpp()
274 |         
275 |         harmonize(harmonyObj, max.iter.harmony, verbose)
276 |         
277 |         if (plot_convergence) graphics::plot(HarmonyConvergencePlot(harmonyObj))
278 | 
279 |         
280 |         ## Return either the R6 Harmony object or the corrected PCA matrix
281 |         if (return_object) {
282 |             return(harmonyObj)
283 |         } else {
284 |             res <- as.matrix(harmonyObj$Z_corr)
285 |             row.names(res) <- row.names(data_mat)
286 |             colnames(res) <- colnames(data_mat)
287 |             return(t(res))
288 |         }
289 | 
290 |     }, ## main tryCatch block ends here
291 |     
292 |     finally={
293 |         if(set.cores) {
294 |             RhpcBLASctl::blas_set_num_threads(prev.ncores.blas)
295 |             RhpcBLASctl::omp_set_num_threads(prev.ncores.omp)
296 |         }
297 |     })
298 |     
299 |     
300 |     
301 | }
302 | 
#' A proxy call to [RunHarmony()]. Deprecated.
#'
#' Maintain name backwards compatibility with version 0 of
#' harmony. However, API is not backwards compatible with version
#' 0. This function is deprecated and will be removed in later
#' versions of Harmony.
#'
#' @inheritDotParams RunHarmony.default
#'
#' @return Whatever [RunHarmony()] returns for the forwarded arguments.
#'
#' @export
#' @md
HarmonyMatrix <- function(...) {
    ## Emit the deprecation warning, then forward every argument untouched
    .Deprecated("RunHarmony", msg="HarmonyMatrix is deprecated and will be removed from the API in the future")
    RunHarmony(...)
}
317 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | #' Pipe operator
  2 | #'
  3 | #' @name %>%
  4 | #' @rdname pipe
  5 | #' @keywords internal
  6 | #' @export
  7 | #' @importFrom dplyr %>%
  8 | #' @examples
  9 | #' x <- 5 %>% sum(10)
 10 | #' 
 11 | #' @usage lhs \%>\% rhs
 12 | #' @return return value of rhs function. 
 13 | NULL
 14 | 
 15 | harmonize <- function(harmonyObj, iter_harmony, verbose=TRUE) {
 16 |     if (iter_harmony < 1) {
 17 |         return(0)
 18 |     }
 19 |     
 20 |     for (iter in seq_len(iter_harmony)) {
 21 |         if (verbose) {
 22 |             message(gettextf('Harmony %d/%d', iter, iter_harmony))        
 23 |         }
 24 |         
 25 |         # STEP 1: do clustering
 26 |         err_status <- harmonyObj$cluster_cpp()
 27 |         if (err_status == -1) {
 28 |             stop('terminated by user')
 29 |         } else if (err_status != 0) {
 30 |             stop(gettextf('Harmony exited with non-zero exit status: %d', 
 31 |                             err_status))
 32 |         }
 33 |         
 34 |         # STEP 2: regress out covariates
 35 |         harmonyObj$moe_correct_ridge_cpp()
 36 |         
 37 |         # STEP 3: check for convergence
 38 |         if (harmonyObj$check_convergence(1)) {
 39 |             if (verbose) {
 40 |                 message(gettextf("Harmony converged after %d iterations", 
 41 |                         iter))    
 42 |             }
 43 |             return(0)
 44 |         }
 45 |     }
 46 | }
 47 | 
 48 | 
 49 | 
 50 | HarmonyConvergencePlot <- function(
 51 |         harmonyObj, round_start=1, round_end=Inf, do_wrap=FALSE
 52 |     ) {  
 53 |     ## ignore initial value
 54 |     ## break down kmeans objective into rounds
 55 |     obj_fxn <- data.frame(
 56 |         kmeans_idx = Reduce(c, lapply(harmonyObj$kmeans_rounds, 
 57 |                         function(rounds) {
 58 |             seq_len(rounds)
 59 |         })),
 60 |         harmony_idx = Reduce(c, lapply(
 61 |             seq_len(length(harmonyObj$kmeans_rounds)),
 62 |             function(i) {rep(i, harmonyObj$kmeans_rounds[i])})
 63 |         ),
 64 |         val = utils::tail(harmonyObj$objective_kmeans, -1)
 65 |     ) %>%
 66 |         dplyr::filter(.data$harmony_idx >= round_start) %>% 
 67 |         dplyr::filter(.data$harmony_idx <= round_end) %>% 
 68 |         tibble::rowid_to_column("idx") 
 69 |     
 70 |     
 71 |     plt <- obj_fxn %>% ggplot2::ggplot(ggplot2::aes(.data$idx, .data$val,
 72 |                                                     col = as.factor(.data$harmony_idx))) + 
 73 |         ggplot2::geom_point() + 
 74 |         ggplot2::labs(y = "Objective Function", x = "Clustering Step #", color = "Integration #")
 75 |     
 76 |     if (do_wrap) {
 77 |         plt <- plt + ggplot2::facet_grid(.~.data$harmony_idx, scales = 'free',
 78 |             space = 'free_x')
 79 |     } 
 80 |     return(plt)
 81 | }
 82 | 
 83 | 
 84 | 
 85 | 
 86 | 
 87 | scaleData <- function(A, margin = 1, thresh = 10) {
 88 |     if (!"dgCMatrix" %in% class(A))
 89 |         A <- methods::as(A, "dgCMatrix")
 90 | 
 91 |     if (margin != 1) A <- t(A)
 92 | 
 93 |     res <- scaleRows_dgc(A@x, A@p, A@i, ncol(A), nrow(A), thresh)
 94 |     if (margin != 1) res <- t(res)
 95 |     row.names(res) <- row.names(A)
 96 |     colnames(res) <- colnames(A)
 97 |     return(res)
 98 | }
 99 | 
100 | 
#' Retrieve ridge regression coefficients
#'
#' Thin wrapper that invokes the \code{moe_ridge_get_betas_cpp()} method
#' of a trained Harmony object and hands back that method's result.
#'
#' @param harmonyObj Trained harmony object. Get this by running
#' RunHarmony function with return_object=TRUE.
#' @return The value produced by the object's
#' \code{moe_ridge_get_betas_cpp()} method.
#' @export
moe_ridge_get_betas <- function(harmonyObj) {
    betas <- harmonyObj$moe_ridge_get_betas_cpp()
    return(betas)
}
113 | 
114 | 
setOMPthreads <- function(ncores) {
    ## Checks whether `ncores` threads can be requested from OpenMP.
    ##   ncores  requested number of threads
    ## Returns TRUE when OpenMP reports a thread limit and `ncores` is a
    ## whole number between 1 and that limit. Returns FALSE when the limit
    ## cannot be determined, with a warning unless ncores is 1. Raises an
    ## error when `ncores` itself is invalid.
    ##
    ## The value of the tryCatch() call is the return value of the function:
    ## a return() inside the handler would only leave the handler, not this
    ## function.

    ## Flag set in case user provides invalid number of cores
    invalid.number.of.cores <- FALSE

    tryCatch({
        ## The following block may fail in some build environments (if
        ## OpenMP is not available). In case OpenMP is not available,
        ## we control the flow and fail gracefully by catching the
        ## exception and warn the user. If the ncores parameter is not
        ## valid for the runtime environment then we stop with an error

        ## If OpenMP is not supported, this may return NA
        max.cores <- RhpcBLASctl::omp_get_max_threads()
        ## Sanity check for number of cores
        ## NOTE: the condition below throws an exception if max.cores is
        ## NA, suggesting OpenMP is not supported
        if ((ncores != as.integer(ncores)) || (ncores < 1) || (ncores > max.cores)) {
            invalid.number.of.cores <- TRUE
            stop("")## Throw exception
        }
        TRUE
    },
    error = function(e) {
        if (invalid.number.of.cores) {
            stop(paste0(
                "Invalid number of ncores provided: ", ncores, ". \n",
                "Maximum available cores: ", max.cores))

        } else if (ncores != 1) {
            warning(paste(
                "Harmony was unable to set number of cores for BLAS.",
                "Running in single-thread mode instead"
            ))
        }
        FALSE
    })
}
155 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Harmony <img src="man/figures/logo.png" width="181px" align="right" />
  2 | ===========
  3 | 
  4 | [![Travis-CI Build Status](https://travis-ci.org/immunogenomics/harmony.svg?branch=master)](https://travis-ci.org/immunogenomics/harmony)
  5 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/immunogenomics/harmony?branch=master&svg=true)](https://ci.appveyor.com/project/immunogenomics/harmony)
  6 | [![DOI](https://zenodo.org/badge/doi/10.1038/s41592-019-0619-0.svg)](https://doi.org/10.1038/s41592-019-0619-0)
  7 | 
  8 | *Fast, sensitive and accurate integration of single-cell data with Harmony*
  9 | 
 10 | Check out the manuscript in Nature Methods: 
 11 | - [nature website](https://www.nature.com/articles/s41592-019-0619-0)
 12 | - [read link](https://www.nature.com/articles/s41592-019-0619-0.epdf?shared_access_token=rDg_Rd07lrFXExt_ySj7V9RgN0jAjWel9jnR3ZoTv0NfDJkKCfDV_X9Mq3lweQmKiXEXxhrebQRjJEZdc-xNv6-7ZN1XotlD_mo5TSS4Z4eWn-kUo6mBwA5dEAKlTfR8OT6E10MZY_E-906ajbzvgg%3D%3D)
 13 | 
 14 | For Python users, check out the [harmonypy package](https://github.com/slowkow/harmonypy) by Kamil Slowikowski. 
 15 | 
 16 | # System requirements 
 17 | 
 18 | Harmony has been tested on R versions >= 3.4. Please consult the DESCRIPTION file for more details on required R packages. Harmony has been tested on Linux, OS X, and Windows platforms.
 19 | 
 20 | # Installation
 21 | 
 22 | To run Harmony, open R and install harmony from CRAN: 
 23 | 
 24 | ```r
 25 | install.packages("harmony")
 26 | ```
 27 | 
 28 | If you'd like the latest development version, install from this github directly: 
 29 | 
 30 | ```r
 31 | devtools::install_github("immunogenomics/harmony", build_vignettes=TRUE)
 32 | ```
 33 | 
 34 | 
 35 | # Usage
 36 | 
 37 |  Harmony is designed to be user-friendly and supports some SingleCellExperiment and Seurat R analysis pipelines. Alternatively, it can be used in standalone mode.
 38 | 
 39 | ## Quick Start 
 40 | 
 41 | ### Standalone Mode
 42 | Check out this [vignette](http://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/quickstart.html) for a quick start tutorial which demonstrates the usage of the tool in standalone mode.
 43 | 
 44 | At minimum the following parameters need to be specified to achieve an integration. 
 45 | 
 46 | ```r
 47 | library(harmony)
 48 | my_harmony_embeddings <- RunHarmony(my_pca_embeddings, meta_data, "dataset")
 49 | ```
 50 | 
 51 | 
 52 | ## Seurat Objects
 53 | 
  54 | By default, the harmony API works on Seurat's PCA cell embeddings and corrects them. You can run Harmony within your Seurat workflow with `RunHarmony()`. Prior to calling `RunHarmony()`, the PCA cell embeddings need to be precomputed through Seurat's API. For downstream analyses, use the `harmony` embeddings instead of `pca`.
 55 | 
  56 | For example, the following snippet runs Harmony and then calculates UMAP of the corrected input embeddings:
 57 | 
 58 | ```r
 59 | seuratObj <- RunHarmony(seuratObj, "dataset")
 60 | seuratObj <- RunUMAP(seuratObj, reduction = "harmony")
 61 | ```
 62 | 
  63 | For a more detailed overview of the `RunHarmony()` Seurat interface, check the [Seurat vignette](http://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/Seurat.html).
 64 | 
 65 | ## Harmony with two or more covariates
 66 | 
  67 | Harmony can integrate over multiple covariates. To do this, specify a vector of covariates to integrate.
 68 | 
 69 | ```r
 70 | my_harmony_embeddings <- RunHarmony(
 71 |   my_pca_embeddings, meta_data, c("dataset", "donor", "batch_id")
 72 | )
 73 | ```
 74 | 
 75 | Do the same with your Seurat object: 
 76 | 
 77 | ```r
 78 | seuratObject <- RunHarmony(seuratObject, c("dataset", "donor", "batch_id"))
 79 | ```
 80 | 
 81 | ## Advanced tutorial 
 82 | 
 83 | The examples above all return integrated PCA embeddings. We created a [detailed walkthrough](http://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/detailedWalkthrough.html) that explores the internal data structures and mechanics of the Harmony algorithm.
 84 | 
 85 | 
 86 | # Performance Notes
 87 | 
 88 | ## BLAS vs. OPENBLAS
 89 | 
  90 | R distributions can be bundled with different scientific computing libraries. This can drastically impact harmony's performance. Rstudio comes by default with BLAS. In contrast, conda distributions of R are bundled with OPENBLAS. Overall, our benchmarks show that **harmony+OPENBLAS is substantially faster compared to harmony+BLAS**. Therefore, users with large datasets will benefit from using OPENBLAS.
 91 | 
 92 | ## Multithreading in OPENBLAS
 93 | 
  94 | One caveat is that OPENBLAS uses OPENMP to parallelize operations. By default, OPENBLAS will utilize all cores for these operations. While in theory this accelerates runtimes, in practice harmony is not optimized for multi-threaded performance and the unoptimized parallelization granularity may result in significantly slower run times and inefficient resource utilization (wasted CPU cycles). Therefore, by default harmony turns off multi-threading. However, very large datasets (>1M) may benefit from parallelization. This behavior can be controlled by the `ncores` parameter, which expects the number of threads that harmony will use for its math operations. Users are advised to increase `ncores` gradually and assess potential performance benefits.
 95 | 
 96 | 
 97 | # Reproducing results from manuscript
 98 | 
 99 | Code to reproduce Harmony results from the Korsunsky et al 2019 manuscript will be made available on github.com/immunogenomics/harmony2019. 
100 | 
101 | 
102 | 


--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
 1 | # DO NOT CHANGE the "init" and "install" sections below
 2 | 
 3 | # Download script file from GitHub
 4 | init:
 5 |   ps: |
 6 |         $ErrorActionPreference = "Stop"
 7 |         Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1"
 8 |         Import-Module '..\appveyor-tool.ps1'
 9 | 
10 | install:
11 |   ps: Bootstrap
12 | 
13 | # This is a good reference:
14 | # https://github.com/Bioconductor/BiocManager/blob/master/appveyor.yml
15 | 
16 | cache:
17 |   - C:\RLibrary
18 | 
19 | environment:
20 |   global:
21 |     _R_CHECK_FORCE_SUGGESTS_: false
22 |     R_ARCH: x64
23 |     USE_RTOOLS: true
24 |     R_CHECK_ARGS: "--no-manual --timings"
25 | 
26 |   matrix:
27 |   - R_VERSION: release
28 |     BIOC_USE_DEVEL: FALSE
29 | 
30 | build_script:
31 |   - echo Current directory=%CD%
32 |   - travis-tool.sh install_deps
33 |   - travis-tool.sh install_bioc_deps
34 |   - travis-tool.sh install_r Seurat
35 | 
36 | test_script:
37 |   - travis-tool.sh run_tests
38 | 
39 | on_failure:
40 |   - 7z a failure.zip *.Rcheck\*
41 |   - appveyor PushArtifact failure.zip
42 | 
43 | artifacts:
44 |   - path: '*.Rcheck\**\*.log'
45 |     name: Logs
46 | 
47 |   - path: '*.Rcheck\**\*.out'
48 |     name: Logs
49 | 
50 |   - path: '*.Rcheck\**\*.fail'
51 |     name: Logs
52 | 
53 |   - path: '*.Rcheck\**\*.Rout'
54 |     name: Logs
55 | 
56 |   - path: '\*_*.tar.gz'
57 |     name: Bits
58 | 
59 |   - path: '\*_*.zip'
60 |     name: Bits
61 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## R CMD check results
 2 | 
 3 | This package was archived on 2022-10-30
 4 | 
 5 | There was a conflict with the log() function from RcppArmadillo. 
 6 | 
 7 | All errors, warnings, and notes have been addressed. 
 8 | 
 9 | * This is a resubmission.
10 | 
11 | 


--------------------------------------------------------------------------------
/data/cell_lines.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/data/cell_lines.rda


--------------------------------------------------------------------------------
/data/cell_lines_small.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/data/cell_lines_small.RData


--------------------------------------------------------------------------------
/data/pbmc_stim.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/data/pbmc_stim.RData


--------------------------------------------------------------------------------
/doc/Seurat.R:
--------------------------------------------------------------------------------
  1 | ## ---- include = FALSE---------------------------------------------------------
  2 | knitr::opts_chunk$set(
  3 |   collapse = TRUE,
  4 |   comment = "#>"
  5 | )
  6 | 
  7 | ## ----setup, message=FALSE, warning=FALSE--------------------------------------
  8 | library(harmony)
  9 | library(Seurat)
 10 | library(dplyr)
 11 | library(cowplot)
 12 | 
 13 | 
 14 | ## ----eval=FALSE---------------------------------------------------------------
 15 | #  install.packages('harmony')
 16 | 
 17 | ## -----------------------------------------------------------------------------
 18 | ## Source required data
 19 | data("pbmc_stim")
 20 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim, pbmc.ctrl), project = "PBMC", min.cells = 5)
 21 | 
 22 | ## Separate conditions
 23 | 
 24 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim)), rep("CTRL", ncol(pbmc.ctrl)))
 25 | 
 26 | ## ----eval = FALSE, class.source='fold-hide'-----------------------------------
 27 | #  library(Matrix)
 28 | #  ## Download and extract files from GEO
 29 | #  ##setwd("/path/to/downloaded/files")
 30 | #  genes =  read.table("GSE96583_batch2.genes.tsv.gz", header = FALSE, sep = "\t")
 31 | #  
 32 | #  pbmc.ctrl.full = as.readMM("GSM2560248_2.1.mtx.gz")
 33 | #  colnames(pbmc.ctrl.full) = paste0(read.table("GSM2560248_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-1")
 34 | #  rownames(pbmc.ctrl.full) = genes$V1
 35 | #  
 36 | #  pbmc.stim.full = readMM("GSM2560249_2.2.mtx.gz")
 37 | #  colnames(pbmc.stim.full) = paste0(read.table("GSM2560249_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-2")
 38 | #  rownames(pbmc.stim.full) = genes$V1
 39 | #  
 40 | #  library(Seurat)
 41 | #  
 42 | #  pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim.full, pbmc.ctrl.full), project = "PBMC", min.cells = 5)
 43 | #  pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim.full)), rep("CTRL", ncol(pbmc.ctrl.full)))
 44 | #  
 45 | #  
 46 | #  
 47 | #  
 48 | #  # Running Harmony
 49 | #  
 50 | #  Harmony works on an existing matrix with cell embeddings and outputs its transformed version with the datasets aligned according to some user-defined experimental conditions. By default, harmony will look up the `pca` cell embeddings and use these to run harmony. Therefore, it assumes that the Seurat object has these embeddings already precomputed.
 51 | #  
 52 | #  ## Calculate PCA cell embeddings
 53 | #  
 54 | #  Here, using `Seurat::NormalizeData()`, we will be generating a union of highly variable genes using each condition (the control and stimulated cells). These features are going to be subsequently used to generate the 20 PCs with `Seurat::RunPCA()`.
 55 | #  
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | pbmc <- pbmc %>%
 59 |     NormalizeData(verbose = FALSE)
 60 | 
 61 | VariableFeatures(pbmc) <- split(row.names(pbmc@meta.data), pbmc@meta.data$stim) %>% lapply(function(cells_use) {
 62 |     pbmc[,cells_use] %>%
 63 |         FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 
 64 |         VariableFeatures()
 65 | }) %>% unlist %>% unique
 66 | 
 67 | pbmc <- pbmc %>% 
 68 |     ScaleData(verbose = FALSE) %>% 
 69 |     RunPCA(features = VariableFeatures(pbmc), npcs = 20, verbose = FALSE)
 70 | 
 71 | ## ---- eval=FALSE--------------------------------------------------------------
 72 | #  ## run harmony with default parameters
 73 | #  pbmc <- pbmc %>% RunHarmony("stim")
 74 | #  ## is equivalent to:
 75 | #  pbmc <- RunHarmony(pbmc, "stim")
 76 | 
 77 | ## ---- fig.width = 4, fig.height = 3, fig.align = "center", out.width="50%", fig.cap="By setting `plot_converge=TRUE`, harmony will generate a plot with its objective showing the flow of the integration. Each point represents the cost measured after a clustering round. Different colors represent different Harmony iterations which is controlled by `max_iter` (assuming that early_stop=FALSE). Here `max_iter=10` and up to 10 correction steps are expected. However, `early_stop=TRUE` so harmony will stop after the cost plateaus."----
 78 | 
 79 | pbmc <- pbmc %>% 
 80 |     RunHarmony("stim", plot_convergence = TRUE, nclust = 50, max_iter = 10, early_stop = T)
 81 | 
 82 | ## -----------------------------------------------------------------------------
 83 | harmony.embeddings <- Embeddings(pbmc, reduction = "harmony")
 84 | 
 85 | ## ---- fig.width=7, fig.height=3, out.width="100%", fig.align="center", fig.cap="Evaluate harmonization of stim parameter in the harmony generated cell embeddings"----
 86 | 
 87 | p1 <- DimPlot(object = pbmc, reduction = "harmony", pt.size = .1, group.by = "stim")
 88 | p2 <- VlnPlot(object = pbmc, features = "harmony_1", group.by = "stim",  pt.size = .1)
 89 | plot_grid(p1,p2)
 90 | 
 91 | ## ---- fig.width = 6, fig.height=3, out.width="100%"---------------------------
 92 | 
 93 | DimHeatmap(object = pbmc, reduction = "harmony", cells = 500, dims = 1:3)
 94 | 
 95 | ## -----------------------------------------------------------------------------
 96 | pbmc <- pbmc %>%
 97 |     FindNeighbors(reduction = "harmony") %>%
 98 |     FindClusters(resolution = 0.5) 
 99 | 
100 | ## ---- fig.width=5, fig.height=2.5, fig.align="center", fig.cap="t-SNE Visualization of harmony embeddings"----
101 | pbmc <- pbmc %>%
102 |     RunTSNE(reduction = "harmony")
103 | 
104 | 
105 | p1 <- DimPlot(pbmc, reduction = "tsne", group.by = "stim", pt.size = .1)
106 | p2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = .1)
107 | plot_grid(p1, p2)
108 | 
109 | 
110 | ## ---- fig.width = 7, fig.height = 7, out.width="100%", fig.cap="Expression of gene panel heatmap in the harmonized PBMC dataset"----
111 | FeaturePlot(object = pbmc, features= c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), 
112 |             min.cutoff = "q9", cols = c("lightgrey", "blue"), pt.size = 0.5)
113 | 
114 | 
115 | ## ---- fig.width=5, fig.height=2.5, fig.align="center", fig.cap="UMAP Visualization of harmony embeddings"----
116 | pbmc <- pbmc %>%
117 |     RunUMAP(reduction = "harmony",  dims = 1:20)
118 | 
119 | p1 <- DimPlot(pbmc, reduction = "umap", group.by = "stim", pt.size = .1)
120 | p2 <- DimPlot(pbmc, reduction = "umap", label = TRUE,  pt.size = .1)
121 | plot_grid(p1, p2)
122 | 
123 | 
124 | ## -----------------------------------------------------------------------------
125 | sessionInfo()
126 | 
127 | 


--------------------------------------------------------------------------------
/doc/Seurat.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Using harmony in Seurat"
  3 | output:
  4 |   rmarkdown::html_vignette:
  5 |     code_folding: show
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Using harmony in Seurat}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | knitr::opts_chunk$set(
 14 |   collapse = TRUE,
 15 |   comment = "#>"
 16 | )
 17 | ```
 18 | 
 19 | ```{r setup, message=FALSE, warning=FALSE}
 20 | library(harmony)
 21 | library(Seurat)
 22 | library(dplyr)
 23 | library(cowplot)
 24 | 
 25 | ```
 26 | # Introduction
 27 | 
  28 | This tutorial describes how to use harmony in Seurat v5 single-cell analysis workflows. `RunHarmony()` is a generic function designed to interact with Seurat objects. This vignette will walk through the basic workflow of Harmony with Seurat objects. It will also provide some basic downstream analyses demonstrating the properties of harmonized cell embeddings and a brief explanation of the exposed algorithm parameters.
 29 | 
 30 | Install Harmony from CRAN with standard commands.
 31 | 
 32 | ```{r eval=FALSE}
 33 | install.packages('harmony')
 34 | ```
 35 | 
 36 | # Generating the dataset
 37 | 
 38 | For this demo, we will be aligning two groups of PBMCs [Kang et al., 2017](https://doi.org/10.1038/nbt.4042). In this experiment, PBMCs are in stimulated and control conditions. The stimulated PBMC group was treated with interferon beta.
 39 | 
 40 | 
  41 | 
 42 | 
 43 | 
 44 | ## Generate SeuratObject
 45 | 
 46 | ```{r}
 47 | ## Source required data
 48 | data("pbmc_stim")
 49 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim, pbmc.ctrl), project = "PBMC", min.cells = 5)
 50 | 
 51 | ## Separate conditions
 52 | 
 53 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim)), rep("CTRL", ncol(pbmc.ctrl)))
 54 | ```
 55 | 
 56 | 
 57 | ## (Optional) Download original data
 58 | The example above contains only two thousand cells. The full [Kang et al., 2017](https://doi.org/10.1038/nbt.4042) dataset is deposited in the [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE96583). This analysis uses GSM2560248 and GSM2560249 samples from [GSE96583_RAW.tar](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file) file and the [GSE96583_batch2.genes.tsv.gz](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file&file=GSE96583%5Fbatch2%2Egenes%2Etsv%2Egz) gene file.
 59 | 
 60 | ```{r eval = FALSE, class.source='fold-hide'}
 61 | library(Matrix)
 62 | ## Download and extract files from GEO
 63 | ##setwd("/path/to/downloaded/files")
 64 | genes =  read.table("GSE96583_batch2.genes.tsv.gz", header = FALSE, sep = "\t")
 65 | 
  66 | pbmc.ctrl.full = readMM("GSM2560248_2.1.mtx.gz")
 67 | colnames(pbmc.ctrl.full) = paste0(read.table("GSM2560248_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-1")
 68 | rownames(pbmc.ctrl.full) = genes$V1
 69 | 
 70 | pbmc.stim.full = readMM("GSM2560249_2.2.mtx.gz")
 71 | colnames(pbmc.stim.full) = paste0(read.table("GSM2560249_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-2")
 72 | rownames(pbmc.stim.full) = genes$V1
 73 | 
 74 | library(Seurat)
 75 | 
 76 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim.full, pbmc.ctrl.full), project = "PBMC", min.cells = 5)
 77 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim.full)), rep("CTRL", ncol(pbmc.ctrl.full)))
 78 | ```
 79 | 
 80 | 
 81 | 
 82 | # Running Harmony
 83 | 
 84 | Harmony works on an existing matrix with cell embeddings and outputs its transformed version with the datasets aligned according to some user-defined experimental conditions. By default, harmony will look up the `pca` cell embeddings and use these to run harmony. Therefore, it assumes that the Seurat object has these embeddings already precomputed.
 85 | 
 86 | ## Calculate PCA cell embeddings
 87 | 
 88 | Here, after normalizing the counts with `Seurat::NormalizeData()`, we generate the union of the highly variable genes found in each condition (the control and the stimulated cells) with `Seurat::FindVariableFeatures()`. These features are subsequently used to generate the 20 PCs with `Seurat::RunPCA()`.
 89 | 
 90 | ```{r}
 91 | pbmc <- pbmc %>%
 92 |     NormalizeData(verbose = FALSE)
 93 | ## Variable features: union of the 2000 "vst" features selected separately within each `stim` group
 94 | VariableFeatures(pbmc) <- split(row.names(pbmc@meta.data), pbmc@meta.data$stim) %>% lapply(function(cells_use) {
 95 |     pbmc[,cells_use] %>%
 96 |         FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 
 97 |         VariableFeatures()
 98 | }) %>% unlist %>% unique
 99 | ## Scale the data, then compute 20 PCs restricted to the features selected above
100 | pbmc <- pbmc %>% 
101 |     ScaleData(verbose = FALSE) %>% 
102 |     RunPCA(features = VariableFeatures(pbmc), npcs = 20, verbose = FALSE)
103 | ```
104 | 
105 | ## Perform an integrated analysis
106 | 
107 | To run harmony on a Seurat object after it has been normalized, only one argument needs to be specified: the batch covariate located in the metadata. For this vignette, further parameters are specified to align the dataset, but the minimum parameters are shown in the snippet below:
108 | 
109 | ```{r, eval=FALSE}
110 | ## run harmony with default parameters
111 | pbmc <- pbmc %>% RunHarmony("stim")
112 | ## is equivalent to:
113 | pbmc <- RunHarmony(pbmc, "stim")
114 | ```
115 | 
116 | Here, we will be running harmony with some indicative parameters and plotting the convergence plot to illustrate some of the under the hood functionality.
117 | 
118 | ```{r, fig.width = 4, fig.height = 3, fig.align = "center", out.width="50%", fig.cap="By setting `plot_converge=TRUE`, harmony will generate a plot with its objective showing the flow of the integration. Each point represents the cost measured after a clustering round. Different colors represent different Harmony iterations which is controlled by `max_iter` (assuming that early_stop=FALSE). Here `max_iter=10` and up to 10 correction steps are expected. However, `early_stop=TRUE` so harmony will stop after the cost plateaus."}
119 | ## Indicative settings; TRUE is spelled out because the alias `T` is an ordinary, reassignable variable
120 | pbmc <- pbmc %>% 
121 |     RunHarmony("stim", plot_convergence = TRUE, nclust = 50, max_iter = 10, early_stop = TRUE)
122 | ```
123 | 
124 | 
125 | 
126 | ### Harmony API parameters on Seurat objects
127 | 
128 | `RunHarmony` has several parameters accessible to users which are outlined below.
129 | 
130 | #### `object` (required)
131 | 
132 | The Seurat object. This vignette assumes Seurat objects are version 5.
133 | 
134 | #### `group.by.vars` (required)
135 | 
136 | A character vector that specifies all the experimental covariates to be corrected/harmonized by the algorithm.
137 | 
138 | When using `RunHarmony()` with Seurat, harmony will look up the `group.by.vars` metadata fields in the Seurat Object metadata.
139 | 
140 | For example, given the `pbmc[["stim"]]` exists as the stim condition, setting `group.by.vars="stim"` will perform integration of these samples accordingly. If you want to integrate on another variable, it needs to be present in Seurat object's meta.data.
141 | 
142 | To correct for several covariates, specify them in a vector: `group.by.vars = c("stim", "new_covariate")`.
143 | 
144 | #### `reduction.use`
145 | 
146 | The cell embeddings to be used for the batch alignment. This parameter assumes that a reduced dimension already exists in the reduction slot of the Seurat object.  By default, the `pca` reduction is used.
147 | 
148 | 
149 | #### `dims.use`
150 | 
151 | Optional parameter which can use a name vector to select specific dimensions to be harmonized.
152 | 
153 | 
154 | ### Algorithm parameters
155 | ![Harmony Algorithm Overview](main.jpg){width=100%}
156 | 
157 | #### `nclust`
158 | 
159 | is a positive integer. Under the hood, harmony applies k-means soft-clustering. For this task, `k` needs to be determined. `nclust` corresponds to `k`. The harmonization results and performance are not particularly sensitive to this parameter within a reasonable range of values. If this parameter is not set, harmony will determine it automatically based on the dataset size, with a maximum cap of 200. For datasets with a vast number of different cell types and batches, this parameter may need to be determined manually.
160 | 
161 | #### `sigma`
162 | 
163 | a positive scalar that controls the soft clustering probability assignment of single-cells to different clusters. Larger values will assign a larger probability to distant clusters of cells resulting in a different correction profile. Single-cells are assigned to clusters by their Euclidean distance $d$ to some cluster center $Y$ after cosine normalization, which is defined in the range $[0,4]$. The clustering probability of each cell is calculated as $e^{-\frac{d}{\sigma}}$ where $\sigma$ is controlled by the `sigma` parameter. The default value of `sigma` is 0.1 and it generally works well since it defines the probability assignment of a cell in the range $[e^{-40}, e^0]$. Larger values of `sigma` restrict the dynamic range of probabilities that can be assigned to cells. For example, `sigma=1` will yield probabilities in the range of $[e^{-4}, e^0]$.
164 | 
165 | 
166 | #### `theta`
167 | 
168 | `theta` is a positive scalar vector that determines the coefficient of harmony's diversity penalty for each corrected experimental covariate. In challenging experimental conditions, increasing theta may result in better integration results. Theta is an exponential parameter of the diversity penalty, thus setting `theta=0` disables this penalty while increasing it to values greater than 1 will perform more aggressive corrections in an exponential manner. By default, it will set `theta=2` for each experimental covariate.
169 | 
170 | #### `max_iter`
171 | 
172 | The number of correction steps harmony will perform before completing the data set integration. In general, more iterations than necessary increase the computational runtime, which becomes especially evident in bigger datasets. Setting `early_stop=TRUE` may reduce the actual number of correction steps, which will then be smaller than `max_iter`.
173 | 
174 | #### `early_stop`
175 | 
176 | Under the hood, harmony minimizes its objective function through a series of clustering and integration steps. By setting `early_stop=TRUE`, harmony exits before reaching the `max_iter` correction steps when the change of the objective function after a correction step is less than `1e-4`. This parameter can drastically reduce run-time in bigger datasets. 
177 | 
178 | #### `.options`
179 | A set of internal algorithm parameters that can be overridden. For advanced users only.
180 | 
181 | 
182 | 
183 | ### Seurat specific parameters
184 | 
185 | These parameters are Seurat-specific and do not affect the flow of the algorithm.
186 | 
187 | #### `project_dim`
188 | 
189 | Toggle-like parameter, by default `project_dim=TRUE`. When enabled, `RunHarmony()` calculates genomic feature loadings using Seurat's `ProjectDim()` that correspond to the harmonized cell embeddings.
190 | 
191 | #### `reduction.save`
192 | 
193 | The new Reduced Dimension slot identifier. By default, `reduction.save="harmony"`. This option allows several independent runs of harmony to be retained in the appropriate slots in the SeuratObjects. It is useful if you want to try Harmony with multiple parameters and save them as e.g. 'harmony_theta0', 'harmony_theta1', 'harmony_theta2'.
194 | 
195 | ### Miscellaneous parameters
196 | 
197 | These parameters help users troubleshoot harmony. 
198 | 
199 | #### `plot_convergence`
200 | 
201 | Option that plots the convergence plot after the execution of the algorithm. By default `FALSE`. Setting it to `TRUE` will collect harmony's objective value and plot it allowing the user to troubleshoot the flow of the algorithm and fine-tune the parameters of the dataset integration procedure.
202 | 
203 | 
204 | 
205 | ### Accessing the data
206 | 
207 | `RunHarmony()` returns the Seurat object which contains the harmonized cell embeddings in a slot named **harmony**. This entry can be accessed via `pbmc@reductions$harmony`. To access the values of the cell embeddings we can also use:
208 | 
209 | ```{r}
210 | harmony.embeddings <- Embeddings(pbmc, reduction = "harmony")
211 | ```
212 | 
213 | ### Inspection of the modalities
214 | 
215 | After Harmony integration, we should inspect the quality of the harmonization and contrast it with the unharmonized algorithm input. Ideally, cells from different conditions will align along the Harmonized PCs. If they are not, you could increase the *theta* value above to force a more aggressive fit of the dataset and rerun the workflow.
216 | 
217 | ```{r, fig.width=7, fig.height=3, out.width="100%", fig.align="center", fig.cap="Evaluate harmonization of stim parameter in the harmony generated cell embeddings"}
218 | 
219 | p1 <- DimPlot(object = pbmc, reduction = "harmony", pt.size = .1, group.by = "stim")
220 | p2 <- VlnPlot(object = pbmc, features = "harmony_1", group.by = "stim",  pt.size = .1)
221 | plot_grid(p1,p2)
222 | ```
223 | 
224 | Plot Genes correlated with the Harmonized PCs
225 | 
226 | ```{r, fig.width = 6, fig.height=3, out.width="100%"}
227 | 
228 | DimHeatmap(object = pbmc, reduction = "harmony", cells = 500, dims = 1:3)
229 | ```
230 | 
231 | # Using harmony embeddings for dimensionality reduction in Seurat
232 | 
233 | The harmonized cell embeddings generated by harmony can be used for further integrated analyses. In this workflow, the name of the harmony `reduction` is passed to each Seurat method that requires it.
234 | 
235 | ## Perform clustering using the harmonized vectors of cells
236 | ```{r}
237 | pbmc <- pbmc %>%
238 |     FindNeighbors(reduction = "harmony") %>%
239 |     FindClusters(resolution = 0.5) 
240 | ```
241 | ## TSNE dimensionality reduction
242 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="t-SNE Visualization of harmony embeddings"}
243 | pbmc <- pbmc %>%
244 |     RunTSNE(reduction = "harmony")
245 | 
246 | 
247 | p1 <- DimPlot(pbmc, reduction = "tsne", group.by = "stim", pt.size = .1)
248 | p2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = .1)
249 | plot_grid(p1, p2)
250 | 
251 | ```
252 | 
253 | It is important to verify that the harmonized data still capture the biological states of the cells. By checking the following genes, we can see that biological cell states are preserved after harmonization.
254 | 
255 | ```{r, fig.width = 7, fig.height = 7, out.width="100%", fig.cap="Expression of gene panel heatmap in the harmonized PBMC dataset"}
256 | FeaturePlot(object = pbmc, features= c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), 
257 |             min.cutoff = "q9", cols = c("lightgrey", "blue"), pt.size = 0.5)
258 | 
259 | ```
260 | 
261 | ## UMAP
262 | 
263 | Similarly to t-SNE, we can run UMAP by passing the harmony reduction to the function.
264 | 
265 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="UMAP Visualization of harmony embeddings"}
266 | pbmc <- pbmc %>%
267 |     RunUMAP(reduction = "harmony",  dims = 1:20)
268 | 
269 | p1 <- DimPlot(pbmc, reduction = "umap", group.by = "stim", pt.size = .1)
270 | p2 <- DimPlot(pbmc, reduction = "umap", label = TRUE,  pt.size = .1)
271 | plot_grid(p1, p2)
272 | 
273 | ```
274 | 
275 | 
276 | ```{r}
277 | sessionInfo()
278 | ```
279 | 
280 | 


--------------------------------------------------------------------------------
/doc/detailedWalkthrough.R:
--------------------------------------------------------------------------------
  1 | ## ---- message=FALSE, warning=FALSE, class.source = 'fold-hide'----------------
  2 | 
  3 | ## Source required libraries
  4 | library(data.table)
  5 | library(tidyverse)
  6 | library(ggthemes)
  7 | library(ggrepel)
  8 | library(harmony)
  9 | library(patchwork)
 10 | library(tidyr)
 11 | 
 12 | ## Useful util functions
 13 | 
 14 | cosine_normalize <- function(X, margin) {
 15 |     ## Scale X to unit Euclidean norm along rows (margin == 1) or, for any
 16 |     ## other margin value, along columns. Returns a matrix named like X.
 17 |     by_rows <- margin == 1
 18 |     sq_sums <- if (by_rows) rowSums(X ^ 2) else colSums(X ^ 2)
 19 |     res <- sweep(as.matrix(X), if (by_rows) 1 else 2, sqrt(sq_sums), '/')
 20 |     ## put the row and column names of the input back on the result
 21 |     row.names(res) <- row.names(X)
 22 |     colnames(res) <- colnames(X)
 23 | 
 24 |     return(res)
 25 | }
 26 | 
 27 | onehot <- function(vals) {
 28 |     design <- model.matrix(~0 + as.factor(vals))  ## one indicator column per level, no intercept
 29 |     t(design)  ## levels in rows, one column per element of vals
 30 | }
 31 | 
 32 | colors_use <- c(jurkat = "#810F7C",  ## rgb(129, 15, 124)
 33 |                 t293 = "#D09E2D",    ## rgb(208, 158, 45)
 34 |                 half = "#006D2C")    ## rgb(0, 109, 44)
 35 | 
 36 | 
 37 | do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names, 
 38 |                        palette_use = colors_use,
 39 |                        pt_size = 4, point_size = .5, base_size = 10, do_points = TRUE, do_density = FALSE, h = 4, w = 8) {
 40 |     ## Scatter plot of the first two columns of `umap_use`, coloured by the `label_name`
 41 |     ## column of `meta_data` (column-bound to the coordinates). If `nice_names` is given it
 42 |     ## is inner-joined on `given_name` and its `nice_name` column replaces the labels.
 43 |     ## `pt_size` sizes the text labels; `point_size`, `h` and `w` are not used. Returns a ggplot.
 44 |     umap_use <- umap_use[, 1:2]
 45 |     colnames(umap_use) <- c('X1', 'X2')
 46 |     plt_df <- umap_use %>% data.frame() %>% 
 47 |         cbind(meta_data) %>% 
 48 |         dplyr::sample_frac(1L)  ## keeps every row, in random order
 49 |     plt_df$given_name <- plt_df[[label_name]]
 50 | 
 51 |     if (!missing(nice_names)) {
 52 |         ## explicit re-assignment: magrittr's `%<>%` is not attached by the libraries loaded above
 53 |         plt_df <- plt_df %>%
 54 |             dplyr::inner_join(nice_names, by = "given_name") %>% 
 55 |             subset(nice_name != "" & !is.na(nice_name))
 56 |         plt_df[[label_name]] <- plt_df$nice_name
 57 |     }
 58 | 
 59 |     plt <- plt_df %>% 
 60 |         ggplot(aes(X1, X2, colour = .data[[label_name]], fill = .data[[label_name]])) + 
 61 |         theme_tufte(base_size = base_size) + 
 62 |         theme(panel.background = element_rect(fill = NA, color = "black")) + 
 63 |         guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = "none") +
 64 |         scale_color_manual(values = palette_use) + 
 65 |         scale_fill_manual(values = palette_use) +    
 66 |         theme(plot.title = element_text(hjust = .5)) + 
 67 |         labs(x = "UMAP 1", y = "UMAP 2") 
 68 | 
 69 |     if (do_points) plt <- plt + geom_point(size = 0.2)
 70 |     if (do_density) plt <- plt + geom_density_2d()
 71 |     ## legends are switched off by name ("none"); unnamed arguments to guides() remove nothing
 72 |     if (no_guides)
 73 |         plt <- plt + guides(colour = "none", fill = "none", alpha = "none")
 74 |     if (do_labels) 
 75 |         plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA,
 76 |                                       aes(label = .data[[label_name]]), color = "white", size = pt_size, alpha = 1, segment.size = 0) + 
 77 |         guides(colour = "none", fill = "none")
 78 |     return(plt)
 79 | }
 80 | 
 81 | 
 82 | ## -----------------------------------------------------------------------------
 83 | data(cell_lines)
 84 | V <- cell_lines$scaled_pcs  ## embedding matrix, later passed to RunHarmony() as data_mat
 85 | V_cos <- cosine_normalize(V, 1)  ## every row of V scaled to unit length
 86 | meta_data <- cell_lines$meta_data  ## cell labels; the `dataset` and `cell_type` columns are used below
 87 | 
 88 | ## ---- warning=FALSE, fig.width=5, fig.height=3, fig.align="center"------------
 89 | do_scatter(V, meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
 90 |     labs(title = 'Colored by dataset', x = 'PC1', y = 'PC2') +
 91 | do_scatter(V, meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
 92 |     labs(title = 'Colored by cell type', x = 'PC1', y = 'PC2') +
 93 | NULL
 94 | 
 95 | ## -----------------------------------------------------------------------------
 96 | 
 97 | set.seed(1)
 98 | harmonyObj <- harmony::RunHarmony(
 99 |     data_mat = V, ## PCA embedding matrix of cells
100 |     meta_data = meta_data, ## dataframe with cell labels
101 |     theta = 1, ## cluster diversity enforcement
102 |     vars_use = 'dataset', ## variable to integrate out
103 |     nclust = 5, ## number of clusters in Harmony model
104 |     max_iter = 0, ## stop after initialization
105 |     return_object = TRUE ## return the full Harmony model object
106 | )
107 | 
108 | 
109 | 
110 | ## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
111 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
112 |     labs(title = 'Z_orig', subtitle = 'Euclidean distance', x = 'PC1', y = 'PC2') +
113 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
114 |     labs(title = 'Z_cos', subtitle = 'Induced Cosine distance', x = 'PC1', y = 'PC2')
115 | 
116 | 
117 | ## ---- fig.width=8, fig.height=3, out.width="100%"-----------------------------
118 | 
119 | harmonyObj$Z_cos %>% t %>% data.frame() %>% 
120 |     cbind(meta_data) %>% 
121 |     tidyr::gather(key, val, X1:X20) %>% 
122 |     ggplot(aes(reorder(gsub('X', 'PC', key), as.integer(gsub('X', '', key))), val)) + 
123 |         geom_boxplot(aes(color = dataset)) + 
124 |         scale_color_manual(values = colors_use) + 
125 |         labs(x = 'PC number', y = 'PC embedding value', title = 'Z_cos (unit scaled PCA embeddings) for all 20 PCs') + 
126 |         theme_tufte(base_size = 10) + geom_rangeframe() + 
127 |         theme(axis.text.x = element_text(angle = 45, hjust = 1))
128 | 
129 | ## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
130 | 
131 | cluster_centroids <- harmonyObj$Y
132 | 
133 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = FALSE, do_labels = FALSE) + 
134 |     labs(title = 'Initial kmeans cluster centroids', subtitle = '', x = 'PC1', y = 'PC2') +
135 |     geom_point(
136 |         data = data.frame(t(cluster_centroids)), 
137 |         color = 'black', fill = 'black', alpha = .8,
138 |         shape = 21, size = 6
139 |     ) +
140 | NULL
141 | 
142 | 
143 | ## -----------------------------------------------------------------------------
144 | cluster_assignment_matrix <- harmonyObj$R
145 | 
146 | 
147 | ## ---- fig.height=5, fig.width=5-----------------------------------------------
148 | t(harmonyObj$Z_cos) %>% data.frame() %>%
149 |     cbind(meta_data) %>% 
150 |     tibble::rowid_to_column('id') %>% 
151 |     dplyr::inner_join(
152 |         cluster_assignment_matrix %>% t() %>% data.table() %>% 
153 |             tibble::rowid_to_column('id') %>%
154 |             tidyr::gather(cluster, r, -id) %>% 
155 |             dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
156 |         by = 'id'
157 |     ) %>% 
158 |     dplyr::sample_frac(1L) %>% 
159 |     ggplot(aes(X1, X2, color = r)) + 
160 |         geom_point(size=0.2) + 
161 |         theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
162 |         facet_grid(cluster ~ dataset) + 
163 |         scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
164 |         labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'Initial probabilistic cluster assignments')
165 | 
166 | ## -----------------------------------------------------------------------------
167 | observed_counts <- with(harmonyObj, R %*% t(as.matrix(Phi)))
168 | round(observed_counts)
169 | 
170 | 
171 | 
172 | ## -----------------------------------------------------------------------------
173 | ## observed counts (O)
174 | round(harmonyObj$O)
175 | 
176 | ## expected counts (E)
177 | round(harmonyObj$E)
178 | 
179 | 
180 | ## -----------------------------------------------------------------------------
181 | phi_celltype <- onehot(meta_data$cell_type) 
182 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype)
183 | round(observed_cell_counts)
184 | 
185 | 
186 | ## -----------------------------------------------------------------------------
187 | harmonyObj$max_iter_kmeans
188 | 
189 | ## -----------------------------------------------------------------------------
190 | ## we can specify how many rounds of clustering to do
191 | harmonyObj$max_iter_kmeans <- 10
192 | harmonyObj$cluster_cpp()
193 | 
194 | ## -----------------------------------------------------------------------------
195 | round(harmonyObj$O)
196 | 
197 | ## ---- fig.height=5, fig.width=5-----------------------------------------------
198 | new_cluster_assignment_matrix <- harmonyObj$R
199 | 
200 | t(harmonyObj$Z_cos) %>% data.frame() %>%
201 |     cbind(meta_data) %>% 
202 |     tibble::rowid_to_column('id') %>% 
203 |     dplyr::inner_join(
204 |         new_cluster_assignment_matrix %>% t() %>% data.table() %>% 
205 |             tibble::rowid_to_column('id') %>%
206 |             tidyr::gather(cluster, r, -id) %>% 
207 |             dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
208 |         by = 'id'
209 |     ) %>% 
210 |     dplyr::sample_frac(1L) %>% 
211 |     ggplot(aes(X1, X2, color = r)) + 
212 |         geom_point(shape = '.') + 
213 |         theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
214 |         facet_grid(cluster ~ dataset) + 
215 |         scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
216 |         labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'New probabilistic cluster assignments')
217 | 
218 | ## -----------------------------------------------------------------------------
219 | phi_celltype <- onehot(meta_data$cell_type)
220 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype)
221 | round(observed_cell_counts)
222 | 
223 | ## -----------------------------------------------------------------------------
224 | round(apply(prop.table(observed_cell_counts, 1), 1, min) * 100, 3)
225 | 
226 | ## -----------------------------------------------------------------------------
227 | 
228 | with(harmonyObj, {
229 |     distance_matrix <- 2 * (1 - t(Y) %*% Z_cos)
230 |     distance_score <- exp(-distance_matrix / as.numeric(sigma))
231 |     diversity_score <- sweep(E / O, 2, theta, '^') %*% as.matrix(Phi)  ## theta is an exponent: (E / O) ^ theta
232 |     ## new assignments are based on distance and diversity
233 |     R_new <- distance_score * diversity_score  
234 |     ## normalize R so each cell sums to 1
235 |     R_new <- prop.table(R_new, 2)    
236 | })
237 | 
238 | 
239 | ## -----------------------------------------------------------------------------
240 | ## with theta = 0
241 | with(harmonyObj, {
242 |     (E / O) ^ 0
243 | })
244 | 
245 | ## -----------------------------------------------------------------------------
246 | ## with theta = 1
247 | with(harmonyObj, {
248 |     round((E / O) ^ 1, 2)
249 | })
250 | 
251 | 
252 | ## -----------------------------------------------------------------------------
253 | ## as theta approach infinity
254 | with(harmonyObj, {
255 |     round((E / O) ^ 1e6, 2)
256 | })
257 | 
258 | 
259 | ## -----------------------------------------------------------------------------
260 | Y_unscaled <- with(harmonyObj, Z_cos %*% t(R))
261 | 
262 | ## -----------------------------------------------------------------------------
263 | Y_new <- cosine_normalize(Y_unscaled, 2)
264 | 
265 | ## -----------------------------------------------------------------------------
266 | harmonyObj$moe_correct_ridge_cpp()
267 | 
268 | ## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
269 | 
270 | do_scatter(cosine_normalize(t(harmonyObj$Z_orig), 1), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
271 |     labs(title = 'Z_cos before MoE', x = 'PC1', y = 'PC2') +
272 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
273 |     labs(title = 'Z_cos after MoE', x = 'PC1', y = 'PC2')
274 | 
275 | ## ---- fig.width=8, fig.height=3, fig.align="center", out.width="100%"---------
276 | 
277 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
278 |     labs(title = 'Z_orig', subtitle = 'Original PCA embeddings', x = 'PC1', y = 'PC2') +
279 | do_scatter(t(harmonyObj$Z_corr), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
280 |     labs(title = 'Z_corr', subtitle = '= Z_orig - correction_factors', x = 'PC1', y = 'PC2') +
281 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
282 |     labs(title = 'Z_cos', subtitle = '= Unit_scaled(Z_corr)', x = 'Scaled PC1', y = 'Scaled PC2') +
283 | NULL
284 | 
285 | ## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
286 | 
287 | plt <- data.table(PC1_After = harmonyObj$Z_corr[1, ], PC1_Before = harmonyObj$Z_orig[1, ]) %>% 
288 |     cbind(meta_data) %>% 
289 |     dplyr::sample_frac(1L) %>% 
290 |     ggplot(aes(PC1_Before, PC1_After)) + 
291 |         geom_abline(slope = 1, intercept = 0) + 
292 |         theme_tufte(base_size = 10) + geom_rangeframe() + 
293 |         scale_color_tableau() + 
294 |         guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) + 
295 |         NULL
296 | 
297 | plt + geom_point(shape = '.', aes(color = dataset)) + 
298 |         labs(x = 'PC1 before correction', y = 'PC1 after correction', 
299 |              title = 'PC1 correction for each cell', subtitle = 'Colored by Dataset') + 
300 | plt + geom_point(shape = '.', aes(color = cell_type)) + 
301 |         labs(x = 'PC1 before correction', y = 'PC1 after correction', 
302 |              title = 'PC1 correction for each cell', subtitle = 'Colored by Cell Type') + 
303 | NULL
304 | 
305 | 
306 | ## ---- echo=TRUE---------------------------------------------------------------
307 | 
308 | W <- list()
309 | ## Convert sparse data structures to dense matrix
310 | Phi.moe <- as.matrix(harmonyObj$Phi_moe)
311 | lambda <- diag(c(harmonyObj$lambda))
312 | ## Get beta coefficients for all the clusters: (Phi R_k Phi' + lambda)^-1 Phi R_k Z_orig'
313 | for (k in seq_len(harmonyObj$K)) {
314 |     W[[k]] <- solve(Phi.moe %*% diag(harmonyObj$R[k, ]) %*% t(Phi.moe) + lambda) %*% (Phi.moe %*% diag(harmonyObj$R[k, ])) %*% t(harmonyObj$Z_orig)
315 | }
316 | 
317 | 
318 | 
319 | ## ---- fig.width=5, fig.height=5-----------------------------------------------
320 | 
321 | cluster_assignment_matrix <- harmonyObj$R
322 | ## Soft assignment r of every cell, drawn on the first two dimensions of Z_orig, one panel per cluster x dataset
323 | t(harmonyObj$Z_orig) %>% data.frame() %>%
324 |     cbind(meta_data) %>% 
325 |     tibble::rowid_to_column('id') %>% 
326 |     dplyr::inner_join(
327 |         cluster_assignment_matrix %>% t() %>% data.table() %>% 
328 |             tibble::rowid_to_column('id') %>%
329 |             tidyr::gather(cluster, r, -id) %>% 
330 |             dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
331 |         by = 'id'
332 |     ) %>% 
333 |     dplyr::sample_frac(1L) %>% 
334 |     ggplot(aes(X1, X2, color = r)) + 
335 |         geom_point(size = 0.2) +  ## 0.2 is a point size (as in the initial-assignment plot), not a shape code
336 |         theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
337 |         facet_grid(cluster ~ dataset) + 
338 |         scale_color_gradient(low = 'grey', breaks = seq(0, 1, .2)) + 
339 |         labs(x = 'PC1', y = 'PC2', title = 'Cluster assigned in original PCA space (Z_orig)')
340 | 
341 | 
342 | ## -----------------------------------------------------------------------------
343 | plt_list <- lapply(1:harmonyObj$K, function(k) {
344 |     plt_df <- W[[k]] %>% data.frame() %>% 
345 |         dplyr::select(X1, X2)  ## first two columns of cluster k's coefficient matrix
346 |     ## Append segment origins (x0, y0): (0, 0) for the first row, the first row's own (X1, X2) for the next three rows
347 |     plt_df <- plt_df %>% 
348 |         cbind(
349 |             data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 
350 |                 dplyr::rename(x0 = X1, y0 = X2) 
351 |         ) %>%
352 |         cbind(type = c('intercept', unique(meta_data$dataset)))  ## first row is labelled 'intercept', the rest by dataset
353 |     plt <- plt_df %>% 
354 |         ggplot() + 
355 |             geom_point(aes(X1, X2),
356 |                        data = t(harmonyObj$Z_orig) %>% data.frame(),
357 |                        size = 0.5,
358 |                        color = 'grey'
359 |             ) + 
360 |             geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 
361 |             scale_color_manual(values = c('intercept' = 'black', colors_use)) + 
362 |             theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
363 |             labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
364 |     plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))    
365 |     # if (k == harmonyObj$K) {
366 |     # } else {
367 |     #     plt <- plt + guides(color = FALSE)
368 |     # }
369 |     plt
370 | })
371 | 
372 | 
373 | 
374 | ## ---- fig.height=6, fig.width=6-----------------------------------------------
375 | Reduce(`+`, plt_list) + 
376 |   patchwork::plot_annotation(title = 'Mixture of experts beta terms before correction (Z_orig)') + 
377 |   plot_layout(ncol = 2)
378 | 
379 | ## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
380 | 
381 | plt_list <- lapply(1:harmonyObj$K, function(k) {
382 |     plt_df <- W[[k]] %>% data.frame() %>% 
383 |         dplyr::select(X1, X2)
384 | 
385 |     plt_df <- plt_df %>% 
386 |         cbind(
387 |             data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 
388 |                 dplyr::rename(x0 = X1, y0 = X2) 
389 |         ) %>%
390 |         cbind(type = c('intercept', unique(meta_data$dataset))) 
391 | 
392 |     plt <- plt_df %>% 
393 |         ggplot() + 
394 |             geom_point(aes(X1, X2),
395 |                 data = t(harmonyObj$Z_corr) %>% data.frame(),
396 |                 shape = '.', 
397 |                 color = 'grey'
398 |             ) + 
399 |             geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 
400 |             scale_color_manual(values = c('intercept' = 'black', colors_use)) + 
401 |             theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
402 |             labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
403 |     plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))
404 |     plt
405 | })
406 | 
407 | 
408 | 
409 | ## ---- fig.height=6, fig.width=6-----------------------------------------------
410 | Reduce(`+`, plt_list) + 
411 |   patchwork::plot_annotation(title = 'Mixture of experts beta terms after correction (Z_corr)') + 
412 |   plot_layout(ncol = 2)
413 | 
414 | ## ---- echo=TRUE---------------------------------------------------------------
415 | 
416 | Z_i <- harmonyObj$Z_orig[, 5]
417 | Z_i_pred <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
418 |     W[[k]] * harmonyObj$Phi_moe[, 5] * harmonyObj$R[k, 5]
419 | })) %>% colSums
420 | 
421 | 
422 | 
423 | ## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
424 | data.table(obs = Z_i, pred = Z_i_pred) %>% 
425 |     tibble::rowid_to_column('PC') %>%  ## row index doubles as the PC number used for the labels
426 |     ggplot(aes(obs, pred)) + 
427 |         geom_point(shape = 21) + 
428 |         geom_label_repel(aes(label = PC)) + 
429 |         geom_abline(slope = 1, intercept = 0) +  ## identity line: predicted equals observed
430 |         theme_tufte() + geom_rangeframe() + 
431 |         labs(x = 'Observed PC score', y = 'Predicted PC score', title = 'Observed and predicted values of PC scores\nfor cell 5') + 
432 |         NULL        
433 | 
434 | ## -----------------------------------------------------------------------------
435 | delta <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
436 |     W[[k]][2:4, ] * harmonyObj$Phi[, 5] * harmonyObj$R[k, 5]  ## rows 2:4 of W[[k]] only, weighted by column 5 of Phi and by R[k, 5]
437 | })) %>% colSums
438 | 
439 | Z_corrected <- harmonyObj$Z_orig[, 5] - delta  ## subtract the summed terms from column 5 of Z_orig
440 | 
441 | 
442 | ## ---- fig.width=3, fig.height=3, fig.align="center"---------------------------
443 | ## Plot all cells on the first two rows of Z_orig (X1, X2), mark cell 5 in red, and draw
444 | ## a red arrow from its original position to the first two coordinates of Z_corrected.
445 | harmonyObj$Z_orig %>% t %>% data.frame() %>% 
446 |     ggplot(aes(X1, X2)) + 
447 |         geom_point(shape = '.') + 
448 |         geom_point(
449 |             data = data.frame(t(harmonyObj$Z_orig[, 5, drop = FALSE])), 
450 |             color = 'red'
451 |         ) + 
452 |         geom_segment(
453 |             data = data.table(x0 = harmonyObj$Z_orig[1, 5], 
454 |                               y0 = harmonyObj$Z_orig[2, 5], 
455 |                               x1 = Z_corrected[1],
456 |                               y1 = Z_corrected[2]), 
457 |             aes(x = x0, y = y0, xend = x1, yend = y1),
458 |             linewidth = 1,
459 |             color = 'red', 
460 |             arrow = arrow(length = unit(0.05, "npc"), type = 'closed')            
461 |         ) + 
462 |         theme_tufte(base_size = 10) + geom_rangeframe() + 
463 |         labs(x = 'PC1', y = 'PC2', title = 'Correction of cell #5')
464 | 
465 | 
466 | ## -----------------------------------------------------------------------------
467 | ## Build a new model object on the same V and meta_data, now with nclust = 50; this overwrites harmonyObj.
468 | harmonyObj <- RunHarmony(
469 |     data_mat = V, ## PCA embedding matrix of cells
470 |     meta_data = meta_data, ## dataframe with cell labels
471 |     theta = 1, ## cluster diversity enforcement
472 |     vars_use = 'dataset', ## (list of) variable(s) we'd like to Harmonize out
473 |     nclust = 50, ## number of clusters in Harmony model
474 |     max_iter = 0, ## don't actually run Harmony, stop after initialization
475 |     return_object = TRUE ## return the full Harmony model object, not just the corrected PCA matrix
476 | )
477 | 
478 | 
479 | ## ---- message=FALSE, fig.width=5, fig.height=3, fig.align="center"------------
480 | ## Round 0: plot Z_cos of the freshly initialized model twice, colored by dataset and by cell type.
481 | i <- 0
482 | 
483 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
484 |     labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
485 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
486 |     labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
487 | NULL
488 | 
489 | ## ---- fig.width=5, fig.height=3, fig.align="center", message=FALSE------------
490 | ## Two rounds: call harmony:::harmonize(harmonyObj, 1) (presumably one iteration, updating harmonyObj in place - confirm), then re-plot Z_cos.
491 | for (i in 1:2) {
492 |     harmony:::harmonize(harmonyObj, 1)
493 |     plt <- do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
494 |         labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
495 |     do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
496 |         labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
497 |     NULL
498 |     plot(plt)
499 | }
500 |     
501 | 
502 | ## -----------------------------------------------------------------------------
503 | sessionInfo()
504 | 
505 | 


--------------------------------------------------------------------------------
/doc/detailedWalkthrough.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Detailed Walkthrough of Harmony Algorithm"
  3 | author: "Korsunsky et al.: Fast, sensitive, and accurate integration of single cell data with Harmony"
  4 | vignette: >
  5 |   %\VignetteIndexEntry{Detailed Walkthrough of Harmony Algorithm}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | output: rmarkdown::html_vignette
  9 | ---
 10 | 
 11 | # Motivation
 12 | 
 13 | This notebook breaks down the Harmony algorithm and model in the context of a simple real-world dataset.
 14 | 
 15 | After reading this, the user should have a better understanding of how 
 16 | 
 17 | 1. the equations connect to the algorithm
 18 | 2. the algorithm works on real data 
 19 | 3. to access the different parts of the Harmony model from R
 20 | 
 21 | 
 22 | # Prerequisites
 23 | 
 24 | For this vignette we are going to use harmony among other tools that will help with the visualization and inspection of the algorithm intermediate states. Also, we provide a few helper functions that can be found in the source block below.
 25 | 
 26 | ```{r, message=FALSE, warning=FALSE, class.source = 'fold-hide'}
 27 | 
 28 | ## Source required libraries
 29 | library(data.table)
 30 | library(tidyverse)
 31 | library(ggthemes)
 32 | library(ggrepel)
 33 | library(harmony)
 34 | library(patchwork)
 35 | library(tidyr)
 36 | 
 37 | ## Useful util functions
 38 | ## Scale X to unit L2 norm along rows (margin = 1) or, for any other margin, along columns.
 39 | cosine_normalize <- function(X, margin) {
 40 |     by_row <- margin == 1
 41 |     sq <- X ^ 2
 42 |     ## L2 norm of every row or of every column
 43 |     l2 <- sqrt(if (by_row) rowSums(sq) else colSums(sq))
 44 |     res <- sweep(as.matrix(X), if (by_row) 1 else 2, l2, '/')
 45 |     ## carry the dimnames of the input over to the result
 46 |     row.names(res) <- row.names(X)
 47 |     colnames(res) <- colnames(X)
 48 |     ## the result is a matrix with the same shape as X
 49 |     return(res)
 50 | }
 51 | ## One-hot encode a label vector: one row per level of vals, one column per element of vals.
 52 | onehot <- function(vals) {
 53 |     indicator <- model.matrix(~0 + as.factor(vals))
 54 |     t(indicator)
 55 | }
 56 | ## Named colors for jurkat, t293 and half; this is the default palette_use of do_scatter.
 57 | colors_use <- c(`jurkat` = rgb(129, 15, 124, maxColorValue=255),
 58 |                 `t293` = rgb(208, 158, 45, maxColorValue=255),
 59 |                 `half` = rgb(0, 109, 44, maxColorValue=255))
 60 | 
 61 | ## Scatter plot of the first two columns of `umap_use`, colored and filled by `meta_data[[label_name]]`; returns the plot.
 62 | do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names, 
 63 |                        palette_use = colors_use,
 64 |                        pt_size = 4, point_size = .5, base_size = 10, do_points = TRUE, do_density = FALSE, h = 4, w = 8) {
 65 |     umap_use <- umap_use[, 1:2]
 66 |     colnames(umap_use) <- c('X1', 'X2')
 67 |     plt_df <- umap_use %>% data.frame() %>% 
 68 |         cbind(meta_data) %>% 
 69 |         dplyr::sample_frac(1L) ## keeps every row, in random order
 70 |     plt_df$given_name <- plt_df[[label_name]]
 71 |     ## optional relabeling: nice_names is joined on given_name and must provide a nice_name column
 72 |     if (!missing(nice_names)) {
 73 |         plt_df <- plt_df %>%
 74 |             dplyr::inner_join(nice_names, by = "given_name") %>% 
 75 |             subset(nice_name != "" & !is.na(nice_name))
 76 |         ## plain assignment is used above because `%<>%` lives in magrittr, which this vignette does not attach
 77 |         plt_df[[label_name]] <- plt_df$nice_name        
 78 |     }
 79 |     ## base plot; point_size, h and w are accepted but not used inside this function
 80 |     plt <- plt_df %>% 
 81 |         ggplot(aes(X1, X2, colour = .data[[label_name]], fill = .data[[label_name]])) + 
 82 |         theme_tufte(base_size = base_size) + 
 83 |         theme(panel.background = element_rect(fill = NA, color = "black")) + 
 84 |         guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = "none") +
 85 |         scale_color_manual(values = palette_use) + 
 86 |         scale_fill_manual(values = palette_use) +    
 87 |         theme(plot.title = element_text(hjust = .5)) + 
 88 |         labs(x = "UMAP 1", y = "UMAP 2") 
 89 |     
 90 |     if (do_points) 
 91 |         plt <- plt + geom_point(size = 0.2)
 92 |     if (do_density) 
 93 |         plt <- plt + geom_density_2d()    
 94 |     ## guides() needs named aesthetics and "none": an unnamed guides("none") is not applied,
 95 |     ## and FALSE is the deprecated spelling, so both legends are dropped by name here
 96 |     if (no_guides)
 97 |         plt <- plt + guides(color = "none", fill = "none")
 98 |     ## one label per group, placed at the mean X1/X2 of that group
 99 |     if (do_labels) 
100 |         plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA,
101 |                                       aes(label = .data[[label_name]]), color = "white", size = pt_size, alpha = 1, segment.size = 0) + 
102 |         guides(color = "none", fill = "none")
103 |     return(plt)
104 | }
105 | 
106 | ```
107 | 
108 | 
109 | 
110 | # Cell line data 
111 | 
112 | This dataset is described in figure 2 of the Harmony manuscript. We downloaded 3 cell line datasets from the 10X website. The first two (jurkat and 293t) come from pure cell lines while the *half* dataset is a 50:50 mixture of Jurkat and HEK293T cells. We inferred cell type with the canonical marker XIST, since the two cell lines come from 1 male and 1 female donor. 
113 | 
114 | * https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat
115 | * https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t
116 | * https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat:293t_50:50
117 | 
118 | We library normalized the cells, log transformed the counts, and scaled the genes. Then we performed PCA and kept the top 20 PCs. We begin the analysis in this notebook from here. 
119 | 
120 | 
121 | ```{r}
122 | data(cell_lines)
123 | V <- cell_lines$scaled_pcs ## PC embeddings; used below as the data_mat input and for the first scatter plots
124 | V_cos <- cosine_normalize(V, 1) ## margin = 1: every row of V scaled to unit L2 norm
125 | meta_data <- cell_lines$meta_data ## cell labels; the dataset and cell_type columns are used below
126 | ```
127 | 
128 | To get a feel for the data, let's visualize the cells in PCA space. The plots below show the cells' PC1 and PC2 embeddings. We color the cells by dataset of origin (left) and cell type (right). 
129 | 
130 | 
131 | ```{r, warning=FALSE, fig.width=5, fig.height=3, fig.align="center"}
132 | do_scatter(V, meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + ## first plot: cells colored by dataset
133 |     labs(title = 'Colored by dataset', x = 'PC1', y = 'PC2') +
134 | do_scatter(V, meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + ## second plot: same cells colored by cell type
135 |     labs(title = 'Colored by cell type', x = 'PC1', y = 'PC2') +
136 | NULL
137 | ```
138 | 
139 | 
140 | 
141 | # Initialize a Harmony object
142 | 
143 | The first thing we do is initialize a Harmony object. We pass 2 data structures: 
144 | 
145 | 1. V: the PCA embedding matrix of cells. 
146 | 2. meta_data: a dataframe object containing the variables we'd like to Harmonize over. 
147 | 
148 | The rest of the parameters are described below. A few notes: 
149 | 
150 | * *nclust* in the R code below corresponds to the parameter *K* in the manuscript. 
151 | * we set *max_iter* to 0 because in this tutorial, we don't want to actually run Harmony just yet. 
152 | * setting *return_object* to *TRUE* means that *harmonyObj* below is not a corrected PCA embeddings matrix. Instead, it is the full Harmony model object. We'll have a closer look into the different pieces of this object as we go!
153 | 
154 | 
155 | ```{r}
156 | ## Fix the RNG seed first (presumably so the random initialization is reproducible), then build the model object.
157 | set.seed(1)
158 | harmonyObj <- harmony::RunHarmony(
159 |     data_mat = V, ## PCA embedding matrix of cells
160 |     meta_data = meta_data, ## dataframe with cell labels
161 |     theta = 1, ## cluster diversity enforcement
162 |     vars_use = 'dataset', ## variable to integrate out
163 |     nclust = 5, ## number of clusters in Harmony model
164 |     max_iter = 0, ## stop after initialization
165 |     return_object = TRUE ## return the full Harmony model object
166 | )
167 | 
168 | 
169 | ```
170 | 
171 | By initializing the object, we have prepared the data in 2 ways. First, we've scaled the PCA matrix to give each cell unit length. Second, we've initialized cluster centroids with regular kmeans clustering on these scaled data. We'll dig into these two steps below. 
172 | 
173 | ## L_2 scaling to induce cosine distance
174 | 
175 | A key preprocessing step of Harmony clustering is L2 normalization. As shown in Haghverdi et al 2018, scaling each cell to have L2 norm equal to 1 induces a special property: Euclidean distance of the scaled cells is equivalent to cosine distance of the unscaled cells. Cosine distance is a considerably more robust measure of cell-to-cell similarity (CITE Martin and Vlad). Moreover, it has been used in clustering analysis of high dimensional text datasets (CITE NLP spherical kmeans). 
176 | 
177 | $L_2$ Normalization of cell $i$: 
178 | 
179 | <center>
180 | $\hat{Z}_{\cdot, i} \leftarrow  \frac{\hat{Z}_{\cdot, i}}{||{\hat{Z}_{\cdot, i}}||_{2}}$ 
181 | </center>
182 | 
183 | 
184 | TL;DR Harmony clustering uses cosine distance. By normalizing each cell to have unit length, we can directly visualize the cosine distances between cells (right). These relationships are not obvious in Euclidean space (left). 
185 | 
186 | 
187 | In the Harmony object, we now have 2 copies of the cell embeddings. The first, $Z_{orig}$ is the original PCA matrix (PCs by cells). The second, $Z_{cos}$ is the new $L_2$ scaled matrix. Since this scaling projects cells into a unit hypersphere, cells appear pushed away from the origin (0,0). 
188 | 
189 | 
190 | ```{r, fig.width=5, fig.height=3, fig.align="center"}
191 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + ## first plot: original embeddings
192 |     labs(title = 'Z_orig', subtitle = 'Euclidean distance', x = 'PC1', y = 'PC2') +
193 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + ## second plot: unit-length embeddings
194 |     labs(title = 'Z_cos', subtitle = 'Induced Cosine distance', x = 'PC1', y = 'PC2')
195 | 
196 | ```
197 | 
198 | 
199 | In the $Z_{cos}$ scatterplot (right), cells that are nearby have a high cosine similarity. Although it is not obvious in this example, cells close by in Euclidean space do not always have a high cosine similarity! 
200 | 
201 | Above, we only visualize the first two PCs. In this simple example with cell lines, this is sufficient to visualize most of the variation. Note, however, that all clustering and correction in Harmony uses all the PCs. For completeness, we can visualize the quantiles of PCA embeddings for all 20 PCs, colored by original dataset. 
202 | 
203 | 
204 | ```{r, fig.width=8, fig.height=3, out.width="100%"}
205 | ## Box plots of Z_cos for columns X1..X20 (relabeled PC1..PC20 and ordered by number), colored by dataset.
206 | harmonyObj$Z_cos %>% t %>% data.frame() %>% 
207 |     cbind(meta_data) %>% 
208 |     tidyr::gather(key, val, X1:X20) %>% 
209 |     ggplot(aes(reorder(gsub('X', 'PC', key), as.integer(gsub('X', '', key))), val)) + 
210 |         geom_boxplot(aes(color = dataset)) + 
211 |         scale_color_manual(values = colors_use) + 
212 |         labs(x = 'PC number', y = 'PC embedding value', title = 'Z_cos (unit scaled PCA embeddings) for all 20 PCs') + 
213 |         theme_tufte(base_size = 10) + geom_rangeframe() + 
214 |         theme(axis.text.x = element_text(angle = 45, hjust = 1))
215 | ```
216 | 
217 | 
218 | 
219 | ## Initial clustering 
220 | 
221 | Initializing the Harmony object also triggered initialization of all the clustering data structures. Harmony currently uses regular kmeans, with 10 random restarts, to find initial locations for the cluster centroids. Let's visualize these centroids directly! We can do this by accessing the *Y* matrix in the Harmony object. This is a matrix with $d=20$ rows and $K=5$ columns, so each column represents one 20-dimensional centroid. 
222 | 
223 | Remember that we set the number of clusters to 5 above, so there are now 5 clusters below. 
224 | 
225 | ```{r, fig.width=4, fig.height=3, fig.align="center"}
226 | ## Overlay the columns of Y (one centroid per column) as large black points on the Z_cos scatter plot.
227 | cluster_centroids <- harmonyObj$Y
228 | 
229 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = FALSE, do_labels = FALSE) + 
230 |     labs(title = 'Initial kmeans cluster centroids', subtitle = '', x = 'PC1', y = 'PC2') +
231 |     geom_point(
232 |         data = data.frame(t(cluster_centroids)), 
233 |         color = 'black', fill = 'black', alpha = .8,
234 |         shape = 21, size = 6
235 |     ) +
236 | NULL
237 | 
238 | ```
239 | 
240 | 
241 | 
242 | Based on these cluster centroids, we also assigned probabilistic cluster memberships to each cell. In the algorithm, this is done using the formula below. 
243 | 
244 | <center>
245 | $R_{ki} \propto \exp(\frac{-||Z_i - Y_k||^2_2}{\sigma})$
246 | </center>
247 | 
248 | Above, $R_{ki}$ is a value from $0$ to $1$ and denotes the probability that cell $i$ is assigned to cluster $k$. Accordingly, the squared distance $||Z_i - Y_k||^2_2$ is the distance between cell $i$ and the centroid of cluster $k$. Because we're using cosine distance (i.e. cells and centroids have unit length), we can simplify the distance computation: 
249 | 
250 | <center>
251 | $R_{ki} \propto \exp(\frac{-(2(1 - Y^TZ))}{\sigma})$
252 | </center>
253 | 
254 | Finally, the $\propto$ symbol means that we will normalize R to form a proper probability distribution for each cell: 
255 | 
256 | <center>
257 | $\sum_k R_{ki} = 1$
258 | </center>
259 | 
260 | Let's take a look at these initial cluster assignments. We can find these assignments in the $K$ row by $N$ column matrix $R$. 
261 | 
262 | 
263 | ```{r}
264 | cluster_assignment_matrix <- harmonyObj$R
265 | 
266 | ```
267 | 
268 | The plots below color each cell by cluster membership, from 0 (grey) to 1 (blue). For clarity, each column is a different dataset. Each row is one of the 5 clusters. 
269 | 
270 | 
271 | ```{r, fig.height=5, fig.width=5}
272 | t(harmonyObj$Z_cos) %>% data.frame() %>%
273 |     cbind(meta_data) %>% 
274 |     tibble::rowid_to_column('id') %>% ## row index, used as the join key below
275 |     dplyr::inner_join(
276 |         cluster_assignment_matrix %>% t() %>% data.table() %>% ## transposed R: one row per cell, columns V1, V2, ...
277 |             tibble::rowid_to_column('id') %>%
278 |             tidyr::gather(cluster, r, -id) %>% ## long format: one row per (cell, cluster) pair
279 |             dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
280 |         by = 'id'
281 |     ) %>% 
282 |     dplyr::sample_frac(1L) %>% 
283 |     ggplot(aes(X1, X2, color = r)) + 
284 |         geom_point(size=0.2) + 
285 |         theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
286 |         facet_grid(cluster ~ dataset) + ## one panel per cluster (rows) and dataset (columns)
287 |         scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
288 |         labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'Initial probabilistic cluster assignments')
289 | ```
290 | 
291 | 
292 | 
293 | 
294 | 
295 | ## Evaluating initial cluster diversity 
296 | 
297 | A key part of clustering in Harmony is diversity. We can evaluate the initial diversity of clustering by aggregating the number of cells from each batch assigned to each cluster. For this, we need two data structures: 
298 | 
299 | 1) $\phi$ (B rows, N columns): the one-hot encoded design matrix. 
300 | 
301 | 2) $R$ (K rows, N columns): the cluster assignment matrix. 
302 | 
303 | The cross product $R\phi^T$ gives us a matrix of the number of cells from batch b (columns) that are in cluster k (rows). Note that since cluster assignment is probabilistic, the observed counts don't have to be integer valued. For simplicity, we round the values to their closest integers. 
304 | 
305 | 
306 | ```{r}
307 | observed_counts <- with(harmonyObj, R %*% t(as.matrix(Phi))) ## soft cell counts: clusters in rows, batches in columns
308 | round(observed_counts)
309 | 
310 | 
311 | ```
312 | 
313 | 
314 | In fact, this information is already stored in the Harmony model object! The observed cluster by batch counts are stored in the $O$ matrix. The expected counts are in the $E$ matrix. We can check that the observed counts matrix has exactly the same values we computed above.
315 | 
316 | 
317 | ```{r}
318 | ## observed counts
319 | round(harmonyObj$O)
320 | 
321 | ## expected counts
322 | round(harmonyObj$E)
323 | 
324 | ```
325 | 
326 | 
327 | It looks like clusters 2, 4, and 5 are not very diverse, with most cells coming from a single dataset. However, clusters 1 and 3 look pretty well mixed already! Cluster 1 has 900 cells from batch $b=1$ (*half* dataset) and 1574 cells from batch $b=3$ (*t293* dataset). As we move into the maximum diversity clustering, we should see the clusters getting more and more mixed! 
328 | 
329 | In this benchmark, we also have some ground truth cell types. In the same way that we evaluated the cluster diversity, we can evaluate the cluster accuracy. Since we didn't tell Harmony what the ground truth cell types are, we need to first construct a cell-type design matrix (shown below). We want these columns to be as mutually exclusive as possible. It looks like the initial clustering is fairly accurate. The only mistakes are the $n=2$ *jurkat* cells clustered with the $n=2472$ *293t* cells in cluster $k=1$ and $n=12$ *jurkat* cells clustered with $n=1952$ *t293* cells in cluster $k=3$. 
330 | 
331 | 
332 | ```{r}
333 | phi_celltype <- onehot(meta_data$cell_type) ## cell-type indicator matrix: one row per cell type, one column per cell
334 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype) ## soft counts: clusters in rows, cell types in columns
335 | round(observed_cell_counts)
336 | 
337 | ```
338 | 
339 | # Maximum-diversity soft-clustering
340 | 
341 | In the previous section, we initialized the Harmony object. At this point, we have some initial cluster assignments ($R$, $Y$), scaled PC embeddings ($Z_{cos}$), and statistics about cluster diversity ($O$, $E$). Now we're going to do some Harmony clustering to find more diverse clusters! 
342 | 
343 | We do this by calling the *cluster_cpp()* method of the Harmony model object. This will perform a few rounds of clustering, defined by the parameter *max_iter_kmeans*. In each round, we iterate between two steps: centroid estimation and cluster assignment. We dig into both of these in more detail in the subsections below. 
344 | 
345 | ```{r}
346 | harmonyObj$max_iter_kmeans
347 | ```
348 | 
349 | ```{r}
350 | ## we can specify how many rounds of clustering to do
351 | harmonyObj$max_iter_kmeans <- 10
352 | harmonyObj$cluster_cpp()
353 | ```
354 | 
355 | 
356 | Now that we've done some maximum diversity clustering, how have the clusters changed? Let's first look at the observed counts matrix $O$. 
357 | 
358 | In contrast to the $O$ matrix we started with above, this one looks much more diverse!
359 | 
360 | ```{r}
361 | round(harmonyObj$O)
362 | ```
363 | 
364 | While clusters 1 and 3 were already diverse in the initial clustering, it seems that clusters 2, 4, and 5 are now considerably more mixed as well. Let's see how these assignments have changed in space. 
365 | 
366 | 
367 | ```{r, fig.height=5, fig.width=5}
368 | new_cluster_assignment_matrix <- harmonyObj$R
369 | ## Join every cell's Z_cos coordinates with its membership r in each cluster, then facet by cluster and dataset.
370 | t(harmonyObj$Z_cos) %>% data.frame() %>%
371 |     cbind(meta_data) %>% 
372 |     tibble::rowid_to_column('id') %>% 
373 |     dplyr::inner_join(
374 |         new_cluster_assignment_matrix %>% t() %>% data.table() %>% 
375 |             tibble::rowid_to_column('id') %>%
376 |             tidyr::gather(cluster, r, -id) %>% 
377 |             dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
378 |         by = 'id'
379 |     ) %>% 
380 |     dplyr::sample_frac(1L) %>% 
381 |     ggplot(aes(X1, X2, color = r)) + 
382 |         geom_point(shape = '.') + 
383 |         theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
384 |         facet_grid(cluster ~ dataset) + 
385 |         scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
386 |         labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'New probabilistic cluster assignments')
387 | ```
388 | 
389 | 
390 | Of course, it is equally important to make sure that our clusters do not mix up different cell types. Recall that in this benchmark, we have access to these ground truth labels. 
391 | 
392 | 
393 | ```{r}
394 | phi_celltype <- onehot(meta_data$cell_type) ## cell-type indicator matrix: one row per cell type, one column per cell
395 | observed_cell_counts <- harmonyObj$R %*% t(phi_celltype) ## recomputed from the current R: clusters in rows, cell types in columns
396 | round(observed_cell_counts)
397 | ```
398 | 
399 | 
400 | 
401 | 
402 | 
403 | Initially, the largest error we had was in cluster 3 with 12 out of 1952 cells misclustered. So our initial error rate was at most 0.6%. Let's take a look at the error rates in our maximum diversity clustering (shown below). Applying the same kind of error analysis, we see that we have <0.6% error across all the clusters. 
404 | 
405 | It is worth noting that in the original clustering, clusters 2, 4, and 5 had 0% error. But they also had almost no diversity. These clusters have incurred a non-zero error but gained substantial diversity. This trade-off between accuracy and diversity is present in all integration settings. 
406 | 
407 | 
408 | ```{r}
409 | round(apply(prop.table(observed_cell_counts, 1), 1, min) * 100, 3) ## per cluster: share (in %) of its least represented cell type
410 | ```
411 | 
412 | 
413 | ## Diverse cluster assignment
414 | 
415 | Now let's re-assign cells to cluster centroids. We did this above, when we assigned cells during the Harmony initialization step. The difference is that we want to assign cells to clusters that are both nearby and will increase diversity. 
416 | 
417 | In the algorithm, this assignment is defined by 
418 | 
419 | <center>
420 | $R_{ki} \propto \exp(\frac{-(2(1 - Y^TZ))}{\sigma}) (\frac{E}{O})^\theta \phi$
421 | </center>
422 | 
423 | Let's see what this looks like in code. Then we'll break down the formula to see what it does. 
424 | 
425 | 
426 | ```{r}
427 | ## One joint pass of the assignment rule: R proportional to exp(-distance / sigma) * ((E / O)^theta %*% Phi).
428 | with(harmonyObj, {
429 |     distance_matrix <- 2 * (1 - t(Y) %*% Z_cos)
430 |     distance_score <- exp(-distance_matrix / as.numeric(sigma))
431 |     diversity_score <- sweep(E / O, 2, theta, '^') %*% as.matrix(Phi)
432 |     ## new assignments are based on distance and diversity
433 |     R_new <- distance_score * diversity_score  
434 |     ## normalize R so each cell sums to 1
435 |     R_new <- prop.table(R_new, 2)    
436 | })
437 | 
438 | ```
439 | 
440 | So how does the formula we used above help to create more diverse cluster assignment? 
441 | 
442 | The diversity penalty is encoded in the new term: $(\frac{E}{O})^\theta \phi$. This has some familiar data structures: $O$ for observed counts, $E$ for expected counts, and $\phi$ for the design matrix. $\theta$ is a new term. $\theta$ decides how much weight to give diversity versus distance to cluster centroids. 
443 | 
444 | With $\theta=0$, there is no penalty and each cluster gets a score of 1. 
445 | 
446 | 
447 | ```{r}
448 | ## with theta = 0
449 | with(harmonyObj, {
450 |     (E / O) ^ 0
451 | })
452 | ```
453 | 
454 | 
455 | 
456 | As we increase $\theta$, let's see what happens (shown below). Recall that in cluster $k=1$, batches 1 and 3 were well represented. Below, note that in that cluster ($k=1$), the penalties for batches 1 and 3 are relatively low (0.98 and 0.47). On the other hand, batch 2 gets a penalty score of 30914. This means that cells from batches 1 and 3 will be encouraged to move into cluster $k=1$. On the other hand, cluster $k=2$ is replete with batch 2. The penalty for batch 2 in cluster $k=2$ is relatively low, and noticeably smaller than the penalty score for batch 2 in cluster $k=1$. Thus, cells from batch 2 will be discouraged from moving into cluster $k=1$, as this cluster has a higher penalty score for cells from batch 2 compared to other clusters (such as $k=2$). 
457 | 
458 | 
459 | ```{r}
460 | ## with theta = 1
461 | with(harmonyObj, {
462 |     round((E / O) ^ 1, 2)
463 | })
464 | 
465 | ```
466 | 
467 | 
468 | 
469 | We should always be wary of setting $\theta$ too high, since the diversity scores can go to $\infty$. Below, we set $\theta$ to 1 million. We do not recommend setting $\theta$ to 1 million! 
470 | 
471 | 
472 | ```{r}
473 | ## as theta approaches infinity
474 | with(harmonyObj, {
475 |     round((E / O) ^ 1e6, 2)
476 | })
477 | 
478 | ```
479 | 
480 | 
481 | Finally, it is important to note that we cannot re-assign cells independently as we did above. Why not? As soon as we re-assign one cell, the diversity counts in the $O$ and $E$ matrices change. Thus, the assignment formula for all other cells is different! For this reason, we need to assign one cell at a time and update the $O$ and $E$ as we go. In practice, we can update some chunk of cells (e.g. 5%), update the $O$ and $E$ matrices, and update another chunk of cells. 
482 | 
483 | ## Cluster centroid estimation
484 | 
485 | In the previous step, we re-assigned cells to maximize diversity within the clusters. With this new assignment, we need to update the cluster centroids. In this step, we'll use the cell positions $Z_{cos}$ and the cluster assignments ($R$) to re-position cluster centroids to be close to their assigned cells. 
486 | 
487 | <center>
488 | $Y \leftarrow Z_{cos}R^T$
489 | </center>
490 | 
491 | ```{r}
492 | Y_unscaled <- with(harmonyObj, Z_cos %*% t(R))
493 | ```
494 | 
495 | 
496 | We then scale Y to make each centroid unit length: 
497 | 
498 | <center>
499 | $Y_{\cdot, k} \leftarrow \frac{Y_{\cdot, k}}{||Y_{\cdot, k}||_{2}}$
500 | </center>
501 | 
502 | ```{r}
503 | Y_new <- cosine_normalize(Y_unscaled, 2)
504 | ```
505 | 
506 | # Correction
507 | 
508 | In the previous section, we performed clustering in order to identify shared groups of cells between batches. Now we make use of these groups in order to correct the data in a sensitive way. To run the correction step, we call the *moe_correct_ridge_cpp()* method of the Harmony model object. First, let's see what happens to the cells. In the subsections that follow, we'll look deeper into how we got there. 
509 | 
510 | 
511 | ```{r}
512 | harmonyObj$moe_correct_ridge_cpp()
513 | ```
514 | 
515 | 
516 | ```{r, fig.width=5, fig.height=3, fig.align="center"}
517 | ## First plot: Z_orig rescaled to unit length per cell (Z_cos before correction); second plot: the current Z_cos.
518 | do_scatter(cosine_normalize(t(harmonyObj$Z_orig), 1), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
519 |     labs(title = 'Z_cos before MoE', x = 'PC1', y = 'PC2') +
520 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
521 |     labs(title = 'Z_cos after MoE', x = 'PC1', y = 'PC2')
522 | ```
523 | 
524 | 
525 | 
526 | 
527 | We can see that the jurkat cells are starting to come together on the right (purple and green). There is also more local mixing of the 293T cells on the left (yellow and green). What happened to actually get them there? 
528 | 
529 | For each cell, we estimate how much its batch identity contributes to its PCA scores. We then subtract this contribution from that cell's PCA scores. That's it! 
530 | 
531 | Very importantly, this correction factor is not in the unit scaled space (i.e. $Z_{cos}$)! The data in $Z_{cos}$ have been projected onto a hypersphere. This makes the cells easier to cluster but the space is no longer closed under linear transformations! In other words, if we push a cell over a bit by adding 10 to PC1, that cell is no longer on the hypersphere. 
532 | 
533 | To query the Harmony model object, we need to introduce another variable: $Z_{corr}$. $Z_{corr}$ contains the cells' PCA embeddings post correction. However, we never scale cells in $Z_{corr}$ to have unit length. After we compute $Z_{corr}$, we immediately update $Z_{cos}$ as the unit scaled version of $Z_{corr}$. The plot below shows all three of Harmony's data structures that contain PCA embeddings. To summarize: 
534 | 
535 | - $Z_{orig}$: original PCA embeddings
536 | - $Z_{corr}$: corrected PCA embeddings, not scaled
537 | - $Z_{cos}$: corrected PCA embeddings, scaled to unit length
538 | 
539 | 
540 | ```{r, fig.width=8, fig.height=3, fig.align="center", out.width="100%"}
541 | ## Three plots, all colored by dataset: Z_orig, Z_corr and Z_cos (first two dimensions of each).
542 | do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
543 |     labs(title = 'Z_orig', subtitle = 'Original PCA embeddings', x = 'PC1', y = 'PC2') +
544 | do_scatter(t(harmonyObj$Z_corr), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
545 |     labs(title = 'Z_corr', subtitle = '= Z_orig - correction_factors', x = 'PC1', y = 'PC2') +
546 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
547 |     labs(title = 'Z_cos', subtitle = '= Unit_scaled(Z_corr)', x = 'Scaled PC1', y = 'Scaled PC2') +
548 | NULL
549 | ```
550 | 
551 | 
552 | 
553 | Let's take a closer look at these cell-specific correction factors. For exposition, let's focus on PC1 and compare each cell's position before (from $Z_{orig}$) and after (from $Z_{corr}$) correction. 
554 | 
555 | 
556 | The plots below show the PC1 value before (x-axis) and after (y-axis) correction for each cell. The black line is drawn at $y=x$ to represent the level curve of no change. 
557 | 
558 | 
559 | ```{r, fig.width=5, fig.height=3, fig.align="center"}
560 | ## Shared base plot: first row of Z_orig (x) against first row of Z_corr (y) with a y = x line; points are added below.
561 | plt <- data.table(PC1_After = harmonyObj$Z_corr[1, ], PC1_Before = harmonyObj$Z_orig[1, ]) %>% 
562 |     cbind(meta_data) %>% 
563 |     dplyr::sample_frac(1L) %>% 
564 |     ggplot(aes(PC1_Before, PC1_After)) + 
565 |         geom_abline(slope = 1, intercept = 0) + 
566 |         theme_tufte(base_size = 10) + geom_rangeframe() + 
567 |         scale_color_tableau() + 
568 |         guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) + 
569 |         NULL
570 | ## The base plot is reused twice: once with points colored by dataset, once colored by cell type.
571 | plt + geom_point(shape = '.', aes(color = dataset)) + 
572 |         labs(x = 'PC1 before correction', y = 'PC1 after correction', 
573 |              title = 'PC1 correction for each cell', subtitle = 'Colored by Dataset') + 
574 | plt + geom_point(shape = '.', aes(color = cell_type)) + 
575 |         labs(x = 'PC1 before correction', y = 'PC1 after correction', 
576 |              title = 'PC1 correction for each cell', subtitle = 'Colored by Cell Type') + 
577 | NULL
578 | 
579 | ```
580 | 
581 | 
582 | 
583 | We can see a few interesting things from these plots. 
584 | 
585 | - The 293T cells from the 293T and half datasets have pretty much the same correction factor. Since these cells were already well mixed, this is expected. 
586 | - There is a salient cloud of wandering Jurkat cells from the half dataset. Many of these itinerants find themselves with the same correction factor as 293T cells! What's going on with these erroneously corrected cells? These cells are located in the middle and have a small length (L2 norm). Thus, when these cells are unit length scaled, their location is unstable. These cells should have been filtered out as outliers. 
587 | 
588 | ## Mixture of Experts model
589 | 
590 | The theory behind this algorithm is based on the Mixture of Experts model. This is a natural extension of linear modeling, in which each cluster is deemed an expert and is assigned its own linear model. 
591 | 
592 | We model each PC coordinate with a combination of linear factors. 
593 | 
594 | <center>
595 | $Z_{d} = \sum_k \beta_{0,k} + \beta_{1,k} \mathbb{1}_{(dataset = jurkat)} + \beta_{2,k} \mathbb{1}_{(dataset = half)} + \beta_{3,k} \mathbb{1}_{(dataset = 293T)}$
596 | </center>
597 | 
598 | In the model above, each cluster gets 4 $\beta$ terms: $\beta_{0,k}$ is the intercept term. This term is independent of which dataset a cell comes from. Therefore, it represents the contribution of cell type or cell state to the PC score. The other three $\beta$ terms are accompanied by an indicator variable. This means that a cell from dataset *half* will have $\mathbb{1}_{(dataset = half)}$ equal to 1 and the rest 0. 
599 | 
600 | Following this cell from dataset *half*, we can rewrite the MoE equation above as
601 | 
602 | <center>
603 | $Z_{di} = \sum_k \beta_{0,k}  + \beta_{2,k} \mathbb{1}_{(dataset = half)}$
604 | </center>
605 | 
606 | ## Estimate MoE model parameters
607 | 
608 | We estimate the matrix of linear regression terms using the formula described in the manuscript: 
609 | 
610 | <center>
611 | $W_k \leftarrow (\phi^* diag(R_k) \phi^{*T} + \lambda I)^{-1} \phi^* diag(R_k)Z_{orig}^T$
612 | </center>
613 | 
614 | The matrix above contains linear regression terms for the intercept $W_k[0] = \beta_{0,k}$ and the batch terms: 
615 | 
616 | $W_k[1] = \beta_{1, k} \mbox{  (for dataset half)}$
617 | 
618 | $W_k[2] = \beta_{2, k} \mbox{  (for dataset jurkat)}$
619 | 
620 | $W_k[3] = \beta_{3, k} \mbox{  (for dataset 293T)}$
621 | 
622 | 
623 | ```{r, echo=TRUE}
624 | 
625 | ## Convert sparse data structures to dense matrix
626 | Phi.moe <- as.matrix(harmonyObj$Phi_moe)
627 | lambda <- diag(c(harmonyObj$lambda))
628 | ## Get beta coefficients for all the clusters: W[[k]] is the ridge-regression
629 | ## solution for cluster k (row 1 = intercept, remaining rows = batch terms)
630 | W <- lapply(seq_len(harmonyObj$K), function(k) {
631 |     Phi_Rk <- Phi.moe %*% diag(harmonyObj$R[k, ]) ## design matrix weighted by cluster k's soft assignments
632 |     solve(Phi_Rk %*% t(Phi.moe) + lambda) %*% Phi_Rk %*% t(harmonyObj$Z_orig)
633 | })
634 | 
635 | ```
636 | 
637 | Let's take a look at how these regression terms relate to the data. Recall that the mixture of experts model is trying to estimate the contribution of intercept and batch to the cells' positions in space. So first we'll take a look at the positions of each batch and each cluster in the original PCA embeddings. The color below represents soft cluster membership learned using the maximum diversity clustering above. 
638 | 
639 | 
640 | ```{r, fig.width=5, fig.height=5}
641 | ## Join each cell's PCA coordinates and metadata with its assignment r to every cluster (long format), then plot PC1 vs PC2 faceted by cluster x dataset
642 | cluster_assignment_matrix <- harmonyObj$R
643 | ## NOTE(review): geom_point(shape = 0.2) below is not a standard shape code; size = 0.2 or shape = '.' may have been intended -- confirm
644 | t(harmonyObj$Z_orig) %>% data.frame() %>%
645 |     cbind(meta_data) %>% 
646 |     tibble::rowid_to_column('id') %>% 
647 |     dplyr::inner_join(
648 |         cluster_assignment_matrix %>% t() %>% data.table() %>% 
649 |             tibble::rowid_to_column('id') %>%
650 |             tidyr::gather(cluster, r, -id) %>% 
651 |             dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
652 |         by = 'id'
653 |     ) %>% 
654 |     dplyr::sample_frac(1L) %>% 
655 |     ggplot(aes(X1, X2, color = r)) + 
656 |         geom_point(shape = 0.2) + 
657 |         theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
658 |         facet_grid(cluster ~ dataset) + 
659 |         scale_color_gradient(low = 'grey', breaks = seq(0, 1, .2)) + 
660 |         labs(x = 'PC1', y = 'PC2', title = 'Cluster assigned in original PCA space (Z_orig)')
661 | 
662 | ```
663 | 
664 | 
665 | 
666 | 
667 | 
668 | Now let's draw the $\beta$ terms into this space. For each cluster, we expect the sum of the intercept plus the batch terms to land squarely in the center of each batch:cluster. The arrows below represent the intercept (in black) and batch (colored) offsets. 
669 | 
670 | 
671 | ```{r}
672 | plt_list <- lapply(1:harmonyObj$K, function(k) { ## one plot per cluster: beta terms of W[[k]] drawn as arrows over Z_orig
673 |     plt_df <- W[[k]] %>% data.frame() %>% 
674 |         dplyr::select(X1, X2)
675 |     ## Append arrow start points (x0, y0): the origin for the intercept row, the intercept tip (row 1) for the three batch rows
676 |     plt_df <- plt_df %>% 
677 |         cbind(
678 |             data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 
679 |                 dplyr::rename(x0 = X1, y0 = X2) 
680 |         ) %>%
681 |         cbind(type = c('intercept', unique(meta_data$dataset))) ## NOTE(review): assumes unique(meta_data$dataset) is in the same order as the batch rows of W[[k]] -- confirm
682 |     plt <- plt_df %>% 
683 |         ggplot() + 
684 |             geom_point(aes(X1, X2),
685 |                        data = t(harmonyObj$Z_orig) %>% data.frame(),
686 |                        size = 0.5,
687 |                        color = 'grey'
688 |             ) + 
689 |             geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 
690 |             scale_color_manual(values = c('intercept' = 'black', colors_use)) + 
691 |             theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
692 |             labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
693 |     plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))    
694 |     # if (k == harmonyObj$K) {
695 |     # } else {
696 |     #     plt <- plt + guides(color = FALSE)
697 |     # }
698 |     plt
699 | })
700 | 
701 | 
702 | ```
703 | 
704 | 
705 | ```{r, fig.height=6, fig.width=6}
706 | Reduce(`+`, plt_list) +  ## fold the per-cluster plots in plt_list together with `+`
707 |   patchwork::plot_annotation(title = 'Mixture of experts beta terms before correction (Z_orig)') + 
708 |   plot_layout(ncol = 2)
709 | ```
710 | 
711 | 
712 | 
713 | 
714 | After correction, we remove the batch specific terms (colored arrows above). We can see the result in the corrected linear space ($Z_{corr}$). Notice that now, the cells are centered around the tips of the black arrows, which represent the intercept term. This is because we've removed the effect of the batch terms (colored arrows). 
715 | 
716 | 
717 | ```{r, fig.width=4, fig.height=3, fig.align="center"}
718 | ## One plot per cluster: the same beta-term arrows from W[[k]], drawn over the corrected embeddings (Z_corr)
719 | plt_list <- lapply(1:harmonyObj$K, function(k) {
720 |     plt_df <- W[[k]] %>% data.frame() %>% 
721 |         dplyr::select(X1, X2)
722 |     ## Append arrow start points (x0, y0): the origin for the intercept row, the intercept tip (row 1) for the three batch rows
723 |     plt_df <- plt_df %>% 
724 |         cbind(
725 |             data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>% 
726 |                 dplyr::rename(x0 = X1, y0 = X2) 
727 |         ) %>%
728 |         cbind(type = c('intercept', unique(meta_data$dataset))) 
729 |     ## NOTE(review): the type labels assume unique(meta_data$dataset) matches the order of the batch rows of W[[k]] -- confirm
730 |     plt <- plt_df %>% 
731 |         ggplot() + 
732 |             geom_point(aes(X1, X2),
733 |                 data = t(harmonyObj$Z_corr) %>% data.frame(),
734 |                 shape = '.', 
735 |                 color = 'grey'
736 |             ) + 
737 |             geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) + 
738 |             scale_color_manual(values = c('intercept' = 'black', colors_use)) + 
739 |             theme_tufte(base_size = 10) + theme(panel.background = element_rect()) + 
740 |             labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
741 |     plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))
742 |     plt
743 | })
744 | 
745 | 
746 | ```
747 | 
748 | 
749 | 
750 | ```{r, fig.height=6, fig.width=6}
751 | Reduce(`+`, plt_list) +  ## fold the per-cluster plots in plt_list together with `+`
752 |   patchwork::plot_annotation(title = 'Mixture of experts beta terms after correction (Z_corr)') + 
753 |   plot_layout(ncol = 2)
754 | ```
755 | 
756 | 
757 | 
758 | ## Cell specific corrections
759 | 
760 | How does one cell get its correction factor? 
761 | 
762 | Recall from above that each cell $i$ is now modeled with intercept and batch specific terms:     
763 | 
764 | 
765 | ```{r, echo=TRUE}
766 | ## Predict cell 5 from the MoE model: scale the rows of each W[[k]] by the cell's design column Phi_moe[, 5] and its soft assignment R[k, 5], sum over clusters, then colSums gives one value per PC
767 | Z_i <- harmonyObj$Z_orig[, 5]
768 | Z_i_pred <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
769 |     W[[k]] * harmonyObj$Phi_moe[, 5] * harmonyObj$R[k, 5]
770 | })) %>% colSums
771 | 
772 | 
773 | ```
774 | 
775 | The plot below shows the observed and predicted values of all 20 PCs for cell 5. 
776 | 
777 | 
778 | ```{r, fig.width=4, fig.height=3, fig.align="center"}
779 | ## Observed vs. predicted PC scores for cell 5; the y = x line marks a perfect fit
780 | data.table(obs = Z_i, pred = Z_i_pred) %>% 
781 |     tibble::rowid_to_column('PC') %>% 
782 |     ggplot(aes(obs, pred)) + 
783 |         geom_point(shape = 21) + 
784 |         geom_label_repel(aes(label = PC)) + 
785 |         geom_abline(slope = 1, intercept = 0) + 
786 |         theme_tufte() + geom_rangeframe() + 
787 |         labs(x = 'Observed PC score', y = 'Predicted PC score', title = 'Observed and predicted values of PC scores\nfor cell 5')
788 | ```
789 | 
790 | 
791 | 
792 | 
793 | Now that we've modeled all these contributions to PCs, we can remove the batch specific terms from cell $i$ to get its corrected position ($\hat{Z}_{i}$) in $Z_{corr}$: 
794 | 
795 | <center>
796 | $\hat{Z}_i \leftarrow Z_i - \sum_k R_{ki} <W_k[1:B, ], \phi_i>$
797 | </center>
798 | 
799 | 
800 | ```{r}
801 | delta <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) { ## batch-only part of the model for cell 5, summed over clusters
802 |     W[[k]][2:4, ] * harmonyObj$Phi[, 5] * harmonyObj$R[k, 5] ## NOTE(review): rows 2:4 hard-code three batch terms (row 1 is the intercept)
803 | })) %>% colSums
804 | ## Subtract the batch contribution from the original embedding of cell 5
805 | Z_corrected <- harmonyObj$Z_orig[, 5] - delta
806 | 
807 | ```
808 | 
809 | Let's see where this one cell moves in the original embeddings. Cell 5 is highlighted in red. Its individual correction factor is shown with the red arrow. 
810 | 
811 | 
812 | ```{r, fig.width=3, fig.height=3, fig.align="center"}
813 | ## All cells in the original PC1/PC2 space; cell 5 is drawn in red with an arrow from its original position to Z_corrected
814 | 
815 | harmonyObj$Z_orig %>% t %>% data.frame() %>% 
816 |     ggplot(aes(X1, X2)) + 
817 |         geom_point(shape = '.') + 
818 |         geom_point(
819 |             data = data.frame(t(harmonyObj$Z_orig[, 5, drop = FALSE])), 
820 |             color = 'red'
821 |         ) + 
822 |         geom_segment(
823 |             data = data.table(x0 = harmonyObj$Z_orig[1, 5], 
824 |                               y0 = harmonyObj$Z_orig[2, 5], 
825 |                               x1 = Z_corrected[1],
826 |                               y1 = Z_corrected[2]), 
827 |             aes(x = x0, y = y0, xend = x1, yend = y1),
828 |             linewidth = 1,
829 |             color = 'red', 
830 |             arrow = arrow(length = unit(0.05, "npc"), type = 'closed')            
831 |         ) + 
832 |         theme_tufte(base_size = 10) + geom_rangeframe() + 
833 |         labs(x = 'PC1', y = 'PC2', title = 'Correction of cell #5')
834 | 
835 | ```
836 | 
837 | 
838 | # Multiple iterations of Harmony
839 | 
840 | The sections above broke down the Harmony algorithm. Now let's take a more holistic look. In the code below, let's look at the corrected PC values ($Z_{cos}$) after each round of Harmony (clustering + correction). Since we're not visualizing the clusters in this section, let's increase nclust to 50. After the 1st and 2nd rounds, we can see considerably more mixing. By round 3 though, the cells are pretty well mixed and we stop. 
841 | 
842 | 
843 | 
844 | 
845 | ```{r}
846 | ## Rebuild the Harmony model object with 50 clusters; max_iter = 0, so no correction rounds are run yet
847 | harmonyObj <- RunHarmony(
848 |     data_mat = V, ## PCA embedding matrix of cells
849 |     meta_data = meta_data, ## dataframe with cell labels
850 |     theta = 1, ## cluster diversity enforcement
851 |     vars_use = 'dataset', ## (list of) variable(s) we'd like to Harmonize out
852 |     nclust = 50, ## number of clusters in Harmony model
853 |     max_iter = 0, ## don't actually run Harmony, stop after initialization
854 |     return_object = TRUE ## return the full Harmony model object, not just the corrected PCA matrix
855 | )
856 | 
857 | ```
858 | 
859 | 
860 | ```{r, message=FALSE, fig.width=5, fig.height=3, fig.align="center"}
861 | ## Round 0: Z_cos of the freshly initialized object, before any harmonize() call; i only feeds the 'Round %d' titles
862 | i <- 0
863 | 
864 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
865 |     labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
866 | do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
867 |     labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
868 | NULL
869 | ```
870 | 
871 | ```{r, fig.width=5, fig.height=3, fig.align="center", message=FALSE}
872 | ## Two rounds, one harmonize() call each, replotting Z_cos after every round. NOTE(review): the return value is discarded, so harmonize() presumably updates harmonyObj in place -- confirm
873 | for (i in 1:2) {
874 |     harmony:::harmonize(harmonyObj, 1)
875 |     plt <- do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) + 
876 |         labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
877 |     do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) + 
878 |         labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
879 |     NULL
880 |     plot(plt)
881 | }
882 |     
883 | ```
884 | 
885 | 
886 | 
887 | # Session info
888 | 
889 | 
890 | ```{r}
891 | sessionInfo()
892 | ```
893 | 
894 | 
895 | 
896 | 
897 | 
898 | 
899 | 
900 | 
901 | 
902 | 
903 | 
904 | 
905 | 
906 | 
907 | 
908 | 
909 | 
910 | 
911 | 
912 | 
913 | 
914 | 
915 | 
916 | 


--------------------------------------------------------------------------------
/doc/parameters.R:
--------------------------------------------------------------------------------
 1 | ## ---- include = FALSE---------------------------------------------------------
 2 | knitr::opts_chunk$set(
 3 |   collapse = TRUE,
 4 |   comment = "#>"
 5 | )
 6 | 
 7 | ## ----setup--------------------------------------------------------------------
 8 | library(harmony)
 9 | library(ggplot2)
10 | 
11 | 
12 | 
13 | 
14 | ## -----------------------------------------------------------------------------
15 | 
16 | ## Old
17 | 
18 | ## 
19 | ## HarmonyMatrix(bos, opt.args = list(lambda = c(0,1)))
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | ## -----------------------------------------------------------------------------
27 | ## Source required data
28 | ## data("celllines")
29 | 
30 | ## cell_lines <- zeros()
31 | ## pbmc <- CreateSeuratObject(counts = , project = "jurkat", min.cells = 5)
32 | 
33 | ## Separate conditions
34 | 
35 | ## pbmc@meta.data$stim <- c(rep("STIM", ncol(stim.sparse)), rep("CTRL", ncol(ctrl.sparse)))
36 | 
37 | 


--------------------------------------------------------------------------------
/doc/parameters.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Advanced tutorial"
 3 | output: rmarkdown::html_vignette
 4 | vignette: >
 5 |   %\VignetteIndexEntry{Advanced tutorial}
 6 |   %\VignetteEngine{knitr::rmarkdown}
 7 |   %\VignetteEncoding{UTF-8}
 8 | ---
 9 | 
10 | ```{r, include = FALSE}
11 | knitr::opts_chunk$set(
12 |   collapse = TRUE,
13 |   comment = "#>"
14 | )
15 | ```
16 | 
17 | # Introduction
18 | 
19 | Harmony uses a set of parameters to ensure the different components of the algorithm work in harmony! By default, several of these parameters are set by the algorithm using heuristics or empirical values. Most of the time, the end-user does not need to find optimal values to run Harmony. In this vignette, we will go through some use cases where the user needs to intervene and optimize the input data or the parameters of Harmony.
20 | 
21 | There are two reasons that a user may need to change the parameters. The first one is to increase the quality of the data integration. The second one is to improve the performance of harmony.
22 | 
23 | 
24 | # Harmony algorithm objective diverges after a number of correction steps
25 | 
26 | For some datasets, the objective function may diverge after a few steps. Here we are going to be looking into the Jurkat dataset that is bundled with harmony.
27 | 
28 | 
29 | ```{r setup}
30 | library(harmony)
31 | library(ggplot2)
32 | 
33 | 
34 | 
35 | ```
36 | 
37 | 
38 | ```{r}
39 | 
40 | ## Old
41 | 
42 | ## 
43 | ## HarmonyMatrix(bos, opt.args = list(lambda = c(0,1)))
44 | 
45 | 
46 | 
47 | 
48 | 
49 | ```
50 | 
51 | # An example of a problematic dataset
52 | 
53 | ```{r}
54 | ## Source required data
55 | ## data("celllines")
56 | 
57 | ## cell_lines <- zeros()
58 | ## pbmc <- CreateSeuratObject(counts = , project = "jurkat", min.cells = 5)
59 | 
60 | ## Separate conditions
61 | 
62 | ## pbmc@meta.data$stim <- c(rep("STIM", ncol(stim.sparse)), rep("CTRL", ncol(ctrl.sparse)))
63 | ```
64 | 
65 | 
66 | ## Input data
67 | ## Number of PCs
68 | Using the correct number of components can become very important in certain scenarios.
69 | 
70 | 
71 | 
72 | 
73 | 
74 | ## Nested data
75 | 
76 | # Harmony Algorithm parameters
77 | ## theta
78 | ## lambda
79 | ## sigma
80 | ## nclust
81 | 
82 | # Controlling harmony flow
83 | ## 
84 | ##
85 | 


--------------------------------------------------------------------------------
/doc/parameters.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html>
  4 | 
  5 | <head>
  6 | 
  7 | <meta charset="utf-8" />
  8 | <meta name="generator" content="pandoc" />
  9 | <meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
 10 | 
 11 | <meta name="viewport" content="width=device-width, initial-scale=1" />
 12 | 
 13 | 
 14 | 
 15 | <title>Advanced tutorial</title>
 16 | 
 17 | <script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
 18 | // be compatible with the behavior of Pandoc < 2.8).
 19 | document.addEventListener('DOMContentLoaded', function(e) {
 20 |   var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
 21 |   var i, h, a;
 22 |   for (i = 0; i < hs.length; i++) {
 23 |     h = hs[i];
 24 |     if (!/^h[1-6]$/i.test(h.tagName)) continue;  // it should be a header h1-h6
 25 |     a = h.attributes;
 26 |     while (a.length > 0) h.removeAttribute(a[0].name);
 27 |   }
 28 | });
 29 | </script>
 30 | 
 31 | <style type="text/css">
 32 | code{white-space: pre-wrap;}
 33 | span.smallcaps{font-variant: small-caps;}
 34 | span.underline{text-decoration: underline;}
 35 | div.column{display: inline-block; vertical-align: top; width: 50%;}
 36 | div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
 37 | ul.task-list{list-style: none;}
 38 | </style>
 39 | 
 40 | 
 41 | 
 42 | <style type="text/css">
 43 | code {
 44 | white-space: pre;
 45 | }
 46 | .sourceCode {
 47 | overflow: visible;
 48 | }
 49 | </style>
 50 | <style type="text/css" data-origin="pandoc">
 51 | pre > code.sourceCode { white-space: pre; position: relative; }
 52 | pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
 53 | pre > code.sourceCode > span:empty { height: 1.2em; }
 54 | .sourceCode { overflow: visible; }
 55 | code.sourceCode > span { color: inherit; text-decoration: inherit; }
 56 | div.sourceCode { margin: 1em 0; }
 57 | pre.sourceCode { margin: 0; }
 58 | @media screen {
 59 | div.sourceCode { overflow: auto; }
 60 | }
 61 | @media print {
 62 | pre > code.sourceCode { white-space: pre-wrap; }
 63 | pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
 64 | }
 65 | pre.numberSource code
 66 | { counter-reset: source-line 0; }
 67 | pre.numberSource code > span
 68 | { position: relative; left: -4em; counter-increment: source-line; }
 69 | pre.numberSource code > span > a:first-child::before
 70 | { content: counter(source-line);
 71 | position: relative; left: -1em; text-align: right; vertical-align: baseline;
 72 | border: none; display: inline-block;
 73 | -webkit-touch-callout: none; -webkit-user-select: none;
 74 | -khtml-user-select: none; -moz-user-select: none;
 75 | -ms-user-select: none; user-select: none;
 76 | padding: 0 4px; width: 4em;
 77 | color: #aaaaaa;
 78 | }
 79 | pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
 80 | div.sourceCode
 81 | { }
 82 | @media screen {
 83 | pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
 84 | }
 85 | code span.al { color: #ff0000; font-weight: bold; } 
 86 | code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } 
 87 | code span.at { color: #7d9029; } 
 88 | code span.bn { color: #40a070; } 
 89 | code span.bu { color: #008000; } 
 90 | code span.cf { color: #007020; font-weight: bold; } 
 91 | code span.ch { color: #4070a0; } 
 92 | code span.cn { color: #880000; } 
 93 | code span.co { color: #60a0b0; font-style: italic; } 
 94 | code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } 
 95 | code span.do { color: #ba2121; font-style: italic; } 
 96 | code span.dt { color: #902000; } 
 97 | code span.dv { color: #40a070; } 
 98 | code span.er { color: #ff0000; font-weight: bold; } 
 99 | code span.ex { } 
100 | code span.fl { color: #40a070; } 
101 | code span.fu { color: #06287e; } 
102 | code span.im { color: #008000; font-weight: bold; } 
103 | code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } 
104 | code span.kw { color: #007020; font-weight: bold; } 
105 | code span.op { color: #666666; } 
106 | code span.ot { color: #007020; } 
107 | code span.pp { color: #bc7a00; } 
108 | code span.sc { color: #4070a0; } 
109 | code span.ss { color: #bb6688; } 
110 | code span.st { color: #4070a0; } 
111 | code span.va { color: #19177c; } 
112 | code span.vs { color: #4070a0; } 
113 | code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } 
114 | </style>
115 | <script>
116 | // apply pandoc div.sourceCode style to pre.sourceCode instead
117 | (function() {
118 |   var sheets = document.styleSheets;
119 |   for (var i = 0; i < sheets.length; i++) {
120 |     if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
121 |     try { var rules = sheets[i].cssRules; } catch (e) { continue; }
122 |     var j = 0;
123 |     while (j < rules.length) {
124 |       var rule = rules[j];
125 |       // check if there is a div.sourceCode rule
126 |       if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
127 |         j++;
128 |         continue;
129 |       }
130 |       var style = rule.style.cssText;
131 |       // check if color or background-color is set
132 |       if (rule.style.color === '' && rule.style.backgroundColor === '') {
133 |         j++;
134 |         continue;
135 |       }
136 |       // replace div.sourceCode by a pre.sourceCode rule
137 |       sheets[i].deleteRule(j);
138 |       sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
139 |     }
140 |   }
141 | })();
142 | </script>
143 | 
144 | 
145 | 
146 | 
147 | <style type="text/css">body {
148 | background-color: #fff;
149 | margin: 1em auto;
150 | max-width: 700px;
151 | overflow: visible;
152 | padding-left: 2em;
153 | padding-right: 2em;
154 | font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
155 | font-size: 14px;
156 | line-height: 1.35;
157 | }
158 | #TOC {
159 | clear: both;
160 | margin: 0 0 10px 10px;
161 | padding: 4px;
162 | width: 400px;
163 | border: 1px solid #CCCCCC;
164 | border-radius: 5px;
165 | background-color: #f6f6f6;
166 | font-size: 13px;
167 | line-height: 1.3;
168 | }
169 | #TOC .toctitle {
170 | font-weight: bold;
171 | font-size: 15px;
172 | margin-left: 5px;
173 | }
174 | #TOC ul {
175 | padding-left: 40px;
176 | margin-left: -1.5em;
177 | margin-top: 5px;
178 | margin-bottom: 5px;
179 | }
180 | #TOC ul ul {
181 | margin-left: -2em;
182 | }
183 | #TOC li {
184 | line-height: 16px;
185 | }
186 | table {
187 | margin: 1em auto;
188 | border-width: 1px;
189 | border-color: #DDDDDD;
190 | border-style: outset;
191 | border-collapse: collapse;
192 | }
193 | table th {
194 | border-width: 2px;
195 | padding: 5px;
196 | border-style: inset;
197 | }
198 | table td {
199 | border-width: 1px;
200 | border-style: inset;
201 | line-height: 18px;
202 | padding: 5px 5px;
203 | }
204 | table, table th, table td {
205 | border-left-style: none;
206 | border-right-style: none;
207 | }
208 | table thead, table tr.even {
209 | background-color: #f7f7f7;
210 | }
211 | p {
212 | margin: 0.5em 0;
213 | }
214 | blockquote {
215 | background-color: #f6f6f6;
216 | padding: 0.25em 0.75em;
217 | }
218 | hr {
219 | border-style: solid;
220 | border: none;
221 | border-top: 1px solid #777;
222 | margin: 28px 0;
223 | }
224 | dl {
225 | margin-left: 0;
226 | }
227 | dl dd {
228 | margin-bottom: 13px;
229 | margin-left: 13px;
230 | }
231 | dl dt {
232 | font-weight: bold;
233 | }
234 | ul {
235 | margin-top: 0;
236 | }
237 | ul li {
238 | list-style: circle outside;
239 | }
240 | ul ul {
241 | margin-bottom: 0;
242 | }
243 | pre, code {
244 | background-color: #f7f7f7;
245 | border-radius: 3px;
246 | color: #333;
247 | white-space: pre-wrap; 
248 | }
249 | pre {
250 | border-radius: 3px;
251 | margin: 5px 0px 10px 0px;
252 | padding: 10px;
253 | }
254 | pre:not([class]) {
255 | background-color: #f7f7f7;
256 | }
257 | code {
258 | font-family: Consolas, Monaco, 'Courier New', monospace;
259 | font-size: 85%;
260 | }
261 | p > code, li > code {
262 | padding: 2px 0px;
263 | }
264 | div.figure {
265 | text-align: center;
266 | }
267 | img {
268 | background-color: #FFFFFF;
269 | padding: 2px;
270 | border: 1px solid #DDDDDD;
271 | border-radius: 3px;
272 | border: 1px solid #CCCCCC;
273 | margin: 0 5px;
274 | }
275 | h1 {
276 | margin-top: 0;
277 | font-size: 35px;
278 | line-height: 40px;
279 | }
280 | h2 {
281 | border-bottom: 4px solid #f7f7f7;
282 | padding-top: 10px;
283 | padding-bottom: 2px;
284 | font-size: 145%;
285 | }
286 | h3 {
287 | border-bottom: 2px solid #f7f7f7;
288 | padding-top: 10px;
289 | font-size: 120%;
290 | }
291 | h4 {
292 | border-bottom: 1px solid #f7f7f7;
293 | margin-left: 8px;
294 | font-size: 105%;
295 | }
296 | h5, h6 {
297 | border-bottom: 1px solid #ccc;
298 | font-size: 105%;
299 | }
300 | a {
301 | color: #0033dd;
302 | text-decoration: none;
303 | }
304 | a:hover {
305 | color: #6666ff; }
306 | a:visited {
307 | color: #800080; }
308 | a:visited:hover {
309 | color: #BB00BB; }
310 | a[href^="http:"] {
311 | text-decoration: underline; }
312 | a[href^="https:"] {
313 | text-decoration: underline; }
314 | 
315 | code > span.kw { color: #555; font-weight: bold; } 
316 | code > span.dt { color: #902000; } 
317 | code > span.dv { color: #40a070; } 
318 | code > span.bn { color: #d14; } 
319 | code > span.fl { color: #d14; } 
320 | code > span.ch { color: #d14; } 
321 | code > span.st { color: #d14; } 
322 | code > span.co { color: #888888; font-style: italic; } 
323 | code > span.ot { color: #007020; } 
324 | code > span.al { color: #ff0000; font-weight: bold; } 
325 | code > span.fu { color: #900; font-weight: bold; } 
326 | code > span.er { color: #a61717; background-color: #e3d2d2; } 
327 | </style>
328 | 
329 | 
330 | 
331 | 
332 | </head>
333 | 
334 | <body>
335 | 
336 | 
337 | 
338 | 
339 | <h1 class="title toc-ignore">Advanced tutorial</h1>
340 | 
341 | 
342 | 
343 | <div id="introduction" class="section level1">
344 | <h1>Introduction</h1>
345 | <p>Harmony uses a set of parameters to ensure the different components
346 | of the algorithm work in harmony! By default, several of these
347 | parameters are set by the algorithm using heuristics or empirical
348 | values. Most of the time, the end-user does not need to find optimal
349 | values to run Harmony. In this vignette, we will be going through some
350 | use cases where user need to intervene and optimize the data or
351 | parameters harmony.</p>
352 | <p>There are two reasons that someone a user may need to change the
353 | parameters. The first one is to increase the quality of the data
354 | integration. The second one is to improve the performance of
355 | harmony.</p>
356 | </div>
357 | <div id="harmony-algorithm-objective-diverges-after-a-number-of-correction-steps" class="section level1">
358 | <h1>Harmony algorithm objective diverges after a number of correction
359 | steps</h1>
360 | <p>For some datasets, the objective function may diverge after a few
361 | steps. Here we are going to be looking into the Jurkat dataset that is
362 | bundled with harmony.</p>
363 | <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="fu">library</span>(harmony)</span>
364 | <span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a><span class="co">#&gt; Loading required package: Rcpp</span></span>
365 | <span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a><span class="fu">library</span>(ggplot2)</span></code></pre></div>
366 | <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a></span>
367 | <span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="do">## Old</span></span>
368 | <span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a></span>
369 | <span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="do">## </span></span>
370 | <span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a><span class="do">## HarmonyMatrix(bos, opt.args = list(lambda = c(0,1)))</span></span></code></pre></div>
371 | </div>
372 | <div id="an-example-of-a-problematic-dataset" class="section level1">
373 | <h1>An example of a problematic dataset</h1>
374 | <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="do">## Source required data</span></span>
375 | <span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="do">## data(&quot;celllines&quot;)</span></span>
376 | <span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a></span>
377 | <span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a><span class="do">## cell_lines &lt;- zeros()</span></span>
378 | <span id="cb3-5"><a href="#cb3-5" tabindex="-1"></a><span class="do">## pbmc &lt;- CreateSeuratObject(counts = , project = &quot;jurkat&quot;, min.cells = 5)</span></span>
379 | <span id="cb3-6"><a href="#cb3-6" tabindex="-1"></a></span>
380 | <span id="cb3-7"><a href="#cb3-7" tabindex="-1"></a><span class="do">## Separate conditions</span></span>
381 | <span id="cb3-8"><a href="#cb3-8" tabindex="-1"></a></span>
382 | <span id="cb3-9"><a href="#cb3-9" tabindex="-1"></a><span class="do">## pbmc@meta.data$stim &lt;- c(rep(&quot;STIM&quot;, ncol(stim.sparse)), rep(&quot;CTRL&quot;, ncol(ctrl.sparse)))</span></span></code></pre></div>
383 | <div id="input-data" class="section level2">
384 | <h2>Input data</h2>
385 | </div>
386 | <div id="number-of-pcs" class="section level2">
387 | <h2>Number of PCs</h2>
388 | <p>Using the correct number of components can become very important in
389 | certain scenarios.</p>
390 | </div>
391 | <div id="nested-data" class="section level2">
392 | <h2>Nested data</h2>
393 | </div>
394 | </div>
395 | <div id="harmony-algorithm-parameters" class="section level1">
396 | <h1>Harmony Algorithm parameters</h1>
397 | <div id="theta" class="section level2">
398 | <h2>theta</h2>
399 | </div>
400 | <div id="lambda" class="section level2">
401 | <h2>lambda</h2>
402 | </div>
403 | <div id="sigma" class="section level2">
404 | <h2>sigma</h2>
405 | </div>
406 | <div id="nclust" class="section level2">
407 | <h2>nclust</h2>
408 | </div>
409 | </div>
410 | <div id="controlling-harmony-flow" class="section level1">
411 | <h1>Controlling harmony flow</h1>
412 | <div id="section" class="section level2">
413 | <h2></h2>
414 | </div>
415 | <div id="section-1" class="section level2">
416 | <h2></h2>
417 | </div>
418 | </div>
419 | 
420 | 
421 | 
422 | <!-- code folding -->
423 | 
424 | 
425 | <!-- dynamically load mathjax for compatibility with self-contained -->
426 | <script>
427 |   (function () {
428 |     var script = document.createElement("script");
429 |     script.type = "text/javascript";
430 |     script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
431 |     document.getElementsByTagName("head")[0].appendChild(script);
432 |   })();
433 | </script>
434 | 
435 | </body>
436 | </html>
437 | 


--------------------------------------------------------------------------------
/doc/quickstart.R:
--------------------------------------------------------------------------------
 1 | ## ----eval=FALSE---------------------------------------------------------------
 2 | #  install.packages('harmony')
 3 | 
 4 | ## -----------------------------------------------------------------------------
 5 | library(harmony)
 6 | 
 7 | ## -----------------------------------------------------------------------------
 8 | data("cell_lines")  # example data shipped with the package
 9 | V <- cell_lines[["scaled_pcs"]]
10 | meta_data <- cell_lines[["meta_data"]]
11 | 
12 | 
13 | ## ----class.source='fold-hide', fig.width=5, fig.height=3, fig.align="center"----
14 | 
15 | library(ggplot2)
16 | 
17 | do_scatter <- function(xy, meta_data, label_name, base_size = 12) {
18 |     ## Plot the first two columns of `xy`, coloured by the `label_name`
19 |     ## column of `meta_data`, with a text label at each group's mean position.
20 |     group_sym <- rlang::sym(label_name)
21 |     group_colors <- c(jurkat = '#810F7C', t293 = '#D09E2D', half = '#006D2C')
22 |     coords <- xy[, 1:2]
23 |     colnames(coords) <- c('X1', 'X2')
24 |     cells <- cbind(data.frame(coords), meta_data)
25 |     centroids <- dplyr::ungroup(dplyr::summarise(
26 |         dplyr::group_by(cells, !!group_sym), X1 = mean(X1), X2 = mean(X2)))
27 |     legend_keys <- guide_legend(
28 |         override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))
29 |     ggplot(cells, aes(X1, X2, col = !!group_sym, fill = !!group_sym)) +
30 |         theme_test(base_size = base_size) +
31 |         guides(color = legend_keys) +
32 |         scale_color_manual(values = group_colors) +
33 |         scale_fill_manual(values = group_colors) +
34 |         theme(plot.title = element_text(hjust = .5)) +
35 |         labs(x = "PC 1", y = "PC 2") +
36 |         theme(legend.position = "none") +
37 |         geom_point(shape = '.') +
38 |         geom_label(data = centroids, aes(label = !!group_sym),
39 |                    color = "white", size = 4)
40 | }
41 | ## Before integration: the same embedding coloured two ways
42 | p1 <- do_scatter(xy = V, meta_data = meta_data, label_name = 'dataset')
43 | p1 <- p1 + labs(title = 'Colored by dataset')
44 | p2 <- do_scatter(xy = V, meta_data = meta_data, label_name = 'cell_type')
45 | p2 <- p2 + labs(title = 'Colored by cell type')
46 | cowplot::plot_grid(p1, p2)
47 | 
48 | 
49 | ## -----------------------------------------------------------------------------
50 | harmony_embeddings <-
51 |     harmony::RunHarmony(V, meta_data, 'dataset', verbose = FALSE)
52 | 
53 | 
54 | 
55 | ## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
56 | ## After integration: identical panels drawn from the corrected embedding
57 | p1 <- do_scatter(xy = harmony_embeddings, meta_data = meta_data,
58 |                  label_name = 'dataset') + labs(title = 'Colored by dataset')
59 | p2 <- do_scatter(xy = harmony_embeddings, meta_data = meta_data,
60 |                  label_name = 'cell_type') + labs(title = 'Colored by cell type')
61 | cowplot::plot_grid(p1, p2, nrow = 1)
62 | 
63 | ## -----------------------------------------------------------------------------
64 | sessionInfo()
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/doc/quickstart.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quick start to Harmony"
  3 | author: "Korsunsky et al.: Fast, sensitive, and accurate integration of single 
  4 | cell data with Harmony"
  5 | output:
  6 |   rmarkdown::html_vignette:
  7 |     code_folding: show
  8 | vignette: >
  9 |     %\VignetteIndexEntry{Quick start to Harmony}
 10 |     %\VignetteEngine{knitr::rmarkdown}
 11 |     %\VignetteEncoding{UTF-8} 
 12 | ---
 13 |   
 14 | 
 15 | # Introduction
 16 | 
 17 | Harmony is an algorithm for performing integration of single cell genomics
 18 | datasets. Please check out our latest 
 19 | [manuscript on Nature Methods](https://www.nature.com/articles/s41592-019-0619-0). 
 20 | 
 21 | ![](main.jpg){width=100%}
 22 | 
 23 | 
 24 | # Installation
 25 | 
 26 | Install Harmony from CRAN with standard commands.
 27 | 
 28 | ```{r eval=FALSE}
 29 | install.packages('harmony')
 30 | ```
 31 | 
 32 | Once Harmony is installed, load it up! 
 33 | 
 34 | ```{r}
 35 | library(harmony)
 36 | ```
 37 | 
 38 | 
 39 | # Integrating cell line datasets from 10X
 40 | 
 41 | The example below follows Figure 2 in the manuscript. 
 42 | 
 43 | We downloaded 3 cell line datasets from the 10X website. The first two (jurkat
 44 | and 293t) come from pure cell lines while the *half* dataset is a 50:50
 45 | mixture of Jurkat and HEK293T cells. We inferred cell type with the canonical 
 46 | marker XIST, since the two cell lines come from 1 male and 1 female donor. 
 47 | 
 48 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat
 49 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t
 50 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat:293t_50:50
 51 | 
 52 | We library normalized the cells, log transformed the counts, and scaled the 
 53 | genes. Then we performed PCA and kept the top 20 PCs. The PCA embeddings and 
 54 | meta data are available as part of this package. 
 55 | 
 56 | ```{r}
 57 | data("cell_lines")  # example data shipped with the package
 58 | V <- cell_lines[["scaled_pcs"]]
 59 | meta_data <- cell_lines[["meta_data"]]
 60 | 
 61 | ```
 62 | 
 63 | 
 64 | Initially, the cells cluster by both dataset (left) and cell type (right). 
 65 | 
 66 | ```{r class.source='fold-hide', fig.width=5, fig.height=3, fig.align="center"}
 67 | 
 68 | library(ggplot2)
 69 | 
 70 | do_scatter <- function(xy, meta_data, label_name, base_size = 12) {
 71 |     ## Plot the first two columns of `xy`, coloured by the `label_name`
 72 |     ## column of `meta_data`, with a text label at each group's mean position.
 73 |     group_sym <- rlang::sym(label_name)
 74 |     group_colors <- c(jurkat = '#810F7C', t293 = '#D09E2D', half = '#006D2C')
 75 |     coords <- xy[, 1:2]
 76 |     colnames(coords) <- c('X1', 'X2')
 77 |     cells <- cbind(data.frame(coords), meta_data)
 78 |     centroids <- dplyr::ungroup(dplyr::summarise(
 79 |         dplyr::group_by(cells, !!group_sym), X1 = mean(X1), X2 = mean(X2)))
 80 |     legend_keys <- guide_legend(
 81 |         override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))
 82 |     ggplot(cells, aes(X1, X2, col = !!group_sym, fill = !!group_sym)) +
 83 |         theme_test(base_size = base_size) +
 84 |         guides(color = legend_keys) +
 85 |         scale_color_manual(values = group_colors) +
 86 |         scale_fill_manual(values = group_colors) +
 87 |         theme(plot.title = element_text(hjust = .5)) +
 88 |         labs(x = "PC 1", y = "PC 2") +
 89 |         theme(legend.position = "none") +
 90 |         geom_point(shape = '.') +
 91 |         geom_label(data = centroids, aes(label = !!group_sym),
 92 |                    color = "white", size = 4)
 93 | }
 94 | ## Before integration: the same embedding coloured two ways
 95 | p1 <- do_scatter(xy = V, meta_data = meta_data, label_name = 'dataset')
 96 | p1 <- p1 + labs(title = 'Colored by dataset')
 97 | p2 <- do_scatter(xy = V, meta_data = meta_data, label_name = 'cell_type')
 98 | p2 <- p2 + labs(title = 'Colored by cell type')
 99 | cowplot::plot_grid(p1, p2)
100 | 
101 | ```
102 | 
103 | Let's run Harmony to remove the influence of dataset-of-origin from the cell
104 | embeddings.
105 | 
106 | ```{r}
107 | ## Positional arguments: embedding matrix, metadata, covariate to integrate out
108 | harmony_embeddings <-
109 |     harmony::RunHarmony(V, meta_data, 'dataset', verbose = FALSE)
110 | 
111 | ```
112 | 
113 | After Harmony, the datasets are now mixed (left) and the cell types are still
114 | separate (right). 
115 | 
116 | ```{r, fig.width=5, fig.height=3, fig.align="center"}
117 | p1 <- do_scatter(xy = harmony_embeddings, meta_data = meta_data,
118 |                  label_name = 'dataset') + labs(title = 'Colored by dataset')
119 | p2 <- do_scatter(xy = harmony_embeddings, meta_data = meta_data,
120 |                  label_name = 'cell_type') + labs(title = 'Colored by cell type')
121 | cowplot::plot_grid(p1, p2, nrow = 1)
122 | 
123 | ```
124 | 
125 | # Next Steps
126 | 
127 | ## Interfacing to software packages
128 | 
129 | You can also run Harmony as part of an established pipeline in several packages, such as Seurat. For these vignettes, please [visit our github page](https://github.com/immunogenomics/harmony/).
130 | 
131 | 
132 | ## Detailed breakdown of the Harmony algorithm
133 | 
134 | For more details on how each part of Harmony works, consult our more detailed
135 | [vignette](https://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/detailedWalkthrough.html)
136 | "Detailed Walkthrough of Harmony Algorithm".
137 | 
138 | # Session Info
139 | 
140 | ```{r}
141 | sessionInfo()  # report the R version, platform and attached packages
142 | 
143 | ```
144 | 


--------------------------------------------------------------------------------
/man/HarmonyMatrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ui.R
 3 | \name{HarmonyMatrix}
 4 | \alias{HarmonyMatrix}
 5 | \title{A proxy call to \code{\link[=RunHarmony]{RunHarmony()}}. Deprecated.}
 6 | \usage{
 7 | HarmonyMatrix(...)
 8 | }
 9 | \arguments{
10 | \item{...}{
11 |   Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}}
12 |   \describe{
13 |     \item{\code{data_mat}}{Matrix of cell embeddings. Cells can be rows or
14 | columns and will be inferred by the rows of meta_data.}
15 |     \item{\code{meta_data}}{Either (1) Dataframe with variables to integrate
16 | or (2) vector with labels.}
17 |     \item{\code{vars_use}}{If meta_data is a dataframe, this defines which
18 | variable(s) to remove (character vector).}
19 |     \item{\code{theta}}{Diversity clustering penalty parameter. Specify for
20 | each variable in vars_use. Default theta=2. theta=0 does not
21 | encourage any diversity. Larger values of theta result in more
22 | diverse clusters.}
23 |     \item{\code{sigma}}{Width of soft kmeans clusters. Default
24 | sigma=0.1. Sigma scales the distance from a cell to cluster
25 | centroids. Larger values of sigma result in cells assigned to
26 | more clusters. Smaller values of sigma make soft kmeans cluster
27 | approach hard clustering.}
28 |     \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger
29 | values protect against over correction. If several covariates
30 | are specified, then lambda can also be a vector which needs to
31 | be equal length with the number of variables to be
32 | corrected. In this scenario, each covariate level group will be
33 | assigned the scalars specified by the user. If set to NULL,
34 | harmony will start lambda estimation mode to determine lambdas
35 | automatically and try to minimize overcorrection (Use with caution still
36 | in beta testing).}
37 |     \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to
38 | simple linear regression.}
39 |     \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round
40 | of Harmony involves one clustering and one correction step.}
41 |     \item{\code{early_stop}}{Enable early stopping for harmony. The
42 | harmonization process will stop when the change of objective
43 | function between corrections drops below 1e-4}
44 |     \item{\code{ncores}}{Number of processors to be used for math operations
45 | when optimized BLAS is available. If BLAS does not support
46 | multithreading, then this option has no effect. By default,
47 | ncores=1, which runs as a single-threaded process. Although
48 | Harmony supports multiple cores, it is not optimized for
49 | multithreading. Increase this number for large datasets only if
50 | single-core performance is not adequate.}
51 |     \item{\code{plot_convergence}}{Whether to print the convergence plot of
52 | the clustering objective function. TRUE to plot, FALSE to
53 | suppress. This can be useful for debugging.}
54 |     \item{\code{return_object}}{(Advanced Usage) Whether to return the Harmony
55 | object or only the corrected PCA embeddings.}
56 |     \item{\code{verbose}}{Whether to print progress messages. TRUE to print,
57 | FALSE to suppress.}
58 |     \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the
59 | result from a call to `harmony_options`. See ?`harmony_options` for 
60 | parameters not listed above and more details.}
61 |   }}
62 | }
63 | \description{
64 | Maintain name backwards compatibility with version 0 of
65 | harmony. However, API is not backwards compatible with version
66 | 0. This function will be deprecated in later versions of Harmony.
67 | }
68 | 


--------------------------------------------------------------------------------
/man/RunHarmony.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RunHarmony.R
 3 | \name{RunHarmony}
 4 | \alias{RunHarmony}
 5 | \title{Generic function that runs the harmony algorithm on single-cell
 6 | genomics cell embeddings.}
 7 | \usage{
 8 | RunHarmony(...)
 9 | }
10 | \arguments{
11 | \item{...}{
12 |   Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}}
13 |   \describe{
14 |     \item{\code{theta}}{Diversity clustering penalty parameter. Specify for
15 | each variable in vars_use. Default theta=2. theta=0 does not
16 | encourage any diversity. Larger values of theta result in more
17 | diverse clusters.}
18 |     \item{\code{sigma}}{Width of soft kmeans clusters. Default
19 | sigma=0.1. Sigma scales the distance from a cell to cluster
20 | centroids. Larger values of sigma result in cells assigned to
21 | more clusters. Smaller values of sigma make soft kmeans cluster
22 | approach hard clustering.}
23 |     \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger
24 | values protect against over correction. If several covariates
25 | are specified, then lambda can also be a vector which needs to
26 | be equal length with the number of variables to be
27 | corrected. In this scenario, each covariate level group will be
28 | assigned the scalars specified by the user. If set to NULL,
29 | harmony will start lambda estimation mode to determine lambdas
30 | automatically and try to minimize overcorrection (Use with caution still
31 | in beta testing).}
32 |     \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to
33 | simple linear regression.}
34 |     \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round
35 | of Harmony involves one clustering and one correction step.}
36 |     \item{\code{early_stop}}{Enable early stopping for harmony. The
37 | harmonization process will stop when the change of objective
38 | function between corrections drops below 1e-4}
39 |     \item{\code{ncores}}{Number of processors to be used for math operations
40 | when optimized BLAS is available. If BLAS does not support
41 | multithreading, then this option has no effect. By default,
42 | ncores=1, which runs as a single-threaded process. Although
43 | Harmony supports multiple cores, it is not optimized for
44 | multithreading. Increase this number for large datasets only if
45 | single-core performance is not adequate.}
46 |     \item{\code{plot_convergence}}{Whether to print the convergence plot of
47 | the clustering objective function. TRUE to plot, FALSE to
48 | suppress. This can be useful for debugging.}
49 |     \item{\code{verbose}}{Whether to print progress messages. TRUE to print,
50 | FALSE to suppress.}
51 |     \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the
52 | result from a call to `harmony_options`. See ?`harmony_options` for 
53 | parameters not listed above and more details.}
54 |   }}
55 | }
56 | \value{
57 | If used with single-cell objects, it will return the
58 | updated single-cell object. For standalone operation, it
59 | returns the corrected cell embeddings or the R6 harmony object
60 | (see \code{\link[=RunHarmony.default]{RunHarmony.default()}}).
61 | }
62 | \description{
63 | RunHarmony is a generic function that runs the main Harmony
64 | algorithm. If working with single cell R objects, please refer to
65 | the documentation of the appropriate generic API:
66 | (\code{\link[=RunHarmony.Seurat]{RunHarmony.Seurat()}} or \code{\link[=RunHarmony.SingleCellExperiment]{RunHarmony.SingleCellExperiment()}}). If
67 | users work with other forms of cell embeddings, they can pass them
68 | directly to harmony using the \code{\link[=RunHarmony.default]{RunHarmony.default()}} API. All the
69 | function arguments listed here are common to all RunHarmony
70 | interfaces.
71 | }
72 | \seealso{
73 | Other RunHarmony: 
74 | \code{\link{RunHarmony.Seurat}()},
75 | \code{\link{RunHarmony.SingleCellExperiment}()},
76 | \code{\link{RunHarmony.default}()}
77 | }
78 | \concept{RunHarmony}
79 | 


--------------------------------------------------------------------------------
/man/RunHarmony.Seurat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RunHarmony.R
 3 | \name{RunHarmony.Seurat}
 4 | \alias{RunHarmony.Seurat}
 5 | \title{Applies harmony on a Seurat object cell embedding.}
 6 | \usage{
 7 | \method{RunHarmony}{Seurat}(
 8 |   object,
 9 |   group.by.vars,
10 |   reduction.use = "pca",
11 |   dims.use = NULL,
12 |   reduction.save = "harmony",
13 |   project.dim = TRUE,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{object}{the Seurat object. It needs to have the appropriate slot
19 | of cell embeddings precomputed.}
20 | 
21 | \item{group.by.vars}{the name(s) of the covariates whose effect
22 | harmony will remove from the data.}
23 | 
24 | \item{reduction.use}{Name of dimension reduction to use. Default is pca.}
25 | 
26 | \item{dims.use}{indices of the cell embedding features to be used}
27 | 
28 | \item{reduction.save}{the name of the new slot that is going to be created by
29 | harmony. By default, harmony.}
30 | 
31 | \item{project.dim}{Project dimension reduction loadings. Default TRUE.}
32 | 
33 | \item{...}{
34 |   Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}}
35 |   \describe{
36 |     \item{\code{theta}}{Diversity clustering penalty parameter. Specify for
37 | each variable in vars_use. Default theta=2. theta=0 does not
38 | encourage any diversity. Larger values of theta result in more
39 | diverse clusters.}
40 |     \item{\code{sigma}}{Width of soft kmeans clusters. Default
41 | sigma=0.1. Sigma scales the distance from a cell to cluster
42 | centroids. Larger values of sigma result in cells assigned to
43 | more clusters. Smaller values of sigma make soft kmeans cluster
44 | approach hard clustering.}
45 |     \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger
46 | values protect against over correction. If several covariates
47 | are specified, then lambda can also be a vector which needs to
48 | be equal length with the number of variables to be
49 | corrected. In this scenario, each covariate level group will be
50 | assigned the scalars specified by the user. If set to NULL,
51 | harmony will start lambda estimation mode to determine lambdas
52 | automatically and try to minimize overcorrection (Use with caution still
53 | in beta testing).}
54 |     \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to
55 | simple linear regression.}
56 |     \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round
57 | of Harmony involves one clustering and one correction step.}
58 |     \item{\code{early_stop}}{Enable early stopping for harmony. The
59 | harmonization process will stop when the change of objective
60 | function between corrections drops below 1e-4}
61 |     \item{\code{ncores}}{Number of processors to be used for math operations
62 | when optimized BLAS is available. If BLAS does not support
63 | multithreading, then this option has no effect. By default,
64 | ncores=1, which runs as a single-threaded process. Although
65 | Harmony supports multiple cores, it is not optimized for
66 | multithreading. Increase this number for large datasets only if
67 | single-core performance is not adequate.}
68 |     \item{\code{plot_convergence}}{Whether to print the convergence plot of
69 | the clustering objective function. TRUE to plot, FALSE to
70 | suppress. This can be useful for debugging.}
71 |     \item{\code{verbose}}{Whether to print progress messages. TRUE to print,
72 | FALSE to suppress.}
73 |     \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the
74 | result from a call to `harmony_options`. See ?`harmony_options` for 
75 | parameters not listed above and more details.}
76 |   }}
77 | }
78 | \value{
79 | Seurat object. Harmony dimensions placed into a new slot in the Seurat
80 | object according to the reduction.save. For downstream Seurat analyses,
81 | use reduction='harmony'.
82 | }
83 | \description{
84 | Applies harmony on a Seurat object cell embedding.
85 | }
86 | \examples{
87 | \dontrun{
88 | ## seu is a Seurat single-Cell R object
89 | seu <- RunHarmony(seu, "donor_id")
90 | }
91 | }
92 | \seealso{
93 | Other RunHarmony: 
94 | \code{\link{RunHarmony.SingleCellExperiment}()},
95 | \code{\link{RunHarmony.default}()},
96 | \code{\link{RunHarmony}()}
97 | }
98 | \concept{RunHarmony}
99 | 


--------------------------------------------------------------------------------
/man/RunHarmony.SingleCellExperiment.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RunHarmony.R
 3 | \name{RunHarmony.SingleCellExperiment}
 4 | \alias{RunHarmony.SingleCellExperiment}
 5 | \title{Applies harmony on PCA cell embeddings of a SingleCellExperiment.}
 6 | \usage{
 7 | \method{RunHarmony}{SingleCellExperiment}(
 8 |   object,
 9 |   group.by.vars,
10 |   dims.use = NULL,
11 |   verbose = TRUE,
12 |   reduction.save = "HARMONY",
13 |   ...
14 | )
15 | }
16 | \arguments{
17 | \item{object}{SingleCellExperiment with the PCA reducedDim cell embeddings populated}
18 | 
19 | \item{group.by.vars}{the name(s) of the covariates whose effect
20 | harmony will remove from the data.}
21 | 
22 | \item{dims.use}{a vector of indices that allows only selected cell embeddings
23 | features to be used.}
24 | 
25 | \item{verbose}{enable verbosity}
26 | 
27 | \item{reduction.save}{the name of the new slot that is going to be created by
28 | harmony. By default, HARMONY.}
29 | 
30 | \item{...}{
31 |   Arguments passed on to \code{\link[=RunHarmony.default]{RunHarmony.default}}
32 |   \describe{
33 |     \item{\code{theta}}{Diversity clustering penalty parameter. Specify for
34 | each variable in vars_use. Default theta=2. theta=0 does not
35 | encourage any diversity. Larger values of theta result in more
36 | diverse clusters.}
37 |     \item{\code{sigma}}{Width of soft kmeans clusters. Default
38 | sigma=0.1. Sigma scales the distance from a cell to cluster
39 | centroids. Larger values of sigma result in cells assigned to
40 | more clusters. Smaller values of sigma make soft kmeans cluster
41 | approach hard clustering.}
42 |     \item{\code{lambda}}{Ridge regression penalty. Default lambda=1. Bigger
43 | values protect against over correction. If several covariates
44 | are specified, then lambda can also be a vector which needs to
45 | be equal length with the number of variables to be
46 | corrected. In this scenario, each covariate level group will be
47 | assigned the scalars specified by the user. If set to NULL,
48 | harmony will start lambda estimation mode to determine lambdas
49 | automatically and try to minimize overcorrection (Use with caution still
50 | in beta testing).}
51 |     \item{\code{nclust}}{Number of clusters in model. nclust=1 equivalent to
52 | simple linear regression.}
53 |     \item{\code{max_iter}}{Maximum number of rounds to run Harmony. One round
54 | of Harmony involves one clustering and one correction step.}
55 |     \item{\code{early_stop}}{Enable early stopping for harmony. The
56 | harmonization process will stop when the change of objective
57 | function between corrections drops below 1e-4}
58 |     \item{\code{ncores}}{Number of processors to be used for math operations
59 | when optimized BLAS is available. If BLAS does not support
60 | multithreading, then this option has no effect. By default,
61 | ncores=1, which runs as a single-threaded process. Although
62 | Harmony supports multiple cores, it is not optimized for
63 | multithreading. Increase this number for large datasets only if
64 | single-core performance is not adequate.}
65 |     \item{\code{plot_convergence}}{Whether to print the convergence plot of
66 | the clustering objective function. TRUE to plot, FALSE to
67 | suppress. This can be useful for debugging.}
68 |     \item{\code{.options}}{Setting advanced parameters of RunHarmony. This must be the
69 | result from a call to `harmony_options`. See ?`harmony_options` for 
70 | parameters not listed above and more details.}
71 |   }}
72 | }
73 | \value{
74 | SingleCellExperiment object. After running RunHarmony, the corrected
75 | cell embeddings can be accessed with reducedDim(object, "HARMONY") (or the name given as reduction.save).
76 | }
77 | \description{
78 | Applies harmony on PCA cell embeddings of a SingleCellExperiment.
79 | }
80 | \examples{
81 | \dontrun{
82 | ## sce is a SingleCellExperiment R object
83 | sce <- RunHarmony(sce, "donor_id")
84 | }
85 | }
86 | \seealso{
87 | Other RunHarmony: 
88 | \code{\link{RunHarmony.Seurat}()},
89 | \code{\link{RunHarmony.default}()},
90 | \code{\link{RunHarmony}()}
91 | }
92 | \concept{RunHarmony}
93 | 


--------------------------------------------------------------------------------
/man/RunHarmony.default.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/ui.R
  3 | \name{RunHarmony.default}
  4 | \alias{RunHarmony.default}
  5 | \title{This is the primary harmony interface.}
  6 | \usage{
  7 | \method{RunHarmony}{default}(
  8 |   data_mat,
  9 |   meta_data,
 10 |   vars_use,
 11 |   theta = NULL,
 12 |   sigma = 0.1,
 13 |   lambda = 1,
 14 |   nclust = NULL,
 15 |   max_iter = 10,
 16 |   early_stop = TRUE,
 17 |   ncores = 1,
 18 |   plot_convergence = FALSE,
 19 |   return_object = FALSE,
 20 |   verbose = TRUE,
 21 |   .options = harmony_options(),
 22 |   ...
 23 | )
 24 | }
 25 | \arguments{
 26 | \item{data_mat}{Matrix of cell embeddings. Cells can be rows or
 27 | columns and will be inferred by the rows of meta_data.}
 28 | 
 29 | \item{meta_data}{Either (1) Dataframe with variables to integrate
 30 | or (2) vector with labels.}
 31 | 
 32 | \item{vars_use}{If meta_data is a dataframe, this defines which
 33 | variable(s) to remove (character vector).}
 34 | 
 35 | \item{theta}{Diversity clustering penalty parameter. Specify for
 36 | each variable in vars_use. Default theta=2. theta=0 does not
 37 | encourage any diversity. Larger values of theta result in more
 38 | diverse clusters.}
 39 | 
 40 | \item{sigma}{Width of soft kmeans clusters. Default
 41 | sigma=0.1. Sigma scales the distance from a cell to cluster
 42 | centroids. Larger values of sigma result in cells assigned to
 43 | more clusters. Smaller values of sigma make soft kmeans cluster
 44 | approach hard clustering.}
 45 | 
 46 | \item{lambda}{Ridge regression penalty. Default lambda=1. Bigger
 47 | values protect against over correction. If several covariates
 48 | are specified, then lambda can also be a vector which needs to
 49 | be equal length with the number of variables to be
 50 | corrected. In this scenario, each covariate level group will be
 51 | assigned the scalars specified by the user. If set to NULL,
 52 | harmony will start lambda estimation mode to determine lambdas
 53 | automatically and try to minimize overcorrection (Use with caution still
 54 | in beta testing).}
 55 | 
 56 | \item{nclust}{Number of clusters in model. nclust=1 equivalent to
 57 | simple linear regression.}
 58 | 
 59 | \item{max_iter}{Maximum number of rounds to run Harmony. One round
 60 | of Harmony involves one clustering and one correction step.}
 61 | 
 62 | \item{early_stop}{Enable early stopping for harmony. The
 63 | harmonization process will stop when the change of objective
 64 | function between corrections drops below 1e-4}
 65 | 
 66 | \item{ncores}{Number of processors to be used for math operations
 67 | when optimized BLAS is available. If BLAS does not support
 68 | multithreading, then this option has no effect. By default,
 69 | ncores=1, which runs as a single-threaded process. Although
 70 | Harmony supports multiple cores, it is not optimized for
 71 | multithreading. Increase this number for large datasets only if
 72 | single-core performance is not adequate.}
 73 | 
 74 | \item{plot_convergence}{Whether to print the convergence plot of
 75 | the clustering objective function. TRUE to plot, FALSE to
 76 | suppress. This can be useful for debugging.}
 77 | 
 78 | \item{return_object}{(Advanced Usage) Whether to return the Harmony
 79 | object or only the corrected PCA embeddings.}
 80 | 
 81 | \item{verbose}{Whether to print progress messages. TRUE to print,
 82 | FALSE to suppress.}
 83 | 
 84 | \item{.options}{Setting advanced parameters of RunHarmony. This must be the
 85 | result from a call to `harmony_options`. See ?`harmony_options` for 
 86 | parameters not listed above and more details.}
 87 | 
 88 | \item{...}{other parameters that are not part of the API}
 89 | }
 90 | \value{
 91 | By default, matrix with corrected PCA embeddings. If
 92 |     return_object is TRUE, returns the full Harmony object (R6
 93 |     reference class type).
 94 | }
 95 | \description{
 96 | Use this generic with a cell embeddings matrix, a metadata table
 97 | and a categorical covariate to run the Harmony algorithm directly
 98 | on cell embedding matrix.
 99 | }
100 | \examples{
101 | 
102 | 
103 | ## By default, Harmony inputs a cell embedding matrix
104 | \dontrun{
105 | harmony_embeddings <- RunHarmony(cell_embeddings, meta_data, 'dataset')
106 | }
107 | 
108 | ## If PCA is the input, the PCs need to be scaled
109 | data(cell_lines_small)
110 | pca_matrix <- cell_lines_small$scaled_pcs
111 | meta_data <- cell_lines_small$meta_data
112 | harmony_embeddings <- RunHarmony(pca_matrix, meta_data, 'dataset')
113 | 
114 | ## Output is a matrix of corrected PC embeddings
115 | dim(harmony_embeddings)
116 | harmony_embeddings[seq_len(5), seq_len(5)]
117 | 
118 | ## Finally, we can return an object with all the underlying data structures
119 | harmony_object <- RunHarmony(pca_matrix, meta_data, 'dataset', return_object=TRUE)
120 | dim(harmony_object$Y) ## cluster centroids
121 | dim(harmony_object$R) ## soft cluster assignment
122 | dim(harmony_object$Z_corr) ## corrected PCA embeddings
123 | head(harmony_object$O) ## batch by cluster co-occurrence matrix
124 | 
125 | }
126 | \seealso{
127 | Other RunHarmony: 
128 | \code{\link{RunHarmony.Seurat}()},
129 | \code{\link{RunHarmony.SingleCellExperiment}()},
130 | \code{\link{RunHarmony}()}
131 | }
132 | \concept{RunHarmony}
133 | 


--------------------------------------------------------------------------------
/man/cell_lines.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cell_lines}
 5 | \alias{cell_lines}
 6 | \title{List of metadata table and scaled PCs matrix}
 7 | \format{
 8 | :
 9 |   meta_data: data.table of 9478 rows with defining dataset and cell_type
10 |   scaled_pcs: data.table of 9478 rows (cells) and 20 columns (PCs)
11 | }
12 | \source{
13 | \url{https://www.10xgenomics.com}
14 | }
15 | \usage{
16 | cell_lines
17 | }
18 | \description{
19 | List of metadata table and scaled PCs matrix
20 | }
21 | \keyword{datasets}
22 | 


--------------------------------------------------------------------------------
/man/cell_lines_small.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cell_lines_small}
 5 | \alias{cell_lines_small}
 6 | \title{Same as cell_lines but smaller (300 cells).}
 7 | \format{
 8 | An object of class \code{list} of length 2.
 9 | }
10 | \source{
11 | \url{https://www.10xgenomics.com}
12 | }
13 | \usage{
14 | cell_lines_small
15 | }
16 | \description{
17 | Same as cell_lines but smaller (300 cells).
18 | }
19 | \keyword{datasets}
20 | 


--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/man/figures/logo.png


--------------------------------------------------------------------------------
/man/harmony.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/harmony-package.r
 3 | \docType{package}
 4 | \name{harmony}
 5 | \alias{harmony}
 6 | \title{Harmony: fast, accurate, and robust single cell integration.}
 7 | \description{
 8 | Algorithm for single cell integration.
 9 | }
10 | \section{Usage}{
11 | 
12 | 
13 | 
14 | ?RunHarmony to run Harmony on cell embeddings matrix, Seurat or
15 | SingleCellExperiment objects.
16 | }
17 | 
18 | \section{Useful links}{
19 | 
20 | 
21 | \enumerate{
22 | \item Report bugs at \url{https://github.com/immunogenomics/harmony/issues}
23 | \item Read the manuscript
24 | \doi{10.1038/s41592-019-0619-0}
25 | }
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/man/harmony_options.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/harmony_option.R
 3 | \name{harmony_options}
 4 | \alias{harmony_options}
 5 | \title{Set advanced parameters for RunHarmony}
 6 | \usage{
 7 | harmony_options(
 8 |   alpha = 0.2,
 9 |   tau = 0,
10 |   block.size = 0.05,
11 |   max.iter.cluster = 20,
12 |   epsilon.cluster = 0.001,
13 |   epsilon.harmony = 0.01
14 | )
15 | }
16 | \arguments{
17 | \item{alpha}{When lambda = NULL (lambda estimation mode), 
18 | lambda is determined from the expected number of cells, assuming 
19 | independence between batches and clusters, i.e., lambda = alpha * expected
20 | number of cells. Default is 0.2; alpha should satisfy 0 < alpha < 1.}
21 | 
22 | \item{tau}{Protection against overclustering small datasets with 
23 | large ones. `tau` is the expected number of cells per cluster.}
24 | 
25 | \item{block.size}{What proportion of cells to update during clustering. 
26 | Between 0 to 1, default 0.05. Larger values may be faster but less 
27 | accurate.}
28 | 
29 | \item{max.iter.cluster}{Maximum number of rounds to run clustering 
30 | at each round of Harmony.}
31 | 
32 | \item{epsilon.cluster}{Convergence tolerance for clustering round 
33 | of Harmony. Set to -Inf to never stop early.}
34 | 
35 | \item{epsilon.harmony}{Convergence tolerance for Harmony. Set to -Inf to
36 | never stop early. When `epsilon.harmony` is not NULL, any
37 | user-supplied value of `early_stop` is ignored.}
38 | }
39 | \value{
40 | Return a list for `.options` argument of `RunHarmony`
41 | }
42 | \description{
43 | Set advanced parameters for RunHarmony
44 | }
45 | \examples{
46 | ## If you want to set max.iter.cluster to 100, do
47 | \dontrun{
48 | RunHarmony(data_meta, meta_data, vars_use,
49 |               .options = harmony_options(max.iter.cluster = 100))
50 | }
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/man/moe_ridge_get_betas.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{moe_ridge_get_betas}
 4 | \alias{moe_ridge_get_betas}
 5 | \title{Get beta Utility}
 6 | \usage{
 7 | moe_ridge_get_betas(harmonyObj)
 8 | }
 9 | \arguments{
10 | \item{harmonyObj}{Trained harmony object. Get this by running 
11 | RunHarmony function with return_object=TRUE.}
12 | }
13 | \value{
14 | Returns nothing, modifies object in place.
15 | }
16 | \description{
17 | Utility function to get ridge regression coefficients from trained
18 | Harmony object
19 | }
20 | 


--------------------------------------------------------------------------------
/man/pbmc.ctrl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{pbmc.ctrl}
 5 | \alias{pbmc.ctrl}
 6 | \title{Gene expression data of control PBMC from Kang et al. 2017. This
 7 | contains a sample of 1000 cells from that condition and is used for
 8 | the Seurat Vignette.}
 9 | \format{
10 | An object of class \code{dgCMatrix} with 9015 rows and 1000 columns.
11 | }
12 | \source{
13 | \doi{10.1038/nbt.4042}
14 | }
15 | \usage{
16 | pbmc.ctrl
17 | }
18 | \description{
19 | Gene expression data of control PBMC from Kang et al. 2017. This
20 | contains a sample of 1000 cells from that condition and is used for
21 | the Seurat Vignette.
22 | }
23 | \keyword{datasets}
24 | 


--------------------------------------------------------------------------------
/man/pbmc.stim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{pbmc.stim}
 5 | \alias{pbmc.stim}
 6 | \title{Gene expression data of stimulated PBMC from Kang et al. 2017. This
 7 | contains a sample of 1000 cells from that condition and is used for
 8 | the Seurat Vignette.}
 9 | \format{
10 | An object of class \code{dgCMatrix} with 9015 rows and 1000 columns.
11 | }
12 | \source{
13 | \doi{10.1038/nbt.4042}
14 | }
15 | \usage{
16 | pbmc.stim
17 | }
18 | \description{
19 | Gene expression data of stimulated PBMC from Kang et al. 2017. This
20 | contains a sample of 1000 cells from that condition and is used for
21 | the Seurat Vignette.
22 | }
23 | \keyword{datasets}
24 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \value{
10 | return value of rhs function.
11 | }
12 | \description{
13 | Pipe operator
14 | }
15 | \examples{
16 | x <- 5 \%>\% sum(10)
17 | 
18 | }
19 | \keyword{internal}
20 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | .cache
5 | compile_commands.json
6 | MakefileBear


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
2 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | PKG_LIBS = $(shell $(R_HOME)/bin/Rscript.exe -e "Rcpp:::LdFlags()") $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
2 | 
3 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include "harmony_types.h"
 5 | #include <RcppArmadillo.h>
 6 | #include <Rcpp.h>
 7 | 
 8 | using namespace Rcpp;
 9 | 
10 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
11 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
12 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
13 | #endif
14 | 
// kmeans_centers
// Forward declaration of the C++ implementation invoked by the glue code below.
arma::mat kmeans_centers(const arma::mat& X, const int K);
// .Call entry point for kmeans_centers(): converts the two SEXP arguments to
// their C++ types, calls the implementation and wraps the result for R.
// This file is generated by Rcpp::compileAttributes(); manual edits are
// overwritten on regeneration.
RcppExport SEXP _harmony_kmeans_centers(SEXP XSEXP, SEXP KSEXP) {
BEGIN_RCPP
    // The result holder is declared before the RNG scope object.
    // NOTE(review): presumably so the RNG scope is destroyed first - confirm
    // against the Rcpp documentation before reordering.
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP);
    Rcpp::traits::input_parameter< const int >::type K(KSEXP);
    rcpp_result_gen = Rcpp::wrap(kmeans_centers(X, K));
    return rcpp_result_gen;
END_RCPP
}
// scaleRows_dgc
// Forward declaration of the C++ implementation invoked by the glue code below.
MATTYPE scaleRows_dgc(const VECTYPE& x, const VECTYPE& p, const VECTYPE& i, int ncol, int nrow, float thresh);
// .Call entry point for scaleRows_dgc(): converts the six SEXP arguments to
// their C++ types, calls the implementation and wraps the result for R.
// Generated by Rcpp::compileAttributes(); manual edits are overwritten on
// regeneration.
RcppExport SEXP _harmony_scaleRows_dgc(SEXP xSEXP, SEXP pSEXP, SEXP iSEXP, SEXP ncolSEXP, SEXP nrowSEXP, SEXP threshSEXP) {
BEGIN_RCPP
    // Result holder first, RNG scope second (generator-defined order).
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< const VECTYPE& >::type x(xSEXP);
    Rcpp::traits::input_parameter< const VECTYPE& >::type p(pSEXP);
    Rcpp::traits::input_parameter< const VECTYPE& >::type i(iSEXP);
    Rcpp::traits::input_parameter< int >::type ncol(ncolSEXP);
    Rcpp::traits::input_parameter< int >::type nrow(nrowSEXP);
    Rcpp::traits::input_parameter< float >::type thresh(threshSEXP);
    rcpp_result_gen = Rcpp::wrap(scaleRows_dgc(x, p, i, ncol, nrow, thresh));
    return rcpp_result_gen;
END_RCPP
}
// find_lambda_cpp
// Forward declaration of the C++ implementation invoked by the glue code below.
arma::vec find_lambda_cpp(const float alpha, const arma::vec& cluster_E);
// .Call entry point for find_lambda_cpp(): converts the two SEXP arguments to
// their C++ types, calls the implementation and wraps the result for R.
// Generated by Rcpp::compileAttributes(); manual edits are overwritten on
// regeneration.
RcppExport SEXP _harmony_find_lambda_cpp(SEXP alphaSEXP, SEXP cluster_ESEXP) {
BEGIN_RCPP
    // Result holder first, RNG scope second (generator-defined order).
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< const float >::type alpha(alphaSEXP);
    Rcpp::traits::input_parameter< const arma::vec& >::type cluster_E(cluster_ESEXP);
    rcpp_result_gen = Rcpp::wrap(find_lambda_cpp(alpha, cluster_E));
    return rcpp_result_gen;
END_RCPP
}
55 | 
// Boot function of the Rcpp module "harmony_module"; declared here so it can
// be listed in the registration table below.
RcppExport SEXP _rcpp_module_boot_harmony_module();

// Table of native routines registered with R. Each row is
// {symbol name, function pointer, number of arguments}; the argument counts
// match the wrapper signatures in this file. The all-NULL row ends the table.
static const R_CallMethodDef CallEntries[] = {
    {"_harmony_kmeans_centers", (DL_FUNC) &_harmony_kmeans_centers, 2},
    {"_harmony_scaleRows_dgc", (DL_FUNC) &_harmony_scaleRows_dgc, 6},
    {"_harmony_find_lambda_cpp", (DL_FUNC) &_harmony_find_lambda_cpp, 2},
    {"_rcpp_module_boot_harmony_module", (DL_FUNC) &_rcpp_module_boot_harmony_module, 0},
    {NULL, NULL, 0}
};
65 | 
// Library initialisation hook for the "harmony" shared object: registers the
// routines listed in CallEntries (passed in the .Call slot; the other slots
// are NULL) and then calls R_useDynamicSymbols(dll, FALSE).
RcppExport void R_init_harmony(DllInfo *dll) {
    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
    R_useDynamicSymbols(dll, FALSE);
}
70 | 


--------------------------------------------------------------------------------
/src/harmony.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <chrono>
  3 | 
  4 | #include "harmony.h"
  5 | #include "types.h"
  6 | #include "utils.h"
  7 | 
  8 | 
  9 | 
 10 | 
 11 | harmony::harmony() :
 12 |     window_size(3),
 13 |     ran_setup(false),
 14 |     ran_init(false),
 15 |     lambda_estimation(false),
 16 |     verbose(false)
 17 |     
 18 | {}
 19 | 
 20 | 
 21 | 
 22 | void harmony::setup(const MATTYPE& __Z, const arma::sp_mat& __Phi,
 23 |                     const VECTYPE __sigma, const VECTYPE __theta, const VECTYPE __lambda, const float __alpha, const int __max_iter_kmeans,
 24 |                     const float __epsilon_kmeans, const float __epsilon_harmony,
 25 |                     const int __K, const float __block_size,
 26 |                     const std::vector<int>& __B_vec, const bool __verbose) {
 27 |     
 28 |   // Algorithm constants
 29 |   N = __Z.n_cols;
 30 |   B = __Phi.n_rows;
 31 |   d = __Z.n_rows;
 32 |   
 33 |   Z_orig = __Z;
 34 |   Z_cos = arma::normalise(__Z, 2, 0);
 35 |   Z_corr = zeros(size(Z_orig));
 36 | 
 37 |   
 38 |   Phi = __Phi;
 39 |   Phi_t = Phi.t();
 40 |   
 41 |   // Create index
 42 |   std::vector<unsigned>counters;
 43 |   arma::vec sizes(sum(Phi, 1));
 44 |   // std::cout << sizes << std::endl;
 45 |   for (unsigned i = 0; i < sizes.n_elem; i++) {
 46 |     arma::uvec a(int(sizes(i)));
 47 |     index.push_back(a);
 48 |     counters.push_back(0);
 49 |   }
 50 | 
 51 |   arma::sp_mat::const_iterator it =     Phi.begin();
 52 |   arma::sp_mat::const_iterator it_end = Phi.end();
 53 |   for(; it != it_end; ++it)
 54 |   {
 55 |     unsigned int row_idx = it.row();
 56 |     unsigned int col_idx = it.col();
 57 |     index[row_idx](counters[row_idx]++) = col_idx;
 58 |   }
 59 | 
 60 |   Pr_b = sum(Phi, 1) / N;
 61 | 
 62 |   
 63 |   epsilon_kmeans = __epsilon_kmeans;
 64 |   epsilon_harmony = __epsilon_harmony;
 65 | 
 66 |   // Hyperparameters
 67 |   K = __K;
 68 |   if (__lambda(0) == -1) {
 69 |     lambda_estimation = true;
 70 |   } else {
 71 |     lambda = __lambda;
 72 |   }
 73 |   B_vec = __B_vec;
 74 |   sigma = __sigma;
 75 | 
 76 |   if(__Z.n_cols < 6) {
 77 |     std::string error_message = "Refusing to run with less than 6 cells";
 78 |     Rcpp::stop(error_message);
 79 |   } else if (__Z.n_cols < 40) {
 80 |     Rcpp::warning("Too few cells. Setting block_size to 0.2");
 81 |     block_size = 0.2;
 82 |   } else {
 83 |     block_size = __block_size;
 84 |   } 
 85 |   theta = __theta;
 86 |   max_iter_kmeans = __max_iter_kmeans;
 87 | 
 88 |   verbose = __verbose;
 89 |   
 90 |   allocate_buffers();
 91 |   ran_setup = true;
 92 | 
 93 |   alpha = __alpha;
 94 |   
 95 |   
 96 | }
 97 | 
 98 | 
 99 | void harmony::allocate_buffers() {
100 |   
101 |   _scale_dist = zeros<MATTYPE>(K, N);
102 |   dist_mat = zeros<MATTYPE>(K, N);
103 |   O = E = zeros<MATTYPE>(K, B);
104 |   
105 |   // Hack: create matrix of ones by creating zeros and then add one!
106 |   arma::sp_mat intcpt = zeros<arma::sp_mat>(1, N);
107 |   intcpt = intcpt+1;
108 |   
109 |   Phi_moe = join_cols(intcpt, Phi);
110 |   Phi_moe_t = Phi_moe.t();
111 | 
112 | 
113 |   W = zeros<MATTYPE>(B + 1, d);
114 | }
115 | 
116 | 
117 | void harmony::init_cluster_cpp() {
118 | 
119 |   Y = kmeans_centers(Z_cos, K).t();
120 |   
121 |   // Cosine normalization of data centrods
122 |   Y = arma::normalise(Y, 2, 0);
123 | 
124 |   // (2) ASSIGN CLUSTER PROBABILITIES
125 |   // using a nice property of cosine distance,
126 |   // compute squared distance directly with cross product
127 |   dist_mat = 2 * (1 - Y.t() * Z_cos);
128 |   
129 |   R = -dist_mat;
130 |   R.each_col() /= sigma;
131 |   R = exp(R);
132 |   R.each_row() /= sum(R, 0);
133 |   
134 |   
135 |   // (3) BATCH DIVERSITY STATISTICS
136 |   E = sum(R, 1) * Pr_b.t();
137 |   O = R * Phi_t;
138 |   
139 |   compute_objective();
140 |   objective_harmony.push_back(objective_kmeans.back());
141 |   
142 |   dist_mat = 2 * (1 - Y.t() * Z_cos); // Z_cos was changed
143 | 
144 |   ran_init = true;
145 |   
146 | }
147 | 
148 | void harmony::compute_objective() {
149 |   const float norm_const = 2000/((float)N);
150 |   float kmeans_error = as_scalar(accu(R % dist_mat));  
151 |   float _entropy = as_scalar(accu(safe_entropy(R).each_col() % sigma)); // NEW: vector sigma
152 |   float _cross_entropy = as_scalar(
153 |       accu((R.each_col() % sigma) % ((arma::repmat(theta.t(), K, 1) % log((O + E) / E)) * Phi)));
154 | 
155 |   // Push back the data
156 |   objective_kmeans.push_back((kmeans_error + _entropy + _cross_entropy) * norm_const);
157 |   objective_kmeans_dist.push_back(kmeans_error * norm_const);
158 |   objective_kmeans_entropy.push_back(_entropy * norm_const);
159 |   objective_kmeans_cross.push_back(_cross_entropy * norm_const);
160 | }
161 | 
162 | 
163 | bool harmony::check_convergence(int type) {
164 |   float obj_new, obj_old;
165 |   switch (type) {
166 |     case 0: 
167 |       // Clustering 
168 |       // compute new window mean
169 |       obj_old = 0;
170 |       obj_new = 0;
171 |       for (unsigned i = 0; i < window_size; i++) {
172 |         obj_old += objective_kmeans[objective_kmeans.size() - 2 - i];
173 |         obj_new += objective_kmeans[objective_kmeans.size() - 1 - i];
174 |       }
175 |       if ((obj_old - obj_new) / abs(obj_old) < epsilon_kmeans) {
176 |         return(true); 
177 |       } else {
178 |         return(false);
179 |       }
180 |     case 1:
181 |       // Harmony
182 |       obj_old = objective_harmony[objective_harmony.size() - 2];
183 |       obj_new = objective_harmony[objective_harmony.size() - 1];
184 |       if ((obj_old - obj_new) / abs(obj_old) < epsilon_harmony) {
185 |         return(true);              
186 |       } else {
187 |         return(false);              
188 |       }
189 |   }
190 |   
191 |   // gives warning if we don't give default return value
192 |   return(true);
193 | }
194 | 
195 | 
196 | int harmony::cluster_cpp() {
197 |   int err_status = 0;
198 |   Progress p(max_iter_kmeans, verbose);
199 |   unsigned iter;
200 |   
201 |   // Z_cos has changed
202 |   // R has assumed to not change
203 |   // so update Y to match new integrated data  
204 |   for (iter = 0; iter < max_iter_kmeans; iter++) {
205 |       
206 |       p.increment();
207 |       if (Progress::check_abort())
208 | 	  return(-1);
209 |     
210 |       // STEP 1: Update Y (cluster centroids)
211 |       Y = arma::normalise(Z_cos * R.t(), 2, 0);
212 | 
213 |       dist_mat = 2 * (1 - Y.t() * Z_cos); // Y was changed
214 | 
215 |         
216 |       // STEP 3: Update R    
217 |       err_status = update_R();
218 |       if (err_status != 0) {
219 | 	  // Rcout << "Compute R failed. Exiting from clustering." << endl;
220 | 	  return err_status;
221 |       }
222 |     
223 |       // STEP 4: Check for convergence
224 |       compute_objective();
225 |     
226 |       if (iter > window_size) {
227 | 	  bool convergence_status = check_convergence(0);
228 | 	  if (convergence_status) {
229 | 	      iter++;
230 | 	      break;
231 | 	  }
232 |       }
233 |   }
234 |   
235 |   kmeans_rounds.push_back(iter);
236 |   objective_harmony.push_back(objective_kmeans.back());
237 |   return 0;
238 | }
239 | 
240 | 
241 | 
242 | 
243 | 
244 | 
245 | int harmony::update_R() {
246 | 
247 |   // Generate the 0,N-1 indices
248 |   uvec indices = linspace<uvec>(0, N - 1, N);
249 |   update_order = shuffle(indices);
250 |   
251 |   // Inverse index
252 |   uvec reverse_index(N, arma::fill::zeros);
253 |   reverse_index.rows(update_order) = indices;
254 |   
255 |   _scale_dist = -dist_mat; // K x N
256 |   _scale_dist.each_col() /= sigma; // NEW: vector sigma
257 |   _scale_dist = exp(_scale_dist);
258 |   _scale_dist = arma::normalise(_scale_dist, 1, 0);
259 | 
260 |   // GENERAL CASE: online updates, in blocks of size (N * block_size)
261 |   unsigned n_blocks = (int)(my_ceil(1.0 / block_size));
262 |   unsigned cells_per_block = unsigned(N * block_size);
263 |   
264 |   // Allocate new matrices
265 |   MATTYPE R_randomized = R.cols(update_order);
266 |   arma::sp_mat Phi_randomized(Phi.cols(update_order));
267 |   arma::sp_mat Phi_t_randomized(Phi_randomized.t());
268 |   MATTYPE _scale_dist_randomized = _scale_dist.cols(update_order);
269 |   
270 |   for (unsigned i = 0; i < n_blocks; i++) {
271 |     unsigned idx_min = i*cells_per_block;
272 |     unsigned idx_max = ((i+1) * cells_per_block) - 1; // - 1 because of submat
273 |     if (i == n_blocks-1) {
274 |       // we are in the last block, so include everything. Up to 19
275 |       // extra cells.
276 |       idx_max = N - 1;
277 |     }
278 | 
279 |     auto Rcells = R_randomized.submat(0, idx_min, R_randomized.n_rows - 1, idx_max);
280 |     auto Phicells = Phi_randomized.submat(0, idx_min, Phi_randomized.n_rows - 1, idx_max);
281 |     auto Phi_tcells = Phi_t_randomized.submat(idx_min, 0, idx_max, Phi_t_randomized.n_cols - 1);
282 |     auto _scale_distcells = _scale_dist_randomized.submat(0, idx_min, _scale_dist_randomized.n_rows - 1, idx_max);
283 | 
284 |     // Step 1: remove cells
285 |     E -= sum(Rcells, 1) * Pr_b.t();
286 |     O -= Rcells * Phi_tcells;
287 | 
288 |     // Step 2: recompute R for removed cells
289 |     Rcells = _scale_distcells;
290 |     Rcells = Rcells % (harmony_pow(E/(O + E), theta) * Phicells);
291 |     Rcells = normalise(Rcells, 1, 0); // L1 norm columns
292 | 
293 |     // Step 3: put cells back 
294 |     E += sum(Rcells, 1) * Pr_b.t();
295 |     O += Rcells * Phi_tcells;
296 |   }
297 |   this->R = R_randomized.cols(reverse_index);
298 |   return 0;
299 | }
300 | 
301 | 
// Correct the embeddings with one ridge regression per cluster.
//
// Starting from Z_corr = Z_orig, for every cluster k the regression
// coefficients W of the design Phi_moe (intercept row + batches), weighted by
// the soft assignments R.row(k), are computed; the intercept row of W is
// zeroed and W' * Phi_moe * diag(R_k) is subtracted from Z_corr. Finally
// Z_cos is refreshed as the column-normalised Z_corr.
//
// If the user aborts, the function returns immediately: Z_corr then holds the
// corrections of the clusters processed so far and Z_cos is not refreshed.
void harmony::moe_correct_ridge_cpp() {
  
  // N x N sparse matrix whose diagonal is overwritten with R.row(k) per cluster
  arma::sp_mat _Rk(N, N);
  // (B + 1) x (B + 1) sparse matrix whose diagonal holds the ridge penalties
  arma::sp_mat lambda_mat(B + 1, B + 1);

  if(!lambda_estimation) {
    // Fixed lambda: set the penalty once for all clusters
    lambda_mat.diag() = lambda;
  }
  Z_corr = Z_orig;
  Progress p(K, verbose);
  for (unsigned k = 0; k < K; k++) {
    p.increment();
    if (Progress::check_abort())
      return;
    if (lambda_estimation) {
      // Estimated lambda: recomputed for each cluster from row k of E
      lambda_mat.diag() = find_lambda_cpp(alpha, E.row(k).t());
    }
    _Rk.diag() = R.row(k);
    arma::sp_mat Phi_Rk = Phi_moe * _Rk;
    
    // Dense inverse of the penalised, R-weighted (B + 1) x (B + 1) matrix
    arma::mat inv_cov(arma::inv(arma::mat(Phi_Rk * Phi_moe_t + lambda_mat)));

    // Calculate R-scaled PCs once
    arma::mat Z_tmp = Z_orig.each_row() % R.row(k);
    
    // Generate the betas contribution of the intercept using the data
    // This erases whatever was written before in W
    W = inv_cov.unsafe_col(0) * sum(Z_tmp, 1).t();

    // Calculate betas by calculating each batch contribution
    for(unsigned b=0; b < B; b++) {
      // inv_cov is (B+1) x (B+1) whereas index has B entries, hence b+1
      W += inv_cov.unsafe_col(b+1) * sum(Z_tmp.cols(index[b]), 1).t();
    }
    
    W.row(0).zeros(); // do not remove the intercept
    Z_corr -= W.t() * Phi_Rk;
  }
  Z_cos = arma::normalise(Z_corr, 2, 0);
}
343 | 
344 | CUBETYPE harmony::moe_ridge_get_betas_cpp() {
345 |   CUBETYPE W_cube(B+1, d, K); // rows, cols, slices
346 | 
347 |   arma::sp_mat _Rk(N, N);
348 |   arma::sp_mat lambda_mat(B + 1, B + 1);
349 | 
350 |   if (!lambda_estimation) {
351 |     // Set lambda if we have to
352 |     lambda_mat.diag() = lambda;
353 |   }
354 | 
355 |   for (unsigned k = 0; k < K; k++) {
356 |       _Rk.diag() = R.row(k);
357 |       if (lambda_estimation){
358 |         lambda_mat.diag() = find_lambda_cpp(alpha, E.row(k).t());
359 |       }
360 |       arma::sp_mat Phi_Rk = Phi_moe * _Rk;
361 |       W_cube.slice(k) = arma::inv(arma::mat(Phi_Rk * Phi_moe_t + lambda_mat)) * Phi_Rk * Z_orig.t();
362 |   }
363 | 
364 |   return W_cube;
365 | }
366 | 
// Rcpp module "harmony_module": exposes the C++ class `harmony` to R under the
// name "harmony" with a default constructor, the data members listed with
// .field() and the member functions listed with .method().
RCPP_MODULE(harmony_module) {
  class_<harmony>("harmony")
      .constructor()
      // Embeddings: corrected, cosine-normalised and original
      .field("Z_corr", &harmony::Z_corr)
      .field("Z_cos", &harmony::Z_cos)
      .field("Z_orig", &harmony::Z_orig)
      // Batch design matrices
      .field("Phi", &harmony::Phi)
      .field("Phi_moe", &harmony::Phi_moe)
      // Problem dimensions
      .field("N", &harmony::N)
      .field("B", &harmony::B)
      .field("K", &harmony::K)
      .field("d", &harmony::d)
      // Clustering state
      .field("O", &harmony::O)
      .field("E", &harmony::E)
      .field("Y", &harmony::Y)
      .field("Pr_b", &harmony::Pr_b)
      .field("W", &harmony::W)
      .field("R", &harmony::R)
      // Hyperparameters
      .field("theta", &harmony::theta)
      .field("sigma", &harmony::sigma)
      .field("lambda", &harmony::lambda)
      // Iteration counts and objective histories
      .field("kmeans_rounds", &harmony::kmeans_rounds)
      .field("objective_kmeans", &harmony::objective_kmeans)
      .field("objective_kmeans_dist", &harmony::objective_kmeans_dist)
      .field("objective_kmeans_entropy", &harmony::objective_kmeans_entropy)
      .field("objective_kmeans_cross", &harmony::objective_kmeans_cross)
      .field("objective_harmony", &harmony::objective_harmony)
      .field("max_iter_kmeans", &harmony::max_iter_kmeans)
      // Member functions callable from R
      .method("check_convergence", &harmony::check_convergence)
      .method("setup", &harmony::setup)
      .method("compute_objective", &harmony::compute_objective)
      .method("init_cluster_cpp", &harmony::init_cluster_cpp)
      .method("cluster_cpp", &harmony::cluster_cpp)
      .method("moe_correct_ridge_cpp", &harmony::moe_correct_ridge_cpp)
      .method("moe_ridge_get_betas_cpp", &harmony::moe_ridge_get_betas_cpp)
      .field("B_vec", &harmony::B_vec)
      .field("alpha", &harmony::alpha)
      ;
}
406 | 


--------------------------------------------------------------------------------
/src/harmony.h:
--------------------------------------------------------------------------------
 1 | #include "types.h"
 2 | #include <Rcpp.h>
 3 | #include <iostream>
 4 | #include <vector>
 5 | #include <algorithm>
 6 | #include <progress.hpp>
 7 | 
 8 | using namespace Rcpp;
 9 | using namespace arma;
10 | // [[Rcpp::depends(RcppArmadillo)]]
11 | // [[Rcpp::depends(RcppProgress)]]
12 | 
13 | using namespace std;
14 | 
15 | class harmony;
16 | RCPP_EXPOSED_CLASS(harmony)
17 |   
18 |   
19 | #include "harmony_types.h"
20 |   
// State and operations of one Harmony integration run. Instances are created
// with the default constructor, configured with setup(), initialised with
// init_cluster_cpp(), and then driven by alternating cluster_cpp() and
// moe_correct_ridge_cpp().
class harmony { 
public:
  
  harmony();
  
  // Load data and hyperparameters, build the per-batch cell index and
  // allocate the work buffers.
  void setup(const MATTYPE& __Z, const arma::sp_mat& __Phi,
	     const VECTYPE __sigma, const VECTYPE __theta,
	     const VECTYPE __lambda, const float __alpha, const int __max_iter_kmeans,
	     const float __epsilon_kmeans, const float __epsilon_harmony,
	     const int __K, const float __block_size, 
	     const vector<int>& __B_vec, const bool __verbose);
  
  /* METHODS */
  // Ridge-regression correction of the embeddings (updates Z_corr and Z_cos)
  void moe_correct_ridge_cpp();
  // Ridge coefficients per cluster as a (B + 1) x d x K cube
  CUBETYPE moe_ridge_get_betas_cpp();
  // One clustering round; returns 0, -1 on abort, or update_R()'s status
  int cluster_cpp();

  // Initial centroids, assignments, E/O tables and first objective value
  void init_cluster_cpp();
  // Size the work buffers and build Phi_moe / Phi_moe_t
  void allocate_buffers();
  // Append the current objective and its components to the histories
  void compute_objective(); 
  // Block-wise update of the soft assignments R
  int update_R();
  // type 0: clustering convergence, type 1: Harmony convergence
  bool check_convergence(int type);
  // NOTE(review): declared here but no definition is visible in harmony.cpp -
  // confirm whether it is still needed.
  void setY(const MATTYPE& Z);

  /* FIELDS */
  // R: K x N soft assignments; Z_*: d x N embeddings (original, corrected,
  // cosine-normalised); Y: cluster centroids
  MATTYPE R, Z_orig, Z_corr, Z_cos, Y;
  // Phi: B x N batch membership; Phi_moe: Phi with an intercept row on top;
  // *_t: transposes. NOTE(review): Rk is not referenced in harmony.cpp.
  arma::sp_mat Phi, Phi_moe, Phi_moe_t, Phi_t, Rk;
  // Pr_b: row sums of Phi divided by N. NOTE(review): N_b is not referenced
  // in harmony.cpp.
  VECTYPE Pr_b, theta, N_b, sigma, lambda;

  // auxilary data structures
  // Objective histories: total, distance, entropy and cross-entropy parts per
  // clustering iteration, plus one value per Harmony round
  vector<float> objective_kmeans, objective_kmeans_dist, objective_kmeans_entropy, objective_kmeans_cross, objective_harmony;
  // kmeans_rounds: iterations used by each clustering round
  vector<int> kmeans_rounds, B_vec; // OLD: Kb
  // index[b]: column indices of the cells with an entry in row b of Phi
  std::vector<arma::uvec>index;
  
  float block_size, epsilon_kmeans, epsilon_harmony, alpha;
  // N cells, K clusters, B batches, d embedding dimensions
  unsigned int N, K, B, d, max_iter_kmeans, window_size;

  // buffers
  MATTYPE W, _scale_dist, dist_mat, O, E, dir_prior; // N_k, N_kb, N_b, numerator, denominator, C;
  uvec update_order, cells_update;
  

  // flags
  bool ran_setup, ran_init, lambda_estimation,  verbose; // do_merge_R;
  
};
67 | 
68 |   
69 | 


--------------------------------------------------------------------------------
/src/harmony_types.h:
--------------------------------------------------------------------------------
1 | #include "types.h"
2 | 
3 | 


--------------------------------------------------------------------------------
/src/types.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #define ARMA_64BIT_WORD
3 | #include <RcppArmadillo.h>
4 | 
5 | typedef arma::mat MATTYPE;
6 | typedef arma::vec VECTYPE;
7 | typedef arma::rowvec ROWVECTYPE;
8 | typedef arma::cube CUBETYPE;
9 | 


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils.h"
  2 | #include "types.h"
  3 | 
  4 | //[[Rcpp::export]]
  5 | arma::mat kmeans_centers(const arma::mat& X, const int K) {
  6 |   
  7 |   // Environment 
  8 |   Rcpp::Environment stats_env("package:stats");
  9 |   // Cast function as callable from C++
 10 |   Rcpp::Function kmeans = stats_env["kmeans"];
 11 |   // Call the function and receive its list output
 12 |   Rcpp::List res = kmeans(Rcpp::_["x"] = X.t(),
 13 |                           Rcpp::_["centers"] = K,
 14 |                           Rcpp::_["iter.max"] = 25,
 15 |                           Rcpp::_["nstart"] = 10
 16 |                           );
 17 |   return res["centers"];
 18 | }
 19 | 
 20 | 
 21 | MATTYPE safe_entropy(const MATTYPE& X) {
 22 |   MATTYPE A = X % log(X);
 23 |   A.elem(find_nonfinite(A)).zeros();
 24 |   return(A);
 25 | }
 26 | 
 27 | // Overload pow to work on a MATTYPErix and vector
 28 | MATTYPE harmony_pow(MATTYPE A, const VECTYPE& T) {
 29 | 
 30 |   for (unsigned c = 0; c < A.n_cols; c++) {
 31 |     A.unsafe_col(c) = pow(A.unsafe_col(c), as_scalar(T.row(c)));
 32 |   }
 33 |   return(A);
 34 | }
 35 | 
 36 | VECTYPE calculate_norm(const MATTYPE& M) {
 37 |   VECTYPE x(M.n_cols);
 38 |   for(unsigned i = 0; i < M.n_cols; i++){
 39 |     x(i) = norm(M.col(i));
 40 |   }
 41 |   return x;
 42 | }
 43 | 
 44 | 
 45 | //https://stackoverflow.com/questions/8377412/ceil-function-how-can-we-implement-it-ourselves
 46 | int my_ceil(float num) {
 47 |     int inum = (int)num;
 48 |     if (num == (float)inum) {
 49 |         return inum;
 50 |     }
 51 |     return inum + 1;
 52 | }
 53 | 
 54 | // Densify a matrix given in compressed-column form and z-score each of its rows.
 55 | // [[Rcpp::export]]
 56 | MATTYPE scaleRows_dgc(const VECTYPE& x, const VECTYPE& p, const VECTYPE& i, int ncol, int nrow, float thresh) {
 57 |     // x: stored values, i: row index of each stored value, p: column c owns positions [p[c], p[c + 1]).
 58 |     // (0) fill in non-zero elements
 59 |     MATTYPE res = arma::zeros<MATTYPE>(nrow, ncol);
 60 |     for (int c = 0; c < ncol; c++) {
 61 |         for (int j = p[c]; j < p[c + 1]; j++) {
 62 |             res(i[j], c) = x(j);
 63 |         }
 64 |     }
 65 | 
 66 |     // (1) compute means
 67 |     VECTYPE mean_vec = arma::zeros<VECTYPE>(nrow);
 68 |     for (int c = 0; c < ncol; c++) {
 69 |         for (int j = p[c]; j < p[c + 1]; j++) {
 70 |             mean_vec(i[j]) += x[j];
 71 |         }
 72 |     }
 73 |     mean_vec /= ncol; // divide by every column, so entries that are not stored count as zeros
 74 | 
 75 |     // (2) compute SDs
 76 |     VECTYPE sd_vec = arma::zeros<VECTYPE>(nrow);
 77 |     arma::uvec nz = arma::zeros<arma::uvec>(nrow);
 78 |     nz.fill(ncol); // starts at ncol and is decremented per stored entry: ends as the count of unstored entries per row
 79 |     for (int c = 0; c < ncol; c++) {
 80 |         for (int j = p[c]; j < p[c + 1]; j++) {
 81 |             sd_vec(i[j]) += (x[j] - mean_vec(i[j])) * (x[j] - mean_vec(i[j])); // (x - mu)^2
 82 |             nz(i[j])--;
 83 |         }
 84 |     }
 85 | 
 86 |     // account for the unstored zeros: each one contributes (0 - mu)^2 = mu^2
 87 |     for (int r = 0; r < nrow; r++) {
 88 |         sd_vec(r) += nz(r) * mean_vec(r) * mean_vec(r);
 89 |     }
 90 | 
 91 |     sd_vec = arma::sqrt(sd_vec / (ncol - 1)); // n - 1 denominator
 92 |     // NOTE(review): a row whose sd is 0 makes the division below produce NaN; confirm callers exclude such rows.
 93 |     // (3) scale values, then clip the result to [-thresh, thresh]
 94 |     res.each_col() -= mean_vec;
 95 |     res.each_col() /= sd_vec;
 96 |     res.elem(find(res > thresh)).fill(thresh);
 97 |     res.elem(find(res < -thresh)).fill(-thresh);
 98 |     return res;
 99 | }
100 | 
101 | // Returns a vector one longer than cluster_E: a leading 0 followed by alpha * cluster_E.
102 | // [[Rcpp::export]]
103 | arma::vec find_lambda_cpp(const float alpha, const arma::vec& cluster_E) {
104 |   arma::vec out = arma::zeros<arma::vec>(cluster_E.n_rows + 1);
105 |   out.subvec(1, out.n_rows - 1) = alpha * cluster_E;  // element 0 keeps its zero
106 |   return out;
107 | }
108 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "types.h"
 3 | #include <RcppArmadillo.h>
 4 | // Returns the "centers" element of a kmeans call made on the transpose of X with K centers.
 5 | arma::mat kmeans_centers(const arma::mat& X, const int K);
 6 | // Element-wise X * log(X) with non-finite entries replaced by 0.
 7 | MATTYPE safe_entropy(const MATTYPE& X);
 8 | // Raises column c of A element-wise to the power stored at position c of T.
 9 | MATTYPE harmony_pow(MATTYPE A, const VECTYPE& T);
10 | // One norm() value per column of M.
11 | VECTYPE calculate_norm(const MATTYPE& M);
12 | 
13 | // Hand-rolled integer ceiling of num.
14 | int my_ceil(float num);
15 | 
16 | // Vector with a leading 0 followed by alpha * cluster_E.
17 | arma::vec find_lambda_cpp(const float alpha, const arma::vec& cluster_E);
18 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(harmony)
3 | # Run the test files under tests/testthat/ against the harmony package.
4 | test_check("harmony")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_integration.R:
--------------------------------------------------------------------------------
 1 | context('Test main Harmony integration function: RunHarmony')
 2 | library(harmony)
 3 | data(cell_lines_small)
 4 | # Shared fixture: one short RunHarmony() run (return_object = TRUE) that the tests below inspect as `obj`.
 5 | obj <- RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'dataset',
 6 |                   theta = 1, nclust = 50, lambda = .1, max_iter = 5, return_object = TRUE,
 7 |                   verbose = FALSE, .options = harmony_options(max.iter.cluster = 10))
 8 | 
 9 | test_that('dimensions match in Harmony object data structures', {
10 |     latent_by_cell <- c(obj$d, obj$N)  # expected shape of both embeddings
11 |     expect_equal(dim(obj$Y), c(obj$d, obj$K))
12 |     for (z in list(obj$Z_corr, obj$Z_cos)) expect_equal(dim(z), latent_by_cell)
13 |     expect_equal(dim(obj$R), c(obj$K, obj$N))
14 | })
15 | 
16 | test_that('R defines proper probability distributions', {
17 |     bounds <- range(obj$R)  # all entries must lie in [0, 1]
18 |     expect_gte(bounds[1], 0); expect_lte(bounds[2], 1)
19 |     expect_equal(colSums(obj$R), rep(1, obj$N))  # every column sums to 1
20 | })
21 | 
22 | test_that('there are no null values in the corrected embedding', {
23 |     # neither embedding may contain infinite or NA/NaN entries
24 |     for (emb in list(obj$Z_corr, obj$Z_cos)) {
25 |         expect_false(any(is.infinite(emb)) || any(is.na(emb)))
26 |     }
27 | })
28 | 
29 | 
30 | test_that('increasing theta decreases chi2 between Cluster and Batch assign', {
31 |     run_with_theta <- function(theta) {
32 |         RunHarmony(cell_lines_small$scaled_pcs, cell_lines_small$meta_data, 'dataset',
33 |                    theta = theta, nclust = 20, lambda = .1, max_iter = 2, return_object = TRUE,
34 |                    verbose = FALSE, .options = harmony_options(max.iter.cluster = 5))
35 |     }
36 |     # chi-squared-style statistic computed from a fit's O and E matrices
37 |     chi2 <- function(fit) sum(((fit$O - fit$E) ^ 2) / fit$E)
38 | 
39 |     fit_theta0 <- run_with_theta(0)
40 |     fit_theta1 <- run_with_theta(1)
41 |     expect_gt(chi2(fit_theta0), chi2(fit_theta1))
42 | })
43 | 
44 | test_that('error messages work', {
45 |     pcs <- cell_lines_small$scaled_pcs
46 |     meta <- cell_lines_small$meta_data
47 | 
48 |     # grouping variable 'fake_variable' instead of 'dataset'
49 |     expect_error(RunHarmony(pcs, meta, 'fake_variable'))
50 | 
51 |     # two lambda values for a single grouping variable
52 |     expect_error(RunHarmony(pcs, meta, 'dataset', lambda = c(1,2)))
53 | 
54 |     # metadata with its last row dropped
55 |     expect_error(RunHarmony(pcs, head(meta, -1), 'dataset'))
56 | 
57 | })
58 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/vignettes/Seurat.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Using harmony in Seurat"
  3 | output:
  4 |   rmarkdown::html_vignette:
  5 |     code_folding: show
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Using harmony in Seurat}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r, include = FALSE}
 13 | knitr::opts_chunk$set(
 14 |   comment = "#>",   # prefix printed output lines with "#>"
 15 |   collapse = TRUE   # show code and its output in one block
 16 | )
 17 | ```
 18 | 
 19 | ```{r setup, message=FALSE, warning=FALSE}
 20 | library(harmony)
 21 | library(Seurat)
 22 | library(dplyr)
 23 | library(cowplot)
 24 | 
 25 | ```
 26 | # Introduction
 27 | 
 28 | This tutorial describes how to use harmony in Seurat v5 single-cell analysis workflows. `RunHarmony()` is a generic function designed to interact with Seurat objects. This vignette will walk through the basic workflow of Harmony with Seurat objects. It will also provide some basic downstream analyses demonstrating the properties of harmonized cell embeddings and a brief explanation of the exposed algorithm parameters.
 29 | 
 30 | Install Harmony from CRAN with standard commands.
 31 | 
 32 | ```{r eval=FALSE}
 33 | install.packages('harmony')  # CRAN release; this chunk is shown but not evaluated
 34 | ```
 35 | 
 36 | # Generating the dataset
 37 | 
 38 | For this demo, we will be aligning two groups of PBMCs [Kang et al., 2017](https://doi.org/10.1038/nbt.4042). In this experiment, PBMCs are in stimulated and control conditions. The stimulated PBMC group was treated with interferon beta.
 39 | 
 40 | 
 41 | 
 42 | 
 43 | 
 44 | ## Generate SeuratObject
 45 | 
 46 | ```{r}
 47 | ## Load the example data, which supplies `pbmc.stim` and `pbmc.ctrl`
 48 | data("pbmc_stim")
 49 | counts_merged <- cbind(pbmc.stim, pbmc.ctrl)
 50 | pbmc <- CreateSeuratObject(counts = counts_merged, project = "PBMC", min.cells = 5)
 51 | 
 52 | ## Label each cell with its condition, in the column order of the merged counts
 53 | pbmc@meta.data$stim <- rep(c("STIM", "CTRL"), times = c(ncol(pbmc.stim), ncol(pbmc.ctrl)))
 54 | ```
 55 | 
 56 | 
 57 | ## (Optional) Download original data
 58 | The example above contains only two thousand cells. The full [Kang et al., 2017](https://doi.org/10.1038/nbt.4042) dataset is deposited in the [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE96583). This analysis uses GSM2560248 and GSM2560249 samples from [GSE96583_RAW.tar](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file) file and the [GSE96583_batch2.genes.tsv.gz](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE96583&format=file&file=GSE96583%5Fbatch2%2Egenes%2Etsv%2Egz) gene file.
 59 | 
 60 | ```{r eval = FALSE, class.source='fold-hide'}
 61 | library(Matrix)
 62 | ## Download and extract files from GEO
 63 | ##setwd("/path/to/downloaded/files")
 64 | genes <- read.table("GSE96583_batch2.genes.tsv.gz", header = FALSE, sep = "\t")
 65 | ## Control sample: Matrix::readMM() reads the MatrixMarket counts; barcodes get the suffix "-1"
 66 | pbmc.ctrl.full <- readMM("GSM2560248_2.1.mtx.gz")
 67 | colnames(pbmc.ctrl.full) <- paste0(read.table("GSM2560248_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-1")
 68 | rownames(pbmc.ctrl.full) <- genes$V1
 69 | ## Stimulated sample: same steps, barcodes get the suffix "-2"
 70 | pbmc.stim.full <- readMM("GSM2560249_2.2.mtx.gz")
 71 | colnames(pbmc.stim.full) <- paste0(read.table("GSM2560249_barcodes.tsv.gz", header = FALSE, sep = "\t")[,1], "-2")
 72 | rownames(pbmc.stim.full) <- genes$V1
 73 | 
 74 | library(Seurat)
 75 | 
 76 | pbmc <- CreateSeuratObject(counts = cbind(pbmc.stim.full, pbmc.ctrl.full), project = "PBMC", min.cells = 5)
 77 | pbmc@meta.data$stim <- c(rep("STIM", ncol(pbmc.stim.full)), rep("CTRL", ncol(pbmc.ctrl.full)))
 78 | ```
 79 | 
 80 | 
 81 | 
 82 | # Running Harmony
 83 | 
 84 | Harmony works on an existing matrix with cell embeddings and outputs its transformed version with the datasets aligned according to some user-defined experimental conditions. By default, harmony will look up the `pca` cell embeddings and use these to run harmony. Therefore, it assumes that the Seurat object has these embeddings already precomputed.
 85 | 
 86 | ## Calculate PCA cell embeddings
 87 | 
 88 | Here, after normalizing the data with `Seurat::NormalizeData()`, we will generate a union of the highly variable genes found within each condition (the control and stimulated cells) using `Seurat::FindVariableFeatures()`. These features are subsequently used to generate the 20 PCs with `Seurat::RunPCA()`.
 89 | 
 90 | ```{r}
 91 | pbmc <- NormalizeData(pbmc, verbose = FALSE)
 92 | 
 93 | ## Union of the 2000 variable features selected separately within each condition
 94 | cells_by_condition <- split(row.names(pbmc@meta.data), pbmc@meta.data$stim)
 95 | VariableFeatures(pbmc) <- cells_by_condition %>%
 96 |     lapply(function(cells_use) {
 97 |         VariableFeatures(FindVariableFeatures(pbmc[,cells_use], selection.method = "vst", nfeatures = 2000))
 98 |     }) %>% unlist %>% unique
 99 | 
100 | pca_features <- VariableFeatures(pbmc)
101 | pbmc <- ScaleData(pbmc, verbose = FALSE)
102 | pbmc <- RunPCA(pbmc, features = pca_features, npcs = 20, verbose = FALSE)
103 | ```
104 | 
105 | ## Perform an integrated analysis
106 | 
107 | To run harmony on a Seurat object after it has been normalized, only one argument needs to be specified: the name of the batch covariate located in the metadata. For this vignette, further parameters are specified to align the dataset, but the minimum parameters are shown in the snippet below:
108 | 
109 | ```{r, eval=FALSE}
110 | ## Run harmony with default parameters, integrating over the "stim" metadata column
111 | pbmc <- pbmc %>% RunHarmony("stim")
112 | ## The piped form above is equivalent to the direct call:
113 | pbmc <- RunHarmony(pbmc, "stim")
114 | ```
115 | 
116 | Here, we will be running harmony with some indicative parameters and plotting the convergence plot to illustrate some of the under the hood functionality.
117 | 
118 | ```{r, fig.width = 4, fig.height = 3, fig.align = "center", out.width="50%", fig.cap="By setting `plot_converge=TRUE`, harmony will generate a plot with its objective showing the flow of the integration. Each point represents the cost measured after a clustering round. Different colors represent different Harmony iterations which is controlled by `max_iter` (assuming that early_stop=FALSE). Here `max_iter=10` and up to 10 correction steps are expected. However, `early_stop=TRUE` so harmony will stop after the cost plateaus."}
119 | ## Integrate over "stim", plot the objective, and stop early once it plateaus
120 | pbmc <- pbmc %>%
121 |     RunHarmony("stim", plot_convergence = TRUE, nclust = 50, max_iter = 10, early_stop = TRUE)
122 | ```
123 | 
124 | 
125 | 
126 | ### Harmony API parameters on Seurat objects
127 | 
128 | `RunHarmony` has several parameters accessible to users which are outlined below.
129 | 
130 | #### `object` (required)
131 | 
132 | The Seurat object. This vignette assumes Seurat objects are version 5.
133 | 
134 | #### `group.by.vars` (required)
135 | 
136 | A character vector that specifies all the experimental covariates to be corrected/harmonized by the algorithm.
137 | 
138 | When using `RunHarmony()` with Seurat, harmony will look up the `group.by.vars` metadata fields in the Seurat Object metadata.
139 | 
140 | For example, given the `pbmc[["stim"]]` exists as the stim condition, setting `group.by.vars="stim"` will perform integration of these samples accordingly. If you want to integrate on another variable, it needs to be present in Seurat object's meta.data.
141 | 
142 | To correct for several covariates, specify them in a vector: `group.by.vars = c("stim", "new_covariate")`.
143 | 
144 | #### `reduction.use`
145 | 
146 | The cell embeddings to be used for the batch alignment. This parameter assumes that a reduced dimension already exists in the reduction slot of the Seurat object.  By default, the `pca` reduction is used.
147 | 
148 | 
149 | #### `dims.use`
150 | 
151 | Optional parameter which can use a name vector to select specific dimensions to be harmonized.
152 | 
153 | 
154 | ### Algorithm parameters
155 | ![Harmony Algorithm Overview](main.jpg){width=100%}
156 | 
157 | #### `nclust`
158 | 
159 | is a positive integer. Under the hood, harmony applies k-means soft-clustering. For this task, `k` needs to be determined. `nclust` corresponds to `k`. The harmonization results and performance are not particularly sensitive for a reasonable range of this parameter value. If this parameter is not set, harmony will autodetermine this based on the dataset size with a maximum cap of 200. For datasets with a large number of different cell types and batches, this parameter may need to be determined manually.
160 | 
161 | #### `sigma`
162 | 
163 | a positive scalar that controls the soft clustering probability assignment of single-cells to different clusters. Larger values will assign a larger probability to distant clusters of cells resulting in a different correction profile. Single-cells are assigned to clusters by their euclidean distance $d$ to some cluster center $Y$ after cosine normalization which is defined in the range [0,4]. The clustering probability of each cell is calculated as $e^{-\frac{d}{\sigma}}$ where $\sigma$ is controlled by the `sigma` parameter. Default value of `sigma` is 0.1 and it generally works well since it defines probability assignment of a cell in the range $[e^{-40}, e^0]$. Larger values of `sigma` restrict the dynamic range of probabilities that can be assigned to cells. For example, `sigma=1` will yield probabilities in the range of $[e^{-4}, e^0]$.
164 | 
165 | 
166 | #### `theta`
167 | 
168 | `theta` is a positive scalar vector that determines the coefficient of harmony's diversity penalty for each corrected experimental covariate. In challenging experimental conditions, increasing theta may result in better integration results. Theta is an exponential parameter of the diversity penalty, thus setting `theta=0` disables this penalty while increasing it to values greater than 1 will perform more aggressive corrections in an exponential manner. By default, it will set `theta=2` for each experimental covariate.
169 | 
170 | #### `max_iter`
171 | 
172 | The number of correction steps harmony will perform before completing the data set integration. In general, more iterations than necessary increase computational runtime, which becomes especially evident in bigger datasets. Setting `early_stop=TRUE` may reduce the actual number of correction steps, which will then be smaller than `max_iter`.
173 | 
174 | #### `early_stop`
175 | 
176 | Under the hood, harmony minimizes its objective function through a series of clustering and integration steps. By setting `early_stop=TRUE`, harmony exits before reaching the `max_iter` correction steps when the change of the objective function after a correction step is less than `1e-4`. This parameter can drastically reduce run-time in bigger datasets. 
177 | 
178 | #### `.options`
179 | A set of internal algorithm parameters that can be overridden. For advanced users only.
180 | 
181 | 
182 | 
183 | ### Seurat specific parameters
184 | 
185 | These parameters are Seurat-specific and do not affect the flow of the algorithm.
186 | 
187 | #### `project_dim`
188 | 
189 | Toggle-like parameter, by default `project_dim=TRUE`. When enabled, `RunHarmony()` calculates genomic feature loadings using Seurat's `ProjectDim()` that correspond to the harmonized cell embeddings.
190 | 
191 | #### `reduction.save`
192 | 
193 | The new Reduced Dimension slot identifier. By default, `reduction.save="harmony"`. This option allows several independent runs of harmony to be retained in the appropriate slots in the SeuratObjects. It is useful if you want to try Harmony with multiple parameters and save them as e.g. 'harmony_theta0', 'harmony_theta1', 'harmony_theta2'.
194 | 
195 | ### Miscellaneous parameters
196 | 
197 | These parameters help users troubleshoot harmony. 
198 | 
199 | #### `plot_convergence`
200 | 
201 | Option that plots the convergence plot after the execution of the algorithm. By default `FALSE`. Setting it to `TRUE` will collect harmony's objective value and plot it allowing the user to troubleshoot the flow of the algorithm and fine-tune the parameters of the dataset integration procedure.
202 | 
203 | 
204 | 
205 | ### Accessing the data
206 | 
207 | `RunHarmony()` returns the Seurat object which contains the harmonized cell embeddings in a slot named **harmony**. This entry can be accessed via `pbmc@reductions$harmony`. To access the values of the cell embeddings we can also use:
208 | 
209 | ```{r}
210 | harmony.embeddings <- Embeddings(pbmc, reduction = "harmony")  # embeddings stored under the "harmony" reduction
211 | ```
212 | 
213 | ### Inspection of the modalities
214 | 
215 | After Harmony integration, we should inspect the quality of the harmonization and contrast it with the unharmonized algorithm input. Ideally, cells from different conditions will align along the Harmonized PCs. If they are not, you could increase the *theta* value above to force a more aggressive fit of the dataset and rerun the workflow.
216 | 
217 | ```{r, fig.width=7, fig.height=3, out.width="100%", fig.align="center", fig.cap="Evaluate harmonization of stim parameter in the harmony generated cell embeddings"}
218 | ## Harmony embedding and the distribution of harmony_1, both grouped by "stim"
219 | stim_scatter <- DimPlot(object = pbmc, reduction = "harmony", group.by = "stim", pt.size = 0.1)
220 | stim_violin <- VlnPlot(object = pbmc, features = "harmony_1", group.by = "stim", pt.size = 0.1)
221 | plot_grid(stim_scatter, stim_violin)
222 | ```
223 | 
224 | Plot Genes correlated with the Harmonized PCs
225 | 
226 | ```{r, fig.width = 6, fig.height=3, out.width="100%"}
227 | ## Heatmap for harmony dimensions 1 to 3, drawn with cells = 500
228 | DimHeatmap(object = pbmc, reduction = "harmony", cells = 500, dims = 1:3)
229 | ```
230 | 
231 | # Using harmony embeddings for dimensionality reduction in Seurat
232 | 
233 | The harmonized cell embeddings generated by harmony can be used for further integrated analyses. In this workflow, we pass the name of the harmony reduction through the `reduction` argument of each Seurat method that requires it.
234 | 
235 | ## Perform clustering using the harmonized vectors of cells
236 | ```{r}
237 | ## Find neighbors on the harmony reduction, then cluster at resolution 0.5
238 | pbmc <- FindNeighbors(pbmc, reduction = "harmony")
239 | pbmc <- FindClusters(pbmc, resolution = 0.5)
240 | ```
241 | ## TSNE dimensionality reduction
242 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="t-SNE Visualization of harmony embeddings"}
243 | ## t-SNE computed from the harmony reduction
244 | pbmc <- RunTSNE(pbmc, reduction = "harmony")
245 | 
246 | ## Left: cells grouped by "stim"; right: default grouping with labels drawn
247 | tsne_by_stim <- DimPlot(pbmc, reduction = "tsne", group.by = "stim", pt.size = 0.1)
248 | tsne_labeled <- DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.1)
249 | plot_grid(tsne_by_stim, tsne_labeled)
250 | 
251 | ```
252 | 
253 | It is important to check that the harmonized data still capture the biological states of the cells. By inspecting the expression of the following genes, we can see that biological cell states are preserved after harmonization.
254 | 
255 | ```{r, fig.width = 7, fig.height = 7, out.width="100%", fig.cap="Expression of gene panel heatmap in the harmonized PBMC dataset"}
256 | marker_genes <- c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP")
257 | FeaturePlot(object = pbmc, features = marker_genes, min.cutoff = "q9", cols = c("lightgrey", "blue"), pt.size = 0.5)
258 | 
259 | ```
260 | 
261 | ## UMAP
262 | 
263 | As with t-SNE, we can run UMAP by passing the harmony reduction to the function.
264 | 
265 | ```{r, fig.width=5, fig.height=2.5, fig.align="center", fig.cap="UMAP Visualization of harmony embeddings"}
266 | ## UMAP on dimensions 1 to 20 of the harmony reduction
267 | pbmc <- RunUMAP(pbmc, reduction = "harmony", dims = 1:20)
268 | 
269 | umap_by_stim <- DimPlot(pbmc, reduction = "umap", group.by = "stim", pt.size = 0.1)
270 | umap_labeled <- DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.1)
271 | plot_grid(umap_by_stim, umap_labeled)
272 | 
273 | ```
274 | 
275 | 
276 | ```{r}
277 | sessionInfo()  # print R and package versions used to build this vignette
278 | ```
279 | 
280 | 


--------------------------------------------------------------------------------
/vignettes/main.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/immunogenomics/harmony/b36bab002c1767af6e665c81f186b40a87870e64/vignettes/main.jpg


--------------------------------------------------------------------------------
/vignettes/quickstart.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Quick start to Harmony"
  3 | author: "Korsunsky et al.: Fast, sensitive, and accurate integration of single 
  4 | cell data with Harmony"
  5 | output:
  6 |   rmarkdown::html_vignette:
  7 |     code_folding: show
  8 | vignette: >
  9 |     %\VignetteIndexEntry{Quick start to Harmony}
 10 |     %\VignetteEngine{knitr::rmarkdown}
 11 |     %\VignetteEncoding{UTF-8} 
 12 | ---
 13 |   
 14 | 
 15 | # Introduction
 16 | 
 17 | Harmony is an algorithm for performing integration of single cell genomics
 18 | datasets. Please check out our latest 
 19 | [manuscript on Nature Methods](https://www.nature.com/articles/s41592-019-0619-0). 
 20 | 
 21 | ![](main.jpg){width=100%}
 22 | 
 23 | 
 24 | # Installation
 25 | 
 26 | Install Harmony from CRAN with standard commands.
 27 | 
 28 | ```{r eval=FALSE}
 29 | install.packages('harmony')  # CRAN release; this chunk is shown but not evaluated
 30 | ```
 31 | 
 32 | Once Harmony is installed, load it up! 
 33 | 
 34 | ```{r}
 35 | library(harmony)
 36 | ```
 37 | 
 38 | 
 39 | # Integrating cell line datasets from 10X
 40 | 
 41 | The example below follows Figure 2 in the manuscript. 
 42 | 
 43 | We downloaded 3 cell line datasets from the 10X website. The first two (jurkat
 44 | and 293t) come from pure cell lines while the *half* dataset is a 50:50
 45 | mixture of Jurkat and HEK293T cells. We inferred cell type with the canonical 
 46 | marker XIST, since the two cell lines come from 1 male and 1 female donor. 
 47 | 
 48 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat
 49 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t
 50 | * support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/jurkat:293t_50:50
 51 | 
 52 | We library normalized the cells, log transformed the counts, and scaled the 
 53 | genes. Then we performed PCA and kept the top 20 PCs. The PCA embeddings and 
 54 | meta data are available as part of this package. 
 55 | 
 56 | ```{r}
 57 | data(cell_lines)  # provides the `cell_lines` object used below
 58 | meta_data <- cell_lines[["meta_data"]]  # per-cell annotations
 59 | V <- cell_lines[["scaled_pcs"]]         # PCA embeddings
 60 | 
 61 | ```
 62 | 
 63 | 
 64 | Initially, the cells cluster by both dataset (left) and cell type (right). 
 65 | 
 66 | ```{r class.source='fold-hide', fig.width=5, fig.height=3, fig.align="center"}
 67 | 
 68 | library(ggplot2)
 69 | 
 70 | do_scatter <- function(xy, meta_data, label_name, base_size = 12) {
 71 |     ## Scatter plot of the first two columns of `xy`, colored by the
 72 |     ## `label_name` column of `meta_data`, with one text label per group
 73 |     ## placed at the group's mean position.
 74 |     label_sym <- rlang::sym(label_name)
 75 |     palette_use <- c(`jurkat` = '#810F7C', `t293` = '#D09E2D',`half` = '#006D2C')
 76 |     plt_df <- cbind(setNames(data.frame(xy[, 1:2]), c('X1', 'X2')), meta_data)
 77 | 
 78 |     ## Group means used to position the labels
 79 |     data_labels <- plt_df %>%
 80 |         dplyr::group_by(!!label_sym) %>%
 81 |         dplyr::summarise(X1 = mean(X1), X2 = mean(X2)) %>%
 82 |         dplyr::ungroup()
 83 | 
 84 |     ggplot(plt_df, aes(X1, X2, col = !!label_sym, fill = !!label_sym)) +
 85 |         theme_test(base_size = base_size) +
 86 |         guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) +
 87 |         scale_color_manual(values = palette_use) +
 88 |         scale_fill_manual(values = palette_use) +
 89 |         theme(plot.title = element_text(hjust = .5), legend.position = "none") +
 90 |         labs(x = "PC 1", y = "PC 2") +
 91 |         geom_point(shape = '.') +
 92 |         geom_label(data = data_labels, aes(label = !!label_sym), color = "white", size = 4)
 93 | }
 94 | ## The same embedding twice: colored by dataset and by cell type
 95 | before_dataset <- do_scatter(V, meta_data, 'dataset') + labs(title = 'Colored by dataset')
 96 | before_cell_type <- do_scatter(V, meta_data, 'cell_type') + labs(title = 'Colored by cell type')
 97 | 
 98 | cowplot::plot_grid(before_dataset, before_cell_type)
 99 | 
100 | 
101 | ```
102 | 
103 | Let's run Harmony to remove the influence of dataset-of-origin from the cell
104 | embeddings.
105 | 
106 | ```{r}
107 | ## Run Harmony on the PCs, grouping cells by the `dataset` column of the metadata
108 | harmony_embeddings <- harmony::RunHarmony(V, meta_data, 'dataset', verbose = FALSE)
109 | 
110 | 
111 | ```
112 | 
113 | After Harmony, the datasets are now mixed (left) and the cell types are still
114 | separate (right). 
115 | 
116 | ```{r, fig.width=5, fig.height=3, fig.align="center"}
117 | ## The same two views, now drawn from the Harmony output
118 | after_dataset <- do_scatter(harmony_embeddings, meta_data, 'dataset') + labs(title = 'Colored by dataset')
119 | after_cell_type <- do_scatter(harmony_embeddings, meta_data, 'cell_type') + labs(title = 'Colored by cell type')
120 | 
121 | cowplot::plot_grid(after_dataset, after_cell_type, nrow = 1)
122 | 
123 | ```
124 | 
125 | # Next Steps
126 | 
127 | ## Interfacing to software packages
128 | 
129 | You can also run Harmony as part of an established pipeline in several packages, such as Seurat. For these vignettes, please [visit our github page](https://github.com/immunogenomics/harmony/).
130 | 
131 | 
132 | ## Detailed breakdown of the Harmony algorithm
133 | 
134 | For more details on how each part of Harmony works, consult our more detailed
135 | [vignette](https://htmlpreview.github.io/?https://github.com/immunogenomics/harmony/blob/master/doc/detailedWalkthrough.html)
136 | "Detailed Walkthrough of Harmony Algorithm".
137 | 
138 | # Session Info
139 | 
140 | ```{r}
141 | sessionInfo()  # print R and package versions used to build this vignette
142 | 
143 | ```
144 | 


--------------------------------------------------------------------------------