├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── RcppExports.R ├── align.R ├── bulk_de_wrappers.R ├── differential_geometry.R ├── estimate_linear_coefficients.R ├── find_de_neighborhoods.R ├── geodesic_regression.R ├── glioblastoma_example_data.R ├── handle_data_parameter.R ├── handle_design_parameter.R ├── handle_test_data_parameter.R ├── harmony_wrapper.R ├── lemur-package.R ├── lemur.R ├── lemur_fit.R ├── parse_contrasts.R ├── pca.R ├── predict.R ├── project_on_fit.R ├── recursive_least_squares.R ├── ridge_regression.R ├── test_de.R └── util.R ├── README.Rmd ├── README.md ├── data └── glioblastoma_example_data.rda ├── inst └── CITATION ├── man ├── align_harmony.Rd ├── align_impl.Rd ├── cash-lemur_fit-method.Rd ├── figures │ ├── README-fig-Neighborhood_size_vs_significance-1.png │ ├── README-fig-lemur_umap-1.png │ ├── README-fig-raw_umap-1.png │ ├── README-fig-tumor_cell_annotation1-1.png │ ├── README-fig-tumor_cell_annotation2-1.png │ ├── README-fig-tumor_de_neighborhood_plot-1.png │ ├── README-fig-umap_de-1.png │ ├── README-fig-umap_de-2.png │ ├── README-fig-umap_de2-1.png │ ├── README-fig-umap_de3-1.png │ ├── README-fig-volcano_plot-1.png │ ├── equation_schematic.png │ └── lemur-art.jpg ├── find_de_neighborhoods.Rd ├── fold_left.Rd ├── glioblastoma_example_data.Rd ├── grapes-zero_dom_mat_mult-grapes.Rd ├── grassmann_geodesic_regression.Rd ├── grassmann_lm.Rd ├── harmony_new_object.Rd ├── lemur.Rd ├── lemur_fit.Rd ├── mply_dbl.Rd ├── one_hot_encoding.Rd ├── predict.lemur_fit.Rd ├── project_on_lemur_fit.Rd ├── pseudoinverse.Rd ├── recursive_least_squares.Rd ├── reexports.Rd ├── residuals-lemur_fit-method.Rd ├── ridge_regression.Rd ├── stack_slice.Rd ├── test_de.Rd └── test_global.Rd ├── src ├── .gitignore ├── Makevars ├── Makevars.win ├── RcppExports.cpp └── util.cpp ├── tests ├── testthat.R └── testthat │ ├── helper.R │ ├── test-align.R │ ├── test-differential_geometry.R │ ├── test-find_de_neighborhoods.R │ ├── test-geodesic_regression.R │ ├── test-lemur.R │ ├── test-parse_contrasts.R │ ├── test-pca.R │ ├── test-recursive_least_squares.R │ ├── test-ridge_penalty.R │ ├── test-test_de.R │ └── test-util.R └── vignettes ├── .gitignore ├── Introduction.qmd └── man └── figures └── equation_schematic.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^man/.*\.ai$ 5 | ^man/figures/README-.*$ 6 | ^vignettes/man/figures/README-.*$ 7 | ^doc$ 8 | ^Meta$ 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rproj 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | .Ruserdata 6 | tmp_scripts 7 | inst/doc 8 | *.DS_Store 9 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: lemur 2 | Type: Package 3 | Title: Latent Embedding Multivariate Regression 4 | Version: 1.3.6 5 | Authors@R: person("Constantin", "Ahlmann-Eltze", email = "artjom31415@googlemail.com", 6 | role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3762-068X")) 7 | Description: Fit a latent embedding multivariate regression (LEMUR) model to multi-condition 8 | single-cell data. The model provides a parametric description of single-cell data measured 9 | with treatment vs. 
control or more complex experimental designs. 10 | The parametric model is used to (1) align conditions, (2) predict 11 | log fold changes between conditions for all cells, and (3) identify cell neighborhoods with 12 | consistent log fold changes. For those neighborhoods, a pseudobulked differential expression test 13 | is conducted to assess which genes are significantly changed. 14 | URL: https://github.com/const-ae/lemur 15 | BugReports: https://github.com/const-ae/lemur/issues 16 | License: MIT + file LICENSE 17 | Encoding: UTF-8 18 | LazyData: false 19 | Imports: 20 | stats, 21 | utils, 22 | irlba, 23 | methods, 24 | SingleCellExperiment, 25 | SummarizedExperiment, 26 | rlang (>= 1.1.0), 27 | vctrs (>= 0.6.0), 28 | glmGamPoi (>= 1.12.0), 29 | BiocGenerics, 30 | S4Vectors, 31 | Matrix, 32 | DelayedMatrixStats, 33 | HDF5Array, 34 | MatrixGenerics, 35 | matrixStats, 36 | Rcpp, 37 | harmony (>= 1.2.0), 38 | limma, 39 | BiocNeighbors 40 | Suggests: 41 | testthat (>= 3.0.0), 42 | tidyverse, 43 | uwot, 44 | dplyr, 45 | edgeR, 46 | knitr, 47 | quarto, 48 | BiocStyle 49 | LinkingTo: 50 | Rcpp, 51 | RcppArmadillo 52 | Depends: 53 | R (>= 4.1) 54 | Config/testthat/edition: 3 55 | Roxygen: list(markdown = TRUE) 56 | RoxygenNote: 7.3.1 57 | biocViews: Transcriptomics, DifferentialExpression, SingleCell, DimensionReduction, Regression 58 | VignetteBuilder: 59 | quarto 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: lemur authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 lemur authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(.DollarNames,lemur_fit) 4 | S3method(predict,lemur_fit) 5 | export(.lemur_fit) 6 | export(align_by_grouping) 7 | export(align_harmony) 8 | export(find_de_neighborhoods) 9 | export(lemur) 10 | export(project_on_lemur_fit) 11 | export(test_de) 12 | export(vars) 13 | exportClasses(lemur_fit) 14 | exportMethods("[") 15 | exportMethods(design) 16 | exportMethods(residuals) 17 | import(SingleCellExperiment, except = weights) 18 | import(methods) 19 | import(stats) 20 | importFrom(BiocGenerics,design) 21 | importFrom(Matrix,t) 22 | importFrom(Rcpp,sourceCpp) 23 | importFrom(S4Vectors,DataFrame) 24 | importFrom(S4Vectors,`metadata<-`) 25 | importFrom(S4Vectors,metadata) 26 | importFrom(SingleCellExperiment,reducedDims) 27 | importFrom(SummarizedExperiment,`assay<-`) 28 | importFrom(SummarizedExperiment,`colData<-`) 29 | importFrom(SummarizedExperiment,`rowData<-`) 30 | importFrom(SummarizedExperiment,assay) 31 | importFrom(SummarizedExperiment,assayNames) 32 | importFrom(SummarizedExperiment,colData) 33 | importFrom(SummarizedExperiment,rowData) 34 | importFrom(glmGamPoi,vars) 35 | importFrom(utils,.DollarNames) 36 | importFrom(utils,capture.output) 37 | importFrom(utils,head) 38 | importFrom(utils,setTxtProgressBar) 39 | importFrom(utils,txtProgressBar) 40 | useDynLib(lemur, .registration = TRUE) 41 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # v1.3 2 | 3 | * `lemur` now automatically inserts the variables from the design formula into 4 | the `group_by` argument for `find_de_neighborhoods`. (thanks Katha for pushing for this feature) 5 | * The formula parsing automatically detects global variables and adds them to the 6 | colData. This avoids problems with the random test / training assignment. 7 | * Duplicate column names in colData are now longer allowed. 8 | * Require harmony version >= 1.2.0 (thanks Maija for reporting the problem) 9 | 10 | 11 | # v1.1 12 | 13 | * Make `predict` function faster and less memory intensive for subset fits. 14 | * Speed-up internal function `get_groups` 15 | * Gracefully handle duplicated column names in `colData(fit)` 16 | * Give better error message in `test_de` if `cond(..)` is used for a fit 17 | that was not specified with a design formula (thanks @MaximilianNuber for reporting) 18 | 19 | # v1.0 20 | 21 | * Bug fix in subsetting logic affecting `predict` and `test_de`. The problem occured if a `fit` object was 22 | subsetted with indices or gene names and the order changed, and resulted in a wrong order of the predictions. 23 | 24 | # v0.99.1-0.99.8 25 | 26 | * Submission to Bioconductor, thus the jump in version number. 27 | * Adjusted internal code to handle breaking changes in `harmony` v1.0.0. 28 | * Multiple small fixes to comply with Bioconductor guidelines 29 | (see https://github.com/Bioconductor/Contributions/issues/3152) 30 | 31 | # v0.0.27 32 | 33 | * Instead of `include_complement`, the `find_de_neighborhoods` function gains a `add_diff_in_diff` argument. If it is true, the function calculates the difference between the DE results inside the neighborhood vs. outside. 
34 | * Change `indices` columns to `neighborhood` and store list of cell name vectors in output of `find_de_neighborhoods`. 35 | * Enforce unique column and row names. 36 | 37 | # v0.0.26 38 | 39 | * Make the neighborhoods more consistent: (1) include cells which are connected to many cells inside the neighborhood, (2) exclude cells from the neighborhood which are not well connected to the other cells in the neighborhood. 40 | * Add a `control_parameters` argument to `find_de_neighborhoods`. 41 | * Add `BiocNeighbor` as a dependency. 42 | 43 | # v0.0.25 44 | 45 | * Detect problematic neighborhoods and skip them. 46 | * Replace `test_data_cell_size_factors` by `size_factor_method`, which is more flexibel. Setting `size_factor_method = "ratio"` uses the size factor method described in the original DESeq paper 47 | 48 | 49 | # v0.0.24 50 | 51 | * Fix bug in `find_de_neighborhoods` that meant that accidentally additionally zeros where included in each 52 | neighborhood pseudobulk. The test should have more power now. 53 | * Expose `min_neighborhood_size` argument in `find_de_neighborhoods`. 54 | * Add `test_data_cell_size_factors` argument to `find_de_neighborhoods` which is useful if the function is called 55 | with a subsetted `fit` argument. 56 | 57 | # v0.0.23 58 | 59 | * Improve alignment functions: simplify algorithm, find linear approximation to Harmony's steps, 60 | include an intercept. 61 | * Avoid calling private methods from `harmony`. 62 | * Convert character columns in `colData` to factors to avoid problems when dividing data into 63 | test and training data. 64 | * Fix bug in `find_de_neighborhoods` where I didn't embrace an argument. 65 | * Remove `BiocNeighbors` dependency. 66 | 67 | # v0.0.21 68 | 69 | * Minor bug fix in `find_de_neighborhoods`. The function threw an error if `alignment_design != design`. 70 | * Better error messages if `find_de_neighborhoods` is called without having called `test_de` before. 71 | 72 | # v0.0.20 73 | 74 | * Change defaults for `find_de_neighborhoods`. Increase the `ridge_penalty` and add a `min_neighborhood_size = 10` argument 75 | to avoid creation of very small neighborhoods. 76 | 77 | # v0.0.19 78 | 79 | * Add new `test_fraction` argument to `lemur()` function. It automatically defines a hold-out datasets for the fitting step. 80 | These hold-out data is used to infer the differential expression of the neighborhoods in `find_de_neighborhoods`. This change 81 | addresses the double-dipping problem, where it was previously left to the user to provide an independent matrix for the 82 | `find_de_neighborhoods` function. 83 | * As a consequence of these changes, the structure of `lemur_fit` objects has changed. They gain three new fields called 84 | `fit$test_data`, `fit$training_data`, and `fit$is_test_data`. 85 | * The order and names of the arguments for `find_de_neighborhoods` has changed. 86 | 87 | # v0.0.18 88 | 89 | * Remove `alignment_method` field from `lemur_fit` objects as it was not used for anything. 90 | 91 | # v0.0.17 92 | 93 | * Rename argument name for `align_by_template` from `alignment_template` to `template` 94 | * Tweak algorithm for alignment to take cluster sizes into account during optimization 95 | 96 | # v0.0.13-v0.0.16 97 | 98 | * Change in the alignment model. Previously, the method tried to align cells using 99 | rotations and / or stretching, however, the method could not represent reflections! 100 | To fix this, I now allow arbitrary linear transformations where $R(x) = (I + sum_k x_k V_k)^{-1}$. 
The 101 | new alignment is more flexible and easier to infer. The downside is the term inside the parantheses can be 102 | singular which would lead to an error. 103 | * Skip iteration step: first infer centering and then infer latent space. Previously, I iterated between these steps 104 | but that either didn't add anything or actually degraded the results. 105 | * Set `center = FALSE` in `find_base_point`. Centering the data before fitting the base point caused 106 | problems and made the data look less integrated in some cases. 107 | * Remove ambient PCA step. This was originally conceived as an performance optimization, however 108 | it had detrimental effects on the inference. Since a few version it was skipped per default, so removing 109 | it should not change the inference. 110 | * Add `linear_coefficient_estimator` to give more flexibility how or if the conditions are centered. 111 | * Reduce the `minimum_cluster_membership` default to `0.001` in `align_harmony` to make it more sensitive. 112 | * Make `test_global` an internal function again until further more extensive testing. 113 | * Remove `base_point` argument from `lemur()`. It wasn't used anyways. 114 | 115 | # pre v0.0.13 116 | * Refactor `find_de_neighborhoods`: the function can now combine the results of 117 | different directions, selection criteria, and pseudobulk test (on counts or 118 | continuous values). To implement this, I changed the names of the arguments and 119 | added parameters. 120 | * Remove many superfluous method generics and only provide the accession via `$` 121 | * Fix documentation warnings 122 | * Rename class from 'lemur_fit_obj' to 'lemur_fit' 123 | * Store 'contrast' in `lemur_fit` after calling `test_de` 124 | * Add option to fit count model in `find_de_neighborhoods` with [edgeR](https://bioconductor.org/packages/edgeR/) 125 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | cumz_which_abs_max <- function(x, min_neighborhood_size) { 5 | .Call(`_lemur_cumz_which_abs_max`, x, min_neighborhood_size) 6 | } 7 | 8 | cum_brls_which_abs_max <- function(y, X, group, contrast, penalty, min_neighborhood_size) { 9 | .Call(`_lemur_cum_brls_which_abs_max`, y, X, group, contrast, penalty, min_neighborhood_size) 10 | } 11 | 12 | count_neighbors_fast <- function(knn_mat, indices) { 13 | .Call(`_lemur_count_neighbors_fast`, knn_mat, indices) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /R/align.R: -------------------------------------------------------------------------------- 1 | #' Enforce additional alignment of cell clusters beyond the direct differential embedding 2 | #' 3 | #' @param fit a `lemur_fit` object 4 | #' @param grouping argument specific for `align_by_grouping`. Either a vector which assigns 5 | #' each cell to one group or a matrix with `ncol(fit)` columns where the rows are a soft-assignment 6 | #' to a cluster (i.e., columns sum to `1`). `NA`'s are allowed. 7 | #' @param design a specification of the design (matrix or formula) that is used 8 | #' for the transformation. Default: `fit$design_matrix` 9 | #' @param ridge_penalty specification how much the flexibility of the transformation 10 | #' should be regularized. 
Default: `0.01` 11 | #' @param max_iter argument specific for `align_harmony`. The number of iterations. Default: `10` 12 | #' @param preserve_position_of_NAs argument specific for `align_by_grouping`. 13 | #' Boolean flag to decide if `NA`s in the `grouping` mean that these cells should stay where they are (if 14 | #' possible) or if they are free to move around. Default: `FALSE` 15 | #' @param ... additional parameters that are passed on to relevant functions 16 | #' @param verbose Should the method print information during the fitting. Default: `TRUE`. 17 | #' 18 | #' @returns The `fit` object with the updated `fit$embedding` and `fit$alignment_coefficients`. 19 | #' 20 | #' @examples 21 | #' data(glioblastoma_example_data) 22 | #' fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 23 | #' n_emb = 5, verbose = FALSE) 24 | #' # Creating some grouping for illustration 25 | #' cell_types <- sample(c("tumor cell", "neuron", "leukocyte"), size = ncol(fit), replace = TRUE) 26 | #' fit_al1 <- align_by_grouping(fit, grouping = cell_types) 27 | #' 28 | #' # Alternatively, use harmony to automatically group cells 29 | #' fit_al2 <- align_harmony(fit) 30 | #' fit_al2 31 | #' 32 | #' # The alignment coefficients are a 3D array 33 | #' fit_al2$alignment_coefficients 34 | #' 35 | #' @export 36 | align_harmony <- function(fit, design = fit$alignment_design, 37 | ridge_penalty = 0.01, max_iter = 10, ..., verbose = TRUE){ 38 | if(verbose) message("Select cells that are considered close with 'harmony'") 39 | if(is.null(attr(design, "ignore_degeneracy"))){ 40 | # It doesn't matter for harmony if the design is degenerate 41 | attr(design, "ignore_degeneracy") <- TRUE 42 | } 43 | design_matrix <- handle_design_parameter(design, fit, glmGamPoi:::get_col_data(fit, NULL), verbose = verbose)$design_matrix 44 | act_design_matrix <- design_matrix[!fit$is_test_data,,drop=FALSE] 45 | 46 | if(! requireNamespace("harmony", quietly = TRUE)){ 47 | stop("'harmony' is not installed. Please install it from CRAN.") 48 | } 49 | training_fit <- fit$training_data 50 | # Ignore best practice and call private methods from harmony 51 | harm_obj <- harmony_init(training_fit$embedding, act_design_matrix, ..., verbose = verbose) 52 | for(idx in seq_len(max_iter)){ 53 | harm_obj <- harmony_max_div_clustering(harm_obj) 54 | 55 | alignment <- align_impl(training_fit$embedding, harm_obj$R, act_design_matrix, ridge_penalty = ridge_penalty) 56 | 57 | harm_obj$Z_corr <- alignment$embedding 58 | harm_obj$Z_cos <- t(t(harm_obj$Z_corr) / sqrt(colSums(harm_obj$Z_corr^2))) 59 | if(harm_obj$check_convergence(1)){ 60 | if(verbose) message("Converged") 61 | break 62 | } 63 | } 64 | 65 | correct_fit(fit, alignment$alignment_coefficients, design) 66 | } 67 | 68 | #' @rdname align_harmony 69 | #' @export 70 | align_by_grouping <- function(fit, grouping, design = fit$alignment_design, 71 | ridge_penalty = 0.01, preserve_position_of_NAs = FALSE, verbose = TRUE){ 72 | if(verbose) message("Received sets of cells that are considered close") 73 | 74 | if(is.list(grouping)){ 75 | stop("'grouping' must be a vector/factor with distinct elements for each group or ", 76 | "a matrix with `ncol(fit)` columns and one row per group.") 77 | }else if(! is.matrix(grouping)){ 78 | stopifnot(length(grouping) == ncol(fit)) 79 | grouping <- grouping[! fit$is_test_data] 80 | }else{ 81 | stopifnot(ncol(grouping) == ncol(fit)) 82 | grouping <- grouping[,! 
fit$is_test_data,drop=FALSE] 83 | } 84 | 85 | design_matrix <- handle_design_parameter(design, fit, glmGamPoi:::get_col_data(fit, NULL), verbose = verbose)$design_matrix 86 | act_design_matrix <- design_matrix[!fit$is_test_data,,drop=FALSE] 87 | 88 | res <- align_impl(fit$training_data$embedding, grouping, act_design_matrix, 89 | ridge_penalty = ridge_penalty, calculate_new_embedding = FALSE) 90 | 91 | correct_fit(fit, res$alignment_coefficients, design) 92 | } 93 | 94 | 95 | #' Align the points according to some grouping 96 | #' 97 | #' @returns A list with the new embedding and the coefficients 98 | #' 99 | #' @keywords internal 100 | align_impl <- function(embedding, grouping, design_matrix, ridge_penalty = 0.01, 101 | preserve_position_of_NAs = FALSE, calculate_new_embedding = TRUE){ 102 | if(! is.matrix(grouping)){ 103 | grouping_matrix <- one_hot_encoding(grouping) 104 | }else{ 105 | stopifnot(ncol(grouping) == ncol(embedding)) 106 | stopifnot(all(grouping >= 0, na.rm = TRUE)) 107 | # Make sure the entries sum to 1 (and don't touch them if the column is all zero) 108 | col_sums <- MatrixGenerics::colSums2(grouping) 109 | col_sums[col_sums == 0] <- 1 110 | grouping_matrix <- t(t(grouping) / col_sums) 111 | } 112 | 113 | # NA's are converted to zero columns ensuring that `diff %*% grouping_matrix = 0` 114 | grouping_matrix[,MatrixGenerics::colAnyNAs(grouping_matrix)] <- 0 115 | if(! preserve_position_of_NAs){ 116 | all_zero_col <- MatrixGenerics::colSums2(grouping_matrix) == 0 117 | grouping_matrix <- grouping_matrix[,! all_zero_col,drop=FALSE] 118 | embedding <- embedding[,! all_zero_col,drop=FALSE] 119 | design_matrix <- design_matrix[! all_zero_col,,drop=FALSE] 120 | } 121 | 122 | stopifnot(ncol(embedding) == ncol(grouping_matrix)) 123 | stopifnot(ncol(embedding) == nrow(design_matrix)) 124 | 125 | n_groups <- nrow(grouping_matrix) 126 | n_emb <- nrow(embedding) 127 | K <- ncol(design_matrix) 128 | 129 | conditions <- get_groups(design_matrix) 130 | conds <- unique(conditions) 131 | 132 | # Calculate mean per cell_type+condition 133 | cond_ct_means <- lapply(conds, \(co){ 134 | t(mply_dbl(seq_len(n_groups), \(idx){ 135 | MatrixGenerics::rowWeightedMeans(embedding, w = grouping_matrix[idx,], cols = conditions == co) 136 | }, ncol = n_emb)) 137 | }) 138 | 139 | # Calculate target as mean of `cond_ct_means` per cell_type 140 | target <- matrix(NA, nrow = n_emb, ncol = n_groups) 141 | for(idx in seq_len(n_groups)){ 142 | target[,idx] <- colMeans(mply_dbl(conds, \(co) cond_ct_means[[co]][,idx], ncol = n_emb), na.rm = TRUE) 143 | } 144 | 145 | # Shift all cells by `ctc_mean - ct_target` (`new_pos`) 146 | new_pos <- embedding 147 | for(co in conds){ 148 | diff <- target - cond_ct_means[[co]] 149 | new_pos[,conditions == co] <- new_pos[,conditions == co] + diff %zero_dom_mat_mult% grouping_matrix[,conditions == co] 150 | } 151 | 152 | # Approximate shift by regressing `new_pos ~ S(x) * orig_pos + offset(x)` 153 | interact_design_matrix <- duplicate_cols(design_matrix, each = n_emb + 1) * duplicate_cols(t(rbind(1, embedding)), times = ncol(design_matrix)) 154 | # interact_design_matrix <- duplicate_cols(design_matrix, each = n_emb) * duplicate_cols(t(embedding), times = ncol(design_matrix)) 155 | alignment_coefs <- ridge_regression(new_pos - embedding, interact_design_matrix, ridge_penalty = ridge_penalty) 156 | alignment_coefs <- array(alignment_coefs, dim = c(n_emb, n_emb + 1, ncol(design_matrix))) 157 | 158 | new_emb <-if(calculate_new_embedding){ 159 | 
apply_linear_transformation(embedding, alignment_coefs, design_matrix) 160 | }else{ 161 | NULL 162 | } 163 | list(alignment_coefficients = alignment_coefs, embedding = new_emb) 164 | } 165 | 166 | #' Take a vector and convert it to a one-hot encoded matrix 167 | #' 168 | #' @returns A matrix with `length(unique(groups))` rows and `length(groups)` columns. 169 | #' 170 | #' @keywords internal 171 | one_hot_encoding <- function(groups){ 172 | if(is.factor(groups)){ 173 | levels <- levels(groups) 174 | }else{ 175 | levels <- unique(groups) 176 | } 177 | 178 | res <- matrix(0, nrow = length(levels), ncol = length(groups), dimnames = list(levels, names(groups))) 179 | for(i in seq_along(levels)){ 180 | if(is.na(levels[i])){ 181 | # Do nothing 182 | }else{ 183 | res[i, groups == levels[i]] <- 1 184 | } 185 | } 186 | res 187 | } 188 | 189 | forward_linear_transformation <- function(alignment_coefficients, design_vector){ 190 | n_emb <- dim(alignment_coefficients)[1] 191 | if(n_emb == 0){ 192 | sum_tangent_vectors(alignment_coefficients, design_vector) 193 | }else{ 194 | cbind(0, diag(nrow = n_emb)) + sum_tangent_vectors(alignment_coefficients, design_vector) 195 | } 196 | } 197 | 198 | reverse_linear_transformation <- function(alignment_coefficients, design_vector){ 199 | n_embedding <- dim(alignment_coefficients)[1] 200 | if(n_embedding == 0){ 201 | matrix(nrow = 0, ncol = 0) 202 | }else{ 203 | solve(diag(nrow = n_embedding) + sum_tangent_vectors(alignment_coefficients[,-1,,drop=FALSE], design_vector)) 204 | } 205 | } 206 | 207 | apply_linear_transformation <- function(A, alignment_coefs, design){ 208 | mm_groups <- get_groups(design) 209 | groups <- unique(mm_groups) 210 | for(gr in groups){ 211 | A[,mm_groups == gr] <- forward_linear_transformation(alignment_coefs, design[which(mm_groups == gr)[1],]) %*% rbind(1, A[,mm_groups == gr,drop=FALSE]) 212 | } 213 | A 214 | } 215 | 216 | correct_fit <- function(fit, alignment_coefs, design){ 217 | old <- S4Vectors:::disableValidity() 218 | if (!isTRUE(old)) { 219 | S4Vectors:::disableValidity(TRUE) 220 | on.exit(S4Vectors:::disableValidity(old)) 221 | } 222 | 223 | if(! 
all(fit$alignment_coefficients == 0)) stop("Can only apply alignment once") 224 | 225 | metadata(fit)[["alignment_coefficients"]] <- alignment_coefs 226 | des <- handle_design_parameter(design, fit, fit$colData) 227 | metadata(fit)[["alignment_design"]] <- des$design_formula 228 | metadata(fit)[["alignment_design_matrix"]] <- des$design_matrix 229 | colData(fit) <- des$col_data 230 | reducedDim(fit, "embedding") <- t(apply_linear_transformation(fit$embedding, alignment_coefs, des$design_matrix)) 231 | fit 232 | } 233 | 234 | handle_ridge_penalty_parameter <- function(ridge_penalty){ 235 | if(any(names(ridge_penalty) %in% c("rotation", "stretching"))){ 236 | stop("The alignment function has changed and the rotation and stretching specification is now defunct") 237 | } 238 | ridge_penalty 239 | } 240 | -------------------------------------------------------------------------------- /R/bulk_de_wrappers.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | edger_fit <- function(counts, design, offset, col_data = NULL, 4 | abundance.trend = TRUE, robust = TRUE){ 5 | 6 | if(is.matrix(design)){ 7 | design_matrix <- design 8 | }else{ 9 | design_matrix <- model.matrix(design, data = col_data) 10 | } 11 | 12 | edger_y <- edgeR::DGEList(counts = counts) 13 | edger_y <- edgeR::scaleOffset(edger_y, offset = offset) 14 | edger_y <- edgeR::estimateDisp(edger_y, design_matrix) 15 | edgeR::glmQLFit(edger_y, design_matrix, abundance.trend = abundance.trend, robust = robust) 16 | } 17 | 18 | 19 | edger_test_de <- function(edger_fit, contrast, design = NULL){ 20 | cntrst <- parse_contrast({{contrast}}, design, simplify = TRUE) 21 | 22 | edger_fit <- edgeR::glmQLFTest(edger_fit, contrast = cntrst) 23 | edger_res <- edgeR::topTags(edger_fit, n = nrow(edger_fit$counts), sort.by = "none")$table 24 | data.frame(name = rownames(edger_res), pval = edger_res$PValue, adj_pval = edger_res$FDR, 25 | f_statistic = edger_res$F, df1 = edger_fit$df.test, df2 = edger_fit$df.total, lfc = edger_res$logFC) 26 | } 27 | 28 | 29 | limma_fit <- function(values, design, col_data = NULL){ 30 | if(is.matrix(design)){ 31 | design_matrix <- design 32 | }else{ 33 | design_matrix <- model.matrix(design, data = col_data) 34 | } 35 | if(! is_contrast_estimable(cntrst, design_matrix)){ 36 | stop("The contrast is not estimable from the design_matrix") 37 | } 38 | 39 | suppressWarnings({ 40 | # limma warns about missing values. Here we expect missing values though. 41 | lm_fit <- limma::lmFit(values, design_matrix) 42 | }) 43 | lm_fit 44 | } 45 | 46 | limma_test_de <- function(lm_fit, contrast, design, values = NULL, shrink = TRUE, trend = TRUE, robust = TRUE){ 47 | cntrst <- matrix(parse_contrast({{contrast}}, formula = design, simplify = TRUE), ncol = 1) 48 | cntrst <- evaluate_contrast_tree(cntrst, cntrst, \(x, .) x) 49 | 50 | lm_fit <- limma::contrasts.fit(lm_fit, contrasts = cntrst) 51 | if(shrink){ 52 | lm_fit <- tryCatch({ 53 | limma::eBayes(lm_fit, trend = trend, robust = robust) 54 | }, error = function(err){ 55 | limma::eBayes(lm_fit, trend = FALSE, robust = TRUE) 56 | }) 57 | }else{ 58 | lm_fit <- limma_eBayes_without_shrinkage(lm_fit) 59 | } 60 | tt <- limma::topTable(lm_fit, number = nrow(lm_fit$coefficients), adjust.method = "BH", sort.by = "none") 61 | if(! is.null(values)){ 62 | for(row in which(MatrixGenerics::rowAnyNAs(values))){ 63 | # limma can return misleading results if there missing values in the wrong 64 | # places (see https://support.bioconductor.org/p/9150300/) 65 | if(! 
is_contrast_estimable(cntrst, lm_fit$design[!is.na(values[row,]),,drop=FALSE])){ 66 | tt[row, c("P.Value", "adj.P.Val", "t", "logFC")] <- NA_real_ 67 | } 68 | } 69 | } 70 | data.frame(name = rownames(tt), pval = tt$P.Value, adj_pval = tt$adj.P.Val, t_statistic = tt$t, lfc = tt$logFC) 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /R/differential_geometry.R: -------------------------------------------------------------------------------- 1 | 2 | # Grassmann (Gr(n, k)) 3 | 4 | grassmann_map <- function(x, base_point){ 5 | # Adapted from https://github.com/JuliaManifolds/Manifolds.jl/blob/master/src/manifolds/GrassmannStiefel.jl#L93 6 | if(ncol(base_point) == 0 || nrow(base_point) == 0){ 7 | base_point 8 | }else if(any(is.na(x))){ 9 | matrix(NA, nrow = nrow(x), ncol = ncol(x)) 10 | }else{ 11 | svd <- svd(x) 12 | z <- base_point %*% svd$v %*% diag(cos(svd$d), nrow = length(svd$d)) %*% t(svd$v) + 13 | svd$u %*% diag(sin(svd$d), nrow = length(svd$d)) %*% t(svd$v) 14 | # Calling `qr.Q(qr(z))` is problematic because it can flip the signs 15 | z 16 | } 17 | } 18 | 19 | grassmann_log <- function(p, q){ 20 | # Adapted from https://github.com/JuliaManifolds/Manifolds.jl/blob/master/src/manifolds/GrassmannStiefel.jl#L174 21 | # The Grassmann manifold handbook proposes an alternative algorithm in section 5.2 22 | n <- nrow(p) 23 | k <- ncol(p) 24 | stopifnot(nrow(q) == n, ncol(q) == k) 25 | if(n == 0 || k == 0){ 26 | p 27 | }else{ 28 | z <- t(q) %*% p 29 | At <- t(q) - z %*% t(p) 30 | Bt <- lm.fit(z, At)$coefficients 31 | svd <- svd(t(Bt), k, k) 32 | svd$u %*% diag(atan(svd$d), nrow = k) %*% t(svd$v) 33 | } 34 | } 35 | 36 | project_grassmann <- function(x){ 37 | qr.Q(qr(x)) 38 | } 39 | 40 | project_grassmann_tangent <- function(x, base_point){ 41 | x - base_point %*% t(base_point) %*% x 42 | } 43 | 44 | 45 | random_grassmann_point <- function(n, k, ...){ 46 | V <- randn(n, k, ...) 47 | project_grassmann(V) 48 | } 49 | 50 | random_grassmann_tangent <- function(p, ...){ 51 | n <- nrow(p) 52 | k <- ncol(p) 53 | Z <- randn(n, k, ...) 54 | project_grassmann_tangent(Z, p) 55 | } 56 | 57 | 58 | grassmann_angle_from_tangent <- function(x, normalized = TRUE){ 59 | # Conversion of tangent vector to angle taken from Proposition 5.1 of the Grassmann manifold handbook 60 | thetas <- (svd(x)$d / pi * 180) 61 | if(normalized){ 62 | thetas <- thetas %% 180 63 | max(pmin(thetas, 180 - thetas)) 64 | }else{ 65 | thetas[1] 66 | } 67 | } 68 | 69 | grassmann_angle_from_points <- function(p, q){ 70 | grassmann_angle_from_tangent(grassmann_log(p, q)) 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /R/estimate_linear_coefficients.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | estimate_linear_coefficient <- function(Y, design_matrix, method = c("linear", "mean", "cluster_median", "zero")){ 4 | method <- match.arg(method) 5 | 6 | if(method == "linear"){ 7 | linear_fit <- lm.fit(design_matrix, t(Y)) 8 | t(linear_fit$coefficients) 9 | }else if(method == "mean"){ 10 | # Check for intercept column / columns 11 | ones <- rep(1, nrow(design_matrix)) 12 | intercept_fit <- lm.fit(design_matrix, ones) 13 | if(! sum(intercept_fit$residuals^2) < 1e-12){ 14 | stop("The design matrix does not have an intercept. Cannot apply a single mean offset. 
Please change", 15 | "'linear_coefficient_estimator' to 'linear', 'cluster_median', or 'zero'.") 16 | } 17 | means <- MatrixGenerics::rowMeans2(Y) 18 | matrix(means, ncol = 1) %*% matrix(intercept_fit$coefficients, nrow = 1) 19 | }else if(method == "zero"){ 20 | matrix(0, nrow = nrow(Y), ncol = ncol(design_matrix)) 21 | }else if(method == "cluster_median"){ 22 | min_cluster_membership <- 0.01 23 | pca <- pca(Y, n = 20) 24 | harm_obj <- harmony_init(pca$embedding, design_matrix, nclust = 30, verbose = FALSE) 25 | harm_obj <- harmony_max_div_clustering(harm_obj) 26 | Yt <- as.matrix(t(Y)) 27 | coef <- do.call(cbind, lapply(seq_len(nrow(harm_obj$R)), \(cl){ 28 | threshold <- min(min_cluster_membership, max(harm_obj$R) * 0.5) 29 | sel <- harm_obj$R[cl, ] > threshold 30 | tryCatch({ 31 | fit <- lm.wfit(design_matrix[sel,,drop=FALSE], y = Yt[sel,,drop=FALSE], w = harm_obj$R[cl, sel]) 32 | as.numeric(t(fit$coefficients)) 33 | }, error = function(e){ 34 | rep(NA_real_, nrow(Y) * ncol(design_matrix)) 35 | }) 36 | })) 37 | wmed <- matrixStats::rowWeightedMedians(coef, w = rowSums(harm_obj$R), na.rm = TRUE, interpolate = FALSE) 38 | matrix(wmed, nrow = nrow(Y), ncol = ncol(design_matrix)) 39 | } 40 | } 41 | 42 | 43 | -------------------------------------------------------------------------------- /R/geodesic_regression.R: -------------------------------------------------------------------------------- 1 | 2 | ####################### 3 | # Grassmann Manifold # 4 | ####################### 5 | 6 | 7 | #' Solve d(P, exp_p(V * x))^2 for V 8 | #' 9 | #' @returns A three-dimensional array with the coefficients `V`. 10 | #' 11 | #' @keywords internal 12 | grassmann_geodesic_regression <- function(coordsystems, design, base_point, weights = 1, tangent_regression = FALSE){ 13 | # Validate input 14 | n_obs <- nrow(design) 15 | n_coef <- ncol(design) 16 | n_amb <- nrow(base_point) 17 | n_emb <- ncol(base_point) 18 | 19 | coordsystems <- if(is.list(coordsystems)){ 20 | coordsystems 21 | }else if(is.array(coordsystems)){ 22 | stopifnot(length(dim(coordsystems)) == 3) 23 | destack_slice(coordsystems) 24 | }else{ 25 | stop("Cannot handle coordsystems of type: ", toString(class(coordsystems), width = 100)) 26 | } 27 | stopifnot(length(coordsystems) == n_obs) 28 | stopifnot(all(vapply(coordsystems, \(emb) nrow(emb) == n_amb && ncol(emb) == n_emb, FUN.VALUE = logical(1L)))) 29 | # stopifnot(all(vapply(coordsystems, \(emb) is_grassmann_element(emb), FUN.VALUE = logical(1L)))) 30 | weights <- rep_len(weights, n_obs) 31 | 32 | 33 | 34 | # Initialize with tangent regression (if possible) 35 | tangent_vecs <- lapply(coordsystems, \(emb) as.vector(grassmann_log(base_point, emb))) 36 | merged_vecs <- stack_cols(tangent_vecs) 37 | tangent_fit <- if(nrow(merged_vecs) == 0){ 38 | matrix(nrow = 0, ncol = ncol(design)) 39 | }else{ 40 | t(lm.wfit(design, t(merged_vecs), w = weights)$coefficients) 41 | } 42 | coef <- stack_slice(lapply(seq_len(ncol(tangent_fit)), \(idx) matrix(tangent_fit[,idx], nrow = n_amb, ncol = n_emb))) 43 | dimnames(coef) <- list(NULL, NULL, colnames(tangent_fit)) 44 | 45 | 46 | if(tangent_regression){ 47 | coef 48 | }else{ 49 | # warning("Refine regression using Riemannian optimization. (Not yet implemented)") 50 | coef 51 | } 52 | } 53 | 54 | #' Solve ||Y - exp_p(V * x) Y ||^2_2 for V 55 | #' 56 | #' @returns A three-dimensional array with the coefficients `V`. 
57 | #' 58 | #' @keywords internal 59 | grassmann_lm <- function(data, design, base_point, tangent_regression = FALSE){ 60 | nas <- apply(data, 2, anyNA) | apply(design, 1, anyNA) 61 | data <- data[,!nas,drop=FALSE] 62 | design <- design[!nas,,drop=FALSE] 63 | 64 | n_obs <- nrow(design) 65 | n_coef <- ncol(design) 66 | n_amb <- nrow(base_point) 67 | n_emb <- ncol(base_point) 68 | 69 | # Initialize with tangent regression 70 | mm_groups <- get_groups(design) 71 | groups <- unique(mm_groups) 72 | reduced_design <- mply_dbl(groups, \(gr) design[which(mm_groups == gr)[1],], ncol = ncol(design)) 73 | if(any(table(mm_groups) < n_emb)){ 74 | n_occur <- c(table(mm_groups)) 75 | sel_indices <- head(order(n_occur), n = 4) 76 | problematic_mat <- cbind(n_occurrences = n_occur[sel_indices], reduced_design[sel_indices,,drop=FALSE]) 77 | stop("Too few datapoints in some design matrix group.\nIf a covariate is continuous, ", 78 | "please discretize it into an (ordered) factor.\nIf for one factor level there are too few cells, consider removing ", 79 | "that factor level or merging it with another.\n\n", glmGamPoi:::format_matrix(problematic_mat), 80 | "\nEach row must occurr at least n_embedding=", n_emb, " times.\n") 81 | } 82 | group_planes <- lapply(groups, \(gr) pca(data[,mm_groups == gr,drop=FALSE], n = n_emb, center = FALSE)$coordsystem) 83 | group_sizes <- vapply(groups, \(gr) sum(mm_groups == gr), FUN.VALUE = 0L) 84 | coef <- grassmann_geodesic_regression(group_planes, design = reduced_design, base_point = base_point, weights = group_sizes, tangent_regression = TRUE) 85 | if(tangent_regression){ 86 | coef 87 | }else{ 88 | # warning("Refine regression using Riemannian optimization. (Not yet implemented)") 89 | coef 90 | } 91 | } 92 | 93 | 94 | get_groups <- function (design_matrix) { 95 | vctrs::vec_group_id(unclass(design_matrix)) 96 | } 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /R/glioblastoma_example_data.R: -------------------------------------------------------------------------------- 1 | 2 | #' The `glioblastoma_example_data` dataset 3 | #' 4 | #' The dataset is a [`SingleCellExperiment`] object subset to 5,000 cells and 5 | #' 300 genes. The `colData` contain an entry for each cell from which patient 6 | #' it came and to which treatment condition it belonged (`"ctrl"` or `"panobinostat"`). 7 | #' 8 | #' The original data was collected by Zhao et al. (2021). 9 | #' 10 | #' @returns A [`SingleCellExperiment`] object. 11 | #' 12 | #' @references 13 | #' * Zhao, Wenting, Athanassios Dovas, Eleonora Francesca Spinazzi, Hanna Mendes Levitin, Matei Alexandru Banu, Pavan Upadhyayula, Tejaswi Sudhakar, et al. 14 | #' “Deconvolution of Cell Type-Specific Drug Responses in Human Tumor Tissue with Single-Cell RNA-Seq.” Genome Medicine 13, no. 1 15 | #' (December 2021): 82. https://doi.org/10.1186/s13073-021-00894-y. 16 | #' 17 | #' @name glioblastoma_example_data 18 | NULL 19 | 20 | -------------------------------------------------------------------------------- /R/handle_data_parameter.R: -------------------------------------------------------------------------------- 1 | 2 | handle_data_parameter <- function(data, on_disk, assay){ 3 | if(is.matrix(data)){ 4 | if(! 
is.numeric(data)){ 5 | stop("The data argument must consist of numeric values and not of ", mode(data), " values") 6 | } 7 | if(is.null(on_disk) || isFALSE(on_disk)){ 8 | data_mat <- data 9 | }else if(isTRUE(on_disk)){ 10 | data_mat <- HDF5Array::writeHDF5Array(data) 11 | }else{ 12 | stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'") 13 | } 14 | }else if(is(data, "DelayedArray")){ 15 | if(is.null(on_disk) || isTRUE(on_disk)){ 16 | data_mat <- data 17 | }else if(isFALSE(on_disk)){ 18 | data_mat <- as.matrix(data) 19 | }else{ 20 | stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'") 21 | } 22 | }else if(is(data, "SummarizedExperiment")){ 23 | data_mat <- handle_data_parameter(SummarizedExperiment::assay(data, assay), on_disk) 24 | }else if(canCoerce(data, "SummarizedExperiment")){ 25 | se <- as(data, "SummarizedExperiment") 26 | data_mat <- handle_data_parameter(SummarizedExperiment::assay(se, assay), on_disk) 27 | }else if(is(data, "dgCMatrix") || is(data, "dgTMatrix")) { 28 | if(isTRUE(on_disk)){ 29 | data_mat <- HDF5Array::writeHDF5Array(data) 30 | }else if(isFALSE(on_disk)){ 31 | data_mat <- as.matrix(data) 32 | }else{ 33 | stop("glmGamPoi does not yet support sparse input data of type '", class(data),"'. ", 34 | "Please explicitly set the 'on_disk' parameter to force a conversion to a dense format either in-memory ('on_disk = FALSE') ", 35 | "or on-disk ('on_disk = TRUE')") 36 | } 37 | }else{ 38 | stop("Cannot handle data of class '", class(data), "'.", 39 | "It must be of type dense matrix object (i.e., a base matrix or DelayedArray),", 40 | " or a container for such a matrix (for example: SummarizedExperiment).") 41 | } 42 | 43 | if(! is.null(colnames(data_mat))){ 44 | if(ncol(data_mat) != length(unique(colnames(data_mat)))) stop("The colnames of the data are not unique.") 45 | } 46 | if(! 
is.null(rownames(data_mat))){ 47 | if(nrow(data_mat) != length(unique(rownames(data_mat)))) stop("The rownames of the data are not unique.") 48 | } 49 | 50 | data_mat 51 | } 52 | 53 | 54 | convert_dataframe_cols_chr_to_fct <- function(col_data){ 55 | character_cols <- vapply(col_data, is.character, logical(1L)) 56 | col_data[character_cols] <- lapply(col_data[character_cols], as.factor) 57 | col_data 58 | } 59 | 60 | -------------------------------------------------------------------------------- /R/handle_design_parameter.R: -------------------------------------------------------------------------------- 1 | 2 | handle_design_parameter <- function(design, data, col_data, verbose = FALSE){ 3 | n_samples <- ncol(data) 4 | 5 | ignore_degeneracy <- isTRUE(attr(design, "ignore_degeneracy")) 6 | 7 | # Handle the design parameter 8 | if(is.matrix(design)){ 9 | design_matrix <- design 10 | design_formula <- NULL 11 | }else if((is.vector(design) || is.factor(design))){ 12 | if(length(design) != n_samples){ 13 | if(length(design) == 1 && design == 1){ 14 | stop("The specified design vector length (", length(design), ") does not match ", 15 | "the number of samples: ", n_samples, "\n", 16 | "Did you maybe mean: `design = ~ 1`?") 17 | }else{ 18 | stop("The specified design vector length (", length(design), ") does not match ", 19 | "the number of samples: ", n_samples) 20 | } 21 | } 22 | tmp <- glmGamPoi:::convert_chr_vec_to_model_matrix(design, NULL) 23 | design_matrix <- tmp$model_matrix 24 | design_formula <- NULL 25 | }else if(inherits(design,"formula")){ 26 | tmp <- convert_formula_to_design_matrix(design, col_data) 27 | design_matrix <- tmp$design_matrix 28 | design_formula <- tmp$formula 29 | col_data <- add_global_variables_to_col_data(design, col_data) 30 | attr(design_formula, "constructed_from") <- "formula" 31 | }else{ 32 | stop("design argment of class ", class(design), " is not supported. Please ", 33 | "specify a `design_matrix`, a `character vector`, or a `formula`.") 34 | } 35 | 36 | if(nrow(design_matrix) != ncol(data)) stop("Number of rows in col_data does not match number of columns of data.") 37 | if(! is.null(rownames(design_matrix)) && 38 | ! all(rownames(design_matrix) == as.character(seq_len(nrow(design_matrix)))) && # That's the default rownames 39 | ! is.null(colnames(data))){ 40 | if(! all(rownames(design_matrix) == colnames(data))){ 41 | if(setequal(rownames(design_matrix), colnames(data))){ 42 | # Rearrange the rows to match the columns of data 43 | design_matrix <- design_matrix[colnames(data), ,drop=FALSE] 44 | }else{ 45 | stop("The rownames of the design_matrix / col_data do not match the column names of data.") 46 | } 47 | } 48 | } 49 | 50 | if(any(matrixStats::rowAnyNAs(design_matrix))){ 51 | stop("The design matrix contains 'NA's for sample ", 52 | paste0(head(which(DelayedMatrixStats::rowAnyNAs(design_matrix))), collapse = ", "), 53 | ". Please remove them before you call 'lemur()'.") 54 | } 55 | 56 | if(ncol(design_matrix) >= n_samples && ! ignore_degeneracy){ 57 | stop("The design_matrix has more columns (", ncol(design_matrix), 58 | ") than the there are samples in the data matrix (", n_samples, " columns).\n", 59 | "Too few replicates / too many coefficients to fit model.\n", 60 | "The head of the design matrix: \n", glmGamPoi:::format_matrix(head(design_matrix, n = 3))) 61 | } 62 | 63 | if(verbose && is.null(design_formula)){ 64 | message("The 'design' was not specified with a formula. 
This means that you cannot use 'cond(...)' in 'test_de(...)'.") 65 | } 66 | 67 | # Check rank of design_matrix 68 | qr_mm <- qr(design_matrix) 69 | if(qr_mm$rank < ncol(design_matrix) && n_samples > 0 && ! ignore_degeneracy){ 70 | is_zero_column <- DelayedMatrixStats::colCounts(design_matrix, value = 0) == nrow(design_matrix) 71 | if(any(is_zero_column)){ 72 | stop("The model matrix seems degenerate ('matrix_rank(design_matrix) < ncol(design_matrix)'). ", 73 | "Column ", paste0(head(which(is_zero_column), n=10), collapse = ", "), " contains only zeros. \n", 74 | "The head of the design matrix: \n", glmGamPoi:::format_matrix(head(design_matrix, n = 3))) 75 | }else{ 76 | stop("The model matrix seems degenerate ('matrix_rank(design_matrix) < ncol(design_matrix)'). ", 77 | "Some columns are perfectly collinear. Did you maybe include the same coefficient twice?\n", 78 | "The head of the design matrix: \n", glmGamPoi:::format_matrix(head(design_matrix, n = 3))) 79 | } 80 | } 81 | 82 | rownames(design_matrix) <- colnames(data) 83 | validate_design_matrix(design_matrix, data) 84 | list(design_matrix = design_matrix, design_formula = design_formula, col_data = col_data) 85 | } 86 | 87 | 88 | 89 | convert_formula_to_design_matrix <- function(formula, col_data){ 90 | attr(col_data, "na.action") <- "na.pass" 91 | tryCatch({ 92 | mf <- model.frame(formula, data = col_data, drop.unused.levels = FALSE) 93 | terms <- attr(mf, "terms") 94 | # xlevels is used for reconstructing the model matrix 95 | attr(terms, "xlevels") <- stats::.getXlevels(terms, mf) 96 | # vars_xlevels is used to check input to cond(...) 97 | attr(terms, "vars_xlevels") <- xlevels_for_formula_vars(terms, col_data) 98 | mm <- stats::model.matrix.default(terms, mf) 99 | attr(terms, "contrasts") <- attr(mm, "contrasts") 100 | }, error = function(e){ 101 | # Try to extract text from error message 102 | match <- regmatches(e$message, regexec("object '(.+)' not found", e$message))[[1]] 103 | if(length(match) == 2){ 104 | stop("Problem parsing the formula (", formula, ").\n", 105 | "Variable '", match[2], "' not found in col_data or global environment. Possible variables are:\n", 106 | paste0(colnames(col_data), collapse = ", "), call. = FALSE) 107 | }else{ 108 | stop(e$message) 109 | } 110 | }) 111 | 112 | # Otherwise every copy of the model stores the whole global environment! 113 | attr(terms, ".Environment") <- c() 114 | colnames(mm)[colnames(mm) == "(Intercept)"] <- "Intercept" 115 | list(formula = terms, design_matrix = mm) 116 | } 117 | 118 | add_global_variables_to_col_data <- function(formula, col_data){ 119 | # Check if var is global and put it into col_data 120 | formula_env <- attr(formula, ".Environment") 121 | if(is.null(formula_env)) formula_env <- rlang::empty_env() 122 | for(gv in setdiff(all.vars(formula), colnames(col_data))){ 123 | value <- rlang::eval_tidy(rlang::sym(gv), data = NULL, env = formula_env) 124 | is_vector_type <- (is(col_data, "DFrame") && is(value, "Vector")) || vctrs::obj_is_vector(value) 125 | if(is_vector_type){ 126 | has_correct_length <- NROW(value) == 1 || NROW(value) == nrow(col_data) 127 | if(! 
has_correct_length){ 128 | stop("Trying store global variables from formula in colData, however '", gv, "' ", 129 | "has length ", NROW(value), ", but it needs to be 1 or nrow(col_data) (", nrow(col_data), ").") 130 | } 131 | }else{ 132 | stop("Trying store global variables from formula in colData, however '", gv, "' ", 133 | "is of type ", class(value)[1]," and not a vector-type.") 134 | } 135 | col_data[[gv]] <- value 136 | } 137 | col_data 138 | } 139 | 140 | xlevels_for_formula_vars <- function(formula, data){ 141 | # if(! is.null( attr(formula, "xlevels"))){ 142 | # attr(formula, "xlevels") 143 | if(! is.null( attr(formula, "vars_xlevels"))){ 144 | attr(formula, "vars_xlevels") 145 | }else{ 146 | # For all character / factor vars get xlevel 147 | all_vars <- all.vars(formula) 148 | formula_env <- attr(formula, ".Environment") 149 | if(is.null(formula_env)) formula_env <- rlang::empty_env() 150 | xlev <- lapply(all_vars, \(v){ 151 | value <- rlang::eval_tidy(rlang::sym(v), data = as.data.frame(data), env = formula_env) 152 | if(is.character(value)){ 153 | levels(as.factor(value)) 154 | }else if(is.factor(value)){ 155 | levels(value) 156 | }else{ 157 | NULL 158 | } 159 | }) 160 | names(xlev) <- all_vars 161 | xlev[!vapply(xlev, is.null, TRUE)] 162 | } 163 | } 164 | 165 | validate_design_matrix <- function(matrix, data){ 166 | stopifnot(is.matrix(matrix)) 167 | stopifnot(nrow(matrix) == ncol(data)) 168 | } 169 | 170 | -------------------------------------------------------------------------------- /R/handle_test_data_parameter.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | handle_test_data_parameter <- function(fit, test_data, test_data_col_data, continuous_assay_name){ 4 | if(is(test_data, "SummarizedExperiment")){ 5 | if(! continuous_assay_name %in% assayNames(test_data)){ 6 | stop("Cannot find assay '", continuous_assay_name, "' in the assays of 'independent_data'") 7 | } 8 | # Nothing else to be done here 9 | }else if(is.list(test_data)){ 10 | if(! 
continuous_assay_name %in% names(test_data)){ 11 | stop("Cannot find assay '", continuous_assay_name, "' in the names of 'independent_data'") 12 | } 13 | if(is.null(test_data_col_data)){ 14 | stop("'independent_data_col_data' must not be NULL") 15 | } 16 | test_data <- SingleCellExperiment(assays = test_data) 17 | }else if(is.matrix(test_data)){ 18 | message("'independent_data' is a matrix treating it as continuous values") 19 | if(is.null(test_data_col_data)){ 20 | stop("'independent_data_col_data' must not be NULL") 21 | } 22 | test_data <- SingleCellExperiment(assays = setNames(list(test_data), continuous_assay_name)) 23 | }else if(is.null(test_data)){ 24 | # This is necessary to satisfy model.matrix in 'project_on_lemur_fit' 25 | col_data_copy <- fit$colData 26 | character_cols <- vapply(col_data_copy, is.character, logical(1L)) 27 | col_data_copy[character_cols] <- lapply(col_data_copy[character_cols], as.factor) 28 | test_data <- SingleCellExperiment(assays = setNames(list(matrix(nrow = nrow(fit), ncol = 0) * 1.0), continuous_assay_name), 29 | colData = col_data_copy[integer(0L),,drop=FALSE]) 30 | }else{ 31 | stop("Cannot handle 'indepdendet_data' of type: ", toString(class(test_data), width = 100)) 32 | } 33 | 34 | colData(test_data) <- S4Vectors::DataFrame(glmGamPoi:::get_col_data(test_data, test_data_col_data), check.names = FALSE) 35 | 36 | test_data 37 | } 38 | -------------------------------------------------------------------------------- /R/harmony_wrapper.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | harmony_init <- function(embedding, design_matrix, 4 | theta = 2, lambda = 1, sigma = 0.1, nclust = min(round(ncol(embedding) / 30), 100), 5 | tau = 0, block.size = 0.05, max.iter.cluster = 200, 6 | epsilon.cluster = 1e-5, epsilon.harmony = 1e-4, verbose = TRUE){ 7 | 8 | mm_groups <- get_groups(design_matrix) 9 | 10 | # Adapted from https://github.com/immunogenomics/harmony/blob/c8f4901ef92d6e9b4e1373c52de3b67ff052db3e/R/ui.R#L161 11 | n_groups <- length(unique(mm_groups)) 12 | phi <- matrix(0, nrow = n_groups, ncol = ncol(embedding)) 13 | phi[mm_groups + n_groups * (seq_along(mm_groups)-1)] <- 1 14 | phi <- as(phi, "sparseMatrix") 15 | 16 | N <- ncol(embedding) 17 | N_b <- MatrixGenerics::rowSums2(phi) 18 | B_vec <- rep(1, n_groups) 19 | 20 | theta <- rep_len(theta, n_groups) 21 | theta <- theta * (1 - exp(-(N_b / (nclust * tau)) ^ 2)) 22 | 23 | lambda <- rep_len(lambda, n_groups) 24 | lambda_vec <- c(0, lambda) 25 | 26 | 27 | sigma <- rep_len(sigma, nclust) 28 | lambda_range = c(0.1, 10) 29 | if(packageVersion("harmony") < "1.2.0"){ 30 | stop("Your 'harmony' version is outdated: ", packageVersion("harmony"), ". Please update to version >= 1.2.0") 31 | }else{ 32 | alpha <- 0.2 33 | harmonyObj <- harmony::RunHarmony(embedding, mm_groups, nclust = nclust, max.iter = 0, return_object = TRUE, verbose = FALSE) 34 | harmonyObj$setup( 35 | embedding, phi, 36 | sigma, theta, lambda_vec, alpha, max.iter.cluster, epsilon.cluster, 37 | epsilon.harmony, nclust, block.size, B_vec, verbose 38 | ) 39 | } 40 | harmony_init_clustering(harmonyObj) 41 | harmonyObj 42 | } 43 | 44 | #' Create an arbitrary Harmony object so that I can modify it later 45 | #' 46 | #' @returns The full [`harmony`] object (R6 reference class type). 
47 | #' 48 | #' @keywords internal 49 | harmony_new_object <- function(){ 50 | Y <- randn(3, 100) 51 | harmony::RunHarmony(Y, rep(c("a", "b"), length.out = 100), nclust = 2, max.iter = 0, return_object = TRUE, verbose = FALSE) 52 | harmonyObj 53 | } 54 | 55 | harmony_init_clustering <- function(harmonyObj, iter.max = 25, nstart = 10){ 56 | stopifnot(is(harmonyObj, "Rcpp_harmony")) 57 | harmonyObj$Y <- t(stats::kmeans(t(harmonyObj$Z_cos), centers = harmonyObj$K, iter.max = iter.max, nstart = nstart)$centers) 58 | harmonyObj$init_cluster_cpp() 59 | harmonyObj 60 | } 61 | 62 | harmony_max_div_clustering <- function(harmonyObj){ 63 | stopifnot(is(harmonyObj, "Rcpp_harmony")) 64 | err_status <- harmonyObj$cluster_cpp() 65 | if (err_status == -1) { 66 | stop('terminated by user') 67 | } else if (err_status != 0) { 68 | stop(gettextf('Harmony exited with non-zero exit status: %d', 69 | err_status)) 70 | } 71 | harmonyObj 72 | } 73 | 74 | -------------------------------------------------------------------------------- /R/lemur-package.R: -------------------------------------------------------------------------------- 1 | 2 | #' 3 | #' @import methods 4 | #' @import stats 5 | #' @rawNamespace import(SingleCellExperiment, except = weights) 6 | #' @importFrom S4Vectors metadata `metadata<-` DataFrame 7 | #' @importFrom SummarizedExperiment assayNames assay `assay<-` colData `colData<-` rowData `rowData<-` 8 | #' @importFrom SingleCellExperiment reducedDims 9 | #' @importFrom BiocGenerics design 10 | #' @importFrom utils .DollarNames capture.output head txtProgressBar setTxtProgressBar 11 | #' @importFrom Matrix t 12 | NULL 13 | 14 | ## usethis namespace: start 15 | #' @importFrom Rcpp sourceCpp 16 | ## usethis namespace: end 17 | NULL 18 | 19 | ## usethis namespace: start 20 | #' @useDynLib lemur, .registration = TRUE 21 | ## usethis namespace: end 22 | NULL 23 | 24 | # Satisfy NOTE from CRAN CMD check 25 | # Both are used in 'find_de_neighborhoods' 26 | utils::globalVariables(c("cntrst", "..did_indicator")) 27 | -------------------------------------------------------------------------------- /R/lemur.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Main function to fit the latent embedding multivariate regression (LEMUR) model 4 | #' 5 | #' @param data a matrix with observations in the columns and features in the rows. 6 | #' Or a `SummarizedExperiment` / `SingleCellExperiment` object 7 | #' @param design a formula referring to global objects or column in the `colData` of `data` 8 | #' and `col_data` argument 9 | #' @param col_data an optional data frame with `ncol(data)` rows. 10 | #' @param n_embedding the dimension of the $k$-plane that is rotated through space. 11 | #' @param linear_coefficient_estimator specify which estimator is used to center the conditions. 12 | #' `"linear"` runs simple regression it works well in many circumstances but can produce poor 13 | #' results if the composition of the cell types changes between conditions (e.g., one cell type 14 | #' disappears). `"mean"`, `"cluster_median"` and `"zero"` are alternative estimators, which 15 | #' are each supposed to be more robust against compositional changes but cannot account 16 | #' for genes that change for all cells between conditions. 17 | #' `"linear"` is the default as it works best with subsequent alignment steps. 18 | #' @param use_assay if `data` is a `SummarizedExperiment` / `SingleCellExperiment` object, 19 | #' which assay should be used. 
20 | #' @param test_fraction the fraction of cells that are split of before the model fit to keep an 21 | #' independent set of test observations. Alternatively, a logical vector of length `ncol(data)`. 22 | #' Default: 20% (`0.2`). 23 | #' @param ... additional parameters that are passed on to the internal function `lemur_impl`. 24 | #' @param verbose Should the method print information during the fitting. Default: `TRUE`. 25 | #' 26 | #' @return An object of class `lemur_fit` which extends [`SingleCellExperiment`]. Accordingly, 27 | #' all functions that work for `sce`'s also work for `lemur_fit`'s. In addition, we 28 | #' give easy access to the fitted values using the dollar notation (e.g., `fit$embedding`). 29 | #' For details see the [lemur_fit-class] help page. 30 | #' 31 | #' @references 32 | #' * Ahlmann-Eltze, C. & Huber, W. (2023). Analysis of multi-condition single-cell data with latent 33 | #' embedding multivariate regression. bioRxiv [https://doi.org/10.1101/2023.03.06.531268](https://doi.org/10.1101/2023.03.06.531268) 34 | #' 35 | #' @seealso [`align_by_grouping`], [`align_harmony`], [`test_de`], [`find_de_neighborhoods`] 36 | #' 37 | #' @examples 38 | #' data(glioblastoma_example_data) 39 | #' fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, n_emb = 5) 40 | #' fit 41 | #' 42 | #' 43 | #' @export 44 | lemur <- function(data, design = ~ 1, col_data = NULL, 45 | n_embedding = 15, 46 | linear_coefficient_estimator = c("linear", "mean", "cluster_median", "zero"), 47 | use_assay = "logcounts", 48 | test_fraction = 0.2, 49 | ..., 50 | verbose = TRUE){ 51 | 52 | data_mat <- handle_data_parameter(data, on_disk = FALSE, assay = use_assay) 53 | col_data <- glmGamPoi:::get_col_data(data, col_data) 54 | if(! all(colnames(col_data) == make.names(colnames(col_data), unique = TRUE))){ 55 | stop("The column names of 'colData(fit)' are not unique. Please fix this for example by calling: \n", 56 | "`colnames(colData(data)) <- make.names(colnames(colData(data)), unique = TRUE)`") 57 | } 58 | 59 | des <- handle_design_parameter(design, data, col_data, verbose = verbose) 60 | al_des <- des 61 | col_data <- des$col_data 62 | 63 | if(! 
is(data, "SummarizedExperiment")){ 64 | data <- SingleCellExperiment(assays = setNames(list(data_mat), use_assay), colData = col_data) 65 | } 66 | 67 | # Avoid problems when splitting into test and training data 68 | col_data <- convert_dataframe_cols_chr_to_fct(col_data) 69 | 70 | # Create indicator vector which cells are used for training and which for testing 71 | is_test_data <- rep(FALSE, ncol(data)) 72 | if(is.logical(test_fraction) && length(ncol(data))){ 73 | if(any(is.na(test_fraction))) stop("test_fraction must not contain 'NA's.") 74 | is_test_data <- test_fraction 75 | }else if(length(test_fraction) != 1){ 76 | stop("'test_fraction' must be a boolean vector of length 'ncol(data)' or a single number between 0 and 1.") 77 | }else if(test_fraction < 0 || test_fraction >= 1){ 78 | stop("'test_fraction' must be at least 0 and smaller than 1.") 79 | }else{ 80 | if(verbose) message("Storing ", round(test_fraction, 2) * 100, "% of the data (", round(ncol(data) * test_fraction), " cells)", 81 | " as test data.") 82 | is_test_data[sample.int(ncol(data), size = round(ncol(data) * test_fraction), replace = FALSE)] <- TRUE 83 | } 84 | 85 | design_matrix <- des$design_matrix[!is_test_data,,drop=FALSE] 86 | res <- lemur_impl(data_mat[,!is_test_data,drop=FALSE], design_matrix, n_embedding = n_embedding, 87 | linear_coefficient_estimator = linear_coefficient_estimator, verbose = verbose, ...) 88 | alignment_design <- if(matrix_equals(design_matrix, res$alignment_design_matrix)){ 89 | des$design_formula 90 | }else{ 91 | NULL 92 | } 93 | embedding <- matrix(NA, nrow = res$n_embedding, ncol = ncol(data)) 94 | embedding[,!is_test_data] <- res$embedding 95 | embedding[,is_test_data] <- project_on_lemur_fit_impl(Y = data_mat[,is_test_data,drop=FALSE], design_matrix = des$design_matrix[is_test_data,,drop=FALSE], 96 | alignment_design_matrix = al_des$design_matrix[is_test_data,,drop=FALSE], 97 | coefficients = res$coefficients, linear_coefficients = res$linear_coefficients, 98 | alignment_coefficients = res$alignment_coefficients, base_point = res$base_point) 99 | 100 | lemur_fit(data, col_data = col_data, 101 | row_data = if(is(data, "SummarizedExperiment")) rowData(data) else NULL, 102 | n_embedding = res$n_embedding, 103 | design = des$design_formula, 104 | design_matrix = des$design_matrix, 105 | linear_coefficients = res$linear_coefficients, 106 | base_point = res$base_point, 107 | coefficients = res$coefficients, 108 | embedding = embedding, 109 | alignment_coefficients = res$alignment_coefficients, 110 | alignment_design = alignment_design, 111 | alignment_design_matrix = al_des$design_matrix, 112 | use_assay = use_assay, 113 | is_test_data = is_test_data) 114 | } 115 | 116 | 117 | lemur_impl <- function(Y, design_matrix, 118 | n_embedding = 15, 119 | base_point = c("global_embedding", "mean"), 120 | linear_coefficient_estimator = c("linear", "mean", "cluster_median", "zero"), 121 | linear_coefficients = NULL, 122 | coefficients = NULL, 123 | embedding = NULL, 124 | alignment_coefficients = NULL, 125 | alignment_design_matrix = NULL, 126 | n_iter = 10, tol = 1e-8, 127 | reshuffling_fraction = 0, 128 | verbose = TRUE){ 129 | alignment_coef_fixed_but_embedding_fitted <- ! is.null(alignment_coefficients) && is.null(embedding) 130 | linear_coefficient_estimator <- match.arg(linear_coefficient_estimator) 131 | 132 | # Set reduced dimensions 133 | stopifnot(n_embedding >= 0) 134 | n_ambient_eff <- nrow(Y) 135 | n_embedding <- min(n_embedding, nrow(Y), ncol(Y)) 136 | linear_coef_fixed <- ! 
is.null(linear_coefficients) 137 | diffemb_coef_fixed <- ! is.null(coefficients) 138 | embedding_fixed <- ! is.null(embedding) 139 | alignment_coef_fixed <- ! is.null(alignment_coefficients) 140 | if(is.null(alignment_design_matrix)){ 141 | alignment_design_matrix <- design_matrix 142 | } 143 | 144 | # Initialize values 145 | if(linear_coef_fixed){ 146 | if(length(linear_coefficients) == 1){ 147 | linear_coefficients <- matrix(linear_coefficients, nrow = n_ambient_eff, ncol = ncol(design_matrix)) 148 | } 149 | stopifnot(nrow(linear_coefficients) == n_ambient_eff && ncol(linear_coefficients) == ncol(design_matrix)) 150 | 151 | }else{ 152 | if(verbose) message("Regress out global effects using ", linear_coefficient_estimator, " method.") 153 | linear_coefficients <- estimate_linear_coefficient(Y = Y, design_matrix = design_matrix, method = linear_coefficient_estimator) 154 | } 155 | if(linear_coefficient_estimator == "zero"){ 156 | Y_clean <- Y 157 | }else{ 158 | Y_clean <- Y - linear_coefficients %*% t(design_matrix) 159 | } 160 | if(!is.matrix(base_point)){ 161 | if(verbose) message("Find base point for differential embedding") 162 | base_point <- find_base_point(Y_clean, base_point, n_embedding = n_embedding) 163 | } 164 | 165 | initial_error <- sum(Y_clean^2) 166 | if(verbose) message("Fit differential embedding model") 167 | if(verbose) message("Initial error: ", sprintf("%.3g", initial_error)) 168 | if(! diffemb_coef_fixed){ 169 | coefficients <- array(0, dim = c(n_ambient_eff, n_embedding, ncol(design_matrix))) 170 | } 171 | if(! embedding_fixed){ 172 | embedding <- project_data_on_diffemb(Y_clean, design = design_matrix, 173 | coefficients = coefficients, base_point = base_point) 174 | } 175 | 176 | if(! diffemb_coef_fixed){ 177 | if(verbose) message("---Fit Grassmann linear model") 178 | coefficients <- grassmann_lm(Y_clean, design = design_matrix, base_point = base_point) 179 | } 180 | if(!
embedding_fixed){ 181 | embedding <- project_data_on_diffemb(Y_clean, design = design_matrix, 182 | coefficients = coefficients, base_point = base_point) 183 | } 184 | if(verbose){ 185 | residuals <- Y - project_diffemb_into_data_space(embedding, design = design_matrix, coefficients = coefficients, base_point = base_point) - linear_coefficients %*% t(design_matrix) 186 | error <- sum(residuals^2) 187 | message("Final error: ", sprintf("%.3g", error)) 188 | } 189 | 190 | 191 | if(alignment_coef_fixed_but_embedding_fitted){ 192 | # Rotate the embedding if it wasn't provided 193 | stop("Fixing 'alignment_coefficients' without fixing 'embedding' is not implemented") 194 | }else{ 195 | alignment_coefficients <- array(0, c(n_embedding, n_embedding+1, ncol(alignment_design_matrix))) 196 | } 197 | 198 | # Make sure that axes are ordered by variance 199 | if(prod(dim(embedding)) > 0 && all(!is.na(embedding))){ 200 | svd_emb <- svd(embedding) 201 | rot <- svd_emb$u 202 | base_point <- base_point %*% rot 203 | for(idx in seq_len(dim(coefficients)[3])) { 204 | coefficients[,,idx] <- coefficients[,,idx] %*% rot 205 | } 206 | embedding <- t(svd_emb$v) * svd_emb$d 207 | } 208 | 209 | list(n_embedding = n_embedding, 210 | design_matrix = design_matrix, data = Y, 211 | linear_coefficients = linear_coefficients, 212 | base_point = base_point, 213 | coefficients = coefficients, 214 | embedding = embedding, 215 | alignment_coefficients = alignment_coefficients) 216 | } 217 | 218 | 219 | find_base_point <- function(Y_clean, base_point, n_embedding){ 220 | n_genes <- nrow(Y_clean) 221 | if(is.matrix(base_point)){ 222 | stopifnot(nrow(base_point) == n_genes) 223 | stopifnot(ncol(base_point) == n_embedding) 224 | 225 | # Check if it is orthogonal 226 | orth <- t(base_point) %*% base_point 227 | if(sum((orth - diag(nrow = n_embedding))^2) > 1e-8){ 228 | stop("The provided 'base_point' is not orthogonal") 229 | } 230 | base_point 231 | }else{ 232 | base_point_meth <- match.arg(base_point, c("global_embedding", "mean")) 233 | if(base_point_meth == "global_embedding"){ 234 | pca(Y_clean, n = n_embedding, center = FALSE)$coordsystem 235 | }else if(base_point_meth == "mean"){ 236 | stop("'base_point = \"mean\"' is not implemented. 
Please use 'global_embedding'.") 237 | } 238 | } 239 | } 240 | 241 | 242 | project_diffemb_into_data_space <- function(embedding, design, coefficients, base_point){ 243 | n_genes <- nrow(base_point) 244 | res <- matrix(NA, nrow = n_genes, ncol = ncol(embedding)) 245 | mm_groups <- get_groups(design) 246 | for(gr in unique(mm_groups)){ 247 | covars <- design[which(mm_groups == gr)[1], ] 248 | res[,mm_groups == gr] <- grassmann_map(sum_tangent_vectors(coefficients, covars), base_point) %*% embedding[,mm_groups == gr,drop=FALSE] 249 | } 250 | res 251 | } 252 | 253 | project_data_on_diffemb <- function(Y_clean, design, coefficients, base_point){ 254 | n_emb <- ncol(base_point) 255 | res <- matrix(NA, nrow = n_emb, ncol = ncol(Y_clean)) 256 | mm_groups <- get_groups(design) 257 | for(gr in unique(mm_groups)){ 258 | covars <- design[which(mm_groups == gr)[1], ] 259 | res[,mm_groups == gr] <- t(grassmann_map(sum_tangent_vectors(coefficients, covars), base_point)) %*% Y_clean[,mm_groups == gr,drop=FALSE] 260 | } 261 | res 262 | } 263 | 264 | sum_tangent_vectors <- function(tangent_block, covariates){ 265 | stopifnot(length(covariates) == dim(tangent_block)[3]) 266 | res <- matrix(0, nrow = dim(tangent_block)[1], ncol = dim(tangent_block)[2]) 267 | for(idx in seq_len(length(covariates))){ 268 | res <- res + tangent_block[,,idx] * covariates[idx] 269 | } 270 | res 271 | } 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /R/lemur_fit.R: -------------------------------------------------------------------------------- 1 | #' The `lemur_fit` class 2 | #' 3 | #' The `lemur_fit` class extends [`SingleCellExperiment`] and provides 4 | #' additional accessors to get the values of the fit produced by [`lemur`]. It is the class of the result returned by [`lemur`]. 5 | #' 6 | #' @param object the `lemur_fit` object for the [`BiocGenerics::design`] generic 7 | #' @param x,i,j,...,drop the `lemur_fit` object and indices for the `[` subsetting operator 8 | #' 9 | #' @details 10 | #' 11 | #' To access the values produced by [`lemur`], use the dollar notation (`$`): 12 | #' \describe{ 13 | #' \item{`fit$n_embedding`}{the dimension of the latent space.} 14 | #' \item{`fit$design`}{the specification of the design in [`lemur`]. 
Usually this is a design formula, see [`stats::formula`].} 15 | #' \item{`fit$base_point`}{a matrix of size `nrow(fit)` x `fit$n_embedding` with the base point for the Grassmann exponential map.} 16 | #' \item{`fit$coefficients`}{a three-dimensional tensor of size `nrow(fit)` x `fit$n_embedding` x `ncol(fit$design_matrix)` with the coefficients for the exponential map.} 17 | #' \item{`fit$embedding`}{a matrix of size `fit$n_embedding` x `ncol(fit)` with the latent space coordinates of each cell.} 18 | #' \item{`fit$design_matrix`}{a matrix with the covariate values for each cell, of size `ncol(fit)` x `ncol(fit$design_matrix)`.} 19 | #' \item{`fit$linear_coefficients`}{a matrix (of size `nrow(fit)` x `ncol(fit$design_matrix)` with the coefficients for the linear regression.} 20 | #' \item{`fit$alignment_coefficients`}{a 3-tensor with the coefficients for the alignment, of size `fit$n_embedding` x `fit$n_embedding` x `ncol(fit$design_matrix)`.} 21 | #' \item{`fit$alignment_design`}{an alternative specification of the alignment, using a design, typically a [`stats::formula`].} 22 | #' \item{`fit$alignment_design_matrix`}{an alternative specification of the alignment, using a design matrix.} 23 | #' \item{`fit$contrast`}{a parsed version of the contrast specification from the `test_de` function or `NULL`.} 24 | #' \item{`fit$colData`}{the column annotation `DataFrame`.} 25 | #' \item{`fit$rowData`}{the row annotation `DataFrame`.} 26 | #' } 27 | #' 28 | #' @seealso [`lemur`], [`predict`][predict.lemur_fit], [`residuals`][residuals,lemur_fit-method] 29 | #' 30 | #' @returns An object of class `lemur_fit`. 31 | #' 32 | #' @rdname lemur_fit 33 | #' @aliases lemur_fit 34 | #' 35 | #' @examples 36 | #' # The easiest way to make a lemur_fit object is to call `lemur` 37 | #' data("glioblastoma_example_data") 38 | #' fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 39 | #' n_emb = 5, verbose = FALSE) 40 | #' 41 | #' fit$n_embedding 42 | #' fit$embedding[,1:10] 43 | #' fit$n_embedding 44 | #' fit$embedding[,1:10] 45 | #' fit$design_matrix[1:10,] 46 | #' fit$coefficients[1:3,,] 47 | #' 48 | #' @export 49 | .lemur_fit <- setClass("lemur_fit", contains = "SingleCellExperiment") 50 | 51 | lemur_fit <- function(data, col_data, row_data, 52 | n_embedding, 53 | design, design_matrix, linear_coefficients, 54 | base_point, coefficients, embedding, 55 | alignment_coefficients, 56 | alignment_design, alignment_design_matrix, 57 | use_assay, is_test_data, row_mask = NULL){ 58 | if(is.null(data)){ 59 | data <- SingleCellExperiment(assays = list()) 60 | }else if(! is(data, "SummarizedExperiment")){ 61 | data <- SingleCellExperiment(assays = setNames(list(data), use_assay)) 62 | }else{ 63 | data <- as(data, "SingleCellExperiment") 64 | } 65 | 66 | n_features <- nrow(data) 67 | linearFit <- LinearEmbeddingMatrix(design_matrix, linear_coefficients) 68 | colnames(linearFit) <- colnames(linear_coefficients) 69 | 70 | # Do some more graceful merging of the existing and the new information. 
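  # `%update_values%` is a helper defined elsewhere in the package (not shown in this excerpt).
  # Judging from its use below, it merges two list-like objects in the spirit of `utils::modifyList()`:
  # entries from the right-hand side are added to, or overwrite, the matching entries of the left-hand side, e.g.
  #   list(a = 1, b = 2) %update_values% list(b = 3)   # conceptually similar to list(a = 1, b = 3)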
71 | colData(data) <- (colData(data) %update_values% col_data) |> as("DataFrame") 72 | SingleCellExperiment::int_colData(data)$is_test_data <- is_test_data 73 | rowData(data) <- (rowData(data) %update_values% row_data) |> as("DataFrame") 74 | reducedDims(data) <- reducedDims(data) %update_values% list(linearFit = linearFit, embedding = t(embedding)) 75 | metadata(data) <- metadata(data) %update_values% list( 76 | n_embedding = n_embedding, design = design, base_point = base_point, 77 | coefficients = coefficients, alignment_coefficients = alignment_coefficients, 78 | alignment_design = alignment_design, alignment_design_matrix = alignment_design_matrix, 79 | use_assay = use_assay, row_mask = if(is.null(row_mask)) seq_len(n_features) else row_mask) 80 | 81 | .lemur_fit(data) 82 | } 83 | 84 | 85 | S4Vectors::setValidity2("lemur_fit", function(obj){ 86 | old <- S4Vectors:::disableValidity() 87 | if (!isTRUE(old)) { 88 | S4Vectors:::disableValidity(TRUE) 89 | on.exit(S4Vectors:::disableValidity(old)) 90 | } 91 | 92 | msg <- NULL 93 | 94 | row_mask <- metadata(obj)$row_mask 95 | n_features_original <- nrow(obj$base_point) 96 | n_features <- nrow(obj) 97 | n_obs <- ncol(obj) 98 | 99 | n_embedding <- obj$n_embedding 100 | if(is.null(n_embedding)) msg <- c(msg, "'n_embedding' must not be NULL") 101 | base_point <- obj$base_point 102 | if(is.null(base_point)) msg <- c(msg, "'base_point' must not be NULL") 103 | coefficients <- obj$coefficients 104 | if(is.null(coefficients)) msg <- c(msg, "'coefficients' must not be NULL") 105 | alignment_coefficients <- obj$alignment_coefficients 106 | if(is.null(alignment_coefficients)) msg <- c(msg, "'alignment_coefficients' must not be NULL") 107 | alignment_design <- obj$alignment_design 108 | alignment_design_matrix <- obj$alignment_design_matrix 109 | if(is.null(alignment_design_matrix)) msg <- c(msg, "'alignment_design_matrix' must not be NULL") 110 | embedding <- obj$embedding 111 | if(is.null(embedding)) msg <- c(msg, "'embedding' must not be NULL") 112 | design <- obj$design 113 | design_matrix <- sampleFactors(reducedDim(obj, "linearFit")) 114 | if(is.null(design_matrix)) msg <- c(msg, "'design_matrix' must not be NULL") 115 | linear_coefficients <- featureLoadings(reducedDim(obj, "linearFit")) 116 | if(is.null(linear_coefficients)) msg <- c(msg, "'linear_coefficients' must not be NULL") 117 | use_assay <- obj$use_assay 118 | if(is.null(use_assay)) msg <- c(msg, "'use_assay' must not be NULL") 119 | is_test_data <- obj$is_test_data 120 | if(is.null(is_test_data)) msg <- c(msg, "'is_test_data' must not be NULL") 121 | col_names <- colnames(obj) 122 | row_names <- rownames(obj) 123 | 124 | if(! is.null(design_matrix) && nrow(design_matrix) != n_obs) msg <- c(msg, "`nrow(design_matrix)` does not match number of observations") 125 | if(! is.null(linear_coefficients) && nrow(linear_coefficients) != n_features_original) msg <- c(msg, "`nrow(linear_coefficients)` does not match `n_features_original`") 126 | if(! is.null(linear_coefficients) && ncol(linear_coefficients) != ncol(design_matrix)) msg <- c(msg, "`ncol(linear_coefficients)` does not match `ncol(design_matrix)`") 127 | if(! is.null(base_point) && nrow(base_point) != n_features_original) msg <- c(msg, "`nrow(base_point)` does not match `n_features_original`") 128 | if(! is.null(base_point) && ncol(base_point) != n_embedding) msg <- c(msg, "`ncol(base_point)` does not match `n_embedding`") 129 | if(! is.null(coefficients) && ! 
is.array(coefficients) || length(dim(coefficients)) != 3) msg <- c(msg, "`coefficients` must be a three dimensional array") 130 | if(! is.null(coefficients) && dim(coefficients)[1] != n_features_original) msg <- c(msg, "`dim(coefficients)[1]` does not match `n_features_original`") 131 | if(! is.null(coefficients) && dim(coefficients)[2] != n_embedding) msg <- c(msg, "`dim(coefficients)[2]` does not match `n_embedding`") 132 | if(! is.null(coefficients) && dim(coefficients)[3] != ncol(design_matrix)) msg <- c(msg, "`dim(coefficients)[3]` does not match `ncol(design_matrix)`") 133 | if(! is.null(embedding) && nrow(embedding) != n_embedding) msg <- c(msg, "`nrow(embedding)` does not match `n_embedding`") 134 | if(! is.null(embedding) && ncol(embedding) != n_obs) msg <- c(msg, "`ncol(embedding)` does not match number of observations") 135 | if(! is.null(alignment_coefficients) && ! is.array(alignment_coefficients) || length(dim(alignment_coefficients)) != 3) msg <- c(msg, "`alignment_coefficients` must be a three dimensional array") 136 | if(! is.null(alignment_coefficients) && dim(alignment_coefficients)[1] != n_embedding) msg <- c(msg, "`dim(alignment_coefficients)[1]` does not match `n_embedding`") 137 | if(! is.null(alignment_coefficients) && dim(alignment_coefficients)[2] != n_embedding + 1) msg <- c(msg, "`dim(alignment_coefficients)[2]` does not match `n_embedding + 1`") 138 | if(! is.null(alignment_coefficients) && dim(alignment_coefficients)[3] != ncol(alignment_design_matrix)) msg <- c(msg, "`dim(alignment_coefficients)[3]` does not match `ncol(alignment_design_matrix)`") 139 | if(! is.null(alignment_design_matrix) && nrow(alignment_design_matrix) != n_obs) msg <- c(msg, "`nrow(alignment_design_matrix)` does not match number of observations") 140 | if(! is.null(alignment_design) && ! inherits(alignment_design, "formula")) msg <- c(msg, "`alignment_design` must inherit from formula or be NULL") 141 | if(! is.null(design) && ! inherits(design, "formula")) msg <- c(msg, "`design` must inherit from formula or be NULL") 142 | if(! is.null(use_assay) && ! use_assay %in% assayNames(obj)) msg <- c(msg, "`use_assay` must be one of the assays") 143 | if(! is.null(is_test_data) && ! is.logical(is_test_data)) msg <- c(msg, "`is_test_data` must be a logical vector") 144 | if(! is.null(is_test_data) && length(is_test_data) != n_obs ) msg <- c(msg, "length `is_test_data` must match the number of observations") 145 | if(! is.null(col_names) && length(col_names) != length(unique(col_names))) msg <- c(msg, "`colnames` are not unique") 146 | if(! is.null(row_names) && length(row_names) != length(unique(row_names))) msg <- c(msg, "`rownames` are not unique") 147 | if(is.logical(row_mask)) msg <- c(msg, "`row_mask` is a logical. This is no longer allowed (changed in version 1.0.4 to fix a bug). Please rerun `lemur`.") 148 | if(max(row_mask) > n_features_original || min(row_mask) < 0 || any(is.na(row_mask))) msg <- c(msg, "`row_mask` contains illegal index. This is a bug!") 149 | 150 | if(is.null(msg)){ 151 | TRUE 152 | }else{ 153 | msg 154 | } 155 | }) 156 | 157 | # Subsetting 158 | 159 | #' @rdname lemur_fit 160 | #' @export 161 | setMethod("[", c("lemur_fit", "ANY", "ANY"), function(x, i, j, ...) { 162 | old <- S4Vectors:::disableValidity() 163 | if (!isTRUE(old)) { 164 | S4Vectors:::disableValidity(TRUE) 165 | on.exit(S4Vectors:::disableValidity(old)) 166 | } 167 | 168 | i_missing <- missing(i) 169 | j_missing <- missing(j) 170 | 171 | if (! 
i_missing) { 172 | # Update metadata 173 | ii <- convert_subset_to_index(i, rownames(x)) 174 | metadata(x)$row_mask <- metadata(x)$row_mask[ii] 175 | } 176 | if(! j_missing){ 177 | jj <- convert_subset_to_index(j, colnames(x)) 178 | metadata(x)[["alignment_design_matrix"]] <- metadata(x)[["alignment_design_matrix"]][jj,,drop=FALSE] 179 | } 180 | 181 | callNextMethod() 182 | }) 183 | 184 | convert_subset_to_index <- function(subset, names){ 185 | if (is.character(subset)) { 186 | orig <- subset 187 | subset <- match(subset, names) 188 | if (any(bad <- is.na(subset))) { 189 | bad_examples <- toString(orig[bad], width = 100) 190 | stop("index out of bounds: ", bad_examples) 191 | } 192 | }else if(is.logical(subset)){ 193 | subset <- which(subset) 194 | } 195 | return(as.vector(subset)) 196 | } 197 | 198 | .methods_to_suggest <- c("n_embedding", "embedding", 199 | "design", "design_matrix", "base_point", 200 | "coefficients", "linear_coefficients", "alignment_coefficients", 201 | "alignment_design", "alignment_design_matrix", 202 | "contrast", "use_assay", "colData", "rowData", 203 | "test_data", "training_data", "is_test_data") 204 | 205 | 206 | #' @rdname lemur_fit 207 | #' @export 208 | setMethod("design", signature = "lemur_fit", function(object){ 209 | metadata(object)[["design"]] 210 | }) 211 | 212 | 213 | 214 | #' @rdname cash-lemur_fit-method 215 | #' @export 216 | .DollarNames.lemur_fit <- function(x, pattern = ""){ 217 | grep(pattern, .methods_to_suggest, value = TRUE) 218 | } 219 | 220 | #' Access values from a `lemur_fit` 221 | #' 222 | #' @param x the `lemur_fit` 223 | #' @param pattern the pattern from looking up potential values interactively 224 | #' @param name the name of the value behind the dollar 225 | #' @param value the replacement value. This only works for `colData` and 226 | #' `rowData`. 227 | #' 228 | #' @returns The respective value stored in the `lemur_fit` object. 229 | #' 230 | #' @seealso [`lemur_fit-class`] for more documentation on the 231 | #' accessor functions. 232 | #' @aliases dollar_methods 233 | setMethod("$", "lemur_fit", 234 | function(x, name){ 235 | old <- S4Vectors:::disableValidity() 236 | if (!isTRUE(old)) { 237 | S4Vectors:::disableValidity(TRUE) 238 | on.exit(S4Vectors:::disableValidity(old)) 239 | } 240 | 241 | if(! 
name %in% .methods_to_suggest){ 242 | stop("Illegal name after '$' sign: ", name) 243 | } 244 | switch(name, 245 | n_embedding = metadata(x)[["n_embedding"]], 246 | design = design(x), 247 | base_point = metadata(x)[["base_point"]], 248 | coefficients = metadata(x)[["coefficients"]], 249 | embedding = t(reducedDim(x, "embedding")), 250 | design_matrix = sampleFactors(reducedDim(x, "linearFit")), 251 | linear_coefficients = featureLoadings(reducedDim(x, "linearFit")), 252 | alignment_design = metadata(x)[["alignment_design"]], 253 | alignment_design_matrix = metadata(x)[["alignment_design_matrix"]], 254 | alignment_coefficients = metadata(x)[["alignment_coefficients"]], 255 | contrast = metadata(x)[["contrast"]], 256 | use_assay = metadata(x)[["use_assay"]], 257 | colData = colData(x), 258 | rowData = rowData(x), 259 | test_data = get_test_data(x), 260 | training_data = get_training_data(x), 261 | is_test_data = int_colData(x)[["is_test_data"]], 262 | stop("Invalid `name` value.") 263 | ) 264 | }) 265 | 266 | get_test_data <- function(fit){ 267 | fit[,fit$is_test_data] 268 | } 269 | 270 | get_training_data <- function(fit){ 271 | fit[,!fit$is_test_data] 272 | } 273 | 274 | #' @rdname cash-lemur_fit-method 275 | setReplaceMethod("$", "lemur_fit", 276 | function(x, name, value){ 277 | if(! name %in% .methods_to_suggest){ 278 | stop("Illegal name after '$' sign: ", name) 279 | } 280 | switch(name, 281 | colData = {SummarizedExperiment::colData(x) <- value}, 282 | rowData = {SummarizedExperiment::rowData(x) <- value}, 283 | stop("It is illegal to modify the content of lemur_fit object") 284 | ) 285 | x 286 | }) 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | -------------------------------------------------------------------------------- /R/parse_contrasts.R: -------------------------------------------------------------------------------- 1 | 2 | # This function is adapted from proDA 3 | parse_contrast <- function(contrast, formula, simplify = FALSE) { 4 | 5 | if(missing(contrast)){ 6 | stop("No contrast argument was provided!") 7 | } 8 | covar <- all.vars(formula) 9 | 10 | cnt_capture <- rlang::enquo(contrast) 11 | if(is.null(formula)){ 12 | data_mask <- NULL 13 | }else{ 14 | data_mask <- create_contrast_data_mask(formula) 15 | } 16 | 17 | tryCatch({ 18 | res <- rlang::eval_tidy(cnt_capture, data = data_mask) 19 | if(! is.numeric(res)){ 20 | if(is.character(res)){ 21 | # If contrast was a string, eval will just spit it out the same way 22 | res <- rlang::eval_tidy(rlang::parse_expr(res), data = data_mask) 23 | } 24 | } 25 | }, error = function(e){ 26 | # Try to extract text from error message 27 | wrong_arg_error <- regmatches(e$message, regexec("object '(.+)' not found", e$message))[[1]] 28 | no_cond_error <- grepl(e$message, "could not find function \"cond\"") 29 | if(length(wrong_arg_error) == 2){ 30 | covars1 <- paste0(paste0(covar, " = ?"), collapse = ", ") 31 | covars2 <- paste0(paste0(covar, " = ?"), collapse = ", ") 32 | stop("Object '", wrong_arg_error[2], "' not found. Please specify the contrast using:\n", 33 | "'cond(", covars1, ") - cond(", covars2, ")'", call. = FALSE) 34 | }else if(no_cond_error && is.null(formula)){ 35 | stop("'fit$design' or 'fit$alignment_design' is 'NULL'. This means that you cannot use 'cond(...)' to ", 36 | "specify the contrast. 
Typically, 'fit$design' or 'fit$alignment_design' are 'NULL' if they are defined ", 37 | "using a matrix and not with a formula.") 38 | }else{ 39 | stop(e$message) 40 | } 41 | }) 42 | 43 | if(simplify){ 44 | evaluate_contrast_tree(res, res, \(x, y) x) 45 | }else{ 46 | res 47 | } 48 | } 49 | 50 | 51 | create_contrast_data_mask <- function(formula){ 52 | top <- rlang::new_environment(list( 53 | cond = function(...){ 54 | .cond(formula, rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE)) 55 | }, 56 | "+" = .plus, "-" = .minus, "/" = .divide, "*" = .multiply, 57 | "==" = .equal, 58 | "<" = .lt, "<=" = .lt, 59 | ">" = .gt, ">=" = .gt 60 | )) 61 | bottom <- rlang::new_environment(parent = top) 62 | data_mask <- rlang::new_data_mask(bottom = bottom, top = top) 63 | data_mask$.cntrst <- rlang::as_data_pronoun(bottom) 64 | data_mask 65 | } 66 | 67 | 68 | .cond <- function(formula, level_sets = list()){ 69 | if(is.null(formula)){ 70 | stop("You called 'cond()' inside the contrast, however the original model ", 71 | "was not specified with a formula. Thus 'cond()' doesn't work and you ", 72 | "need to specify the contrast using the column names of the design matrix.") 73 | } 74 | if(is.null(attr(formula, "xlevels"))){ 75 | warning("The formula has no 'xlevels' attribute. This is supicious and might indicate a bug.") 76 | } 77 | if(any(names(level_sets) == "")){ 78 | stop("All arguments to 'cond()' must be named.") 79 | } 80 | if(any(duplicated(names(level_sets)))){ 81 | stop("All arguments to 'cond()' must be unique.") 82 | } 83 | covar <- all.vars(formula) 84 | new_dat <- as.list(rep(0, length(covar))) 85 | names(new_dat) <- covar 86 | xlevels <- attr(formula, "vars_xlevels") 87 | for(n in names(xlevels)){ 88 | new_dat[[n]] <- factor(xlevels[[n]][1], levels = xlevels[[n]]) 89 | } 90 | for(n in names(level_sets)){ 91 | if(! n %in% names(new_dat)){ 92 | stop("Setting the level of '", n, "' failed. You can only set the level of the following variables: ", paste0(covar, collapse = ", ")) 93 | } 94 | if(length(level_sets[[n]]) != 1){ 95 | stop("Each argument to 'cond()' must be length one. '", n, "' has length ", length(level_sets[[n]])) 96 | } 97 | if(n %in% names(xlevels)){ 98 | if(! level_sets[[n]] %in% xlevels[[n]]){ 99 | stop("You are trying to set '", n, "=", level_sets[[n]], "'. 
However only the following values for ", n, 100 | " are valid: ", paste0(xlevels[[n]], collapse = ", ")) 101 | } 102 | new_dat[[n]] <- factor(level_sets[[n]], levels = xlevels[[n]]) 103 | }else{ 104 | new_dat[[n]] <- level_sets[[n]] 105 | } 106 | } 107 | res <- drop(model.matrix(formula, new_dat, contrasts.arg = attr(formula, "contrasts"), xlev = attr(formula, "xlevels"))) 108 | attr(res, "assign") <- NULL 109 | attr(res, "contrasts") <- NULL 110 | class(res) <- "model_vec" 111 | res 112 | } 113 | 114 | 115 | evaluate_contrast_tree <- function(c1, c2, FUN){ 116 | stopifnot(all(class(c2) == class(c1))) 117 | if(inherits(c1, "contrast_relation")){ 118 | stopifnot(c1$relation == c2$relation) 119 | if(c1$relation == "minus" && is.null(c1$rhs)){ # Unary minus 120 | - evaluate_contrast_tree(c1$lhs, c2$lhs, FUN = FUN) 121 | }else if(c1$relation == "plus" && is.null(c1$rhs)){ # Unary plus 122 | + evaluate_contrast_tree(c1$lhs, c2$lhs, FUN = FUN) 123 | }else if(c1$relation == "minus"){ 124 | evaluate_contrast_tree(c1$lhs, c2$lhs, FUN = FUN) - evaluate_contrast_tree(c1$rhs, c2$rhs, FUN = FUN) 125 | }else if(c1$relation == "plus"){ 126 | evaluate_contrast_tree(c1$lhs, c2$lhs, FUN = FUN) + evaluate_contrast_tree(c1$rhs, c2$rhs, FUN = FUN) 127 | }else if(c1$relation == "multiply"){ 128 | evaluate_contrast_tree(c1$lhs, c2$lhs, FUN = FUN) * evaluate_contrast_tree(c1$rhs, c2$rhs, FUN = FUN) 129 | }else if(c1$relation == "divide"){ 130 | evaluate_contrast_tree(c1$lhs, c2$lhs, FUN = FUN) / evaluate_contrast_tree(c1$rhs, c2$rhs, FUN = FUN) 131 | }else if(c1$relation %in% c("equal", "less_than", "greater_than")){ 132 | stop("(In)equalities are not allowed in contrasts") 133 | }else{ 134 | stop("Cannot handle contrast relationship of type: ", c1$relation) 135 | } 136 | }else if(inherits(c1, "model_vec")){ 137 | FUN(c1, c2) 138 | }else{ 139 | stopifnot(all(c1 == c2)) 140 | c1 141 | } 142 | } 143 | 144 | 145 | .divide <- function(x, y){ 146 | res <- list(lhs = x, rhs = y, relation = "divide") 147 | class(res) <- "contrast_relation" 148 | res 149 | } 150 | 151 | .multiply <- function(x, y){ 152 | res <- list(lhs = x, rhs = y, relation = "multiply") 153 | class(res) <- "contrast_relation" 154 | res 155 | } 156 | 157 | .plus <- function(x, y){ 158 | if(missing(y)){ 159 | # Unary plus 160 | y <- NULL 161 | } 162 | res <- list(lhs = x, rhs = y, relation = "plus") 163 | class(res) <- "contrast_relation" 164 | res 165 | } 166 | 167 | .minus <- function(x, y){ 168 | if(missing(y)){ 169 | # Unary minus 170 | y <- NULL 171 | } 172 | res <- list(lhs = x, rhs = y, relation = "minus") 173 | class(res) <- "contrast_relation" 174 | res 175 | } 176 | 177 | .equal <- function(x, y){ 178 | res <- list(lhs = x, rhs = y, relation = "equal") 179 | class(res) <- "contrast_relation" 180 | res 181 | } 182 | 183 | .lt <- function(x, y){ 184 | res <- list(lhs = x, rhs = y, relation = "less_than") 185 | class(res) <- "contrast_relation" 186 | res 187 | } 188 | 189 | .gt <- function(x, y){ 190 | res <- list(lhs = x, rhs = y, relation = "greater_than") 191 | class(res) <- "contrast_relation" 192 | res 193 | } 194 | 195 | 196 | -------------------------------------------------------------------------------- /R/pca.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | pca <- function(Y, n, center = TRUE){ 4 | center_ind <- center 5 | min_dim <- min(dim(Y)) 6 | 7 | # Center 8 | center <- if(center_ind){ 9 | MatrixGenerics::rowMeans2(Y) 10 | }else{ 11 | rep(0, nrow(Y)) 12 | } 13 | 14 | if(n == 0){ 15 |
res <- list(rotation = matrix(nrow = nrow(Y), ncol = 0), x = matrix(nrow = 0, ncol = ncol(Y))) 16 | }else if(min_dim <= n * 2){ 17 | # Do exact PCA 18 | res <- prcomp(t(Y), rank. = n, center = center_ind, scale. = FALSE) 19 | }else{ 20 | # Do approximate PCA 21 | res <- irlba::prcomp_irlba(t(Y), n = n, center = center_ind, scale. = FALSE) 22 | } 23 | list(coordsystem = unname(res$rotation), embedding = unname(t(res$x)), offset = center) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /R/predict.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Predict values from `lemur_fit` object 4 | #' 5 | #' @param object an `lemur_fit` object 6 | #' @param newdata a data.frame which passed to [`model.matrix`] with 7 | #' `design` to make the `newdesign` matrix 8 | #' @param newdesign a matrix with the covariates for which the output 9 | #' is predicted. If `NULL`, the `object$design_matrix` is used. If 10 | #' it is a vector it is repeated `ncol(embedding)` times to create 11 | #' a design matrix with the same entry for each cell. 12 | #' @param newcondition an unquoted expression with a call to `cond()` specifying 13 | #' the covariates of the prediction. See the `contrast` argument in [test_de] 14 | #' for more details. Note that combinations of multiple calls to `cond()` are 15 | #' not allowed (e.g., `cond(a = 1) - cond(a = 2)`). If specified, `newdata` 16 | #' and `newdesign` are ignored. 17 | #' @param embedding the low-dimensional cell position for which the 18 | #' output is predicted. 19 | #' @param with_linear_model a boolean to indicate if the linear regression 20 | #' offset is included in the prediction. 21 | #' @param with_embedding a boolean to indicate if the embedding contributes 22 | #' to the output. 23 | #' @param with_alignment a boolean to indicate if the alignment effect 24 | #' is removed from the output. 25 | #' @param ... additional parameters passed to `predict_impl`. 26 | #' 27 | #' @returns A matrix with the same dimension `nrow(object) * nrow(newdesign)`. 28 | #' 29 | #' 30 | #' @seealso [`residuals`][residuals,lemur_fit-method] 31 | #' 32 | #' @examples 33 | #' 34 | #' data(glioblastoma_example_data) 35 | #' fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 36 | #' n_emb = 5, verbose = FALSE) 37 | #' 38 | #' pred <- predict(fit) 39 | #' 40 | #' pred_ctrl <- predict(fit, newdesign = c(1, 0, 0, 0, 0, 0)) 41 | #' pred_trt <- predict(fit, newdesign = c(1, 0, 0, 0, 0, 1)) 42 | #' # This is the same as the test_de result 43 | #' fit <- test_de(fit, cond(condition = "panobinostat") - cond(condition = "ctrl")) 44 | #' all.equal(SummarizedExperiment::assay(fit, "DE"), pred_trt - pred_ctrl, 45 | #' check.attributes = FALSE) 46 | #' 47 | #' @export 48 | predict.lemur_fit <- function(object, newdata = NULL, newdesign = NULL, 49 | newcondition = NULL, 50 | embedding = object$embedding, 51 | with_linear_model = TRUE, 52 | with_embedding = TRUE, 53 | with_alignment = TRUE, 54 | ...){ 55 | predict_impl(object, newdata = newdata, newdesign = newdesign, newcondition = {{newcondition}}, 56 | embedding = embedding, with_linear_model = with_linear_model, 57 | with_embedding = with_embedding, with_alignment = with_alignment, ...) 
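  # Usage sketch (with the glioblastoma example data from the examples above): the same kind of
  # prediction can be requested through the 'cond()' helper instead of a raw design vector, e.g.
  #   predict(fit, newcondition = cond(condition = "panobinostat"))
  # which predicts each cell's expression as if it were observed in that condition
  # (with all other covariates held at their reference level).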
58 | 59 | } 60 | 61 | predict_impl <- function(object, newdata = NULL, newdesign = NULL, 62 | newcondition = NULL, 63 | embedding = object$embedding, 64 | with_linear_model = TRUE, 65 | with_embedding = TRUE, 66 | with_alignment = TRUE, 67 | n_embedding = object$n_embedding, 68 | design_matrix = object$design_matrix, 69 | design = object$design, 70 | linear_coefficients = object$linear_coefficients, 71 | coefficients = object$coefficients, 72 | base_point = object$base_point, 73 | alignment_coefficients = object$alignment_coefficients, 74 | alignment_design = object$alignment_design, 75 | alignment_design_matrix = object$alignment_design_matrix, 76 | row_mask = metadata(object)$row_mask, 77 | ...){ 78 | if(! rlang::quo_is_null(rlang::quo({{newcondition}}))){ 79 | if(! is.null(newdesign) || !is.null(newdata)) warning("If 'newcondition' is used, 'newdesign' and 'newdata' are ignored.") 80 | newdesign <- parse_contrast({{newcondition}}, design) 81 | alignment_design_matrix <- parse_contrast({{newcondition}}, alignment_design) 82 | if(inherits(newdesign, "contrast_relation")) stop("Contrast relations using + or - are not allowed") 83 | newdesign <- matrix(newdesign, nrow = ncol(embedding), ncol = length(newdesign), byrow = TRUE) 84 | alignment_design_matrix <- matrix(alignment_design_matrix, nrow = ncol(embedding), ncol = length(alignment_design_matrix), byrow = TRUE) 85 | }else if(is.null(newdesign) && is.null(newdata)){ 86 | newdesign <- design_matrix 87 | }else if(! is.null(newdata)){ 88 | if(is.null(design)) stop("'newdata' is provided, but 'object' does not contain a design formula.") 89 | newdesign <- model.matrix(design, newdata) 90 | }else if(! is.matrix(newdesign)){ 91 | newdesign <- matrix(newdesign, nrow = ncol(embedding), ncol = length(newdesign), byrow = TRUE) 92 | } 93 | if(! 
is.matrix(alignment_design_matrix)){ 94 | alignment_design_matrix <- matrix(alignment_design_matrix, nrow = ncol(embedding), ncol = length(alignment_design_matrix), byrow = TRUE) 95 | } 96 | 97 | if(all(dim(design_matrix) == dim(alignment_design_matrix)) && all(design_matrix == alignment_design_matrix)){ 98 | # The design matrices were identical, presume that the newdesign should also be identical 99 | alignment_design_matrix <- newdesign 100 | } 101 | 102 | if(nrow(newdesign) != ncol(embedding)){ 103 | stop("The number of rows in 'newdesign' must match the number of columns in 'embedding'") 104 | } 105 | if(nrow(newdesign) != nrow(alignment_design_matrix)){ 106 | stop("The number of rows in 'newdesign' (", nrow(newdesign) ,") and 'alignment_design_matrix'(", nrow(alignment_design_matrix) ,") must be the same") 107 | } 108 | approx <- if(with_linear_model){ 109 | linear_coefficients[row_mask,,drop=FALSE] %*% t(newdesign) 110 | }else{ 111 | matrix(0, nrow = length(row_mask), ncol = nrow(newdesign)) 112 | } 113 | 114 | if(with_embedding){ 115 | mm_groups <- get_groups(newdesign) 116 | mm_al_groups <- get_groups(alignment_design_matrix) 117 | stopifnot(length(mm_groups) == length(mm_al_groups)) 118 | mmg <- unique(cbind(mm_groups, mm_al_groups)) 119 | for(idx in seq_len(nrow(mmg))){ 120 | gr1 <- mmg[idx,1] 121 | gr2 <- mmg[idx,2] 122 | covar1 <- newdesign[which(mm_groups == gr1)[1],] 123 | diffemb <- grassmann_map(sum_tangent_vectors(coefficients, covar1), base_point) 124 | if(with_alignment){ 125 | covar2 <- alignment_design_matrix[which(mm_al_groups == gr2)[1],] 126 | alignment <- reverse_linear_transformation(alignment_coefficients, covar2) 127 | offset <- c(matrix(alignment_coefficients[,1,], ncol = length(covar2)) %*% covar2) 128 | }else{ 129 | alignment <- diag(nrow = n_embedding) 130 | offset <- 0 131 | } 132 | sel <- gr1 == mm_groups & gr2 == mm_al_groups 133 | approx[,sel] <- approx[,sel] + diffemb[row_mask,,drop=FALSE] %*% (alignment %*% (embedding[,sel] - offset)) 134 | } 135 | } 136 | 137 | colnames(approx) <- rownames(newdesign) 138 | rownames(approx) <- rownames(object) 139 | approx 140 | } 141 | 142 | 143 | #' Predict values from `lemur_fit` object 144 | #' 145 | #' @inheritParams predict.lemur_fit 146 | #' @param ... ignored. 147 | #' 148 | #' @returns A matrix with the same dimension `dim(object)`. 
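#' Conceptually (see `residuals_impl` below), the residuals are
#' `assay(object, object$use_assay) - predict(object, ...)`.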
149 | #' 150 | #' @seealso [predict.lemur_fit] 151 | #' 152 | #' @examples 153 | #' data(glioblastoma_example_data) 154 | #' fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 155 | #' n_emb = 5, verbose = FALSE) 156 | #' 157 | #' resid <- residuals(fit) 158 | #' dim(resid) 159 | #' 160 | #' 161 | #' @export 162 | setMethod("residuals", signature = "lemur_fit", function(object, 163 | with_linear_model = TRUE, 164 | with_embedding = TRUE, ...){ 165 | residuals_impl(object, with_linear_model = with_linear_model, with_embedding = with_embedding) 166 | }) 167 | 168 | 169 | residuals_impl <- function(object, 170 | with_linear_model = TRUE, 171 | with_embedding = TRUE){ 172 | assay(object, object$use_assay) - predict(object, with_linear_model = with_linear_model, with_embedding = with_embedding) 173 | } 174 | 175 | 176 | 177 | get_residuals_for_alt_fit <- function(fit, Y = assay(fit, fit$use_assay), reduced_design_mat, with_linear_model = TRUE, with_embedding = TRUE){ 178 | if(with_embedding){ 179 | fit_alt <- lemur_impl(Y, design_matrix = reduced_design_mat, n_embedding = fit$n_embedding, 180 | base_point = fit$base_point, verbose = FALSE) 181 | Y - predict_impl(object = fit_alt, embedding = fit_alt$embedding, 182 | with_linear_model = TRUE, with_embedding = TRUE, 183 | n_embedding = fit_alt$n_embedding, 184 | design_matrix = fit_alt$design_matrix, design = fit_alt$design, 185 | linear_coefficients = fit_alt$linear_coefficients, coefficients = fit_alt$coefficients, 186 | base_point = fit_alt$base_point, alignment_design_matrix = fit_alt$alignment_design_matrix, 187 | alignment_coefficients = fit_alt$alignment_coefficients, 188 | row_mask = rep(TRUE, nrow(Y))) 189 | }else{ 190 | fit_alt <- lemur_impl(Y, design_matrix = reduced_design_mat, 191 | n_embedding = 0, 192 | base_point = matrix(nrow = nrow(fit), ncol = 0), 193 | verbose = FALSE) 194 | Y - predict_impl(object = fit_alt, embedding = fit_alt$embedding, 195 | with_linear_model = TRUE, with_embedding = FALSE, 196 | n_embedding = fit_alt$n_embedding, 197 | design_matrix = fit_alt$design_matrix, design = fit_alt$design, 198 | linear_coefficients = fit_alt$linear_coefficients, coefficients = fit_alt$coefficients, 199 | base_point = fit_alt$base_point, alignment_design_matrix = fit$alignment_design_matrix, 200 | alignment_coefficients = fit_alt$alignment_coefficients, 201 | row_mask = rep(TRUE, nrow(Y))) 202 | } 203 | } 204 | 205 | 206 | -------------------------------------------------------------------------------- /R/project_on_fit.R: -------------------------------------------------------------------------------- 1 | 2 | #' Project new data onto the latent spaces of an existing lemur fit 3 | #' 4 | #' @param fit an `lemur_fit` object 5 | #' @param data a matrix with observations in the columns and features in the rows. 6 | #' Or a `SummarizedExperiment` / `SingleCellExperiment` object. The features must 7 | #' match the features in `fit`. 8 | #' @param col_data col_data an optional data frame with `ncol(data)` rows. 9 | #' @param use_assay if `data` is a `SummarizedExperiment` / `SingleCellExperiment` object, 10 | #' which assay should be used. 11 | #' @param design,alignment_design the design formulas or design matrices that are used 12 | #' to project the data on the correct latent subspace. Both default to the designs 13 | #' from the `fit` object. 14 | #' @param return which data structure is returned. 
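#'   One of `"matrix"` (the default) or `"lemur_fit"`, matching the `return` argument of the
#'   function definition below.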
15 | #' 16 | #' 17 | #' @returns Either a matrix with the low-dimensional embeddings of the `data` or 18 | #' an object of class `lemur_fit` wrapping that embedding. 19 | #' 20 | #' @examples 21 | #' 22 | #' data(glioblastoma_example_data) 23 | #' 24 | #' subset1 <- glioblastoma_example_data[,1:2500] 25 | #' subset2 <- glioblastoma_example_data[,2501:5000] 26 | #' 27 | #' fit <- lemur(subset1, design = ~ condition, n_emb = 5, 28 | #' test_fraction = 0, verbose = FALSE) 29 | #' 30 | #' # Returns a `lemur_fit` object with the projection of `subset2` 31 | #' fit2 <- project_on_lemur_fit(fit, subset2, return = "lemur_fit") 32 | #' fit2 33 | #' 34 | #' 35 | #' 36 | #' @export 37 | project_on_lemur_fit <- function(fit, data, col_data = NULL, use_assay = "logcounts", 38 | design = fit$design, alignment_design = fit$alignment_design, 39 | return = c("matrix", "lemur_fit")){ 40 | return <- match.arg(return) 41 | Y <- handle_data_parameter(data, on_disk = FALSE, assay = use_assay) 42 | col_data <- glmGamPoi:::get_col_data(data, col_data) 43 | 44 | xlevel <- attr(design, "xlevel") %default_to% attr(alignment_design, "xlevel") 45 | for(lvl in names(xlevel)){ 46 | if(lvl %in% names(col_data)){ 47 | col_data[[lvl]] <- factor(col_data[[lvl]], levels = xlevel[[lvl]]) 48 | }else{ 49 | stop("The column data does not contain the covariate ", lvl, ". This is a problem", 50 | "because it was used in the original design.") 51 | } 52 | } 53 | attr(design, "ignore_degeneracy") <- TRUE 54 | attr(alignment_design, "ignore_degeneracy") <- TRUE 55 | 56 | des <- handle_design_parameter(design, data, col_data) 57 | col_data <- des$col_data 58 | al_des <- handle_design_parameter(alignment_design, data, col_data) 59 | col_data <- al_des$col_data 60 | embedding <- project_on_lemur_fit_impl(Y, des$design_matrix, al_des$design_matrix, 61 | fit$coefficients, fit$linear_coefficients, fit$alignment_coefficients, 62 | fit$base_point) 63 | colnames(embedding) <- colnames(data) 64 | 65 | if(return == "matrix"){ 66 | embedding 67 | }else if(return == "lemur_fit"){ 68 | lemur_fit(data, col_data = col_data, 69 | row_data = if(is(data, "SummarizedExperiment")) rowData(data) else NULL, 70 | n_embedding = fit$n_embedding, 71 | design = des$design_formula, design_matrix = des$design_matrix, 72 | linear_coefficients = fit$linear_coefficients, 73 | base_point = fit$base_point, 74 | coefficients = fit$coefficients, 75 | embedding = embedding, 76 | alignment_coefficients = fit$alignment_coefficients, 77 | alignment_design = al_des$design_formula, 78 | alignment_design_matrix = al_des$design_matrix, 79 | use_assay = use_assay, is_test_data = rep(FALSE, ncol(embedding)), 80 | row_mask = metadata(fit)$row_mask) 81 | } 82 | } 83 | 84 | project_on_lemur_fit_impl <- function(Y, design_matrix, alignment_design_matrix, coefficients, linear_coefficients, alignment_coefficients, base_point){ 85 | Y_clean <- Y - linear_coefficients %*% t(design_matrix) 86 | embedding <- project_data_on_diffemb(Y_clean, design = design_matrix, coefficients = coefficients, base_point = base_point) 87 | embedding <- apply_linear_transformation(embedding, alignment_coefficients, alignment_design_matrix) 88 | # TODO: subset to row_mask? And then potentially also remove the check in find_de_neighborhoods line 143. 
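  # Summary of the steps above: (1) remove the condition-specific linear offset
  # (`Y - linear_coefficients %*% t(design_matrix)`), (2) project the centered data onto the
  # condition-specific subspace obtained from the Grassmann exponential map at 'base_point', and
  # (3) apply the affine alignment transformation in the latent space.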
89 | embedding 90 | } 91 | 92 | -------------------------------------------------------------------------------- /R/recursive_least_squares.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | #' Iteratively calculate the least squares solution 6 | #' 7 | #' Both functions are for testing purposes. There is a faster implementation 8 | #' called `cum_brls_which_abs_max`. 9 | #' 10 | #' @param y a vector with observations 11 | #' @param X a design matrix 12 | #' 13 | #' @returns a matrix where column i is the 14 | #' solution to `y[1:i] ~ X[1:i,]`. 15 | #' 16 | #' 17 | #' @keywords internal 18 | recursive_least_squares <- function(y, X){ 19 | stopifnot(length(y) == nrow(X)) 20 | n <- length(y) 21 | k <- ncol(X) 22 | res <- matrix(NA, nrow = k, ncol = n) 23 | gamma <- solve(crossprod(X[seq_len(k),])) 24 | beta <- gamma %*% t(X[seq_len(k),]) %*% y[seq_len(k)] 25 | res[,k] <- beta 26 | for(idx in seq(k+1, n)){ 27 | yi <- y[idx] 28 | xi <- t(X[idx,,drop=FALSE]) 29 | gamma <- gamma - (gamma %*% xi %*% t(xi) %*% gamma) / c(1 + t(xi) %*% gamma %*% xi) 30 | beta <- beta - gamma %*% xi %*% (t(xi) %*% beta - yi) 31 | res[,idx] <- beta 32 | } 33 | res 34 | } 35 | 36 | 37 | #' Enable pseudobulking and directly calculate the contrast 38 | #' 39 | #' @rdname recursive_least_squares 40 | bulked_recursive_least_squares_contrast <- function(y, X, group, contrast, ridge_penalty = 1e-6){ 41 | stopifnot(length(y) == nrow(X)) 42 | stopifnot(length(y) == length(group)) 43 | if(! is.matrix(contrast)){ 44 | contrast <- matrix(contrast, nrow = 1) 45 | } 46 | stopifnot(nrow(contrast) == 1) 47 | stopifnot(ncol(contrast) == ncol(X)) 48 | 49 | n <- length(y) 50 | k <- ncol(X) 51 | g <- max(group) 52 | 53 | m <- rep(0, g) 54 | count <- rep(0, g) 55 | X_act <- matrix(0, nrow = g, ncol = k) 56 | 57 | res <- matrix(NA, nrow = k, ncol = n) 58 | t_stat <- rep(NA, n) 59 | gamma <- diag(1/ridge_penalty, nrow = k) 60 | beta <- rep(0, k) 61 | rss <- 0 62 | n_obs <- 0 63 | 64 | for(idx in seq(1, n)){ 65 | yi <- y[idx] 66 | xi <- t(X[idx,,drop=FALSE]) 67 | gi <- group[idx] 68 | 69 | # Alternative formula for mu: m[gi] <- (m[gi] * count[gi] + yi) / (count[gi] + 1) 70 | delta_m <- 1/(count[gi] + 1) * yi - (1 - count[gi] / (count[gi] + 1)) * m[gi] 71 | m[gi] <- m[gi] + delta_m 72 | count[gi] <- count[gi] + 1 73 | 74 | if(count[gi] == 1){ 75 | X_act[gi,] <- xi 76 | n_obs <- n_obs + 1L 77 | gamma <- gamma - (gamma %*% xi %*% t(xi) %*% gamma) / c(1 + t(xi) %*% gamma %*% xi) 78 | # Below is a more efficient version of: beta <- gamma %*% t(X_act) %*% m 79 | beta <- beta + gamma %*% xi %*% (m[gi] - t(xi) %*% beta) 80 | }else{ 81 | beta <- beta + gamma %*% (xi * delta_m) 82 | } 83 | # I can't find a recursive way to calculate the residual sum of squares 84 | rss <- max(1e-6, sum((m - X_act %*% beta)^2)) 85 | # Avoid zero or negative numbers 86 | covar <- rss / max(1e-8, n_obs - k) * gamma 87 | se_sq <- contrast %*% covar %*% t(contrast) 88 | if(se_sq > 0){ 89 | t_stat[idx] <- sum(drop(contrast) * beta) / sqrt(se_sq) 90 | } 91 | res[,idx] <- beta 92 | } 93 | list(coef = res, t_stat = t_stat) 94 | } 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /R/ridge_regression.R: -------------------------------------------------------------------------------- 1 | 2 | #' Ridge regression 3 | #' 4 | #' The function does not treat the intercept special. 
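#'
#' A sketch of the approach used in the body below: the ridge penalty is folded into an ordinary
#' least-squares problem by appending penalty rows to the (weight-scaled) design matrix and matching
#' zero columns to `Y`, so that a single QR-based solve returns the penalized coefficients.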
5 | #' 6 | #' @param Y the observations matrix (`features x samples`) 7 | #' @param X the design matrix (`samples x covariates`) 8 | #' @param ridge_penalty a numeric vector or matrix of size (`covariates` or 9 | #' `covariates x covariates` respectively) 10 | #' @param weights a vector of observation weights 11 | #' 12 | #' @returns The matrix of coefficients. 13 | #' 14 | #' @keywords internal 15 | ridge_regression <- function(Y, X, ridge_penalty = 0, weights = rep(1, nrow(X))){ 16 | stopifnot(length(weights) == nrow(X)) 17 | if(! is.matrix(ridge_penalty)){ 18 | stopifnot(length(ridge_penalty) == 1 || length(ridge_penalty) == ncol(X)) 19 | ridge_penalty <- diag(ridge_penalty, nrow = ncol(X)) 20 | } 21 | ridge_penalty_sq <- sqrt(sum(weights)) * (t(ridge_penalty) %*% ridge_penalty) 22 | weights_sqrt <- sqrt(weights) 23 | 24 | X_extended <- rbind(X * weights_sqrt, ridge_penalty_sq) 25 | Y_extended <- cbind(t(t(Y) * weights_sqrt), matrix(0, nrow = nrow(Y), ncol = ncol(X))) 26 | qr <- qr(X_extended) 27 | res <- t(solve(qr, t(Y_extended))) 28 | colnames(res) <- colnames(X) 29 | res 30 | } 31 | -------------------------------------------------------------------------------- /R/test_de.R: -------------------------------------------------------------------------------- 1 | 2 | #' Predict log fold changes between conditions for each cell 3 | #' 4 | #' @param fit the result of calling [`lemur()`] 5 | #' @param contrast Specification of the contrast: a call to `cond()` specifying a full observation 6 | #' (e.g. `cond(treatment = "A", sex = "male") - cond(treatment = "C", sex = "male")` to 7 | #' compare treatment A vs C for male observations). Unspecified factors default to the reference level. 8 | #' @param embedding matrix of size `n_embedding` \eqn{\times} `n` that specifies where in the latent space 9 | #' the differential expression is tested. It defaults to the position of all cells from the original fit. 10 | #' @param consider specify which part of the model are considered for the differential expression test. 11 | #' @param new_assay_name the name of the assay added to the `fit` object. Default: `"DE"`. 12 | #' 13 | #' @returns If `is.null(embedding)` the `fit` object with a new assay called `"DE"`. Otherwise 14 | #' return a matrix with the differential expression values. 
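#' For a typical `cond(...) - cond(...)` contrast, these values are the difference between two
#' `predict()` calls evaluated at the same latent positions, one for each side of the contrast
#' (see the call to `evaluate_contrast_tree()` in the function body below).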
15 | #' 16 | #' @seealso [find_de_neighborhoods] 17 | #' 18 | #' @examples 19 | #' library(SummarizedExperiment) 20 | #' library(SingleCellExperiment) 21 | #' 22 | #' data(glioblastoma_example_data) 23 | #' fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 24 | #' n_emb = 5, verbose = FALSE) 25 | #' # Optional alignment 26 | #' # fit <- align_harmony(fit) 27 | #' fit <- test_de(fit, contrast = cond(condition = "panobinostat") - cond(condition = "ctrl")) 28 | #' 29 | #' # The fit object contains a new assay called "DE" 30 | #' assayNames(fit) 31 | #' 32 | #' # The DE assay captures differences between conditions 33 | #' is_ctrl_cond <- fit$colData$condition == "ctrl" 34 | #' mean(logcounts(fit)[1,!is_ctrl_cond]) - mean(logcounts(fit)[1,is_ctrl_cond]) 35 | #' mean(assay(fit, "DE")[1,]) 36 | #' 37 | #' @export 38 | test_de <- function(fit, 39 | contrast, 40 | embedding = NULL, 41 | consider = c("embedding+linear", "embedding", "linear"), 42 | new_assay_name = "DE"){ 43 | if(is.null(embedding)){ 44 | embedding <- fit$embedding 45 | use_provided_diff_emb <- FALSE 46 | }else{ 47 | use_provided_diff_emb <- TRUE 48 | } 49 | 50 | consider <- match.arg(consider) 51 | with_lm <- consider == "embedding+linear" || consider == "linear" 52 | with_emb <- consider == "embedding+linear" || consider == "embedding" 53 | 54 | cntrst <- parse_contrast({{contrast}}, formula = fit$design) 55 | al_cntrst <- parse_contrast({{contrast}}, formula = fit$alignment_design) 56 | diff <- evaluate_contrast_tree(cntrst, al_cntrst, \(x, y){ 57 | predict(fit, newdesign = x, alignment_design_matrix = y, embedding = embedding, with_linear_model = with_lm, with_embedding = with_emb) 58 | }) 59 | 60 | colnames(diff) <- colnames(embedding) 61 | rownames(diff) <- rownames(fit) 62 | if(use_provided_diff_emb){ 63 | diff 64 | }else{ 65 | assay(fit, new_assay_name) <- diff 66 | metadata(fit)[["contrast"]] <- cntrst 67 | 68 | fit 69 | } 70 | } 71 | 72 | 73 | #' Differential embedding for each condition 74 | #' 75 | #' @inheritParams test_de 76 | #' @param reduced_design an alternative specification of the null hypothesis. 77 | #' @param consider specify which part of the model are considered for the differential expression test. 78 | #' @param variance_est How or if the variance should be estimated. `'analytical'` is only compatible with `consider = "linear"`. `'resampling'` is the most flexible (to adapt the number 79 | #' of resampling iterations, set `n_resampling_iter`. Default: `100`) 80 | #' @param verbose should the method print information during the fitting. Default: `TRUE`. 81 | #' @param ... additional arguments. 82 | #' 83 | #' @return a data.frame 84 | #' 85 | test_global <- function(fit, 86 | contrast, 87 | reduced_design = NULL, 88 | consider = c("embedding+linear", "embedding", "linear"), 89 | variance_est = c("analytical", "resampling", "none"), verbose = TRUE, 90 | ...){ 91 | 92 | 93 | variance_est <- match.arg(variance_est) 94 | full_design <- fit$design_matrix 95 | consider <- match.arg(consider) 96 | with_lm <- consider == "embedding+linear" || consider == "linear" 97 | with_emb <- consider == "embedding+linear" || consider == "embedding" 98 | 99 | # Implement with a likelihood ratio test (see glmGamPoi) 100 | if(is.null(reduced_design) == missing(contrast)){ 101 | stop("Please provide either an alternative design (formula or matrix) or a contrast.") 102 | }else if(! 
missing(contrast)){ 103 | cntrst <- parse_contrast({{contrast}}, formula = fit$design) 104 | if(inherits(cntrst, "contrast_relation") && cntrst$relation == "minus" && 105 | inherits(cntrst$lhs, "model_vec") && inherits(cntrst$rhs, "model_vec")){ 106 | lfc_diffemb <- grassmann_log(grassmann_map(sum_tangent_vectors(fit$coefficients, c(cntrst$lhs)), fit$base_point), 107 | grassmann_map(sum_tangent_vectors(fit$coefficients, c(cntrst$rhs)), fit$base_point)) 108 | cntrst <- cntrst$lhs - cntrst$rhs 109 | }else{ 110 | cntrst <- evaluate_contrast_tree(cntrst, cntrst, \(x, .) x) # Collapse tree 111 | lfc_diffemb <- sum_tangent_vectors(fit$coefficients, c(cntrst)) 112 | } 113 | 114 | cntrst <- as.matrix(cntrst) 115 | if(nrow(cntrst) != ncol(full_design)){ 116 | stop("The length of the contrast vector does not match the number of coefficients in the model (", 117 | ncol(full_design), ")\n", glmGamPoi:::format_matrix(cntrst)) 118 | } 119 | # The modifying matrix of reduced_design has ncol(design_matrix) - 1 columns and full column rank. 120 | # The columns are all orthogonal to cntrst. 121 | # see: https://scicomp.stackexchange.com/a/27835/36204 122 | # Think about this as a rotation of the design matrix. The QR decomposition approach 123 | # has the added benefit that the new columns are all orthogonal to each other, which 124 | # isn't necessary, but makes fitting more robust 125 | # The following is a simplified version of edgeR's glmLRT (line 159 in glmfit.R) 126 | qrc <- qr(cntrst) 127 | rot <- qr.Q(qrc, complete = TRUE)[,-1,drop=FALSE] 128 | reduced_design_mat <- full_design %*% rot 129 | 130 | lfc_linear_model <- fit$linear_coefficients %*% cntrst 131 | }else{ 132 | reduced_design_mat <- handle_design_parameter(reduced_design, fit, fit$colData)$design_matrix 133 | if(ncol(reduced_design_mat) >= ncol(full_design)){ 134 | stop("The reduced model is as complex (or even more complex) than ", 135 | "the 'fit' model. The 'reduced_design' should contain fewer terms ", 136 | "than the original 'design'.") 137 | } 138 | rot <- matrix(lm.fit(full_design, reduced_design_mat)$coefficients, ncol = ncol(reduced_design_mat)) 139 | if(any(abs(reduced_design_mat - full_design %*% rot) > 1e-10)){ 140 | warning("Although the 'reduced_design' matrix has fewer columns than ", 141 | "'fit$design_matrix', it appears that the 'reduced_design' is not ", 142 | "nested in the 'fit$design_matrix'. Accordingly, the results of the ", 143 | "statistical test will be unreliable.") 144 | } 145 | } 146 | 147 | 148 | 149 | if(variance_est == "analytical"){ 150 | if(with_emb){ 151 | stop("Analytical differential embedding test is not implemented. 
You can set 'consider=\"linear\"'") 152 | }else{ # only linear test 153 | resid_full <- as.matrix(residuals(fit, with_linear_model = TRUE, with_embedding = FALSE)) 154 | resid_red <- as.matrix(get_residuals_for_alt_fit(fit, reduced_design_mat = reduced_design_mat, with_linear_model = TRUE, with_embedding = FALSE)) 155 | pval <- multivar_wilks_ftest(RSS_full = resid_full %*% t(resid_full), 156 | RSS_red = resid_red %*% t(resid_red), 157 | n_features = nrow(fit), full_design, reduced_design_mat) 158 | } 159 | }else if(variance_est == "resampling"){ 160 | if("n_resampling_iter" %in% ...names()){ 161 | n_resampling_iter <- list(...)[["n_resampling_iter"]] 162 | }else{ 163 | n_resampling_iter <- 99 164 | } 165 | if(verbose) message("Estimating null distribution of deviance using ", n_resampling_iter, " iterations.") 166 | # Applying the Freedman-Lane (1983) permutation method to the residuals 167 | # Fit the full model 168 | deviance_ref <- sum(residuals(fit, with_linear_model = with_lm, with_embedding = with_emb)^2) 169 | # Fit the reduced model 170 | resid_red <- get_residuals_for_alt_fit(fit, reduced_design_mat = reduced_design_mat, with_linear_model = with_lm, with_embedding = with_emb) 171 | predict_red <- assay(fit, fit$use_assay) - resid_red 172 | deviance_red <- sum(resid_red^2) 173 | deviance_delta_null <- vapply(seq_len(n_resampling_iter), \(iter){ 174 | new_Y <- predict_red + resid_red[,sample.int(ncol(resid_red), replace = FALSE),drop=FALSE] 175 | deviance_ref_new <- sum(get_residuals_for_alt_fit(fit, Y = new_Y, reduced_design_mat = full_design, with_linear_model = with_lm, with_embedding = with_emb)^2) 176 | deviance_red_new <- sum(get_residuals_for_alt_fit(fit, Y = new_Y, reduced_design_mat = reduced_design_mat, with_linear_model = with_lm, with_embedding = with_emb)^2) 177 | deviance_red_new - deviance_ref_new 178 | }, FUN.VALUE = numeric(1L)) 179 | pval <- (sum((deviance_red - deviance_ref) < deviance_delta_null) + 1) / (n_resampling_iter + 1) 180 | }else{ # variance_est == "none" 181 | pval <- NA 182 | } 183 | 184 | 185 | if(! missing(contrast)){ 186 | data.frame(contrast = rlang::as_label(rlang::enquo(contrast)), 187 | pval = pval, 188 | delta_diffemb = I(list(lfc_diffemb)), 189 | delta_linear = I(list(lfc_linear_model)), 190 | angle_degrees = grassmann_angle_from_tangent(lfc_diffemb, normalized = TRUE)) 191 | }else{ 192 | data.frame(full_design = if(! 
is.null(fit$design)) rlang::as_label(fit$design) else rlang::as_label(fit$design_matrix), 193 | reduced_design = rlang::as_label(reduced_design), 194 | pval = pval) 195 | } 196 | } 197 | 198 | 199 | multivar_wilks_ftest <- function(RSS_full, RSS_red, n_features, design_matrix_full, design_matrix_red){ 200 | # Following https://socialsciences.mcmaster.ca/jfox/Books/Companion/appendices/Appendix-Multivariate-Linear-Models.pdf 201 | stopifnot(nrow(design_matrix_full) == nrow(design_matrix_red)) 202 | k1 <- ncol(design_matrix_full) 203 | k2 <- ncol(design_matrix_red) 204 | 205 | lambdas <- Re(eigen((RSS_red - RSS_full) %*% solve(RSS_full))$values) 206 | wilks_lambda <- prod(1/(1 + lambdas)) 207 | df <- k1 - k2 208 | r <- nrow(design_matrix_full) - k1 - 1 - (n_features - df + 1) / 2 209 | u <- (n_features * df - 2) / 4 210 | t <- ifelse(n_features^2 + df^2 - 5 <= 0, 0, sqrt(n_features^2 * df^2 - 4) / (n_features^2 + df^2 - 5)) 211 | fstat <- (1 - wilks_lambda^(1/t)) / wilks_lambda^(1/t) * (r * t - 2 * u) / (n_features * df) 212 | pf(fstat, df1 = n_features * df, df2 = r * t - 2 * u, lower.tail = FALSE) 213 | } 214 | 215 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | randn <- function(n, m, ...){ 2 | matrix(rnorm(n * m, ...), nrow = n, ncol = m) 3 | } 4 | 5 | skew <- function(M){ 6 | 0.5 * (M - t(M)) 7 | } 8 | 9 | sym <- function(M){ 10 | 0.5 * (M + t(M)) 11 | } 12 | 13 | `%update_values%` <- function(x, y){ 14 | if(is.null(x) && is.null(y)){ 15 | NULL 16 | }else if(is.null(x)){ 17 | y 18 | }else if(is.null(y)){ 19 | x 20 | }else{ 21 | for(n in names(y)){ 22 | x[[n]] <- y[[n]] 23 | } 24 | x 25 | } 26 | } 27 | 28 | `%default_to%` <- function(x, y){ 29 | if(is.null(x) && is.null(y)){ 30 | NULL 31 | }else if(is.null(x)){ 32 | y 33 | }else if(is.null(y)){ 34 | x 35 | }else{ 36 | for(n in setdiff(names(y), names(x))){ 37 | x[[n]] <- y[[n]] 38 | } 39 | x 40 | } 41 | } 42 | 43 | 44 | 45 | #' Iterating function that returns a matrix 46 | #' 47 | #' The length of `x` determines the number of rows. The length of 48 | #' `FUN(x[i])` determines the number of columns. Must match `ncol`. 49 | #' 50 | #' @param x the sequence that is mapped to a matrix 51 | #' @param FUN the function that returns a vector of length `ncol` 52 | #' @param ncol the length of the output vector 53 | #' @param ... additional arguments that are passed to `FUN` 54 | #' 55 | #' @returns A matrix with `length(x)` / `nrow(x)` rows and `ncol` columns. 56 | #' For `msply_dbl` the number of columns depends on the output of `FUN`. 57 | #' 58 | #' @keywords internal 59 | mply_dbl <- function(x, FUN, ncol=1, ...){ 60 | if(!is.matrix(x)){ 61 | res <- vapply(x, FUN, FUN.VALUE=rep(0.0, times=ncol), ...) 62 | }else{ 63 | res <- apply(x, 1, FUN, ...) * 1.0 64 | if(nrow(x) > 0 && length(res) == 0){ 65 | # Empty result, make matrix 66 | res <- matrix(numeric(0),nrow=0, ncol=nrow(x)) 67 | }else if(nrow(x) == 0){ 68 | res <- matrix(numeric(0), nrow=ncol, ncol=0) 69 | } 70 | if((ncol == 1 && ! 
is.vector(res)) || (ncol > 1 && nrow(res) != ncol)){ 71 | stop("values must be length ", ncol, ", but result is length ", nrow(res)) 72 | } 73 | } 74 | 75 | if(ncol == 1){ 76 | as.matrix(res, nrow=length(res), ncol=1) 77 | }else{ 78 | t(res) 79 | } 80 | } 81 | 82 | 83 | 84 | #' 85 | #' @describeIn mply_dbl Each list element becomes a row in a matrix 86 | stack_rows <- function(x){ 87 | stopifnot(is.list(x)) 88 | do.call(rbind, x) 89 | } 90 | 91 | #' 92 | #' @describeIn mply_dbl Each list element becomes a column in a matrix 93 | stack_cols <- function(x){ 94 | stopifnot(is.list(x)) 95 | do.call(cbind, x) 96 | } 97 | 98 | #' Make a cube from a list of matrices 99 | #' 100 | #' The length of the list will become the third dimension of the cube. 101 | #' 102 | #' @param x a list of vectors/matrices that are stacked 103 | #' 104 | #' @returns A three-dimensional array. 105 | #' 106 | #' @keywords internal 107 | stack_slice <- function(x){ 108 | stopifnot(is.list(x)) 109 | x <- lapply(x, as.matrix) 110 | if(length(x) == 0){ 111 | array(dim = c(0, 0, 0)) 112 | }else{ 113 | dim <- dim(x[[1]]) 114 | res <- array(NA, dim = c(dim, length(x))) 115 | for(idx in seq_along(x)){ 116 | elem <- x[[idx]] 117 | if(nrow(elem) != dim[1] || ncol(elem) != dim[2]){ 118 | stop("Size doesn't match") 119 | } 120 | res[,,idx] <- elem 121 | } 122 | res 123 | } 124 | } 125 | 126 | #' @describeIn stack_slice Make a list of matrices from a cube 127 | #' 128 | destack_slice <- function(x){ 129 | stopifnot(is.array(x)) 130 | stopifnot(length(dim(x)) == 3) 131 | lapply(seq_len(dim(x)[3]), \(idx) x[,,idx]) 132 | } 133 | 134 | 135 | duplicate_rows <- function(m, times, each){ 136 | ncols <- if(is.matrix(m)) ncol(m) else length(m) 137 | if(missing(times) && missing(each)){ 138 | do.call(rbind, list(m)) 139 | }else if(! missing(times)){ 140 | if(times == 0){ 141 | matrix(nrow = 0, ncol = ncols) 142 | }else{ 143 | do.call(rbind, lapply(seq_len(times), \(i) m)) 144 | } 145 | }else if(! missing(each)){ 146 | if(each == 0){ 147 | matrix(nrow = 0, ncol = ncols) 148 | }else{ 149 | matrix(rep(m, each = each), nrow = each * nrow(m), ncol = ncol(m)) 150 | } 151 | }else{ 152 | stop("Specify either 'times' or 'each'") 153 | } 154 | } 155 | 156 | duplicate_cols <- function(m, times, each){ 157 | t(duplicate_rows(t(m), times = times, each = each)) 158 | } 159 | 160 | 161 | normalize_vec_length <- function(x){ 162 | vec_lens <- sqrt(colSums(x^2)) 163 | t(t(x) / vec_lens) 164 | } 165 | 166 | #' Fold left over a sequence 167 | #' 168 | #' @param init initial value. If not specified, `NULL` is used 169 | #' @param x the sequence to iterate over 170 | #' @param FUN a function with first argument named `elem` and second argument 171 | #' named `accum` 172 | #' 173 | #' 174 | #' @examples 175 | #' \dontrun{ 176 | #' # This produces ... 177 | #' fold_left(0)(1:10, \(elem, accum) accum + elem) 178 | #' # ... the same as 179 | #' sum(1:10) 180 | #' } 181 | #' 182 | #' @returns The final value of `accum`. 
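# --- Editor's illustration (not part of the package source) ------------------
# A small sketch for the cube helpers defined above: `stack_slice()` turns a
# list of equally sized matrices into a 3D array and `destack_slice()` undoes
# it. `slices` is a made-up example object.
slices <- list(matrix(1:4, nrow = 2), matrix(5:8, nrow = 2))
cube <- stack_slice(slices)
dim(cube)                               # 2 x 2 x 2
all.equal(destack_slice(cube), slices)  # should be TRUE
# ------------------------------------------------------------------------------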
183 | #' 184 | #' @keywords internal 185 | fold_left <- function(init){ 186 | if(missing(init)){ 187 | init <- NULL 188 | } 189 | function(x, FUN){ 190 | val <- init 191 | for(elem in x){ 192 | val <- FUN(elem = elem, accum = val) 193 | } 194 | val 195 | } 196 | } 197 | 198 | #' Fold right over a sequence 199 | #' @rdname fold_left 200 | fold_right<- function(init){ 201 | if(missing(init)){ 202 | init <- NULL 203 | } 204 | function(x, FUN){ 205 | val <- init 206 | for(elem in rev(x)){ 207 | val <- FUN(elem = elem, accum = val) 208 | } 209 | val 210 | } 211 | } 212 | 213 | 214 | 215 | resample <- function(size, cluster = NULL){ 216 | if(is.null(cluster)){ 217 | sample.int(size, replace = TRUE) 218 | }else{ 219 | # For a description see "Using Cluster Bootstrapping to Analyze Nested Data With a Few Clusters" 220 | # by Huang (2018) 221 | stopifnot(length(cluster) == size) 222 | indices <- seq_len(size) 223 | cluster_levels <- unique(cluster) 224 | resamp <- sample(cluster_levels, size = length(cluster_levels), replace = TRUE) 225 | unlist(lapply(resamp, \(lvl) indices[cluster == lvl])) 226 | } 227 | } 228 | 229 | matrix_equals <- function(m1, m2){ 230 | all(dim(m1) == dim(m2)) && all(m1 == m2) 231 | } 232 | 233 | 234 | seq_excl <- function(start, end){ 235 | if(start >= end){ 236 | integer(0L) 237 | }else{ 238 | seq(start + 1L, end) 239 | } 240 | } 241 | 242 | which_extreme <- function(x, ignore = NULL){ 243 | if(is.null(ignore)){ 244 | which.max(abs(x)) 245 | }else{ 246 | stopifnot(length(ignore) == length(x)) 247 | extreme_idx <- NA_integer_ 248 | max <- -Inf 249 | for(idx in seq_along(x)){ 250 | if(! ignore[idx] && abs(x[idx]) > max){ 251 | extreme_idx <- idx 252 | max <- abs(x[idx]) 253 | } 254 | } 255 | extreme_idx 256 | } 257 | } 258 | 259 | 260 | aggregate_matrix <- function(mat, group_split, aggr_fnc, col_sel = TRUE, ...){ 261 | group_split_lgl <- lapply(group_split, \(idx){ 262 | lgl <- rep(FALSE, ncol(mat)) 263 | lgl[idx] <- TRUE 264 | lgl 265 | }) 266 | if(all(col_sel == TRUE)){ 267 | new_data_mat <- t(mply_dbl(group_split_lgl, \(split_sel){ 268 | aggr_fnc(mat, cols = split_sel, ...) 269 | }, ncol = nrow(mat))) 270 | }else{ 271 | if(! is.logical(col_sel) && length(col_sel) == ncol(mat)){ 272 | stop("Illegal 'col_sel' argument") 273 | } 274 | new_data_mat <- t(mply_dbl(group_split_lgl, \(split_sel){ 275 | aggr_fnc(mat, cols = split_sel & col_sel, ...) 276 | }, ncol = nrow(mat))) 277 | } 278 | rownames(new_data_mat) <- rownames(mat) 279 | new_data_mat 280 | } 281 | 282 | 283 | limma_eBayes_without_shrinkage <- function(lm_fit){ 284 | lm_fit$t <- lm_fit$coefficients / lm_fit$stdev.unscaled / lm_fit$sigma 285 | lm_fit$p.value <- 2 * pt(-abs(lm_fit$t), df = lm_fit$df.residual) 286 | lm_fit 287 | } 288 | 289 | 290 | nullspace <- function(X){ 291 | dim <- nrow(X) 292 | n_obs <- ncol(X) 293 | if(dim == 0){ 294 | return(matrix(nrow = dim, ncol = 0)) 295 | }else if(n_obs == 0){ 296 | diag(nrow = dim) 297 | } 298 | 299 | qrX <- qr(X) 300 | rank <- qrX$rank 301 | qr.Q(qrX, complete = TRUE)[,seq_excl(rank, dim),drop=FALSE] 302 | } 303 | 304 | is_contrast_estimable <- function(contrast, design_matrix, tol = sqrt(.Machine$double.eps)){ 305 | # The algorithm is inspired by 'lmerTest::is_estimable()'. 
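  # Editor's note: a contrast is estimable when it lies in the row space of the
  # design matrix, i.e. it has no component in the null space of t(design_matrix).
  # For example, in a rank-deficient design with columns (intercept, groupA, groupB)
  # where intercept = groupA + groupB, the difference c(0, 1, -1) is estimable,
  # but the single coefficient c(0, 1, 0) is not. For full-rank designs the null
  # space is empty and every contrast is estimable.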
306 | ns <- nullspace(t(design_matrix)) 307 | if(ncol(ns) == 0){ 308 | return(TRUE) 309 | } 310 | abs(sum(c(contrast) %*% ns)) < tol 311 | } 312 | 313 | #' Moore-Penrose pseudoinverse calculated via SVD 314 | #' 315 | #' In the simplest case, the pseudoinverse is 316 | #' \deqn{X^{+} = (X^T X)^{-1} X^T.} 317 | #' 318 | #' To handle the more general case, the pseudoinverse can expressed using a SVD 319 | #' \eqn{X = U D V^T}: 320 | #' \deqn{X^{+} = V D^{-1} U^T} 321 | #' 322 | #' @param X a matrix X 323 | #' 324 | #' @returns The matrix \eqn{X^{+}}. 325 | #' 326 | #' @keywords internal 327 | pseudoinverse <- function(X){ 328 | # Moore-Penrose inverse via SVD (https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse#Singular_value_decomposition_(SVD)) 329 | # See also MASS::ginv or pracma::pinv 330 | tol <- sqrt(.Machine$double.eps) 331 | svd <- svd(X) 332 | not_null <- svd$d > max(tol * svd$d[1L], 0) 333 | if(all(not_null)){ 334 | with(svd, v %*% (1/d * t(u))) 335 | }else if(all(! not_null)){ 336 | matrix(0, nrow = ncol(X), ncol = nrow(X)) 337 | }else{ 338 | with(svd, v[,not_null,drop=FALSE] %*% (1/d[not_null] * t(u[,not_null,drop=FALSE]))) 339 | } 340 | } 341 | 342 | 343 | 344 | 345 | #' Helper function that makes sure that NA * 0 = 0 in matrix multiply 346 | #' 347 | #' @param X a matrix of size `n*m` 348 | #' @param Y a matrix of size `m*p` 349 | #' 350 | #' @return a matrix of size `n*p` 351 | #' 352 | #' @keywords internal 353 | `%zero_dom_mat_mult%` <- function(X, Y){ 354 | X[is.infinite(X)] <- NA 355 | Y[is.infinite(Y)] <- NA 356 | X_cp <- X 357 | X_cp[is.na(X_cp)] <- 0 358 | Y_cp <- Y 359 | Y_cp[is.na(Y_cp)] <- 0 360 | 361 | res <- X_cp %*% Y_cp 362 | mask1 <- (is.na(X)) %*% (is.na(Y) | Y != 0) 363 | mask2 <- (is.na(X) | X != 0) %*% (is.na(Y)) 364 | res[mask1 + mask2 != 0] <- NA 365 | res 366 | } 367 | 368 | 369 | 370 | as_dgTMatrix <- function(x){ 371 | if(utils::packageVersion("Matrix") >= "1.4.2"){ 372 | # See email from Martin Maechler from 2022-08-12 373 | as(as(as(x, "dMatrix"), "generalMatrix"), "TsparseMatrix") 374 | }else{ 375 | # This approach is deprecated since 1.4.2 and triggers warnings 376 | as(x, "dgTMatrix") 377 | } 378 | } 379 | 380 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r setup, include = FALSE} 8 | if(! 
exists("..options_set") || isFALSE(..options_set)){ 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "80%", 14 | fig.asp = 0.5, 15 | fig.align = "center", 16 | dpi = 300 17 | ) 18 | ..options_set <- TRUE 19 | } 20 | ``` 21 | 22 | ```{r child = "vignettes/Introduction.qmd"} 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /data/glioblastoma_example_data.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/data/glioblastoma_example_data.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Article", 3 | title = "Analysis of multi-condition single-cell data with latent embedding multivariate regression", 4 | author = "Constantin Ahlmann-Eltze and Wolfgang Huber", 5 | journal = "bioRxiv", 6 | year = 2023, 7 | doi = "10.1101/2023.03.06.531268" 8 | ) 9 | -------------------------------------------------------------------------------- /man/align_harmony.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/align.R 3 | \name{align_harmony} 4 | \alias{align_harmony} 5 | \alias{align_by_grouping} 6 | \title{Enforce additional alignment of cell clusters beyond the direct differential embedding} 7 | \usage{ 8 | align_harmony( 9 | fit, 10 | design = fit$alignment_design, 11 | ridge_penalty = 0.01, 12 | max_iter = 10, 13 | ..., 14 | verbose = TRUE 15 | ) 16 | 17 | align_by_grouping( 18 | fit, 19 | grouping, 20 | design = fit$alignment_design, 21 | ridge_penalty = 0.01, 22 | preserve_position_of_NAs = FALSE, 23 | verbose = TRUE 24 | ) 25 | } 26 | \arguments{ 27 | \item{fit}{a \code{lemur_fit} object} 28 | 29 | \item{design}{a specification of the design (matrix or formula) that is used 30 | for the transformation. Default: \code{fit$design_matrix}} 31 | 32 | \item{ridge_penalty}{specification how much the flexibility of the transformation 33 | should be regularized. Default: \code{0.01}} 34 | 35 | \item{max_iter}{argument specific for \code{align_harmony}. The number of iterations. Default: \code{10}} 36 | 37 | \item{...}{additional parameters that are passed on to relevant functions} 38 | 39 | \item{verbose}{Should the method print information during the fitting. Default: \code{TRUE}.} 40 | 41 | \item{grouping}{argument specific for \code{align_by_grouping}. Either a vector which assigns 42 | each cell to one group or a matrix with \code{ncol(fit)} columns where the rows are a soft-assignment 43 | to a cluster (i.e., columns sum to \code{1}). \code{NA}'s are allowed.} 44 | 45 | \item{preserve_position_of_NAs}{argument specific for \code{align_by_grouping}. 46 | Boolean flag to decide if \code{NA}s in the \code{grouping} mean that these cells should stay where they are (if 47 | possible) or if they are free to move around. Default: \code{FALSE}} 48 | } 49 | \value{ 50 | The \code{fit} object with the updated \code{fit$embedding} and \code{fit$alignment_coefficients}. 
51 | } 52 | \description{ 53 | Enforce additional alignment of cell clusters beyond the direct differential embedding 54 | } 55 | \examples{ 56 | data(glioblastoma_example_data) 57 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 58 | n_emb = 5, verbose = FALSE) 59 | # Creating some grouping for illustration 60 | cell_types <- sample(c("tumor cell", "neuron", "leukocyte"), size = ncol(fit), replace = TRUE) 61 | fit_al1 <- align_by_grouping(fit, grouping = cell_types) 62 | 63 | # Alternatively, use harmony to automatically group cells 64 | fit_al2 <- align_harmony(fit) 65 | fit_al2 66 | 67 | # The alignment coefficients are a 3D array 68 | fit_al2$alignment_coefficients 69 | 70 | } 71 | -------------------------------------------------------------------------------- /man/align_impl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/align.R 3 | \name{align_impl} 4 | \alias{align_impl} 5 | \title{Align the points according to some grouping} 6 | \usage{ 7 | align_impl( 8 | embedding, 9 | grouping, 10 | design_matrix, 11 | ridge_penalty = 0.01, 12 | preserve_position_of_NAs = FALSE, 13 | calculate_new_embedding = TRUE 14 | ) 15 | } 16 | \value{ 17 | A list with the new embedding and the coefficients 18 | } 19 | \description{ 20 | Align the points according to some grouping 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/cash-lemur_fit-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemur_fit.R 3 | \name{.DollarNames.lemur_fit} 4 | \alias{.DollarNames.lemur_fit} 5 | \alias{$,lemur_fit-method} 6 | \alias{dollar_methods} 7 | \alias{$<-,lemur_fit-method} 8 | \title{Access values from a \code{lemur_fit}} 9 | \usage{ 10 | \method{.DollarNames}{lemur_fit}(x, pattern = "") 11 | 12 | \S4method{$}{lemur_fit}(x, name) 13 | 14 | \S4method{$}{lemur_fit}(x, name) <- value 15 | } 16 | \arguments{ 17 | \item{x}{the \code{lemur_fit}} 18 | 19 | \item{pattern}{the pattern from looking up potential values interactively} 20 | 21 | \item{name}{the name of the value behind the dollar} 22 | 23 | \item{value}{the replacement value. This only works for \code{colData} and 24 | \code{rowData}.} 25 | } 26 | \value{ 27 | The respective value stored in the \code{lemur_fit} object. 28 | } 29 | \description{ 30 | Access values from a \code{lemur_fit} 31 | } 32 | \seealso{ 33 | \code{\linkS4class{lemur_fit}} for more documentation on the 34 | accessor functions. 
35 | } 36 | -------------------------------------------------------------------------------- /man/figures/README-fig-Neighborhood_size_vs_significance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-Neighborhood_size_vs_significance-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-lemur_umap-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-lemur_umap-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-raw_umap-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-raw_umap-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-tumor_cell_annotation1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-tumor_cell_annotation1-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-tumor_cell_annotation2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-tumor_cell_annotation2-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-tumor_de_neighborhood_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-tumor_de_neighborhood_plot-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-umap_de-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-umap_de-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-umap_de-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-umap_de-2.png -------------------------------------------------------------------------------- /man/figures/README-fig-umap_de2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-umap_de2-1.png -------------------------------------------------------------------------------- /man/figures/README-fig-umap_de3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-umap_de3-1.png -------------------------------------------------------------------------------- 
/man/figures/README-fig-volcano_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/README-fig-volcano_plot-1.png -------------------------------------------------------------------------------- /man/figures/equation_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/equation_schematic.png -------------------------------------------------------------------------------- /man/figures/lemur-art.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/man/figures/lemur-art.jpg -------------------------------------------------------------------------------- /man/find_de_neighborhoods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_de_neighborhoods.R 3 | \name{find_de_neighborhoods} 4 | \alias{find_de_neighborhoods} 5 | \title{Find differential expression neighborhoods} 6 | \usage{ 7 | find_de_neighborhoods( 8 | fit, 9 | group_by, 10 | contrast = fit$contrast, 11 | selection_procedure = c("zscore", "contrast"), 12 | directions = c("random", "contrast", "axis_parallel"), 13 | min_neighborhood_size = 50, 14 | de_mat = SummarizedExperiment::assays(fit)[["DE"]], 15 | test_data = fit$test_data, 16 | test_data_col_data = NULL, 17 | test_method = c("glmGamPoi", "edgeR", "limma", "none"), 18 | continuous_assay_name = fit$use_assay, 19 | count_assay_name = "counts", 20 | size_factor_method = NULL, 21 | design = fit$design, 22 | alignment_design = fit$alignment_design, 23 | add_diff_in_diff = TRUE, 24 | make_neighborhoods_consistent = FALSE, 25 | skip_confounded_neighborhoods = FALSE, 26 | control_parameters = NULL, 27 | verbose = TRUE 28 | ) 29 | } 30 | \arguments{ 31 | \item{fit}{the \code{lemur_fit} generated by \code{lemur()}} 32 | 33 | \item{group_by}{If the \code{independent_matrix} is provided, \code{group_by} defines 34 | how the pseudobulks are formed. This is typically the variable in the column 35 | data that represents the independent unit of replication of the experiment 36 | (e.g., the mouse or patient ID). The argument has to be wrapped in \code{vars(...)}.} 37 | 38 | \item{contrast}{a specification which contrast to fit. This defaults to the 39 | \code{contrast} argument that was used for \code{test_de} and is stored in \code{fit$contrast}.} 40 | 41 | \item{selection_procedure}{specify the algorithm that is used to select the 42 | neighborhoods for each gene. Broadly, \code{selection_procedure = "zscore"} is faster 43 | but less precise than \code{selection_procedure = "contrast"}.} 44 | 45 | \item{directions}{a string to define the algorithm to select the direction onto 46 | which the cells are projected before searching for the neighborhood. 47 | \code{directions = "random"} produces denser neighborhoods, whereas \code{directions = "contrast"} 48 | has usually more power. \cr 49 | Alternatively, this can also be a matrix with one direction for each gene 50 | (i.e., a matrix of size \code{nrow(fit) * fit$n_embedding}).} 51 | 52 | \item{min_neighborhood_size}{the minimum number of cells per neighborhood. 
Default: \code{50}.} 53 | 54 | \item{de_mat}{the matrix with the differential expression values and is only relevant if 55 | \code{selection_procedure = "zscore"} or \code{directions = "random"}. Defaults 56 | to an assay called \code{"DE"} that is produced by \code{lemur::test_de()}.} 57 | 58 | \item{test_data}{a \code{SummarizedExperiment} object or a named list of matrices. The 59 | data is used to test if the neighborhood inferred on the training data contain a 60 | reliable significant change. If \code{test_method} is \code{"glmGamPoi"} or \code{"edgeR"} a test 61 | using raw counts is conducted and two matching assays are needed: (1) the continuous 62 | assay (with \code{continuous_assay_name}) is projected onto the LEMUR fit to find the latent 63 | position of each cell and (2) the count assay (\code{count_assay_name}) is used for 64 | forming the pseudobulk. If \code{test_method == "limma"}, only the continuous assay is needed. \cr 65 | The arguments defaults to the test data split of when calling \code{lemur()}.} 66 | 67 | \item{test_data_col_data}{additional column data for the \code{test_data} argument.} 68 | 69 | \item{test_method}{choice of test for the pseudobulked differential expression. 70 | \href{https://bioconductor.org/packages/glmGamPoi/}{glmGamPoi} and 71 | \href{https://bioconductor.org/packages/edgeR/}{edgeR} work on an count assay. 72 | \href{http://bioconductor.org/packages/limma/}{limma} works on the continuous assay.} 73 | 74 | \item{continuous_assay_name, count_assay_name}{the assay or list names of \code{independent_data}.} 75 | 76 | \item{size_factor_method}{Set the procedure to calculate the size factor after pseudobulking. This argument 77 | is only relevant if \code{test_method} is \code{"glmGamPoi"} or \code{"edgeR"}. If \code{fit} is subsetted, using a 78 | vector with the sequencing depth per cell ensures reasonable results. 79 | Default: \code{NULL} which means that \code{colSums(assay(fit$test_data, count_assay_name))} is used.} 80 | 81 | \item{design, alignment_design}{the design to use for the fit. Default: \code{fit$design}} 82 | 83 | \item{add_diff_in_diff}{a boolean to specify if the log-fold change (plus significance) of 84 | the DE in the neighborhood against the DE in the complement of the neighborhood is calculated. 85 | If \code{TRUE}, the result includes three additional columns starting with \code{"did_"} short for 86 | difference-in-difference. Default: \code{TRUE}.} 87 | 88 | \item{make_neighborhoods_consistent}{Include cells from outside the neighborhood if they are 89 | at least 10 times in the k-nearest neighbors of the cells inside the neighborhood. Secondly, 90 | remove cells from the neighborhood which are less than 10 times in the k-nearest neighbors of the 91 | other cells in the neighborhood. Default \code{FALSE}} 92 | 93 | \item{skip_confounded_neighborhoods}{Sometimes the inferred neighborhoods are not limited to 94 | a single cell state; this becomes problematic if the cells of the conditions compared in the contrast 95 | are unequally distributed between the cell states. Default: \code{FALSE}} 96 | 97 | \item{control_parameters}{named list with additional parameters passed to underlying functions.} 98 | 99 | \item{verbose}{Should the method print information during the fitting. 
Default: \code{TRUE}.} 100 | } 101 | \value{ 102 | a data frame with one entry per gene 103 | \describe{ 104 | \item{\code{name}}{The gene name.} 105 | \item{\code{neighborhood}}{A list column where each element is a vector with the cell names included 106 | in that neighborhood.} 107 | \item{\code{n_cells}}{the number of cells in the neighborhood (\code{lengths(neighborhood)}).} 108 | \item{\code{sel_statistic}}{The statistic that is maximized by the \code{selection_procedure}.} 109 | \item{\code{pval}, \code{adj_pval}, \code{t_statistic}, \code{lfc}}{The p-value, Benjamini-Hochberg adjusted p-value (FDR), the 110 | t-statistic, and the log2 fold change of the differential expression test defined by \code{contrast} for the 111 | cells inside the neighborhood (calculated using \code{test_method}). Only present if \code{test_data} is not \code{NULL}.} 112 | \item{\code{did_pval}, \code{did_adj_pval}, \code{did_lfc}}{The measurement if the differential expression of the cells 113 | inside the neighborhood is significantly different from the differential expression of the cells outside 114 | the neighborhood. Only present if \code{add_diff_in_diff = TRUE}.} 115 | } 116 | } 117 | \description{ 118 | Find differential expression neighborhoods 119 | } 120 | \examples{ 121 | data(glioblastoma_example_data) 122 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 123 | n_emb = 5, verbose = FALSE) 124 | # Optional alignment 125 | # fit <- align_harmony(fit) 126 | fit <- test_de(fit, contrast = cond(condition = "panobinostat") - cond(condition = "ctrl")) 127 | nei <- find_de_neighborhoods(fit, group_by = vars(patient_id)) 128 | head(nei) 129 | 130 | } 131 | -------------------------------------------------------------------------------- /man/fold_left.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{fold_left} 4 | \alias{fold_left} 5 | \alias{fold_right} 6 | \title{Fold left over a sequence} 7 | \usage{ 8 | fold_left(init) 9 | 10 | fold_right(init) 11 | } 12 | \arguments{ 13 | \item{init}{initial value. If not specified \code{NULL}} 14 | 15 | \item{x}{the sequence to iterate over} 16 | 17 | \item{FUN}{a function with first argument named \code{elem} and second argument 18 | named \code{accum}} 19 | } 20 | \value{ 21 | The final value of \code{accum}. 22 | } 23 | \description{ 24 | Fold left over a sequence 25 | 26 | Fold right over a sequence 27 | } 28 | \examples{ 29 | \dontrun{ 30 | # This produces ... 31 | fold_left(0)(1:10, \(elem, accum) accum + elem) 32 | # ... the same as 33 | sum(1:10) 34 | } 35 | 36 | } 37 | \keyword{internal} 38 | -------------------------------------------------------------------------------- /man/glioblastoma_example_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/glioblastoma_example_data.R 3 | \name{glioblastoma_example_data} 4 | \alias{glioblastoma_example_data} 5 | \title{The \code{glioblastoma_example_data} dataset} 6 | \value{ 7 | A \code{\link{SingleCellExperiment}} object. 8 | } 9 | \description{ 10 | The dataset is a \code{\link{SingleCellExperiment}} object subset to 5,000 cells and 11 | 300 genes. The \code{colData} contain an entry for each cell from which patient 12 | it came and to which treatment condition it belonged (\code{"ctrl"} or \code{"panobinostat"}). 
13 | } 14 | \details{ 15 | The original data was collected by Zhao et al. (2021). 16 | } 17 | \references{ 18 | \itemize{ 19 | \item Zhao, Wenting, Athanassios Dovas, Eleonora Francesca Spinazzi, Hanna Mendes Levitin, Matei Alexandru Banu, Pavan Upadhyayula, Tejaswi Sudhakar, et al. 20 | “Deconvolution of Cell Type-Specific Drug Responses in Human Tumor Tissue with Single-Cell RNA-Seq.” Genome Medicine 13, no. 1 21 | (December 2021): 82. https://doi.org/10.1186/s13073-021-00894-y. 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /man/grapes-zero_dom_mat_mult-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{\%zero_dom_mat_mult\%} 4 | \alias{\%zero_dom_mat_mult\%} 5 | \title{Helper function that makes sure that NA * 0 = 0 in matrix multiply} 6 | \usage{ 7 | X \%zero_dom_mat_mult\% Y 8 | } 9 | \arguments{ 10 | \item{X}{a matrix of size \code{n*m}} 11 | 12 | \item{Y}{a matrix of size \code{m*p}} 13 | } 14 | \value{ 15 | a matrix of size \code{n*p} 16 | } 17 | \description{ 18 | Helper function that makes sure that NA * 0 = 0 in matrix multiply 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/grassmann_geodesic_regression.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geodesic_regression.R 3 | \name{grassmann_geodesic_regression} 4 | \alias{grassmann_geodesic_regression} 5 | \title{Solve d(P, exp_p(V * x))^2 for V} 6 | \usage{ 7 | grassmann_geodesic_regression( 8 | coordsystems, 9 | design, 10 | base_point, 11 | weights = 1, 12 | tangent_regression = FALSE 13 | ) 14 | } 15 | \value{ 16 | A three-dimensional array with the coefficients \code{V}. 17 | } 18 | \description{ 19 | Solve d(P, exp_p(V * x))^2 for V 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/grassmann_lm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geodesic_regression.R 3 | \name{grassmann_lm} 4 | \alias{grassmann_lm} 5 | \title{Solve ||Y - exp_p(V * x) Y ||^2_2 for V} 6 | \usage{ 7 | grassmann_lm(data, design, base_point, tangent_regression = FALSE) 8 | } 9 | \value{ 10 | A three-dimensional array with the coefficients \code{V}. 11 | } 12 | \description{ 13 | Solve ||Y - exp_p(V * x) Y ||^2_2 for V 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/harmony_new_object.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/harmony_wrapper.R 3 | \name{harmony_new_object} 4 | \alias{harmony_new_object} 5 | \title{Create an arbitrary Harmony object so that I can modify it later} 6 | \usage{ 7 | harmony_new_object() 8 | } 9 | \value{ 10 | The full \code{\link{harmony}} object (R6 reference class type). 
11 | } 12 | \description{ 13 | Create an arbitrary Harmony object so that I can modify it later 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/lemur.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemur.R 3 | \name{lemur} 4 | \alias{lemur} 5 | \title{Main function to fit the latent embedding multivariate regression (LEMUR) model} 6 | \usage{ 7 | lemur( 8 | data, 9 | design = ~1, 10 | col_data = NULL, 11 | n_embedding = 15, 12 | linear_coefficient_estimator = c("linear", "mean", "cluster_median", "zero"), 13 | use_assay = "logcounts", 14 | test_fraction = 0.2, 15 | ..., 16 | verbose = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{a matrix with observations in the columns and features in the rows, 21 | or a \code{SummarizedExperiment} / \code{SingleCellExperiment} object} 22 | 23 | \item{design}{a formula referring to global objects, 24 | \code{colData} (column annotations) of \code{data} if it is a \code{SummarizedExperiment} / \code{SingleCellExperiment} object, 25 | or column names of the \code{col_data} argument} 26 | 27 | \item{col_data}{an optional data frame with \code{ncol(data)} rows, with 28 | annotations ("metadata") on the observations, i.e, on the columns of \code{data}.} 29 | 30 | \item{n_embedding}{the dimension of the linear subspace (latent space).} 31 | 32 | \item{linear_coefficient_estimator}{specify which estimator is used to center the conditions. 33 | \code{"linear"} runs simple regression. It works well in many circumstances, but can produce unsatisfactory 34 | results if the composition of the cell types changes drastically between conditions (e.g., one cell type 35 | disappears). \code{"mean"}, \code{"cluster_median"} and \code{"zero"} are alternative estimators, which 36 | are each supposed to be more robust against compositional changes, but cannot account 37 | for genes that change for all cells between conditions. 38 | \code{"linear"} is the default as it works best with subsequent alignment steps.} 39 | 40 | \item{use_assay}{if \code{data} is a \code{SummarizedExperiment} / \code{SingleCellExperiment} object: 41 | which of its \code{assay} slots should be used?} 42 | 43 | \item{test_fraction}{the fraction of cells (observations) that are set aside before the model fit, to keep an 44 | independent set of test observations. Alternatively, a logical vector of length \code{ncol(data)}.} 45 | 46 | \item{...}{additional parameters that are passed on to the internal function \code{lemur_impl}.} 47 | 48 | \item{verbose}{Should the method print information during the fitting.} 49 | } 50 | 51 | \value{ 52 | An object of class \code{lemur_fit}, which extends \code{\link{SingleCellExperiment}}. Accordingly, 53 | all functions that work for that class also work for \code{lemur_fit} 54 | objects. In addition, slots of these objects can be accessed using the 55 | dollar notation, e.g., \code{fit$embedding}). 56 | For details see the \linkS4class{lemur_fit} help page. 57 | } 58 | \description{ 59 | The main function of this package, to fit the latent embedding multivariate regression (LEMUR) model. 60 | } 61 | \examples{ 62 | data("glioblastoma_example_data") 63 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition) 64 | fit 65 | 66 | } 67 | \references{ 68 | \itemize{ 69 | \item Ahlmann-Eltze, C. & Huber, W. (2023). 
Analysis of multi-condition single-cell data with latent 70 | embedding multivariate regression. bioRxiv \url{https://doi.org/10.1101/2023.03.06.531268} 71 | } 72 | } 73 | \seealso{ 74 | \code{\link{align_by_grouping}}, \code{\link{align_harmony}}, \code{\link{test_de}}, \code{\link{find_de_neighborhoods}} 75 | } 76 | -------------------------------------------------------------------------------- /man/lemur_fit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lemur_fit.R 3 | \docType{class} 4 | \name{lemur_fit-class} 5 | \alias{lemur_fit-class} 6 | \alias{.lemur_fit} 7 | \alias{lemur_fit} 8 | \alias{[,lemur_fit,ANY,ANY,ANY-method} 9 | \alias{design,lemur_fit-method} 10 | \title{The \code{lemur_fit} class} 11 | \usage{ 12 | \S4method{[}{lemur_fit,ANY,ANY,ANY}(x, i, j, ..., drop = TRUE) 13 | 14 | \S4method{design}{lemur_fit}(object) 15 | } 16 | \arguments{ 17 | \item{x, i, j, ..., drop}{the \code{lemur_fit} object and indices for the \code{[} subsetting operator} 18 | 19 | \item{object}{the \code{lemur_fit} object for the \code{\link[BiocGenerics:dge]{BiocGenerics::design}} generic} 20 | } 21 | \value{ 22 | An object of class \code{lemur_fit}. 23 | } 24 | \description{ 25 | The \code{lemur_fit} class extends \code{\link{SingleCellExperiment}} and provides 26 | additional accessors to get the values of the values produced by \code{\link{lemur}}. 27 | } 28 | \details{ 29 | To access the values produced by \code{\link{lemur}}, use the dollar notation (\code{$}): 30 | \describe{ 31 | \item{\code{fit$n_embedding}}{the number of embedding dimensions.} 32 | \item{\code{fit$design}}{the specification of the design in \code{\link{lemur}}. Usually this is a \code{\link[stats:formula]{stats::formula}}.} 33 | \item{\code{fit$base_point}}{a matrix (\code{nrow(fit) * fit$n_embedding}) with the base point for the Grassmann exponential map.} 34 | \item{\code{fit$coefficients}}{a three-dimensional tensor (\code{nrow(fit) * fit$n_embedding * ncol(fit$design_matrix)}) with the coefficients for 35 | the exponential map.} 36 | \item{\code{fit$embedding}}{a matrix (\code{fit$n_embedding * ncol(fit)}) with the low dimensional position for each cell.} 37 | \item{\code{fit$design_matrix}}{a matrix with covariates for each cell (\code{ncol(fit) * ncol(fit$design_matrix)}).} 38 | \item{\code{fit$linear_coefficients}}{a matrix (\code{nrow(fit) * ncol(fit$design_matrix)}) with the coefficients for the linear regression.} 39 | \item{\code{fit$alignment_coefficients}}{a 3D tensor with the coefficients for the alignment (\code{fit$n_embedding * fit$n_embedding * ncol(fit$design_matrix)})} 40 | \item{\code{fit$alignment_design}}{an alternative design specification for the alignment. 
This is typically a \code{\link[stats:formula]{stats::formula}}.} 41 | \item{\code{fit$alignment_design_matrix}}{an alternative design matrix specification for the alignment.} 42 | \item{\code{fit$contrast}}{a parsed version of the contrast specification from the \code{test_de} function or \code{NULL}.} 43 | \item{\code{fit$colData}}{the column annotation \code{DataFrame}.} 44 | \item{\code{fit$rowData}}{the row annotation \code{DataFrame}.} 45 | } 46 | } 47 | \examples{ 48 | # The easiest way to make a lemur_fit object, is to call `lemur` 49 | data(glioblastoma_example_data) 50 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 51 | n_emb = 5, verbose = FALSE) 52 | 53 | fit$n_embedding 54 | fit$embedding[,1:10] 55 | fit$n_embedding 56 | fit$embedding[,1:10] 57 | fit$design_matrix[1:10,] 58 | fit$coefficients[1:3,,] 59 | 60 | } 61 | \seealso{ 62 | \code{\link{lemur}}, \code{\link[=predict.lemur_fit]{predict}}, \code{\link[=residuals,lemur_fit-method]{residuals}} 63 | } 64 | -------------------------------------------------------------------------------- /man/mply_dbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{mply_dbl} 4 | \alias{mply_dbl} 5 | \alias{stack_rows} 6 | \alias{stack_cols} 7 | \title{Iterating function that returns a matrix} 8 | \usage{ 9 | mply_dbl(x, FUN, ncol = 1, ...) 10 | 11 | stack_rows(x) 12 | 13 | stack_cols(x) 14 | } 15 | \arguments{ 16 | \item{x}{the sequence that is mapped to a matrix} 17 | 18 | \item{FUN}{the function that returns a vector of length \code{ncol}} 19 | 20 | \item{ncol}{the length of the output vector} 21 | 22 | \item{...}{additional arguments that are passed to \code{FUN}} 23 | } 24 | \value{ 25 | A matrix with \code{length(x)} / \code{nrow(x)} rows and \code{ncol} columns. 26 | For \code{msply_dbl} the number of columns depends on the output of \code{FUN}. 27 | } 28 | \description{ 29 | The length of \code{x} determines the number of rows. The length of 30 | \code{FUN(x[i])} determines the number of columns. Must match \code{ncol}. 31 | } 32 | \section{Functions}{ 33 | \itemize{ 34 | \item \code{stack_rows()}: Each list element becomes a row in a matrix 35 | 36 | \item \code{stack_cols()}: Each list element becomes a row in a matrix 37 | 38 | }} 39 | \keyword{internal} 40 | -------------------------------------------------------------------------------- /man/one_hot_encoding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/align.R 3 | \name{one_hot_encoding} 4 | \alias{one_hot_encoding} 5 | \title{Take a vector and convert it to a one-hot encoded matrix} 6 | \usage{ 7 | one_hot_encoding(groups) 8 | } 9 | \value{ 10 | A matrix with \code{length(unique(groups))} rows and \code{length(groups)} columns. 
11 | } 12 | \description{ 13 | Take a vector and convert it to a one-hot encoded matrix 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /man/predict.lemur_fit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict.R 3 | \name{predict.lemur_fit} 4 | \alias{predict.lemur_fit} 5 | \title{Predict values from \code{lemur_fit} object} 6 | \usage{ 7 | \method{predict}{lemur_fit}( 8 | object, 9 | newdata = NULL, 10 | newdesign = NULL, 11 | newcondition = NULL, 12 | embedding = object$embedding, 13 | with_linear_model = TRUE, 14 | with_embedding = TRUE, 15 | with_alignment = TRUE, 16 | ... 17 | ) 18 | } 19 | \arguments{ 20 | \item{object}{an \code{lemur_fit} object} 21 | 22 | \item{newdata}{a data.frame which passed to \code{\link{model.matrix}} with 23 | \code{design} to make the \code{newdesign} matrix} 24 | 25 | \item{newdesign}{a matrix with the covariates for which the output 26 | is predicted. If \code{NULL}, the \code{object$design_matrix} is used. If 27 | it is a vector it is repeated \code{ncol(embedding)} times to create 28 | a design matrix with the same entry for each cell.} 29 | 30 | \item{newcondition}{an unquoted expression with a call to \code{cond()} specifying 31 | the covariates of the prediction. See the \code{contrast} argument in \link{test_de} 32 | for more details. Note that combinations of multiple calls to \code{cond()} are 33 | not allowed (e.g., \code{cond(a = 1) - cond(a = 2)}). If specified, \code{newdata} 34 | and \code{newdesign} are ignored.} 35 | 36 | \item{embedding}{the low-dimensional cell position for which the 37 | output is predicted.} 38 | 39 | \item{with_linear_model}{a boolean to indicate if the linear regression 40 | offset is included in the prediction.} 41 | 42 | \item{with_embedding}{a boolean to indicate if the embedding contributes 43 | to the output.} 44 | 45 | \item{with_alignment}{a boolean to indicate if the alignment effect 46 | is removed from the output.} 47 | 48 | \item{...}{additional parameters passed to \code{predict_impl}.} 49 | } 50 | \value{ 51 | A matrix with the same dimension \code{nrow(object) * nrow(newdesign)}. 
52 | } 53 | \description{ 54 | Predict values from \code{lemur_fit} object 55 | } 56 | \examples{ 57 | 58 | data(glioblastoma_example_data) 59 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 60 | n_emb = 5, verbose = FALSE) 61 | 62 | pred <- predict(fit) 63 | 64 | pred_ctrl <- predict(fit, newdesign = c(1, 0, 0, 0, 0, 0)) 65 | pred_trt <- predict(fit, newdesign = c(1, 0, 0, 0, 0, 1)) 66 | # This is the same as the test_de result 67 | fit <- test_de(fit, cond(condition = "panobinostat") - cond(condition = "ctrl")) 68 | all.equal(SummarizedExperiment::assay(fit, "DE"), pred_trt - pred_ctrl, 69 | check.attributes = FALSE) 70 | 71 | } 72 | \seealso{ 73 | \code{\link[=residuals,lemur_fit-method]{residuals}} 74 | } 75 | -------------------------------------------------------------------------------- /man/project_on_lemur_fit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/project_on_fit.R 3 | \name{project_on_lemur_fit} 4 | \alias{project_on_lemur_fit} 5 | \title{Project new data onto the latent spaces of an existing lemur fit} 6 | \usage{ 7 | project_on_lemur_fit( 8 | fit, 9 | data, 10 | col_data = NULL, 11 | use_assay = "logcounts", 12 | design = fit$design, 13 | alignment_design = fit$alignment_design, 14 | return = c("matrix", "lemur_fit") 15 | ) 16 | } 17 | \arguments{ 18 | \item{fit}{an \code{lemur_fit} object} 19 | 20 | \item{data}{a matrix with observations in the columns and features in the rows. 21 | Or a \code{SummarizedExperiment} / \code{SingleCellExperiment} object. The features must 22 | match the features in \code{fit}.} 23 | 24 | \item{col_data}{col_data an optional data frame with \code{ncol(data)} rows.} 25 | 26 | \item{use_assay}{if \code{data} is a \code{SummarizedExperiment} / \code{SingleCellExperiment} object, 27 | which assay should be used.} 28 | 29 | \item{design, alignment_design}{the design formulas or design matrices that are used 30 | to project the data on the correct latent subspace. Both default to the designs 31 | from the \code{fit} object.} 32 | 33 | \item{return}{which data structure is returned.} 34 | } 35 | \value{ 36 | Either a matrix with the low-dimensional embeddings of the \code{data} or 37 | an object of class \code{lemur_fit} wrapping that embedding. 38 | } 39 | \description{ 40 | Project new data onto the latent spaces of an existing lemur fit 41 | } 42 | \examples{ 43 | 44 | data(glioblastoma_example_data) 45 | 46 | subset1 <- glioblastoma_example_data[,1:2500] 47 | subset2 <- glioblastoma_example_data[,2501:5000] 48 | 49 | fit <- lemur(subset1, design = ~ condition, n_emb = 5, 50 | test_fraction = 0, verbose = FALSE) 51 | 52 | # Returns a `lemur_fit` object with the projection of `subset2` 53 | fit2 <- project_on_lemur_fit(fit, subset2, return = "lemur_fit") 54 | fit2 55 | 56 | 57 | 58 | } 59 | -------------------------------------------------------------------------------- /man/pseudoinverse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{pseudoinverse} 4 | \alias{pseudoinverse} 5 | \title{Moore-Penrose pseudoinverse calculated via SVD} 6 | \usage{ 7 | pseudoinverse(X) 8 | } 9 | \arguments{ 10 | \item{X}{a matrix X} 11 | } 12 | \value{ 13 | The matrix \eqn{X^{+}}. 
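% Editor's note (illustrative, not roxygen-generated): when X is rank deficient,
% the implementation inverts only the singular values above the tolerance, i.e.
% X^{+} = V_r D_r^{-1} U_r^T with r the numerical rank, and it returns an
% all-zero matrix when every singular value falls below the tolerance.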
14 | } 15 | \description{ 16 | In the simplest case, the pseudoinverse is 17 | \deqn{X^{+} = (X^T X)^{-1} X^T.} 18 | } 19 | \details{ 20 | To handle the more general case, the pseudoinverse can expressed using a SVD 21 | \eqn{X = U D V^T}: 22 | \deqn{X^{+} = V D^{-1} U^T} 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/recursive_least_squares.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/recursive_least_squares.R 3 | \name{recursive_least_squares} 4 | \alias{recursive_least_squares} 5 | \alias{bulked_recursive_least_squares_contrast} 6 | \title{Iteratively calculate the least squares solution} 7 | \usage{ 8 | recursive_least_squares(y, X) 9 | 10 | bulked_recursive_least_squares_contrast( 11 | y, 12 | X, 13 | group, 14 | contrast, 15 | ridge_penalty = 1e-06 16 | ) 17 | } 18 | \arguments{ 19 | \item{y}{a vector with observations} 20 | 21 | \item{X}{a design matrix} 22 | } 23 | \value{ 24 | a matrix where column i is the 25 | solution to \code{y[1:i] ~ X[1:i,]}. 26 | } 27 | \description{ 28 | Both functions are for testing purposes. There is a faster implementation 29 | called \code{cum_brls_which_abs_max}. 30 | } 31 | \keyword{internal} 32 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_de_neighborhoods.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{vars} 7 | \title{Objects exported from other packages} 8 | \value{ 9 | see \link[glmGamPoi:vars]{glmGamPoi::vars}. 10 | } 11 | \examples{ 12 | # `vars` quotes expressions (just like in dplyr) 13 | vars(condition, sample) 14 | 15 | } 16 | \keyword{internal} 17 | \description{ 18 | These objects are imported from other packages. Follow the links 19 | below to see their documentation. 20 | 21 | \describe{ 22 | \item{glmGamPoi}{\code{\link[glmGamPoi]{vars}}} 23 | }} 24 | 25 | -------------------------------------------------------------------------------- /man/residuals-lemur_fit-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict.R 3 | \name{residuals,lemur_fit-method} 4 | \alias{residuals,lemur_fit-method} 5 | \title{Predict values from \code{lemur_fit} object} 6 | \usage{ 7 | \S4method{residuals}{lemur_fit}(object, with_linear_model = TRUE, with_embedding = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{object}{an \code{lemur_fit} object} 11 | 12 | \item{with_linear_model}{a boolean to indicate if the linear regression 13 | offset is included in the prediction.} 14 | 15 | \item{with_embedding}{a boolean to indicate if the embedding contributes 16 | to the output.} 17 | 18 | \item{...}{ignored.} 19 | } 20 | \value{ 21 | A matrix with the same dimension \code{dim(object)}. 
22 | } 23 | \description{ 24 | Predict values from \code{lemur_fit} object 25 | } 26 | \examples{ 27 | data(glioblastoma_example_data) 28 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 29 | n_emb = 5, verbose = FALSE) 30 | 31 | resid <- residuals(fit) 32 | dim(resid) 33 | 34 | 35 | } 36 | \seealso{ 37 | \link{predict.lemur_fit} 38 | } 39 | -------------------------------------------------------------------------------- /man/ridge_regression.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ridge_regression.R 3 | \name{ridge_regression} 4 | \alias{ridge_regression} 5 | \title{Ridge regression} 6 | \usage{ 7 | ridge_regression(Y, X, ridge_penalty = 0, weights = rep(1, nrow(X))) 8 | } 9 | \arguments{ 10 | \item{Y}{the observations matrix (\verb{features x samples})} 11 | 12 | \item{X}{the design matrix (\verb{samples x covariates})} 13 | 14 | \item{ridge_penalty}{a numeric vector or matrix of size (\code{covariates} or 15 | \verb{covariates x covariates} respectively)} 16 | 17 | \item{weights}{a vector of observation weights} 18 | } 19 | \value{ 20 | The matrix of coefficients. 21 | } 22 | \description{ 23 | The function does not treat the intercept special. 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/stack_slice.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{stack_slice} 4 | \alias{stack_slice} 5 | \alias{destack_slice} 6 | \title{Make a cube from a list of matrices} 7 | \usage{ 8 | stack_slice(x) 9 | 10 | destack_slice(x) 11 | } 12 | \arguments{ 13 | \item{x}{a list of vectors/matrices that are stacked} 14 | } 15 | \value{ 16 | A three-dimensional array. 17 | } 18 | \description{ 19 | The length of the list will become the third dimension of the cube. 20 | } 21 | \section{Functions}{ 22 | \itemize{ 23 | \item \code{destack_slice()}: Make a list of matrices from a cube 24 | 25 | }} 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/test_de.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/test_de.R 3 | \name{test_de} 4 | \alias{test_de} 5 | \title{Predict log fold changes between conditions for each cell} 6 | \usage{ 7 | test_de( 8 | fit, 9 | contrast, 10 | embedding = NULL, 11 | consider = c("embedding+linear", "embedding", "linear"), 12 | new_assay_name = "DE" 13 | ) 14 | } 15 | \arguments{ 16 | \item{fit}{the result of calling \code{\link[=lemur]{lemur()}}} 17 | 18 | \item{contrast}{Specification of the contrast: a call to \code{cond()} specifying a full observation 19 | (e.g. \code{cond(treatment = "A", sex = "male") - cond(treatment = "C", sex = "male")} to 20 | compare treatment A vs C for male observations). Unspecified factors default to the reference level.} 21 | 22 | \item{embedding}{matrix of size \code{n_embedding} \eqn{\times} \code{n} that specifies where in the latent space 23 | the differential expression is tested. 
It defaults to the position of all cells from the original fit.} 24 | 25 | \item{consider}{specify which part of the model are considered for the differential expression test.} 26 | 27 | \item{new_assay_name}{the name of the assay added to the \code{fit} object. Default: \code{"DE"}.} 28 | } 29 | \value{ 30 | If \code{is.null(embedding)} the \code{fit} object with a new assay called \code{"DE"}. Otherwise 31 | return a matrix with the differential expression values. 32 | } 33 | \description{ 34 | Predict log fold changes between conditions for each cell 35 | } 36 | \examples{ 37 | library(SummarizedExperiment) 38 | library(SingleCellExperiment) 39 | 40 | data(glioblastoma_example_data) 41 | fit <- lemur(glioblastoma_example_data, design = ~ patient_id + condition, 42 | n_emb = 5, verbose = FALSE) 43 | # Optional alignment 44 | # fit <- align_harmony(fit) 45 | fit <- test_de(fit, contrast = cond(condition = "panobinostat") - cond(condition = "ctrl")) 46 | 47 | # The fit object contains a new assay called "DE" 48 | assayNames(fit) 49 | 50 | # The DE assay captures differences between conditions 51 | is_ctrl_cond <- fit$colData$condition == "ctrl" 52 | mean(logcounts(fit)[1,!is_ctrl_cond]) - mean(logcounts(fit)[1,is_ctrl_cond]) 53 | mean(assay(fit, "DE")[1,]) 54 | 55 | } 56 | \seealso{ 57 | \link{find_de_neighborhoods} 58 | } 59 | -------------------------------------------------------------------------------- /man/test_global.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/test_de.R 3 | \name{test_global} 4 | \alias{test_global} 5 | \title{Differential embedding for each condition} 6 | \usage{ 7 | test_global( 8 | fit, 9 | contrast, 10 | reduced_design = NULL, 11 | consider = c("embedding+linear", "embedding", "linear"), 12 | variance_est = c("analytical", "resampling", "none"), 13 | verbose = TRUE, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{fit}{the result of calling \code{\link[=lemur]{lemur()}}} 19 | 20 | \item{contrast}{Specification of the contrast: a call to \code{cond()} specifying a full observation 21 | (e.g. \code{cond(treatment = "A", sex = "male") - cond(treatment = "C", sex = "male")} to 22 | compare treatment A vs C for male observations). Unspecified factors default to the reference level.} 23 | 24 | \item{reduced_design}{an alternative specification of the null hypothesis.} 25 | 26 | \item{consider}{specify which part of the model are considered for the differential expression test.} 27 | 28 | \item{variance_est}{How or if the variance should be estimated. \code{'analytical'} is only compatible with \code{consider = "linear"}. \code{'resampling'} is the most flexible (to adapt the number 29 | of resampling iterations, set \code{n_resampling_iter}. Default: \code{100})} 30 | 31 | \item{verbose}{should the method print information during the fitting. 
Default: \code{TRUE}.} 32 | 33 | \item{...}{additional arguments.} 34 | } 35 | \value{ 36 | a data.frame 37 | } 38 | \description{ 39 | Differential embedding for each condition 40 | } 41 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // cumz_which_abs_max 15 | List cumz_which_abs_max(NumericVector x, int min_neighborhood_size); 16 | RcppExport SEXP _lemur_cumz_which_abs_max(SEXP xSEXP, SEXP min_neighborhood_sizeSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP); 20 | Rcpp::traits::input_parameter< int >::type min_neighborhood_size(min_neighborhood_sizeSEXP); 21 | rcpp_result_gen = Rcpp::wrap(cumz_which_abs_max(x, min_neighborhood_size)); 22 | return rcpp_result_gen; 23 | END_RCPP 24 | } 25 | // cum_brls_which_abs_max 26 | List cum_brls_which_abs_max(const NumericVector y, const arma::mat& X, const IntegerVector group, const arma::rowvec& contrast, const double penalty, int min_neighborhood_size); 27 | RcppExport SEXP _lemur_cum_brls_which_abs_max(SEXP ySEXP, SEXP XSEXP, SEXP groupSEXP, SEXP contrastSEXP, SEXP penaltySEXP, SEXP min_neighborhood_sizeSEXP) { 28 | BEGIN_RCPP 29 | Rcpp::RObject rcpp_result_gen; 30 | Rcpp::traits::input_parameter< const NumericVector >::type y(ySEXP); 31 | Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP); 32 | Rcpp::traits::input_parameter< const IntegerVector >::type group(groupSEXP); 33 | Rcpp::traits::input_parameter< const arma::rowvec& >::type contrast(contrastSEXP); 34 | Rcpp::traits::input_parameter< const double >::type penalty(penaltySEXP); 35 | Rcpp::traits::input_parameter< int >::type min_neighborhood_size(min_neighborhood_sizeSEXP); 36 | rcpp_result_gen = Rcpp::wrap(cum_brls_which_abs_max(y, X, group, contrast, penalty, min_neighborhood_size)); 37 | return rcpp_result_gen; 38 | END_RCPP 39 | } 40 | // count_neighbors_fast 41 | IntegerVector count_neighbors_fast(NumericMatrix knn_mat, IntegerVector indices); 42 | RcppExport SEXP _lemur_count_neighbors_fast(SEXP knn_matSEXP, SEXP indicesSEXP) { 43 | BEGIN_RCPP 44 | Rcpp::RObject rcpp_result_gen; 45 | Rcpp::traits::input_parameter< NumericMatrix >::type knn_mat(knn_matSEXP); 46 | 
Rcpp::traits::input_parameter< IntegerVector >::type indices(indicesSEXP); 47 | rcpp_result_gen = Rcpp::wrap(count_neighbors_fast(knn_mat, indices)); 48 | return rcpp_result_gen; 49 | END_RCPP 50 | } 51 | 52 | static const R_CallMethodDef CallEntries[] = { 53 | {"_lemur_cumz_which_abs_max", (DL_FUNC) &_lemur_cumz_which_abs_max, 2}, 54 | {"_lemur_cum_brls_which_abs_max", (DL_FUNC) &_lemur_cum_brls_which_abs_max, 6}, 55 | {"_lemur_count_neighbors_fast", (DL_FUNC) &_lemur_count_neighbors_fast, 2}, 56 | {NULL, NULL, 0} 57 | }; 58 | 59 | RcppExport void R_init_lemur(DllInfo *dll) { 60 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 61 | R_useDynamicSymbols(dll, FALSE); 62 | } 63 | -------------------------------------------------------------------------------- /src/util.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | 5 | // [[Rcpp::export(rng = FALSE)]] 6 | List cumz_which_abs_max(NumericVector x, int min_neighborhood_size){ 7 | int size = x.size(); 8 | min_neighborhood_size = std::min(min_neighborhood_size, size); 9 | int max_idx = 0; 10 | double max = -std::numeric_limits::infinity(); 11 | int sign = +1; 12 | if(size == 0){ 13 | return max_idx; 14 | }else if(size == 1){ 15 | return 1; 16 | } 17 | double m = x[0]; 18 | double msq = 0; 19 | 20 | for(int i = 2; i <= size; i++){ 21 | double delta = x[i-1] - m; 22 | m += delta / i; 23 | double delta2 = x[i-1] - m; 24 | msq = (msq * (i - 1) + delta * delta2) / i; 25 | double val = std::abs(m / sqrt(msq / (i-1))); 26 | if(i >= min_neighborhood_size - 1 && val > max){ 27 | max = val; 28 | sign = m < 0 ? -1 : +1; 29 | max_idx = i; 30 | } 31 | } 32 | return List::create(Named("max") = sign * max , Named("idx") = max_idx); 33 | } 34 | 35 | 36 | // [[Rcpp::export(rng = FALSE)]] 37 | List cum_brls_which_abs_max(const NumericVector y, const arma::mat& X, const IntegerVector group, 38 | const arma::rowvec& contrast, const double penalty, int min_neighborhood_size){ 39 | int size = y.size(); 40 | min_neighborhood_size = std::min(min_neighborhood_size, size); 41 | 42 | int max_idx = 0; 43 | double max_val = -std::numeric_limits::infinity(); 44 | int sign = +1; 45 | if(size == 0){ 46 | return List::create(Named("index") = max_idx, Named("max") = max_val); 47 | }else if(size == 1){ 48 | return List::create(Named("index") = 1, Named("max") = y); 49 | } 50 | int k = X.n_cols; 51 | int g = max(group); 52 | arma::colvec m(g); 53 | IntegerVector count(g); 54 | arma::mat X_act(g, k); 55 | arma::mat gamma = 1/penalty * arma::eye(k, k); 56 | arma::colvec beta(k); 57 | int n_obs = 0; 58 | double se_pre = 0; 59 | 60 | for(int i = 0; i < size; ++i){ 61 | double yi = y[i]; 62 | arma::colvec xi = X.row(i).t(); 63 | int gi = group[i] - 1; 64 | double delta_m = 1/(count[gi] + 1.0) * yi - (1 - count[gi] / (count[gi] + 1.0)) * m(gi); 65 | m(gi) += delta_m; 66 | count(gi) += 1; 67 | if(count(gi) == 1){ 68 | X_act.row(gi) = xi.t(); 69 | n_obs += 1; 70 | gamma -= (gamma * xi * xi.t() * gamma) / arma::as_scalar(1 + xi.t() * gamma * xi); 71 | se_pre = arma::as_scalar(contrast * gamma * contrast.t()); 72 | beta += gamma * xi * (m[gi] - xi.t() * beta); 73 | }else{ 74 | beta += gamma * (xi * delta_m); 75 | } 76 | if(n_obs > k){ 77 | double rss = std::max(1e-6, arma::accu(arma::pow(m - X_act * beta, 2))); 78 | double se = sqrt(se_pre * rss / (n_obs - k)); 79 | double t_stat = arma::as_scalar(contrast * beta) / se; 80 | if(i >= min_neighborhood_size-1 && std::abs(t_stat) > max_val){ 81 | 
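        // Track the most extreme cumulative t-statistic seen so far (only once
        // the neighborhood has reached `min_neighborhood_size`): store its
        // absolute value, keep the sign separately so the direction of the
        // effect can be reported, and record the 1-based position.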
max_val = std::abs(t_stat); 82 | sign = t_stat < 0 ? -1 : +1; 83 | max_idx = i + 1; 84 | } 85 | } 86 | } 87 | return List::create(Named("idx") = max_idx, Named("max") = sign * max_val); 88 | } 89 | 90 | 91 | // [[Rcpp::export(rng = FALSE)]] 92 | IntegerVector count_neighbors_fast(NumericMatrix knn_mat, IntegerVector indices){ 93 | int n_cells = knn_mat.nrow(); 94 | int knn = knn_mat.ncol(); 95 | int n = indices.length(); 96 | IntegerVector counter(n_cells); 97 | for(int i = 0; i < n; ++i){ 98 | for(int k = 0; k < knn; ++k){ 99 | counter(knn_mat(indices(i)-1, k)-1) += 1; 100 | } 101 | } 102 | return counter; 103 | } 104 | 105 | 106 | /*** R 107 | 108 | cum_z_stat2 <- function(x){ 109 | out <- rep(NA_real_, length(x)) 110 | m <- x[1] 111 | msq <- 0 112 | out[1] <- NA 113 | for(idx in seq_along(x)[-1]){ 114 | delta <- x[idx] - m 115 | m <- m + delta / idx 116 | delta2 <- x[idx] - m 117 | msq <- (msq * (idx-1) + delta * delta2) / idx 118 | 119 | out[idx] <- m / sqrt(msq / (idx-1)) 120 | } 121 | out 122 | } 123 | 124 | cum_z_stat2(x[1:10]) 125 | idx <- 10 126 | .x <- x[seq_len(idx)] 127 | msq <- sum((mean(.x) - .x)^2) 128 | mean(.x) / sqrt(msq / idx / (idx-1)) 129 | mean(.x) / (sd(.x) / sqrt(idx)) 130 | 131 | x <- rnorm(16000) 132 | cumz_which_abs_max(x) 133 | max(abs(cum_z_stat2(x)), na.rm = TRUE) 134 | bench::mark( 135 | which.max(abs(cum_z_stat2(x))), 136 | which.max(abs(cumz(x))), 137 | cumz2(x), 138 | check = TRUE 139 | ) 140 | 141 | */ 142 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(lemur) 3 | 4 | test_check("lemur") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | make_synthetic_data <- function(n_genes = 30, n_cells = 500, n_centers = 4, n_lat = 2, treatment_effect = 0.1){ 4 | n_lat <- min(n_lat, n_genes) 5 | centers <- duplicate_cols(randn(n_lat, n_centers, sd = 2), ceiling(n_cells / n_centers))[,seq_len(n_cells)] 6 | true_Z <- centers + rnorm(n_cells * n_lat, sd = 0.1) 7 | stopifnot(n_centers <= length(LETTERS)) 8 | cell_type <- rep(LETTERS[seq_len(n_centers)], length.out = n_cells) 9 | condition <- sample(letters[1:3], n_cells, replace = TRUE) 10 | design_matrix <- model.matrix(~ condition) 11 | plane <- qr.Q(qr(randn(n_genes, n_genes)))[seq_len(n_genes), seq_len(n_lat)] 12 | 13 | true_P <- nullspace(plane) %*% cbind(intercept = randn(n_genes - n_lat, 1, sd = 0), 14 | beta1 = randn(n_genes - n_lat, 1, sd = treatment_effect), 15 | beta2 = randn(n_genes - n_lat, 1, sd = treatment_effect)) 16 | true_P <- true_P + rnorm(prod(dim(true_P)), sd = 1e-8) 17 | true_Beta <- randn(n_genes, ncol(design_matrix), sd = 0.1) 18 | 19 | dir <- scale(randn(n_lat, 1), center = FALSE) 20 | Y <- true_Beta %*% t(design_matrix) + plane %*% true_Z + true_P %*% (t(design_matrix) * duplicate_rows(t(dir) %*% true_Z, 3)) 21 | 22 | linear_effect <- LinearEmbeddingMatrix(design_matrix, true_Beta) 23 | linear_embedding <- LinearEmbeddingMatrix(t(true_Z), plane) 24 | interaction_embedding <- LinearEmbeddingMatrix(t(t(design_matrix) * duplicate_rows(t(dir) %*% true_Z, 3)), true_P) 25 | 26 | colnames(Y) <- paste0("cell_", seq_len(n_cells)) 27 | rownames(Y) <- paste0("gene_", seq_len(n_genes)) 28 | sce <- SingleCellExperiment(list(logcounts = Y), colData = data.frame(condition = condition, 
cell_type = cell_type), 29 | reducedDims = list(linear_effect = linear_effect, linear_embedding = linear_embedding, interaction_embedding = interaction_embedding)) 30 | 31 | } 32 | 33 | 34 | make_vectors <- function(n_genes, n_obs, sd = 0.1){ 35 | x <- randn(n_genes, n_obs) 36 | x <- apply(x, 2, \(.x) .x / sqrt(sum(.x^2))) 37 | z <- randn(n_genes, 1) 38 | z <- apply(z, 2, \(.z) .z / sqrt(sum(.z^2))) 39 | bp <- diag(nrow=n_genes) 40 | pert <- project_rotation_tangent(randn(n_genes, n_genes, sd = sd), bp) 41 | y <- rotation_map(pert, bp) %*% x 42 | list(x=x, y=y, z=z, bp=bp, pert=pert) 43 | } 44 | 45 | make_vectors2 <- function(n_genes, n_obs, sd = 0.1){ 46 | x <- randn(n_genes, n_obs) 47 | z <- randn(n_genes, 1) 48 | bp <- diag(nrow=n_genes) 49 | pert <- project_spd_tangent(randn(n_genes, n_genes, sd = sd), bp) 50 | y <- spd_map(pert, bp) %*% x 51 | list(x=x, y=y, z=z, bp=bp, pert=pert) 52 | } 53 | 54 | principal_angle <- function(A, B){ 55 | acos(pmax(0, pmin(1, svd(t(qr.Q(qr(A))) %*% qr.Q(qr(B)))$d))) / pi * 180 56 | } 57 | -------------------------------------------------------------------------------- /tests/testthat/test-align.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | dat <- make_synthetic_data(n_centers = 4, n_genes = 50) 3 | dat$patient <- sample(paste0("p", 1:3), 500, replace = TRUE) 4 | fit <- lemur(dat, ~ condition + patient, n_embedding = 5, test_fraction = 0, verbose = FALSE) 5 | 6 | test_that("forward and reverse transformation cancel", { 7 | coef <- array(cbind(randn(5, 6), randn(5, 6)), dim = c(5,6,2)) 8 | vec <- rnorm(2) 9 | forward <- forward_linear_transformation(coef, vec) 10 | reverse <- reverse_linear_transformation(coef, vec) 11 | expect_equal(reverse %*% forward[,-1], diag(nrow = 5)) 12 | }) 13 | 14 | 15 | test_that("alignment with Harmony work", { 16 | fit_al <- align_harmony(fit, verbose = FALSE, max_iter = 1) 17 | 18 | n_coef <- ncol(fit$design_matrix) 19 | n_lat <- fit$n_embedding 20 | expect_equal(fit$alignment_coefficients, array(0, dim = c(n_lat, n_lat+1, n_coef))) 21 | expect_equal(dim(fit_al$alignment_coefficients), c(5,6,5)) 22 | 23 | pred0 <- predict(fit) 24 | pred1 <- predict(fit_al) 25 | expect_equal(pred0, pred1) 26 | 27 | pred0_fixed <- predict(fit, newdesign = c(1, 0, 0, 0, 0)) 28 | pred1_fixed <- predict(fit_al, newdesign = c(1, 0, 0, 0, 0)) 29 | unchanged_subset <- fit$colData$condition == "a" & fit$colData$patient == "p1" 30 | expect_equal(pred0_fixed[,unchanged_subset], pred0[,unchanged_subset], ignore_attr = "dimnames") 31 | expect_equal(pred0_fixed[,unchanged_subset], pred1_fixed[,unchanged_subset]) 32 | 33 | pred2_fixed <- predict(fit_al, newdesign = c(1, 0, 0, 0, 0), 34 | alignment_design_matrix = c(1, 0, 0, 0, 0)) 35 | expect_equal(pred0_fixed[,unchanged_subset], pred2_fixed[,unchanged_subset]) 36 | expect_equal(pred2_fixed, pred1_fixed) 37 | 38 | de1 <- test_de(fit_al, contrast = cond(condition = "a") - cond(condition = "b")) 39 | de2 <- test_de(fit_al, contrast = cond(condition = "a", patient = "p1") - cond(condition = "b", patient = "p1")) 40 | expect_equal(de1, de2) 41 | }) 42 | 43 | 44 | test_that("harmony is fine with degenerate designs", { 45 | al_design <- cbind(fit$design_matrix, rep(rnorm(2), each = 250), 0, 0, 1) 46 | # harmony ignores 'verbose = FALSE' 47 | expect_silent( 48 | align_harmony(fit, design = al_design, max_iter = 1, verbose = FALSE) 49 | ) 50 | attr(al_design, "ignore_degeneracy") <- FALSE 51 | expect_error( 52 | align_harmony(fit, design = al_design, 
max_iter = 1, verbose = FALSE) 53 | ) 54 | }) 55 | 56 | 57 | test_that("alignment with custom alignment_design works", { 58 | fit_al <- align_harmony(fit, verbose = FALSE) 59 | set.seed(1) 60 | fit_al2 <- align_harmony(fit, design = ~ patient * condition, verbose = FALSE) 61 | set.seed(1) 62 | align_mm <- model.matrix(~ patient * condition, data = colData(dat)) 63 | fit_al3 <- align_harmony(fit, design = align_mm, verbose = FALSE) 64 | 65 | expect_equal(fit_al2$alignment_design_matrix, fit_al3$alignment_design_matrix, ignore_attr = c("dimnames", "ignore_degeneracy")) 66 | expect_equal(fit_al2$alignment_coefficients, fit_al3$alignment_coefficients, ignore_attr = "dimnames") 67 | 68 | pred <- predict(fit_al, newdesign = c(1, 0, 0, 0, 0)) 69 | 70 | de1 <- test_de(fit_al, contrast = cond(condition = "a", patient = "p2") - cond(condition = "b", patient = "p2")) 71 | de2 <- test_de(fit_al2, contrast = cond(condition = "a", patient = "p2") - cond(condition = "b", patient = "p2")) 72 | expect_error({ 73 | test_de(fit_al3, contrast = cond(condition = "a", patient = "p2") - cond(condition = "b", patient = "p2")) 74 | }) 75 | }) 76 | 77 | test_that("handle_ridge_penalty_parameter works", { 78 | expect_equal(handle_ridge_penalty_parameter(3), 3) 79 | expect_error(handle_ridge_penalty_parameter(c(rotation = 2))) 80 | expect_error(handle_ridge_penalty_parameter(c(stretching = 1))) 81 | expect_error(handle_ridge_penalty_parameter(c(stretching = 5, rotation = 2))) 82 | expect_error(handle_ridge_penalty_parameter(list(rotation = diag(nrow = 5)))) 83 | }) 84 | 85 | 86 | 87 | test_that("check that aligning points works perfectly for low number of points", { 88 | n_genes <- 10 89 | n_emb <- 8 90 | n_points <- n_emb + 1 91 | df <- data.frame(tmp = rep(c("a", "b"), each = n_points)) 92 | design_matrix <- model.matrix(~ tmp, data = df) 93 | mat <- randn(n_emb, n_points * 2) 94 | 95 | fit <- lemur_fit(randn(n_genes, n_points * 2), col_data = df, row_data = NULL, 96 | n_embedding = n_emb, design = ~ tmp, design_matrix = design_matrix, 97 | linear_coefficients = matrix(0, nrow = n_genes, ncol = 2), 98 | base_point = diag(nrow = n_genes, ncol = n_emb), coefficients = array(0, dim = c(n_genes, n_emb, 2)), 99 | embedding = mat, 100 | alignment_coefficients = array(0, dim = c(n_emb, n_emb+1, 2)), 101 | alignment_design = NULL, alignment_design_matrix = design_matrix, 102 | use_assay = "foo", is_test_data = rep(FALSE, ncol(mat))) 103 | gr <- rep(seq_len(n_points), times = 2) 104 | fit_al <- align_by_grouping(fit, design = fit$alignment_design_matrix, grouping = gr, ridge_penalty = 0, verbose = FALSE) 105 | expect_equal(fit_al$embedding[,df$tmp == "a"], fit_al$embedding[,df$tmp == "b"], tolerance = 1e-8) 106 | }) 107 | 108 | 109 | 110 | test_that("check that harmony alignment works as expected", { 111 | set.seed(1) 112 | n_genes <- 10 113 | n_emb <- 2 114 | n_points <- n_emb + 1 115 | df <- data.frame(tmp = rep(c("a", "b"), each = n_points)) 116 | design_matrix <- model.matrix(~ tmp, data = df) 117 | mat <- randn(n_emb, n_points) 118 | mat <- cbind(mat, diag(1.1, nrow = n_emb) %*% mat) 119 | 120 | fit <- lemur_fit(randn(n_genes, n_points * 2), col_data = df, row_data = NULL, 121 | n_embedding = n_emb, design = ~ tmp, design_matrix = design_matrix, 122 | linear_coefficients = matrix(0, nrow = n_genes, ncol = 2), 123 | base_point = diag(nrow = n_genes, ncol = n_emb), coefficients = array(0, dim = c(n_genes, n_emb, 2)), 124 | embedding = mat, 125 | alignment_coefficients = array(0, dim = c(n_emb, n_emb+1, 2)), 126 | 
alignment_design = NULL, alignment_design_matrix = design_matrix, 127 | use_assay = "foo", is_test_data = rep(FALSE, ncol(mat))) 128 | gr <- rep(seq_len(n_points), times = 2) 129 | suppressWarnings({ 130 | fit_al2 <- align_harmony(fit, design = fit$alignment_design_matrix, nclust = n_points, ridge_penalty = 1e-3, verbose = FALSE) 131 | }) 132 | set.seed(1) 133 | suppressWarnings({ 134 | harm <- t(harmony::RunHarmony(mat, meta_data = df, vars_use = "tmp", nclust = n_points, lambda = 1e-8, verbose = FALSE)) 135 | }) 136 | 137 | expect_equal(fit_al2$embedding[,df$tmp == "a"], fit_al2$embedding[,df$tmp == "b"], tolerance = 1e-2) 138 | expect_equal(harm[,df$tmp == "a"], harm[,df$tmp == "b"], tolerance = 1e-2) 139 | 140 | # Reimplement harmony correction 141 | set.seed(1) 142 | suppressWarnings({ 143 | harm_obj <- harmony_init(mat, design_matrix, nclust = n_points, lambda = 1e-8, verbose = FALSE) 144 | }) 145 | harm_obj <- harmony_max_div_clustering(harm_obj) 146 | Z_corr <- harm_obj$Z_orig 147 | for(k in seq_len(harm_obj$K)){ 148 | Phi_Rk <- as.matrix(harm_obj$Phi_moe %*% diag(harm_obj$R[k,])) 149 | W <- solve(Phi_Rk %*% t(harm_obj$Phi_moe) + diag(c(harm_obj$lambda))) %*% Phi_Rk %*% t(harm_obj$Z_orig) 150 | W[1,] <- 0 151 | Z_corr <- Z_corr - t(W) %*% Phi_Rk 152 | } 153 | expect_equal(Z_corr, harm, tolerance = 1e-3) 154 | }) 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /tests/testthat/test-differential_geometry.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("Random Grassmann manifold functions are correct", { 3 | p <- random_grassmann_point(5, 2) 4 | expect_equal(t(p) %*% p, diag(nrow = 2)) 5 | v <- random_grassmann_tangent(p) * 10 6 | # Make sure that v is outside the injective radius 7 | expect_gt(svd(v)$d[1] / pi * 180, 90) 8 | 9 | 10 | expect_equal(t(p) %*% v + t(v) %*% p, matrix(0, nrow = 2, ncol = 2)) 11 | p2 <- grassmann_map(v, p) 12 | valt <- grassmann_log(p, p2) 13 | expect_equal(qr(cbind(grassmann_map(valt, p), grassmann_map(v, p)))$rank, 2) 14 | # v and valt are somehow equivalent, in the sense that going to direction 15 | # v/valt from p results in the same space. 
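  # (Concretely: the qr()-rank check above verifies that grassmann_map(v, p) and
  # grassmann_map(valt, p) span the same two-dimensional subspace.)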
16 | # valt is the minimal transition from p to p2 17 | expect_lt(sum(valt^2), sum(v^2)) 18 | 19 | # check that grassmann_map and log are consistent 20 | p3 <- random_grassmann_point(5, 2) 21 | p4 <- random_grassmann_point(5, 2) 22 | 23 | v34 <- grassmann_log(p3, p4) 24 | expect_equal(t(p3) %*% v34 + t(v34) %*% p3, matrix(0, nrow = 2, ncol = 2)) 25 | expect_equal(qr(grassmann_map(v34, p3), p4)$rank, 2) # They span the same space 26 | expect_equal(grassmann_log(p3, grassmann_map(v34, p3)), v34) 27 | }) 28 | 29 | test_that("Grassmann angles work", { 30 | for(idx in 1:10){ 31 | p <- random_grassmann_point(10, 3) 32 | v <- random_grassmann_tangent(p, sd = runif(n = 1, 0.01, 2)) 33 | theta <- grassmann_angle_from_tangent(v, normalized = TRUE) 34 | expect_gt(theta, 0) 35 | expect_lt(theta, 90) 36 | q <- grassmann_map(v, p) 37 | expect_equal(theta, tail(principal_angle(p, q), n = 1)) 38 | } 39 | }) 40 | 41 | 42 | 43 | test_that("zero-dimensional arguments work", { 44 | zero_dim_mat <- matrix(NA_real_, nrow = 0, ncol = 0) 45 | expect_equal(random_grassmann_point(0, 0), zero_dim_mat) 46 | expect_equal(random_grassmann_tangent(zero_dim_mat), zero_dim_mat) 47 | 48 | expect_equal(grassmann_map(zero_dim_mat, zero_dim_mat), zero_dim_mat) 49 | expect_equal(grassmann_log(zero_dim_mat, zero_dim_mat), zero_dim_mat) 50 | }) 51 | 52 | 53 | -------------------------------------------------------------------------------- /tests/testthat/test-geodesic_regression.R: -------------------------------------------------------------------------------- 1 | test_that("grassmann_geodesic_regression works", { 2 | 3 | base_point <- qr.Q(qr(randn(5, 2))) 4 | coordsystems <- lapply(1:10, \(idx) qr.Q(qr(randn(5, 2)))) 5 | x <- seq(1, 10) 6 | design_matrix <- model.matrix(~ x) 7 | fit <- grassmann_geodesic_regression(coordsystems, design_matrix, base_point = base_point) 8 | expect_equal(dim(fit), c(5, 2, 2)) 9 | proj <- grassmann_map(fit[,,1], base_point) 10 | expect_lt(sum((t(proj) %*% proj - diag(nrow = 2))^2), 1e-8) 11 | proj <- grassmann_map(fit[,,2], base_point) 12 | expect_lt(sum((t(proj) %*% proj - diag(nrow = 2))^2), 1e-8) 13 | 14 | }) 15 | 16 | 17 | test_that("grassmann_lm works", { 18 | n_obs <- 100 19 | data <- randn(5, n_obs) 20 | col_data <- data.frame(x = sample(letters[1:3], size = n_obs, replace = TRUE)) 21 | des <- model.matrix(~ x, col_data) 22 | base_point <- qr.Q(qr(randn(5, 2))) 23 | fit <- grassmann_lm(data, des, base_point) 24 | plane_a <- pca(data[,col_data$x == "a"], n = 2, center = FALSE)$coordsystem 25 | plane_b <- pca(data[,col_data$x == "b"], n = 2, center = FALSE)$coordsystem 26 | plane_c <- pca(data[,col_data$x == "c"], n = 2, center = FALSE)$coordsystem 27 | expect_equal(principal_angle(grassmann_map(fit[,,"(Intercept)"], base_point), plane_a), c(0,0), 28 | tolerance = 1e-5) 29 | expect_equal(principal_angle(grassmann_map(fit[,,"(Intercept)"] + fit[,,"xb"], base_point), plane_b), c(0,0), 30 | tolerance = 1e-5) 31 | expect_equal(principal_angle(grassmann_map(fit[,,"(Intercept)"] + fit[,,"xc"], base_point), plane_c), c(0,0), 32 | tolerance = 1e-5) 33 | 34 | expect_equal(fit[,,"(Intercept)"], grassmann_log(base_point, plane_a)) 35 | expect_equal(fit[,,"(Intercept)"] + fit[,,"xb"], grassmann_log(base_point, plane_b)) 36 | # x = f(p, a) 37 | # x + y = f(p, b) 38 | # -!-> y = f(b, a) 39 | #expect_equal(fit[,,"xb"], grassmann_log(plane_b, plane_a)) 40 | # ---> y = f(p, b) - f(p, a) 41 | expect_equal(fit[,,"xb"], grassmann_log(base_point, plane_b) - grassmann_log(base_point, plane_a)) 42 | }) 43 | 44 | 
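# A short usage sketch restating the identities checked above (kept as a comment
# because it is illustrative, not a test; it assumes the internal helpers
# grassmann_lm(), grassmann_map(), and randn() are available): the coefficients
# live in the tangent space at `base_point` and add up per design-matrix column.
#
# data <- randn(5, 100)
# col_data <- data.frame(x = sample(letters[1:2], size = 100, replace = TRUE))
# des <- model.matrix(~ x, col_data)
# base_point <- qr.Q(qr(randn(5, 2)))
# fit <- grassmann_lm(data, des, base_point)
# plane_for_a <- grassmann_map(fit[,,"(Intercept)"], base_point)
# plane_for_b <- grassmann_map(fit[,,"(Intercept)"] + fit[,,"xb"], base_point)
#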
test_that("grassmann_lm throws a helpful error message", { 45 | n_obs <- 100 46 | data <- randn(5, n_obs) 47 | col_data <- data.frame(x = sample(letters[1:3], size = n_obs, replace = TRUE)) 48 | col_data$x[1] <- "new_element" 49 | des <- model.matrix(~ x, col_data) 50 | base_point <- qr.Q(qr(randn(5, 2))) 51 | expect_error({ 52 | fit <- grassmann_lm(data, des, base_point) 53 | }) 54 | }) 55 | 56 | 57 | test_that("get_groups works and is fast", { 58 | df <- data.frame(let = sample(letters[1:2], size = 100, replace = TRUE), 59 | num = rnorm(100)) 60 | mat <- model.matrix(~ let, data = df) 61 | vec <- ifelse(df$let == df$let[1], 1, 2) 62 | attr(vec, "n") <- 2 63 | expect_equal(get_groups(mat), vec) 64 | 65 | mat <- model.matrix(~ let + num, data = df) 66 | vec <- seq_len(100) 67 | attr(vec, "n") <- 100 68 | expect_equal(get_groups(mat), vec) 69 | 70 | # This is fast and grows roughly linear 71 | df <- data.frame(num = rnorm(1e6)) 72 | mat <- model.matrix(~ num, data = df) 73 | vec <- seq_len(1e6) 74 | attr(vec, "n") <- 1e6 75 | expect_equal(get_groups(mat), vec) 76 | }) 77 | 78 | -------------------------------------------------------------------------------- /tests/testthat/test-parse_contrasts.R: -------------------------------------------------------------------------------- 1 | test_that("parse contrasts works", { 2 | expect_error(parse_contrast(A - B, formula = ~ A + B)) 3 | expect_warning(contr <- parse_contrast(cond(A = 3, B = 4), formula = ~ A + B)) 4 | expect_equal(unname(contr), c(1, 3, 4), ignore_attr = "class") 5 | }) 6 | 7 | 8 | test_that("factor based contrast specification works", { 9 | set.seed(1) 10 | n_obs <- 500 11 | col_data <- data.frame(group = sample(LETTERS[1:3], size = n_obs, replace = TRUE), 12 | cont = rnorm(n_obs), 13 | city = sample(c("New York", "Paris", "London"), size = n_obs, replace = TRUE), 14 | y = rnorm(n_obs), 15 | stringsAsFactors = TRUE) 16 | Y <- matrix(0, nrow = 10, ncol = n_obs) 17 | des <- handle_design_parameter(data = Y, design = ~ group + cont, col_data = col_data) 18 | form <- des$design_formula 19 | 20 | # cond 21 | expect_equal(parse_contrast(cond(group = "B"), form), 22 | c(1, 1, 0, 0), ignore_attr = c("names", "class")) 23 | expect_equal(parse_contrast(cond(group = "A"), form), 24 | c(1, 0, 0, 0), ignore_attr = c("names", "class")) 25 | expect_equal(parse_contrast(cond(group = "B") - cond(group = "A"), form), 26 | .minus(c(1,1,0,0), c(1,0,0,0)), ignore_attr = c("names", "class")) 27 | expect_equal(parse_contrast(cond(group = "B") + cond(group = "A"), form), 28 | .plus(c(1,1,0,0), c(1,0,0,0)), ignore_attr = c("names", "class")) 29 | expect_equal(parse_contrast(cond(group = "B") * 3, form), 30 | .multiply(c(1,1,0,0), 3), ignore_attr = c("names", "class")) 31 | expect_equal(parse_contrast((cond(group = "B") - cond(group = "C")) / 3, form), 32 | .divide(.minus(c(1,1,0,0), c(1,0,1,0)), 3), ignore_attr = c("names", "class")) 33 | 34 | 35 | des <- handle_design_parameter(data = Y, design = ~ group + cont + city:group, col_data = col_data) 36 | form <- des$design_formula 37 | mm <- des$design_matrix 38 | expect_equal(parse_contrast(cond(group = "B", city = "New York"), form), 39 | c(1, 1, 0, 0, 0, 1, 0, 0, 0, 0), ignore_attr = c("names", "class")) 40 | expect_equal(parse_contrast(cond(group = "B", city = "New York") - cond(group = "B", city = "Paris"), form), 41 | .minus(c(1, 1, 0, 0, 0, 1, 0, 0, 0, 0), c(1, 1, 0, 0, 0, 0, 0, 0, 1, 0)), ignore_attr = c("names", "class")) 42 | 43 | # Contrast relation 44 | des <- handle_design_parameter(data = Y, 
design = ~ group + cont, col_data = col_data) 45 | form <- des$design_formula 46 | expect_equal(unclass(parse_contrast(cond(group = "B") == cond(group = "A"), form)), 47 | list(lhs = c(1, 1, 0, 0), rhs = c(1, 0, 0, 0), relation = "equal"), 48 | ignore_attr = c("names", "class")) 49 | 50 | expect_equal(unclass(parse_contrast(cond(group = "A", cont = 3) <= cond(group = "A", cont = 5), form)), 51 | list(lhs = c(1, 0, 0, 3), rhs = c(1, 0, 0, 5), relation = "less_than"), 52 | ignore_attr = c("names", "class")) 53 | 54 | }) 55 | 56 | 57 | test_that("cond() works with custom contrasts", { 58 | n_obs <- 50 59 | col_data <- data.frame(group = sample(LETTERS[1:3], size = n_obs, replace = TRUE), 60 | cont = rnorm(n_obs), 61 | city = sample(c("New York", "Paris", "London"), size = n_obs, replace = TRUE), 62 | y = rnorm(n_obs), 63 | stringsAsFactors = TRUE) 64 | col_data$group <- C(col_data$group, contr.sum) 65 | Y <- matrix(0, nrow = 10, ncol = n_obs) 66 | des <- handle_design_parameter(data = Y, design = ~ group + cont, col_data = col_data) 67 | form <- des$design_formula 68 | mm <- des$design_matrix 69 | expect_equal(parse_contrast(cond(group = "A"), form), c(1, 1, 0, 0), ignore_attr = c("names", "class")) 70 | expect_equal(parse_contrast(cond(group = "B"), form), c(1, 0, 1, 0), ignore_attr = c("names", "class")) 71 | expect_equal(parse_contrast(cond(group = "C"), form), c(1, -1, -1, 0), ignore_attr = c("names", "class")) 72 | }) 73 | 74 | test_that("evaluate_contrast_tree works", { 75 | n_obs <- 50 76 | col_data <- data.frame(group = sample(LETTERS[1:3], size = n_obs, replace = TRUE), 77 | cont = rnorm(n_obs), 78 | city = sample(c("New York", "Paris", "London"), size = n_obs, replace = TRUE), 79 | y = rnorm(n_obs), 80 | stringsAsFactors = TRUE) 81 | Y <- matrix(0, nrow = 10, ncol = n_obs) 82 | des <- handle_design_parameter(data = Y, design = ~ group + cont, col_data = col_data) 83 | form <- des$design_formula 84 | al_des <- handle_design_parameter(data = Y, design = ~ group * cont, col_data = col_data) 85 | al_form <- al_des$design_formula 86 | 87 | cntrst <- parse_contrast(cond(group = "A", cont = 5) - cond(group = "B", cont = 5), form) 88 | al_cntrst <- parse_contrast(cond(group = "A", cont = 5) - cond(group = "B", cont = 5), al_form) 89 | 90 | sum <- evaluate_contrast_tree(cntrst, al_cntrst, \(x, y){ 91 | sum(x) + sum(y) 92 | }) 93 | expect_equal(sum, 6 * 2 - (7 + 12)) 94 | 95 | contrast_spec <- rlang::quo((cond(group = "A", cont = 5) - cond(group = "B", cont = 5)) * 9) 96 | sum <- evaluate_contrast_tree(parse_contrast(!!contrast_spec, form), 97 | parse_contrast(!!contrast_spec, al_form), 98 | \(x, y) sum(x) + sum(y)) 99 | expect_equal(sum, (6 * 2) * 9 - (7 + 12) * 9) 100 | 101 | contrast_spec <- rlang::quo(cond(group = "A", cont = 5) / cond(group = "B", cont = 5)) 102 | sum <- evaluate_contrast_tree(parse_contrast(!!contrast_spec, form), 103 | parse_contrast(!!contrast_spec, al_form), 104 | \(x, y) sum(x) + sum(y)) 105 | expect_equal(sum, (6 * 2) / (7 + 12)) 106 | }) 107 | 108 | 109 | test_that("parse_contrast works in dynamic contexts", { 110 | n_obs <- 50 111 | col_data <- data.frame(group = sample(LETTERS[1:3], size = n_obs, replace = TRUE), 112 | cont = rnorm(n_obs), 113 | city = sample(c("New York", "Paris", "London"), size = n_obs, replace = TRUE), 114 | y = rnorm(n_obs), 115 | stringsAsFactors = TRUE) 116 | Y <- matrix(0, nrow = 10, ncol = n_obs) 117 | des <- handle_design_parameter(data = Y, design = ~ group + cont, col_data = col_data) 118 | form <- des$design_formula 119 | 120 | res 
<- parse_contrast(cond(group = "B"), form) 121 | fun <- function(cov, lvl){ 122 | parse_contrast(cond({{cov}} := lvl), form) 123 | } 124 | res2 <- fun("group", "B") 125 | val <- list("group") 126 | res3 <- parse_contrast(cond(!!val[[1]] := "B"), form) 127 | expect_equal(res, res2) 128 | expect_equal(res, res3) 129 | }) 130 | 131 | test_that("parse_contrast throws appropriate error message", { 132 | expect_error(parse_contrast(cond(hello = "123"), formula = NULL)) 133 | data <- matrix(1, nrow = 5, ncol =10) 134 | expect_message(handle_design_parameter(matrix(1:10, ncol = 1), data, NULL, verbose = TRUE), "The 'design' was not specified with a formula") 135 | }) 136 | 137 | test_that("parse_contrast and handle_design_parameter work", { 138 | n_obs <- 50 139 | col_data <- data.frame(group = sample(LETTERS[1:3], size = n_obs, replace = TRUE), 140 | cont = rnorm(n_obs), 141 | city = sample(c("New York", "Paris", "London"), size = n_obs, replace = TRUE), 142 | y = rnorm(n_obs), 143 | stringsAsFactors = TRUE) 144 | Y <- matrix(0, nrow = 10, ncol = n_obs) 145 | des <- handle_design_parameter(data = Y, design = ~ group + cont, col_data = col_data) 146 | form <- des$design_formula 147 | res <- parse_contrast(cond(group = "B"), form) 148 | }) 149 | -------------------------------------------------------------------------------- /tests/testthat/test-pca.R: -------------------------------------------------------------------------------- 1 | test_that("pca function works", { 2 | 3 | Y <- matrix(rnorm(5 * 10, mean = 3), nrow = 5, ncol = 10) 4 | decomp <- pca(Y, n = 3) 5 | expect_named(decomp, c("coordsystem", "embedding", "offset")) 6 | expect_equal(ncol(decomp$coordsystem), 3) 7 | expect_equal(nrow(decomp$embedding), 3) 8 | 9 | centered_Y <- scale(t(Y), center = TRUE, scale = FALSE) 10 | expect_equal(decomp$offset, attr(centered_Y, "scaled:center")) 11 | alt_pca <- prcomp(t(Y), rank. 
= 3) 12 | expect_equal(decomp$coordsystem, alt_pca$rotation, ignore_attr = "dimnames") 13 | expect_equal(decomp$embedding, t(alt_pca$x), ignore_attr = "dimnames") 14 | expect_equal(decomp$offset, alt_pca$center) 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-recursive_least_squares.R: -------------------------------------------------------------------------------- 1 | test_that("recursive least squares works", { 2 | n <- 30 3 | y <- rnorm(n = n) 4 | X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) 5 | res <- recursive_least_squares(y, X) 6 | manual_res <- sapply(seq_len(n), \(idx){ 7 | lm.fit(X[seq_len(idx),,drop=FALSE], y[seq_len(idx)])$coefficients 8 | }) 9 | expect_equal(res[,-c(1,2)], manual_res[,-c(1,2)], ignore_attr = "dimnames") 10 | 11 | 12 | # dat <- data.frame(group = rep(letters[1:3], times = 10), 13 | # cont = rep(c(0.1, 5, 50, 0.2, 6, 20), times = 5)) 14 | 15 | # mm <- model.matrix(~ group, dat) 16 | mm <- duplicate_rows(matrix(rnorm(3 * 3), nrow = 3, ncol = 3), times = 10) 17 | y <- rnorm(30) 18 | group <- c(rep(1:6, times = 5)) 19 | 20 | # Contrast 21 | contr <- matrix(c(1, 0, -1), nrow = 1) 22 | res <- bulked_recursive_least_squares_contrast(y, mm, group, contrast = contr) 23 | lm_fit <- lm(tapply(y, group, mean) ~ mm[1:6,] - 1) 24 | 25 | rdf <- lm_fit$df.residual 26 | # Same as summary(lm_fit)$sigma^2 27 | sigma_sq <- sum(lm_fit$residuals^2) / rdf 28 | covar <- sigma_sq * solve(t(mm[1:6,]) %*% mm[1:6,]) 29 | t_stat <- contr %*% matrix(coef(lm_fit), ncol = 1) / sqrt(contr %*% covar %*% t(contr)) 30 | expect_equal(res$t_stat[30], drop(t_stat), tolerance = 1e-3) 31 | expect_equal(res$coef[,30], coef(lm_fit), tolerance = 1e-3, ignore_attr = "names") 32 | 33 | pval <- pt(t_stat, df = lm_fit$df.residual, lower.tail = TRUE) 34 | # expect_equal(min(pval, 1 - pval) * 2, summary(multcomp::glht(lm_fit, contr))$test$pvalues, 35 | # ignore_attr = "error") 36 | 37 | manual_res <- sapply(seq_len(n), \(idx){ 38 | lm.fit(mm[1:min(6, idx),,drop=FALSE], tapply(y[seq_len(idx)], group[seq_len(idx)], mean))$coefficients 39 | }) 40 | expect_equal(res$coef[,-c(1:5)], unname(manual_res[,-c(1:5)]), tolerance = 1e-3) 41 | 42 | 43 | # Comparison with C++ implementation 44 | cpp_res <- cum_brls_which_abs_max(y, mm, group, contrast = contr, penalty = 1e-6, min_neighborhood_size = 0) 45 | expect_equal(cpp_res$idx, which.max(abs(res$t_stat))) 46 | expect_equal(cpp_res$max, res$t_stat[which.max(abs(res$t_stat))]) 47 | }) 48 | 49 | 50 | test_that("min_neighborhood_size argument of cum_brls_which_abs_max works", { 51 | mm <- duplicate_rows(matrix(rnorm(3 * 3), nrow = 3, ncol = 3), times = 10) 52 | y <- rnorm(30) 53 | group <- c(rep(1:6, times = 5)) 54 | contr <- matrix(c(1, 0, -1), nrow = 1) 55 | 56 | res1 <- cum_brls_which_abs_max(y, mm, group, contrast = contr, penalty = 1e-6, min_neighborhood_size = 28) 57 | # res1$idx # either 28, 29, or 30 58 | expect_gte(res1$idx, 28) 59 | 60 | res2 <- cum_brls_which_abs_max(y, mm, group, contrast = contr, penalty = 1e-6, min_neighborhood_size = 50) 61 | expect_equal(res2$idx, 30) 62 | }) 63 | 64 | 65 | # test_that("bulked_recursive_least_squares_contrast is fast", { 66 | # 67 | # n <- 1e6 68 | # n_gr <- 30 69 | # n_col <- 5 70 | # # contrast <- 1 71 | # contrast <- c(0, 1, 0, -1, 0) 72 | # y <- rnorm(n = n) 73 | # gr <- sample(1:n_gr, size = n, replace = TRUE) 74 | # ref <- matrix(rnorm(n_gr * n_col), nrow = 30, ncol = n_col) 75 | # mm <- do.call(rbind, lapply(gr, \(g){ 76 | # ref[g,] 77 | # })) 78 | # 
system.time( 79 | # res <- bulked_recursive_least_squares_contrast(y, mm, gr, contrast) 80 | # ) 81 | # profvis::profvis( 82 | # bulked_recursive_least_squares_contrast_fast(y, mm, gr, contrast, ridge_penalty = 1e-6) 83 | # ) 84 | # 85 | # bulked_recursive_least_squares_contrast_fast(y, mm, gr, contrast, ridge_penalty = 1e-6) 86 | # system.time( 87 | # cum_brls_which_abs_max(y, mm, gr, contrast, penalty = 1e-6) 88 | # ) 89 | # 90 | # bench::mark( 91 | # # which.max(abs(bulked_recursive_least_squares_contrast(y, mm, gr, contrast)$t_stat)), 92 | # # bulked_recursive_least_squares_contrast_fast(y, mm, gr, contrast)$index, 93 | # cum_brls_which_abs_max(y, mm, gr, contrast, penalty = 1e-6)$index, 94 | # cum_brls_which_abs_max_faster(y, mm, gr, contrast, penalty = 1e-6)$index, 95 | # check = FALSE 96 | # ) 97 | # 98 | # }) 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /tests/testthat/test-ridge_penalty.R: -------------------------------------------------------------------------------- 1 | test_that("ridge_regression works", { 2 | Y <- randn(5, 30) 3 | X <- randn(30, 2) 4 | 5 | lm_fit <- t(coef(lm(t(Y) ~ X - 1))) 6 | expect_equal(ridge_regression(Y, X, ridge_penalty = 0), lm_fit, ignore_attr = "dimnames") 7 | expect_lt(sum(ridge_regression(Y, X, ridge_penalty = 3)^2), sum(lm_fit^2)) 8 | }) 9 | 10 | test_that("weighted ridge_regression works", { 11 | Y <- randn(5, 30) 12 | X <- randn(30, 2) 13 | weights <- rexp(n = 30, rate = 2) 14 | weights <- weights / sum(weights) * ncol(X) 15 | 16 | lm_fit <- t(coef(lm(t(Y) ~ X - 1, weights = weights))) 17 | expect_equal(ridge_regression(Y, X, ridge_penalty = 0, weights = weights), lm_fit, ignore_attr = "dimnames") 18 | expect_lt(sum(ridge_regression(Y, X, ridge_penalty = 3, weights = weights)^2), sum(lm_fit^2)) 19 | }) 20 | -------------------------------------------------------------------------------- /tests/testthat/test-test_de.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("test_de works", { 3 | dat <- make_synthetic_data(n_genes = 30, n_cells = 500, n_lat = 3, n_centers = 5) 4 | fit <- lemur(dat, design = ~ condition, n_embedding = 3, verbose = FALSE) 5 | fit <- align_by_grouping(fit, grouping = fit$colData$cell_type, verbose = FALSE) 6 | fit <- fit[,1:10] 7 | 8 | res <- test_de(fit, cond(condition = "b") - cond(condition = "a")) 9 | res2 <- test_de(fit, cond(condition = "b")) 10 | 11 | expect_equal(dim(res), dim(res2)) 12 | }) 13 | 14 | test_that("my implementation of Welford's algorithm works", { 15 | x <- rnorm(1000) 16 | res <- fold_left(list(mean = 0, msq = 0, iter = 1))(x, \(elem, accum){ 17 | diff <- elem 18 | delta <- diff - accum$mean 19 | accum$mean <- accum$mean + delta / accum$iter 20 | accum$msq <- accum$msq + delta * (diff - accum$mean) 21 | accum$iter <- accum$iter + 1 22 | accum 23 | }) 24 | expect_equal(res$mean, mean(x)) 25 | sd <- sqrt(res$msq / (length(x) - 1)) 26 | expect_equal(sd, sd(x)) 27 | }) 28 | 29 | test_that("test_de works with custom embedding", { 30 | dat <- make_synthetic_data(n_genes = 30, n_cells = 500, n_lat = 3, n_centers = 5) 31 | fit <- lemur(dat, design = ~ condition, n_embedding = 3, verbose = FALSE) 32 | fit <- align_by_grouping(fit, grouping = fit$colData$cell_type, verbose = FALSE) 33 | fit <- fit[,1:10] 34 | test_point <- matrix(0, nrow = 3, ncol = 1) 35 | colnames(test_point) <- "zero" 36 | res <- test_de(fit, cond(condition = "b") - cond(condition = "a"), 37 | embedding = 
test_point) 38 | res2 <- test_de(fit, contrast = cond() - cond(condition = "b"), embedding = test_point) 39 | 40 | expect_equal(dim(res), c(30, 1)) 41 | expect_equal(dim(res2), c(30, 1)) 42 | }) 43 | 44 | 45 | test_that("test_global works", { 46 | skip("Something is broken in 'test_global'") 47 | dat <- make_synthetic_data(n_genes = 30, n_cells = 500, n_lat = 3, n_centers = 5) 48 | fit <- lemur(dat, design = ~ condition, n_embedding = 3, verbose = FALSE) 49 | fit <- align_by_grouping(fit, grouping = dat$cell_type, verbose = FALSE) 50 | 51 | res <- test_global(fit, reduced_design = ~ 1, consider = "linear", variance_est = "analytical", verbose = FALSE) 52 | expect_s3_class(res, "data.frame") 53 | 54 | res3 <- test_global(fit, contrast = cond(condition = "a") - cond(condition = "b"), 55 | variance_est = "resampling", verbose = FALSE) 56 | expect_s3_class(res3, "data.frame") 57 | }) 58 | 59 | test_that("the angle between planes is correctly calculated", { 60 | n_emb <- 4 61 | dat <- make_synthetic_data(n_genes = 30, n_cells = 5000, n_lat = 5, n_centers = 3) 62 | fitlm <- lm(t(assay(dat)) ~ dat$condition) 63 | assay(dat) <- t(fitlm$residuals) 64 | fit <- lemur(dat, design = ~ condition, n_embedding = n_emb, test_fraction = 0, verbose = FALSE) 65 | expect_equal(fit$linear_coefficients, matrix(0, nrow = nrow(fit), ncol = 3), ignore_attr = "dimnames") 66 | # The angle and delta_diffemb for a left-right contrast are slightly different than the results 67 | # for a one-sided contrast. The left-right contrast is slighly more accurate because it calculate 68 | # log(map(a, p), map(b, p)) instead of simply a - b 69 | res <- test_global(fit, contrast = cond(condition = "a") - cond(condition = "b"), 70 | variance_est = "none", verbose = FALSE) 71 | res2 <- test_global(fit, contrast = cond(condition = "a") - cond(condition = "b"), 72 | variance_est = "none", verbose = FALSE) 73 | plane_a <- pca(assay(dat)[,dat$condition == "a"], n = n_emb)$coordsystem 74 | plane_b <- pca(assay(dat)[,dat$condition == "b"], n = n_emb)$coordsystem 75 | expect_equal(res2$angle_degrees, tail(principal_angle(plane_a, plane_b), n = 1)) 76 | }) 77 | 78 | test_that("test_global's analytical test produces uniform p-values", { 79 | skip("Long running test") 80 | 81 | # Analytical test for linear part 82 | res <- do.call(rbind, lapply(1:40, function(idx){ 83 | print(paste0("Round: ", idx)) 84 | dat <- make_synthetic_data(n_genes = 30, n_cells = 500, n_lat = 3, n_centers = 5, 85 | treatment_effect = 0.8) 86 | dat$rand_cond <- sample(LETTERS[1:3], ncol(dat), replace = TRUE) 87 | fit <- lemur(dat, design = ~ rand_cond, n_embedding = 3, verbose = FALSE) 88 | test_global(fit, reduced_design = ~ 1, variance_est = "analytical", 89 | consider = "linear", verbose = FALSE) 90 | })) 91 | hist(res$pval, breaks = 40) 92 | plot(sort(res$pval), ppoints(nrow(res)), asp = 1, log = "xy"); abline(0,1) 93 | 94 | # Multi-var Z test for both 95 | res <- do.call(rbind, lapply(1:30, function(idx){ 96 | print(paste0("Round: ", idx)) 97 | dat <- randn(8, 500) 98 | rand_cond <- sample(LETTERS[1:2], ncol(dat), replace = TRUE) 99 | fit <- lemur(dat, design = ~ rand_cond, n_embedding = 2, verbose = FALSE) 100 | test_global(fit, contrast = rand_condB, verbose = FALSE) 101 | })) 102 | hist(res$pval, breaks = 40) 103 | plot(sort(res$pval), ppoints(nrow(res)), asp = 1, log = "xy"); abline(0,1) 104 | 105 | # Resampling-based test 106 | res <- do.call(rbind, lapply(1:100, function(idx){ 107 | print(paste0("Round: ", idx)) 108 | dat <- make_synthetic_data(n_genes = 30, 
n_cells = 500, n_lat = 3, n_centers = 5, 109 | treatment_effect = 0.8) 110 | dat$rand_cond <- sample(LETTERS[1:3], ncol(dat), replace = TRUE) 111 | dat$num <- round(runif(ncol(dat), min = -0.7, max = 0.7)) 112 | fit <- lemur(dat, design = ~ rand_cond + num, n_embedding = 2, verbose = FALSE) 113 | test_global(fit, contrast = rand_condB, 114 | variance_est = "resampling", verbose = FALSE) 115 | })) 116 | hist(res$pval, breaks = 40) 117 | plot(sort(res$pval), ppoints(nrow(res)), asp = 1, log = "xy"); abline(0,1) 118 | 119 | # Check that multi-var Z test and resampling are consistent 120 | res <- do.call(rbind, lapply(1:30, function(idx){ 121 | print(paste0("Round: ", idx)) 122 | dat <- make_synthetic_data(n_genes = 30, n_cells = 500, n_lat = 3, n_centers = 5, 123 | treatment_effect = 0.8) 124 | dat$rand_cond <- sample(LETTERS[1:3], ncol(dat), replace = TRUE) 125 | dat$num <- round(runif(ncol(dat), min = -0.7, max = 0.7)) 126 | fit <- lemur(dat, design = ~ rand_cond + num, n_embedding = 0, verbose = FALSE) 127 | res1 <- test_global(fit, contrast = rand_condB, variance_est = "resampling", verbose = FALSE) 128 | 129 | res3 <- test_global(fit, contrast = rand_condB, 130 | variance_est = "analytical", consider = "linear", verbose = FALSE) 131 | tmp <- rbind(res1, res3) 132 | tmp$method <- c("resampling", "analytical") 133 | tmp 134 | })) 135 | res 136 | }) 137 | 138 | 139 | -------------------------------------------------------------------------------- /tests/testthat/test-util.R: -------------------------------------------------------------------------------- 1 | test_that("mply_dbl works", { 2 | 3 | mat1 <- mply_dbl(1:4, \(i) rep(i * 2, 7), ncol = 7) 4 | expect_equal(dim(mat1), c(4, 7)) 5 | 6 | tmp <- lapply(1:4, \(i) rep(i * 2, 7)) 7 | expect_equal(dim(stack_rows(tmp)), c(4, 7)) 8 | expect_equal(dim(stack_cols(tmp)), c(7, 4)) 9 | 10 | }) 11 | 12 | test_that("duplicate_cols and duplicate_rows works", { 13 | mat <- matrix(1:10, nrow = 5, ncol = 2) 14 | expect_equal(duplicate_cols(mat, 2), cbind(mat, mat)) 15 | expect_equal(duplicate_rows(mat, 2), rbind(mat, mat)) 16 | 17 | expect_equal(duplicate_cols(mat, 0), matrix(nrow = 5, ncol = 0)) 18 | expect_equal(duplicate_rows(mat, 0), matrix(nrow = 0, ncol = 2)) 19 | }) 20 | 21 | 22 | test_that("fold_left works", { 23 | expect_equal(fold_left(0)(1:10, \(elem, accum) accum + elem), sum(1:10)) 24 | expect_equal(fold_left(1)(1:10, \(elem, accum) accum * elem), prod(1:10)) 25 | expect_equal(fold_left(NULL)(2:10, \(elem, accum) if(is.null(accum)) elem * 5 else accum * elem), 5 * prod(2:10)) 26 | expect_error(fold_left(0)(1:10, \(x, y) accum + elem)) 27 | }) 28 | 29 | 30 | test_that("resample works", { 31 | 32 | samp <- resample(3) 33 | expect_equal(length(samp), 3) 34 | expect_true(all(samp %in% (1:3))) 35 | 36 | group <- sample(letters[1:3], 100, replace = TRUE) 37 | samp <- resample(100, group) 38 | 39 | }) 40 | 41 | 42 | test_that("which_extreme works", { 43 | x <- abs(rnorm(10)) 44 | expect_equal(which_extreme(x), which.max(x)) 45 | expect_equal(which_extreme(-x), which.min(-x)) 46 | 47 | ignore <- rep(c(FALSE, TRUE), each = 5) 48 | expect_equal(which_extreme(x, ignore), which.max(x[! 
ignore])) 49 | }) 50 | 51 | 52 | # test_that("nullspace works", { 53 | # mat <- randn(30, 3) 54 | # # n1 <- lmerTest:::nullspace(mat, type = "left") 55 | # n2 <- nullspace(mat) 56 | # n3 <- MASS::Null(mat) 57 | # n4 <- pracma::nullspace(t(mat)) 58 | # # expect_equal(grassmann_angle_from_points(n1, n2), 0) 59 | # expect_equal(grassmann_angle_from_points(n3, n2), 0) 60 | # expect_equal(grassmann_angle_from_points(n4, n2), 0) 61 | # }) 62 | 63 | 64 | test_that("estimability test works", { 65 | n <- 40 66 | dat <- data.frame(group = sample(letters[1:3], size = n, replace = TRUE), 67 | cont1 = rnorm(n), 68 | cont2 = rnorm(n)) 69 | 70 | mm <- model.matrix(~ group + cont1 + cont2, data = dat) 71 | 72 | expect_true(is_contrast_estimable(c(0, 1, 0, 0, 0), mm)) 73 | expect_false(is_contrast_estimable(c(0, 1, 0, 0, 0), mm[dat$group != "a",])) 74 | expect_true( is_contrast_estimable(c(0, 0, 0, 1, 0), mm[dat$group != "a",])) 75 | expect_true( is_contrast_estimable(c(0, 0, 0, 1, -1), mm[dat$group != "a",])) 76 | expect_true( is_contrast_estimable(c(0, 0, 0, 2, -1), mm[dat$group != "a",])) 77 | expect_false(is_contrast_estimable(c(1, 0, 0, 2, -1), mm[dat$group != "a",])) 78 | expect_true( is_contrast_estimable(c(0, 1, -1, 0, 0), mm[dat$group != "a",])) 79 | expect_false(is_contrast_estimable(c(0, 2, -1, 0, 0), mm[dat$group != "a",])) 80 | }) 81 | 82 | 83 | test_that("pseudoinverse works", { 84 | # Works well for full rank matrices 85 | mat <- randn(20, 3) 86 | pmat <- pseudoinverse(mat) 87 | expect_equal(mat %*% pmat %*% mat, mat) 88 | expect_equal(pmat %*% mat %*% pmat, pmat) 89 | expect_equal(pmat, solve(t(mat) %*% mat) %*% t(mat)) 90 | 91 | 92 | # Works well for non-full rank matrices 93 | mat <- randn(20, 2) 94 | mat <- cbind(mat, mat[,2]) 95 | pmat <- pseudoinverse(mat) 96 | expect_equal(mat %*% pmat %*% mat, mat) 97 | expect_equal(pmat %*% mat %*% pmat, pmat) 98 | # expect_equal(pmat, solve(t(mat) %*% mat) %*% t(mat)) 99 | 100 | }) 101 | 102 | 103 | test_that("update_values works", { 104 | df1 <- S4Vectors::DataFrame(x = 1:5, y = letters[1:5]) 105 | df2 <- data.frame(a = 10^(0:4), b = "red", x = letters[10:14]) 106 | 107 | expect_equal(NULL %update_values% NULL, NULL) 108 | expect_equal(df1 %update_values% NULL, df1) 109 | expect_equal(NULL %update_values% df2, df2) 110 | expect_equal(df1 %update_values% df2, S4Vectors::DataFrame(x = df2$x, y = df1$y, a = df2$a, b = df2$b)) 111 | }) 112 | 113 | test_that("default_to works", { 114 | df1 <- S4Vectors::DataFrame(x = 1:5, y = letters[1:5]) 115 | df2 <- data.frame(a = 10^(0:4), b = "red", x = letters[10:14]) 116 | 117 | expect_equal(NULL %default_to% NULL, NULL) 118 | expect_equal(df1 %default_to% NULL, df1) 119 | expect_equal(NULL %default_to% df2, df2) 120 | expect_equal(df1 %default_to% df2, S4Vectors::DataFrame(x = df1$x, y = df1$y, a = df2$a, b = df2$b)) 121 | }) 122 | 123 | 124 | test_that("aggregate_matrix works", { 125 | mat <- matrix(rnorm(10 * 5), nrow = 10, ncol = 5) 126 | res <- aggregate_matrix(mat, group_split = list(c(1,3), c(2,4,5)), 127 | aggr_fnc = MatrixGenerics::rowSums2) 128 | expect_equal(res, cbind(rowSums(mat[,c(1,3)]), rowSums(mat[,c(2,4,5)]))) 129 | 130 | res <- aggregate_matrix(as(mat, "dgCMatrix"), group_split = list(c(1,3), c(2,4,5)), 131 | aggr_fnc = MatrixGenerics::rowSums2) 132 | expect_equal(res, cbind(rowSums(mat[,c(1,3)]), rowSums(mat[,c(2,4,5)]))) 133 | }) 134 | 135 | -------------------------------------------------------------------------------- /vignettes/.gitignore: 
-------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/man/figures/equation_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/lemur/a9dda42650c03fa8ed2ed4adf6d0f4fa76e2e249/vignettes/man/figures/equation_schematic.png --------------------------------------------------------------------------------
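The counting loop in `count_neighbors_fast()` (src/util.cpp above) has a compact R equivalent that can serve as a reference when checking the compiled code (a sketch, assuming `knn_mat` stores 1-based neighbor indices, as the C++ indexing implies):

count_neighbors_ref <- function(knn_mat, indices) {
  # For every cell, count how often it occurs among the k nearest neighbors of
  # the selected cells; this matches the double loop in util.cpp.
  tabulate(as.vector(knn_mat[indices, , drop = FALSE]), nbins = nrow(knn_mat))
}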