├── R
│   ├── countsplit.R
│   ├── estimate_zipoisson.R
│   ├── seurat_workflow.R
│   ├── copula.R
│   ├── estimate_negative_binomial.R
│   └── recall.R
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── docker-image.yml
│       ├── super-linter.yml
│       ├── check-standard.yml
│       ├── pkgdown.yaml
│       └── lintr.yml
├── LICENSE
├── .Rbuildignore
├── _pkgdown.yml
├── Dockerfile
├── man
│   ├── figures
│   │   └── recall_logo.png
│   ├── estimate_zi_poisson.Rd
│   ├── estimate_negative_binomial.Rd
│   ├── rzipoisson.Rd
│   ├── estimate_negative_binomial_copula.Rd
│   ├── compute_knockoff_filter.Rd
│   ├── get_seurat_obj_with_artificial_variables.Rd
│   ├── seurat_workflow.Rd
│   ├── FindClustersRecall.Rd
│   └── FindClustersCountsplit.Rd
├── NAMESPACE
├── .gitignore
├── LICENSE.md
├── DESCRIPTION
├── vignettes
│   └── basic-usage.Rmd
└── README.md

/R/countsplit.R:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2024
2 | COPYRIGHT HOLDER: recall authors
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^_pkgdown\.yml$
2 | ^docs$
3 | ^pkgdown$
4 | ^\.github$
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://lcrawlab.github.io/recall/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM rocker/verse:4.0.5
2 | 
3 | # Copy the package source into the image so that the local install below
4 | # has something to install (without a COPY step, the build context is not
5 | # available inside the image and install.packages('.') fails).
6 | COPY . /recall
7 | WORKDIR /recall
8 | 
9 | RUN R -e "install.packages('.', type = 'source', repos = NULL)"
--------------------------------------------------------------------------------
/man/figures/recall_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lcrawlab/recall/HEAD/man/figures/recall_logo.png
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(FindClustersCountsplit)
4 | export(FindClustersRecall)
5 | export(seurat_workflow)
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 | 
3 | on:
4 |   push:
5 |     branches: [ "main" ]
6 |   pull_request:
7 |     branches: [ "main" ]
8 | 
9 | jobs:
10 | 
11 |   build:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v3
17 |     - name: Build the Docker image
18 |       run: docker build . 
--file Dockerfile --tag recall:$(date +%s)
19 | 
--------------------------------------------------------------------------------
/man/estimate_zi_poisson.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/estimate_zipoisson.R
3 | \name{estimate_zi_poisson}
4 | \alias{estimate_zi_poisson}
5 | \title{Maximum likelihood estimation for the zero-inflated Poisson distribution
6 | with Poisson parameter lambda and zero proportion prop.zero.}
7 | \usage{
8 | estimate_zi_poisson(data)
9 | }
10 | \arguments{
11 | \item{data}{The data to estimate parameters from.}
12 | }
13 | \value{
14 | Maximum likelihood estimators of the zero-inflated Poisson
15 | distribution
16 | }
17 | \description{
18 | Given data, computes the maximum likelihood estimators
19 | for the zero-inflated Poisson distribution.
20 | }
21 | 
--------------------------------------------------------------------------------
/man/estimate_negative_binomial.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/estimate_negative_binomial.R
3 | \name{estimate_negative_binomial}
4 | \alias{estimate_negative_binomial}
5 | \title{Maximum likelihood estimation for the negative binomial
6 | distribution.}
7 | \usage{
8 | estimate_negative_binomial(data, verbose = FALSE)
9 | }
10 | \arguments{
11 | \item{data}{The data to estimate parameters from.}
12 | 
13 | \item{verbose}{Whether or not to show all logging.}
14 | }
15 | \value{
16 | Maximum likelihood estimators size and mu for the negative
17 | binomial distribution
18 | }
19 | \description{
20 | Given data, computes the maximum likelihood estimators
21 | for the negative binomial distribution with parameters: size and mu.
22 | }
23 | 
--------------------------------------------------------------------------------
/man/rzipoisson.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/estimate_zipoisson.R
3 | \name{rzipoisson}
4 | \alias{rzipoisson}
5 | \title{Random data generation for the zero-inflated Poisson distribution
6 | with Poisson parameter lambda and zero proportion prop.zero.}
7 | \usage{
8 | rzipoisson(n, lambda, prop.zero)
9 | }
10 | \arguments{
11 | \item{n}{The number of samples to be simulated.}
12 | 
13 | \item{lambda}{The Poisson rate parameter.}
14 | 
15 | \item{prop.zero}{The proportion of excess zeroes.}
16 | }
17 | \value{
18 | Simulated data from ZIP(lambda, prop.zero).
19 | }
20 | \description{
21 | Given the number of samples desired, a Poisson parameter,
22 | lambda, and a zero proportion, prop.zero, simulates the number of desired
23 | samples from ZIP(lambda, prop.zero). 
24 | }
25 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 | 
5 | # Session Data files
6 | .RData
7 | .RDataTmp
8 | 
9 | # User-specific files
10 | .Ruserdata
11 | 
12 | # Example code in package build process
13 | *-Ex.R
14 | 
15 | # Output files from R CMD build
16 | /*.tar.gz
17 | 
18 | # Output files from R CMD check
19 | /*.Rcheck/
20 | 
21 | # RStudio files
22 | .Rproj.user/
23 | 
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 | 
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 | 
31 | # knitr and R markdown default cache directories
32 | *_cache/
33 | /cache/
34 | 
35 | # Temporary files created by R markdown
36 | *.utf8.md
37 | *.knit.md
38 | 
39 | # R Environment Variables
40 | .Renviron
41 | 
42 | # pkgdown site
43 | docs/
44 | 
45 | # translation temp files
46 | po/*~
47 | 
48 | # RStudio Connect folder
49 | rsconnect/
50 | .Rproj.user
51 | .Rdata
52 | .DS_Store
53 | docs
54 | inst/doc
--------------------------------------------------------------------------------
/.github/workflows/super-linter.yml:
--------------------------------------------------------------------------------
1 | # This workflow executes several linters on changed files based on languages used in your code base whenever
2 | # you push code or open a pull request.
3 | #
4 | # You can adjust the behavior by modifying this file.
5 | # For more information, see:
6 | # https://github.com/github/super-linter
7 | name: Lint Code Base
8 | 
9 | on:
10 |   push:
11 |     branches: [ "main" ]
12 |   pull_request:
13 |     branches: [ "main" ]
14 | jobs:
15 |   run-lint:
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - name: Checkout code
19 |         uses: actions/checkout@v3
20 |         with:
21 |           # Full git history is needed to get a proper list of changed files within `super-linter`
22 |           fetch-depth: 0
23 | 
24 |       - name: Lint Code Base
25 |         uses: github/super-linter@v4
26 |         env:
27 |           VALIDATE_ALL_CODEBASE: false
28 |           DEFAULT_BRANCH: "main"
29 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
30 | 
--------------------------------------------------------------------------------
/man/estimate_negative_binomial_copula.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/copula.R
3 | \name{estimate_negative_binomial_copula}
4 | \alias{estimate_negative_binomial_copula}
5 | \alias{estimate_zi_poisson_copula}
6 | \alias{estimate_poisson_copula}
7 | \alias{estimate_gaussian_copula}
8 | \title{Simulate synthetic null data from a copula model fit with scDesign3.}
9 | \usage{
10 | estimate_zi_poisson_copula(data_matrix, cores)
11 | 
12 | estimate_negative_binomial_copula(data_matrix, cores)
13 | 
14 | estimate_poisson_copula(data_matrix, cores)
15 | 
16 | estimate_gaussian_copula(data_matrix, cores)
17 | }
18 | \arguments{
19 | \item{data_matrix}{The data to estimate parameters from.}
20 | 
21 | \item{cores}{The number of CPU cores to use in estimation by scDesign3.}
22 | }
23 | \value{
24 | A matrix of synthetic null data simulated from the fitted zero-inflated Poisson copula model.
25 | 
26 | A matrix of synthetic null data simulated from the fitted negative binomial copula model.
27 | 
28 | A matrix of synthetic null data simulated from the fitted Poisson copula model.
29 | 
30 | A matrix of synthetic null data simulated from the fitted Gaussian copula model.
31 | }
32 | \description{
33 | Given data, fits zero-inflated Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
34 | 
35 | Given data, fits negative binomial marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
36 | 
37 | Given data, fits Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
38 | 
39 | Given data, fits Gaussian marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
40 | }
41 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 | 
3 | Copyright (c) 2024 recall authors
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/man/compute_knockoff_filter.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{compute_knockoff_filter}
4 | \alias{compute_knockoff_filter}
5 | \title{Returns the genes selected by the knockoff filter}
6 | \usage{
7 | compute_knockoff_filter(
8 |   seurat_obj,
9 |   cluster1,
10 |   cluster2,
11 |   q,
12 |   return_all = FALSE,
13 |   num_cores = 1,
14 |   shared_memory_max
15 | )
16 | }
17 | \arguments{
18 | \item{seurat_obj}{A Seurat object}
19 | 
20 | \item{cluster1}{The Idents of the first cluster of interest in seurat_obj}
21 | 
22 | \item{cluster2}{The Idents of the second cluster of interest in seurat_obj}
23 | 
24 | \item{q}{The desired rate to control the FDR at}
25 | 
26 | \item{return_all}{Determines if the returned object will contain all genes
27 | or just the selected genes.}
28 | 
29 | \item{num_cores}{The number of cores for computing marker genes in parallel.}
30 | 
31 | \item{shared_memory_max}{The maximum size for shared global variables.}
32 | }
33 | \value{
34 | A list with the selected genes (or all genes if return_all = TRUE), their W statistics, and the knockoff threshold.
35 | }
36 | \description{
37 | Given a Seurat object and two cluster identities, returns the genes
38 | selected by the knockoff filter and their W statistics. 
39 | }
40 | 
--------------------------------------------------------------------------------
/man/get_seurat_obj_with_artificial_variables.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{get_seurat_obj_with_artificial_variables}
4 | \alias{get_seurat_obj_with_artificial_variables}
5 | \title{Returns a Seurat object that contains additional (fake) RNA
6 | expression counts.}
7 | \usage{
8 | get_seurat_obj_with_artificial_variables(
9 |   seurat_obj,
10 |   assay = "RNA",
11 |   null_method = "ZIP",
12 |   verbose = TRUE,
13 |   cores
14 | )
15 | }
16 | \arguments{
17 | \item{seurat_obj}{A Seurat object containing RNA expression counts.}
18 | 
19 | \item{assay}{The assay to generate artificial variables from.}
20 | 
21 | \item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)}
22 | 
23 | \item{verbose}{Whether or not to show logging.}
24 | 
25 | \item{cores}{The number of cores to use in generating synthetic null variables.}
26 | }
27 | \value{
28 | A Seurat object that contains the original variable features and an
29 | equal number of artificial features.
30 | }
31 | \description{
32 | Given a Seurat object, returns a new Seurat object whose RNA
33 | expression counts include the
34 | variable features from the original object and an equal number of artificial
35 | features.
36 | }
37 | 
--------------------------------------------------------------------------------
/man/seurat_workflow.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/seurat_workflow.R
3 | \name{seurat_workflow}
4 | \alias{seurat_workflow}
5 | \title{Runs a typical Seurat workflow on a Seurat object (up to
6 | dimensionality reduction and clustering).}
7 | \usage{
8 | seurat_workflow(
9 |   seurat_obj,
10 |   num_variable_features,
11 |   resolution_param = 0.8,
12 |   visualization_method = "umap",
13 |   num_dims = 10,
14 |   algorithm = "louvain"
15 | )
16 | }
17 | \arguments{
18 | \item{seurat_obj}{A Seurat object that will be analyzed.}
19 | 
20 | \item{num_variable_features}{The number of variable features to use in the
21 | analysis.}
22 | 
23 | \item{resolution_param}{The resolution parameter to use when clustering.}
24 | 
25 | \item{visualization_method}{Either "umap", "tsne", or "both".}
26 | 
27 | \item{num_dims}{The number of principal components to use.}
28 | 
29 | \item{algorithm}{The clustering algorithm to use, either "louvain" or
30 | "leiden".}
31 | }
32 | \value{
33 | A Seurat object containing the relevant analysis results.
34 | }
35 | \description{
36 | Given a Seurat object, returns a new Seurat object that has been
37 | normalized, had variable features identified,
38 | scaled, had principal components computed, had clusters identified, and had
39 | tSNE and UMAP embeddings determined. 
40 | }
41 | 
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: recall
2 | Title: Calibrated clustering with artificial variables to avoid over-clustering in single-cell RNA-sequencing
3 | Version: 0.0.0
4 | Authors@R:
5 |     person("Alan", "DenAdel", , "alan_denadel@brown.edu", role = c("aut", "cre"),
6 |            comment = c(ORCID = "0000-0002-7985-6789"))
7 | Description: recall (Calibrated Clustering with Artificial Variables) is a method for protecting
8 |     against over-clustering by controlling for the impact of double-dipping. The approach
9 |     can be applied to any clustering algorithm (implemented are the Louvain and Leiden algorithms with
10 |     plans for K-means and hierarchical clustering algorithms). The method provides state-of-the-art
11 |     clustering performance, can rapidly analyze large-scale scRNA-seq studies, and is
12 |     compatible with the Seurat library.
13 | Encoding: UTF-8
14 | Roxygen: list(markdown = TRUE)
15 | RoxygenNote: 7.3.2
16 | Imports:
17 |     Matrix,
18 |     Seurat (>= 5.0.1),
19 |     SingleCellExperiment,
20 |     scDesign3,
21 |     SummarizedExperiment,
22 |     MASS,
23 |     fitdistrplus,
24 |     lamW,
25 |     knockoff,
26 |     future,
27 |     stats,
28 |     cli,
29 |     stringr,
30 |     countsplit
31 | License: MIT + file LICENSE
32 | Suggests:
33 |     knitr,
34 |     markdown
35 | Remotes:
36 |     scDesign3=github::SONGDONGYUAN1994/scDesign3
37 | VignetteBuilder: knitr
38 | URL: https://lcrawlab.github.io/recall/
39 | 
--------------------------------------------------------------------------------
/.github/workflows/check-standard.yml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 | 
9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macos-latest, r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest, r: 'release'}
25 |           - {os: ubuntu-latest, r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v4
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 |   release:
9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown.yaml
13 | 
14 | permissions: read-all
15 | 
16 | jobs:
17 |   pkgdown:
18 |     runs-on: ubuntu-latest
19 |     # Only restrict concurrency for non-PR jobs
20 |     concurrency:
21 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
22 |     env:
23 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
24 |     permissions:
25 |       contents: write
26 |     steps:
27 |       - uses: actions/checkout@v4
28 | 
29 |       - uses: r-lib/actions/setup-pandoc@v2
30 | 
31 |       - uses: r-lib/actions/setup-r@v2
32 |         with:
33 |           use-public-rspm: true
34 | 
35 |       - uses: r-lib/actions/setup-r-dependencies@v2
36 |         with:
37 |           extra-packages: any::pkgdown, local::.
38 |           needs: website
39 | 
40 |       - name: Build site
41 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
42 |         shell: Rscript {0}
43 | 
44 |       - name: Deploy to GitHub pages 🚀
45 |         if: github.event_name != 'pull_request'
46 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
47 |         with:
48 |           clean: false
49 |           branch: gh-pages
50 |           folder: docs
51 | 
--------------------------------------------------------------------------------
/vignettes/basic-usage.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Basic Usage on PBMC3k Data"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 |   %\VignetteIndexEntry{basic-usage}
6 |   %\VignetteEngine{knitr::rmarkdown}
7 |   %\VignetteEncoding{UTF-8}
8 | 
9 | ---
10 | 
11 | ```{r, include = FALSE}
12 | knitr::opts_chunk$set(
13 |   collapse = TRUE,
14 |   comment = "#>"
15 | )
16 | 
17 | knitr::opts_chunk$set(eval = FALSE)
18 | 
19 | ```
20 | 
21 | ```{r setup}
22 | suppressPackageStartupMessages({
23 | library(Seurat)
24 | library(SeuratData)
25 | library(recall)
26 | })
27 | ```
28 | 
29 | 
30 | First, we use the `SeuratData` data package to download and then load
31 | 2700 PBMCs. The loaded `SeuratObject`, `pbmc3k`, is from an old version of
32 | `Seurat`, and so we update the object to v5.
33 | 
34 | ```{r load_data}
35 | set.seed(123)
36 | 
37 | SeuratData::InstallData("pbmc3k")
38 | data("pbmc3k")
39 | 
40 | pbmc3k <- UpdateSeuratObject(pbmc3k)
41 | ```
42 | 
43 | Now, we use `Seurat` to perform the usual preprocessing steps that are performed prior to clustering.
44 | 
45 | ```{r preprocessing}
46 | pbmc3k <- NormalizeData(pbmc3k)
47 | pbmc3k <- FindVariableFeatures(pbmc3k)
48 | pbmc3k <- ScaleData(pbmc3k)
49 | pbmc3k <- RunPCA(pbmc3k)
50 | pbmc3k <- FindNeighbors(pbmc3k)
51 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
52 | ```
53 | 
54 | The `recall` algorithm can be run with a single function call as a drop-in
55 | replacement for the `Seurat` function `FindClusters`.
56 | 
57 | ```{r run_recall}
58 | pbmc3k <- FindClustersRecall(pbmc3k)
59 | ```
60 | 
61 | The `recall` clusters are set to the idents of the `SeuratObject` that is
62 | returned by `FindClustersRecall`.
63 | 
64 | ```{r plot_umap}
65 | DimPlot(pbmc3k)
66 | ```
67 | 
68 | Cluster labels from `FindClustersRecall` are stored in the metadata in the
69 | column `pbmc3k@meta.data$recall_clusters`. 
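
A quick way to inspect the calibrated clustering is to tabulate how many
cells received each label (a minimal check; this assumes `FindClustersRecall`
was run as above).

```{r cluster_counts}
table(pbmc3k@meta.data$recall_clusters)
```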
70 | 71 | ```{r plot_umap2} 72 | DimPlot(pbmc3k, group.by = "recall_clusters") 73 | ``` 74 | -------------------------------------------------------------------------------- /.github/workflows/lintr.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # lintr provides static code analysis for R. 6 | # It checks for adherence to a given style, 7 | # identifying syntax errors and possible semantic issues, 8 | # then reports them to you so you can take action. 9 | # More details at https://lintr.r-lib.org/ 10 | 11 | name: lintr 12 | 13 | on: 14 | push: 15 | branches: [ "main" ] 16 | pull_request: 17 | # The branches below must be a subset of the branches above 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '18 7 * * 6' 21 | 22 | permissions: 23 | contents: read 24 | 25 | jobs: 26 | lintr: 27 | name: Run lintr scanning 28 | runs-on: ubuntu-latest 29 | permissions: 30 | contents: read # for checkout to fetch code 31 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 32 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 33 | 34 | steps: 35 | - name: Checkout code 36 | uses: actions/checkout@v4 37 | 38 | - name: Setup R 39 | uses: r-lib/actions/setup-r@4e1feaf90520ec1215d1882fdddfe3411c08e492 40 | with: 41 | r-version: '4.3' # The R version to download (if necessary) and use. 42 | 43 | - name: Setup lintr 44 | uses: r-lib/actions/setup-r-dependencies@4e1feaf90520ec1215d1882fdddfe3411c08e492 45 | with: 46 | extra-packages: lintr 47 | 48 | - name: Run lintr 49 | run: lintr::sarif_output(lintr::lint_dir("."), "lintr-results.sarif") 50 | shell: Rscript {0} 51 | continue-on-error: true 52 | 53 | - name: Upload analysis results to GitHub 54 | uses: github/codeql-action/upload-sarif@v3 55 | with: 56 | sarif_file: lintr-results.sarif 57 | wait-for-processing: true 58 | -------------------------------------------------------------------------------- /R/estimate_zipoisson.R: -------------------------------------------------------------------------------- 1 | 2 | # https://en.wikipedia.org/wiki/Zero-inflated_model#Estimators_of_ZIP_parameters 3 | # https://math.stackexchange.com/questions/2761563/maximum-likelihood-estimation-for-zero-inflated-poisson-distribution 4 | # https://ieeexplore.ieee.org/document/9032203 5 | 6 | #' @title Maximum likelihood estimation for the zero-inflated Poisson distribution 7 | #' with Poisson parameter lambda and zero proportion prop.zero. 8 | #' 9 | #' @description Given data, computes the maximum likelihood estimators 10 | #' for the zero-inflated Poisson distribution. 11 | #' 12 | #' @param data The data to estimate parameters from. 
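#' @details The estimator implemented below (based on the references linked
#' above) has a closed form: with x.bar the sample mean and r0 the observed
#' proportion of zeros, it sets gamma = x.bar / (1 - r0), computes
#' lambda.hat = W0(-gamma * exp(-gamma)) + gamma using the principal branch
#' W0 of the Lambert W function (lamW::lambertW0), and then recovers
#' pi.hat = 1 - x.bar / lambda.hat.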
13 | #' @returns Maximum likelihood estimators of the zero-inflated Poisson
14 | #' distribution
15 | #' @name estimate_zi_poisson
16 | estimate_zi_poisson <- function(data) {
17 |   num.zeros <- sum(data == 0)
18 |   r0 <- 1 / length(data) * num.zeros
19 | 
20 |   x.bar <- mean(data)
21 | 
22 |   gamma <- x.bar / (1 - r0)
23 | 
24 |   lambda.hat <- lamW::lambertW0(-gamma * exp(-gamma)) + gamma
25 | 
26 |   pi.hat <- 1 - x.bar / lambda.hat
27 | 
28 | 
29 |   return.list <- list("lambda.hat" = lambda.hat, "pi.hat" = pi.hat)
30 |   return(return.list)
31 | }
32 | 
33 | 
34 | #' @title Random data generation for the zero-inflated Poisson distribution
35 | #' with Poisson parameter lambda and zero proportion prop.zero.
36 | #'
37 | #' @description Given the number of samples desired, a Poisson parameter,
38 | #' lambda, and a zero proportion, prop.zero, simulates the number of desired
39 | #' samples from ZIP(lambda, prop.zero).
40 | #'
41 | #' @param n The number of samples to be simulated.
42 | #' @param lambda The Poisson rate parameter.
43 | #' @param prop.zero The proportion of excess zeroes.
44 | #' @returns Simulated data from ZIP(lambda, prop.zero).
45 | #' @name rzipoisson
46 | rzipoisson <- function(n, lambda, prop.zero) {
47 |   data <- c()
48 | 
49 | 
50 |   for (i in 1:n) {
51 |     if (stats::runif(1) < prop.zero) {
52 |       data[i] <- 0
53 |     }
54 |     else {
55 |       data[i] <- stats::rpois(1, lambda)
56 |     }
57 |   }
58 |   return(data)
59 | }
60 | 
61 | 
--------------------------------------------------------------------------------
/man/FindClustersRecall.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{FindClustersRecall}
4 | \alias{FindClustersRecall}
5 | \title{Runs the recall clustering algorithm on a Seurat object to
6 | obtain calibrated cluster labels.}
7 | \usage{
8 | FindClustersRecall(
9 |   seurat_obj,
10 |   resolution_start = 0.8,
11 |   reduction_percentage = 0.2,
12 |   num_clusters_start = 20,
13 |   dims = 1:10,
14 |   algorithm = "louvain",
15 |   null_method = "ZIP",
16 |   assay = "RNA",
17 |   cores = 1,
18 |   shared_memory_max = 8000 * 1024^2,
19 |   verbose = TRUE
20 | )
21 | }
22 | \arguments{
23 | \item{seurat_obj}{The Seurat object that will be analyzed.}
24 | 
25 | \item{resolution_start}{The starting resolution to be used for the
26 | clustering algorithm (Louvain and Leiden algorithms).}
27 | 
28 | \item{reduction_percentage}{The amount that the starting parameter will be
29 | reduced by after each iteration (between 0 and 1).}
30 | 
31 | \item{num_clusters_start}{The starting number of clusters to be used for the
32 | clustering algorithm (K-means and Hierarchical clustering algorithms).}
33 | 
34 | \item{dims}{The dimensions to use as input features (i.e. 1:10).}
35 | 
36 | \item{algorithm}{The clustering algorithm to be used.}
37 | 
38 | \item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)}
39 | 
40 | \item{assay}{The assay to generate artificial variables from.}
41 | 
42 | \item{cores}{The number of cores to compute marker genes in parallel.}
43 | 
44 | \item{shared_memory_max}{The maximum size for shared global variables.
45 | Increase this variable if you see the following error:
46 | The total size of the X globals that need to be exported for the future expression
47 | ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
48 | (option 'future.globals.maxSize'). 
The X largest globals are ...}
49 | 
50 | \item{verbose}{Whether or not to show all logging.}
51 | }
52 | \value{
53 | Returns a Seurat object where the idents have been updated with the
54 | clusters determined via the recall algorithm.
55 | Latest clustering results will be stored in the object metadata under
56 | 'recall_clusters'. Note that 'recall_clusters' will be overwritten every
57 | time FindClustersRecall is run.
58 | }
59 | \description{
60 | Given a Seurat object, returns a new Seurat object whose cluster labels
61 | have been calibrated with the recall algorithm, so that observed
62 | differences in expression between the reported clusters are not
63 | artifacts of double-dipping.
64 | }
65 | 
--------------------------------------------------------------------------------
/man/FindClustersCountsplit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{FindClustersCountsplit}
4 | \alias{FindClustersCountsplit}
5 | \title{Runs the countsplit clustering procedure on a Seurat object to
6 | obtain calibrated cluster labels.}
7 | \usage{
8 | FindClustersCountsplit(
9 |   seurat_obj,
10 |   resolution_start = 0.8,
11 |   reduction_percentage = 0.2,
12 |   num_clusters_start = 20,
13 |   dims = 1:10,
14 |   algorithm = "louvain",
15 |   null_method = "ZIP",
16 |   assay = "RNA",
17 |   cores = 1,
18 |   shared_memory_max = 8000 * 1024^2,
19 |   verbose = TRUE
20 | )
21 | }
22 | \arguments{
23 | \item{seurat_obj}{The Seurat object that will be analyzed.}
24 | 
25 | \item{resolution_start}{The starting resolution to be used for the
26 | clustering algorithm (Louvain and Leiden algorithms).}
27 | 
28 | \item{reduction_percentage}{The amount that the starting parameter will be
29 | reduced by after each iteration (between 0 and 1).}
30 | 
31 | \item{num_clusters_start}{The starting number of clusters to be used for the
32 | clustering algorithm (K-means and Hierarchical clustering algorithms).}
33 | 
34 | \item{dims}{The dimensions to use as input features (i.e. 1:10).}
35 | 
36 | \item{algorithm}{The clustering algorithm to be used.}
37 | 
38 | \item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)}
39 | 
40 | \item{assay}{The assay to generate artificial variables from.}
41 | 
42 | \item{cores}{The number of cores to compute marker genes in parallel.}
43 | 
44 | \item{shared_memory_max}{The maximum size for shared global variables.
45 | Increase this variable if you see the following error:
46 | The total size of the X globals that need to be exported for the future expression
47 | ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
48 | (option 'future.globals.maxSize'). The X largest globals are ...}
49 | 
50 | \item{verbose}{Whether or not to show all logging.}
51 | }
52 | \value{
53 | Returns a Seurat object where the idents have been updated with the
54 | clusters determined via the countsplit algorithm.
55 | Latest clustering results will be stored in the object metadata under
56 | 'countsplit_clusters'. Note that 'countsplit_clusters' will be overwritten every
57 | time FindClustersCountsplit is run.
58 | }
59 | \description{
60 | Given a Seurat object, returns a new Seurat object whose cluster labels
61 | have been calibrated with the countsplit procedure, so that observed
62 | differences in expression between the reported clusters are not
63 | artifacts of double-dipping. 
64 | }
65 | 
--------------------------------------------------------------------------------
/R/seurat_workflow.R:
--------------------------------------------------------------------------------
1 | #' @title Runs a typical Seurat workflow on a Seurat object (up to
2 | #' dimensionality reduction and clustering).
3 | #'
4 | #' @description Given a Seurat object, returns a new Seurat object that has been
5 | #' normalized, had variable features identified,
6 | #' scaled, had principal components computed, had clusters identified, and had
7 | #' tSNE and UMAP embeddings determined.
8 | #'
9 | #' @param seurat_obj A Seurat object that will be analyzed.
10 | #' @param num_variable_features The number of variable features to use in the
11 | #' analysis.
12 | #' @param resolution_param The resolution parameter to use when clustering.
13 | #' @param visualization_method Either "umap", "tsne", or "both".
14 | #' @param num_dims The number of principal components to use.
15 | #' @param algorithm The clustering algorithm to use, either "louvain" or
16 | #' "leiden".
17 | #' @returns A Seurat object containing the relevant analysis results.
18 | #' @export
19 | #' @name seurat_workflow
20 | seurat_workflow <- function(seurat_obj,
21 |                             num_variable_features,
22 |                             resolution_param = 0.8,
23 |                             visualization_method = "umap",
24 |                             num_dims = 10,
25 |                             algorithm = "louvain") {
26 |   seurat_obj <- Seurat::NormalizeData(seurat_obj)
27 | 
28 |   seurat_obj <- Seurat::FindVariableFeatures(seurat_obj,
29 |                                              selection.method = "vst",
30 |                                              nfeatures = num_variable_features)
31 | 
32 |   all_genes <- rownames(seurat_obj)
33 | 
34 |   seurat_obj <- Seurat::ScaleData(seurat_obj)
35 | 
36 |   seurat_obj <- Seurat::RunPCA(seurat_obj,
37 |                                features = Seurat::VariableFeatures(object = seurat_obj))
38 | 
39 |   seurat_obj <- Seurat::FindNeighbors(seurat_obj, dims = 1:num_dims)
40 | 
41 |   if (algorithm == "louvain") {
42 |     seurat_obj <- Seurat::FindClusters(seurat_obj,
43 |                                        resolution = resolution_param)
44 |   }
45 | 
46 |   if (algorithm == "leiden") {
47 |     seurat_obj <- Seurat::FindClusters(seurat_obj,
48 |                                        resolution = resolution_param,
49 |                                        algorithm = 4,
50 |                                        method = "igraph")
51 |   }
52 | 
53 |   if (visualization_method == "umap") {
54 |     seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
55 |   }
56 |   if (visualization_method == "tsne") {
57 |     seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
58 |   }
59 | 
60 |   if (visualization_method == "both") {
61 |     seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
62 |     seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
63 |   }
64 | 
65 |   return(seurat_obj)
66 | }
67 | 
--------------------------------------------------------------------------------
/R/copula.R:
--------------------------------------------------------------------------------
1 | simulate_data_scDesign3 <- function(data_matrix, cores, family) {
2 |   sce <- SingleCellExperiment::SingleCellExperiment(list(counts = data_matrix))
3 |   SummarizedExperiment::colData(sce)$cell_type <- "1" # scDesign3 needs a cell type so we just make it the same for all cells
4 | 
5 |   simulated_data <- scDesign3::scdesign3(sce,
6 |                                          celltype = "cell_type",
7 |                                          pseudotime = NULL,
8 |                                          spatial = NULL,
9 |                                          other_covariates = NULL,
10 |                                          empirical_quantile = FALSE,
11 |                                          usebam = TRUE, # to speed up marginal inference
12 |                                          mu_formula = "1",
13 |                                          sigma_formula = "1",
14 |                                          corr_formula = "1",
15 |                                          family_use = family, # this is the key parameter
16 |                                          nonzerovar = FALSE,
17 |                                          n_cores = cores,
18 |                                          parallelization = "mcmapply",
19 |                                          important_feature = "all", 
20 |                                          nonnegative = FALSE,
21 |                                          copula = "gaussian",
22 |                                          fastmvn = TRUE)
23 | 
24 |   ko <- simulated_data$new_count
25 | 
26 |   return(ko)
27 | }
28 | 
29 | 
30 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
31 | #'
32 | #' @description Given data, fits zero-inflated Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
33 | #'
34 | #' @param data_matrix The data to estimate parameters from.
35 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
36 | #' @returns A matrix of synthetic null data simulated from the fitted zero-inflated Poisson copula model.
37 | #' @name estimate_negative_binomial_copula
38 | estimate_zi_poisson_copula <- function(data_matrix, cores) {
39 |   family <- "zip"
40 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
41 |   return(ko)
42 | }
43 | 
44 | 
45 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
46 | #'
47 | #' @description Given data, fits negative binomial marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
48 | #'
49 | #' @param data_matrix The data to estimate parameters from.
50 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
51 | #' @returns A matrix of synthetic null data simulated from the fitted negative binomial copula model.
52 | #' @name estimate_negative_binomial_copula
53 | estimate_negative_binomial_copula <- function(data_matrix, cores) {
54 |   family <- "nb"
55 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
56 |   return(ko)
57 | }
58 | 
59 | 
60 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
61 | #'
62 | #' @description Given data, fits Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
63 | #'
64 | #' @param data_matrix The data to estimate parameters from.
65 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
66 | #' @returns A matrix of synthetic null data simulated from the fitted Poisson copula model.
67 | #' @name estimate_negative_binomial_copula
68 | estimate_poisson_copula <- function(data_matrix, cores) {
69 |   family <- "poisson"
70 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
71 |   return(ko)
72 | }
73 | 
74 | 
75 | 
76 | 
77 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
78 | #'
79 | #' @description Given data, fits Gaussian marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
80 | #'
81 | #' @param data_matrix The data to estimate parameters from.
82 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
83 | #' @returns A matrix of synthetic null data simulated from the fitted Gaussian copula model.
84 | #' @name estimate_negative_binomial_copula
85 | estimate_gaussian_copula <- function(data_matrix, cores) {
86 |   family <- "gaussian"
87 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
88 |   return(ko)
89 | }
90 | 
91 | 
92 | 
93 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # recall (Calibrated Clustering with Artificial Variables)
2 | 
3 | [![R CMD check](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml)
4 | [![Docker Image CI](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml)
5 | 
6 | ## Introduction
7 | 
8 | Standard single-cell RNA-sequencing (scRNA-seq) pipelines nearly always include unsupervised clustering as a key step in identifying biologically distinct cell types. A follow-up step in these pipelines is to test for differential expression between the identified clusters. When algorithms over-cluster, downstream analyses will produce inflated P-values, resulting in increased false discoveries.
9 | Here, we present `recall` (Calibrated Clustering with Artificial Variables): a new method for protecting against over-clustering by controlling for the impact of double-dipping.
10 | Importantly, our approach can be applied to any clustering algorithm (implemented here are the Louvain and Leiden algorithms with plans to implement the K-means and hierarchical clustering algorithms). 
11 | `recall` provides state-of-the-art clustering performance and can rapidly analyze large-scale scRNA-seq studies, even on a personal laptop.
12 | 
13 | ## Installation
14 | 
15 | You can install the latest development version by using the [devtools](https://CRAN.R-project.org/package=devtools) library with this command:
16 | 
17 | ```r
18 | devtools::install_github("lcrawlab/recall")
19 | ```
20 | 
21 | Although it is not explicitly a dependency, making sure you have `presto` installed will make `recall` much faster.
22 | 
23 | ```r
24 | devtools::install_github("immunogenomics/presto")
25 | ```
26 | 
27 | 
28 | ## Tutorial
29 | 
30 | ```r
31 | library(Seurat)
32 | library(SeuratData)
33 | 
34 | library(recall)
35 | 
36 | set.seed(123)
37 | 
38 | # load pbmc3k dataset
39 | SeuratData::InstallData("pbmc3k")
40 | data("pbmc3k")
41 | 
42 | pbmc3k <- UpdateSeuratObject(pbmc3k)
43 | 
44 | pbmc3k <- NormalizeData(pbmc3k)
45 | pbmc3k <- FindVariableFeatures(pbmc3k)
46 | pbmc3k <- ScaleData(pbmc3k)
47 | pbmc3k <- RunPCA(pbmc3k)
48 | pbmc3k <- FindNeighbors(pbmc3k)
49 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
50 | 
51 | pbmc_default <- FindClusters(pbmc3k)
52 | pbmc_recall <- FindClustersRecall(pbmc3k)
53 | 
54 | DimPlot(pbmc_default) + DimPlot(pbmc_recall)
55 | ```
56 | ## Overview of the Method
57 | 
58 | The `recall` algorithm consists of three simple steps:
59 | 
60 | 1. First, we generate synthetic null variables, inspired by knockoff variables (Barber and Candès, 2015), where we augment the single-cell data being analyzed with "fake" genes that are known not to contribute to any unique cell type.
61 | 2. Second, we perform both preprocessing and clustering on this augmented dataset.
62 | 3. Third, we calibrate the number of inferred clusters by using a hypothesis testing strategy with a data-dependent threshold to determine if there is a statistically significant difference between groups. If any pair of groups does not have statistically significant differences, then re-clustering occurs.
63 | 
64 | The synthetic genes act as negative control variables; they go through the same analytic steps as the real data and are presented with the same opportunity to be identified as marker genes.
65 | The `recall` algorithm uses the guiding principle that well-calibrated clusters (i.e., those representing real groups) should have significantly differentially expressed genes after correcting for multiple hypothesis tests, while over-clustered groups will not.
66 | We use this rule to iteratively re-cluster cells until the inferred clusters are well-calibrated and the observed differences in expression between groups are not due to the effects of double-dipping. A minimal code sketch of this selection rule is given in the appendix at the end of this README.
67 | 
68 | ## Relevant Citations
69 | `recall` is now published in AJHG, [here](https://www.cell.com/ajhg/abstract/S0002-9297(25)00061-8).
70 | 
71 | A. DenAdel, M. Ramseier, A. Navia, A. Shalek, S. Raghavan, P. Winter, A. Amini, and L. Crawford. A knockoff calibration method to avoid over-clustering in single-cell RNA-sequencing. _AJHG_.
72 | 
73 | ## Questions and Feedback
74 | For questions or concerns with `recall`, please contact
75 | [Alan DenAdel](mailto:alan_denadel@brown.edu) or [Lorin Crawford](mailto:lcrawford@microsoft.com). Any feedback on the software, manuscript, and tutorials is appreciated. 
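
## Appendix: A Sketch of the Selection Rule

As a small illustration of the hypothesis-testing step from the overview above, the snippet below mimics how `recall` contrasts real genes against their paired synthetic nulls. It is a self-contained sketch that uses simulated p-values in place of the `Seurat::FindMarkers` output that `recall` computes internally; the variables `p_original` and `p_synthetic` are placeholders, not part of the package API.

```r
library(knockoff)

set.seed(1)

# Simulated p-values: 30 truly differential genes plus 70 null genes,
# each paired with a synthetic null gene that is never differential.
p_original  <- c(10^-runif(30, 5, 12), runif(70))
p_synthetic <- runif(100)

# A gene scores highly only when it is far more significant than its
# paired synthetic null.
W <- -log10(p_original) + log10(p_synthetic)

# Data-dependent threshold that controls the FDR at q = 0.05.
threshold <- knockoff.threshold(W, fdr = 0.05, offset = 1)

selected <- which(W >= threshold)
length(selected)  # roughly the 30 differential genes
```

When no genes survive this filter for some pair of clusters, `recall` treats the pair as over-split and re-clusters at a lower resolution.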
76 | 
--------------------------------------------------------------------------------
/R/estimate_negative_binomial.R:
--------------------------------------------------------------------------------
1 | 
2 | #' @title Maximum likelihood estimation for the negative binomial
3 | #' distribution.
4 | #'
5 | #' @description Given data, computes the maximum likelihood estimators
6 | #' for the negative binomial distribution with parameters: size and mu.
7 | #'
8 | #' @param data The data to estimate parameters from.
9 | #' @returns Maximum likelihood estimators size and mu for the negative
10 | #' binomial distribution
11 | #' @param verbose Whether or not to show all logging.
12 | #' @name estimate_negative_binomial
13 | estimate_negative_binomial <- function(data, verbose=FALSE) {
14 | 
15 |   if (verbose) { message("Attempting MLE method 1") }
16 |   mle1 <- tryCatch(
17 |     {
18 |       nb_fit <- MASS::fitdistr(data, "negative binomial", method = "Nelder-Mead")
19 |       size <- nb_fit$estimate[["size"]]
20 |       mu <- nb_fit$estimate[["mu"]]
21 | 
22 |       # check if method returned NaN or NA without throwing an error
23 |       if (is.na(mu) || is.na(size)) { stop() }
24 | 
25 |       return.list <- list("size" = size, "mu" = mu)
26 |       return(return.list)
27 |     },
28 |     error = function(cond) {
29 |       if (verbose) { message("MLE method 1 failed with an error.") }
30 |       NA
31 |     },
32 |     warning = function(cond) {
33 |       if (verbose) { message("MLE method 1 had a warning. Warning message:\n") }
34 |       if (verbose) { message(cond) }
35 |       if (verbose) { message("\n") }
36 |       NA
37 |     }
38 |   )
39 | 
40 |   if (verbose) { message("Attempting MLE method 2") }
41 |   mle2 <- tryCatch(
42 |     {
43 |       nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mle")
44 |       size <- nb_fit$estimate[["size"]]
45 |       mu <- nb_fit$estimate[["mu"]]
46 | 
47 |       # check if method returned NaN or NA without throwing an error
48 |       if (is.na(mu) || is.na(size)) { stop() }
49 | 
50 |       return.list <- list("size" = size, "mu" = mu)
51 |       return(return.list)
52 | 
53 |     },
54 |     error = function(cond) {
55 |       if (verbose) { message("MLE method 2 failed with an error.") }
56 |       NA
57 |     },
58 |     warning = function(cond) {
59 |       if (verbose) { message("MLE method 2 had a warning. Warning message:") }
60 |       if (verbose) { message(cond) }
61 |       NA
62 |     }
63 |   )
64 | 
65 |   if (verbose) { message("Attempting MME") }
66 |   mme <- tryCatch(
67 |     {
68 |       nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mme")
69 |       size <- nb_fit$estimate[["size"]]
70 |       mu <- nb_fit$estimate[["mu"]]
71 | 
72 |       # check if method returned NaN or NA without throwing an error
73 |       if (is.na(mu) || is.na(size)) { stop() }
74 | 
75 |       return.list <- list("size" = size, "mu" = mu)
76 |       return(return.list)
77 |     },
78 |     error = function(cond) {
79 |       if (verbose) { message("MME failed with an error.") }
80 |       NA
81 |     },
82 |     warning = function(cond) {
83 |       if (verbose) { message("MME method had a warning. 
Warning message:") } 84 | if (verbose) { message(cond) } 85 | NA 86 | } 87 | ) 88 | 89 | 90 | if (verbose) { message("Attempting MME with warnings") } 91 | mme <- tryCatch( 92 | { 93 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mme") 94 | size <- nb_fit$estimate[["size"]] 95 | mu <- nb_fit$estimate[["mu"]] 96 | 97 | # check if method returned NaN or NA without throwing an error 98 | if (is.na(mu) || is.na(size)) { stop() } 99 | 100 | return.list <- list("size" = size, "mu" = mu) 101 | return(return.list) 102 | }, 103 | error = function(cond) { 104 | if (verbose) { message("MME failed with an error.") } 105 | NA 106 | } 107 | ) 108 | 109 | if (verbose) { message("Attempting MSE") } 110 | mme <- tryCatch( 111 | { 112 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mse") 113 | size <- nb_fit$estimate[["size"]] 114 | mu <- nb_fit$estimate[["mu"]] 115 | 116 | # check if method returned NaN or NA without throwing an error 117 | if (is.na(mu) || is.na(size)) { stop() } 118 | 119 | return.list <- list("size" = size, "mu" = mu) 120 | return(return.list) 121 | }, 122 | error = function(cond) { 123 | if (verbose) { message("MSE failed with an error.") } 124 | NA 125 | }, 126 | warning = function(cond) { 127 | if (verbose) { message("MSE method failed. Warning message:") } 128 | if (verbose) { message(cond) } 129 | NA 130 | } 131 | ) 132 | 133 | if (verbose) { message("Attempting QME") } 134 | mme <- tryCatch( 135 | { 136 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="qme") 137 | size <- nb_fit$estimate[["size"]] 138 | mu <- nb_fit$estimate[["mu"]] 139 | 140 | # check if method returned NaN or NA without throwing an error 141 | if (is.na(mu) || is.na(size)) { stop() } 142 | 143 | return.list <- list("size" = size, "mu" = mu) 144 | return(return.list) 145 | }, 146 | error = function(cond) { 147 | if (verbose) { message("QME failed with an error.") } 148 | NA 149 | }, 150 | warning = function(cond) { 151 | if (verbose) { message("QME method failed. Warning message:") } 152 | if (verbose) { message(cond) } 153 | NA 154 | } 155 | ) 156 | 157 | 158 | if (verbose) { message("Attempting MGE") } 159 | mme <- tryCatch( 160 | { 161 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mge") 162 | size <- nb_fit$estimate[["size"]] 163 | mu <- nb_fit$estimate[["mu"]] 164 | 165 | # check if method returned NaN or NA without throwing an error 166 | if (is.na(mu) || is.na(size)) { stop() } 167 | 168 | return.list <- list("size" = size, "mu" = mu) 169 | return(return.list) 170 | }, 171 | error = function(cond) { 172 | if (verbose) { message("MGE failed with an error.") } 173 | NA 174 | }, 175 | warning = function(cond) { 176 | if (verbose) { message("MGE method failed. Warning message:") } 177 | if (verbose) { message(cond) } 178 | NA 179 | } 180 | ) 181 | 182 | 183 | 184 | stop("All negative binomial estimation methods failed.") 185 | 186 | } 187 | -------------------------------------------------------------------------------- /R/recall.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Returns a Seurat object that contains additional (fake) RNA 3 | #' expression counts. 4 | #' 5 | #' @description Given a Seurat object, returns a new Seurat object whose RNA 6 | #' expression counts includes the 7 | #' variable features from the original object and an equal number of artificial 8 | #' features. 9 | #' 10 | #' @param seurat_obj A Seurat object containing RNA expression counts. 
11 | #' @param assay The assay to generate artificial variables from.
12 | #' @param null_method The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)
13 | #' @param cores The number of cores to use in generating synthetic null variables.
14 | #' @param verbose Whether or not to show logging.
15 | #' @returns A Seurat object that contains the original variable features and an
16 | #' equal number of artificial features.
17 | #' @name get_seurat_obj_with_artificial_variables
18 | get_seurat_obj_with_artificial_variables <- function(seurat_obj, assay = "RNA", null_method = "ZIP", verbose = TRUE, cores) {
19 | 
20 |   if (verbose) {
21 |     message("Pulling data from Seurat object")
22 |   }
23 | 
24 |   var_features <- Seurat::VariableFeatures(seurat_obj)
25 |   seurat_obj_data <- as.data.frame(t(as.matrix(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts")[var_features, ])))
26 | 
27 |   #if (verbose) {
28 |   #  message("Estimating the distribution of each gene")
29 |   #}
30 |   if (verbose) {
31 |     message("Computing artificial features")
32 |   }
33 | 
34 |   if (null_method == "ZIP") {
35 |     estimates <- lapply(seurat_obj_data, estimate_zi_poisson)
36 |     sampling_function <- function(x) {
37 |       rzipoisson(nrow(seurat_obj_data),
38 |                  x$lambda.hat,
39 |                  x$pi.hat)
40 |     }
41 |     ko <- as.data.frame(lapply(estimates, sampling_function))
42 | 
43 |   }
44 |   else if (null_method == "NB") {
45 |     estimates <- lapply(seurat_obj_data, estimate_negative_binomial)
46 |     sampling_function <- function(x) {
47 |       stats::rnbinom(nrow(seurat_obj_data),
48 |                      size = x$size,
49 |                      mu = x$mu)
50 |     }
51 |     ko <- as.data.frame(lapply(estimates, sampling_function))
52 |   }
53 |   else if (null_method == "ZIP-copula") {
54 |     ko <- estimate_zi_poisson_copula(seurat_obj_data, cores)
55 |   }
56 |   else if (null_method == "NB-copula") {
57 |     ko <- estimate_negative_binomial_copula(seurat_obj_data, cores)
58 |   }
59 |   else if (null_method == "Poisson-copula") {
60 |     ko <- estimate_poisson_copula(seurat_obj_data, cores)
61 |   }
62 |   else if (null_method == "Gaussian-copula") {
63 |     ko <- estimate_gaussian_copula(seurat_obj_data, cores)
64 |   }
65 |   else {
66 |     stop("You selected a null_method that is not supported. Choose from: ZIP, NB, ZIP-copula, NB-copula, Poisson-copula, Gaussian-copula.")
67 |   }
68 | 
69 | 
70 |   num_variable_features <- length(var_features)
71 |   colnames(ko) <- paste0(rep("knockoff", num_variable_features), 1:num_variable_features)
72 |   combined_data <- cbind(seurat_obj_data, ko)
73 | 
74 |   # sparsify augmented data matrix and transpose for use in Seurat
75 |   combined_data <- Matrix::Matrix(t(combined_data), sparse = TRUE)
76 | 
77 |   new_project_name <- paste0(seurat_obj@project.name, "_with_knockoffs")
78 |   new_seurat_obj <- Seurat::CreateSeuratObject(counts = combined_data, project = new_project_name)
79 | 
80 |   return(new_seurat_obj)
81 | }
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | #' @title Returns the genes selected by the knockoff filter
89 | #'
90 | #' @description Given a Seurat object and two cluster identities, returns the genes
91 | #' selected by the knockoff filter and their W statistics.
92 | #'
93 | #' @param seurat_obj A Seurat object
94 | #' @param cluster1 The Idents of the first cluster of interest in seurat_obj
95 | #' @param cluster2 The Idents of the second cluster of interest in seurat_obj
96 | #' @param q The desired rate to control the FDR at
97 | #' @param return_all Determines if the returned object will contain all genes
98 | #' or just the selected genes. 
99 | #' @param num_cores The number of cores for computing marker genes in parallel.
100 | #' @param shared_memory_max The maximum size for shared global variables.
101 | #' @returns A list with the selected genes (or all genes if return_all = TRUE), their W statistics, and the knockoff threshold.
102 | #' @name compute_knockoff_filter
103 | compute_knockoff_filter <- function(seurat_obj,
104 |                                     cluster1,
105 |                                     cluster2,
106 |                                     q,
107 |                                     return_all = FALSE,
108 |                                     num_cores = 1,
109 |                                     shared_memory_max) {
110 |   options(future.globals.maxSize = shared_memory_max)
111 |   # todo note what this is for, figure this out as a parameter or programmatically
112 |   future::plan("multicore", workers = as.numeric(num_cores))
113 | 
114 |   markers <- Seurat::FindMarkers(seurat_obj,
115 |                                  ident.1 = cluster1,
116 |                                  ident.2 = cluster2,
117 |                                  logfc.threshold = 0,
118 |                                  min.pct = 0)
119 | 
120 | 
121 |   # FindMarkers orders by p-value, so we can't rely on position to know which genes are which
122 |   knockoff_indices <- grepl("^knockoff", rownames(markers))
123 |   original_indices <- !knockoff_indices
124 | 
125 |   # subset the markers data.frame into originals and knockoffs
126 |   knockoff_markers <- markers[knockoff_indices, ]
127 |   original_markers <- markers[original_indices, ]
128 | 
129 |   all_genes <- rownames(seurat_obj)
130 | 
131 |   # get indices of knockoffs and originals from seurat_obj, should be [FALSE, ..., FALSE, TRUE, ..., TRUE]
132 |   knockoff_indices_sorted <- grepl("^knockoff", all_genes)
133 |   original_indices_sorted <- !knockoff_indices_sorted
134 | 
135 |   knockoff_names_sorted <- all_genes[knockoff_indices_sorted]
136 |   original_names_sorted <- all_genes[original_indices_sorted]
137 | 
138 |   # sort markers data.frames by their original orderings
139 |   knockoff_markers_sorted <- knockoff_markers[knockoff_names_sorted, ]
140 |   original_markers_sorted <- original_markers[original_names_sorted, ]
141 | 
142 |   original_p_values <- original_markers_sorted$p_val
143 |   knockoff_p_values <- knockoff_markers_sorted$p_val
144 | 
145 |   log_original_p_values <- -log10(original_p_values)
146 |   log_knockoff_p_values <- -log10(knockoff_p_values)
147 | 
148 |   W <- log_original_p_values - log_knockoff_p_values
149 | 
150 |   thres <- knockoff::knockoff.threshold(W, fdr = q, offset = 1)
151 | 
152 | 
153 |   if (return_all) {
154 |     all_features <- as.data.frame(list("gene" = original_names_sorted, "W" = W))
155 | 
156 |     ret <- list("all_features" = all_features, "threshold" = thres)
157 | 
158 |     return(ret)
159 |   }
160 |   selected_indices <- which(W >= thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p)
161 |   #selected_indices <- which(W > thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p)
162 | 
163 |   selected_genes <- original_names_sorted[selected_indices]
164 |   selected_Ws <- W[selected_indices]
165 | 
166 |   selected_features <- as.data.frame(list("selected_gene" = selected_genes, "W" = selected_Ws))
167 | 
168 |   selected_features <- selected_features[order(selected_features$W, decreasing = TRUE), ]
169 | 
170 |   ret <- list("selected_features" = selected_features, "threshold" = thres)
171 | 
172 |   return(ret)
173 | }
174 | 
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | #' @title Runs the recall clustering algorithm on a Seurat object to
183 | #' obtain calibrated cluster labels. 
184 | #'
185 | #' @description Given a Seurat object, returns a new Seurat object whose
186 | #' cluster labels have been calibrated with the recall algorithm, so that
187 | #' observed differences in expression between the reported clusters are not
188 | #' artifacts of double-dipping.
189 | #'
190 | #' @param seurat_obj The Seurat object that will be analyzed.
191 | #' @param resolution_start The starting resolution to be used for the
192 | #' clustering algorithm (Louvain and Leiden algorithms).
193 | #' @param num_clusters_start The starting number of clusters to be used for the
194 | #' clustering algorithm (K-means and Hierarchical clustering algorithms).
195 | #' @param reduction_percentage The amount that the starting parameter will be
196 | #' reduced by after each iteration (between 0 and 1).
197 | #' @param dims The dimensions to use as input features (i.e. 1:10).
198 | #' @param algorithm The clustering algorithm to be used.
199 | #' @param null_method The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)
200 | #' @param assay The assay to generate artificial variables from.
201 | #' @param cores The number of cores to compute marker genes in parallel.
202 | #' @param shared_memory_max The maximum size for shared global variables.
203 | #' Increase this variable if you see the following error:
204 | #' The total size of the X globals that need to be exported for the future expression
205 | #' ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
206 | #' (option 'future.globals.maxSize'). The X largest globals are ...
207 | #' @param verbose Whether or not to show all logging.
208 | #' @returns Returns a Seurat object where the idents have been updated with the
209 | #' clusters determined via the recall algorithm.
210 | #' Latest clustering results will be stored in the object metadata under
211 | #' 'recall_clusters'. Note that 'recall_clusters' will be overwritten every
212 | #' time FindClustersRecall is run. 
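#' @examples
#' \dontrun{
#' # Minimal usage sketch (mirrors the README tutorial); assumes `pbmc3k` has
#' # already been normalized and has had variable features, PCA, and
#' # neighbors computed.
#' pbmc3k <- FindClustersRecall(pbmc3k)
#' table(pbmc3k@meta.data$recall_clusters)
#' }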
213 | #' @name FindClustersRecall
214 | #' @export
215 | FindClustersRecall <- function(seurat_obj,
216 |                                resolution_start = 0.8,
217 |                                reduction_percentage = 0.2,
218 |                                num_clusters_start = 20,
219 |                                dims = 1:10,
220 |                                algorithm = "louvain", # todo implement all algos (currently only "louvain" and "leiden" are supported)
221 |                                null_method = "ZIP",
222 |                                assay = "RNA",
223 |                                cores = 1,
224 |                                shared_memory_max = 8000 * 1024^2,
225 |                                verbose = TRUE) {
226 | 
227 |   # todo check function arguments for validity
228 | 
229 |   augmented_with_artificial_variables_seurat_obj <- get_seurat_obj_with_artificial_variables(seurat_obj,
230 |                                                                                              assay = assay,
231 |                                                                                              null_method = null_method,
232 |                                                                                              verbose = verbose,
233 |                                                                                              cores = cores)
234 | 
235 |   num_variable_features <- 2 * length(Seurat::VariableFeatures(seurat_obj)) # doubled because the augmented object contains a knockoff copy of each original variable feature
236 | 
237 | 
238 |   # Pre-process data
239 |   options(future.globals.maxSize = shared_memory_max)
240 |   # todo log number of cores being used
241 |   future::plan("multicore", workers = as.numeric(cores))
243 | 
244 |   if (verbose) {
245 |     message(paste("Number of cores:", cores))
246 |   }
247 | 
249 | 
250 |   augmented_with_artificial_variables_seurat_obj <- Seurat::NormalizeData(augmented_with_artificial_variables_seurat_obj,
251 |                                                                           verbose = FALSE)
252 | 
253 |   augmented_with_artificial_variables_seurat_obj <- Seurat::FindVariableFeatures(augmented_with_artificial_variables_seurat_obj,
254 |                                                                                  selection.method = "vst",
255 |                                                                                  nfeatures = num_variable_features,
256 |                                                                                  verbose = FALSE)
257 | 
258 |   augmented_with_artificial_variables_seurat_obj <- Seurat::ScaleData(augmented_with_artificial_variables_seurat_obj, verbose = FALSE)
259 |   augmented_with_artificial_variables_seurat_obj <- Seurat::RunPCA(augmented_with_artificial_variables_seurat_obj,
260 |                                                                    features = Seurat::VariableFeatures(object = augmented_with_artificial_variables_seurat_obj),
261 |                                                                    verbose = FALSE)
262 | 
263 |   augmented_with_artificial_variables_seurat_obj <- Seurat::FindNeighbors(augmented_with_artificial_variables_seurat_obj,
264 |                                                                           dims = dims,
265 |                                                                           verbose = FALSE)
266 | 
267 |   resolution_param <- resolution_start
268 | 
269 | 
270 |   first_iteration <- TRUE
271 | 
272 |   while (TRUE) {
273 |     if (verbose) {
274 |       message("####################################################################")
275 |       message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
276 |       message(paste("Resolution param:", resolution_param))
277 |     }
278 | 
279 |     if (algorithm == "louvain") {
280 |       augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
281 |                                                                              resolution = resolution_param,
282 |                                                                              verbose = FALSE)
283 |     }
284 | 
285 |     if (algorithm == "leiden") {
286 |       #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
287 |       augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
288 |                                                                              resolution = resolution_param,
289 |                                                                              algorithm = 4,
290 |                                                                              method = "igraph",
291 |                                                                              verbose = FALSE)
292 |     }
293 | 
294 |     # Reduce resolution for the next iteration of the loop
295 |     resolution_param <- (1 - reduction_percentage) * resolution_param
296 | 
297 |     k <- length(levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj)))
299 | 
300 |     if (verbose) {
301 |       message("Num clusters:")
302 |       message(k)
303 |     }
304 | 
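    # The block below tests every unordered pair of the k clusters with the
    # knockoff filter at a target FDR of q = 0.05. If any pair yields zero
    # selected genes, that pair of clusters is considered indistinguishable,
    # the current clustering is rejected, and the while loop re-clusters at
    # the reduced resolution (with the defaults, the resolution decays
    # geometrically: 0.8, 0.64, 0.512, ...). The loop exits only once all
    # k * (k - 1) / 2 pairs differ significantly.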
305 |     knock_idents <- levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj))
306 | 
307 |     num_selected_matrix <- matrix(nrow = k, ncol = k)
308 | 
309 |     found_no_sign_diff <- FALSE
310 | 
311 |     num_clusters <- length(knock_idents)
312 | 
313 | 
314 |     if (verbose) {
315 |       progress_bar_length <- num_clusters * (num_clusters - 1) / 2
316 |       cli::cli_progress_bar("Processing cluster pairs:",
317 |                             total = progress_bar_length,
318 |                             clear = FALSE)
319 |     }
320 | 
321 |     m <- 0
322 |     for (i in 1:num_clusters) {
323 |       for (j in 1:num_clusters) {
324 |         if (j >= i) {
325 |           next
326 |         }
327 | 
328 |         m <- m + 1
329 | 
330 |         if (verbose) {
331 |           cli::cli_progress_update()
332 |         }
333 | 
334 |         markers_selected <- compute_knockoff_filter(seurat_obj = augmented_with_artificial_variables_seurat_obj,
335 |                                                     cluster1 = knock_idents[i],
336 |                                                     cluster2 = knock_idents[j],
337 |                                                     q = 0.05,
338 |                                                     num_cores = cores,
339 |                                                     shared_memory_max = shared_memory_max)
340 | 
341 |         num_selected <- nrow(markers_selected$selected_features)
342 | 
343 |         if (num_selected == 0) {
344 |           found_no_sign_diff <- TRUE
345 |           break
346 |         }
347 | 
348 |         num_selected_matrix[i, j] <- num_selected
349 |         num_selected_matrix[j, i] <- num_selected
350 | 
351 |       }
352 |       if (found_no_sign_diff) {
353 |         if (verbose) {
354 |           cli::cli_progress_done()
355 |           message("Found clusters with no significant differences.")
356 |           message("Progressing to next clustering iteration.")
357 |         }
358 |         first_iteration <- FALSE
359 |         break
360 |       }
361 |     }
362 | 
363 |     if (found_no_sign_diff) {
364 |       next
365 |     }
366 |     break
367 |   }
368 | 
369 |   if (first_iteration) {
370 |     warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run recall with a larger starting parameter.")
371 |   }
372 | 
373 |   seurat_obj@meta.data$recall_clusters <- Seurat::Idents(augmented_with_artificial_variables_seurat_obj)
374 |   Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$recall_clusters
375 | 
376 |   return(seurat_obj)
377 | }
378 | 
379 | 
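# A minimal standalone sketch of the count splitting idea used by
# FindClustersCountsplit below (illustrative only; the toy matrix is made up,
# and countsplit::countsplit() is the only real dependency). Binomial thinning
# splits each count into independent train and test folds, so clusters found
# on one fold can be tested on the other without double-dipping.
set.seed(1)
X <- matrix(rpois(200, lambda = 5), nrow = 20)  # toy counts: 20 genes x 10 cells
folds <- countsplit::countsplit(X)              # two folds by default
Xtrain <- folds[[1]]
Xtest  <- folds[[2]]
all(as.matrix(Xtrain) + as.matrix(Xtest) == X)  # TRUE: the folds sum back to the original counts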
383 | #' @title Finds calibrated clusters in a Seurat object by count splitting
384 | #' (a clustering is kept only when the held-out counts confirm it).
385 | #'
386 | #' @description Given a Seurat object, splits the counts into independent
387 | #' training and test matrices, repeatedly clusters the training data at
388 | #' decreasing resolution, and accepts a clustering only when every pair of
389 | #' clusters has at least one significant marker gene in the test data.
390 | #'
391 | #' @param seurat_obj The Seurat object that will be analyzed.
392 | #' @param resolution_start The starting resolution to be used for the
393 | #' clustering algorithm (Louvain and Leiden algorithms).
394 | #' @param num_clusters_start The starting number of clusters to be used for the
395 | #' clustering algorithm (k-means and hierarchical clustering algorithms).
396 | #' @param reduction_percentage The fraction (between 0 and 1) by which the
397 | #' starting parameter is reduced after each iteration.
398 | #' @param dims The dimensions to use as input features (e.g., 1:10).
399 | #' @param algorithm The clustering algorithm to be used.
400 | #' @param null_method The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, or NB-copula). Unused by this function.
401 | #' @param assay The assay whose counts are split into training and test sets.
402 | #' @param cores The number of cores used to compute marker genes in parallel.
403 | #' @param shared_memory_max The maximum size for shared global variables.
404 | #' Increase this variable if you see the following error:
405 | #' "The total size of the X globals that need to be exported for the future
406 | #' expression ('FUN()') is X GiB. This exceeds the maximum allowed size of
407 | #' 500.00 MiB (option 'future.globals.maxSize'). The X largest globals are ..."
408 | #' @param verbose Whether or not to show all logging.
409 | #' @returns Returns a Seurat object whose idents have been updated with the
410 | #' clusters determined by the countsplit algorithm. The latest clustering
411 | #' results are stored in the object metadata under 'countsplit_clusters'. Note
412 | #' that 'countsplit_clusters' is overwritten every time FindClustersCountsplit
413 | #' is run.
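#' @examples
#' \dontrun{
#' # Illustrative sketch: `pbmc` is a hypothetical Seurat object with counts
#' # in the "RNA" assay and variable features already identified.
#' pbmc <- FindClustersCountsplit(pbmc, cores = 2)
#' table(pbmc$countsplit_clusters)
#' }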
414 | #' @name FindClustersCountsplit
415 | #' @export
416 | FindClustersCountsplit <- function(seurat_obj,
417 |                                    resolution_start = 0.8,
418 |                                    reduction_percentage = 0.2,
419 |                                    num_clusters_start = 20,
420 |                                    dims = 1:10,
421 |                                    algorithm = "louvain", # todo implement all algos (currently only "louvain" and "leiden" are supported)
422 |                                    null_method = "ZIP",
423 |                                    assay = "RNA",
424 |                                    cores = 1,
425 |                                    shared_memory_max = 8000 * 1024^2,
426 |                                    verbose = TRUE) {
427 | 
428 |   options(future.globals.maxSize = shared_memory_max)
429 |   # todo log number of cores being used
430 |   future::plan("multicore", workers = as.numeric(cores))
431 | 
432 |   num_variable_features <- length(Seurat::VariableFeatures(seurat_obj))
433 | 
434 | 
435 |   # follow this issue:
436 |   # https://github.com/anna-neufeld/countsplit/issues/8
437 | 
438 |   # todo only do this for variable features
439 |   split <- countsplit::countsplit(Seurat::GetAssayData(seurat_obj, assay = assay))
440 |   Xtrain <- split[[1]]
441 |   Xtest <- split[[2]]
442 | 
443 |   seurat_obj_train <- Seurat::CreateSeuratObject(counts = Xtrain)
444 |   seurat_obj_test <- Seurat::CreateSeuratObject(counts = Xtest)
445 | 
446 | 
447 |   # process training data
448 |   seurat_obj_train <- Seurat::NormalizeData(seurat_obj_train,
449 |                                             verbose = FALSE)
450 | 
451 |   seurat_obj_train <- Seurat::FindVariableFeatures(seurat_obj_train,
452 |                                                    selection.method = "vst",
453 |                                                    nfeatures = num_variable_features,
454 |                                                    verbose = FALSE)
455 | 
456 |   seurat_obj_train <- Seurat::ScaleData(seurat_obj_train, verbose = FALSE)
457 |   seurat_obj_train <- Seurat::RunPCA(seurat_obj_train,
458 |                                      features = Seurat::VariableFeatures(object = seurat_obj_train),
459 |                                      verbose = FALSE)
460 | 
461 |   seurat_obj_train <- Seurat::FindNeighbors(seurat_obj_train,
462 |                                             dims = dims,
463 |                                             verbose = FALSE)
464 | 
465 |   # process test data (no need for PCA and FindNeighbors since we are just assigning idents based on training idents)
466 |   seurat_obj_test <- Seurat::NormalizeData(seurat_obj_test,
467 |                                            verbose = FALSE)
468 | 
469 |   seurat_obj_test <- Seurat::FindVariableFeatures(seurat_obj_test,
470 |                                                   selection.method = "vst",
471 |                                                   nfeatures = num_variable_features,
472 |                                                   verbose = FALSE)
473 | 
474 |   seurat_obj_test <- Seurat::ScaleData(seurat_obj_test, verbose = FALSE)
475 | 
476 | 
477 | 
478 |   resolution_param <- resolution_start
479 | 
480 |   # set up multicore for FindMarkers
481 |   future::plan("multicore", workers = as.numeric(cores))
482 | 
483 |   first_iteration <- TRUE
484 | 
485 |   while (TRUE) {
486 |     if (verbose) {
487 |       message("####################################################################")
488 |       message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
489 |       message(paste("Resolution param:", resolution_param))
490 |     }
491 | 
492 |     if (algorithm == "louvain") {
493 |       seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
494 |                                                resolution = resolution_param,
495 |                                                verbose = FALSE)
496 |     }
497 | 
498 |     if (algorithm == "leiden") {
499 |       #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
500 |       seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
501 |                                                resolution = resolution_param,
502 |                                                algorithm = 4,
503 |                                                method = "igraph",
504 |                                                verbose = FALSE)
505 |     }
506 | 
507 |     # Reduce resolution for the next iteration of the loop
508 |     resolution_param <- (1 - reduction_percentage) * resolution_param
509 | 
510 |     Seurat::Idents(seurat_obj_test) <- Seurat::Idents(seurat_obj_train) # carry the training clusters over to the held-out test cells
511 | 
512 |     k <- length(levels(Seurat::Idents(seurat_obj_test)))
514 | 
515 |     if (verbose) {
516 |       message("Num clusters:")
517 |       message(k)
518 |     }
519 | 
520 |     countsplit_idents <- levels(Seurat::Idents(seurat_obj_test))
521 | 
522 |     num_selected_matrix <- matrix(nrow = k, ncol = k)
523 | 
524 |     found_no_sign_diff <- FALSE
525 | 
526 |     num_clusters <- length(countsplit_idents)
527 | 
528 | 
529 |     if (verbose) {
530 |       progress_bar_length <- num_clusters * (num_clusters - 1) / 2
531 |       cli::cli_progress_bar("Processing cluster pairs:",
532 |                             total = progress_bar_length,
533 |                             clear = FALSE)
534 |     }
535 | 
536 |     m <- 0
537 |     for (i in 1:num_clusters) {
538 |       for (j in 1:num_clusters) {
539 |         if (j >= i) {
540 |           next
541 |         }
542 | 
543 |         m <- m + 1
544 | 
545 |         if (verbose) {
546 |           cli::cli_progress_update()
547 |         }
548 | 
549 |         markers_selected <- Seurat::FindMarkers(seurat_obj_test,
550 |                                                 ident.1 = countsplit_idents[i],
551 |                                                 ident.2 = countsplit_idents[j])
552 | 
553 |         num_selected <- sum(markers_selected$p_val_adj < 0.05) # Bonferroni-adjusted p-values from FindMarkers
554 | 
555 |         if (num_selected == 0) {
556 |           found_no_sign_diff <- TRUE
557 |           break
558 |         }
559 | 
560 |         num_selected_matrix[i, j] <- num_selected
561 |         num_selected_matrix[j, i] <- num_selected
562 | 
563 |       }
564 |       if (found_no_sign_diff) {
565 |         if (verbose) {
566 |           cli::cli_progress_done()
567 |           message("Found clusters with no significant differences.")
568 |           message("Progressing to next clustering iteration.")
569 |         }
570 |         first_iteration <- FALSE
571 |         break
572 |       }
573 |     }
574 | 
575 |     if (found_no_sign_diff) {
576 |       next
577 |     }
578 |     break
579 |   }
580 | 
581 |   if (first_iteration) {
582 |     warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run FindClustersCountsplit with a larger starting parameter.")
583 |   }
584 | 
585 | 
586 |   seurat_obj@meta.data$countsplit_clusters <- Seurat::Idents(seurat_obj_test)
587 |   Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$countsplit_clusters
588 | 
589 |   return(seurat_obj)
590 | 
591 | }
--------------------------------------------------------------------------------