├── .github ├── .gitignore └── workflows │ ├── docker-image.yml │ ├── super-linter.yml │ ├── pkgdown.yaml │ └── check-standard.yml ├── vignettes ├── .gitignore └── basic-usage.Rmd ├── LICENSE ├── _pkgdown.yml ├── Dockerfile ├── man ├── figures │ └── callback_logo.png ├── estimate_zi_poisson.Rd ├── rzipoisson.Rd ├── get_seurat_obj_with_knockoffs.Rd ├── compute_knockoff_filter.Rd ├── seurat_workflow.Rd └── FindClustersCallback.Rd ├── .Rbuildignore ├── NAMESPACE ├── .gitignore ├── LICENSE.md ├── DESCRIPTION ├── R ├── estimate_zipoisson.R ├── seurat_workflow.R └── callback.R └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2024 2 | COPYRIGHT HOLDER: callback authors 3 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://lcrawlab.github.io/callback/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/verse:4.0.5 2 | 3 | RUN R -e "install.packages('.', type = 'source', repos = NULL)" -------------------------------------------------------------------------------- /man/figures/callback_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrawlab/callback/HEAD/man/figures/callback_logo.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE\.md$ 2 | ^Dockerfile$ 3 | ^tmp$ 4 | ^_pkgdown\.yml$ 5 | ^docs$ 6 | ^pkgdown$ 7 | ^\.github$ 8 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(FindClustersCallback) 4 | export(seurat_workflow) 5 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Build the Docker image 18 | run: docker build . 
--file Dockerfile --tag callback:$(date +%s) 19 | -------------------------------------------------------------------------------- /man/estimate_zi_poisson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_zipoisson.R 3 | \name{estimate_zi_poisson} 4 | \alias{estimate_zi_poisson} 5 | \title{Maximum likelihood estimation for the zero-inflated Poisson distribution 6 | with Poisson parameter lambda and zero proportion prop.zero.} 7 | \usage{ 8 | estimate_zi_poisson(data) 9 | } 10 | \arguments{ 11 | \item{data}{The data to estimate parameters from.} 12 | } 13 | \value{ 14 | Maximum likelihood estimators of the zero-inflated Poisson 15 | distribution 16 | } 17 | \description{ 18 | Given data, computes the maximum likelihood estimators 19 | for the zero-inflated Poisson distribution. 20 | } 21 | -------------------------------------------------------------------------------- /man/rzipoisson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_zipoisson.R 3 | \name{rzipoisson} 4 | \alias{rzipoisson} 5 | \title{Random data generation for the zero-inflated Poisson distribution 6 | with Poisson parameter lambda and zero proportion prop.zero.} 7 | \usage{ 8 | rzipoisson(n, lambda, prop.zero) 9 | } 10 | \arguments{ 11 | \item{n}{The number of samples to be simulated.} 12 | 13 | \item{lambda}{The Poisson rate parameter.} 14 | 15 | \item{prop.zero}{The proportion of excess zeroes.} 16 | } 17 | \value{ 18 | Simulated data from ZIP(lambda, prop.zero). 19 | } 20 | \description{ 21 | Given the number of samples desired, a Poisson parameter, 22 | lambda, and a zero proportion, prop.zero, simulates the number of desired 23 | samples from ZIP(lambda, prop.zero). 24 | } 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .RDataTmp 8 | 9 | # User-specific files 10 | .Ruserdata 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | # R Environment Variables 40 | .Renviron 41 | 42 | # pkgdown site 43 | docs/ 44 | 45 | # translation temp files 46 | po/*~ 47 | 48 | # RStudio Connect folder 49 | rsconnect/ 50 | .Rproj.user 51 | .Rdata 52 | .DS_Store 53 | docs 54 | inst/doc 55 | -------------------------------------------------------------------------------- /.github/workflows/super-linter.yml: -------------------------------------------------------------------------------- 1 | # This workflow executes several linters on changed files based on languages used in your code base whenever 2 | # you push code or open a pull request. 3 | # 4 | # You can adjust the behavior by modifying this file.
5 | # For more information, see: 6 | # https://github.com/github/super-linter 7 | name: Lint Code Base 8 | 9 | on: 10 | push: 11 | branches: [ "main" ] 12 | pull_request: 13 | branches: [ "main" ] 14 | jobs: 15 | run-lint: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v3 20 | with: 21 | # Full git history is needed to get a proper list of changed files within `super-linter` 22 | fetch-depth: 0 23 | 24 | - name: Lint Code Base 25 | uses: github/super-linter@v4 26 | env: 27 | VALIDATE_ALL_CODEBASE: false 28 | DEFAULT_BRANCH: "main" 29 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 30 | -------------------------------------------------------------------------------- /man/get_seurat_obj_with_knockoffs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callback.R 3 | \name{get_seurat_obj_with_knockoffs} 4 | \alias{get_seurat_obj_with_knockoffs} 5 | \title{Returns a Seurat object that contains additional (fake) RNA 6 | expression counts in the form of knockoffs.} 7 | \usage{ 8 | get_seurat_obj_with_knockoffs(seurat_obj, assay = "RNA", verbose = TRUE) 9 | } 10 | \arguments{ 11 | \item{seurat_obj}{A Seurat object containing RNA expression counts.} 12 | 13 | \item{assay}{The assay to generate knockoffs from.} 14 | 15 | \item{verbose}{Whether or not to show logging.} 16 | } 17 | \value{ 18 | A Seurat object that contains the original variable features and an 19 | equal number of knockoff features. 20 | } 21 | \description{ 22 | Given a Seurat object, returns a new Seurat object whose RNA 23 | expression counts include the 24 | variable features from the original object and an equal number of knockoff 25 | features. 26 | } 27 | -------------------------------------------------------------------------------- /man/compute_knockoff_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callback.R 3 | \name{compute_knockoff_filter} 4 | \alias{compute_knockoff_filter} 5 | \title{Returns the genes selected by the knockoff filter} 6 | \usage{ 7 | compute_knockoff_filter( 8 | seurat_obj, 9 | cluster1, 10 | cluster2, 11 | q, 12 | return_all = FALSE, 13 | num_cores = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{seurat_obj}{A Seurat object} 18 | 19 | \item{cluster1}{The Idents of the first cluster of interest in seurat_obj} 20 | 21 | \item{cluster2}{The Idents of the second cluster of interest in seurat_obj} 22 | 23 | \item{q}{The desired rate to control the FDR at} 24 | 25 | \item{return_all}{Determines if the returned object will contain all genes 26 | or just the selected genes.} 27 | 28 | \item{num_cores}{The number of cores for computing marker genes in parallel.} 29 | } 30 | \value{ 31 | A list containing a data frame of the selected genes and their W statistics ("selected_features", or "all_features" when return_all is TRUE), and the knockoff filter threshold ("threshold"). 32 | } 33 | \description{ 34 | Given a Seurat object, returns the genes selected by 35 | the knockoff filter and their W statistics.
36 | } 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2024 callback authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: callback 2 | Title: A Knockoff Calibration Method to Avoid Over-Clustering in scRNAseq Data 3 | Version: 0.0.0 4 | Authors@R: 5 | person("Alan", "DenAdel", , "alan_denadel@brown.edu", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-7985-6789")) 7 | Description: callback (Calibrated Clustering via Knockoffs) is a method for protecting 8 | against over-clustering by controlling for the impact of double-dipping. The approach 9 | can be applied to any clustering algorithm (implemented are the Louvain, Leiden, 10 | K-means, and hierarchical clustering algorithms). The method provides state-of-the-art 11 | clustering performance and can rapidly analyze large-scale scRNA-seq studies and is 12 | compatible with the Seurat library. 
13 | Encoding: UTF-8 14 | Roxygen: list(markdown = TRUE) 15 | RoxygenNote: 7.3.1 16 | Imports: 17 | Matrix, 18 | Seurat (>= 5.0.1), 19 | lamW, 20 | knockoff, 21 | future, 22 | stats, 23 | cli, 24 | stringr 25 | License: MIT + file LICENSE 26 | Suggests: 27 | knitr, 28 | rmarkdown 29 | VignetteBuilder: knitr 30 | URL: https://lcrawlab.github.io/callback/ 31 | -------------------------------------------------------------------------------- /man/seurat_workflow.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/seurat_workflow.R 3 | \name{seurat_workflow} 4 | \alias{seurat_workflow} 5 | \title{Runs a typical Seurat workflow on a Seurat object (up to 6 | dimensionality reduction and clustering).} 7 | \usage{ 8 | seurat_workflow( 9 | seurat_obj, 10 | num_variable_features, 11 | resolution_param = 0.8, 12 | visualization_method = "umap", 13 | num_dims = 10, 14 | algorithm = "louvain" 15 | ) 16 | } 17 | \arguments{ 18 | \item{seurat_obj}{A Seurat object that will be analyzed.} 19 | 20 | \item{num_variable_features}{The number of variable features to use in the 21 | analysis.} 22 | 23 | \item{resolution_param}{The resolution parameter to use when clustering.} 24 | 25 | \item{visualization_method}{Either "umap" or "tsne".} 26 | 27 | \item{num_dims}{The number of principal components to use.} 28 | 29 | \item{algorithm}{The clustering algorithm to use, either "louvain" or 30 | "leiden".} 31 | } 32 | \value{ 33 | A Seurat object containing the relevant analysis results. 34 | } 35 | \description{ 36 | Given a Seurat object, returns a new Seurat that has been 37 | normalized, had variable features identified, 38 | scaled, had principal components computed, had clusters identified, and had 39 | tSNE and UMAP embeddings determined. 40 | } 41 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 
36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.5.0 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /vignettes/basic-usage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Basic Usage on PBMC3k Data" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{basic-usage} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | 9 | --- 10 | 11 | ```{r, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>" 15 | ) 16 | 17 | knitr::opts_chunk$set(eval = FALSE) 18 | 19 | ``` 20 | 21 | ```{r setup} 22 | suppressPackageStartupMessages({ 23 | library(Seurat) 24 | library(SeuratData) 25 | library(callback) 26 | }) 27 | ``` 28 | 29 | 30 | First, we use the `SeuratData` data package to first download and then load 31 | 2700 PBMCs. The loaded `SeuratObject`, `pbmc3k`, is from an old version of 32 | `Seurat`, and so we update the object to v5. 33 | 34 | ```{r load_data} 35 | set.seed(123) 36 | 37 | SeuratData::InstallData("pbmc3k") 38 | data("pbmc3k") 39 | 40 | pbmc3k <- UpdateSeuratObject(pbmc3k) 41 | ``` 42 | 43 | Now, we use `Seurat` to perform the usual preprocessing steps that are performed prior to clustering. 44 | 45 | ```{r preprocessing} 46 | pbmc3k <- NormalizeData(pbmc3k) 47 | pbmc3k <- FindVariableFeatures(pbmc3k) 48 | pbmc3k <- ScaleData(pbmc3k) 49 | pbmc3k <- RunPCA(pbmc3k) 50 | pbmc3k <- FindNeighbors(pbmc3k) 51 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10) 52 | ``` 53 | 54 | The `callback` algorithm can be run with a single function call as a drop-in 55 | replacement for the `Seurat` function `FindClusters`. 
56 | 57 | ```{r run_callback} 58 | pbmc3k <- FindClustersCallback(pbmc3k) 59 | ``` 60 | 61 | The `callback` clusters are set to the idents of the `SeuratObject` that is 62 | returned by `FindClustersCallback`. 63 | 64 | ```{r plot_umap} 65 | DimPlot(pbmc3k) 66 | ``` 67 | 68 | Cluster labels from `FindClustersCallback` are stored in the metadata in the 69 | column `pbmc3k@meta.data$callback_clusters`. 70 | 71 | ```{r plot_umap2} 72 | DimPlot(pbmc3k, group.by = "callback_clusters") 73 | ``` 74 | -------------------------------------------------------------------------------- /man/FindClustersCallback.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callback.R 3 | \name{FindClustersCallback} 4 | \alias{FindClustersCallback} 5 | \title{Runs the callback algorithm on a Seurat object to find 6 | calibrated clusters.} 7 | \usage{ 8 | FindClustersCallback( 9 | seurat_obj, 10 | resolution_start = 0.8, 11 | reduction_percentage = 0.2, 12 | num_clusters_start = 20, 13 | dims = 1:10, 14 | algorithm = "louvain", 15 | assay = "RNA", 16 | cores = 1, 17 | verbose = TRUE 18 | ) 19 | } 20 | \arguments{ 21 | \item{seurat_obj}{The Seurat object that will be analyzed.} 22 | 23 | \item{resolution_start}{The starting resolution to be used for the 24 | clustering algorithm (Louvain and Leiden algorithms).} 25 | 26 | \item{reduction_percentage}{The amount that the starting parameter will be 27 | reduced by after each iteration (between 0 and 1).} 28 | 29 | \item{num_clusters_start}{The starting number of clusters to be used for the 30 | clustering algorithm (K-means and Hierarchical clustering algorithms).} 31 | 32 | \item{dims}{The dimensions to use as input features (i.e. 1:10).} 33 | 34 | \item{algorithm}{The clustering algorithm to be used.} 35 | 36 | \item{assay}{The assay to generate knockoffs from.} 37 | 38 | \item{cores}{The number of cores to compute marker genes in parallel.} 39 | 40 | \item{verbose}{Whether or not to show all logging.} 41 | } 42 | \value{ 43 | Returns a Seurat object where the idents have been updated with the 44 | clusters determined via the callback algorithm. 45 | Latest clustering results will be stored in the object metadata under 46 | 'callback_clusters'. Note that 'callback_clusters' will be overwritten every 47 | time FindClustersCallback is run. 48 | } 49 | \description{ 50 | Given a Seurat object, returns a new Seurat object whose cluster assignments 51 | have been calibrated with the callback algorithm: knockoff genes are added, 52 | the augmented data are preprocessed and clustered, and the clustering is repeated 53 | at progressively coarser settings until every pair of clusters shows significant differential expression. 54 | } 55 | -------------------------------------------------------------------------------- /R/estimate_zipoisson.R: -------------------------------------------------------------------------------- 1 | 2 | # https://en.wikipedia.org/wiki/Zero-inflated_model#Estimators_of_ZIP_parameters 3 | # https://math.stackexchange.com/questions/2761563/maximum-likelihood-estimation-for-zero-inflated-poisson-distribution 4 | # https://ieeexplore.ieee.org/document/9032203 5 | 6 | #' @title Maximum likelihood estimation for the zero-inflated Poisson distribution 7 | #' with Poisson parameter lambda and zero proportion prop.zero. 8 | #' 9 | #' @description Given data, computes the maximum likelihood estimators 10 | #' for the zero-inflated Poisson distribution. 11 | #' 12 | #' @param data The data to estimate parameters from.
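#' @details A sketch of the closed-form estimator implemented below (following
#' the references linked at the top of this file): let r0 be the observed
#' proportion of zeros and x.bar the sample mean. Setting gamma = x.bar / (1 - r0),
#' the estimates are lambda.hat = lambertW0(-gamma * exp(-gamma)) + gamma and
#' pi.hat = 1 - x.bar / lambda.hat, where lambertW0 is the principal branch of
#' the Lambert W function (lamW::lambertW0).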
13 | #' @returns Maximum likelihood estimators of the zero-inflated Poisson 14 | #' distribution 15 | #' @name estimate_zi_poisson 16 | estimate_zi_poisson <- function(data) { 17 | num.zeros <- sum(data == 0) 18 | r0 <- 1 / length(data) * num.zeros 19 | 20 | x.bar <- mean(data) 21 | 22 | gamma <- x.bar / (1 - r0) 23 | 24 | lambda.hat <- lamW::lambertW0(-gamma * exp(-gamma)) + gamma 25 | 26 | pi.hat <- 1 - x.bar / lambda.hat 27 | 28 | 29 | return.list <- list("lambda.hat" = lambda.hat, "pi.hat" = pi.hat) 30 | return(return.list) 31 | } 32 | 33 | 34 | #' @title Random data generation for the zero-inflated Poisson distribution 35 | #' with Poisson parameter lambda and zero proportion prop.zero. 36 | #' 37 | #' @description Given the number of samples desired, a Poisson parameter, 38 | #' lambda, and a zero proportion, prop.zero, simulates the number of desired 39 | #' samples from ZIP(lambda, prop.zero). 40 | #' 41 | #' @param n The number of samples to be simulated. 42 | #' @param lambda The Poisson rate parameter. 43 | #' @param prop.zero The proportion of excess zeroes. 44 | #' @returns Simulated data from ZIP(lambda, prop.zero). 45 | #' @name rzipoisson 46 | rzipoisson <- function(n, lambda, prop.zero) { 47 | data <- c() 48 | 49 | 50 | for (i in 1:n) { 51 | if (stats::runif(1) < prop.zero) { 52 | data[i] <- 0 53 | } 54 | else { 55 | data[i] <- stats::rpois(1, lambda) 56 | } 57 | } 58 | return(data) 59 | } 60 | 61 | -------------------------------------------------------------------------------- /R/seurat_workflow.R: -------------------------------------------------------------------------------- 1 | #' @title Runs a typical Seurat workflow on a Seurat object (up to 2 | #' dimensionality reduction and clustering). 3 | #' 4 | #' @description Given a Seurat object, returns a new Seurat object that has been 5 | #' normalized, had variable features identified, 6 | #' scaled, had principal components computed, had clusters identified, and had 7 | #' tSNE and UMAP embeddings determined. 8 | #' 9 | #' @param seurat_obj A Seurat object that will be analyzed. 10 | #' @param num_variable_features The number of variable features to use in the 11 | #' analysis. 12 | #' @param resolution_param The resolution parameter to use when clustering. 13 | #' @param visualization_method Either "umap", "tsne", or "both". 14 | #' @param num_dims The number of principal components to use. 15 | #' @param algorithm The clustering algorithm to use, either "louvain" or 16 | #' "leiden". 17 | #' @returns A Seurat object containing the relevant analysis results.
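#' @examples
#' \dontrun{
#' # Hypothetical usage sketch: `sobj` stands in for any Seurat object built
#' # from raw counts; the parameter values shown are illustrative, not prescriptive.
#' sobj <- seurat_workflow(sobj,
#'                         num_variable_features = 2000,
#'                         resolution_param = 0.8,
#'                         visualization_method = "umap")
#' }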
18 | #' @export 19 | #' @name seurat_workflow 20 | seurat_workflow <- function(seurat_obj, 21 | num_variable_features, 22 | resolution_param = 0.8, 23 | visualization_method = "umap", 24 | num_dims = 10, 25 | algorithm = "louvain") { 26 | seurat_obj <- Seurat::NormalizeData(seurat_obj) 27 | 28 | seurat_obj <- Seurat::FindVariableFeatures(seurat_obj, 29 | selection.method = "vst", 30 | nfeatures = num_variable_features) 31 | 32 | all_genes <- rownames(seurat_obj) 33 | 34 | #seurat_obj <- Seurat::ScaleData(seurat_obj, features = all_genes) 35 | seurat_obj <- Seurat::ScaleData(seurat_obj) 36 | 37 | seurat_obj <- Seurat::RunPCA(seurat_obj, 38 | features = Seurat::VariableFeatures(object = seurat_obj)) 39 | 40 | # todo check if i should use all dims for knockoffs 41 | seurat_obj <- Seurat::FindNeighbors(seurat_obj, dims = 1:num_dims) 42 | 43 | if (algorithm == "louvain") { 44 | seurat_obj <- Seurat::FindClusters(seurat_obj, 45 | resolution = resolution_param) 46 | } 47 | 48 | if (algorithm == "leiden") { 49 | seurat_obj <- Seurat::FindClusters(seurat_obj, 50 | resolution = resolution_param, 51 | algorithm = 4, 52 | method = "igraph") 53 | } 54 | 55 | if (visualization_method == "umap") { 56 | seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims) 57 | } 58 | if (visualization_method == "tsne") { 59 | seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims) 60 | } 61 | 62 | if (visualization_method == "both") { 63 | seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims) 64 | seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims) 65 | } 66 | 67 | return(seurat_obj) 68 | } 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # THIS REPOSITORY HAS BEEN ARCHIVED AND WILL NOT BE UPDATED. THIS METHOD HAS BEEN RENAMED recall (Calibrated Clustering with Artificial Variables) AND IS NOW LOCATED IN [THIS REPOSITORY](https://github.com/lcrawlab/recall). 2 | 3 | # callback (Calibrated Clustering via Knockoffs) 4 | 5 | [![R CMD check](https://github.com/lcrawlab/callback/actions/workflows/check-standard.yml/badge.svg)](https://github.com/lcrawlab/callback/actions/workflows/check-standard.yml) 6 | [![Docker Image CI](https://github.com/lcrawlab/callback/actions/workflows/docker-image.yml/badge.svg)](https://github.com/lcrawlab/callback/actions/workflows/docker-image.yml) 7 | 8 | ## Introduction 9 | 10 | Standard single-cell RNA-sequencing (scRNA-seq) pipelines nearly always include unsupervised clustering as a key step in identifying biologically distinct cell types. A follow-up step in these pipelines is to test for differential expression between the identified clusters. When algorithms over-cluster, downstream analyses will produce inflated P-values resulting in increased false discoveries. 11 | Here, we present `callback` (Calibrated Clustering via Knockoffs): a new method for protecting against over-clustering by controlling for the impact of double-dipping. 12 | Importantly, our approach can be applied to any clustering algorithm (implemented here are the Louvain, Leiden, K-means, and hierarchical clustering algorithms). 13 | `callback` provides state-of-the-art clustering performance and can rapidly analyze large-scale scRNA-seq studies, even on a personal laptop. 14 | 15 | ## Installation 16 | 17 | You can install the latest development version by using the [devtools](https://CRAN.R-project.org/package=devtools) library.
To install this package with devtools, use this command: 18 | 19 | ```r 20 | devtools::install_github("lcrawlab/callback") 21 | ``` 22 | 23 | Although it is not explicitly a dependency, making sure you have `presto` installed will make `callback` much faster. 24 | 25 | ```r 26 | devtools::install_github("immunogenomics/presto") 27 | ``` 28 | 29 | 30 | ## Tutorial 31 | 32 | ```r 33 | library(Seurat) 34 | library(SeuratData) 35 | 36 | library(callback) 37 | 38 | set.seed(123) 39 | 40 | # load pbmc3k dataset 41 | SeuratData::InstallData("pbmc3k") 42 | data("pbmc3k") 43 | 44 | pbmc3k <- UpdateSeuratObject(pbmc3k) 45 | 46 | pbmc3k <- NormalizeData(pbmc3k) 47 | pbmc3k <- FindVariableFeatures(pbmc3k) 48 | pbmc3k <- ScaleData(pbmc3k) 49 | pbmc3k <- RunPCA(pbmc3k) 50 | pbmc3k <- FindNeighbors(pbmc3k) 51 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10) 52 | 53 | pbmc_default <- FindClusters(pbmc3k) 54 | pbmc_callback <- FindClustersCallback(pbmc3k) 55 | 56 | DimPlot(pbmc_default) + DimPlot(pbmc_callback) 57 | ``` 58 | ## Overview of the Method 59 | 60 | The `callback` algorithm consists of three simple steps: 61 | 62 | 1. First, we generate synthetic null variables, formally called knockoff features, where we augment the single-cell data being analyzed with "fake" genes that are known not to contribute to any unique cell type. 63 | 2. Second, we perform both preprocessing and clustering on this augmented dataset. 64 | 3. Third, we calibrate the number of inferred clusters by using a hypothesis testing strategy with a data-dependent threshold to determine if there is a statistically significant difference between groups. If any pair of groups does not have statistically significant differences then re-clustering occurs. 65 | 66 | The synthetic knockoff genes act as negative control variables; they go through the same analytic steps as the real data and are presented with the same opportunity to be identified as marker genes. 67 | The `callback` algorithm uses the guiding principle that well-calibrated clusters (i.e., those representing real groups) should have significantly differentially expressed genes after correcting for multiple hypothesis tests, while over-clustered groups will not. 68 | We use this rule to iteratively re-cluster cells until the inferred clusters are well-calibrated and the observed differences in expression between groups are not due to the effects of double-dipping. 69 | 70 | ## Relevant Citations 71 | `callback` is currently on the bioRxiv, [here](https://www.biorxiv.org/content/10.1101/2024.03.08.584180v1). 72 | 73 | A. DenAdel, M. Ramseier, A. Navia, A. Shalek, S. Raghavan, P. Winter, A. Amini, and L. Crawford. A knockoff calibration method to avoid over-clustering in single-cell RNA-sequencing. _bioRxiv_. 74 | 75 | ## Questions and Feedback 76 | For questions or concerns with `callback`, please contact 77 | [Alan DenAdel](mailto:alan_denadel@brown.edu) or [Lorin Crawford](lcrawford@microsoft.com). Any feedback on the software, manuscript, and tutorials is appreciated. 78 | -------------------------------------------------------------------------------- /R/callback.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Returns a Seurat object that contains additional (fake) RNA 3 | #' expression counts in the form of knockoffs. 
4 | #' 5 | #' @description Given a Seurat object, returns a new Seurat object whose RNA 6 | #' expression counts include the 7 | #' variable features from the original object and an equal number of knockoff 8 | #' features. 9 | #' 10 | #' @param seurat_obj A Seurat object containing RNA expression counts. 11 | #' @param assay The assay to generate knockoffs from. 12 | #' @param verbose Whether or not to show logging. 13 | #' @returns A Seurat object that contains the original variable features and an 14 | #' equal number of knockoff features. 15 | #' @name get_seurat_obj_with_knockoffs 16 | get_seurat_obj_with_knockoffs <- function(seurat_obj, assay = "RNA", verbose = TRUE) { 17 | 18 | if (verbose) { 19 | message("Pulling data from Seurat object") 20 | } 21 | 22 | var_features <- Seurat::VariableFeatures(seurat_obj) 23 | seurat_obj_data <- as.data.frame(t(as.matrix(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts")[var_features, ]))) 24 | 25 | if (verbose) { 26 | message("Computing MLE for zero-inflated Poisson for each gene") 27 | } 28 | 29 | ml_estimates <- lapply(seurat_obj_data, estimate_zi_poisson) 30 | 31 | if (verbose) { 32 | message("Computing knockoff features") 33 | } 34 | 35 | ko <- as.data.frame(lapply(ml_estimates, 36 | function(x) { 37 | rzipoisson(nrow(seurat_obj_data), 38 | x$lambda.hat, 39 | x$pi.hat) 40 | })) 41 | 42 | 43 | num_variable_features <- length(var_features) 44 | colnames(ko) <- paste0(rep("knockoff", num_variable_features), 1:num_variable_features) 45 | combined_data <- cbind(seurat_obj_data, ko) 46 | 47 | # sparsify augmented data matrix and transpose for use in Seurat 48 | combined_data <- Matrix::Matrix(t(combined_data), sparse = TRUE) 49 | 50 | new_project_name <- paste0(seurat_obj@project.name, "_with_knockoffs") 51 | new_seurat_obj <- Seurat::CreateSeuratObject(counts = combined_data, project = new_project_name) 52 | 53 | return(new_seurat_obj) 54 | } 55 | 56 | 57 | 58 | 59 | 60 | 61 | #' @title Returns the genes selected by the knockoff filter 62 | #' 63 | #' @description Given a Seurat object, returns the genes selected by 64 | #' the knockoff filter and their W statistics. 65 | #' 66 | #' @param seurat_obj A Seurat object 67 | #' @param cluster1 The Idents of the first cluster of interest in seurat_obj 68 | #' @param cluster2 The Idents of the second cluster of interest in seurat_obj 69 | #' @param q The desired rate to control the FDR at 70 | #' @param return_all Determines if the returned object will contain all genes 71 | #' or just the selected genes. 72 | #' @param num_cores The number of cores for computing marker genes in parallel.
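#' @details A brief sketch of the filter as implemented below: for each
#' original gene, Seurat::FindMarkers p-values are computed for the gene and
#' its knockoff counterpart and converted to the statistic
#' W = -log10(p_original) - (-log10(p_knockoff)). Genes are then selected when
#' W meets the data-dependent threshold returned by
#' knockoff::knockoff.threshold(W, fdr = q, offset = 1).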
73 | #' @returns A list containing a data frame of the selected genes and their W statistics ("selected_features", or "all_features" when return_all is TRUE), and the knockoff filter threshold ("threshold"). 74 | #' @name compute_knockoff_filter 75 | compute_knockoff_filter <- function(seurat_obj, 76 | cluster1, 77 | cluster2, 78 | q, 79 | return_all = FALSE, 80 | num_cores = 1) { 81 | options(future.globals.maxSize = 8000 * 1024^2) 82 | # todo note what this is for, figure this out as a parameter or programmatically 83 | future::plan("multicore", workers = as.numeric(num_cores)) 84 | 85 | markers <- Seurat::FindMarkers(seurat_obj, 86 | ident.1 = cluster1, 87 | ident.2 = cluster2, 88 | logfc.threshold = 0, 89 | min.pct = 0) 90 | 91 | 92 | # FindMarkers orders by p-value, so we can't rely on position to know which genes are which 93 | knockoff_indices <- grepl("^knockoff", rownames(markers)) 94 | original_indices <- !knockoff_indices 95 | 96 | # subset the markers data.frame into originals and knockoffs 97 | knockoff_markers <- markers[knockoff_indices, ] 98 | original_markers <- markers[original_indices, ] 99 | 100 | all_genes <- rownames(seurat_obj) 101 | 102 | # get indices of knockoffs and originals from seurat_obj, should be [FALSE, ..., FALSE, TRUE, ..., TRUE] 103 | knockoff_indices_sorted <- grepl("^knockoff", all_genes) 104 | original_indices_sorted <- !knockoff_indices_sorted 105 | 106 | knockoff_names_sorted <- all_genes[knockoff_indices_sorted] 107 | original_names_sorted <- all_genes[original_indices_sorted] 108 | 109 | # sort markers data.frames by their original orderings 110 | knockoff_markers_sorted <- knockoff_markers[knockoff_names_sorted, ] 111 | original_markers_sorted <- original_markers[original_names_sorted, ] 112 | 113 | original_p_values <- original_markers_sorted$p_val 114 | knockoff_p_values <- knockoff_markers_sorted$p_val 115 | 116 | log_original_p_values <- -log10(original_p_values) 117 | log_knockoff_p_values <- -log10(knockoff_p_values) 118 | 119 | W <- log_original_p_values - log_knockoff_p_values 120 | 121 | thres <- knockoff::knockoff.threshold(W, fdr = q, offset = 1) 122 | 123 | 124 | if (return_all) { 125 | all_features <- as.data.frame(list("gene" = original_names_sorted, "W" = W)) 126 | 127 | ret <- list("all_features" = all_features, "threshold" = thres) 128 | 129 | return(ret) 130 | } 131 | selected_indices <- which(W >= thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p) 132 | #selected_indices <- which(W > thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p) 133 | 134 | selected_genes <- original_names_sorted[selected_indices] 135 | selected_Ws <- W[selected_indices] 136 | 137 | selected_features <- as.data.frame(list("selected_gene" = selected_genes, "W" = selected_Ws)) 138 | 139 | selected_features <- selected_features[order(selected_features$W, decreasing = TRUE), ] 140 | 141 | ret <- list("selected_features" = selected_features, "threshold" = thres) 142 | 143 | return(ret) 144 | } 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | #' @title Runs the callback algorithm on a Seurat object to find 154 | #' calibrated clusters. 155 | #' 156 | #' @description Given a Seurat object, returns a new Seurat object whose 157 | #' cluster assignments have been calibrated with the callback algorithm: 158 | #' knockoff genes are added, the augmented data are preprocessed and clustered, 159 | #' and the clustering is repeated at progressively coarser settings until every pair of clusters shows significant differential expression. 160 | #' 161 | #' @param seurat_obj The Seurat object that will be analyzed.
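#' @examples
#' \dontrun{
#' # Hypothetical usage sketch (mirrors the README tutorial): `pbmc3k` is
#' # assumed to have already been normalized, scaled, and run through
#' # FindVariableFeatures, RunPCA, and FindNeighbors.
#' pbmc3k <- FindClustersCallback(pbmc3k, dims = 1:10, cores = 1)
#' }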
162 | #' @param resolution_start The starting resolution to be used for the 163 | #' clustering algorithm (Louvain and Leiden algorithms). 164 | #' @param num_clusters_start The starting number of clusters to be used for the 165 | #' clustering algorithm (K-means and Hierarchical clustering algorithms). 166 | #' @param reduction_percentage The amount that the starting parameter will be 167 | #' reduced by after each iteration (between 0 and 1). 168 | #' @param dims The dimensions to use as input features (i.e. 1:10). 169 | #' @param algorithm The clustering algorithm to be used. 170 | #' @param assay The assay to generate knockoffs from. 171 | #' @param cores The number of cores to compute marker genes in parallel. 172 | #' @param verbose Whether or not to show all logging. 173 | #' @returns Returns a Seurat object where the idents have been updated with the 174 | #' clusters determined via the callback algorithm. 175 | #' Latest clustering results will be stored in the object metadata under 176 | #' 'callback_clusters'. Note that 'callback_clusters' will be overwritten every 177 | #' time FindClustersCallback is run. 178 | #' @name FindClustersCallback 179 | #' @export 180 | FindClustersCallback <- function(seurat_obj, 181 | resolution_start = 0.8, 182 | reduction_percentage = 0.2, 183 | num_clusters_start = 20, 184 | dims = 1:10, 185 | algorithm = "louvain", # todo implement all algos 186 | assay = "RNA", 187 | cores = 1, 188 | verbose = TRUE) { 189 | 190 | # todo check function arguments for validity 191 | 192 | knockoff_seurat_obj <- get_seurat_obj_with_knockoffs(seurat_obj, assay = assay, verbose = verbose) 193 | 194 | num_variable_features <- 2 * length(Seurat::VariableFeatures(seurat_obj)) 195 | 196 | 197 | # Pre-process data 198 | options(future.globals.maxSize = 8000 * 1024^2) 199 | # todo log number of cores being used 200 | future::plan("multicore", workers = as.numeric(cores)) 201 | #options(future.globals.maxSize = 8000 * 1024^2) 202 | 203 | if (verbose) { 204 | message(paste("Number of cores:", cores)) 205 | } 206 | 207 | #plan("multicore", workers = as.numeric(cores)) 208 | 209 | knockoff_seurat_obj <- Seurat::NormalizeData(knockoff_seurat_obj, 210 | verbose = FALSE) 211 | 212 | knockoff_seurat_obj <- Seurat::FindVariableFeatures(knockoff_seurat_obj, 213 | selection.method = "vst", 214 | nfeatures = num_variable_features, 215 | verbose = FALSE) 216 | 217 | knockoff_seurat_obj <- Seurat::ScaleData(knockoff_seurat_obj, verbose = FALSE) 218 | knockoff_seurat_obj <- Seurat::RunPCA(knockoff_seurat_obj, 219 | features = Seurat::VariableFeatures(object = knockoff_seurat_obj), 220 | verbose = FALSE) 221 | 222 | # todo check if i should use all dims for knockoffs 223 | knockoff_seurat_obj <- Seurat::FindNeighbors(knockoff_seurat_obj, 224 | dims = dims, 225 | verbose = FALSE) 226 | 227 | resolution_param <- resolution_start 228 | 229 | 230 | first_iteration <- TRUE 231 | 232 | while (TRUE) { 233 | if (verbose) { 234 | message("####################################################################") 235 | message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm")) 236 | message(paste("Resolution param:", resolution_param)) 237 | } 238 | 239 | if (algorithm == "louvain") { 240 | knockoff_seurat_obj <- Seurat::FindClusters(knockoff_seurat_obj, 241 | resolution = resolution_param, 242 | verbose = FALSE) 243 | } 244 | 245 | if (algorithm == "leiden") { 246 | #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job
hanging 247 | knockoff_seurat_obj <- Seurat::FindClusters(knockoff_seurat_obj, 248 | resolution = resolution_param, 249 | algorithm = 4, 250 | method = "igraph", 251 | verbose = FALSE) 252 | } 253 | 254 | # Reduce resolution for next iteration of the loop 255 | resolution_param <- (1 - reduction_percentage) * resolution_param 256 | 257 | k <- length(levels(Seurat::Idents(knockoff_seurat_obj))) 258 | #knock_idents <- 0:(k-1) 259 | 260 | if (verbose) { 261 | message("Num clusters:") 262 | message(k) 263 | } 264 | 265 | knock_idents <- levels(Seurat::Idents(knockoff_seurat_obj)) 266 | 267 | num_selected_matrix <- matrix(nrow = k, ncol = k) 268 | 269 | found_no_sign_diff <- FALSE 270 | 271 | num_clusters <- length(knock_idents) 272 | 273 | 274 | if (verbose) { 275 | progress_bar_length <- num_clusters * (num_clusters - 1) / 2 276 | cli::cli_progress_bar("Processing cluster pairs:", 277 | total = progress_bar_length, 278 | clear = FALSE) 279 | } 280 | 281 | m <- 0 282 | for (i in 1:num_clusters) { 283 | for (j in 1:num_clusters) { 284 | if (j >= i) { 285 | next 286 | } 287 | 288 | m <- m + 1 289 | 290 | if (verbose) { 291 | cli::cli_progress_update() 292 | } 293 | 294 | markers_selected <- compute_knockoff_filter(seurat_obj = knockoff_seurat_obj, 295 | cluster1 = knock_idents[i], 296 | cluster2 = knock_idents[j], 297 | q = 0.05, 298 | num_cores = cores) 299 | 300 | num_selected <- nrow(markers_selected$selected_features) 301 | 302 | if (num_selected == 0) { 303 | found_no_sign_diff <- TRUE 304 | break 305 | } 306 | 307 | num_selected_matrix[i, j] <- num_selected 308 | num_selected_matrix[j, i] <- num_selected 309 | 310 | } 311 | if (found_no_sign_diff) { 312 | first_iteration <- FALSE # record the re-clustering here (outside the verbose check) so the warning below stays accurate when verbose = FALSE 313 | if (verbose) { 314 | cli::cli_progress_done() 315 | message("Found clusters with no significant differences.") 316 | message("Progressing to next clustering iteration.") 317 | } 318 | break 319 | } 320 | } 321 | 322 | if (found_no_sign_diff) { 323 | next 324 | } 325 | break 326 | } 327 | 328 | if (first_iteration) { 329 | warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run callback with a larger starting parameter.") 330 | } 331 | 332 | seurat_obj@meta.data$callback_clusters <- Seurat::Idents(knockoff_seurat_obj) 333 | Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$callback_clusters 334 | 335 | return(seurat_obj) 336 | } 337 | --------------------------------------------------------------------------------