├── .github ├── .gitignore └── workflows │ ├── docker-image.yml │ ├── super-linter.yml │ ├── pkgdown.yaml │ └── check-standard.yml ├── vignettes ├── .gitignore └── basic-usage.Rmd ├── LICENSE ├── _pkgdown.yml ├── Dockerfile ├── man ├── figures │ └── callback_logo.png ├── estimate_zi_poisson.Rd ├── rzipoisson.Rd ├── get_seurat_obj_with_knockoffs.Rd ├── compute_knockoff_filter.Rd ├── seurat_workflow.Rd └── FindClustersCallback.Rd ├── .Rbuildignore ├── NAMESPACE ├── .gitignore ├── LICENSE.md ├── DESCRIPTION ├── R ├── estimate_zipoisson.R ├── seurat_workflow.R └── callback.R └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2024 2 | COPYRIGHT HOLDER: callback authors 3 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://lcrawlab.github.io/callback/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/verse:4.0.5 2 | 3 | RUN R -e "install.packages('.', type = 'source', repos = NULL)" -------------------------------------------------------------------------------- /man/figures/callback_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcrawlab/callback/HEAD/man/figures/callback_logo.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^LICENSE\.md$ 2 | ^Dockerfile$ 3 | ^tmp$ 4 | ^_pkgdown\.yml$ 5 | ^docs$ 6 | ^pkgdown$ 7 | ^\.github$ 8 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(FindClustersCallback) 4 | export(seurat_workflow) 5 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Build the Docker image 18 | run: docker build . 
--file Dockerfile --tag callback:$(date +%s) 19 | -------------------------------------------------------------------------------- /man/estimate_zi_poisson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_zipoisson.R 3 | \name{estimate_zi_poisson} 4 | \alias{estimate_zi_poisson} 5 | \title{Maximum likelihood estimation for the zero-inflated Poisson distribution 6 | with Poisson parameter lambda and zero proportion prop.zero.} 7 | \usage{ 8 | estimate_zi_poisson(data) 9 | } 10 | \arguments{ 11 | \item{data}{The data to estimate parameters from.} 12 | } 13 | \value{ 14 | Maximum likelihood estimators of the zero-inflated Poisson 15 | distribution 16 | } 17 | \description{ 18 | Given data, computes the maximum likelihood estimators 19 | for the zero-inflated Poisson distribution. 20 | } 21 | -------------------------------------------------------------------------------- /man/rzipoisson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_zipoisson.R 3 | \name{rzipoisson} 4 | \alias{rzipoisson} 5 | \title{Random data generation for the zero-inflated Poisson distribution 6 | with Poisson parameter lambda and zero proportion prop.zero.} 7 | \usage{ 8 | rzipoisson(n, lambda, prop.zero) 9 | } 10 | \arguments{ 11 | \item{n}{The number of samples to be simulated.} 12 | 13 | \item{lambda}{The Poisson rate parameter.} 14 | 15 | \item{prop.zero}{The proportion of excess zeroes.} 16 | } 17 | \value{ 18 | Simulated data from ZIP(lambda, prop.zero). 19 | } 20 | \description{ 21 | Given the number of samples desired, a Poisson parameter, 22 | lambda, and a zero proportion, prop.zero, simulates the number of desired 23 | samples from ZIP(lambda, prop.zero). 24 | } 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | .RDataTmp 8 | 9 | # User-specific files 10 | .Ruserdata 11 | 12 | # Example code in package build process 13 | *-Ex.R 14 | 15 | # Output files from R CMD build 16 | /*.tar.gz 17 | 18 | # Output files from R CMD check 19 | /*.Rcheck/ 20 | 21 | # RStudio files 22 | .Rproj.user/ 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | 39 | # R Environment Variables 40 | .Renviron 41 | 42 | # pkgdown site 43 | docs/ 44 | 45 | # translation temp files 46 | po/*~ 47 | 48 | # RStudio Connect folder 49 | rsconnect/ 50 | .Rproj.user 51 | .Rdata 52 | .DS_Store 53 | docs 54 | inst/doc 55 | -------------------------------------------------------------------------------- /.github/workflows/super-linter.yml: -------------------------------------------------------------------------------- 1 | # This workflow executes several linters on changed files based on languages used in your code base whenever 2 | # you push code or open a pull request. 3 | # 4 | # You can adjust the behavior by modifying this file.
5 | # For more information, see: 6 | # https://github.com/github/super-linter 7 | name: Lint Code Base 8 | 9 | on: 10 | push: 11 | branches: [ "main" ] 12 | pull_request: 13 | branches: [ "main" ] 14 | jobs: 15 | run-lint: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v3 20 | with: 21 | # Full git history is needed to get a proper list of changed files within `super-linter` 22 | fetch-depth: 0 23 | 24 | - name: Lint Code Base 25 | uses: github/super-linter@v4 26 | env: 27 | VALIDATE_ALL_CODEBASE: false 28 | DEFAULT_BRANCH: "main" 29 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 30 | -------------------------------------------------------------------------------- /man/get_seurat_obj_with_knockoffs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callback.R 3 | \name{get_seurat_obj_with_knockoffs} 4 | \alias{get_seurat_obj_with_knockoffs} 5 | \title{Returns a Seurat object that contains additional (fake) RNA 6 | expression counts in the form of knockoffs.} 7 | \usage{ 8 | get_seurat_obj_with_knockoffs(seurat_obj, assay = "RNA", verbose = TRUE) 9 | } 10 | \arguments{ 11 | \item{seurat_obj}{A Seurat object containing RNA expression counts.} 12 | 13 | \item{assay}{The assay to generate knockoffs from.} 14 | 15 | \item{verbose}{Whether or not to show logging.} 16 | } 17 | \value{ 18 | A Seurat object that contains the original variable features and an 19 | equal number of knockoff features. 20 | } 21 | \description{ 22 | Given a Seurat object, returns a new Seurat object whose RNA 23 | expression counts include the 24 | variable features from the original object and an equal number of knockoff 25 | features. 26 | } 27 | -------------------------------------------------------------------------------- /man/compute_knockoff_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callback.R 3 | \name{compute_knockoff_filter} 4 | \alias{compute_knockoff_filter} 5 | \title{Returns the genes selected by the knockoff filter} 6 | \usage{ 7 | compute_knockoff_filter( 8 | seurat_obj, 9 | cluster1, 10 | cluster2, 11 | q, 12 | return_all = FALSE, 13 | num_cores = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{seurat_obj}{A Seurat object} 18 | 19 | \item{cluster1}{The Idents of the first cluster of interest in seurat_obj} 20 | 21 | \item{cluster2}{The Idents of the second cluster of interest in seurat_obj} 22 | 23 | \item{q}{The desired rate to control the FDR at} 24 | 25 | \item{return_all}{Determines if the returned object will contain all genes 26 | or just the selected genes.} 27 | 28 | \item{num_cores}{The number of cores for computing marker genes in parallel.} 29 | } 30 | \value{ 31 | A list containing a data frame of the selected genes and their W statistics ("selected_features", or "all_features" when return_all is TRUE), and the knockoff filter threshold ("threshold"). 32 | } 33 | \description{ 34 | Given a Seurat object, returns the genes selected by 35 | the knockoff filter and their W statistics.
36 | } 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2024 callback authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: callback 2 | Title: A Knockoff Calibration Method to Avoid Over-Clustering in scRNAseq Data 3 | Version: 0.0.0 4 | Authors@R: 5 | person("Alan", "DenAdel", , "alan_denadel@brown.edu", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-7985-6789")) 7 | Description: callback (Calibrated Clustering via Knockoffs) is a method for protecting 8 | against over-clustering by controlling for the impact of double-dipping. The approach 9 | can be applied to any clustering algorithm (implemented are the Louvain, Leiden, 10 | K-means, and hierarchical clustering algorithms). The method provides state-of-the-art 11 | clustering performance and can rapidly analyze large-scale scRNA-seq studies and is 12 | compatible with the Seurat library. 
13 | Encoding: UTF-8 14 | Roxygen: list(markdown = TRUE) 15 | RoxygenNote: 7.3.1 16 | Imports: 17 | Matrix, 18 | Seurat (>= 5.0.1), 19 | lamW, 20 | knockoff, 21 | future, 22 | stats, 23 | cli, 24 | stringr 25 | License: MIT + file LICENSE 26 | Suggests: 27 | knitr, 28 | rmarkdown 29 | VignetteBuilder: knitr 30 | URL: https://lcrawlab.github.io/callback/ 31 | -------------------------------------------------------------------------------- /man/seurat_workflow.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/seurat_workflow.R 3 | \name{seurat_workflow} 4 | \alias{seurat_workflow} 5 | \title{Runs a typical Seurat workflow on a Seurat object (up to 6 | dimensionality reduction and clustering).} 7 | \usage{ 8 | seurat_workflow( 9 | seurat_obj, 10 | num_variable_features, 11 | resolution_param = 0.8, 12 | visualization_method = "umap", 13 | num_dims = 10, 14 | algorithm = "louvain" 15 | ) 16 | } 17 | \arguments{ 18 | \item{seurat_obj}{A Seurat object that will be analyzed.} 19 | 20 | \item{num_variable_features}{The number of variable features to use in the 21 | analysis.} 22 | 23 | \item{resolution_param}{The resolution parameter to use when clustering.} 24 | 25 | \item{visualization_method}{Either "umap" or "tsne".} 26 | 27 | \item{num_dims}{The number of principal components to use.} 28 | 29 | \item{algorithm}{The clustering algorithm to use, either "louvain" or 30 | "leiden".} 31 | } 32 | \value{ 33 | A Seurat object containing the relevant analysis results. 34 | } 35 | \description{ 36 | Given a Seurat object, returns a new Seurat that has been 37 | normalized, had variable features identified, 38 | scaled, had principal components computed, had clusters identified, and had 39 | tSNE and UMAP embeddings determined. 40 | } 41 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 
36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.5.0 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /vignettes/basic-usage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Basic Usage on PBMC3k Data" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{basic-usage} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | 9 | --- 10 | 11 | ```{r, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>" 15 | ) 16 | 17 | knitr::opts_chunk$set(eval = FALSE) 18 | 19 | ``` 20 | 21 | ```{r setup} 22 | suppressPackageStartupMessages({ 23 | library(Seurat) 24 | library(SeuratData) 25 | library(callback) 26 | }) 27 | ``` 28 | 29 | 30 | First, we use the `SeuratData` data package to first download and then load 31 | 2700 PBMCs. The loaded `SeuratObject`, `pbmc3k`, is from an old version of 32 | `Seurat`, and so we update the object to v5. 33 | 34 | ```{r load_data} 35 | set.seed(123) 36 | 37 | SeuratData::InstallData("pbmc3k") 38 | data("pbmc3k") 39 | 40 | pbmc3k <- UpdateSeuratObject(pbmc3k) 41 | ``` 42 | 43 | Now, we use `Seurat` to perform the usual preprocessing steps that are performed prior to clustering. 44 | 45 | ```{r preprocessing} 46 | pbmc3k <- NormalizeData(pbmc3k) 47 | pbmc3k <- FindVariableFeatures(pbmc3k) 48 | pbmc3k <- ScaleData(pbmc3k) 49 | pbmc3k <- RunPCA(pbmc3k) 50 | pbmc3k <- FindNeighbors(pbmc3k) 51 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10) 52 | ``` 53 | 54 | The `callback` algorithm can be run with a single function call as a drop-in 55 | replacement for the `Seurat` function `FindClusters`. 
56 | 57 | ```{r run_callback} 58 | pbmc3k <- FindClustersCallback(pbmc3k) 59 | ``` 60 | 61 | The `callback` clusters are set to the idents of the `SeuratObject` that is 62 | returned by `FindClustersCallback`. 63 | 64 | ```{r plot_umap} 65 | DimPlot(pbmc3k) 66 | ``` 67 | 68 | Cluster labels from `FindClustersCallback` are stored in the metadata in the 69 | column `pbmc3k@meta.data$callback_clusters`. 70 | 71 | ```{r plot_umap2} 72 | DimPlot(pbmc3k, group.by = "callback_clusters") 73 | ``` 74 | -------------------------------------------------------------------------------- /man/FindClustersCallback.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/callback.R 3 | \name{FindClustersCallback} 4 | \alias{FindClustersCallback} 5 | \title{Runs the callback algorithm on a Seurat object to find 6 | calibrated clusters.} 7 | \usage{ 8 | FindClustersCallback( 9 | seurat_obj, 10 | resolution_start = 0.8, 11 | reduction_percentage = 0.2, 12 | num_clusters_start = 20, 13 | dims = 1:10, 14 | algorithm = "louvain", 15 | assay = "RNA", 16 | cores = 1, 17 | verbose = TRUE 18 | ) 19 | } 20 | \arguments{ 21 | \item{seurat_obj}{The Seurat object that will be analyzed.} 22 | 23 | \item{resolution_start}{The starting resolution to be used for the 24 | clustering algorithm (Louvain and Leiden algorithms).} 25 | 26 | \item{reduction_percentage}{The amount that the starting parameter will be 27 | reduced by after each iteration (between 0 and 1).} 28 | 29 | \item{num_clusters_start}{The starting number of clusters to be used for the 30 | clustering algorithm (K-means and Hierarchical clustering algorithms).} 31 | 32 | \item{dims}{The dimensions to use as input features (i.e. 1:10).} 33 | 34 | \item{algorithm}{The clustering algorithm to be used.} 35 | 36 | \item{assay}{The assay to generate knockoffs from.} 37 | 38 | \item{cores}{The number of cores to compute marker genes in parallel.} 39 | 40 | \item{verbose}{Whether or not to show all logging.} 41 | } 42 | \value{ 43 | Returns a Seurat object where the idents have been updated with the 44 | clusters determined via the callback algorithm. 45 | Latest clustering results will be stored in the object metadata under 46 | 'callback_clusters'. Note that 'callback_clusters' will be overwritten every 47 | time FindClustersCallback is run. 48 | } 49 | \description{ 50 | Given a Seurat object, returns a new Seurat object whose cluster assignments 51 | have been calibrated with the callback algorithm: knockoff genes are added, 52 | the augmented data are preprocessed and clustered, and the clustering is repeated 53 | at progressively coarser settings until every pair of clusters shows significant differential expression. 54 | } 55 | -------------------------------------------------------------------------------- /R/estimate_zipoisson.R: -------------------------------------------------------------------------------- 1 | 2 | # https://en.wikipedia.org/wiki/Zero-inflated_model#Estimators_of_ZIP_parameters 3 | # https://math.stackexchange.com/questions/2761563/maximum-likelihood-estimation-for-zero-inflated-poisson-distribution 4 | # https://ieeexplore.ieee.org/document/9032203 5 | 6 | #' @title Maximum likelihood estimation for the zero-inflated Poisson distribution 7 | #' with Poisson parameter lambda and zero proportion prop.zero. 8 | #' 9 | #' @description Given data, computes the maximum likelihood estimators 10 | #' for the zero-inflated Poisson distribution. 11 | #' 12 | #' @param data The data to estimate parameters from.
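#' @details A sketch of the closed-form estimator implemented below (following
#' the references linked at the top of this file): let r0 be the observed
#' proportion of zeros and x.bar the sample mean. Setting gamma = x.bar / (1 - r0),
#' the estimates are lambda.hat = lambertW0(-gamma * exp(-gamma)) + gamma and
#' pi.hat = 1 - x.bar / lambda.hat, where lambertW0 is the principal branch of
#' the Lambert W function (lamW::lambertW0).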
13 | #' @returns Maximum likelihood estimators of the zero-inflated Poisson 14 | #' distribution 15 | #' @name estimate_zi_poisson 16 | estimate_zi_poisson <- function(data) { 17 | num.zeros <- sum(data == 0) 18 | r0 <- 1 / length(data) * num.zeros 19 | 20 | x.bar <- mean(data) 21 | 22 | gamma <- x.bar / (1 - r0) 23 | 24 | lambda.hat <- lamW::lambertW0(-gamma * exp(-gamma)) + gamma 25 | 26 | pi.hat <- 1 - x.bar / lambda.hat 27 | 28 | 29 | return.list <- list("lambda.hat" = lambda.hat, "pi.hat" = pi.hat) 30 | return(return.list) 31 | } 32 | 33 | 34 | #' @title Random data generation for the zero-inflated Poisson distribution 35 | #' with Poisson parameter lambda and zero proportion prop.zero. 36 | #' 37 | #' @description Given the number of samples desired, a Poisson parameter, 38 | #' lambda, and a zero proportion, prop.zero, simulates the number of desired 39 | #' samples from ZIP(lambda, prop.zero). 40 | #' 41 | #' @param n The number of samples to be simulated. 42 | #' @param lambda The Poisson rate parameter. 43 | #' @param prop.zero The proportion of excess zeroes. 44 | #' @returns Simulated data from ZIP(lambda, prop.zero). 45 | #' @name rzipoisson 46 | rzipoisson <- function(n, lambda, prop.zero) { 47 | data <- c() 48 | 49 | 50 | for (i in 1:n) { 51 | if (stats::runif(1) < prop.zero) { 52 | data[i] <- 0 53 | } 54 | else { 55 | data[i] <- stats::rpois(1, lambda) 56 | } 57 | } 58 | return(data) 59 | } 60 | 61 | -------------------------------------------------------------------------------- /R/seurat_workflow.R: -------------------------------------------------------------------------------- 1 | #' @title Runs a typical Seurat workflow on a Seurat object (up to 2 | #' dimensionality reduction and clustering). 3 | #' 4 | #' @description Given a Seurat object, returns a new Seurat object that has been 5 | #' normalized, had variable features identified, 6 | #' scaled, had principal components computed, had clusters identified, and had 7 | #' tSNE and UMAP embeddings determined. 8 | #' 9 | #' @param seurat_obj A Seurat object that will be analyzed. 10 | #' @param num_variable_features The number of variable features to use in the 11 | #' analysis. 12 | #' @param resolution_param The resolution parameter to use when clustering. 13 | #' @param visualization_method Either "umap", "tsne", or "both". 14 | #' @param num_dims The number of principal components to use. 15 | #' @param algorithm The clustering algorithm to use, either "louvain" or 16 | #' "leiden". 17 | #' @returns A Seurat object containing the relevant analysis results.
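#' @examples
#' \dontrun{
#' # Hypothetical usage sketch: `sobj` stands in for any Seurat object built
#' # from raw counts; the parameter values shown are illustrative, not prescriptive.
#' sobj <- seurat_workflow(sobj,
#'                         num_variable_features = 2000,
#'                         resolution_param = 0.8,
#'                         visualization_method = "umap")
#' }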
18 | #' @export 19 | #' @name seurat_workflow 20 | seurat_workflow <- function(seurat_obj, 21 | num_variable_features, 22 | resolution_param = 0.8, 23 | visualization_method = "umap", 24 | num_dims = 10, 25 | algorithm = "louvain") { 26 | seurat_obj <- Seurat::NormalizeData(seurat_obj) 27 | 28 | seurat_obj <- Seurat::FindVariableFeatures(seurat_obj, 29 | selection.method = "vst", 30 | nfeatures = num_variable_features) 31 | 32 | all_genes <- rownames(seurat_obj) 33 | 34 | #seurat_obj <- Seurat::ScaleData(seurat_obj, features = all_genes) 35 | seurat_obj <- Seurat::ScaleData(seurat_obj) 36 | 37 | seurat_obj <- Seurat::RunPCA(seurat_obj, 38 | features = Seurat::VariableFeatures(object = seurat_obj)) 39 | 40 | # todo check if i should use all dims for knockoffs 41 | seurat_obj <- Seurat::FindNeighbors(seurat_obj, dims = 1:num_dims) 42 | 43 | if (algorithm == "louvain") { 44 | seurat_obj <- Seurat::FindClusters(seurat_obj, 45 | resolution = resolution_param) 46 | } 47 | 48 | if (algorithm == "leiden") { 49 | seurat_obj <- Seurat::FindClusters(seurat_obj, 50 | resolution = resolution_param, 51 | algorithm = 4, 52 | method = "igraph") 53 | } 54 | 55 | if (visualization_method == "umap") { 56 | seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims) 57 | } 58 | if (visualization_method == "tsne") { 59 | seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims) 60 | } 61 | 62 | if (visualization_method == "both") { 63 | seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims) 64 | seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims) 65 | } 66 | 67 | return(seurat_obj) 68 | } 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # THIS REPOSITORY HAS BEEN ARCHIVED AND WILL NOT BE UPDATED. THIS METHOD HAS BEEN RENAMED recall (Calibrated Clustering with Artificial Variables) AND IS NOW LOCATED IN [THIS REPOSITORY](https://github.com/lcrawlab/recall). 2 | 3 | # callback (Calibrated Clustering via Knockoffs) 4 | 5 | [![R CMD check](https://github.com/lcrawlab/callback/actions/workflows/check-standard.yml/badge.svg)](https://github.com/lcrawlab/callback/actions/workflows/check-standard.yml) 6 | [![Docker Image CI](https://github.com/lcrawlab/callback/actions/workflows/docker-image.yml/badge.svg)](https://github.com/lcrawlab/callback/actions/workflows/docker-image.yml) 7 | 8 | ## Introduction 9 | 10 | Standard single-cell RNA-sequencing (scRNA-seq) pipelines nearly always include unsupervised clustering as a key step in identifying biologically distinct cell types. A follow-up step in these pipelines is to test for differential expression between the identified clusters. When algorithms over-cluster, downstream analyses will produce inflated P-values resulting in increased false discoveries. 11 | Here, we present `callback` (Calibrated Clustering via Knockoffs): a new method for protecting against over-clustering by controlling for the impact of double-dipping. 12 | Importantly, our approach can be applied to any clustering algorithm (implemented here are the Louvain, Leiden, K-means, and hierarchical clustering algorithms). 13 | `callback` provides state-of-the-art clustering performance and can rapidly analyze large-scale scRNA-seq studies, even on a personal laptop. 14 | 15 | ## Installation 16 | 17 | You can install the latest development version by using the [devtools](https://CRAN.R-project.org/package=devtools) library.
To install this package with devtools, use this command: 18 | 19 | ```r 20 | devtools::install_github("lcrawlab/callback") 21 | ``` 22 | 23 | Although it is not explicitly a dependency, making sure you have `presto` installed will make `callback` much faster. 24 | 25 | ```r 26 | devtools::install_github("immunogenomics/presto") 27 | ``` 28 | 29 | 30 | ## Tutorial 31 | 32 | ```r 33 | library(Seurat) 34 | library(SeuratData) 35 | 36 | library(callback) 37 | 38 | set.seed(123) 39 | 40 | # load pbmc3k dataset 41 | SeuratData::InstallData("pbmc3k") 42 | data("pbmc3k") 43 | 44 | pbmc3k <- UpdateSeuratObject(pbmc3k) 45 | 46 | pbmc3k <- NormalizeData(pbmc3k) 47 | pbmc3k <- FindVariableFeatures(pbmc3k) 48 | pbmc3k <- ScaleData(pbmc3k) 49 | pbmc3k <- RunPCA(pbmc3k) 50 | pbmc3k <- FindNeighbors(pbmc3k) 51 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10) 52 | 53 | pbmc_default <- FindClusters(pbmc3k) 54 | pbmc_callback <- FindClustersCallback(pbmc3k) 55 | 56 | DimPlot(pbmc_default) + DimPlot(pbmc_callback) 57 | ``` 58 | ## Overview of the Method 59 | 60 | The `callback` algorithm consists of three simple steps: 61 | 62 | 1. First, we generate synthetic null variables, formally called knockoff features, where we augment the single-cell data being analyzed with "fake" genes that are known not to contribute to any unique cell type. 63 | 2. Second, we perform both preprocessing and clustering on this augmented dataset. 64 | 3. Third, we calibrate the number of inferred clusters by using a hypothesis testing strategy with a data-dependent threshold to determine if there is a statistically significant difference between groups. If any pair of groups does not have statistically significant differences then re-clustering occurs. 65 | 66 | The synthetic knockoff genes act as negative control variables; they go through the same analytic steps as the real data and are presented with the same opportunity to be identified as marker genes. 67 | The `callback` algorithm uses the guiding principle that well-calibrated clusters (i.e., those representing real groups) should have significantly differentially expressed genes after correcting for multiple hypothesis tests, while over-clustered groups will not. 68 | We use this rule to iteratively re-cluster cells until the inferred clusters are well-calibrated and the observed differences in expression between groups are not due to the effects of double-dipping. 69 | 70 | ## Relevant Citations 71 | `callback` is currently on the bioRxiv, [here](https://www.biorxiv.org/content/10.1101/2024.03.08.584180v1). 72 | 73 | A. DenAdel, M. Ramseier, A. Navia, A. Shalek, S. Raghavan, P. Winter, A. Amini, and L. Crawford. A knockoff calibration method to avoid over-clustering in single-cell RNA-sequencing. _bioRxiv_. 74 | 75 | ## Questions and Feedback 76 | For questions or concerns with `callback`, please contact 77 | [Alan DenAdel](mailto:alan_denadel@brown.edu) or [Lorin Crawford](lcrawford@microsoft.com). Any feedback on the software, manuscript, and tutorials is appreciated. 78 | -------------------------------------------------------------------------------- /R/callback.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Returns a Seurat object that contains additional (fake) RNA 3 | #' expression counts in the form of knockoffs. 
4 | #' 5 | #' @description Given a Seurat object, returns a new Seurat object whose RNA 6 | #' expression counts include the 7 | #' variable features from the original object and an equal number of knockoff 8 | #' features. 9 | #' 10 | #' @param seurat_obj A Seurat object containing RNA expression counts. 11 | #' @param assay The assay to generate knockoffs from. 12 | #' @param verbose Whether or not to show logging. 13 | #' @returns A Seurat object that contains the original variable features and an 14 | #' equal number of knockoff features. 15 | #' @name get_seurat_obj_with_knockoffs 16 | get_seurat_obj_with_knockoffs <- function(seurat_obj, assay = "RNA", verbose = TRUE) { 17 | 18 | if (verbose) { 19 | message("Pulling data from Seurat object") 20 | } 21 | 22 | var_features <- Seurat::VariableFeatures(seurat_obj) 23 | seurat_obj_data <- as.data.frame(t(as.matrix(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts")[var_features, ]))) 24 | 25 | if (verbose) { 26 | message("Computing MLE for zero-inflated Poisson for each gene") 27 | } 28 | 29 | ml_estimates <- lapply(seurat_obj_data, estimate_zi_poisson) 30 | 31 | if (verbose) { 32 | message("Computing knockoff features") 33 | } 34 | 35 | ko <- as.data.frame(lapply(ml_estimates, 36 | function(x) { 37 | rzipoisson(nrow(seurat_obj_data), 38 | x$lambda.hat, 39 | x$pi.hat) 40 | })) 41 | 42 | 43 | num_variable_features <- length(var_features) 44 | colnames(ko) <- paste0(rep("knockoff", num_variable_features), 1:num_variable_features) 45 | combined_data <- cbind(seurat_obj_data, ko) 46 | 47 | # sparsify augmented data matrix and transpose for use in Seurat 48 | combined_data <- Matrix::Matrix(t(combined_data), sparse = TRUE) 49 | 50 | new_project_name <- paste0(seurat_obj@project.name, "_with_knockoffs") 51 | new_seurat_obj <- Seurat::CreateSeuratObject(counts = combined_data, project = new_project_name) 52 | 53 | return(new_seurat_obj) 54 | } 55 | 56 | 57 | 58 | 59 | 60 | 61 | #' @title Returns the genes selected by the knockoff filter 62 | #' 63 | #' @description Given a Seurat object, returns the genes selected by 64 | #' the knockoff filter and their W statistics. 65 | #' 66 | #' @param seurat_obj A Seurat object 67 | #' @param cluster1 The Idents of the first cluster of interest in seurat_obj 68 | #' @param cluster2 The Idents of the second cluster of interest in seurat_obj 69 | #' @param q The desired rate to control the FDR at 70 | #' @param return_all Determines if the returned object will contain all genes 71 | #' or just the selected genes. 72 | #' @param num_cores The number of cores for computing marker genes in parallel.
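#' @details A brief sketch of the filter as implemented below: for each
#' original gene, Seurat::FindMarkers p-values are computed for the gene and
#' its knockoff counterpart and converted to the statistic
#' W = -log10(p_original) - (-log10(p_knockoff)). Genes are then selected when
#' W meets the data-dependent threshold returned by
#' knockoff::knockoff.threshold(W, fdr = q, offset = 1).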
73 | #' @returns A list containing a data frame of the selected genes and their W statistics ("selected_features", or "all_features" when return_all is TRUE), and the knockoff filter threshold ("threshold"). 74 | #' @name compute_knockoff_filter 75 | compute_knockoff_filter <- function(seurat_obj, 76 | cluster1, 77 | cluster2, 78 | q, 79 | return_all = FALSE, 80 | num_cores = 1) { 81 | options(future.globals.maxSize = 8000 * 1024^2) 82 | # todo note what this is for, figure this out as a parameter or programmatically 83 | future::plan("multicore", workers = as.numeric(num_cores)) 84 | 85 | markers <- Seurat::FindMarkers(seurat_obj, 86 | ident.1 = cluster1, 87 | ident.2 = cluster2, 88 | logfc.threshold = 0, 89 | min.pct = 0) 90 | 91 | 92 | # FindMarkers orders by p-value, so we can't rely on position to know which genes are which 93 | knockoff_indices <- grepl("^knockoff", rownames(markers)) 94 | original_indices <- !knockoff_indices 95 | 96 | # subset the markers data.frame into originals and knockoffs 97 | knockoff_markers <- markers[knockoff_indices, ] 98 | original_markers <- markers[original_indices, ] 99 | 100 | all_genes <- rownames(seurat_obj) 101 | 102 | # get indices of knockoffs and originals from seurat_obj, should be [FALSE, ..., FALSE, TRUE, ..., TRUE] 103 | knockoff_indices_sorted <- grepl("^knockoff", all_genes) 104 | original_indices_sorted <- !knockoff_indices_sorted 105 | 106 | knockoff_names_sorted <- all_genes[knockoff_indices_sorted] 107 | original_names_sorted <- all_genes[original_indices_sorted] 108 | 109 | # sort markers data.frames by their original orderings 110 | knockoff_markers_sorted <- knockoff_markers[knockoff_names_sorted, ] 111 | original_markers_sorted <- original_markers[original_names_sorted, ] 112 | 113 | original_p_values <- original_markers_sorted$p_val 114 | knockoff_p_values <- knockoff_markers_sorted$p_val 115 | 116 | log_original_p_values <- -log10(original_p_values) 117 | log_knockoff_p_values <- -log10(knockoff_p_values) 118 | 119 | W <- log_original_p_values - log_knockoff_p_values 120 | 121 | thres <- knockoff::knockoff.threshold(W, fdr = q, offset = 1) 122 | 123 | 124 | if (return_all) { 125 | all_features <- as.data.frame(list("gene" = original_names_sorted, "W" = W)) 126 | 127 | ret <- list("all_features" = all_features, "threshold" = thres) 128 | 129 | return(ret) 130 | } 131 | selected_indices <- which(W >= thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p) 132 | #selected_indices <- which(W > thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p) 133 | 134 | selected_genes <- original_names_sorted[selected_indices] 135 | selected_Ws <- W[selected_indices] 136 | 137 | selected_features <- as.data.frame(list("selected_gene" = selected_genes, "W" = selected_Ws)) 138 | 139 | selected_features <- selected_features[order(selected_features$W, decreasing = TRUE), ] 140 | 141 | ret <- list("selected_features" = selected_features, "threshold" = thres) 142 | 143 | return(ret) 144 | } 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | #' @title Runs the callback algorithm on a Seurat object to find 154 | #' calibrated clusters. 155 | #' 156 | #' @description Given a Seurat object, returns a new Seurat object whose 157 | #' cluster assignments have been calibrated with the callback algorithm: 158 | #' knockoff genes are added, the augmented data are preprocessed and clustered, 159 | #' and the clustering is repeated at progressively coarser settings until every pair of clusters shows significant differential expression. 160 | #' 161 | #' @param seurat_obj The Seurat object that will be analyzed.
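#' @examples
#' \dontrun{
#' # Hypothetical usage sketch (mirrors the README tutorial): `pbmc3k` is
#' # assumed to have already been normalized, scaled, and run through
#' # FindVariableFeatures, RunPCA, and FindNeighbors.
#' pbmc3k <- FindClustersCallback(pbmc3k, dims = 1:10, cores = 1)
#' }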
162 | #' @param resolution_start The starting resolution to be used for the 163 | #' clustering algorithm (Louvain and Leiden algorithms). 164 | #' @param num_clusters_start The starting number of clusters to be used for the 165 | #' clustering algorithm (K-means and Hierarchical clustering algorithms). 166 | #' @param reduction_percentage The amount that the starting parameter will be 167 | #' reduced by after each iteration (between 0 and 1). 168 | #' @param dims The dimensions to use as input features (i.e. 1:10). 169 | #' @param algorithm The clustering algorithm to be used. 170 | #' @param assay The assay to generate knockoffs from. 171 | #' @param cores The number of cores to compute marker genes in parallel. 172 | #' @param verbose Whether or not to show all logging. 173 | #' @returns Returns a Seurat object where the idents have been updated with the 174 | #' clusters determined via the callback algorithm. 175 | #' Latest clustering results will be stored in the object metadata under 176 | #' 'callback_clusters'. Note that 'callback_clusters' will be overwritten every 177 | #' time FindClustersCallback is run. 178 | #' @name FindClustersCallback 179 | #' @export 180 | FindClustersCallback <- function(seurat_obj, 181 | resolution_start = 0.8, 182 | reduction_percentage = 0.2, 183 | num_clusters_start = 20, 184 | dims = 1:10, 185 | algorithm = "louvain", # todo implement all algos 186 | assay = "RNA", 187 | cores = 1, 188 | verbose = TRUE) { 189 | 190 | # todo check function arguments for validity 191 | 192 | knockoff_seurat_obj <- get_seurat_obj_with_knockoffs(seurat_obj, assay = assay, verbose = verbose) 193 | 194 | num_variable_features <- 2 * length(Seurat::VariableFeatures(seurat_obj)) 195 | 196 | 197 | # Pre-process data 198 | options(future.globals.maxSize = 8000 * 1024^2) 199 | # todo log number of cores being used 200 | future::plan("multicore", workers = as.numeric(cores)) 201 | #options(future.globals.maxSize = 8000 * 1024^2) 202 | 203 | if (verbose) { 204 | message(paste("Number of cores:", cores)) 205 | } 206 | 207 | #plan("multicore", workers = as.numeric(cores)) 208 | 209 | knockoff_seurat_obj <- Seurat::NormalizeData(knockoff_seurat_obj, 210 | verbose = FALSE) 211 | 212 | knockoff_seurat_obj <- Seurat::FindVariableFeatures(knockoff_seurat_obj, 213 | selection.method = "vst", 214 | nfeatures = num_variable_features, 215 | verbose = FALSE) 216 | 217 | knockoff_seurat_obj <- Seurat::ScaleData(knockoff_seurat_obj, verbose = FALSE) 218 | knockoff_seurat_obj <- Seurat::RunPCA(knockoff_seurat_obj, 219 | features = Seurat::VariableFeatures(object = knockoff_seurat_obj), 220 | verbose = FALSE) 221 | 222 | # todo check if i should use all dims for knockoffs 223 | knockoff_seurat_obj <- Seurat::FindNeighbors(knockoff_seurat_obj, 224 | dims = dims, 225 | verbose = FALSE) 226 | 227 | resolution_param <- resolution_start 228 | 229 | 230 | first_iteration <- TRUE 231 | 232 | while (TRUE) { 233 | if (verbose) { 234 | message("####################################################################") 235 | message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm")) 236 | message(paste("Resolution param:", resolution_param)) 237 | } 238 | 239 | if (algorithm == "louvain") { 240 | knockoff_seurat_obj <- Seurat::FindClusters(knockoff_seurat_obj, 241 | resolution = resolution_param, 242 | verbose = FALSE) 243 | } 244 | 245 | if (algorithm == "leiden") { 246 | #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job
hanging 247 | knockoff_seurat_obj <- Seurat::FindClusters(knockoff_seurat_obj, 248 | resolution = resolution_param, 249 | algorithm = 4, 250 | method = "igraph", 251 | verbose = FALSE) 252 | } 253 | 254 | # Reduce resolution for next iteration of the loop 255 | resolution_param <- (1 - reduction_percentage) * resolution_param 256 | 257 | k <- length(levels(Seurat::Idents(knockoff_seurat_obj))) 258 | #knock_idents <- 0:(k-1) 259 | 260 | if (verbose) { 261 | message("Num clusters:") 262 | message(k) 263 | } 264 | 265 | knock_idents <- levels(Seurat::Idents(knockoff_seurat_obj)) 266 | 267 | num_selected_matrix <- matrix(nrow = k, ncol = k) 268 | 269 | found_no_sign_diff <- FALSE 270 | 271 | num_clusters <- length(knock_idents) 272 | 273 | 274 | if (verbose) { 275 | progress_bar_length <- num_clusters * (num_clusters - 1) / 2 276 | cli::cli_progress_bar("Processing cluster pairs:", 277 | total = progress_bar_length, 278 | clear = FALSE) 279 | } 280 | 281 | m <- 0 282 | for (i in 1:num_clusters) { 283 | for (j in 1:num_clusters) { 284 | if (j >= i) { 285 | next 286 | } 287 | 288 | m <- m + 1 289 | 290 | if (verbose) { 291 | cli::cli_progress_update() 292 | } 293 | 294 | markers_selected <- compute_knockoff_filter(seurat_obj = knockoff_seurat_obj, 295 | cluster1 = knock_idents[i], 296 | cluster2 = knock_idents[j], 297 | q = 0.05, 298 | num_cores = cores) 299 | 300 | num_selected <- nrow(markers_selected$selected_features) 301 | 302 | if (num_selected == 0) { 303 | found_no_sign_diff <- TRUE 304 | break 305 | } 306 | 307 | num_selected_matrix[i, j] <- num_selected 308 | num_selected_matrix[j, i] <- num_selected 309 | 310 | } 311 | if (found_no_sign_diff) { 312 | first_iteration <- FALSE # record the re-clustering here (outside the verbose check) so the warning below stays accurate when verbose = FALSE 313 | if (verbose) { 314 | cli::cli_progress_done() 315 | message("Found clusters with no significant differences.") 316 | message("Progressing to next clustering iteration.") 317 | } 318 | break 319 | } 320 | } 321 | 322 | if (found_no_sign_diff) { 323 | next 324 | } 325 | break 326 | } 327 | 328 | if (first_iteration) { 329 | warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run callback with a larger starting parameter.") 330 | } 331 | 332 | seurat_obj@meta.data$callback_clusters <- Seurat::Idents(knockoff_seurat_obj) 333 | Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$callback_clusters 334 | 335 | return(seurat_obj) 336 | } 337 | --------------------------------------------------------------------------------