├── R
│   ├── countsplit.R
│   ├── estimate_zipoisson.R
│   ├── seurat_workflow.R
│   ├── copula.R
│   ├── estimate_negative_binomial.R
│   └── recall.R
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── docker-image.yml
│       ├── super-linter.yml
│       ├── check-standard.yml
│       ├── pkgdown.yaml
│       └── lintr.yml
├── LICENSE
├── .Rbuildignore
├── _pkgdown.yml
├── Dockerfile
├── man
│   ├── figures
│   │   └── recall_logo.png
│   ├── estimate_zi_poisson.Rd
│   ├── estimate_negative_binomial.Rd
│   ├── rzipoisson.Rd
│   ├── estimate_negative_binomial_copula.Rd
│   ├── compute_knockoff_filter.Rd
│   ├── get_seurat_obj_with_artificial_variables.Rd
│   ├── seurat_workflow.Rd
│   ├── FindClustersRecall.Rd
│   └── FindClustersCountsplit.Rd
├── NAMESPACE
├── .gitignore
├── LICENSE.md
├── DESCRIPTION
├── vignettes
│   └── basic-usage.Rmd
└── README.md

/R/countsplit.R:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2024
2 | COPYRIGHT HOLDER: recall authors
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^_pkgdown\.yml$
2 | ^docs$
3 | ^pkgdown$
4 | ^\.github$
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://lcrawlab.github.io/recall/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM rocker/verse:4.0.5
2 | 
3 | # Copy the package source into the image so that the local install below
4 | # has something to install (without a COPY step, the build context is not
5 | # available inside the image and install.packages('.') fails).
6 | COPY . /recall
7 | WORKDIR /recall
8 | 
9 | RUN R -e "install.packages('.', type = 'source', repos = NULL)"
--------------------------------------------------------------------------------
/man/figures/recall_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lcrawlab/recall/HEAD/man/figures/recall_logo.png
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(FindClustersCountsplit)
4 | export(FindClustersRecall)
5 | export(seurat_workflow)
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image CI
2 | 
3 | on:
4 |   push:
5 |     branches: [ "main" ]
6 |   pull_request:
7 |     branches: [ "main" ]
8 | 
9 | jobs:
10 | 
11 |   build:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v3
17 |     - name: Build the Docker image
18 |       run: docker build . 
--file Dockerfile --tag recall:$(date +%s)
19 | 
--------------------------------------------------------------------------------
/man/estimate_zi_poisson.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/estimate_zipoisson.R
3 | \name{estimate_zi_poisson}
4 | \alias{estimate_zi_poisson}
5 | \title{Maximum likelihood estimation for the zero-inflated Poisson distribution
6 | with Poisson parameter lambda and zero proportion prop.zero.}
7 | \usage{
8 | estimate_zi_poisson(data)
9 | }
10 | \arguments{
11 | \item{data}{The data to estimate parameters from.}
12 | }
13 | \value{
14 | Maximum likelihood estimators of the zero-inflated Poisson
15 | distribution
16 | }
17 | \description{
18 | Given data, computes the maximum likelihood estimators
19 | for the zero-inflated Poisson distribution.
20 | }
21 | 
--------------------------------------------------------------------------------
/man/estimate_negative_binomial.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/estimate_negative_binomial.R
3 | \name{estimate_negative_binomial}
4 | \alias{estimate_negative_binomial}
5 | \title{Maximum likelihood estimation for the negative binomial
6 | distribution.}
7 | \usage{
8 | estimate_negative_binomial(data, verbose = FALSE)
9 | }
10 | \arguments{
11 | \item{data}{The data to estimate parameters from.}
12 | 
13 | \item{verbose}{Whether or not to show all logging.}
14 | }
15 | \value{
16 | Maximum likelihood estimators size and mu for the negative
17 | binomial distribution
18 | }
19 | \description{
20 | Given data, computes the maximum likelihood estimators
21 | for the negative binomial distribution with parameters: size and mu.
22 | }
23 | 
--------------------------------------------------------------------------------
/man/rzipoisson.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/estimate_zipoisson.R
3 | \name{rzipoisson}
4 | \alias{rzipoisson}
5 | \title{Random data generation for the zero-inflated Poisson distribution
6 | with Poisson parameter lambda and zero proportion prop.zero.}
7 | \usage{
8 | rzipoisson(n, lambda, prop.zero)
9 | }
10 | \arguments{
11 | \item{n}{The number of samples to be simulated.}
12 | 
13 | \item{lambda}{The Poisson rate parameter.}
14 | 
15 | \item{prop.zero}{The proportion of excess zeroes.}
16 | }
17 | \value{
18 | Simulated data from ZIP(lambda, prop.zero).
19 | }
20 | \description{
21 | Given the number of samples desired, a Poisson parameter,
22 | lambda, and a zero proportion, prop.zero, simulates the number of desired
23 | samples from ZIP(lambda, prop.zero). 
24 | }
25 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 | 
5 | # Session Data files
6 | .RData
7 | .RDataTmp
8 | 
9 | # User-specific files
10 | .Ruserdata
11 | 
12 | # Example code in package build process
13 | *-Ex.R
14 | 
15 | # Output files from R CMD build
16 | /*.tar.gz
17 | 
18 | # Output files from R CMD check
19 | /*.Rcheck/
20 | 
21 | # RStudio files
22 | .Rproj.user/
23 | 
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 | 
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 | 
31 | # knitr and R markdown default cache directories
32 | *_cache/
33 | /cache/
34 | 
35 | # Temporary files created by R markdown
36 | *.utf8.md
37 | *.knit.md
38 | 
39 | # R Environment Variables
40 | .Renviron
41 | 
42 | # pkgdown site
43 | docs/
44 | 
45 | # translation temp files
46 | po/*~
47 | 
48 | # RStudio Connect folder
49 | rsconnect/
50 | .Rproj.user
51 | .Rdata
52 | .DS_Store
53 | docs
54 | inst/doc
--------------------------------------------------------------------------------
/.github/workflows/super-linter.yml:
--------------------------------------------------------------------------------
1 | # This workflow executes several linters on changed files based on languages used in your code base whenever
2 | # you push code or open a pull request.
3 | #
4 | # You can adjust the behavior by modifying this file.
5 | # For more information, see:
6 | # https://github.com/github/super-linter
7 | name: Lint Code Base
8 | 
9 | on:
10 |   push:
11 |     branches: [ "main" ]
12 |   pull_request:
13 |     branches: [ "main" ]
14 | jobs:
15 |   run-lint:
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - name: Checkout code
19 |         uses: actions/checkout@v3
20 |         with:
21 |           # Full git history is needed to get a proper list of changed files within `super-linter`
22 |           fetch-depth: 0
23 | 
24 |       - name: Lint Code Base
25 |         uses: github/super-linter@v4
26 |         env:
27 |           VALIDATE_ALL_CODEBASE: false
28 |           DEFAULT_BRANCH: "main"
29 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
30 | 
--------------------------------------------------------------------------------
/man/estimate_negative_binomial_copula.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/copula.R
3 | \name{estimate_negative_binomial_copula}
4 | \alias{estimate_negative_binomial_copula}
5 | \alias{estimate_zi_poisson_copula}
6 | \alias{estimate_poisson_copula}
7 | \alias{estimate_gaussian_copula}
8 | \title{Simulate synthetic null data from a copula model fit with scDesign3.}
9 | \usage{
10 | estimate_zi_poisson_copula(data_matrix, cores)
11 | 
12 | estimate_negative_binomial_copula(data_matrix, cores)
13 | 
14 | estimate_poisson_copula(data_matrix, cores)
15 | 
16 | estimate_gaussian_copula(data_matrix, cores)
17 | }
18 | \arguments{
19 | \item{data_matrix}{The data to estimate parameters from.}
20 | 
21 | \item{cores}{The number of CPU cores to use in estimation by scDesign3.}
22 | }
23 | \value{
24 | A matrix of synthetic null data simulated from the fitted zero-inflated Poisson copula model.
25 | 
26 | A matrix of synthetic null data simulated from the fitted negative binomial copula model.
27 | 
28 | A matrix of synthetic null data simulated from the fitted Poisson copula model.
29 | 
30 | A matrix of synthetic null data simulated from the fitted Gaussian copula model.
31 | }
32 | \description{
33 | Given data, fits zero-inflated Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
34 | 
35 | Given data, fits negative binomial marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
36 | 
37 | Given data, fits Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
38 | 
39 | Given data, fits Gaussian marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
40 | }
41 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 | 
3 | Copyright (c) 2024 recall authors
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/man/compute_knockoff_filter.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{compute_knockoff_filter}
4 | \alias{compute_knockoff_filter}
5 | \title{Returns the genes selected by the knockoff filter}
6 | \usage{
7 | compute_knockoff_filter(
8 |   seurat_obj,
9 |   cluster1,
10 |   cluster2,
11 |   q,
12 |   return_all = FALSE,
13 |   num_cores = 1,
14 |   shared_memory_max
15 | )
16 | }
17 | \arguments{
18 | \item{seurat_obj}{A Seurat object}
19 | 
20 | \item{cluster1}{The Idents of the first cluster of interest in seurat_obj}
21 | 
22 | \item{cluster2}{The Idents of the second cluster of interest in seurat_obj}
23 | 
24 | \item{q}{The desired rate to control the FDR at}
25 | 
26 | \item{return_all}{Determines if the returned object will contain all genes
27 | or just the selected genes.}
28 | 
29 | \item{num_cores}{The number of cores for computing marker genes in parallel.}
30 | 
31 | \item{shared_memory_max}{The maximum size for shared global variables.}
32 | }
33 | \value{
34 | A list with the selected genes (or all genes if return_all = TRUE), their W statistics, and the knockoff threshold.
35 | }
36 | \description{
37 | Given a Seurat object and two cluster identities, returns the genes
38 | selected by the knockoff filter and their W statistics. 
39 | }
40 | 
--------------------------------------------------------------------------------
/man/get_seurat_obj_with_artificial_variables.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{get_seurat_obj_with_artificial_variables}
4 | \alias{get_seurat_obj_with_artificial_variables}
5 | \title{Returns a Seurat object that contains additional (fake) RNA
6 | expression counts.}
7 | \usage{
8 | get_seurat_obj_with_artificial_variables(
9 |   seurat_obj,
10 |   assay = "RNA",
11 |   null_method = "ZIP",
12 |   verbose = TRUE,
13 |   cores
14 | )
15 | }
16 | \arguments{
17 | \item{seurat_obj}{A Seurat object containing RNA expression counts.}
18 | 
19 | \item{assay}{The assay to generate artificial variables from.}
20 | 
21 | \item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)}
22 | 
23 | \item{verbose}{Whether or not to show logging.}
24 | 
25 | \item{cores}{The number of cores to use in generating synthetic null variables.}
26 | }
27 | \value{
28 | A Seurat object that contains the original variable features and an
29 | equal number of artificial features.
30 | }
31 | \description{
32 | Given a Seurat object, returns a new Seurat object whose RNA
33 | expression counts include the
34 | variable features from the original object and an equal number of artificial
35 | features.
36 | }
37 | 
--------------------------------------------------------------------------------
/man/seurat_workflow.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/seurat_workflow.R
3 | \name{seurat_workflow}
4 | \alias{seurat_workflow}
5 | \title{Runs a typical Seurat workflow on a Seurat object (up to
6 | dimensionality reduction and clustering).}
7 | \usage{
8 | seurat_workflow(
9 |   seurat_obj,
10 |   num_variable_features,
11 |   resolution_param = 0.8,
12 |   visualization_method = "umap",
13 |   num_dims = 10,
14 |   algorithm = "louvain"
15 | )
16 | }
17 | \arguments{
18 | \item{seurat_obj}{A Seurat object that will be analyzed.}
19 | 
20 | \item{num_variable_features}{The number of variable features to use in the
21 | analysis.}
22 | 
23 | \item{resolution_param}{The resolution parameter to use when clustering.}
24 | 
25 | \item{visualization_method}{Either "umap", "tsne", or "both".}
26 | 
27 | \item{num_dims}{The number of principal components to use.}
28 | 
29 | \item{algorithm}{The clustering algorithm to use, either "louvain" or
30 | "leiden".}
31 | }
32 | \value{
33 | A Seurat object containing the relevant analysis results.
34 | }
35 | \description{
36 | Given a Seurat object, returns a new Seurat object that has been
37 | normalized, had variable features identified,
38 | scaled, had principal components computed, had clusters identified, and had
39 | tSNE and UMAP embeddings determined. 
40 | }
41 | 
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: recall
2 | Title: Calibrated clustering with artificial variables to avoid over-clustering in single-cell RNA-sequencing
3 | Version: 0.0.0
4 | Authors@R:
5 |     person("Alan", "DenAdel", , "alan_denadel@brown.edu", role = c("aut", "cre"),
6 |            comment = c(ORCID = "0000-0002-7985-6789"))
7 | Description: recall (Calibrated Clustering with Artificial Variables) is a method for protecting
8 |     against over-clustering by controlling for the impact of double-dipping. The approach
9 |     can be applied to any clustering algorithm (implemented are the Louvain and Leiden algorithms with
10 |     plans for K-means and hierarchical clustering algorithms). The method provides state-of-the-art
11 |     clustering performance, can rapidly analyze large-scale scRNA-seq studies, and is
12 |     compatible with the Seurat library.
13 | Encoding: UTF-8
14 | Roxygen: list(markdown = TRUE)
15 | RoxygenNote: 7.3.2
16 | Imports:
17 |     Matrix,
18 |     Seurat (>= 5.0.1),
19 |     SingleCellExperiment,
20 |     scDesign3,
21 |     SummarizedExperiment,
22 |     MASS,
23 |     fitdistrplus,
24 |     lamW,
25 |     knockoff,
26 |     future,
27 |     stats,
28 |     cli,
29 |     stringr,
30 |     countsplit
31 | License: MIT + file LICENSE
32 | Suggests:
33 |     knitr,
34 |     markdown
35 | Remotes:
36 |     scDesign3=github::SONGDONGYUAN1994/scDesign3
37 | VignetteBuilder: knitr
38 | URL: https://lcrawlab.github.io/recall/
39 | 
--------------------------------------------------------------------------------
/.github/workflows/check-standard.yml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 | 
9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macos-latest, r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest, r: 'release'}
25 |           - {os: ubuntu-latest, r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v4
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 |   push:
5 |     branches: [main, master]
6 |   pull_request:
7 |     branches: [main, master]
8 |   release:
9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown.yaml
13 | 
14 | permissions: read-all
15 | 
16 | jobs:
17 |   pkgdown:
18 |     runs-on: ubuntu-latest
19 |     # Only restrict concurrency for non-PR jobs
20 |     concurrency:
21 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
22 |     env:
23 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
24 |     permissions:
25 |       contents: write
26 |     steps:
27 |       - uses: actions/checkout@v4
28 | 
29 |       - uses: r-lib/actions/setup-pandoc@v2
30 | 
31 |       - uses: r-lib/actions/setup-r@v2
32 |         with:
33 |           use-public-rspm: true
34 | 
35 |       - uses: r-lib/actions/setup-r-dependencies@v2
36 |         with:
37 |           extra-packages: any::pkgdown, local::.
38 |           needs: website
39 | 
40 |       - name: Build site
41 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
42 |         shell: Rscript {0}
43 | 
44 |       - name: Deploy to GitHub pages 🚀
45 |         if: github.event_name != 'pull_request'
46 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
47 |         with:
48 |           clean: false
49 |           branch: gh-pages
50 |           folder: docs
51 | 
--------------------------------------------------------------------------------
/vignettes/basic-usage.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Basic Usage on PBMC3k Data"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 |   %\VignetteIndexEntry{basic-usage}
6 |   %\VignetteEngine{knitr::rmarkdown}
7 |   %\VignetteEncoding{UTF-8}
8 | 
9 | ---
10 | 
11 | ```{r, include = FALSE}
12 | knitr::opts_chunk$set(
13 |   collapse = TRUE,
14 |   comment = "#>"
15 | )
16 | 
17 | knitr::opts_chunk$set(eval = FALSE)
18 | 
19 | ```
20 | 
21 | ```{r setup}
22 | suppressPackageStartupMessages({
23 | library(Seurat)
24 | library(SeuratData)
25 | library(recall)
26 | })
27 | ```
28 | 
29 | 
30 | First, we use the `SeuratData` data package to download and then load
31 | 2700 PBMCs. The loaded `SeuratObject`, `pbmc3k`, is from an old version of
32 | `Seurat`, and so we update the object to v5.
33 | 
34 | ```{r load_data}
35 | set.seed(123)
36 | 
37 | SeuratData::InstallData("pbmc3k")
38 | data("pbmc3k")
39 | 
40 | pbmc3k <- UpdateSeuratObject(pbmc3k)
41 | ```
42 | 
43 | Now, we use `Seurat` to perform the usual preprocessing steps that are performed prior to clustering.
44 | 
45 | ```{r preprocessing}
46 | pbmc3k <- NormalizeData(pbmc3k)
47 | pbmc3k <- FindVariableFeatures(pbmc3k)
48 | pbmc3k <- ScaleData(pbmc3k)
49 | pbmc3k <- RunPCA(pbmc3k)
50 | pbmc3k <- FindNeighbors(pbmc3k)
51 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
52 | ```
53 | 
54 | The `recall` algorithm can be run with a single function call as a drop-in
55 | replacement for the `Seurat` function `FindClusters`.
56 | 
57 | ```{r run_recall}
58 | pbmc3k <- FindClustersRecall(pbmc3k)
59 | ```
60 | 
61 | The `recall` clusters are set to the idents of the `SeuratObject` that is
62 | returned by `FindClustersRecall`.
63 | 
64 | ```{r plot_umap}
65 | DimPlot(pbmc3k)
66 | ```
67 | 
68 | Cluster labels from `FindClustersRecall` are stored in the metadata in the
69 | column `pbmc3k@meta.data$recall_clusters`. 
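
A quick way to inspect the calibrated clustering is to tabulate how many
cells received each label (a minimal check; this assumes `FindClustersRecall`
was run as above).

```{r cluster_counts}
table(pbmc3k@meta.data$recall_clusters)
```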
70 | 71 | ```{r plot_umap2} 72 | DimPlot(pbmc3k, group.by = "recall_clusters") 73 | ``` 74 | -------------------------------------------------------------------------------- /.github/workflows/lintr.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | # lintr provides static code analysis for R. 6 | # It checks for adherence to a given style, 7 | # identifying syntax errors and possible semantic issues, 8 | # then reports them to you so you can take action. 9 | # More details at https://lintr.r-lib.org/ 10 | 11 | name: lintr 12 | 13 | on: 14 | push: 15 | branches: [ "main" ] 16 | pull_request: 17 | # The branches below must be a subset of the branches above 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '18 7 * * 6' 21 | 22 | permissions: 23 | contents: read 24 | 25 | jobs: 26 | lintr: 27 | name: Run lintr scanning 28 | runs-on: ubuntu-latest 29 | permissions: 30 | contents: read # for checkout to fetch code 31 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 32 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 33 | 34 | steps: 35 | - name: Checkout code 36 | uses: actions/checkout@v4 37 | 38 | - name: Setup R 39 | uses: r-lib/actions/setup-r@4e1feaf90520ec1215d1882fdddfe3411c08e492 40 | with: 41 | r-version: '4.3' # The R version to download (if necessary) and use. 42 | 43 | - name: Setup lintr 44 | uses: r-lib/actions/setup-r-dependencies@4e1feaf90520ec1215d1882fdddfe3411c08e492 45 | with: 46 | extra-packages: lintr 47 | 48 | - name: Run lintr 49 | run: lintr::sarif_output(lintr::lint_dir("."), "lintr-results.sarif") 50 | shell: Rscript {0} 51 | continue-on-error: true 52 | 53 | - name: Upload analysis results to GitHub 54 | uses: github/codeql-action/upload-sarif@v3 55 | with: 56 | sarif_file: lintr-results.sarif 57 | wait-for-processing: true 58 | -------------------------------------------------------------------------------- /R/estimate_zipoisson.R: -------------------------------------------------------------------------------- 1 | 2 | # https://en.wikipedia.org/wiki/Zero-inflated_model#Estimators_of_ZIP_parameters 3 | # https://math.stackexchange.com/questions/2761563/maximum-likelihood-estimation-for-zero-inflated-poisson-distribution 4 | # https://ieeexplore.ieee.org/document/9032203 5 | 6 | #' @title Maximum likelihood estimation for the zero-inflated Poisson distribution 7 | #' with Poisson parameter lambda and zero proportion prop.zero. 8 | #' 9 | #' @description Given data, computes the maximum likelihood estimators 10 | #' for the zero-inflated Poisson distribution. 11 | #' 12 | #' @param data The data to estimate parameters from. 
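#' @details The estimator implemented below (based on the references linked
#' above) has a closed form: with x.bar the sample mean and r0 the observed
#' proportion of zeros, it sets gamma = x.bar / (1 - r0), computes
#' lambda.hat = W0(-gamma * exp(-gamma)) + gamma using the principal branch
#' W0 of the Lambert W function (lamW::lambertW0), and then recovers
#' pi.hat = 1 - x.bar / lambda.hat.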
13 | #' @returns Maximum likelihood estimators of the zero-inflated Poisson
14 | #' distribution
15 | #' @name estimate_zi_poisson
16 | estimate_zi_poisson <- function(data) {
17 |   num.zeros <- sum(data == 0)
18 |   r0 <- 1 / length(data) * num.zeros
19 | 
20 |   x.bar <- mean(data)
21 | 
22 |   gamma <- x.bar / (1 - r0)
23 | 
24 |   lambda.hat <- lamW::lambertW0(-gamma * exp(-gamma)) + gamma
25 | 
26 |   pi.hat <- 1 - x.bar / lambda.hat
27 | 
28 | 
29 |   return.list <- list("lambda.hat" = lambda.hat, "pi.hat" = pi.hat)
30 |   return(return.list)
31 | }
32 | 
33 | 
34 | #' @title Random data generation for the zero-inflated Poisson distribution
35 | #' with Poisson parameter lambda and zero proportion prop.zero.
36 | #'
37 | #' @description Given the number of samples desired, a Poisson parameter,
38 | #' lambda, and a zero proportion, prop.zero, simulates the number of desired
39 | #' samples from ZIP(lambda, prop.zero).
40 | #'
41 | #' @param n The number of samples to be simulated.
42 | #' @param lambda The Poisson rate parameter.
43 | #' @param prop.zero The proportion of excess zeroes.
44 | #' @returns Simulated data from ZIP(lambda, prop.zero).
45 | #' @name rzipoisson
46 | rzipoisson <- function(n, lambda, prop.zero) {
47 |   data <- c()
48 | 
49 | 
50 |   for (i in 1:n) {
51 |     if (stats::runif(1) < prop.zero) {
52 |       data[i] <- 0
53 |     }
54 |     else {
55 |       data[i] <- stats::rpois(1, lambda)
56 |     }
57 |   }
58 |   return(data)
59 | }
60 | 
61 | 
--------------------------------------------------------------------------------
/man/FindClustersRecall.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{FindClustersRecall}
4 | \alias{FindClustersRecall}
5 | \title{Runs the recall clustering algorithm on a Seurat object to
6 | obtain calibrated cluster labels.}
7 | \usage{
8 | FindClustersRecall(
9 |   seurat_obj,
10 |   resolution_start = 0.8,
11 |   reduction_percentage = 0.2,
12 |   num_clusters_start = 20,
13 |   dims = 1:10,
14 |   algorithm = "louvain",
15 |   null_method = "ZIP",
16 |   assay = "RNA",
17 |   cores = 1,
18 |   shared_memory_max = 8000 * 1024^2,
19 |   verbose = TRUE
20 | )
21 | }
22 | \arguments{
23 | \item{seurat_obj}{The Seurat object that will be analyzed.}
24 | 
25 | \item{resolution_start}{The starting resolution to be used for the
26 | clustering algorithm (Louvain and Leiden algorithms).}
27 | 
28 | \item{reduction_percentage}{The amount that the starting parameter will be
29 | reduced by after each iteration (between 0 and 1).}
30 | 
31 | \item{num_clusters_start}{The starting number of clusters to be used for the
32 | clustering algorithm (K-means and Hierarchical clustering algorithms).}
33 | 
34 | \item{dims}{The dimensions to use as input features (i.e. 1:10).}
35 | 
36 | \item{algorithm}{The clustering algorithm to be used.}
37 | 
38 | \item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)}
39 | 
40 | \item{assay}{The assay to generate artificial variables from.}
41 | 
42 | \item{cores}{The number of cores to compute marker genes in parallel.}
43 | 
44 | \item{shared_memory_max}{The maximum size for shared global variables.
45 | Increase this variable if you see the following error:
46 | The total size of the X globals that need to be exported for the future expression
47 | ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
48 | (option 'future.globals.maxSize'). 
The X largest globals are ...}
49 | 
50 | \item{verbose}{Whether or not to show all logging.}
51 | }
52 | \value{
53 | Returns a Seurat object where the idents have been updated with the
54 | clusters determined via the recall algorithm.
55 | Latest clustering results will be stored in the object metadata under
56 | 'recall_clusters'. Note that 'recall_clusters' will be overwritten every
57 | time FindClustersRecall is run.
58 | }
59 | \description{
60 | Given a Seurat object, returns a new Seurat object whose cluster labels
61 | have been calibrated with the recall algorithm, so that observed
62 | differences in expression between the reported clusters are not
63 | artifacts of double-dipping.
64 | }
65 | 
--------------------------------------------------------------------------------
/man/FindClustersCountsplit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/recall.R
3 | \name{FindClustersCountsplit}
4 | \alias{FindClustersCountsplit}
5 | \title{Runs the countsplit clustering procedure on a Seurat object to
6 | obtain calibrated cluster labels.}
7 | \usage{
8 | FindClustersCountsplit(
9 |   seurat_obj,
10 |   resolution_start = 0.8,
11 |   reduction_percentage = 0.2,
12 |   num_clusters_start = 20,
13 |   dims = 1:10,
14 |   algorithm = "louvain",
15 |   null_method = "ZIP",
16 |   assay = "RNA",
17 |   cores = 1,
18 |   shared_memory_max = 8000 * 1024^2,
19 |   verbose = TRUE
20 | )
21 | }
22 | \arguments{
23 | \item{seurat_obj}{The Seurat object that will be analyzed.}
24 | 
25 | \item{resolution_start}{The starting resolution to be used for the
26 | clustering algorithm (Louvain and Leiden algorithms).}
27 | 
28 | \item{reduction_percentage}{The amount that the starting parameter will be
29 | reduced by after each iteration (between 0 and 1).}
30 | 
31 | \item{num_clusters_start}{The starting number of clusters to be used for the
32 | clustering algorithm (K-means and Hierarchical clustering algorithms).}
33 | 
34 | \item{dims}{The dimensions to use as input features (i.e. 1:10).}
35 | 
36 | \item{algorithm}{The clustering algorithm to be used.}
37 | 
38 | \item{null_method}{The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)}
39 | 
40 | \item{assay}{The assay to generate artificial variables from.}
41 | 
42 | \item{cores}{The number of cores to compute marker genes in parallel.}
43 | 
44 | \item{shared_memory_max}{The maximum size for shared global variables.
45 | Increase this variable if you see the following error:
46 | The total size of the X globals that need to be exported for the future expression
47 | ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
48 | (option 'future.globals.maxSize'). The X largest globals are ...}
49 | 
50 | \item{verbose}{Whether or not to show all logging.}
51 | }
52 | \value{
53 | Returns a Seurat object where the idents have been updated with the
54 | clusters determined via the countsplit algorithm.
55 | Latest clustering results will be stored in the object metadata under
56 | 'countsplit_clusters'. Note that 'countsplit_clusters' will be overwritten every
57 | time FindClustersCountsplit is run.
58 | }
59 | \description{
60 | Given a Seurat object, returns a new Seurat object whose cluster labels
61 | have been calibrated with the countsplit procedure, so that observed
62 | differences in expression between the reported clusters are not
63 | artifacts of double-dipping. 
64 | }
65 | 
--------------------------------------------------------------------------------
/R/seurat_workflow.R:
--------------------------------------------------------------------------------
1 | #' @title Runs a typical Seurat workflow on a Seurat object (up to
2 | #' dimensionality reduction and clustering).
3 | #'
4 | #' @description Given a Seurat object, returns a new Seurat object that has been
5 | #' normalized, had variable features identified,
6 | #' scaled, had principal components computed, had clusters identified, and had
7 | #' tSNE and UMAP embeddings determined.
8 | #'
9 | #' @param seurat_obj A Seurat object that will be analyzed.
10 | #' @param num_variable_features The number of variable features to use in the
11 | #' analysis.
12 | #' @param resolution_param The resolution parameter to use when clustering.
13 | #' @param visualization_method Either "umap", "tsne", or "both".
14 | #' @param num_dims The number of principal components to use.
15 | #' @param algorithm The clustering algorithm to use, either "louvain" or
16 | #' "leiden".
17 | #' @returns A Seurat object containing the relevant analysis results.
18 | #' @export
19 | #' @name seurat_workflow
20 | seurat_workflow <- function(seurat_obj,
21 |                             num_variable_features,
22 |                             resolution_param = 0.8,
23 |                             visualization_method = "umap",
24 |                             num_dims = 10,
25 |                             algorithm = "louvain") {
26 |   seurat_obj <- Seurat::NormalizeData(seurat_obj)
27 | 
28 |   seurat_obj <- Seurat::FindVariableFeatures(seurat_obj,
29 |                                              selection.method = "vst",
30 |                                              nfeatures = num_variable_features)
31 | 
32 |   all_genes <- rownames(seurat_obj)
33 | 
34 |   seurat_obj <- Seurat::ScaleData(seurat_obj)
35 | 
36 |   seurat_obj <- Seurat::RunPCA(seurat_obj,
37 |                                features = Seurat::VariableFeatures(object = seurat_obj))
38 | 
39 |   seurat_obj <- Seurat::FindNeighbors(seurat_obj, dims = 1:num_dims)
40 | 
41 |   if (algorithm == "louvain") {
42 |     seurat_obj <- Seurat::FindClusters(seurat_obj,
43 |                                        resolution = resolution_param)
44 |   }
45 | 
46 |   if (algorithm == "leiden") {
47 |     seurat_obj <- Seurat::FindClusters(seurat_obj,
48 |                                        resolution = resolution_param,
49 |                                        algorithm = 4,
50 |                                        method = "igraph")
51 |   }
52 | 
53 |   if (visualization_method == "umap") {
54 |     seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
55 |   }
56 |   if (visualization_method == "tsne") {
57 |     seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
58 |   }
59 | 
60 |   if (visualization_method == "both") {
61 |     seurat_obj <- Seurat::RunUMAP(seurat_obj, dims = 1:num_dims)
62 |     seurat_obj <- Seurat::RunTSNE(seurat_obj, dims = 1:num_dims)
63 |   }
64 | 
65 |   return(seurat_obj)
66 | }
67 | 
--------------------------------------------------------------------------------
/R/copula.R:
--------------------------------------------------------------------------------
1 | simulate_data_scDesign3 <- function(data_matrix, cores, family) {
2 |   sce <- SingleCellExperiment::SingleCellExperiment(list(counts = data_matrix))
3 |   SummarizedExperiment::colData(sce)$cell_type <- "1" # scDesign3 needs a cell type so we just make it the same for all cells
4 | 
5 |   simulated_data <- scDesign3::scdesign3(sce,
6 |                                          celltype = "cell_type",
7 |                                          pseudotime = NULL,
8 |                                          spatial = NULL,
9 |                                          other_covariates = NULL,
10 |                                          empirical_quantile = FALSE,
11 |                                          usebam = TRUE, # to speed up marginal inference
12 |                                          mu_formula = "1",
13 |                                          sigma_formula = "1",
14 |                                          corr_formula = "1",
15 |                                          family_use = family, # this is the key parameter
16 |                                          nonzerovar = FALSE,
17 |                                          n_cores = cores,
18 |                                          parallelization = "mcmapply",
19 |                                          important_feature = "all", 
20 |                                          nonnegative = FALSE,
21 |                                          copula = "gaussian",
22 |                                          fastmvn = TRUE)
23 | 
24 |   ko <- simulated_data$new_count
25 | 
26 |   return(ko)
27 | }
28 | 
29 | 
30 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
31 | #'
32 | #' @description Given data, fits zero-inflated Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
33 | #'
34 | #' @param data_matrix The data to estimate parameters from.
35 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
36 | #' @returns A matrix of synthetic null data simulated from the fitted zero-inflated Poisson copula model.
37 | #' @name estimate_negative_binomial_copula
38 | estimate_zi_poisson_copula <- function(data_matrix, cores) {
39 |   family <- "zip"
40 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
41 |   return(ko)
42 | }
43 | 
44 | 
45 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
46 | #'
47 | #' @description Given data, fits negative binomial marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
48 | #'
49 | #' @param data_matrix The data to estimate parameters from.
50 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
51 | #' @returns A matrix of synthetic null data simulated from the fitted negative binomial copula model.
52 | #' @name estimate_negative_binomial_copula
53 | estimate_negative_binomial_copula <- function(data_matrix, cores) {
54 |   family <- "nb"
55 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
56 |   return(ko)
57 | }
58 | 
59 | 
60 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
61 | #'
62 | #' @description Given data, fits Poisson marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
63 | #'
64 | #' @param data_matrix The data to estimate parameters from.
65 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
66 | #' @returns A matrix of synthetic null data simulated from the fitted Poisson copula model.
67 | #' @name estimate_negative_binomial_copula
68 | estimate_poisson_copula <- function(data_matrix, cores) {
69 |   family <- "poisson"
70 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
71 |   return(ko)
72 | }
73 | 
74 | 
75 | 
76 | 
77 | #' @title Simulate synthetic null data from a copula model fit with scDesign3.
78 | #'
79 | #' @description Given data, fits Gaussian marginals with a Gaussian copula via scDesign3 and simulates synthetic null data from the fit.
80 | #'
81 | #' @param data_matrix The data to estimate parameters from.
82 | #' @param cores The number of CPU cores to use in estimation by scDesign3.
83 | #' @returns A matrix of synthetic null data simulated from the fitted Gaussian copula model.
84 | #' @name estimate_negative_binomial_copula
85 | estimate_gaussian_copula <- function(data_matrix, cores) {
86 |   family <- "gaussian"
87 |   ko <- simulate_data_scDesign3(data_matrix, cores, family)
88 |   return(ko)
89 | }
90 | 
91 | 
92 | 
93 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # recall (Calibrated Clustering with Artificial Variables)
2 | 
3 | [![R CMD check](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/check-standard.yml)
4 | [![Docker Image CI](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml/badge.svg)](https://github.com/lcrawlab/recall/actions/workflows/docker-image.yml)
5 | 
6 | ## Introduction
7 | 
8 | Standard single-cell RNA-sequencing (scRNA-seq) pipelines nearly always include unsupervised clustering as a key step in identifying biologically distinct cell types. A follow-up step in these pipelines is to test for differential expression between the identified clusters. When algorithms over-cluster, downstream analyses will produce inflated P-values, resulting in increased false discoveries.
9 | Here, we present `recall` (Calibrated Clustering with Artificial Variables): a new method for protecting against over-clustering by controlling for the impact of double-dipping.
10 | Importantly, our approach can be applied to any clustering algorithm (implemented here are the Louvain and Leiden algorithms with plans to implement the K-means and hierarchical clustering algorithms). 
11 | `recall` provides state-of-the-art clustering performance and can rapidly analyze large-scale scRNA-seq studies, even on a personal laptop.
12 | 
13 | ## Installation
14 | 
15 | You can install the latest development version by using the [devtools](https://CRAN.R-project.org/package=devtools) library with this command:
16 | 
17 | ```r
18 | devtools::install_github("lcrawlab/recall")
19 | ```
20 | 
21 | Although it is not explicitly a dependency, making sure you have `presto` installed will make `recall` much faster.
22 | 
23 | ```r
24 | devtools::install_github("immunogenomics/presto")
25 | ```
26 | 
27 | 
28 | ## Tutorial
29 | 
30 | ```r
31 | library(Seurat)
32 | library(SeuratData)
33 | 
34 | library(recall)
35 | 
36 | set.seed(123)
37 | 
38 | # load pbmc3k dataset
39 | SeuratData::InstallData("pbmc3k")
40 | data("pbmc3k")
41 | 
42 | pbmc3k <- UpdateSeuratObject(pbmc3k)
43 | 
44 | pbmc3k <- NormalizeData(pbmc3k)
45 | pbmc3k <- FindVariableFeatures(pbmc3k)
46 | pbmc3k <- ScaleData(pbmc3k)
47 | pbmc3k <- RunPCA(pbmc3k)
48 | pbmc3k <- FindNeighbors(pbmc3k)
49 | pbmc3k <- RunUMAP(pbmc3k, dims = 1:10)
50 | 
51 | pbmc_default <- FindClusters(pbmc3k)
52 | pbmc_recall <- FindClustersRecall(pbmc3k)
53 | 
54 | DimPlot(pbmc_default) + DimPlot(pbmc_recall)
55 | ```
56 | ## Overview of the Method
57 | 
58 | The `recall` algorithm consists of three simple steps:
59 | 
60 | 1. First, we generate synthetic null variables, inspired by knockoff variables (Barber and Candès, 2015), where we augment the single-cell data being analyzed with "fake" genes that are known not to contribute to any unique cell type.
61 | 2. Second, we perform both preprocessing and clustering on this augmented dataset.
62 | 3. Third, we calibrate the number of inferred clusters by using a hypothesis testing strategy with a data-dependent threshold to determine if there is a statistically significant difference between groups. If any pair of groups does not have statistically significant differences, then re-clustering occurs.
63 | 
64 | The synthetic genes act as negative control variables; they go through the same analytic steps as the real data and are presented with the same opportunity to be identified as marker genes.
65 | The `recall` algorithm uses the guiding principle that well-calibrated clusters (i.e., those representing real groups) should have significantly differentially expressed genes after correcting for multiple hypothesis tests, while over-clustered groups will not.
66 | We use this rule to iteratively re-cluster cells until the inferred clusters are well-calibrated and the observed differences in expression between groups are not due to the effects of double-dipping. A minimal code sketch of this selection rule is given in the appendix at the end of this README.
67 | 
68 | ## Relevant Citations
69 | `recall` is now published in AJHG, [here](https://www.cell.com/ajhg/abstract/S0002-9297(25)00061-8).
70 | 
71 | A. DenAdel, M. Ramseier, A. Navia, A. Shalek, S. Raghavan, P. Winter, A. Amini, and L. Crawford. A knockoff calibration method to avoid over-clustering in single-cell RNA-sequencing. _AJHG_.
72 | 
73 | ## Questions and Feedback
74 | For questions or concerns with `recall`, please contact
75 | [Alan DenAdel](mailto:alan_denadel@brown.edu) or [Lorin Crawford](mailto:lcrawford@microsoft.com). Any feedback on the software, manuscript, and tutorials is appreciated. 
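
## Appendix: A Sketch of the Selection Rule

As a small illustration of the hypothesis-testing step from the overview above, the snippet below mimics how `recall` contrasts real genes against their paired synthetic nulls. It is a self-contained sketch that uses simulated p-values in place of the `Seurat::FindMarkers` output that `recall` computes internally; the variables `p_original` and `p_synthetic` are placeholders, not part of the package API.

```r
library(knockoff)

set.seed(1)

# Simulated p-values: 30 truly differential genes plus 70 null genes,
# each paired with a synthetic null gene that is never differential.
p_original  <- c(10^-runif(30, 5, 12), runif(70))
p_synthetic <- runif(100)

# A gene scores highly only when it is far more significant than its
# paired synthetic null.
W <- -log10(p_original) + log10(p_synthetic)

# Data-dependent threshold that controls the FDR at q = 0.05.
threshold <- knockoff.threshold(W, fdr = 0.05, offset = 1)

selected <- which(W >= threshold)
length(selected)  # roughly the 30 differential genes
```

When no genes survive this filter for some pair of clusters, `recall` treats the pair as over-split and re-clusters at a lower resolution.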
76 | 
--------------------------------------------------------------------------------
/R/estimate_negative_binomial.R:
--------------------------------------------------------------------------------
1 | 
2 | #' @title Maximum likelihood estimation for the negative binomial
3 | #' distribution.
4 | #'
5 | #' @description Given data, computes the maximum likelihood estimators
6 | #' for the negative binomial distribution with parameters: size and mu.
7 | #'
8 | #' @param data The data to estimate parameters from.
9 | #' @returns Maximum likelihood estimators size and mu for the negative
10 | #' binomial distribution
11 | #' @param verbose Whether or not to show all logging.
12 | #' @name estimate_negative_binomial
13 | estimate_negative_binomial <- function(data, verbose=FALSE) {
14 | 
15 |   if (verbose) { message("Attempting MLE method 1") }
16 |   mle1 <- tryCatch(
17 |     {
18 |       nb_fit <- MASS::fitdistr(data, "negative binomial", method = "Nelder-Mead")
19 |       size <- nb_fit$estimate[["size"]]
20 |       mu <- nb_fit$estimate[["mu"]]
21 | 
22 |       # check if method returned NaN or NA without throwing an error
23 |       if (is.na(mu) || is.na(size)) { stop() }
24 | 
25 |       return.list <- list("size" = size, "mu" = mu)
26 |       return(return.list)
27 |     },
28 |     error = function(cond) {
29 |       if (verbose) { message("MLE method 1 failed with an error.") }
30 |       NA
31 |     },
32 |     warning = function(cond) {
33 |       if (verbose) { message("MLE method 1 had a warning. Warning message:\n") }
34 |       if (verbose) { message(cond) }
35 |       if (verbose) { message("\n") }
36 |       NA
37 |     }
38 |   )
39 | 
40 |   if (verbose) { message("Attempting MLE method 2") }
41 |   mle2 <- tryCatch(
42 |     {
43 |       nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mle")
44 |       size <- nb_fit$estimate[["size"]]
45 |       mu <- nb_fit$estimate[["mu"]]
46 | 
47 |       # check if method returned NaN or NA without throwing an error
48 |       if (is.na(mu) || is.na(size)) { stop() }
49 | 
50 |       return.list <- list("size" = size, "mu" = mu)
51 |       return(return.list)
52 | 
53 |     },
54 |     error = function(cond) {
55 |       if (verbose) { message("MLE method 2 failed with an error.") }
56 |       NA
57 |     },
58 |     warning = function(cond) {
59 |       if (verbose) { message("MLE method 2 had a warning. Warning message:") }
60 |       if (verbose) { message(cond) }
61 |       NA
62 |     }
63 |   )
64 | 
65 |   if (verbose) { message("Attempting MME") }
66 |   mme <- tryCatch(
67 |     {
68 |       nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mme")
69 |       size <- nb_fit$estimate[["size"]]
70 |       mu <- nb_fit$estimate[["mu"]]
71 | 
72 |       # check if method returned NaN or NA without throwing an error
73 |       if (is.na(mu) || is.na(size)) { stop() }
74 | 
75 |       return.list <- list("size" = size, "mu" = mu)
76 |       return(return.list)
77 |     },
78 |     error = function(cond) {
79 |       if (verbose) { message("MME failed with an error.") }
80 |       NA
81 |     },
82 |     warning = function(cond) {
83 |       if (verbose) { message("MME method had a warning. 
Warning message:") } 84 | if (verbose) { message(cond) } 85 | NA 86 | } 87 | ) 88 | 89 | 90 | if (verbose) { message("Attempting MME with warnings") } 91 | mme <- tryCatch( 92 | { 93 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mme") 94 | size <- nb_fit$estimate[["size"]] 95 | mu <- nb_fit$estimate[["mu"]] 96 | 97 | # check if method returned NaN or NA without throwing an error 98 | if (is.na(mu) || is.na(size)) { stop() } 99 | 100 | return.list <- list("size" = size, "mu" = mu) 101 | return(return.list) 102 | }, 103 | error = function(cond) { 104 | if (verbose) { message("MME failed with an error.") } 105 | NA 106 | } 107 | ) 108 | 109 | if (verbose) { message("Attempting MSE") } 110 | mme <- tryCatch( 111 | { 112 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mse") 113 | size <- nb_fit$estimate[["size"]] 114 | mu <- nb_fit$estimate[["mu"]] 115 | 116 | # check if method returned NaN or NA without throwing an error 117 | if (is.na(mu) || is.na(size)) { stop() } 118 | 119 | return.list <- list("size" = size, "mu" = mu) 120 | return(return.list) 121 | }, 122 | error = function(cond) { 123 | if (verbose) { message("MSE failed with an error.") } 124 | NA 125 | }, 126 | warning = function(cond) { 127 | if (verbose) { message("MSE method failed. Warning message:") } 128 | if (verbose) { message(cond) } 129 | NA 130 | } 131 | ) 132 | 133 | if (verbose) { message("Attempting QME") } 134 | mme <- tryCatch( 135 | { 136 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="qme") 137 | size <- nb_fit$estimate[["size"]] 138 | mu <- nb_fit$estimate[["mu"]] 139 | 140 | # check if method returned NaN or NA without throwing an error 141 | if (is.na(mu) || is.na(size)) { stop() } 142 | 143 | return.list <- list("size" = size, "mu" = mu) 144 | return(return.list) 145 | }, 146 | error = function(cond) { 147 | if (verbose) { message("QME failed with an error.") } 148 | NA 149 | }, 150 | warning = function(cond) { 151 | if (verbose) { message("QME method failed. Warning message:") } 152 | if (verbose) { message(cond) } 153 | NA 154 | } 155 | ) 156 | 157 | 158 | if (verbose) { message("Attempting MGE") } 159 | mme <- tryCatch( 160 | { 161 | nb_fit <- fitdistrplus::fitdist(data, "nbinom", method="mge") 162 | size <- nb_fit$estimate[["size"]] 163 | mu <- nb_fit$estimate[["mu"]] 164 | 165 | # check if method returned NaN or NA without throwing an error 166 | if (is.na(mu) || is.na(size)) { stop() } 167 | 168 | return.list <- list("size" = size, "mu" = mu) 169 | return(return.list) 170 | }, 171 | error = function(cond) { 172 | if (verbose) { message("MGE failed with an error.") } 173 | NA 174 | }, 175 | warning = function(cond) { 176 | if (verbose) { message("MGE method failed. Warning message:") } 177 | if (verbose) { message(cond) } 178 | NA 179 | } 180 | ) 181 | 182 | 183 | 184 | stop("All negative binomial estimation methods failed.") 185 | 186 | } 187 | -------------------------------------------------------------------------------- /R/recall.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title Returns a Seurat object that contains additional (fake) RNA 3 | #' expression counts. 4 | #' 5 | #' @description Given a Seurat object, returns a new Seurat object whose RNA 6 | #' expression counts includes the 7 | #' variable features from the original object and an equal number of artificial 8 | #' features. 9 | #' 10 | #' @param seurat_obj A Seurat object containing RNA expression counts. 
11 | #' @param assay The assay to generate artificial variables from.
12 | #' @param null_method The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)
13 | #' @param cores The number of cores to use in generating synthetic null variables.
14 | #' @param verbose Whether or not to show logging.
15 | #' @returns A Seurat object that contains the original variable features and an
16 | #' equal number of artificial features.
17 | #' @name get_seurat_obj_with_artificial_variables
18 | get_seurat_obj_with_artificial_variables <- function(seurat_obj, assay = "RNA", null_method = "ZIP", verbose = TRUE, cores) {
19 | 
20 |   if (verbose) {
21 |     message("Pulling data from Seurat object")
22 |   }
23 | 
24 |   var_features <- Seurat::VariableFeatures(seurat_obj)
25 |   seurat_obj_data <- as.data.frame(t(as.matrix(Seurat::GetAssayData(seurat_obj, assay = assay, layer = "counts")[var_features, ])))
26 | 
27 |   #if (verbose) {
28 |   #  message("Estimating the distribution of each gene")
29 |   #}
30 |   if (verbose) {
31 |     message("Computing artificial features")
32 |   }
33 | 
34 |   if (null_method == "ZIP") {
35 |     estimates <- lapply(seurat_obj_data, estimate_zi_poisson)
36 |     sampling_function <- function(x) {
37 |       rzipoisson(nrow(seurat_obj_data),
38 |                  x$lambda.hat,
39 |                  x$pi.hat)
40 |     }
41 |     ko <- as.data.frame(lapply(estimates, sampling_function))
42 | 
43 |   }
44 |   else if (null_method == "NB") {
45 |     estimates <- lapply(seurat_obj_data, estimate_negative_binomial)
46 |     sampling_function <- function(x) {
47 |       stats::rnbinom(nrow(seurat_obj_data),
48 |                      size = x$size,
49 |                      mu = x$mu)
50 |     }
51 |     ko <- as.data.frame(lapply(estimates, sampling_function))
52 |   }
53 |   else if (null_method == "ZIP-copula") {
54 |     ko <- estimate_zi_poisson_copula(seurat_obj_data, cores)
55 |   }
56 |   else if (null_method == "NB-copula") {
57 |     ko <- estimate_negative_binomial_copula(seurat_obj_data, cores)
58 |   }
59 |   else if (null_method == "Poisson-copula") {
60 |     ko <- estimate_poisson_copula(seurat_obj_data, cores)
61 |   }
62 |   else if (null_method == "Gaussian-copula") {
63 |     ko <- estimate_gaussian_copula(seurat_obj_data, cores)
64 |   }
65 |   else {
66 |     stop("You selected a null_method that is not supported. Choose from: ZIP, NB, ZIP-copula, NB-copula, Poisson-copula, Gaussian-copula.")
67 |   }
68 | 
69 | 
70 |   num_variable_features <- length(var_features)
71 |   colnames(ko) <- paste0(rep("knockoff", num_variable_features), 1:num_variable_features)
72 |   combined_data <- cbind(seurat_obj_data, ko)
73 | 
74 |   # sparsify augmented data matrix and transpose for use in Seurat
75 |   combined_data <- Matrix::Matrix(t(combined_data), sparse = TRUE)
76 | 
77 |   new_project_name <- paste0(seurat_obj@project.name, "_with_knockoffs")
78 |   new_seurat_obj <- Seurat::CreateSeuratObject(counts = combined_data, project = new_project_name)
79 | 
80 |   return(new_seurat_obj)
81 | }
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | #' @title Returns the genes selected by the knockoff filter
89 | #'
90 | #' @description Given a Seurat object and two cluster identities, returns the genes
91 | #' selected by the knockoff filter and their W statistics.
92 | #'
93 | #' @param seurat_obj A Seurat object
94 | #' @param cluster1 The Idents of the first cluster of interest in seurat_obj
95 | #' @param cluster2 The Idents of the second cluster of interest in seurat_obj
96 | #' @param q The desired rate to control the FDR at
97 | #' @param return_all Determines if the returned object will contain all genes
98 | #' or just the selected genes. 
99 | #' @param num_cores The number of cores for computing marker genes in parallel.
100 | #' @param shared_memory_max The maximum size for shared global variables.
101 | #' @returns A list with the selected genes (or all genes if return_all = TRUE), their W statistics, and the knockoff threshold.
102 | #' @name compute_knockoff_filter
103 | compute_knockoff_filter <- function(seurat_obj,
104 |                                     cluster1,
105 |                                     cluster2,
106 |                                     q,
107 |                                     return_all = FALSE,
108 |                                     num_cores = 1,
109 |                                     shared_memory_max) {
110 |   options(future.globals.maxSize = shared_memory_max)
111 |   # todo note what this is for, figure this out as a parameter or programmatically
112 |   future::plan("multicore", workers = as.numeric(num_cores))
113 | 
114 |   markers <- Seurat::FindMarkers(seurat_obj,
115 |                                  ident.1 = cluster1,
116 |                                  ident.2 = cluster2,
117 |                                  logfc.threshold = 0,
118 |                                  min.pct = 0)
119 | 
120 | 
121 |   # FindMarkers orders by p-value, so we can't rely on position to know which genes are which
122 |   knockoff_indices <- grepl("^knockoff", rownames(markers))
123 |   original_indices <- !knockoff_indices
124 | 
125 |   # subset the markers data.frame into originals and knockoffs
126 |   knockoff_markers <- markers[knockoff_indices, ]
127 |   original_markers <- markers[original_indices, ]
128 | 
129 |   all_genes <- rownames(seurat_obj)
130 | 
131 |   # get indices of knockoffs and originals from seurat_obj, should be [FALSE, ..., FALSE, TRUE, ..., TRUE]
132 |   knockoff_indices_sorted <- grepl("^knockoff", all_genes)
133 |   original_indices_sorted <- !knockoff_indices_sorted
134 | 
135 |   knockoff_names_sorted <- all_genes[knockoff_indices_sorted]
136 |   original_names_sorted <- all_genes[original_indices_sorted]
137 | 
138 |   # sort markers data.frames by their original orderings
139 |   knockoff_markers_sorted <- knockoff_markers[knockoff_names_sorted, ]
140 |   original_markers_sorted <- original_markers[original_names_sorted, ]
141 | 
142 |   original_p_values <- original_markers_sorted$p_val
143 |   knockoff_p_values <- knockoff_markers_sorted$p_val
144 | 
145 |   log_original_p_values <- -log10(original_p_values)
146 |   log_knockoff_p_values <- -log10(knockoff_p_values)
147 | 
148 |   W <- log_original_p_values - log_knockoff_p_values
149 | 
150 |   thres <- knockoff::knockoff.threshold(W, fdr = q, offset = 1)
151 | 
152 | 
153 |   if (return_all) {
154 |     all_features <- as.data.frame(list("gene" = original_names_sorted, "W" = W))
155 | 
156 |     ret <- list("all_features" = all_features, "threshold" = thres)
157 | 
158 |     return(ret)
159 |   }
160 |   selected_indices <- which(W >= thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p)
161 |   #selected_indices <- which(W > thres) # todo check if this should be > (case where threshold is Inf, but there are still some Inf -log p)
162 | 
163 |   selected_genes <- original_names_sorted[selected_indices]
164 |   selected_Ws <- W[selected_indices]
165 | 
166 |   selected_features <- as.data.frame(list("selected_gene" = selected_genes, "W" = selected_Ws))
167 | 
168 |   selected_features <- selected_features[order(selected_features$W, decreasing = TRUE), ]
169 | 
170 |   ret <- list("selected_features" = selected_features, "threshold" = thres)
171 | 
172 |   return(ret)
173 | }
174 | 
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | #' @title Runs the recall clustering algorithm on a Seurat object to
183 | #' obtain calibrated cluster labels. 
184 | #'
185 | #' @description Given a Seurat object, returns a new Seurat object whose
186 | #' cluster labels have been calibrated with the recall algorithm, so that
187 | #' observed differences in expression between the reported clusters are not
188 | #' artifacts of double-dipping.
189 | #'
190 | #' @param seurat_obj The Seurat object that will be analyzed.
191 | #' @param resolution_start The starting resolution to be used for the
192 | #' clustering algorithm (Louvain and Leiden algorithms).
193 | #' @param num_clusters_start The starting number of clusters to be used for the
194 | #' clustering algorithm (K-means and Hierarchical clustering algorithms).
195 | #' @param reduction_percentage The amount that the starting parameter will be
196 | #' reduced by after each iteration (between 0 and 1).
197 | #' @param dims The dimensions to use as input features (i.e. 1:10).
198 | #' @param algorithm The clustering algorithm to be used.
199 | #' @param null_method The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, NB-copula)
200 | #' @param assay The assay to generate artificial variables from.
201 | #' @param cores The number of cores to compute marker genes in parallel.
202 | #' @param shared_memory_max The maximum size for shared global variables.
203 | #' Increase this variable if you see the following error:
204 | #' The total size of the X globals that need to be exported for the future expression
205 | #' ('FUN()') is X GiB. This exceeds the maximum allowed size of 500.00 MiB
206 | #' (option 'future.globals.maxSize'). The X largest globals are ...
207 | #' @param verbose Whether or not to show all logging.
208 | #' @returns Returns a Seurat object where the idents have been updated with the
209 | #' clusters determined via the recall algorithm.
210 | #' Latest clustering results will be stored in the object metadata under
211 | #' 'recall_clusters'. Note that 'recall_clusters' will be overwritten every
212 | #' time FindClustersRecall is run. 
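#' @examples
#' \dontrun{
#' # Minimal usage sketch (mirrors the README tutorial); assumes `pbmc3k` has
#' # already been normalized and has had variable features, PCA, and
#' # neighbors computed.
#' pbmc3k <- FindClustersRecall(pbmc3k)
#' table(pbmc3k@meta.data$recall_clusters)
#' }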
213 | #' @name FindClustersRecall
214 | #' @export
215 | FindClustersRecall <- function(seurat_obj,
216 |                                resolution_start = 0.8,
217 |                                reduction_percentage = 0.2,
218 |                                num_clusters_start = 20,
219 |                                dims = 1:10,
220 |                                algorithm = "louvain", # todo implement all algos (currently only "louvain" and "leiden" are supported)
221 |                                null_method = "ZIP",
222 |                                assay = "RNA",
223 |                                cores = 1,
224 |                                shared_memory_max = 8000 * 1024^2,
225 |                                verbose = TRUE) {
226 | 
227 |   # todo check function arguments for validity
228 | 
229 |   augmented_with_artificial_variables_seurat_obj <- get_seurat_obj_with_artificial_variables(seurat_obj,
230 |                                                                                              assay = assay,
231 |                                                                                              null_method = null_method,
232 |                                                                                              verbose = verbose,
233 |                                                                                              cores = cores)
234 | 
235 |   num_variable_features <- 2 * length(Seurat::VariableFeatures(seurat_obj)) # doubled because the augmented object contains a knockoff copy of each original variable feature
236 | 
237 | 
238 |   # Pre-process data
239 |   options(future.globals.maxSize = shared_memory_max)
240 |   # todo log number of cores being used
241 |   future::plan("multicore", workers = as.numeric(cores))
243 | 
244 |   if (verbose) {
245 |     message(paste("Number of cores:", cores))
246 |   }
247 | 
249 | 
250 |   augmented_with_artificial_variables_seurat_obj <- Seurat::NormalizeData(augmented_with_artificial_variables_seurat_obj,
251 |                                                                           verbose = FALSE)
252 | 
253 |   augmented_with_artificial_variables_seurat_obj <- Seurat::FindVariableFeatures(augmented_with_artificial_variables_seurat_obj,
254 |                                                                                  selection.method = "vst",
255 |                                                                                  nfeatures = num_variable_features,
256 |                                                                                  verbose = FALSE)
257 | 
258 |   augmented_with_artificial_variables_seurat_obj <- Seurat::ScaleData(augmented_with_artificial_variables_seurat_obj, verbose = FALSE)
259 |   augmented_with_artificial_variables_seurat_obj <- Seurat::RunPCA(augmented_with_artificial_variables_seurat_obj,
260 |                                                                    features = Seurat::VariableFeatures(object = augmented_with_artificial_variables_seurat_obj),
261 |                                                                    verbose = FALSE)
262 | 
263 |   augmented_with_artificial_variables_seurat_obj <- Seurat::FindNeighbors(augmented_with_artificial_variables_seurat_obj,
264 |                                                                           dims = dims,
265 |                                                                           verbose = FALSE)
266 | 
267 |   resolution_param <- resolution_start
268 | 
269 | 
270 |   first_iteration <- TRUE
271 | 
272 |   while (TRUE) {
273 |     if (verbose) {
274 |       message("####################################################################")
275 |       message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
276 |       message(paste("Resolution param:", resolution_param))
277 |     }
278 | 
279 |     if (algorithm == "louvain") {
280 |       augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
281 |                                                                              resolution = resolution_param,
282 |                                                                              verbose = FALSE)
283 |     }
284 | 
285 |     if (algorithm == "leiden") {
286 |       #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
287 |       augmented_with_artificial_variables_seurat_obj <- Seurat::FindClusters(augmented_with_artificial_variables_seurat_obj,
288 |                                                                              resolution = resolution_param,
289 |                                                                              algorithm = 4,
290 |                                                                              method = "igraph",
291 |                                                                              verbose = FALSE)
292 |     }
293 | 
294 |     # Reduce resolution for the next iteration of the loop
295 |     resolution_param <- (1 - reduction_percentage) * resolution_param
296 | 
297 |     k <- length(levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj)))
299 | 
300 |     if (verbose) {
301 |       message("Num clusters:")
302 |       message(k)
303 |     }
304 | 
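    # The block below tests every unordered pair of the k clusters with the
    # knockoff filter at a target FDR of q = 0.05. If any pair yields zero
    # selected genes, that pair of clusters is considered indistinguishable,
    # the current clustering is rejected, and the while loop re-clusters at
    # the reduced resolution (with the defaults, the resolution decays
    # geometrically: 0.8, 0.64, 0.512, ...). The loop exits only once all
    # k * (k - 1) / 2 pairs differ significantly.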
305 |     knock_idents <- levels(Seurat::Idents(augmented_with_artificial_variables_seurat_obj))
306 | 
307 |     num_selected_matrix <- matrix(nrow = k, ncol = k)
308 | 
309 |     found_no_sign_diff <- FALSE
310 | 
311 |     num_clusters <- length(knock_idents)
312 | 
313 | 
314 |     if (verbose) {
315 |       progress_bar_length <- num_clusters * (num_clusters - 1) / 2
316 |       cli::cli_progress_bar("Processing cluster pairs:",
317 |                             total = progress_bar_length,
318 |                             clear = FALSE)
319 |     }
320 | 
321 |     m <- 0
322 |     for (i in 1:num_clusters) {
323 |       for (j in 1:num_clusters) {
324 |         if (j >= i) {
325 |           next
326 |         }
327 | 
328 |         m <- m + 1
329 | 
330 |         if (verbose) {
331 |           cli::cli_progress_update()
332 |         }
333 | 
334 |         markers_selected <- compute_knockoff_filter(seurat_obj = augmented_with_artificial_variables_seurat_obj,
335 |                                                     cluster1 = knock_idents[i],
336 |                                                     cluster2 = knock_idents[j],
337 |                                                     q = 0.05,
338 |                                                     num_cores = cores,
339 |                                                     shared_memory_max = shared_memory_max)
340 | 
341 |         num_selected <- nrow(markers_selected$selected_features)
342 | 
343 |         if (num_selected == 0) {
344 |           found_no_sign_diff <- TRUE
345 |           break
346 |         }
347 | 
348 |         num_selected_matrix[i, j] <- num_selected
349 |         num_selected_matrix[j, i] <- num_selected
350 | 
351 |       }
352 |       if (found_no_sign_diff) {
353 |         if (verbose) {
354 |           cli::cli_progress_done()
355 |           message("Found clusters with no significant differences.")
356 |           message("Progressing to next clustering iteration.")
357 |         }
358 |         first_iteration <- FALSE
359 |         break
360 |       }
361 |     }
362 | 
363 |     if (found_no_sign_diff) {
364 |       next
365 |     }
366 |     break
367 |   }
368 | 
369 |   if (first_iteration) {
370 |     warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run recall with a larger starting parameter.")
371 |   }
372 | 
373 |   seurat_obj@meta.data$recall_clusters <- Seurat::Idents(augmented_with_artificial_variables_seurat_obj)
374 |   Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$recall_clusters
375 | 
376 |   return(seurat_obj)
377 | }
378 | 
379 | 
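# A minimal standalone sketch of the count splitting idea used by
# FindClustersCountsplit below (illustrative only; the toy matrix is made up,
# and countsplit::countsplit() is the only real dependency). Binomial thinning
# splits each count into independent train and test folds, so clusters found
# on one fold can be tested on the other without double-dipping.
set.seed(1)
X <- matrix(rpois(200, lambda = 5), nrow = 20)  # toy counts: 20 genes x 10 cells
folds <- countsplit::countsplit(X)              # two folds by default
Xtrain <- folds[[1]]
Xtest  <- folds[[2]]
all(as.matrix(Xtrain) + as.matrix(Xtest) == X)  # TRUE: the folds sum back to the original counts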
383 | #' @title Finds calibrated clusters in a Seurat object by count splitting
384 | #' (a clustering is kept only when the held-out counts confirm it).
385 | #'
386 | #' @description Given a Seurat object, splits the counts into independent
387 | #' training and test matrices, repeatedly clusters the training data at
388 | #' decreasing resolution, and accepts a clustering only when every pair of
389 | #' clusters has at least one significant marker gene in the test data.
390 | #'
391 | #' @param seurat_obj The Seurat object that will be analyzed.
392 | #' @param resolution_start The starting resolution to be used for the
393 | #' clustering algorithm (Louvain and Leiden algorithms).
394 | #' @param num_clusters_start The starting number of clusters to be used for the
395 | #' clustering algorithm (k-means and hierarchical clustering algorithms).
396 | #' @param reduction_percentage The fraction (between 0 and 1) by which the
397 | #' starting parameter is reduced after each iteration.
398 | #' @param dims The dimensions to use as input features (e.g., 1:10).
399 | #' @param algorithm The clustering algorithm to be used.
400 | #' @param null_method The generating distribution for the synthetic null variables (ZIP, NB, ZIP-copula, or NB-copula). Unused by this function.
401 | #' @param assay The assay whose counts are split into training and test sets.
402 | #' @param cores The number of cores used to compute marker genes in parallel.
403 | #' @param shared_memory_max The maximum size for shared global variables.
404 | #' Increase this variable if you see the following error:
405 | #' "The total size of the X globals that need to be exported for the future
406 | #' expression ('FUN()') is X GiB. This exceeds the maximum allowed size of
407 | #' 500.00 MiB (option 'future.globals.maxSize'). The X largest globals are ..."
408 | #' @param verbose Whether or not to show all logging.
409 | #' @returns Returns a Seurat object whose idents have been updated with the
410 | #' clusters determined by the countsplit algorithm. The latest clustering
411 | #' results are stored in the object metadata under 'countsplit_clusters'. Note
412 | #' that 'countsplit_clusters' is overwritten every time FindClustersCountsplit
413 | #' is run.
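#' @examples
#' \dontrun{
#' # Illustrative sketch: `pbmc` is a hypothetical Seurat object with counts
#' # in the "RNA" assay and variable features already identified.
#' pbmc <- FindClustersCountsplit(pbmc, cores = 2)
#' table(pbmc$countsplit_clusters)
#' }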
414 | #' @name FindClustersCountsplit
415 | #' @export
416 | FindClustersCountsplit <- function(seurat_obj,
417 |                                    resolution_start = 0.8,
418 |                                    reduction_percentage = 0.2,
419 |                                    num_clusters_start = 20,
420 |                                    dims = 1:10,
421 |                                    algorithm = "louvain", # todo implement all algos (currently only "louvain" and "leiden" are supported)
422 |                                    null_method = "ZIP",
423 |                                    assay = "RNA",
424 |                                    cores = 1,
425 |                                    shared_memory_max = 8000 * 1024^2,
426 |                                    verbose = TRUE) {
427 | 
428 |   options(future.globals.maxSize = shared_memory_max)
429 |   # todo log number of cores being used
430 |   future::plan("multicore", workers = as.numeric(cores))
431 | 
432 |   num_variable_features <- length(Seurat::VariableFeatures(seurat_obj))
433 | 
434 | 
435 |   # follow this issue:
436 |   # https://github.com/anna-neufeld/countsplit/issues/8
437 | 
438 |   # todo only do this for variable features
439 |   split <- countsplit::countsplit(Seurat::GetAssayData(seurat_obj, assay = assay))
440 |   Xtrain <- split[[1]]
441 |   Xtest <- split[[2]]
442 | 
443 |   seurat_obj_train <- Seurat::CreateSeuratObject(counts = Xtrain)
444 |   seurat_obj_test <- Seurat::CreateSeuratObject(counts = Xtest)
445 | 
446 | 
447 |   # process training data
448 |   seurat_obj_train <- Seurat::NormalizeData(seurat_obj_train,
449 |                                             verbose = FALSE)
450 | 
451 |   seurat_obj_train <- Seurat::FindVariableFeatures(seurat_obj_train,
452 |                                                    selection.method = "vst",
453 |                                                    nfeatures = num_variable_features,
454 |                                                    verbose = FALSE)
455 | 
456 |   seurat_obj_train <- Seurat::ScaleData(seurat_obj_train, verbose = FALSE)
457 |   seurat_obj_train <- Seurat::RunPCA(seurat_obj_train,
458 |                                      features = Seurat::VariableFeatures(object = seurat_obj_train),
459 |                                      verbose = FALSE)
460 | 
461 |   seurat_obj_train <- Seurat::FindNeighbors(seurat_obj_train,
462 |                                             dims = dims,
463 |                                             verbose = FALSE)
464 | 
465 |   # process test data (no need for PCA and FindNeighbors since we are just assigning idents based on training idents)
466 |   seurat_obj_test <- Seurat::NormalizeData(seurat_obj_test,
467 |                                            verbose = FALSE)
468 | 
469 |   seurat_obj_test <- Seurat::FindVariableFeatures(seurat_obj_test,
470 |                                                   selection.method = "vst",
471 |                                                   nfeatures = num_variable_features,
472 |                                                   verbose = FALSE)
473 | 
474 |   seurat_obj_test <- Seurat::ScaleData(seurat_obj_test, verbose = FALSE)
475 | 
476 | 
477 | 
478 |   resolution_param <- resolution_start
479 | 
480 |   # set up multicore for FindMarkers
481 |   future::plan("multicore", workers = as.numeric(cores))
482 | 
483 |   first_iteration <- TRUE
484 | 
485 |   while (TRUE) {
486 |     if (verbose) {
487 |       message("####################################################################")
488 |       message(paste("Finding clusters with", stringr::str_to_title(algorithm), "algorithm"))
489 |       message(paste("Resolution param:", resolution_param))
490 |     }
491 | 
492 |     if (algorithm == "louvain") {
493 |       seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
494 |                                                resolution = resolution_param,
495 |                                                verbose = FALSE)
496 |     }
497 | 
498 |     if (algorithm == "leiden") {
499 |       #plan("sequential") # todo log number of cores being used # this is a weird one because leiden has a forked job hanging
500 |       seurat_obj_train <- Seurat::FindClusters(seurat_obj_train,
501 |                                                resolution = resolution_param,
502 |                                                algorithm = 4,
503 |                                                method = "igraph",
504 |                                                verbose = FALSE)
505 |     }
506 | 
507 |     # Reduce resolution for the next iteration of the loop
508 |     resolution_param <- (1 - reduction_percentage) * resolution_param
509 | 
510 |     Seurat::Idents(seurat_obj_test) <- Seurat::Idents(seurat_obj_train) # carry the training clusters over to the held-out test cells
511 | 
512 |     k <- length(levels(Seurat::Idents(seurat_obj_test)))
514 | 
515 |     if (verbose) {
516 |       message("Num clusters:")
517 |       message(k)
518 |     }
519 | 
520 |     countsplit_idents <- levels(Seurat::Idents(seurat_obj_test))
521 | 
522 |     num_selected_matrix <- matrix(nrow = k, ncol = k)
523 | 
524 |     found_no_sign_diff <- FALSE
525 | 
526 |     num_clusters <- length(countsplit_idents)
527 | 
528 | 
529 |     if (verbose) {
530 |       progress_bar_length <- num_clusters * (num_clusters - 1) / 2
531 |       cli::cli_progress_bar("Processing cluster pairs:",
532 |                             total = progress_bar_length,
533 |                             clear = FALSE)
534 |     }
535 | 
536 |     m <- 0
537 |     for (i in 1:num_clusters) {
538 |       for (j in 1:num_clusters) {
539 |         if (j >= i) {
540 |           next
541 |         }
542 | 
543 |         m <- m + 1
544 | 
545 |         if (verbose) {
546 |           cli::cli_progress_update()
547 |         }
548 | 
549 |         markers_selected <- Seurat::FindMarkers(seurat_obj_test,
550 |                                                 ident.1 = countsplit_idents[i],
551 |                                                 ident.2 = countsplit_idents[j])
552 | 
553 |         num_selected <- sum(markers_selected$p_val_adj < 0.05) # Bonferroni-adjusted p-values from FindMarkers
554 | 
555 |         if (num_selected == 0) {
556 |           found_no_sign_diff <- TRUE
557 |           break
558 |         }
559 | 
560 |         num_selected_matrix[i, j] <- num_selected
561 |         num_selected_matrix[j, i] <- num_selected
562 | 
563 |       }
564 |       if (found_no_sign_diff) {
565 |         if (verbose) {
566 |           cli::cli_progress_done()
567 |           message("Found clusters with no significant differences.")
568 |           message("Progressing to next clustering iteration.")
569 |         }
570 |         first_iteration <- FALSE
571 |         break
572 |       }
573 |     }
574 | 
575 |     if (found_no_sign_diff) {
576 |       next
577 |     }
578 |     break
579 |   }
580 | 
581 |   if (first_iteration) {
582 |     warning("Only a single iteration occurred. The inferred cluster labels may be underclustered. To prevent this, you may want to re-run FindClustersCountsplit with a larger starting parameter.")
583 |   }
584 | 
585 | 
586 |   seurat_obj@meta.data$countsplit_clusters <- Seurat::Idents(seurat_obj_test)
587 |   Seurat::Idents(seurat_obj) <- seurat_obj@meta.data$countsplit_clusters
588 | 
589 |   return(seurat_obj)
590 | 
591 | }
--------------------------------------------------------------------------------