├── src
    ├── .gitignore
    ├── Makevars
    ├── Makevars.win
    ├── utils.cpp
    ├── jaccard.cpp
    ├── allocation.cpp
    ├── RcppExports.cpp
    └── optim.cpp
├── .github
    ├── .gitignore
    └── workflows
    │   ├── close-inactive-issues.yml
    │   ├── pkgdown.yaml
    │   └── rhub.yaml
├── vignettes
    └── articles
    │   ├── .gitignore
    │   ├── mouse.Rmd
    │   └── pbmc.Rmd
├── .gitattributes
├── cran-comments.md
├── R
    ├── sysdata.rda
    ├── scregclust-package.R
    ├── RcppExports.R
    ├── plotting.R
    └── utils.R
├── man
    ├── figures
    │   └── overview_fig1A_bg.png
    ├── reset_array.Rd
    ├── coef_ridge.Rd
    ├── available_results.Rd
    ├── kmeanspp_init.Rd
    ├── find_module_sizes.Rd
    ├── remove_empty_modules.Rd
    ├── coef_ols.Rd
    ├── get_avg_num_regulators.Rd
    ├── get_num_final_configs.Rd
    ├── alloc_array.Rd
    ├── get_regulator_list.Rd
    ├── fast_cor.Rd
    ├── cluster_overlap.Rd
    ├── count_table.Rd
    ├── progstr.Rd
    ├── compute_rand_index.Rd
    ├── compute_adjusted_rand_index.Rd
    ├── get_target_gene_modules.Rd
    ├── jaccard_indicator.Rd
    ├── plot_module_count_helper.Rd
    ├── scregclust_format.Rd
    ├── scregclust-package.Rd
    ├── plot_silhouettes.Rd
    ├── kmeanspp.Rd
    ├── coef_nnls.Rd
    ├── plot_regulator_network.Rd
    ├── get_rand_indices.Rd
    ├── jaccard_indicator_comp.Rd
    ├── split_sample.Rd
    ├── coop_lasso.Rd
    └── scregclust.Rd
├── .gitignore
├── pkgdown
    └── assets
    │   └── overview_fig1A_bg.png
├── datasets
    ├── mouse_scregclust.rds
    ├── pbmc_scregclust.rds
    ├── humanKinases.txt
    ├── humanTFs_v2.txt
    ├── humanTFs.txt
    └── humanTFs_v3.txt
├── .clang-format
├── .Rbuildignore
├── tests
    ├── testthat
    │   ├── test-fast-cor.R
    │   └── test-constant-genes.R
    └── testthat.R
├── inst
    └── CITATION
├── scripts
    └── update-sysdata.R
├── index.md
├── _pkgdown.yml
├── NEWS.md
├── README.md
├── DESCRIPTION
└── NAMESPACE


/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/vignettes/articles/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.rds filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | * Removed vignettes
2 | * Addressed CRAN issues
3 | 


--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scmethods/scregclust/HEAD/R/sysdata.rda


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)
3 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)
3 | 


--------------------------------------------------------------------------------
/man/figures/overview_fig1A_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scmethods/scregclust/HEAD/man/figures/overview_fig1A_bg.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | inst/doc
3 | /doc/
4 | /Meta/
5 | docs
6 | CRAN-SUBMISSION
7 | .cache
8 | compile_commands.json
9 | 


--------------------------------------------------------------------------------
/pkgdown/assets/overview_fig1A_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scmethods/scregclust/HEAD/pkgdown/assets/overview_fig1A_bg.png


--------------------------------------------------------------------------------
/datasets/mouse_scregclust.rds:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f357f3aef9f6f48346394458643ff2d345542eed6ac81a53ad9f8ed239caead8
3 | size 7995810
4 | 


--------------------------------------------------------------------------------
/datasets/pbmc_scregclust.rds:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3c92b65257337e04b370addc88f029fc16533db5cda7a9647cbb330d8d95a3ff
3 | size 15045381
4 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: Google
 2 | ColumnLimit: 90
 3 | DerivePointerAlignment: false
 4 | PointerAlignment: Left
 5 | AccessModifierOffset: 0
 6 | NamespaceIndentation: All
 7 | IncludeBlocks: Preserve
 8 | TabWidth: 4
 9 | IndentWidth: 4
10 | UseTab: Always
11 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^datasets$
 2 | ^scripts$
 3 | ^LICENSE\.md$
 4 | \.RData$
 5 | ^_pkgdown\.yml$
 6 | ^docs$
 7 | ^pkgdown$
 8 | ^index\.md$
 9 | ^\.github$
10 | ^\.clang-format$
11 | ^CRAN-SUBMISSION$
12 | ^cran-comments\.md$
13 | \.cache
14 | compile_commands\.json
15 | ^vignettes/articles$
16 | 


--------------------------------------------------------------------------------
/tests/testthat/test-fast-cor.R:
--------------------------------------------------------------------------------
 1 | # fast_cor() has to agree with stats::cor() for column-wise correlations.
 2 | test_that("fast correlation computation", {
 3 |   n_obs <- 200
 4 |   n_targets <- 50
 5 |   n_regulators <- 10
 6 | 
 7 |   # Simulate independent standard-normal data, one variable per column
 8 |   targets <- matrix(rnorm(n_targets * n_obs), ncol = n_targets)
 9 |   regulators <- matrix(rnorm(n_regulators * n_obs), ncol = n_regulators)
10 | 
11 |   reference <- cor(targets, regulators)
12 |   candidate <- fast_cor(targets, regulators)
13 | 
14 |   # Compared up to testthat's default numerical tolerance
15 |   expect_equal(reference, candidate)
16 | })
17 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(scregclust)
11 | 
12 | test_check("scregclust")
13 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | bibentry( # Citation entry for the article describing the scregclust algorithm
 2 |   bibtype  = "Article",
 3 |   title    = "Reconstructing the regulatory programs underlying the phenotypic plasticity of neural cancers",
 4 |   author   = "Ida Larsson, Felix Held, Gergana Popova, Alper Koc, Soumi Kundu, Rebecka Jörnsten, Sven Nelander",
 5 |   journal  = "Nature Communications",
 6 |   year     = "2024",
 7 |   volume   = "15",
 8 |   number   = "9699",
 9 |   doi      = "10.1038/s41467-024-53954-3"
10 | )
11 | 


--------------------------------------------------------------------------------
/man/reset_array.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{reset_array}
 4 | \alias{reset_array}
 5 | \title{Reset input 3d-array by filling matrix along first dimension}
 6 | \usage{
 7 | reset_array(arr, input)
 8 | }
 9 | \arguments{
10 | \item{arr}{The 3d-array of dimension \verb{n_cl x n_obs x n_genes}}
11 | 
12 | \item{input}{The matrix of size \verb{n_obs x n_genes}}
13 | }
14 | \description{
15 | Reset input 3d-array by filling matrix along first dimension
16 | }
17 | \keyword{internal}
18 | 


--------------------------------------------------------------------------------
/man/coef_ridge.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{coef_ridge}
 4 | \alias{coef_ridge}
 5 | \title{Compute ridge regression coefficients}
 6 | \usage{
 7 | coef_ridge(y, x, lambda)
 8 | }
 9 | \arguments{
10 | \item{y}{Target vector (n x 1)/matrix (n x m)}
11 | 
12 | \item{x}{Design matrix (n x p)}
13 | 
14 | \item{lambda}{Positive parameter for ridge penalty}
15 | }
16 | \value{
17 | Vector of ridge regression coefficients
18 | }
19 | \description{
20 | Compute ridge regression coefficients
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/man/available_results.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{available_results}
 4 | \alias{available_results}
 5 | \title{Extract final configurations into a data frame}
 6 | \usage{
 7 | available_results(obj)
 8 | }
 9 | \arguments{
10 | \item{obj}{An object of class \code{scregclust}}
11 | }
12 | \value{
13 | A \code{\link{data.frame}} containing penalization parameters and
14 | final configurations for those penalizations.
15 | }
16 | \description{
17 | Extract final configurations into a data frame
18 | }
19 | \concept{helpers}
20 | 


--------------------------------------------------------------------------------
/man/kmeanspp_init.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{kmeanspp_init}
 4 | \alias{kmeanspp_init}
 5 | \title{Determine initial centers for the kmeans++ algorithm}
 6 | \usage{
 7 | kmeanspp_init(n_cluster, x = NULL, dm = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{data matrix to be clustered}
11 | 
12 | \item{dm}{distance matrix (between rows of x; of class "dist")}
13 | }
14 | \value{
15 | Row indices of initial cluster centers of x
16 | }
17 | \description{
18 | Determine initial centers for the kmeans++ algorithm
19 | }
20 | \keyword{internal}
21 | 


--------------------------------------------------------------------------------
/man/find_module_sizes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{find_module_sizes}
 4 | \alias{find_module_sizes}
 5 | \title{Determine module sizes}
 6 | \usage{
 7 | find_module_sizes(module, n_modules)
 8 | }
 9 | \arguments{
10 | \item{module}{Vector of module indices}
11 | 
12 | \item{n_modules}{Total number of modules}
13 | }
14 | \value{
15 | A named vector containing the name of the module (its index or
16 | \code{"Noise"}) and the number of elements in that module
17 | }
18 | \description{
19 | Determine module sizes
20 | }
21 | \concept{helpers}
22 | 


--------------------------------------------------------------------------------
/man/remove_empty_modules.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{remove_empty_modules}
 4 | \alias{remove_empty_modules}
 5 | \title{Remove empty modules}
 6 | \usage{
 7 | remove_empty_modules(module)
 8 | }
 9 | \arguments{
10 | \item{module}{Vector of module indices}
11 | }
12 | \value{
13 | The updated vector of module indices with empty modules removed.
14 | }
15 | \description{
16 | Remove empty modules
17 | }
18 | \details{
19 | Only iterates through modules with positive index, leaving the noise
20 | module untouched.
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/man/coef_ols.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{coef_ols}
 4 | \alias{coef_ols}
 5 | \title{Compute OLS coefficients}
 6 | \usage{
 7 | coef_ols(y, x)
 8 | }
 9 | \arguments{
10 | \item{y}{Target vector (n x 1)/matrix (n x m)}
11 | 
12 | \item{x}{Design matrix (n x p)}
13 | }
14 | \value{
15 | Vector of OLS coefficients
16 | }
17 | \description{
18 | If the design matrix has full column-rank, then use the normal
19 | least squares estimate. Otherwise, use the Moore-Penrose inverse
20 | to compute the least squares estimate.
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/man/get_avg_num_regulators.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_avg_num_regulators}
 4 | \alias{get_avg_num_regulators}
 5 | \title{Get the average number of active regulators per module}
 6 | \usage{
 7 | get_avg_num_regulators(fit)
 8 | }
 9 | \arguments{
10 | \item{fit}{An object of class \code{scregclust}}
11 | }
12 | \value{
13 | A \code{\link{data.frame}} containing the average number of active regulators
14 | per module for each penalization parameter.
15 | }
16 | \description{
17 | Get the average number of active regulators per module
18 | }
19 | \concept{utilities}
20 | 


--------------------------------------------------------------------------------
/man/get_num_final_configs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_num_final_configs}
 4 | \alias{get_num_final_configs}
 5 | \title{Return the number of final configurations}
 6 | \usage{
 7 | get_num_final_configs(fit)
 8 | }
 9 | \arguments{
10 | \item{fit}{An object of class \code{scregclust}}
11 | }
12 | \value{
13 | An integer vector containing the number of final configurations
14 | for each penalization parameter.
15 | }
16 | \description{
17 | Returns the number of final configurations per penalization parameter in an
18 | \code{scregclust} object.
19 | }
20 | \concept{utilities}
21 | 


--------------------------------------------------------------------------------
/man/alloc_array.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{alloc_array}
 4 | \alias{alloc_array}
 5 | \title{Allocate 3d-array and fill with matrix along first dimension}
 6 | \usage{
 7 | alloc_array(input, n_cl)
 8 | }
 9 | \arguments{
10 | \item{input}{the matrix of size \verb{n_obs x n_genes}}
11 | 
12 | \item{n_cl}{the size of the three-dimensional array's first dimension}
13 | }
14 | \value{
15 | The allocated and filled array of size \verb{n_cl x n_obs x n_genes}
16 | }
17 | \description{
18 | Allocate 3d-array and fill with matrix along first dimension
19 | }
20 | \keyword{internal}
21 | 


--------------------------------------------------------------------------------
/man/get_regulator_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scregclust.R
 3 | \name{get_regulator_list}
 4 | \alias{get_regulator_list}
 5 | \title{Return list of regulator genes}
 6 | \usage{
 7 | get_regulator_list(mode = c("TF", "kinase"))
 8 | }
 9 | \arguments{
10 | \item{mode}{Determines which genes are considered to be regulators.
11 | Currently supports TF=transcription factors and kinases.}
12 | }
13 | \value{
14 | a list of gene symbols
15 | }
16 | \description{
17 | Return list of regulator genes
18 | }
19 | \seealso{
20 | \code{\link[=scregclust_format]{scregclust_format()}}
21 | }
22 | \concept{utilities}
23 | 


--------------------------------------------------------------------------------
/man/fast_cor.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{fast_cor}
 4 | \alias{fast_cor}
 5 | \title{Fast computation of correlation}
 6 | \usage{
 7 | fast_cor(x, y)
 8 | }
 9 | \arguments{
10 | \item{x}{first input matrix}
11 | 
12 | \item{y}{second input matrix}
13 | }
14 | \value{
15 | Correlation matrix between the columns of \code{x} and \code{y}
16 | }
17 | \description{
18 | This uses a more memory-intensive but much faster algorithm than
19 | the built-in \code{cor} function.
20 | }
21 | \details{
22 | Computes the correlation between the columns of \code{x} and \code{y}.
23 | }
24 | \concept{helpers}
25 | 


--------------------------------------------------------------------------------
/man/cluster_overlap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{cluster_overlap}
 4 | \alias{cluster_overlap}
 5 | \title{Create a table of module overlap for two clusterings}
 6 | \usage{
 7 | cluster_overlap(k1, k2)
 8 | }
 9 | \arguments{
10 | \item{k1}{First clustering}
11 | 
12 | \item{k2}{Second clustering}
13 | }
14 | \value{
15 | A matrix showing the module overlap with the labels of \code{k1} in
16 | the columns and the labels of \code{k2} in the rows.
17 | }
18 | \description{
19 | Compares two clusterings and creates a table of overlap between them.
20 | Module labels do not have to match.
21 | }
22 | \concept{helpers}
23 | 


--------------------------------------------------------------------------------
/tests/testthat/test-constant-genes.R:
--------------------------------------------------------------------------------
 1 | test_that("constant genes are discarded correctly", {
 2 |   n_cells <- 100
 3 |   genesymbols <- c("T1", "T2", "T3", "T4", "T5", "T6", "R1", "R2")
 4 |   is_regulator <- c(0, 0, 0, 0, 0, 0, 1, 1)
 5 | 
 6 |   # One gene per row: T1 (row 1) and R1 (row 7) are constant across all cells
 7 |   expression <- rbind(
 8 |     rep.int(1, n_cells),
 9 |     matrix(rnorm(5 * n_cells), nrow = 5),
10 |     rep.int(0.5, n_cells),
11 |     rnorm(n_cells)
12 |   )
13 | 
14 |   fit <- scregclust(
15 |     expression, genesymbols, is_regulator, 0.1, 2, verbose = FALSE
16 |   )
17 |   first_result <- fit$results[[1]]
18 | 
19 |   # Only the two constant genes may be missing from the fitted results
20 |   kept_symbols <- c("T2", "T3", "T4", "T5", "T6", "R2")
21 |   kept_regulator_flags <- c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)
22 |   expect_equal(first_result$genesymbols, kept_symbols)
23 |   expect_equal(first_result$is_regulator, kept_regulator_flags)
24 | })
25 | 


--------------------------------------------------------------------------------
/man/count_table.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{count_table}
 4 | \alias{count_table}
 5 | \title{Format count table nicely}
 6 | \usage{
 7 | count_table(counts, title, row_names, col_width = 5)
 8 | }
 9 | \arguments{
10 | \item{counts}{a list of count vectors with \code{1 + n_cl} entries each.
11 | \code{NA} values are replaced with \code{-}}
12 | 
13 | \item{title}{title above the table}
14 | 
15 | \item{row_names}{a vector of row names, one for each count vector}
16 | 
17 | \item{col_width}{minimum width for columns}
18 | }
19 | \value{
20 | A string formatted as a table
21 | }
22 | \description{
23 | Format count table nicely
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/man/progstr.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{progstr}
 4 | \alias{progstr}
 5 | \title{Quick'n'dirty progress bar}
 6 | \usage{
 7 | progstr(step, n_steps, name, finished = FALSE, progress_length = 20L)
 8 | }
 9 | \arguments{
10 | \item{step}{current step being worked on}
11 | 
12 | \item{n_steps}{total number of steps}
13 | 
14 | \item{name}{name of the process}
15 | 
16 | \item{finished}{whether the process is finished}
17 | 
18 | \item{progress_length}{length of the progress bar in ASCII characters}
19 | }
20 | \value{
21 | A string formatted as a progress bar
22 | }
23 | \description{
24 | Creates a progress bar and returns it as a string.
25 | }
26 | \keyword{internal}
27 | 


--------------------------------------------------------------------------------
/man/compute_rand_index.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{compute_rand_index}
 4 | \alias{compute_rand_index}
 5 | \title{Compute the Rand index}
 6 | \usage{
 7 | compute_rand_index(k1, k2)
 8 | }
 9 | \arguments{
10 | \item{k1}{First clustering as vector of integers}
11 | 
12 | \item{k2}{Second clustering as vector of integers}
13 | }
14 | \value{
15 | The Rand index as a numeric value
16 | }
17 | \description{
18 | Compute the Rand index
19 | }
20 | \references{
21 | W. M. Rand (1971). "Objective criteria for the evaluation of clustering
22 | methods". Journal of the American Statistical Association 66 (336): 846–850.
23 | DOI:10.2307/2284239
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/scripts/update-sysdata.R:
--------------------------------------------------------------------------------
 1 | # Regenerate R/sysdata.rda from the gene lists stored in datasets/.
 2 | # The relative paths below are resolved against the current working directory.
 3 | 
 4 | # Read a headerless CSV file and return its first column
 5 | read_first_column <- function(path) {
 6 |   read.csv(path, header = FALSE)[, 1]
 7 | }
 8 | 
 9 | human_tfs <- read_first_column("datasets/humanTFs.txt")
10 | human_tfs_v2 <- read_first_column("datasets/humanTFs_v2.txt")
11 | human_tfs_v3 <- read_first_column("datasets/humanTFs_v3.txt")
12 | human_kinases <- read_first_column("datasets/humanKinases.txt")
13 | # NOTE(review): datasets/humanRegulators.txt is not among the files listed
14 | # for datasets/ in this repository -- confirm it exists before running.
15 | human_regulators <- read_first_column("datasets/humanRegulators.txt")
16 | 
17 | # Store all gene lists as internal package data (R/sysdata.rda)
18 | usethis::use_data(
19 |   human_tfs,
20 |   human_tfs_v2,
21 |   human_tfs_v3,
22 |   human_kinases,
23 |   human_regulators,
24 |   internal = TRUE,
25 |   overwrite = TRUE
26 | )
27 | 
28 | # Use them with e.g. scregclust:::human_tfs
29 | # Note that there are three `:`


--------------------------------------------------------------------------------
/man/compute_adjusted_rand_index.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{compute_adjusted_rand_index}
 4 | \alias{compute_adjusted_rand_index}
 5 | \title{Compute Hubert's and Arabie's Adjusted Rand index}
 6 | \usage{
 7 | compute_adjusted_rand_index(k1, k2)
 8 | }
 9 | \arguments{
10 | \item{k1}{First clustering as vector of integers}
11 | 
12 | \item{k2}{Second clustering as vector of integers}
13 | }
14 | \value{
15 | The Adjusted Rand index as a numeric value
16 | }
17 | \description{
18 | Compute Hubert's and Arabie's Adjusted Rand index
19 | }
20 | \references{
21 | Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions".
22 | Journal of Classification. 2 (1): 193–218. DOI:10.1007/BF01908075
23 | }
24 | \keyword{internal}
25 | 


--------------------------------------------------------------------------------
/.github/workflows/close-inactive-issues.yml:
--------------------------------------------------------------------------------
 1 | name: Close inactive issues
 2 | on:
 3 |   schedule:
 4 |     - cron: "30 1 * * *"
 5 | 
 6 | jobs:
 7 |   close-issues:
 8 |     runs-on: ubuntu-latest
 9 |     permissions:
10 |       issues: write
11 |       pull-requests: write
12 |     steps:
13 |       - uses: actions/stale@v5
14 |         with:
15 |           days-before-issue-stale: 30
16 |           days-before-issue-close: 30
17 |           stale-issue-label: "stale"
18 |           stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
19 |           close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
20 |           days-before-pr-stale: -1
21 |           days-before-pr-close: -1
22 |           repo-token: ${{ secrets.GITHUB_TOKEN }}
23 | 


--------------------------------------------------------------------------------
/man/get_target_gene_modules.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_target_gene_modules}
 4 | \alias{get_target_gene_modules}
 5 | \title{Extract target gene modules for given penalization parameters}
 6 | \usage{
 7 | get_target_gene_modules(fit, penalization = NULL)
 8 | }
 9 | \arguments{
10 | \item{fit}{An object of class \code{scregclust}}
11 | 
12 | \item{penalization}{A numeric vector of penalization parameters.
13 | The penalization parameters specified here must have
14 | been used during fitting of the \code{fit} object.}
15 | }
16 | \value{
17 | A list of lists of final target modules. One list for each
18 | parameter in \code{penalization}. The lists contain the modules of
19 | target genes for each final configuration.
20 | }
21 | \description{
22 | Extract target gene modules for given penalization parameters
23 | }
24 | \concept{utilities}
25 | 


--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
 1 | # Single-cell Regulatory-driven Clustering (scregclust)
 2 | 
 3 | <!-- badges: start -->
 4 | 
 5 | <!-- badges: end -->
 6 | 
 7 | ![A schematic overview of the steps involved in the scregclust algorithm.](overview_fig1A_bg.png "Overview of the scregclust algorithm")
 8 | 
 9 | ## Introduction
10 | 
11 | The goal of *scregclust* is to cluster genes by regulatory programs. To do so, genes are clustered into modules which in turn are associated with regulators. The algorithm alternates between associating regulators to modules and reallocating target genes into modules.
12 | 
13 | A detailed description of the algorithm and an in-depth evaluation of its properties can be found in our original research article [Larsson, Held, et al. (2024) Reconstructing the regulatory programs underlying the phenotypic plasticity of neural cancers. Nature Communications 15, 9699 DOI 10.1038/s41467-024-53954-3](https://doi.org/10.1038/s41467-024-53954-3)


--------------------------------------------------------------------------------
/man/jaccard_indicator.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{jaccard_indicator}
 4 | \alias{jaccard_indicator}
 5 | \title{Compute indicator matrix of pairwise distances smaller than threshold}
 6 | \usage{
 7 | jaccard_indicator(x, upper_bnd = 0.8)
 8 | }
 9 | \arguments{
10 | \item{x}{the input matrix with vectors to be compared in the rows.}
11 | 
12 | \item{upper_bnd}{pairs with a Jaccard distance below this upper bound are
13 | returned as 1 while all others receive the entry 0.}
14 | }
15 | \value{
16 | A list of vectors describing a sparse lower triangular pattern matrix
17 | \item{i}{Row indices}
18 | \item{j}{Column indices}
19 | }
20 | \description{
21 | Computes the Jaccard distance between rows of a matrix and returns a
22 | sparse symmetric indicator matrix containing the entries with a distance
23 | of less than a given upper bound. Note that the diagonal is always 1.
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/man/plot_module_count_helper.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotting.R
 3 | \name{plot_module_count_helper}
 4 | \alias{plot_module_count_helper}
 5 | \title{Plot average silhouette scores and average predictive \eqn{R^2}}
 6 | \usage{
 7 | plot_module_count_helper(list_of_fits, penalization)
 8 | }
 9 | \arguments{
10 | \item{list_of_fits}{A list of \code{scregclust} objects each fit to the same
11 | dataset across a variety of module counts (varying
12 | \code{n_modules} while running \code{\link{scregclust}}).}
13 | 
14 | \item{penalization}{Either a single numeric value requesting the results
15 | for the same penalty parameter across all fits in
16 | \code{list_of_fits}, or one for each individual fit.}
17 | }
18 | \value{
19 | A ggplot2 plot showing the average silhouette score and the
20 | average predictive \eqn{R^2}
21 | }
22 | \description{
23 | Plot average silhouette scores and average predictive \eqn{R^2}
24 | }
25 | \concept{plotting}
26 | 


--------------------------------------------------------------------------------
/man/scregclust_format.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scregclust.R
 3 | \name{scregclust_format}
 4 | \alias{scregclust_format}
 5 | \title{Package data before clustering}
 6 | \usage{
 7 | scregclust_format(expression_matrix, mode = c("TF", "kinase"))
 8 | }
 9 | \arguments{
10 | \item{expression_matrix}{The p x n gene expression matrix with gene symbols
11 | as rownames.}
12 | 
13 | \item{mode}{Determines which genes are considered to be regulators.}
14 | }
15 | \value{
16 | A list with
17 | \item{genesymbols}{The gene symbols extracted from the expression matrix}
18 | \item{sample_assignment}{A vector filled with \code{1}'s of the same length as
19 | there are columns in the gene expression matrix.}
20 | \item{is_regulator}{Whether a gene is considered to be a regulator or not,
21 | determined dependent on \code{mode}.}
22 | }
23 | \description{
24 | Package data before clustering
25 | }
26 | \seealso{
27 | \code{\link[=get_regulator_list]{get_regulator_list()}}
28 | }
29 | \concept{main}
30 | 


--------------------------------------------------------------------------------
/man/scregclust-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scregclust-package.R
 3 | \docType{package}
 4 | \name{scregclust-package}
 5 | \alias{scregclust-package}
 6 | \title{scregclust: Reconstructing the Regulatory Programs of Target Genes in scRNA-Seq Data}
 7 | \description{
 8 | Implementation of the scregclust algorithm described in Larsson, Held, et al. (2024) \doi{10.1038/s41467-024-53954-3} which reconstructs regulatory programs of target genes in scRNA-seq data. Target genes are clustered into modules and each module is associated with a linear model describing the regulatory program.
 9 | }
10 | \details{
11 | Computational methods for the scregclust algorithm
12 | }
13 | \seealso{
14 | Useful links:
15 | \itemize{
16 |   \item \url{https://scmethods.github.io/scregclust/}
17 |   \item \url{https://github.com/scmethods/scregclust/}
18 |   \item Report bugs at \url{https://github.com/scmethods/scregclust/issues}
19 | }
20 | 
21 | }
22 | \author{
23 | Ida Larsson, Felix Held, Sven Nelander
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/R/scregclust-package.R:
--------------------------------------------------------------------------------
 1 | #' @details
 2 | #' Computational methods for the scregclust algorithm
 3 | #' @keywords internal
 4 | #' @aliases scregclust-package
 5 | #' @author Ida Larsson, Felix Held, Sven Nelander
 6 | #' @import Rcpp
 7 | #' @import cli
 8 | #' @import ggplot2
 9 | #' @importFrom rlang .data
10 | #' @importFrom prettyunits pretty_dt
11 | #' @importFrom Matrix Matrix sparseMatrix Diagonal t rowSums colSums summary
12 | #' @importFrom stats cor coef predict na.omit kmeans quantile sd dist setNames
13 | #' @importFrom utils read.table head tail globalVariables
14 | #' @importFrom graphics legend
15 | #' @importFrom methods is as
16 | #' @importFrom reshape melt
17 | #' @importFrom igraph graph_from_data_frame delete_edges delete_vertices layout_with_fr V E degree
18 | #' @importFrom grid arrow unit
19 | #' @useDynLib scregclust, .registration = TRUE
20 | "_PACKAGE"
21 | 
22 | .onUnload <- function(libpath) {
23 |   library.dynam.unload(chname = "scregclust", libpath = libpath)
24 | }
25 | 
26 | # Declare "." and "variable" as known globals to quiet `R CMD check` notes
27 | utils::globalVariables(c(".", "variable"))
28 | 


--------------------------------------------------------------------------------
/man/plot_silhouettes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotting.R
 3 | \name{plot_silhouettes}
 4 | \alias{plot_silhouettes}
 5 | \title{Plot individual silhouette scores}
 6 | \usage{
 7 | plot_silhouettes(list_of_fits, penalization, final_config = 1L)
 8 | }
 9 | \arguments{
10 | \item{list_of_fits}{A list of \code{scregclust} objects each fit to the same
11 | dataset across a variety of module counts (varying
12 | \code{n_modules} when running \code{\link{scregclust}}).}
13 | 
14 | \item{penalization}{Either a single numeric value requesting the results
15 | for the same penalty parameter across all fits in
16 | \code{list_of_fits}, or one for each individual fit.}
17 | 
18 | \item{final_config}{The final configuration that should be visualized.
19 | Either a single number to be used for all fits in
20 | \code{list_of_fits}, or one for each individual fit.}
21 | }
22 | \value{
23 | A ggplot2 plot showing the silhouette scores for each
24 | supplied fit.
25 | }
26 | \description{
27 | Plot individual silhouette scores
28 | }
29 | \concept{plotting}
30 | 


--------------------------------------------------------------------------------
/man/kmeanspp.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{kmeanspp}
 4 | \alias{kmeanspp}
 5 | \title{Perform the k-means++ algorithm}
 6 | \usage{
 7 | kmeanspp(x, n_cluster, n_init_clusterings = 10L, n_max_iter = 10L)
 8 | }
 9 | \arguments{
10 | \item{x}{Input matrix (n x p)}
11 | 
12 | \item{n_cluster}{Number of clusters}
13 | 
14 | \item{n_init_clusterings}{Number of repeated random initializations
15 | to perform}
16 | 
17 | \item{n_max_iter}{Number of maximum iterations to perform in the k-means
18 | algorithm}
19 | }
20 | \value{
21 | An object of class \code{\link[stats:kmeans]{stats::kmeans}}.
22 | }
23 | \description{
24 | Performs the k-means++ algorithm to cluster the rows of the input matrix.
25 | }
26 | \details{
27 | Estimation is repeated \code{n_init_clusterings} times, each time starting from a new random initialization.
28 | }
29 | \references{
30 | David Arthur and Sergei Vassilvitskii. K-Means++: The advantages
31 | of careful seeding. In Proceedings of the Eighteenth Annual ACM-SIAM
32 | Symposium on Discrete Algorithms, SODA '07, pages 1027–1035.
33 | Society for Industrial and Applied Mathematics, 2007.
34 | }
35 | \concept{helpers}
36 | 


--------------------------------------------------------------------------------
/man/coef_nnls.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{coef_nnls}
 4 | \alias{coef_nnls}
 5 | \title{Compute NNLS coefficients}
 6 | \usage{
 7 | coef_nnls(x, y, eps = 1e-12, max_iter = 1000L)
 8 | }
 9 | \arguments{
10 | \item{x}{Coefficient matrix (p x n matrix)}
11 | 
12 | \item{y}{Right hand side (p x m matrix)}
13 | 
14 | \item{eps}{Convergence tolerance}
15 | 
16 | \item{max_iter}{Maximum number of iterations}
17 | }
18 | \value{
19 | A list containing
20 | \item{beta}{The estimated coefficient matrix}
21 | \item{iterations}{A vector containing the number of iterations needed
22 | for the \code{i}-th column in \code{y} in the \code{i}-th entry.}
23 | }
24 | \description{
25 | Computes non-negative least squares coefficients with a matrix
26 | right hand side.
27 | }
28 | \references{
29 | Duy Khuong Nguyen and Tu Bao Ho. Accelerated anti-lopsided algorithm
30 | for nonnegative least squares. International Journal of Data Science
31 | and Analytics, 3(1):23–34, 2017.
32 | 
33 | Adapted from \url{https://github.com/khuongnd/nnls_antilopsided}
34 | }
35 | \keyword{internal}
36 | 


--------------------------------------------------------------------------------
/datasets/humanKinases.txt:
--------------------------------------------------------------------------------
 1 | MYLK
 2 | CRIM1
 3 | CAMK2G
 4 | DGKB
 5 | PRKG1
 6 | ROCK1
 7 | CCND1
 8 | PDGFRB
 9 | PLK2
10 | PDGFRL
11 | PKDCC
12 | CIT
13 | CALM2
14 | DTYMK
15 | AURKA
16 | AURKB
17 | BUB1B
18 | CDK1
19 | PBK
20 | TTK
21 | BUB1
22 | CCNB1
23 | CCNB2
24 | NEK2
25 | PLK1
26 | TK1
27 | WEE1
28 | CCNE1
29 | UGP2
30 | HSP90AA1
31 | CAMK2A
32 | EPHA4
33 | FGFR3
34 | LRRK2
35 | NTRK3
36 | PHKG1
37 | TNK2
38 | PIM3
39 | DGKI
40 | ERBB4
41 | CDK6
42 | EPHA3
43 | NRBP2
44 | RPS6KA2
45 | AK4
46 | MAP3K1
47 | PDGFRA
48 | CKB
49 | KIT
50 | SGK1
51 | STK32A
52 | REV3L
53 | MET
54 | TGFBR2
55 | DCLK1
56 | EFEMP1
57 | ABL2
58 | CCL2
59 | CDKN1A
60 | EPHA2
61 | FAM20C
62 | HMGA2
63 | LTBP1
64 | MAP4K4
65 | NRP1
66 | PAK3
67 | DCX
68 | DDR2
69 | KALRN
70 | TRIB3
71 | AXL
72 | FGFR1
73 | SQSTM1
74 | HBEGF
75 | IRS2
76 | NRP2
77 | STK17A
78 | TRIB1
79 | PGM2L1
80 | ITPKC
81 | PFKFB3
82 | PIM1
83 | PLK3
84 | RASSF2
85 | EGFR
86 | CAMK2B
87 | DCLK2
88 | NTRK2
89 | PDGFA
90 | TGFB2
91 | DDR1
92 | MAPK10
93 | SOX9
94 | TRIO
95 | TGFBR3
96 | SPHK1
97 | AATK
98 | CDK18
99 | ERBB3


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | url: https://scmethods.github.io/scregclust/
 2 | template:
 3 |   bootstrap: 5
 4 | navbar:
 5 |   title: scregclust
 6 |   left:
 7 |     - text: Reference
 8 |       href: reference/index.html
 9 |     - text: Articles
10 |       menu:
11 |       - text: Demonstration of workflow
12 |         href: articles/pbmc.html
13 |       - text: Custom regulator list
14 |         href: articles/mouse.html
15 |   right:
16 |     - icon: fa-github
17 |       href: https://github.com/scmethods/scregclust
18 |       aria-label: GitHub
19 | reference:
20 | - title: Setting up and performing clustering
21 |   desc: |
22 |     Functions to prepare the input data and to perform single-cell regulatory-driven clustering.
23 | - contents:
24 |   - has_concept("main") 
25 | - title: Plotting and evaluation
26 |   desc: |
27 |     Functions which help in plotting and evaluating results.
28 | - contents:
29 |   - has_concept("plotting")
30 | - title: Utility functions
31 |   desc: |
32 |     Functions that make accessing aspects of the results easier.
33 | - contents:
34 |   - has_concept("utilities")
35 | - title: Other helpers
36 | - contents:
37 |   - has_concept("helpers")
38 | 


--------------------------------------------------------------------------------
/man/plot_regulator_network.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotting.R
 3 | \name{plot_regulator_network}
 4 | \alias{plot_regulator_network}
 5 | \title{Plotting the regulatory table from scregclust as a directed graph}
 6 | \usage{
 7 | plot_regulator_network(
 8 |   output,
 9 |   arrow_size = 0.3,
10 |   edge_scaling = 30,
11 |   no_links = 6,
12 |   col = c("gray80", "#FC7165", "#BD828C", "#9D8A9F", "#7D92B2", "#BDA88C", "#FCBD65",
13 |     "#F2BB90", "#E7B9BA", "#BDB69C", "#92B27D", "#9B8BA5", "#9D7DB2", "#94A5BF")
14 | )
15 | }
16 | \arguments{
17 | \item{output}{Object of type \code{scregclust_output} from a fit of the
18 | scregclust algorithm.}
19 | 
20 | \item{arrow_size}{Size of arrow head}
21 | 
22 | \item{edge_scaling}{Scaling factor for edge width}
23 | 
24 | \item{no_links}{Threshold value (0-10) for number of edges to show,
25 | higher value = more stringent threshold = fewer edges}
26 | 
27 | \item{col}{color}
28 | }
29 | \value{
30 | Graph with gene modules and regulators as nodes
31 | }
32 | \description{
33 | Plotting the regulatory table from scregclust as a directed graph
34 | }
35 | \concept{plotting}
36 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # scregclust 0.2.2
 2 | 
 3 | - Vignettes relied heavily on downloading data. Make vignettes articles
 4 |   that appear on the website but not in the actual package, since it is expected
 5 |   that anybody profiting from the vignettes will have an active internet
 6 |   connection anyway.
 7 | 
 8 | # scregclust 0.2.1
 9 | 
10 | - Added a vignette illustrating how to supply your own regulator list
11 | 
12 | # scregclust 0.2.0-1
13 | 
14 | - Regulator importance was computed unnecessarily if there is only one
15 |   regulator. A single regulator is always the most important regulator
16 |   for a cluster.
17 | 
18 | # scregclust 0.2.0
19 | 
20 | ## New features
21 | 
22 | - Quick Mode: Instead of trying to re-allocate all target genes that were
23 |   allocated into the noise cluster, only a certain (random) percentage of
24 |   these target genes is attempted to be re-allocated.
25 |   
26 |   `quick_mode = TRUE` has to be supplied as an argument to `scregclust` to
27 |   activate this feature (off by default) and the percentage of
28 |   noise target genes to re-allocate is given by `quick_mode_percent`,
29 |   a number in [0, 1).
30 | 
31 | ## Minor changes
32 | 
33 | - Added CRAN install instructions to the README
34 | 
35 | # scregclust 0.1.0
36 | 
37 | - First release on CRAN
38 | 


--------------------------------------------------------------------------------
/man/get_rand_indices.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_rand_indices}
 4 | \alias{get_rand_indices}
 5 | \title{Compute Rand indices}
 6 | \usage{
 7 | get_rand_indices(fit, groundtruth, adjusted = TRUE)
 8 | }
 9 | \arguments{
10 | \item{fit}{An object of class \code{scregclust}}
11 | 
12 | \item{groundtruth}{A known clustering of the target genes (integer vector)}
13 | 
14 | \item{adjusted}{If TRUE, the Adjusted Rand index is computed. Otherwise the
15 | ordinary Rand index is computed.}
16 | }
17 | \value{
18 | A \code{\link{data.frame}} containing the Rand indices. Since there can
19 | be more than one final configuration for some penalization
20 | parameters, Rand indices are averaged for each fixed penalization
21 | parameter. Returned are the mean, standard deviation and number
22 | of final configurations that were averaged.
23 | }
24 | \description{
25 | Compute Rand indices for fitted scregclust object
26 | }
27 | \references{
28 | W. M. Rand (1971). "Objective criteria for the evaluation of clustering
29 | methods". Journal of the American Statistical Association 66 (336): 846–850.
30 | DOI:10.2307/2284239
31 | 
32 | Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions".
33 | Journal of Classification. 2 (1): 193–218. DOI:10.1007/BF01908075
34 | }
35 | \concept{utilities}
36 | 


--------------------------------------------------------------------------------
/man/jaccard_indicator_comp.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{jaccard_indicator_comp}
 4 | \alias{jaccard_indicator_comp}
 5 | \title{Perform the computations for thresholded Jaccard distance}
 6 | \usage{
 7 | jaccard_indicator_comp(gs, eps)
 8 | }
 9 | \arguments{
10 | \item{gs}{a list of integer vectors, one for each row, giving the column
11 | indices of the non-zero elements of the row or \code{NULL} if the
12 | whole row is empty.}
13 | 
14 | \item{eps}{an upper bound on the Jaccard distance (\code{1 - eps} becomes a
15 | lower bound on the Jaccard similarity)}
16 | }
17 | \value{
18 | A list with row and column indices in the #row x #row indicator
19 | matrix specifying which rows in the original matrix had a distance
20 | smaller than \code{eps}.
21 | }
22 | \description{
23 | Perform the computations for thresholded Jaccard distance
24 | }
25 | \details{
26 | This function is optimized for sparse matrices and computes the pairwise
27 | Jaccard distances between the rows of the input matrix. Note that the
28 | actual distance is not saved. Instead, a threshold (\code{eps}) is supplied
29 | and an indicator matrix is returned, with a one indicating that the
30 | distance is smaller than \code{eps} (equivalently, the Jaccard similarity
31 | is larger than \code{1 - eps}).
32 | }
33 | \keyword{internal}
34 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 |   release:
 9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown
13 | 
14 | jobs:
15 |   pkgdown:
16 |     runs-on: ubuntu-latest
17 |     # Only restrict concurrency for non-PR jobs
18 |     concurrency:
19 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
20 |     env:
21 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
22 |     permissions:
23 |       contents: write
24 |     steps:
25 |       - uses: actions/checkout@v4
26 | 
27 |       - uses: r-lib/actions/setup-pandoc@v2
28 | 
29 |       - uses: r-lib/actions/setup-r@v2
30 |         with:
31 |           use-public-rspm: true
32 | 
33 |       - uses: r-lib/actions/setup-r-dependencies@v2
34 |         with:
35 |           extra-packages: any::pkgdown, local::.
36 |           needs: website
37 | 
38 |       - name: Build site
39 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
40 |         shell: Rscript {0}
41 | 
42 |       - name: Deploy to GitHub pages 🚀
43 |         if: github.event_name != 'pull_request'
44 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
45 |         with:
46 |           clean: false
47 |           branch: gh-pages
48 |           folder: docs
49 | 


--------------------------------------------------------------------------------
/man/split_sample.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scregclust.R
 3 | \name{split_sample}
 4 | \alias{split_sample}
 5 | \title{Split Sample}
 6 | \usage{
 7 | split_sample(
 8 |   z,
 9 |   stratification,
10 |   is_regulator,
11 |   split_indices,
12 |   split1_proportion,
13 |   total_proportion,
14 |   center
15 | )
16 | }
17 | \arguments{
18 | \item{z}{matrix of single cell data with rows as genes and columns as cells.}
19 | 
20 | \item{stratification}{a vector by which the sampling will be stratified
21 | of length \code{ncol(z)}}
22 | 
23 | \item{is_regulator}{an indicator vector, telling which rows in \code{z} are
24 | candidate regulators}
25 | 
26 | \item{split_indices}{a vector of given split indices. can be \code{NULL}}
27 | 
28 | \item{split1_proportion}{proportion to include in first data split}
29 | 
30 | \item{total_proportion}{proportion of data to include overall in splitting}
31 | 
32 | \item{center}{TRUE if data should be row-centered. Set to FALSE otherwise.}
33 | }
34 | \value{
35 | a list containing
36 | \item{z1_reg}{first data split, TF-part}
37 | \item{z2_reg}{second data split, TF-part}
38 | \item{z1_target}{first data split, non-TF part}
39 | \item{z2_target}{second data split, non-TF part}
40 | \item{split_indices}{either verbatim the vector given as input or
41 | a vector encoding the splits as NA = not included,
42 | 1 = split 1 or 2 = split 2. Allows reproducibility
43 | of data splits.}
44 | }
45 | \description{
46 | Splits sample in train and test set
47 | }
48 | \keyword{internal}
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Single-cell Regulatory-driven Clustering (scregclust)
 2 | 
 3 | <!-- badges: start -->
 4 | 
 5 | [![CRAN status](https://www.r-pkg.org/badges/version/scregclust)](https://CRAN.R-project.org/package=scregclust)
 6 | <!-- badges: end -->
 7 | 
 8 | ![A diagram illustrating the *scregclust* algorithm.](man/figures/overview_fig1A_bg.png "Illustration of the scregclust algorithm")
 9 | 
10 | The goal of *scregclust* is to cluster genes by regulatory programs. To do so, genes are clustered into modules which in turn are associated with regulators. The algorithm alternates between associating regulators to modules and reallocating target genes into modules.
11 | 
12 | - The documentation for this package can be found at [https://scmethods.github.io/scregclust/](https://scmethods.github.io/scregclust/)
13 | - A detailed description of the algorithm and an in-depth evaluation of its properties can be found in our original research article [Larsson, Held, et al. (2024) Reconstructing the regulatory programs underlying the phenotypic plasticity of neural cancers. Nature Communications 15, 9699 DOI 10.1038/s41467-024-53954-3](https://doi.org/10.1038/s41467-024-53954-3)
14 | 
15 | ## Installation
16 | 
17 | You can install the stable version of *scregclust* from [CRAN](https://cran.r-project.org/package=scregclust) with
18 | 
19 | ```r
20 | install.packages("scregclust")
21 | ```
22 | 
23 | You can install the current development version of *scregclust* from [GitHub](https://github.com/scmethods/scregclust) with:
24 | 
25 | ```r
26 | # install.packages("devtools")
27 | devtools::install_github("scmethods/scregclust")
28 | ```
29 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: scregclust
 2 | Title: Reconstructing the Regulatory Programs of Target Genes in scRNA-Seq Data
 3 | Version: 0.2.2
 4 | Authors@R: c(
 5 |     person("Felix", "Held", ,"felix.held@gmail.com", role = c("aut", "cre"),
 6 |            comment = c(ORCID = "0000-0002-7679-7752")),
 7 |     person("Ida", "Larsson", ,"ida.larsson@igp.uu.se", role = c("aut"),
 8 |            comment = c(ORCID = "0000-0001-5422-4243")),
 9 |     person("Sven", "Nelander", ,"sven.nelander@igp.uu.se", role = c("aut"),
10 |            comment = c(ORCID = "0000-0003-1758-1262")),
11 |     person("André", "Armatowski", role = c("ctb")))
12 | Description: Implementation of the scregclust algorithm
13 |   described in Larsson, Held, et al. (2024) <doi:10.1038/s41467-024-53954-3>
14 |   which reconstructs regulatory programs of target genes in scRNA-seq data.
15 |   Target genes are clustered into modules and each module is associated with a linear
16 |   model describing the regulatory program.
17 | Encoding: UTF-8
18 | Depends: R (>= 4.1.0)
19 | Imports:
20 |     Matrix,
21 |     stats,
22 |     methods,
23 |     utils,
24 |     reshape,
25 |     igraph,
26 |     graphics,
27 |     grid,
28 |     cli,
29 |     prettyunits,
30 |     ggplot2,
31 |     rlang,
32 |     Rcpp (>= 1.0.8)
33 | Suggests:
34 |     testthat (>= 3.0.0),
35 |     hdf5r,
36 |     glmGamPoi,
37 |     Seurat,
38 |     GEOquery
39 | LinkingTo: Rcpp, RcppEigen
40 | Roxygen: list(markdown = TRUE)
41 | RoxygenNote: 7.3.2
42 | License: GPL (>= 3)
43 | Config/testthat/edition: 3
44 | URL: https://scmethods.github.io/scregclust/, https://github.com/scmethods/scregclust/ 
45 | BugReports: https://github.com/scmethods/scregclust/issues
46 | Config/Needs/website: rmarkdown
47 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(format,scregclust)
 4 | S3method(format,scregclust_output)
 5 | S3method(format,scregclust_result)
 6 | S3method(plot,scregclust)
 7 | S3method(print,scregclust)
 8 | S3method(print,scregclust_output)
 9 | S3method(print,scregclust_result)
10 | export(available_results)
11 | export(cluster_overlap)
12 | export(fast_cor)
13 | export(find_module_sizes)
14 | export(get_avg_num_regulators)
15 | export(get_num_final_configs)
16 | export(get_rand_indices)
17 | export(get_regulator_list)
18 | export(get_target_gene_modules)
19 | export(kmeanspp)
20 | export(plot_module_count_helper)
21 | export(plot_regulator_network)
22 | export(plot_silhouettes)
23 | export(scregclust)
24 | export(scregclust_format)
25 | import(Rcpp)
26 | import(cli)
27 | import(ggplot2)
28 | importFrom(Matrix,Diagonal)
29 | importFrom(Matrix,Matrix)
30 | importFrom(Matrix,colSums)
31 | importFrom(Matrix,rowSums)
32 | importFrom(Matrix,sparseMatrix)
33 | importFrom(Matrix,summary)
34 | importFrom(Matrix,t)
35 | importFrom(graphics,legend)
36 | importFrom(grid,arrow)
37 | importFrom(grid,unit)
38 | importFrom(igraph,E)
39 | importFrom(igraph,V)
40 | importFrom(igraph,degree)
41 | importFrom(igraph,delete_edges)
42 | importFrom(igraph,delete_vertices)
43 | importFrom(igraph,graph_from_data_frame)
44 | importFrom(igraph,layout_with_fr)
45 | importFrom(methods,as)
46 | importFrom(methods,is)
47 | importFrom(prettyunits,pretty_dt)
48 | importFrom(reshape,melt)
49 | importFrom(rlang,.data)
50 | importFrom(stats,coef)
51 | importFrom(stats,cor)
52 | importFrom(stats,dist)
53 | importFrom(stats,kmeans)
54 | importFrom(stats,na.omit)
55 | importFrom(stats,predict)
56 | importFrom(stats,quantile)
57 | importFrom(stats,sd)
58 | importFrom(stats,setNames)
59 | importFrom(utils,globalVariables)
60 | importFrom(utils,head)
61 | importFrom(utils,read.table)
62 | importFrom(utils,tail)
63 | useDynLib(scregclust, .registration = TRUE)
64 | 


--------------------------------------------------------------------------------
/man/coop_lasso.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{coop_lasso}
 4 | \alias{coop_lasso}
 5 | \title{ADMM algorithm for solving the group-penalized least squares problem}
 6 | \usage{
 7 | coop_lasso(
 8 |   y,
 9 |   x,
10 |   lambda,
11 |   weights,
12 |   beta_0 = NULL,
13 |   rho_0 = 0.2,
14 |   alpha_0 = 1.5,
15 |   n_update = 2L,
16 |   eps_corr = 0.2,
17 |   max_iter = 1000L,
18 |   eps_rel = 1e-08,
19 |   eps_abs = 1e-12,
20 |   verbose = FALSE
21 | )
22 | }
23 | \arguments{
24 | \item{y}{Target (n x m)}
25 | 
26 | \item{x}{Design matrix (n x p)}
27 | 
28 | \item{lambda}{Penalization parameter}
29 | 
30 | \item{weights}{A specific weight for each group (typically this is
31 | \verb{sqrt(group size)}).}
32 | 
33 | \item{beta_0}{Initial value for coefficients, allowing for warm start.
34 | Can be set to NULL, which results in the initial \code{beta}
35 | being a zero matrix.}
36 | 
37 | \item{rho_0}{Initial ADMM step-size}
38 | 
39 | \item{alpha_0}{Initial ADMM relaxation parameter}
40 | 
41 | \item{n_update}{Number of steps in-between updates of the
42 | step-size/adaptation parameters}
43 | 
44 | \item{eps_corr}{Lower bound for the correlation in the step-size
45 | update steps}
46 | 
47 | \item{max_iter}{Maximum number of iterations}
48 | 
49 | \item{eps_rel}{Relative tolerance for convergence check}
50 | 
51 | \item{eps_abs}{Absolute tolerance for convergence check}
52 | 
53 | \item{verbose}{Whether or not information about the optimization process
54 | should be printed to the terminal}
55 | }
56 | \value{
57 | A list containing
58 | \item{beta}{The coefficients at convergence}
59 | \item{iterations}{Number of iterations}
60 | }
61 | \description{
62 | Implements estimation of the coop-lasso problem.
63 | }
64 | \references{
65 | Xu et al. (2017) Adaptive relaxed ADMM: Convergence theory and
66 | practical implementation. DOI 10.1109/CVPR.2017.765
67 | }
68 | \keyword{internal}
69 | 


--------------------------------------------------------------------------------
/datasets/humanTFs_v2.txt:
--------------------------------------------------------------------------------
  1 | SORBS2
  2 | CEBPB
  3 | EBF1
  4 | ETS2
  5 | FOXC1
  6 | ID3
  7 | MEF2C
  8 | NR2F2
  9 | NR4A2
 10 | NR4A3
 11 | SMAD7
 12 | ZFHX3
 13 | ZNF90
 14 | IFI16
 15 | HMGA1
 16 | PRRX1
 17 | KLF5
 18 | FBN1
 19 | PLAGL1
 20 | FOXS1
 21 | HMGB3
 22 | DEPDC1
 23 | FOXM1
 24 | MXD3
 25 | HMGB2
 26 | HMGB1
 27 | E2F7
 28 | EZH2
 29 | HIST1H1B
 30 | HIST1H1D
 31 | MYBL1
 32 | DEK
 33 | MYBL2
 34 | E2F1
 35 | H1FX
 36 | CARHSP1
 37 | HIST1H1A
 38 | HIST1H1C
 39 | HIST1H1E
 40 | LHX2
 41 | PAX6
 42 | POU3F2
 43 | SOX11
 44 | ARX
 45 | CHD9
 46 | FOXJ1
 47 | GSX2
 48 | HES5
 49 | INSM1
 50 | NEUROD1
 51 | OSR1
 52 | PBX1
 53 | POU3F4
 54 | PROX1
 55 | SALL3
 56 | SOX21
 57 | ZMAT1
 58 | ZNF117
 59 | CHD7
 60 | H1F0
 61 | HEY2
 62 | JDP2
 63 | MLXIP
 64 | NFATC1
 65 | OSR2
 66 | SEMA4A
 67 | SKIL
 68 | TSC22D3
 69 | ZNF331
 70 | ZNF503
 71 | DBX2
 72 | RORA
 73 | TCF12
 74 | ZIC1
 75 | NFIB
 76 | NR2F1
 77 | PITX1
 78 | RORB
 79 | STAT1
 80 | STAT2
 81 | MEOX2
 82 | ASCL1
 83 | ETV1
 84 | HES6
 85 | NFIA
 86 | OLIG2
 87 | RFX4
 88 | SOX8
 89 | TCF4
 90 | ZEB1
 91 | ZNF704
 92 | HEY1
 93 | MEIS2
 94 | POU3F3
 95 | SOX2
 96 | MITF
 97 | PAX3
 98 | PLXNC1
 99 | SNAI2
100 | EPAS1
101 | MAF
102 | TBX2
103 | MET
104 | PLXNA1
105 | AHR
106 | GLIS3
107 | PAWR
108 | BMP2
109 | DRAP1
110 | ELK3
111 | FOSL1
112 | FOXP1
113 | GTF2F2
114 | HMGA2
115 | HOXB2
116 | ID1
117 | KLF7
118 | NR1D1
119 | PRDM1
120 | RUNX1
121 | TBX3
122 | HES1
123 | HIC1
124 | TWIST1
125 | XBP1
126 | PLXNA4
127 | ARID5B
128 | KLF9
129 | MACF1
130 | EGR3
131 | MYC
132 | NFIL3
133 | NR4A1
134 | ATF3
135 | CREB5
136 | EGR1
137 | EGR2
138 | FOS
139 | FOSB
140 | ID4
141 | JUN
142 | JUNB
143 | JUND
144 | KLF10
145 | KLF2
146 | KLF4
147 | KLF6
148 | MAFF
149 | ZFP36
150 | ZFP36L1
151 | ZFP36L2
152 | DDIT3
153 | FOSL2
154 | IRF1
155 | TIPARP
156 | TSC22D1
157 | HOPX
158 | OLIG1
159 | TSC22D4
160 | DPF3
161 | HES4
162 | ID2
163 | SMAD1
164 | ZBTB20
165 | BAZ2B
166 | FAM171B
167 | SOX9
168 | TSHZ2
169 | ZFHX4
170 | ZMAT3
171 | NFATC2
172 | TFAP2B
173 | TFAP2A
174 | GPR155
175 | POU3F1
176 | RXRG
177 | SOX10
178 | SOX4
179 | SOX6
180 | ZEB2
181 | ZNF536
182 | 


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp/Lightest>
 2 | 
 3 | //' Allocate 3d-array and fill with matrix along first dimension
 4 | //'
 5 | //' @param input the matrix of size `n_obs x n_genes`
 6 | //' @param n_cl the size of the three-dimensional array's first dimension
 7 | //'
 8 | //' @return The allocated and filled array of size `n_cl x n_obs x n_genes`
 9 | //'
10 | //' @keywords internal
11 | // [[Rcpp::export]]
12 | SEXP alloc_array(SEXP input, R_xlen_t n_cl) {
13 | 	const auto n_obs = static_cast<R_xlen_t>(Rf_nrows(input));
14 | 	const auto n_genes = static_cast<R_xlen_t>(Rf_ncols(input));
15 | 	// Cannot overflow for a matrix `input`: its own length is n_obs * n_genes.
16 | 	const auto n_mat = n_obs * n_genes;
17 | 	const double* const pinput = REAL(input);
18 | 	// Check via division first: a signed overflow cannot be detected afterwards.
19 | 	if (n_mat > 0 && n_cl > R_XLEN_T_MAX / n_mat) {
20 | 		Rcpp::stop("alloc_array: requested allocation too large");
21 | 	}
22 | 	const auto n_total = n_cl * n_mat;
23 | 	SEXP arr = PROTECT(Rf_allocVector(REALSXP, n_total));
24 | 	double* const parr = REAL(arr);
25 | 	// Each input entry is written to n_cl consecutive slots of the result.
26 | 	for (R_xlen_t i = 0; i < n_mat; i++) {
27 | 		for (R_xlen_t j = 0; j < n_cl; j++) {
28 | 			parr[i * n_cl + j] = pinput[i];
29 | 		}
30 | 	}
31 | 
32 | 	UNPROTECT(1);
33 | 	return arr;
34 | }
35 | 
36 | //' Reset input 3d-array by filling matrix along first dimension
37 | //'
38 | //' @param arr The 3d-array of dimension `n_cl x n_obs x n_genes`
39 | //' @param input The matrix of size `n_obs x n_genes`
40 | //'
41 | //' @keywords internal
42 | // [[Rcpp::export]]
43 | void reset_array(SEXP arr, SEXP input) {
44 | 	// A missing or too short `dim` attribute would make the reads of
45 | 	// dims[0..2] below invalid, so validate it before touching it.
46 | 	SEXP dim_attr = PROTECT(Rf_getAttrib(arr, R_DimSymbol));
47 | 	if (TYPEOF(dim_attr) != INTSXP || Rf_xlength(dim_attr) < 3) {
48 | 		UNPROTECT(1);
49 | 		Rcpp::stop("reset_array: arr is not a three-dimensional array");
50 | 	}
51 | 	const int* const dims = INTEGER(dim_attr);
52 | 	const auto n_cl = static_cast<R_xlen_t>(dims[0]);
53 | 	const auto n_obs = static_cast<R_xlen_t>(dims[1]);
54 | 	const auto n_genes = static_cast<R_xlen_t>(dims[2]);
55 | 	UNPROTECT(1);
56 | 
57 | 	if (static_cast<R_xlen_t>(Rf_nrows(input)) != n_obs ||
58 | 		static_cast<R_xlen_t>(Rf_ncols(input)) != n_genes) {
59 | 		Rcpp::stop("reset_array: input has wrong dimensions");
60 | 	}
61 | 
62 | 	const double* const pinput = REAL(input);
63 | 	double* const parr = REAL(arr);
64 | 
65 | 	// `arr` is overwritten in place; each entry of `input` is written to
66 | 	// n_cl consecutive slots.
67 | 	for (R_xlen_t i = 0, ub = n_obs * n_genes; i < ub; i++) {
68 | 		for (R_xlen_t j = 0; j < n_cl; j++) {
69 | 			parr[i * n_cl + j] = pinput[i];
70 | 		}
71 | 	}
72 | }


--------------------------------------------------------------------------------
/src/jaccard.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp/Lightest>
 2 | 
 3 | using length_type = std::vector<int>::size_type;
 4 | 
 5 | static length_type len_intersect(const std::vector<int>& x, const std::vector<int>& y) {
 6 | 	// Two-cursor merge walk; the count is only meaningful when both inputs
 7 | 	// are sorted in increasing order, which is not verified here.
 8 | 	auto it_x = x.cbegin();
 9 | 	auto it_y = y.cbegin();
10 | 	length_type n_common = 0;
11 | 	while (it_x != x.cend() && it_y != y.cend()) {
12 | 		if (*it_x == *it_y) {
13 | 			// Shared value: count it once and step past it on both sides.
14 | 			++n_common;
15 | 			++it_x;
16 | 			++it_y;
17 | 		} else if (*it_x < *it_y) {
18 | 			++it_x;
19 | 		} else {
20 | 			++it_y;
21 | 		}
22 | 	}
23 | 	return n_common;
24 | }
25 | 
26 | //' Perform the computations for thresholded Jaccard distance
27 | //'
28 | //' @details
29 | //' This function is optimized for sparse matrices and computes the pairwise
30 | //' Jaccard distances between the rows of the input matrix. Note that the
31 | //' actual distance is not saved. Instead, a threshold (`eps`) is supplied
32 | //' and an indicator matrix is returned, with a one indicating that the
33 | //' distance is smaller than `eps` (equivalently, the Jaccard similarity
34 | //' is larger than `1 - eps`).
35 | //'
36 | //' @param gs a list of integer vectors, one for each row, giving the sorted
37 | //'           column indices of the non-zero elements of the row or `NULL`
38 | //'           if the whole row is empty.
39 | //' @param eps an upper bound on the Jaccard distance (`1 - eps` becomes a
40 | //'            lower bound on the Jaccard similarity)
41 | //'
42 | //' @return A list with 1-based row (`i`) and column (`j`) indices, `i > j`,
43 | //'         into the #row x #row indicator matrix specifying which rows in the
44 | //'         original matrix had a distance smaller than `eps`.
45 | //'
46 | //' @keywords internal
47 | // [[Rcpp::export]]
48 | Rcpp::List jaccard_indicator_comp(Rcpp::List gs, double eps) {
49 | 	// Negated form so that NaN (including R's `NA_real_`) is rejected as well.
50 | 	if (!(eps >= 0.0 && eps <= 1.0)) {
51 | 		Rcpp::stop("0 <= eps <= 1 needs to hold");
52 | 	}
53 | 
54 | 	const auto n = static_cast<length_type>(gs.length());
55 | 	std::vector<std::vector<int>> varr;
56 | 	varr.reserve(n);
57 | 	std::transform(gs.begin(), gs.end(), std::back_inserter(varr),
58 | 				   Rcpp::as<std::vector<int>>);
59 | 
60 | 	// distance < eps  <=>  |intersection| > (1 - eps) * |union|
61 | 	const double min_similarity = 1.0 - eps;
62 | 	std::vector<int> ipairs;
63 | 	std::vector<int> jpairs;
64 | 
65 | 	// Only the strict lower triangle (j < i) is visited.
66 | 	for (length_type i = 1; i < n; i++) {
67 | 		for (length_type j = 0; j < i; j++) {
68 | 			const auto len_inter = len_intersect(varr[i], varr[j]);
69 | 			const auto len_union = varr[i].size() + varr[j].size() - len_inter;
70 | 			if (static_cast<double>(len_union) * min_similarity <
71 | 				static_cast<double>(len_inter)) {
72 | 				// Store 1-based indices; the narrowing to int is made explicit.
73 | 				ipairs.emplace_back(static_cast<int>(i + 1));
74 | 				jpairs.emplace_back(static_cast<int>(j + 1));
75 | 			}
76 | 		}
77 | 	}
78 | 
79 | 	Rcpp::List out;
80 | 	out["i"] = ipairs;
81 | 	out["j"] = jpairs;
82 | 	return out;
83 | }
84 | 


--------------------------------------------------------------------------------
/.github/workflows/rhub.yaml:
--------------------------------------------------------------------------------
 1 | # R-hub's generic GitHub Actions workflow file. Its canonical location is at
 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml
 3 | # You can update this file to a newer version using the rhub2 package:
 4 | #
 5 | # rhub::rhub_setup()
 6 | #
 7 | # It is unlikely that you need to modify this file manually.
 8 | 
 9 | name: R-hub
10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}"
11 | 
12 | on:
13 |   workflow_dispatch:
14 |     inputs:
15 |       config:
16 |         description: 'A comma separated list of R-hub platforms to use.'
17 |         type: string
18 |         default: 'linux,windows,macos'
19 |       name:
20 |         description: 'Run name. You can leave this empty now.'
21 |         type: string
22 |       id:
23 |         description: 'Unique ID. You can leave this empty now.'
24 |         type: string
25 | 
26 | jobs:
27 | 
28 |   setup:
29 |     runs-on: ubuntu-latest
30 |     outputs:
31 |       containers: ${{ steps.rhub-setup.outputs.containers }}
32 |       platforms: ${{ steps.rhub-setup.outputs.platforms }}
33 | 
34 |     steps:
35 |     # NO NEED TO CHECKOUT HERE
36 |     - uses: r-hub/actions/setup@v1
37 |       with:
38 |         config: ${{ github.event.inputs.config }}
39 |       id: rhub-setup
40 | 
41 |   linux-containers:
42 |     needs: setup
43 |     if: ${{ needs.setup.outputs.containers != '[]' }}
44 |     runs-on: ubuntu-latest
45 |     name: ${{ matrix.config.label }}
46 |     strategy:
47 |       fail-fast: false
48 |       matrix:
49 |         config: ${{ fromJson(needs.setup.outputs.containers) }}
50 |     container:
51 |       image: ${{ matrix.config.container }}
52 | 
53 |     steps:
54 |       - uses: r-hub/actions/checkout@v1
55 |       - uses: r-hub/actions/platform-info@v1
56 |         with:
57 |           token: ${{ secrets.RHUB_TOKEN }}
58 |           job-config: ${{ matrix.config.job-config }}
59 |       - uses: r-hub/actions/setup-deps@v1
60 |         with:
61 |           token: ${{ secrets.RHUB_TOKEN }}
62 |           job-config: ${{ matrix.config.job-config }}
63 |       - uses: r-hub/actions/run-check@v1
64 |         with:
65 |           token: ${{ secrets.RHUB_TOKEN }}
66 |           job-config: ${{ matrix.config.job-config }}
67 | 
68 |   other-platforms:
69 |     needs: setup
70 |     if: ${{ needs.setup.outputs.platforms != '[]' }}
71 |     runs-on: ${{ matrix.config.os }}
72 |     name: ${{ matrix.config.label }}
73 |     strategy:
74 |       fail-fast: false
75 |       matrix:
76 |         config: ${{ fromJson(needs.setup.outputs.platforms) }}
77 | 
78 |     steps:
79 |       - uses: r-hub/actions/checkout@v1
80 |       - uses: r-hub/actions/setup-r@v1
81 |         with:
82 |           job-config: ${{ matrix.config.job-config }}
83 |           token: ${{ secrets.RHUB_TOKEN }}
84 |       - uses: r-hub/actions/platform-info@v1
85 |         with:
86 |           token: ${{ secrets.RHUB_TOKEN }}
87 |           job-config: ${{ matrix.config.job-config }}
88 |       - uses: r-hub/actions/setup-deps@v1
89 |         with:
90 |           job-config: ${{ matrix.config.job-config }}
91 |           token: ${{ secrets.RHUB_TOKEN }}
92 |       - uses: r-hub/actions/run-check@v1
93 |         with:
94 |           job-config: ${{ matrix.config.job-config }}
95 |           token: ${{ secrets.RHUB_TOKEN }}
96 | 


--------------------------------------------------------------------------------
/src/allocation.cpp:
--------------------------------------------------------------------------------
  1 | #include <Rcpp/Lighter>
  2 | #include <Eigen/Core>
  3 | 
  4 | // Reassign genes to modules by a majority vote over the observations.
  5 | //
  6 | // Genes are visited in `update_order`. For each gene, every observation votes
  7 | // for the module with the largest weighted sum of a Gaussian log-likelihood
  8 | // term (from `resid_array` and `resid_var`) and a log prior term (from
  9 | // `prior_indicator` and the current allocation). The gene is moved to the
 10 | // module with the most votes before the next gene is processed, so later
 11 | // genes see the updated allocation.
 12 | //
 13 | // - resid_array: double array with dim (n_modules, n_obs, n_genes). It enters
 14 | //   the likelihood as -value / (2 * variance), so it presumably holds squared
 15 | //   residuals (TODO confirm against the R caller).
 16 | // - resid_var: row j holds one variance per module for gene j.
 17 | // - prior_indicator: one integer vector of zero-based gene indices per gene.
 18 | // - k_: one-based module label per gene; -1 marks a gene without a module.
 19 | // - update_order: zero-based gene indices in the order they are processed.
 20 | // - prior_baseline: constant added to every module's prior fraction.
 21 | // - prior_weight: weight of the log prior; the log-likelihood is weighted by
 22 | //   1 - prior_weight.
 23 | //
 24 | // Returns the updated one-based module labels; `k_` itself is not modified.
 25 | //
 26 | // Assumes that
 27 | // - update_order is a permutation of 0:(length(k_) - 1)
 28 | // - indices in prior_indicator elements are within 0:(length(k_) - 1)
 29 | // [[Rcpp::export]]
 30 | Rcpp::IntegerVector allocate_into_modules(SEXP resid_array,
 31 | 										  Eigen::Map<Eigen::MatrixXd> resid_var,
 32 | 										  Rcpp::List prior_indicator,
 33 | 										  Rcpp::IntegerVector k_,
 34 | 										  Rcpp::IntegerVector update_order,
 35 | 										  double prior_baseline, double prior_weight) {
 36 | 	// Zero-based label of a gene without a module (-1 in the one-based `k_`)
 37 | 	constexpr int unallocated = -2;
 38 | 
 39 | 	// Report malformed input as a regular error before any raw memory access
 40 | 	Rcpp::Shield<SEXP> dim_attr(Rf_getAttrib(resid_array, R_DimSymbol));
 41 | 	if (!Rf_isReal(resid_array) || Rf_length(dim_attr) != 3) {
 42 | 		Rcpp::stop("allocate_into_modules: resid_array must be a 3d numeric array");
 43 | 	}
 44 | 
 45 | 	double* arr = REAL(resid_array);
 46 | 	const int* const dims = INTEGER(dim_attr);
 47 | 	const auto n_modules = static_cast<Eigen::Index>(dims[0]);
 48 | 	const auto n_obs = static_cast<Eigen::Index>(dims[1]);
 49 | 	// const auto n_genes = static_cast<Eigen::Index>(dims[2]);
 50 | 
 51 | 	// Number of entries in one gene's (n_modules x n_obs) slice
 52 | 	const auto n_total = n_modules * n_obs;
 53 | 
 54 | 	// Zero-based working copy of the labels
 55 | 	Rcpp::IntegerVector k(k_ - 1);
 56 | 
 57 | 	// Number of genes currently allocated to each module
 58 | 	Eigen::ArrayXd module_totals = Eigen::ArrayXd::Zero(n_modules);
 59 | 	for (const auto& idx : k) {
 60 | 		if (idx != unallocated) {
 61 | 			module_totals[idx] += 1;
 62 | 		}
 63 | 	}
 64 | 
 65 | 	// Iterate over genes in given order
 66 | 	for (const auto& j : update_order) {
 67 | 		// Load residuals for current gene
 68 | 		const Eigen::Map<Eigen::MatrixXd> resid(arr + j * n_total, n_modules, n_obs);
 69 | 
 70 | 		// Compute fraction of genes that gene j interacts with in each module,
 71 | 		// according to prior information
 72 | 		const Rcpp::IntegerVector prior_indices =
 73 | 			Rcpp::as<Rcpp::IntegerVector>(prior_indicator[j]);
 74 | 		Eigen::ArrayXd prior_frac = Eigen::ArrayXd::Zero(n_modules);
 75 | 		for (const auto& idx : prior_indices) {
 76 | 			if (k[idx] != unallocated) {
 77 | 				prior_frac[k[idx]] += 1;
 78 | 			}
 79 | 		}
 80 | 
 81 | 		for (Eigen::Index idx = 0; idx < n_modules; idx++) {
 82 | 			if (module_totals[idx] > 0) {
 83 | 				prior_frac[idx] /= module_totals[idx];
 84 | 			}
 85 | 		}
 86 | 		// Add baseline to avoid numerical problems if a module is empty
 87 | 		prior_frac += prior_baseline;
 88 | 		// Normalise and take logs to obtain log prior probabilities
 89 | 		Eigen::ArrayXd prior_log_prob = prior_frac.log() - log(prior_frac.sum());
 90 | 
 91 | 		// Per-module Gaussian log-density of every observation:
 92 | 		// -resid / (2 * var) - 0.5 * log(2 * pi * var)
 93 | 		Eigen::MatrixXd model_log_likelihood =
 94 | 			((-resid).array().colwise() / (2.0 * resid_var.row(j).transpose().array()))
 95 | 				.colwise() -
 96 | 			0.5 * (2.0 * M_PI * resid_var.row(j).transpose()).array().log();
 97 | 
 98 | 		// // Normalise to convert to probabilities
 99 | 		// Eigen::RowVectorXd max_model_ll = model_log_likelihood.colwise().maxCoeff();
100 | 		// Eigen::MatrixXd model_ll_minus_max =
101 | 		// 	model_log_likelihood.rowwise() - max_model_ll;
102 | 		// Eigen::MatrixXd model_log_prob =
103 | 		// 	model_ll_minus_max.rowwise() -
104 | 		// 	model_ll_minus_max.array().exp().colwise().sum().log().matrix();
105 | 
106 | 		// Compute total scores by weighting the model likelihood
107 | 		// and the prior probabilities.
108 | 		const Eigen::MatrixXd total_model_log_scores =
109 | 			// (((1.0 - prior_weight) * model_log_prob.array()).colwise() +
110 | 			(((1.0 - prior_weight) * model_log_likelihood.array()).colwise() +
111 | 			 (prior_weight * prior_log_prob));
112 | 
113 | 		// Compute votes for each of the n_modules modules
114 | 		Eigen::ArrayXi votes = Eigen::ArrayXi::Zero(n_modules);
115 | 		for (Eigen::Index i = 0; i < n_obs; i++) {
116 | 			int max_idx = -1;
117 | 			total_model_log_scores.col(i).maxCoeff(&max_idx);
118 | 			votes[max_idx] += 1;
119 | 		}
120 | 
121 | 		// Move gene j to the new best module
122 | 		int best_cl = -1;
123 | 		votes.maxCoeff(&best_cl);
124 | 		if (k[j] != unallocated) {
125 | 			module_totals[k[j]] -= 1;
126 | 		}
127 | 		module_totals[best_cl] += 1;
128 | 		k[j] = best_cl;
129 | 
130 | 		Rcpp::checkUserInterrupt();
131 | 	}
132 | 
133 | 	// Shift back to one-based labels
134 | 	return k + 1;
135 | }
136 | 


--------------------------------------------------------------------------------
/vignettes/articles/mouse.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Manually assigning regulators"
  3 | ---
  4 | 
  5 | ```{r, include = FALSE}
  6 | knitr::opts_chunk$set(
  7 |   collapse = TRUE,
  8 |   comment = "#>"
  9 | )
 10 | ```
 11 | 
 12 | The purpose of this vignette is to show how to manually configure the
 13 | `is_regulator` vector, e.g. when you want to run *scregclust* on a custom set of
 14 | regulators (not TFs or kinases), or if your data is from an organism other than
 15 | human, e.g. mouse. This vignette will show how to do this using a data set from
 16 | the mouse brain ([GSE60361](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60361)), and a
 17 | [list of mouse TFs provided by Aertslab](https://resources.aertslab.org/cistarget/tf_lists/).
 18 | 
 19 | We use [Seurat](https://satijalab.org/seurat/) for pre-processing of the data.
 20 | 
 21 | ```{r load-packages, results='hide', message=FALSE}
 22 | # Load required packages
 23 | library(GEOquery)
 24 | library(Seurat)
 25 | library(scregclust)
 26 | ```
 27 | 
 28 | Read in the data and preprocess it in *Seurat*. Here, we simply use the
 29 | full dataset. In practice, you would perform additional quality checks and,
 30 | e.g., investigate PCA, UMAP, or TSNE plots of the data. We use the
 31 | package *GEOquery* to download meta data for the data.
 32 | 
 33 | ```{r load-data, results='hide', message=FALSE, warning=FALSE}
 34 | # Download the gene expression data
 35 | url <- paste0(
 36 |   "https://www.ncbi.nlm.nih.gov/geo/download/",
 37 |   "?acc=GSE60361&format=file&",
 38 |   "file=GSE60361%5FC1%2D3005%2DExpression%2Etxt%2Egz"
 39 | )
 40 | expr_path <- file.path(tempdir(), "Expression.txt.gz")
 41 | download.file(url, expr_path, cacheOK = FALSE, mode = "wb")
 42 | 
 43 | # Load the gene expression data
 44 | expr <- read.table(
 45 |   expr_path,
 46 |   header = TRUE,
 47 |   sep = "\t",
 48 |   stringsAsFactors = FALSE,
 49 |   fill = TRUE
 50 | )
 51 | 
 52 | # A few gene symbols appear as duplicates, make unique.
 53 | gene_symbols <- make.unique(expr[, 1], sep = "-")
 54 | expr <- expr[, -1]
 55 | rownames(expr) <- gene_symbols
 56 | 
 57 | # Download meta data
 58 | gse <- getGEO("GSE60361")
 59 | meta_data <- pData(phenoData(gse[[1]]))
 60 | # Sample names are stored in the meta data's row names
 61 | sample_names <- rownames(meta_data)
 62 | colnames(expr) <- sample_names
 63 | 
 64 | # Create Seurat object and preprocess the data using SCTransform
 65 | mouse <- CreateSeuratObject(
 66 |   counts = expr,
 67 |   min.cells = 3,
 68 |   min.features = 500,
 69 |   meta.data = meta_data
 70 | )
 71 | mouse <- SCTransform(mouse, verbose = TRUE)
 72 | ```
 73 | 
 74 | The built-in regulator lists in *scregclust* cover human
 75 | transcription factors (TFs) and kinases. Download and read in a list of
 76 | mouse-specific TFs.
 77 | 
 78 | ```{r load-tfs, results='hide', message=FALSE}
 79 | url <- "https://resources.aertslab.org/cistarget/tf_lists/allTFs_mm.txt"
 80 | tfs_path <- file.path(tempdir(), "allTFs_mm.txt")
 81 | download.file(url, tfs_path, cacheOK = FALSE, mode = "w")
 82 | tfs <- read.table(
 83 |   tfs_path,
 84 |   header = FALSE,
 85 |   sep = "\t",
 86 |   stringsAsFactors = FALSE
 87 | )
 88 | tfs <- tfs[, 1]
 89 | ```
 90 | 
 91 | Extract `gene x cells` table
 92 | 
 93 | ```{r extract-gene-cells-table}
 94 | z <- GetAssayData(mouse, layer = "scale.data")
 95 | dim(z)
 96 | ```
 97 | 
 98 | Make sure the data is in the format expected by *scregclust*
 99 | 
100 | ```{r scregclust-format}
101 | out <- scregclust_format(z, mode = "TF")
102 | 
103 | genesymbols <- out$genesymbols
104 | sample_assignment <- out$sample_assignment
105 | ```
106 | 
107 | Manually create the indicator vector `is_regulator`
108 | 
109 | ```{r manual-is-regulator}
110 | # Mark the genes found in the TF list with 1 and all other genes with 0
111 | is_regulator <- as.numeric(genesymbols %in% tfs)
112 | ```
113 | 
114 | Finally, run scregclust to estimate the model. The run can be reproduced with
115 | the command below. A pre-fitted model can be downloaded from
116 | [GitHub](https://github.com/scmethods/scregclust/raw/main/datasets/mouse_scregclust.rds)
117 | for convenience.
118 | 
119 | ```{r run-scregclust}
120 | # # Run scregclust
121 | # set.seed(8374)
122 | # fit <- scregclust(
123 | #   z, genesymbols, is_regulator, penalization = seq(0.1, 0.5, 0.05),
124 | #   n_modules = 10L, n_cycles = 50L, noise_threshold = 0.05
125 | # )
126 | # saveRDS(fit, file = "datasets/mouse_scregclust.rds")
127 | 
128 | url <- paste0(
129 |   "https://github.com/scmethods/scregclust/raw/main/datasets/",
130 |   "mouse_scregclust.rds"
131 | )
132 | fit_path <- file.path(tempdir(), "mouse_scregclust.rds")
133 | download.file(url, fit_path)
134 | fit <- readRDS(fit_path)
135 | ```
136 | 
137 | Visualize the fit
138 | 
139 | ```{r viz-fit, fig.width=7, fig.height=4, fig.dpi=100}
140 | #| fig.alt: >
141 | #|   Boxplots of predictive R^2 per module (bottom) and
142 | #|   regulator importance (top) over the penalization parameters
143 | #|   specified during model estimation. A decreasing trend can
144 | #|   be seen in R^2 per module until about 0.35 with a drop from 0.4.
145 | #|   In addition, a slow and steady increase in regulator importance
146 | #|   is followed by an increase from around 0.4 penalization.
147 | plot(fit)
148 | ```
149 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | allocate_into_modules <- function(resid_array, resid_var, prior_indicator, k_, update_order, prior_baseline, prior_weight) {
  5 |     .Call(`_scregclust_allocate_into_modules`, resid_array, resid_var, prior_indicator, k_, update_order, prior_baseline, prior_weight)
  6 | }
  7 | 
  8 | #' Perform the computations for thresholded Jaccard distance
  9 | #'
 10 | #' @details
 11 | #' This function is optimized for sparse matrices and computes the pairwise
 12 | #' Jaccard distances between the rows of the input matrix. Note that the
 13 | #' actual distance is not saved. Instead, a threshold (`eps`) is supplied
 14 | #' and an indicator matrix is returned, with a one indicating that the
 15 | #' distance is smaller than `eps` (equivalently, the Jaccard similarity
 16 | #' is larger than `1 - eps`).
 17 | #'
 18 | #' @param gs a list of integer vectors, one for each row, giving the column
 19 | #'           indices of the non-zero elements of the row or `NULL` if the
 20 | #'           whole row is empty.
 21 | #' @param eps an upper bound on the Jaccard distance (`1 - eps` becomes a
 22 | #'            lower bound on the Jaccard similarity)
 23 | #'
 24 | #' @return A list with row and column indices in the #row x #row indicator
 25 | #'         matrix specifying which rows in the original matrix had a distance
 26 | #'         of at most `eps`.
 27 | #'
 28 | #' @keywords internal
 29 | jaccard_indicator_comp <- function(gs, eps) {
 30 |     .Call(`_scregclust_jaccard_indicator_comp`, gs, eps)
 31 | }
 32 | 
 33 | #' ADMM algorithm for solving the group-penalized least squares problem
 34 | #'
 35 | #' Implements estimation of the coop-lasso problem.
 36 | #'
 37 | #' @param y Target (n x m)
 38 | #' @param x Design matrix (n x p)
 39 | #' @param lambda Penalization parameter
 40 | #' @param weights A specific weight for each group (typically this is
 41 | #'                `sqrt(group size)`).
 42 | #' @param beta_0 Initial value for coefficients, allowing for warm start.
 43 | #'               Can be set to NULL, which results in the initial `beta`
 44 | #'               being a zero matrix.
 45 | #' @param rho_0 Initial ADMM step-size
 46 | #' @param alpha_0 Initial ADMM relaxation parameter
 47 | #' @param n_update Number of steps in-between updates of the
 48 | #'                 step-size/adaptation parameters
 49 | #' @param eps_corr Lower bound for the correlation in the step-size
 50 | #'                 update steps
 51 | #' @param max_iter Maximum number of iterations
 52 | #' @param eps_rel Relative tolerance for convergence check
 53 | #' @param eps_abs Absolute tolerance for convergence check
 54 | #' @param verbose Whether or not information about the optimization process
 55 | #'                should be printed to the terminal
 56 | #'
 57 | #' @return A list containing
 58 | #'     \item{beta}{The coefficients at convergence}
 59 | #'     \item{iterations}{Number of iterations}
 60 | #'
 61 | #' @references
 62 | #' Xu et al. (2017) Adaptive relaxed ADMM: Convergence theory and
 63 | #' practical implementation. DOI 10.1109/CVPR.2017.765
 64 | #'
 65 | #' @keywords internal
 66 | coop_lasso <- function(y, x, lambda, weights, beta_0 = NULL, rho_0 = 0.2, alpha_0 = 1.5, n_update = 2L, eps_corr = 0.2, max_iter = 1000L, eps_rel = 1e-8, eps_abs = 1e-12, verbose = FALSE) {
 67 |     .Call(`_scregclust_coop_lasso`, y, x, lambda, weights, beta_0, rho_0, alpha_0, n_update, eps_corr, max_iter, eps_rel, eps_abs, verbose)
 68 | }
 69 | 
 70 | #' Compute NNLS coefficients
 71 | #'
 72 | #' Computes non-negative least squares coefficients with a matrix
 73 | #' right hand side.
 74 | #'
 75 | #' @param x Coefficient matrix (p x n matrix)
 76 | #' @param y Right hand side (p x m matrix)
 77 | #' @param eps Convergence tolerance
 78 | #' @param max_iter Maximum number of iterations
 79 | #'
 80 | #' @return A list containing
 81 | #'  \item{beta}{The estimated coefficient matrix}
 82 | #'  \item{iterations}{A vector containing the number of iterations needed
 83 | #'                    for the `i`-th column in `y` in the `i`-th entry.}
 84 | #'
 85 | #' @references
 86 | #' Duy Khuong Nguyen and Tu Bao Ho. Accelerated anti-lopsided algorithm
 87 | #' for nonnegative least squares. International Journal of Data Science
 88 | #' and Analytics, 3(1):23–34, 2017.
 89 | #'
 90 | #' Adapted from <https://github.com/khuongnd/nnls_antilopsided>
 91 | #'
 92 | #' @keywords internal
 93 | coef_nnls <- function(x, y, eps = 1e-12, max_iter = 1000L) {
 94 |     .Call(`_scregclust_coef_nnls`, x, y, eps, max_iter)
 95 | }
 96 | 
 97 | #' Allocate 3d-array and fill with matrix along first dimension
 98 | #'
 99 | #' @param input the matrix of size `n_obs x n_genes`
100 | #' @param n_cl the size of the three-dimensional array's first dimension
101 | #'
102 | #' @return The allocated and filled array of size `n_cl x n_obs x n_genes`
103 | #'
104 | #' @keywords internal
105 | alloc_array <- function(input, n_cl) {
106 |     .Call(`_scregclust_alloc_array`, input, n_cl)
107 | }
108 | 
109 | #' Reset input 3d-array by filling matrix along first dimension
110 | #'
111 | #' @param arr The 3d-array of dimension `n_cl x n_obs x n_genes`
112 | #' @param input The matrix of size `n_obs x n_genes`
113 | #'
114 | #' @keywords internal
115 | reset_array <- function(arr, input) {
116 |     invisible(.Call(`_scregclust_reset_array`, arr, input))
117 | }
118 | 
119 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
  1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | #include <RcppEigen.h>
  5 | #include <Rcpp.h>
  6 | 
  7 | using namespace Rcpp;
  8 | 
  9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 10 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
 11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 12 | #endif
 13 | 
 14 | // allocate_into_modules
 15 | Rcpp::IntegerVector allocate_into_modules(SEXP resid_array, Eigen::Map<Eigen::MatrixXd> resid_var, Rcpp::List prior_indicator, Rcpp::IntegerVector k_, Rcpp::IntegerVector update_order, double prior_baseline, double prior_weight);
 16 | RcppExport SEXP _scregclust_allocate_into_modules(SEXP resid_arraySEXP, SEXP resid_varSEXP, SEXP prior_indicatorSEXP, SEXP k_SEXP, SEXP update_orderSEXP, SEXP prior_baselineSEXP, SEXP prior_weightSEXP) {
 17 | BEGIN_RCPP
 18 |     Rcpp::RObject rcpp_result_gen;
 19 |     Rcpp::RNGScope rcpp_rngScope_gen;
 20 |     Rcpp::traits::input_parameter< SEXP >::type resid_array(resid_arraySEXP);
 21 |     Rcpp::traits::input_parameter< Eigen::Map<Eigen::MatrixXd> >::type resid_var(resid_varSEXP);
 22 |     Rcpp::traits::input_parameter< Rcpp::List >::type prior_indicator(prior_indicatorSEXP);
 23 |     Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type k_(k_SEXP);
 24 |     Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type update_order(update_orderSEXP);
 25 |     Rcpp::traits::input_parameter< double >::type prior_baseline(prior_baselineSEXP);
 26 |     Rcpp::traits::input_parameter< double >::type prior_weight(prior_weightSEXP);
 27 |     rcpp_result_gen = Rcpp::wrap(allocate_into_modules(resid_array, resid_var, prior_indicator, k_, update_order, prior_baseline, prior_weight));
 28 |     return rcpp_result_gen;
 29 | END_RCPP
 30 | }
 31 | // jaccard_indicator_comp
 32 | Rcpp::List jaccard_indicator_comp(Rcpp::List gs, double eps);
 33 | RcppExport SEXP _scregclust_jaccard_indicator_comp(SEXP gsSEXP, SEXP epsSEXP) {
 34 | BEGIN_RCPP
 35 |     Rcpp::RObject rcpp_result_gen;
 36 |     Rcpp::RNGScope rcpp_rngScope_gen;
 37 |     Rcpp::traits::input_parameter< Rcpp::List >::type gs(gsSEXP);
 38 |     Rcpp::traits::input_parameter< double >::type eps(epsSEXP);
 39 |     rcpp_result_gen = Rcpp::wrap(jaccard_indicator_comp(gs, eps));
 40 |     return rcpp_result_gen;
 41 | END_RCPP
 42 | }
 43 | // coop_lasso
 44 | Rcpp::List coop_lasso(Eigen::Map<Eigen::MatrixXd> y, Eigen::Map<Eigen::MatrixXd> x, double lambda, Eigen::Map<Eigen::ArrayXd> weights, Rcpp::Nullable<Rcpp::NumericMatrix> beta_0, double rho_0, double alpha_0, int n_update, double eps_corr, int max_iter, double eps_rel, double eps_abs, bool verbose);
 45 | RcppExport SEXP _scregclust_coop_lasso(SEXP ySEXP, SEXP xSEXP, SEXP lambdaSEXP, SEXP weightsSEXP, SEXP beta_0SEXP, SEXP rho_0SEXP, SEXP alpha_0SEXP, SEXP n_updateSEXP, SEXP eps_corrSEXP, SEXP max_iterSEXP, SEXP eps_relSEXP, SEXP eps_absSEXP, SEXP verboseSEXP) {
 46 | BEGIN_RCPP
 47 |     Rcpp::RObject rcpp_result_gen;
 48 |     Rcpp::RNGScope rcpp_rngScope_gen;
 49 |     Rcpp::traits::input_parameter< Eigen::Map<Eigen::MatrixXd> >::type y(ySEXP);
 50 |     Rcpp::traits::input_parameter< Eigen::Map<Eigen::MatrixXd> >::type x(xSEXP);
 51 |     Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP);
 52 |     Rcpp::traits::input_parameter< Eigen::Map<Eigen::ArrayXd> >::type weights(weightsSEXP);
 53 |     Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::NumericMatrix> >::type beta_0(beta_0SEXP);
 54 |     Rcpp::traits::input_parameter< double >::type rho_0(rho_0SEXP);
 55 |     Rcpp::traits::input_parameter< double >::type alpha_0(alpha_0SEXP);
 56 |     Rcpp::traits::input_parameter< int >::type n_update(n_updateSEXP);
 57 |     Rcpp::traits::input_parameter< double >::type eps_corr(eps_corrSEXP);
 58 |     Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP);
 59 |     Rcpp::traits::input_parameter< double >::type eps_rel(eps_relSEXP);
 60 |     Rcpp::traits::input_parameter< double >::type eps_abs(eps_absSEXP);
 61 |     Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
 62 |     rcpp_result_gen = Rcpp::wrap(coop_lasso(y, x, lambda, weights, beta_0, rho_0, alpha_0, n_update, eps_corr, max_iter, eps_rel, eps_abs, verbose));
 63 |     return rcpp_result_gen;
 64 | END_RCPP
 65 | }
 66 | // coef_nnls
 67 | Rcpp::List coef_nnls(Eigen::Map<Eigen::MatrixXd> x, Eigen::Map<Eigen::MatrixXd> y, double eps, int max_iter);
 68 | RcppExport SEXP _scregclust_coef_nnls(SEXP xSEXP, SEXP ySEXP, SEXP epsSEXP, SEXP max_iterSEXP) {
 69 | BEGIN_RCPP
 70 |     Rcpp::RObject rcpp_result_gen;
 71 |     Rcpp::RNGScope rcpp_rngScope_gen;
 72 |     Rcpp::traits::input_parameter< Eigen::Map<Eigen::MatrixXd> >::type x(xSEXP);
 73 |     Rcpp::traits::input_parameter< Eigen::Map<Eigen::MatrixXd> >::type y(ySEXP);
 74 |     Rcpp::traits::input_parameter< double >::type eps(epsSEXP);
 75 |     Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP);
 76 |     rcpp_result_gen = Rcpp::wrap(coef_nnls(x, y, eps, max_iter));
 77 |     return rcpp_result_gen;
 78 | END_RCPP
 79 | }
 80 | // alloc_array
 81 | SEXP alloc_array(SEXP input, R_xlen_t n_cl);
 82 | RcppExport SEXP _scregclust_alloc_array(SEXP inputSEXP, SEXP n_clSEXP) {
 83 | BEGIN_RCPP
 84 |     Rcpp::RObject rcpp_result_gen;
 85 |     Rcpp::RNGScope rcpp_rngScope_gen;
 86 |     Rcpp::traits::input_parameter< SEXP >::type input(inputSEXP);
 87 |     Rcpp::traits::input_parameter< R_xlen_t >::type n_cl(n_clSEXP);
 88 |     rcpp_result_gen = Rcpp::wrap(alloc_array(input, n_cl));
 89 |     return rcpp_result_gen;
 90 | END_RCPP
 91 | }
 92 | // reset_array
 93 | void reset_array(SEXP arr, SEXP input);
 94 | RcppExport SEXP _scregclust_reset_array(SEXP arrSEXP, SEXP inputSEXP) {
 95 | BEGIN_RCPP
 96 |     Rcpp::RNGScope rcpp_rngScope_gen;
 97 |     Rcpp::traits::input_parameter< SEXP >::type arr(arrSEXP);
 98 |     Rcpp::traits::input_parameter< SEXP >::type input(inputSEXP);
 99 |     reset_array(arr, input);
100 |     return R_NilValue;
101 | END_RCPP
102 | }
103 | 
104 | static const R_CallMethodDef CallEntries[] = {
105 |     {"_scregclust_allocate_into_modules", (DL_FUNC) &_scregclust_allocate_into_modules, 7},
106 |     {"_scregclust_jaccard_indicator_comp", (DL_FUNC) &_scregclust_jaccard_indicator_comp, 2},
107 |     {"_scregclust_coop_lasso", (DL_FUNC) &_scregclust_coop_lasso, 13},
108 |     {"_scregclust_coef_nnls", (DL_FUNC) &_scregclust_coef_nnls, 4},
109 |     {"_scregclust_alloc_array", (DL_FUNC) &_scregclust_alloc_array, 2},
110 |     {"_scregclust_reset_array", (DL_FUNC) &_scregclust_reset_array, 2},
111 |     {NULL, NULL, 0}
112 | };
113 | 
114 | RcppExport void R_init_scregclust(DllInfo *dll) {
115 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
116 |     R_useDynamicSymbols(dll, FALSE);
117 | }
118 | 


--------------------------------------------------------------------------------
/vignettes/articles/pbmc.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Demonstration of workflow"
  3 | ---
  4 | 
  5 | ```{r, include = FALSE}
  6 | knitr::opts_chunk$set(
  7 |   collapse = TRUE,
  8 |   comment = "#>"
  9 | )
 10 | rlang::local_options(lifecycle_verbosity = "quiet")
 11 | ```
 12 | 
 13 | The methods below are described in our article
 14 | 
 15 | > Larsson, Held, et al. (2024) Reconstructing the regulatory programs
 16 | > underlying the phenotypic plasticity of neural cancers.
 17 | > Nature Communications 15, 9699
 18 | > DOI [10.1038/s41467-024-53954-3](https://doi.org/10.1038/s41467-024-53954-3)
 19 | 
 20 | Here we demonstrate the scregclust workflow using the PBMC data from
 21 | 10X Genomics (available [here](https://www.10xgenomics.com/resources/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0)).
 22 | This is the same data used in an [introductory vignette](https://satijalab.org/seurat/articles/pbmc3k_tutorial)
 23 | for the Seurat package. We use [Seurat](https://satijalab.org/seurat/) for
 24 | pre-processing of the data.
 25 | 
 26 | ```{r load-packages, results='hide', message=FALSE}
 27 | # Load required packages
 28 | library(Seurat)
 29 | library(scregclust)
 30 | ```
 31 | 
 32 | # Download the data
 33 | 
 34 | We are focusing here on the filtered feature barcode matrix available as an
 35 | HDF5 file from the website linked above. The data can be downloaded manually
 36 | or using R.
 37 | 
 38 | However you obtain the data, the code below assumes that the HDF5 file
 39 | containing it is located at `data_path`, i.e. in R's temporary directory with
 40 | the name `pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5`.
 41 | 
 42 | ```{r download-data}
 43 | url <- paste0(
 44 |   "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/",
 45 |   "pbmc_granulocyte_sorted_3k/",
 46 |   "pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5"
 47 | )
 48 | data_path <- file.path(
 49 |   tempdir(), "pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5"
 50 | )
 51 | 
 52 | download.file(url, data_path, cacheOK = FALSE, mode = "wb")
 53 | ```
 54 | 
 55 | # Load the data in Seurat and preprocess
 56 | 
 57 | To perform preprocessing use Seurat to load the data. The file ships with
 58 | two modalities, "Gene Expression" and "Peaks". We only use the former.
 59 | 
 60 | ```{r load-h5}
 61 | pbmc_data <- Read10X_h5(
 62 |   data_path,
 63 |   use.names = TRUE,
 64 |   unique.features = TRUE
 65 | )[["Gene Expression"]]
 66 | ```
 67 | 
 68 | We create a Seurat object and follow the Seurat vignette to subset the
 69 | cells and features (genes).
 70 | 
 71 | ```{r create-seurat-object}
 72 | pbmc <- CreateSeuratObject(
 73 |   counts = pbmc_data, min.cells = 3, min.features = 200
 74 | )
 75 | # Match the dash literally: "^MT." would also count genes such as MTOR
 76 | pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
 77 | pbmc <- subset(pbmc, subset = percent.mt < 30 & nFeature_RNA < 6000)
 78 | ```
 79 | 
 80 | [SCTransform](https://satijalab.org/seurat/articles/sctransform_vignette) is
 81 | used for variance stabilization of the data and Pearson residuals for the
 82 | 6000 most variable genes are extracted as matrix `z`.
 83 | 
 84 | ```{r apply-var-stabilization}
 85 | pbmc <- SCTransform(pbmc, variable.features.n = 6000)
 86 | 
 87 | z <- GetAssayData(pbmc, layer = "scale.data")
 88 | dim(z)
 89 | ```
 90 | 
 91 | # Use scregclust for clustering target genes into modules
 92 | 
 93 | We then use `scregclust_format` which extracts gene symbols from the
 94 | expression matrix and determines which genes are considered regulators.
 95 | By default, transcription factors are used as regulators. Setting `mode`
 96 | to `"kinase"` uses kinases instead of transcription factors. A list of the
 97 | regulators used internally is returned by `get_regulator_list()`.
 98 | 
 99 | ```{r prep-scregclust}
100 | out <- scregclust_format(z, mode = "TF")
101 | ```
102 | 
103 | The output of `scregclust_format` is a list with three elements.
104 | 
105 | 1. `genesymbols` contains the rownames of `z`
106 | 2. `sample_assignment` is initialized to be a vector of `1`s of length `ncol(z)`
107 |    and can be filled with a known sample grouping. Here, we do not use it and
108 |    just keep it uniform across all cells.
109 | 3. `is_regulator` is an indicator vector (elements are 0 or 1) corresponding to 
110 |    the entries of `genesymbols` with 1 marking that the genesymbol is selected
111 |    as a regulator according to the mode of `scregclust_format` (`"TF"` or
112 |    `"kinase"`) and 0 otherwise.
113 | 
114 | ```{r extract-scregclust-arguments}
115 | genesymbols <- out$genesymbols
116 | sample_assignment <- out$sample_assignment
117 | is_regulator <- out$is_regulator
118 | ```
119 | 
120 | Run `scregclust` with number of initial modules set to 10 and test
121 | several penalties. The penalties provided to `penalization` are used during
122 | selection of regulators associated with each module. An increasing penalty
123 | implies the selection of fewer regulators.
124 | `noise_threshold` controls the minimum $R^2$ a gene has to achieve across
125 | modules. Otherwise the gene is marked as noise.
126 | The run can be reproduced with the command below. A pre-fitted model can be
127 | downloaded from [GitHub](https://github.com/scmethods/scregclust/raw/main/datasets/pbmc_scregclust.rds)
128 | for convenience.
129 | 
130 | ```{r run-scregclust}
# The pre-fitted model was produced by the following (time-consuming) run:
# set.seed(8374)
# fit <- scregclust(
#   z, genesymbols, is_regulator, penalization = seq(0.1, 0.5, 0.05),
#   n_modules = 10L, n_cycles = 50L, noise_threshold = 0.05
# )
# saveRDS(fit, file = "datasets/pbmc_scregclust.rds")

# Download the pre-fitted model instead of re-running the estimation.
url <- paste0(
  "https://github.com/scmethods/scregclust/raw/main/datasets/",
  "pbmc_scregclust.rds"
)
fit_path <- file.path(tempdir(), "pbmc_scregclust.rds")
# An .rds file is binary, so request a binary transfer explicitly (as for the
# HDF5 download above) rather than relying on download.file() to infer it.
download.file(url, fit_path, mode = "wb")
fit <- readRDS(fit_path)
145 | ```
146 | 
147 | # Analysis of results
148 | 
149 | Results can be visualized easily using built-in functions.
150 | Metrics for helping in choosing an optimal penalty can be plotted by calling
151 | `plot` on the object returned from `scregclust`.
152 | 
153 | ```{r viz-metrics, fig.width=7, fig.height=4, fig.dpi=100}
154 | #| fig.alt: >
155 | #|   Boxplots of predictive R^2 per module (bottom) and
156 | #|   regulator importance (top) over the penalization parameters
157 | #|   specified during model estimation. A decreasing trend can
158 | #|   be seen in R^2 per module and a slow and steady increase in
159 | #|   regulator importance is followed by an explosive increase from
160 | #|   around 0.4 penalization.
161 | plot(fit)
162 | ```
163 | 
164 | The results for each penalization parameter are placed in a list, `results`,
165 | attached to the `fit` object. So `fit$results[[1]]` contains the results
166 | of running `scregclust` with `penalization = 0.1`. For each penalization
167 | parameter, the algorithm might end up finding multiple optimal configurations.
168 | Each configuration describes target gene module assignments and which
169 | regulators are associated with which modules.
170 | The results for each such configuration are contained in the list `output`.
171 | This means that `fit$results[[1]]$output[[1]]` contains the results for
172 | the first final configuration. More than one may be available.
173 | 
174 | ```{r n-configs}
# Number of final configurations found for each penalization parameter.
# vapply() guarantees an integer vector even if `fit$results` were empty.
vapply(fit$results, function(r) length(r$output), integer(1))
176 | ```
177 | 
178 | In this example, at most two final configurations were found for each
179 | penalization parameter.
180 | 
181 | To plot the regulator network of the first configuration for
182 | `penalization = 0.1`, the function `plot_regulator_network` can be used.
183 | 
184 | ```{r viz-reg-network, fig.width=7, fig.height=7, fig.dpi=100}
185 | #| fig.alt: >
186 | #|   Network visualization of modules (colorful circles) and their top
187 | #|   regulators (grey rectangles). Arrows indicate regulation and their
188 | #|   thickness represents regulation strength. Red arrows indicate positive
189 | #|   regulation and blue arrows indicate negative regulation.
190 | plot_regulator_network(fit$results[[1]]$output[[1]])
191 | ```
192 | 


--------------------------------------------------------------------------------
/man/scregclust.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/scregclust.R
  3 | \name{scregclust}
  4 | \alias{scregclust}
  5 | \title{Uncover gene modules and their regulatory programs from single-cell data}
  6 | \usage{
  7 | scregclust(
  8 |   expression,
  9 |   genesymbols,
 10 |   is_regulator,
 11 |   penalization,
 12 |   n_modules,
 13 |   initial_target_modules = NULL,
 14 |   sample_assignment = NULL,
 15 |   center = TRUE,
 16 |   split1_proportion = 0.5,
 17 |   total_proportion = 1,
 18 |   split_indices = NULL,
 19 |   prior_indicator = NULL,
 20 |   prior_genesymbols = NULL,
 21 |   prior_baseline = 1e-06,
 22 |   prior_weight = 0.5,
 23 |   min_module_size = 0L,
 24 |   allocate_per_obs = TRUE,
 25 |   noise_threshold = 0.025,
 26 |   n_cycles = 50L,
 27 |   use_kmeanspp_init = TRUE,
 28 |   n_initializations = 50L,
 29 |   max_optim_iter = 10000L,
 30 |   tol_coop_rel = 1e-08,
 31 |   tol_coop_abs = 1e-12,
 32 |   tol_nnls = 1e-04,
 33 |   compute_predictive_r2 = TRUE,
 34 |   compute_silhouette = FALSE,
 35 |   nowarnings = FALSE,
 36 |   verbose = TRUE,
 37 |   quick_mode = FALSE,
 38 |   quick_mode_percent = 0.1
 39 | )
 40 | }
 41 | \arguments{
 42 | \item{expression}{\verb{p x n} matrix of pre-processed single cell expression
 43 | data with \code{p} rows of genes and \code{n} columns of cells.}
 44 | 
 45 | \item{genesymbols}{A vector of gene names corresponding to rows of
 46 | \code{expression}. Has to be of length \code{p}.}
 47 | 
 48 | \item{is_regulator}{An indicator vector where \code{1} indicates that the
 49 | corresponding row in \code{expression} is a candidate
 50 | regulator. All other rows represent target genes.
 51 | Has to be of length \code{p}.}
 52 | 
 53 | \item{penalization}{Sparsity penalty related to the number of regulators
 54 | associated with each module. Either a single positive
 55 | number or a vector of positive numbers.}
 56 | 
 57 | \item{n_modules}{Requested number of modules (integer).
 58 | If this is provided without specifying \code{initial_target_modules},
 59 | then an initial module allocation is performed on the
 60 | cross-correlation matrix of targets and genes on the first
 61 | dataset after data splitting.}
 62 | 
 63 | \item{initial_target_modules}{The initial assignment of target genes to
 64 | modules of length \code{sum(is_regulator == 0L)}.
 65 | If this is not specified, then see \code{n_modules} regarding
 66 | module initialization. If provided, \code{use_kmeanspp_init}
 67 | and \code{n_initializations} are ignored.}
 68 | 
 69 | \item{sample_assignment}{A vector of sample assignment for each cell, can
 70 | be used to perform the data splitting with
 71 | stratification. Has to be of length \code{n}.
 72 | No stratification if \code{NULL} is supplied.}
 73 | 
 74 | \item{center}{Whether or not genes should be centered within each subgroup
 75 | defined in \code{sample_assignment}.}
 76 | 
 77 | \item{split1_proportion}{The proportion to use for the first dataset during
 78 | data splitting. The proportion for the second
 79 | dataset is \code{1 - split1_proportion}. If stratification
 80 | with \code{sample_assignment} is used, then the proportion
 81 | of each stratum is controlled.}
 82 | 
 83 | \item{total_proportion}{Can be used to only use a proportion of the supplied
 84 | observations. The proportion of the first dataset
 85 | during data splitting in relation to the full
 86 | dataset will be
 87 | \code{total_proportion * split1_proportion}.}
 88 | 
 89 | \item{split_indices}{Can be used to provide an explicit data split. If this
 90 | is supplied then \code{split1_proportion}, and
 91 | \code{total_proportion} are ignored.
 92 | Note that if \code{sample_assignment} is provided and
 93 | \code{center == TRUE}, then subgroup centering will be
 94 | performed as in the case of random splitting.
 95 | A vector of length \code{n} containing entries 1 for cells
 96 | in the first data split, 2 for cells in the second
 97 | data split and \code{NA} for cells that should be excluded
 98 | from the computations.}
 99 | 
100 | \item{prior_indicator}{An indicator matrix (sparse or dense) of size \verb{q x q}
101 | that indicates whether there is a known functional
102 | relationship between two genes. Ideally, this is
103 | supplied as a sparse matrix (\code{sparseMatrix}
104 | in the \code{Matrix} package). If not, then the matrix
105 | is converted to one.}
106 | 
107 | \item{prior_genesymbols}{A vector of gene names of length q corresponding
108 | to the rows/columns in \code{prior_indicator}. Does not
109 | have to be the same as \code{genesymbols}, but only
110 | useful if there is overlap.}
111 | 
112 | \item{prior_baseline}{A positive baseline for the network prior. The larger
113 | this parameter is, the less impact the network prior
114 | will have.}
115 | 
116 | \item{prior_weight}{A number between 0 and 1 indicating the strength of the
117 | prior in relation to the data. 0 ignores the prior and
118 | makes the algorithm completely data-driven. 1 uses only
119 | the prior during module allocation.}
120 | 
121 | \item{min_module_size}{Minimum required number of target genes in a module.
122 | Smaller modules are emptied.}
123 | 
124 | \item{allocate_per_obs}{Whether module allocation should be performed for
125 | each observation in the second data split separately.
126 | If \code{FALSE}, target genes are allocated into modules
127 | on the aggregate sum of squares across all
128 | observations in the second data split.}
129 | 
130 | \item{noise_threshold}{Threshold for the best \eqn{R^2} of a target gene
131 | before it gets identified as noise.}
132 | 
133 | \item{n_cycles}{Maximum number of algorithmic cycles.}
134 | 
135 | \item{use_kmeanspp_init}{Whether to use kmeans++ for module initialization;
136 | otherwise kmeans with random initial cluster centers
137 | is used. Only relevant if
138 | \code{initial_target_modules} is not specified.}
139 | 
140 | \item{n_initializations}{Number of kmeans(++) initialization runs.}
141 | 
142 | \item{max_optim_iter}{Maximum number of iterations during optimization
143 | in the coop-Lasso and NNLS steps.}
144 | 
145 | \item{tol_coop_rel}{Relative convergence tolerance during optimization
146 | in the coop-Lasso step.}
147 | 
148 | \item{tol_coop_abs}{Absolute convergence tolerance during optimization
149 | in the coop-Lasso step.}
150 | 
151 | \item{tol_nnls}{Convergence tolerance during optimization in the NNLS step.}
152 | 
153 | \item{compute_predictive_r2}{Whether to compute predictive \eqn{R^2} per
154 | module as well as regulator importance.}
155 | 
156 | \item{compute_silhouette}{Whether to compute silhouette scores for each
157 | target gene.}
158 | 
159 | \item{nowarnings}{When turned on then no warning messages are shown.}
160 | 
161 | \item{verbose}{Whether to print progress.}
162 | 
163 | \item{quick_mode}{Whether to use a reduced number of noise targets to speed
164 | up computations.}
165 | 
166 | \item{quick_mode_percent}{A number in [0, 1) indicating the amount of
167 | noise targets to use in the re-allocation process
168 | if \code{quick_mode = TRUE}.}
169 | }
170 | \value{
171 | A list with S3 class \code{scregclust} containing
172 | \item{penalization}{The supplied \code{penalization} parameters}
173 | \item{results}{A list of result lists (each with S3 class
174 | \code{scregclust_result}), one for each supplied \code{penalization}
175 | parameter. See below.}
176 | \item{initial_target_modules}{Initial allocation of target genes into
177 | modules.}
178 | \item{split_indices}{either verbatim the vector given as input or
179 | a vector encoding the splits as NA = not included,
180 | 1 = split 1 or 2 = split 2. Allows reproducibility
181 | of data splits.}
182 | 
183 | For each supplied penalization parameter, \code{results} contains a list with
184 | \itemize{
185 | \item the current \code{penalization} parameter,
186 | \item the supplied \code{genesymbols} after filtering (as used during fitting),
187 | \item the supplied \code{is_regulator} vector after filtering (as used during
188 | fitting),
189 | \item the number of fitted modules \code{n_modules},
190 | \item whether the current run \code{converged} to a single configuration (as a
191 | boolean),
192 | \item as well as an \code{output} object containing the numeric results for each
193 | final configuration.
194 | }
195 | 
196 | It is possible that the algorithm ends in a finite cycle of configurations
197 | instead of a unique final configuration.
198 | Therefore, \code{output} is a list with each element itself being a list
199 | with the following contents:
200 | \describe{
201 | \item{\code{reg_table}}{a regulator table, a matrix of weights for each
202 | regulator and module}
203 | \item{\code{module}}{vector of same length as \code{genesymbols} containing the
204 | module assignments for all genes with regulators
205 | marked as \code{NA}. Genes considered noise are marked as \code{-1}.}
206 | \item{\code{module_all}}{same as \code{module}, however, genes that were marked as
207 | noise (-1 in \code{module}) are assigned to the
208 | module in which they have the largest \eqn{R^2},
209 | even if it is below \code{noise_threshold}.}
210 | \item{\code{r2}}{matrix of predictive \eqn{R^2} values for each target gene and
211 | module}
212 | \item{\code{best_r2}}{vector of best predictive \eqn{R^2} for each gene
213 | (regulators marked with NA)}
214 | \item{\code{best_r2_idx}}{module index corresponding to best predictive
215 | \eqn{R^2} for each gene (regulators marked with NA)}
216 | \item{\code{r2_module}}{a vector of predictive \eqn{R^2} values for each
217 | module (included if \code{compute_predictive_r2 == TRUE})}
218 | \item{\code{importance}}{a matrix of importance values for each regulator (rows)
219 | and module (columns) (included if
220 | \code{compute_predictive_r2 == TRUE})}
221 | \item{\code{r2_cross_module_per_target}}{a matrix of cross module \eqn{R^2}
222 | values for each target gene (rows)
223 | and each module (columns) (included
224 | if \code{compute_silhouette == TRUE})}
225 | \item{\code{silhouette}}{a vector of silhouette scores for each target gene
226 | (included if \code{compute_silhouette == TRUE})}
227 | \item{\code{models}}{regulator selection for each module as a matrix with
228 | regulators in rows and modules in columns}
229 | \item{\code{signs}}{regulator signs for each module as a matrix with
230 | regulators in rows and modules in columns}
231 | \item{\code{weights}}{average regulator coefficient for each module}
232 | \item{\code{coeffs}}{list of regulator coefficient matrices for each module
233 | for all target genes as re-estimated in the NNLS step}
234 | \item{\code{sigmas}}{matrix of residual variances, one per target gene
235 | in each module; derived from the residuals in NNLS step}
236 | }
237 | }
238 | \description{
239 | Use the scRegClust algorithm to determine gene modules and their
240 | regulatory programs from single-cell data.
241 | }
242 | \concept{main}
243 | 


--------------------------------------------------------------------------------
/src/optim.cpp:
--------------------------------------------------------------------------------
#include <Rcpp/Lightest>

#include <Eigen/Cholesky>
#include <Eigen/Core>

#include <algorithm>
#include <cmath>
#include <iomanip>
#include <list>
#include <numeric>
#include <sstream>
#include <vector>
 10 | 
 11 | using Arr1d = Eigen::ArrayXd;
 12 | using Arr2d = Eigen::ArrayXXd;
 13 | using Matd = Eigen::MatrixXd;
 14 | using Vecd = Eigen::VectorXd;
 15 | using Veci = Eigen::VectorXi;
 16 | 
 17 | static Matd compute_xtx(const Matd& x) {
 18 | 	const auto p = x.cols();
 19 | 
 20 | 	Matd xtx = Eigen::MatrixXd::Zero(p, p);
 21 | 	if (p > 0) {
 22 | 		xtx.selfadjointView<Eigen::Lower>().rankUpdate(x.transpose());
 23 | 		xtx.triangularView<Eigen::Upper>() = xtx.transpose();
 24 | 	}
 25 | 
 26 | 	return xtx;
 27 | }
 28 | 
 29 | //' ADMM algorithm for solving the group-penalized least squares problem
 30 | //'
 31 | //' Implements estimation of the coop-lasso problem.
 32 | //'
 33 | //' @param y Target (n x m)
 34 | //' @param x Design matrix (n x p)
 35 | //' @param lambda Penalization parameter
 36 | //' @param weights A specific weight for each group (typically this is
 37 | //'                `sqrt(group size)`).
 38 | //' @param beta_0 Initial value for coefficients, allowing for warm start.
 39 | //'               Can be set to NULL, which results in the initial `beta`
 40 | //'               being a zero matrix.
 41 | //' @param rho_0 Initial ADMM step-size
 42 | //' @param alpha_0 Initial ADMM relaxation parameter
 43 | //' @param n_update Number of steps in-between updates of the
 44 | //'                 step-size/adaptation parameters
 45 | //' @param eps_corr Lower bound for the correlation in the step-size
 46 | //'                 update steps
 47 | //' @param max_iter Maximum number of iterations
 48 | //' @param eps_rel Relative tolerance for convergence check
 49 | //' @param eps_abs Absolute tolerance for convergence check
 50 | //' @param verbose Whether or not information about the optimization process
 51 | //'                should be printed to the terminal
 52 | //'
 53 | //' @return A list containing
 54 | //'     \item{beta}{The coefficients at convergence}
 55 | //'     \item{iterations}{Number of iterations}
 56 | //'
 57 | //' @references
 58 | //' Xu et al. (2017) Adaptive relaxed ADMM: Convergence theory and
 59 | //' practical implementation. DOI 10.1109/CVPR.2017.765
 60 | //'
 61 | //' @keywords internal
 62 | // [[Rcpp::export]]
 63 | Rcpp::List coop_lasso(
 64 | 	Eigen::Map<Eigen::MatrixXd> y, Eigen::Map<Eigen::MatrixXd> x, double lambda,
 65 | 	Eigen::Map<Eigen::ArrayXd> weights,
 66 | 	Rcpp::Nullable<Rcpp::NumericMatrix> beta_0 = R_NilValue,  // Initialization
 67 | 	double rho_0 = 0.2, double alpha_0 = 1.5, int n_update = 2,
 68 | 	double eps_corr = 0.2,												 // Step-size
 69 | 	int max_iter = 1000, double eps_rel = 1e-8, double eps_abs = 1e-12,	 // Convergence
 70 | 	bool verbose = false) {
 71 | 	// Record sizes
 72 | 	const auto n = y.rows();
 73 | 	const auto m = y.cols();
 74 | 	const auto p = x.cols();
 75 | 
 76 | 	if (n <= 0 || m <= 0 || p <= 0 || x.rows() <= 0) {
 77 | 		Rcpp::stop("COOP LASSO: Matrix dimensions of y and x need to be positive.");
 78 | 	}
 79 | 
 80 | 	if (x.rows() != n) {
 81 | 		Rcpp::stop("y and x need to have the same number of rows.");
 82 | 	}
 83 | 
 84 | 	Matd beta = Eigen::MatrixXd::Zero(p, m);
 85 | 	if (beta_0.isUsable()) {
 86 | 		if (static_cast<Eigen::Index>(beta_0.as().nrow()) != p ||
 87 | 			static_cast<Eigen::Index>(beta_0.as().ncol()) != m) {
 88 | 			Rcpp::stop("beta_0 needs to be of size p x m");
 89 | 		}
 90 | 
 91 | 		beta =
 92 | 			Eigen::Map<Matd>(beta_0.as().begin(), beta_0.as().nrow(), beta_0.as().ncol());
 93 | 	}
 94 | 
 95 | 	// Pre-compute some quantities to speed up computation
 96 | 	// Precompute X^T X
 97 | 	const Matd xtx = compute_xtx(x);
 98 | 	// Precompute X^T Y
 99 | 	const Matd xty = x.transpose() * y;
100 | 
101 | 	// Ensure we are starting with a feasible point (i.e. zeta == beta)
102 | 	Matd zeta = beta;
103 | 	// For adaptive ADMM it is necessary to use unscaled multipliers
104 | 	// to get the computations right
105 | 	Matd mult = Eigen::MatrixXd::Zero(p, m);
106 | 
107 | 	// We need to save one old iterate for step-size/relaxation
108 | 	// parameter estimation
109 | 	Matd beta_old = beta;
110 | 	Matd zeta_old = zeta;
111 | 	Matd mult_old = mult;
112 | 	Matd mult_hat_old = mult;
113 | 
114 | 	// Initialize step-size and relaxation parameter
115 | 	if (rho_0 <= 0.0) {
116 | 		Rcpp::stop("rho_0 > 0 needs to hold");
117 | 	}
118 | 	auto rho = rho_0;
119 | 	double rho_old = 0.0;  // Ensure rho_old != rho in first iteration
120 | 	if (alpha_0 <= 0.0 || alpha_0 > 2.0) {
121 | 		Rcpp::stop("0 < alpha_0 < 2 needs to hold");
122 | 	}
123 | 	auto alpha = alpha_0;
124 | 
125 | 	// Iteration counter
126 | 	int it = 1;
127 | 	// Parameter update counter
128 | 	int pc = 0;
129 | 
130 | 	Matd xtx_rho_eye = xtx;
131 | 	Eigen::LDLT<Eigen::MatrixXd> ldlt;
132 | 
133 | 	// ADMM algorithm
134 | 	while (true) {
135 | 		// Precompute if necessary
136 | 		if (pc == 0 || rho != rho_old) {
137 | 			// Only diagonal needs to be updated
138 | 			xtx_rho_eye.diagonal() = xtx.diagonal().array() + rho;
139 | 			ldlt.compute(xtx_rho_eye);
140 | 		}
141 | 
142 | 		// Step 1: Update beta using pre-computed quantities
143 | 		beta = ldlt.solve(xty + rho * zeta + mult);
144 | 
145 | 		// Relaxation step
146 | 		const Matd beta_relaxed = alpha * beta + (1.0 - alpha) * zeta;
147 | 
148 | 		// Step 2: Update zeta
149 | 		Matd zeta_new = (beta_relaxed - mult / rho);
150 | 
151 | 		const Arr1d shrink_pos =
152 | 			(1.0 -
153 | 			 lambda * weights / (rho * (zeta_new.cwiseMax(0.0).rowwise().norm()).array()))
154 | 				.max(0.0);
155 | 		const Arr1d shrink_neg =
156 | 			(1.0 -
157 | 			 lambda * weights / (rho * (zeta_new.cwiseMin(0.0).rowwise().norm()).array()))
158 | 				.max(0.0);
159 | 
160 | 		for (Eigen::Index j = 0; j < m; j++) {
161 | 			for (Eigen::Index i = 0; i < p; i++) {
162 | 				if (zeta_new(i, j) >= 0.0) {
163 | 					zeta_new(i, j) *= shrink_pos(i);
164 | 				} else {
165 | 					zeta_new(i, j) *= shrink_neg(i);
166 | 				}
167 | 			}
168 | 		}
169 | 
170 | 		// Step 3: Update multipliers
171 | 		mult += rho * (-beta_relaxed + zeta_new);
172 | 
173 | 		// Convergence check
174 | 		// Compute primal and dual residuals
175 | 		const Matd primal_resid = -beta + zeta_new;
176 | 		const Matd dual_resid = rho * (zeta - zeta_new);
177 | 
178 | 		if (verbose) {
179 | 			std::stringstream ss;
180 | 			ss << "\r#" << std::setw(5) << it << std::scientific << std::setprecision(4)
181 | 			   << " rho " << std::setw(5) << rho << " alpha " << std::setw(5) << alpha
182 | 			   << " prim_res " << std::setw(5) << primal_resid.norm() << " bnd "
183 | 			   << std::setw(5)
184 | 			   << fmax(eps_rel * fmax(beta.norm(), zeta_new.norm()), eps_abs)
185 | 			   << " dual_res " << std::setw(5) << dual_resid.norm() << " bnd "
186 | 			   << std::setw(5) << fmax(eps_rel * mult.norm(), eps_abs);
187 | 
188 | 			Rcpp::Rcout << ss.str();
189 | 		}
190 | 
191 | 		// Check residual convergence
192 | 		if ((primal_resid.norm() <=
193 | 			 fmax(eps_rel * fmax(beta.norm(), zeta_new.norm()), eps_abs)) &&
194 | 			(dual_resid.norm() <= fmax(eps_rel * mult.norm(), eps_abs))) {
195 | 			break;
196 | 		}
197 | 
198 | 		// Step-size/relaxation parameter update
199 | 		pc++;
200 | 		if (pc == n_update) {
201 | 			// The hatted multipliers use non-relaxed beta and the zetas from
202 | 			// the previous iteration
203 | 			const Matd mult_hat = mult + rho * (-beta + zeta);
204 | 
205 | 			const Matd delta_mult_hat = mult_hat - mult_hat_old;
206 | 			const Matd delta_h_hat = beta - beta_old;
207 | 
208 | 			const Matd delta_mult = mult - mult_old;
209 | 			const Matd delta_g_hat = zeta_old - zeta;
210 | 
211 | 			const auto norm_delta_mult_hat = delta_mult_hat.norm();
212 | 			const auto norm_delta_h_hat = delta_h_hat.norm();
213 | 			const auto norm_delta_mult = delta_mult.norm();
214 | 			const auto norm_delta_g_hat = delta_g_hat.norm();
215 | 
216 | 			double a = 0.0;
217 | 			double a_corr = 0.0;
218 | 
219 | 			if (norm_delta_mult_hat > 0.0 && norm_delta_h_hat > 0.0) {
220 | 				// Estimate local slope for h
221 | 				const auto delta_h_hat_delta_mult_hat =
222 | 					(delta_h_hat.array() * delta_mult_hat.array()).sum();
223 | 				const auto a_sd =
224 | 					delta_mult_hat.squaredNorm() / delta_h_hat_delta_mult_hat;
225 | 				const auto a_mg = delta_h_hat_delta_mult_hat / delta_h_hat.squaredNorm();
226 | 
227 | 				if (2.0 * a_mg > a_sd) {
228 | 					a = a_mg;
229 | 				} else {
230 | 					a = a_sd - a_mg / 2.0;
231 | 				}
232 | 
233 | 				a_corr =
234 | 					delta_h_hat_delta_mult_hat / (norm_delta_h_hat * norm_delta_mult_hat);
235 | 			}
236 | 
237 | 			double b = 0.0;
238 | 			double b_corr = 0.0;
239 | 
240 | 			if (norm_delta_mult > 0.0 && norm_delta_g_hat > 0.0) {
241 | 				// Estimate local slope for g
242 | 				const auto delta_g_hat_delta_mult =
243 | 					(delta_g_hat.array() * delta_mult.array()).sum();
244 | 				const auto b_sd = delta_mult.squaredNorm() / delta_g_hat_delta_mult;
245 | 				const auto b_mg = delta_g_hat_delta_mult / delta_g_hat.squaredNorm();
246 | 
247 | 				if (2.0 * b_mg > b_sd) {
248 | 					b = b_mg;
249 | 				} else {
250 | 					b = b_sd - b_mg / 2.0;
251 | 				}
252 | 
253 | 				b_corr = delta_g_hat_delta_mult / (norm_delta_g_hat * norm_delta_mult);
254 | 			}
255 | 
256 | 			// Store old rho to check whether it changed and we need to
257 | 			// update pre-computed quantities
258 | 			rho_old = rho;
259 | 
260 | 			// Update step-size if appropriate
261 | 			if (a_corr > eps_corr && b_corr > eps_corr) {
262 | 				rho = sqrt(a * b);
263 | 			} else if (a_corr > eps_corr && b_corr <= eps_corr) {
264 | 				rho = a;
265 | 			} else if (a_corr <= eps_corr && b_corr > eps_corr) {
266 | 				rho = b;
267 | 			}
268 | 			// Else: Leave rho as is
269 | 
270 | 			// Update relaxation parameter if appropriate
271 | 			if (a_corr > eps_corr && b_corr > eps_corr) {
272 | 				alpha = 1.0 + 2.0 / (sqrt(a * b) * (1.0 / a + 1.0 / b));
273 | 			} else if (a_corr > eps_corr && b_corr <= eps_corr) {
274 | 				alpha = 1.9;
275 | 			} else if (a_corr <= eps_corr && b_corr > eps_corr) {
276 | 				alpha = 1.1;
277 | 			} else {
278 | 				alpha = 1.5;
279 | 			}
280 | 
281 | 			// House-keeping
282 | 			beta_old = beta;
283 | 			zeta_old = zeta_new;
284 | 			mult_old = mult;
285 | 			mult_hat_old = mult_hat;
286 | 
287 | 			// Reset counter
288 | 			pc = 0;
289 | 		}
290 | 
291 | 		// House-keeping
292 | 		zeta = zeta_new;
293 | 
294 | 		// Check iteration limit
295 | 		it++;
296 | 		if (it > max_iter) {
297 | 			if (verbose) {
298 | 				Rcpp::Rcout << std::endl;
299 | 			}
300 | 			Rcpp::Rcout << "Coop-Lasso: Maximum number of iterations reached";
301 | 			if (!verbose) {
302 | 				Rcpp::Rcout << std::endl;
303 | 			}
304 | 			break;
305 | 		}
306 | 
307 | 		Rcpp::checkUserInterrupt();
308 | 	}
309 | 
310 | 	if (verbose) {
311 | 		Rcpp::Rcout << std::endl;
312 | 	}
313 | 
314 | 	Rcpp::List out;
315 | 	Rcpp::NumericMatrix beta_(beta.rows(), beta.cols(), beta.data());
316 | 	out["beta"] = beta_;
317 | 	out["iterations"] = it;
318 | 
319 | 	return out;
320 | }
321 | 
322 | // static void remove_kkt_elements(const Matd& beta, const Matd& grad, Matd& grad_bar) {
323 | // 	const auto n = beta.rows();
324 | // 	const auto m = beta.cols();
325 | 
326 | // 	for (Eigen::Index j = 0; j < m; j++) {
327 | // 		for (Eigen::Index i = 0; i < n; i++) {
328 | // 	 		if ((beta(i, j) == 0.0) && (grad(i, j) > 0.0)) {
329 | // 	 			grad_bar(i, j) = 0.0;
330 | // 	 		}
331 | // 		}
332 | // 	}
333 | // }
334 | 
335 | static void greedy_coord_descent(const Matd& Q, Matd& beta, Matd& grad) {
336 | 	const auto n = beta.rows();
337 | 	const auto m = beta.cols();
338 | 
339 | 	for (Eigen::Index t = 0; t < n; t++) {
340 | 		Eigen::Index empty_passive_sets = 0;
341 | 
342 | 		for (Eigen::Index j = 0; j < m; j++) {
343 | 			// Determine maximum absolute gradient over passive set
344 | 			Eigen::Index p = -1;
345 | 			auto max_val = (((beta.col(j).array() > 0.0) || (grad.col(j).array() < 0.0))
346 | 								.cast<double>() *
347 | 							grad.col(j).array().abs())
348 | 							   .maxCoeff(&p);
349 | 
350 | 			// Eigen::Index p = -1;
351 | 			// double max_val = 0.0;
352 | 			// for (Eigen::Index i = 0; i < n; i++) {
353 | 			// 	if ((beta(i, j) > 0.0) || (grad(i, j) < 0.0)) {
354 | 			// 		auto abs_grad = fabs(grad(i, j));
355 | 			// 		if (abs_grad > max_val) {
356 | 			// 			max_val = abs_grad;
357 | 			// 			p = i;
358 | 			// 		}
359 | 			// 	}
360 | 			// }
361 | 
362 | 			// Perform coordinate descent on the selected coefficient
363 | 			if (max_val == 0.0) {
364 | 				empty_passive_sets++;
365 | 				continue;
366 | 			}
367 | 
368 | 			const auto dbeta = fmax(0.0, beta(p, j) - grad(p, j) / Q(p, p)) - beta(p, j);
369 | 			beta(p, j) += dbeta;
370 | 			grad.col(j) += dbeta * Q.col(p);
371 | 		}
372 | 
373 | 		if (empty_passive_sets == m) {
374 | 			break;
375 | 		}
376 | 	}
377 | }
378 | 
379 | //' Compute NNLS coefficients
380 | //'
381 | //' Computes non-negative least squares coefficients with a matrix
382 | //' right hand side.
383 | //'
384 | //' @param x Coefficient matrix (p x n matrix)
385 | //' @param y Right hand side (p x m matrix)
386 | //' @param eps Convergence tolerance
387 | //' @param max_iter Maximum number of iterations
388 | //'
389 | //' @return A list containing
390 | //'  \item{beta}{The estimated coefficient matrix}
391 | //'  \item{iterations}{A vector containing the number of iterations needed
392 | //'                    for the `i`-th column in `y` in the `i`-th entry.}
393 | //'
394 | //' @references
395 | //' Duy Khuong Nguyen and Tu Bao Ho. Accelerated anti-lopsided algorithm
396 | //' for nonnegative least squares. International Journal of Data Science
397 | //' and Analytics, 3(1):23–34, 2017.
398 | //'
399 | //' Adapted from <https://github.com/khuongnd/nnls_antilopsided>
400 | //'
401 | //' @keywords internal
402 | // [[Rcpp::export]]
403 | Rcpp::List coef_nnls(Eigen::Map<Eigen::MatrixXd> x, Eigen::Map<Eigen::MatrixXd> y,
404 | 					 double eps = 1e-12, int max_iter = 1000L) {
405 | 	const auto n = x.cols();
406 | 	auto m = y.cols();	// Will be reduced whenever right-hand sides reach convergence
407 | 
408 | 	if (n <= 0 || m <= 0 || x.rows() <= 0 || y.rows() <= 0) {
409 | 		Rcpp::stop("NNLS: Matrix dimensions of y and x need to be positive.");
410 | 	}
411 | 
412 | 	// Pre-compute some quantities to speed up computation
413 | 	// Precompute X^T X
414 | 	const Matd xtx = compute_xtx(x);
415 | 
416 | 	const Vecd inv_sqrt_diag_xtx = 1.0 / xtx.diagonal().array().sqrt();
417 | 
418 | 	const Matd Q =
419 | 		xtx.array() * (inv_sqrt_diag_xtx * inv_sqrt_diag_xtx.transpose()).array();
420 | 	// Multiply -x^T y row-wise by the elements in inv_sqrt_diag_xtx
421 | 	Matd grad = (-x.transpose() * y).array().colwise() * inv_sqrt_diag_xtx.array();
422 | 
423 | 	Matd beta_final = Eigen::MatrixXd::Zero(n, m);
424 | 	Matd beta = Eigen::MatrixXd::Zero(n, m);
425 | 	Matd grad_bar = grad;
426 | 	// remove_kkt_elements(beta, grad, grad_bar);
427 | 	grad_bar.array() *=
428 | 		(1 - ((beta.array() == 0.0) && (grad.array() > 0.0))).cast<double>();
429 | 
430 | 	// Save necessary number of iterations
431 | 	std::vector<int> iterations(static_cast<std::vector<int>::size_type>(m));
432 | 	std::fill(iterations.begin(), iterations.end(), max_iter);
433 | 
434 | 	std::list<int> remaining_obs(static_cast<std::list<int>::size_type>(m));
435 | 	std::iota(remaining_obs.begin(), remaining_obs.end(), 0);
436 | 
437 | 	for (int l = 0; l < max_iter; l++) {
438 | 		const Matd beta_save = beta;
439 | 		const Matd grad_save = grad;
440 | 
441 | 		// Exact line search algorithm over passive variables
442 | 		const Matd Q_grad_bar = Q * grad_bar;
443 | 		const Arr1d alpha1 = (grad_bar.colwise().squaredNorm()).array() /
444 | 							 (grad_bar.array() * Q_grad_bar.array()).colwise().sum();
445 | 
446 | 		for (Eigen::Index j = 0; j < m; j++) {
447 | 			const auto a = alpha1(j);
448 | 			if ((a == a) && fabs(a) >= 1e-20 && fabs(a) < 1e30) {
449 | 				beta.col(j) -= a * grad_bar.col(j);
450 | 				grad.col(j) -= a * Q_grad_bar.col(j);
451 | 				for (Eigen::Index i = 0; i < n; i++) {
452 | 					if (beta(i, j) < 0.0) {
453 | 						// Correct for negative elements
454 | 						grad.col(j) -= beta(i, j) * Q.col(i);
455 | 						beta(i, j) = 0.0;  // Remove them from updated iterate
456 | 					}
457 | 				}
458 | 			}
459 | 		}
460 | 
461 | 		// Greedy coordinate descent algorithm (First time)
462 | 		greedy_coord_descent(Q, beta, grad);
463 | 
464 | 		// Accelerated search
465 | 		const Matd dbeta = beta_save - beta;
466 | 		const Matd Q_dbeta = Q * dbeta;
467 | 		const Arr1d alpha2 = (grad.array() * dbeta.array()).square().colwise().sum() /
468 | 							 (dbeta.array() * Q_dbeta.array()).colwise().sum();
469 | 
470 | 		for (Eigen::Index j = 0; j < m; j++) {
471 | 			const auto a = alpha2(j);
472 | 			if ((a == a) && fabs(a) >= 1e-20 && fabs(a) < 1e30) {
473 | 				beta.col(j) -= a * dbeta.col(j);
474 | 				grad.col(j) -= a * Q_dbeta.col(j);
475 | 				for (Eigen::Index i = 0; i < n; i++) {
476 | 					if (beta(i, j) < 0) {
477 | 						// Correct for negative elements
478 | 						grad.col(j) -= beta(i, j) * Q.col(i);
479 | 						beta(i, j) = 0.0;  // Remove them from updated iterate
480 | 					}
481 | 				}
482 | 			}
483 | 		}
484 | 
485 | 		// Greedy coordinate descent algorithm (Second time)
486 | 		greedy_coord_descent(Q, beta, grad);
487 | 
488 | 		// Compute error
489 | 		grad_bar = grad;
490 | 		// remove_kkt_elements(beta, grad, grad_bar);
491 | 		grad_bar.array() *=
492 | 			(1 - ((beta.array() == 0.0) && (grad.array() > 0.0))).cast<double>();
493 | 
494 | 		// Check for which rhs convergence has been achieved
495 | 		const Arr1d grad_bar_norms = grad_bar.colwise().norm();
496 | 		std::vector<Eigen::Index> kept_cols;
497 | 		kept_cols.reserve(remaining_obs.size());
498 | 
499 | 		auto it = remaining_obs.begin();
500 | 		for (Eigen::Index i = 0; i < m; i++) {
501 | 			if (grad_bar_norms(i) < eps) {
502 | 				beta_final.col(*it) = beta.col(i);
503 | 				iterations[static_cast<std::vector<int>::size_type>(*it)] = l + 1;
504 | 				it = remaining_obs.erase(it);
505 | 			} else {
506 | 				kept_cols.push_back(i);
507 | 				it++;
508 | 			}
509 | 		}
510 | 
511 | 		// Reduce problem to those rhs where convergence has not yet occurred
512 | 		m = static_cast<Eigen::Index>(kept_cols.size());
513 | 		if (m > static_cast<Eigen::Index>(0)) {
514 | 			// Use that columns in kept_cols are sorted by construction
515 | 			Eigen::Index j = 0;
516 | 			for (auto& i : kept_cols) {
517 | 				if (j != i) {
518 | 					beta.col(j) = beta.col(i);
519 | 					grad.col(j) = grad.col(i);
520 | 					grad_bar.col(j) = grad_bar.col(i);
521 | 				}
522 | 				j++;
523 | 			}
524 | 
525 | 			beta.conservativeResize(Eigen::NoChange, m);
526 | 			grad.conservativeResize(Eigen::NoChange, m);
527 | 			grad_bar.conservativeResize(Eigen::NoChange, m);
528 | 		} else {
529 | 			break;
530 | 		}
531 | 
532 | 		if (l == max_iter - 1) {
533 | 			Rcpp::Rcout << "NNLS: Maximum number of iterations reached" << std::endl;
534 | 		}
535 | 
536 | 		Rcpp::checkUserInterrupt();
537 | 	}
538 | 
539 | 	// Re-scale to original scale
540 | 	beta_final.array().colwise() *= inv_sqrt_diag_xtx.array();
541 | 
542 | 	Rcpp::List out;
543 | 	Rcpp::NumericMatrix beta_final_(beta_final.rows(), beta_final.cols(),
544 | 									beta_final.data());
545 | 	out["beta"] = beta_final_;
546 | 	out["iterations"] = iterations;
547 | 
548 | 	return out;
549 | }
550 | 


--------------------------------------------------------------------------------
/R/plotting.R:
--------------------------------------------------------------------------------
  1 | #' Plotting the regulatory table from scregclust as a directed graph
  2 | #'
  3 | #' @param output Object of type `scregclust_output` from a fit of the
  4 | #'               scregclust algorithm.
  5 | #' @param arrow_size Size of arrow head
  6 | #' @param edge_scaling Scaling factor for edge width
  7 | #' @param no_links Threshold value (0-10) for number of edges to show,
  8 | #'                 higher value = more stringent threshold = less edges
  9 | #' @param col color
 10 | #'
 11 | #' @return Graph with gene modules and regulators as nodes
 12 | #'
 13 | #' @concept plotting
 14 | #'
 15 | #' @export
 16 | plot_regulator_network <- function(output,
 17 |                                    arrow_size = 0.3,
 18 |                                    edge_scaling = 30,
 19 |                                    no_links = 6,
 20 |                                    col = c(
 21 |                                      "gray80",
 22 |                                      "#FC7165",
 23 |                                      "#BD828C",
 24 |                                      "#9D8A9F",
 25 |                                      "#7D92B2",
 26 |                                      "#BDA88C",
 27 |                                      "#FCBD65",
 28 |                                      "#F2BB90",
 29 |                                      "#E7B9BA",
 30 |                                      "#BDB69C",
 31 |                                      "#92B27D",
 32 |                                      "#9B8BA5",
 33 |                                      "#9D7DB2",
 34 |                                      "#94A5BF"
 35 |                                    )) {
 36 |   reg_table <- output$reg_table
 37 |   idx <- !is.na(colSums(reg_table))
 38 |   reg_table <- reg_table[, idx]
 39 | 
 40 |   regulators <- c()
 41 |   for (i in seq_len(ncol(reg_table))) {
 42 |     tmp1 <- head(rownames(
 43 |       reg_table[order(reg_table[, i], decreasing = TRUE), ]
 44 |     ))
 45 |     regulators <- append(regulators, tmp1)
 46 |     tmp2 <- tail(rownames(
 47 |       reg_table[order(reg_table[, i], decreasing = TRUE), ]
 48 |     ))
 49 |     regulators <- append(regulators, tmp2)
 50 |   }
 51 | 
 52 |   regulators <- unique(regulators)
 53 | 
 54 |   f <- which(rownames(reg_table) %in% regulators)
 55 |   reg_table <- reg_table[f, ]
 56 | 
 57 |   reg_table$regulator <- rownames(reg_table)
 58 |   rownames(reg_table) <- NULL
 59 | 
 60 |   links <- reshape::melt(reg_table, id.vars = "regulator")
 61 |   colnames(links) <- c("from", "to", "weight")
 62 |   f <- which(links$weight == 0)
 63 |   links <- links[-f, ]
 64 | 
 65 |   m <- which(links$weight < 0)
 66 |   p <- which(links$weight > 0)
 67 | 
 68 |   links$mode <- array(0, dim = c(nrow(links), 1))
 69 |   links$mode[m] <- "Repress"
 70 |   links$mode[p] <- "Activate"
 71 |   links$color <- array(0, dim = c(nrow(links), 1))
 72 |   links$color[m] <- "#2B278C"
 73 |   links$color[p] <- "#BD111F"
 74 |   links$weight <- abs(links$weight)
 75 | 
 76 |   links <- as.data.frame(links)
 77 | 
 78 |   rownames(reg_table) <- reg_table$regulator
 79 |   reg_table <- reg_table[, -ncol(reg_table)]
 80 | 
 81 |   nodes <- array(0, dim = c((nrow(reg_table) + ncol(reg_table)), 2))
 82 |   colnames(nodes) <- c("id", "type")
 83 | 
 84 |   nodes[seq_len(nrow(reg_table)), 1] <- rownames(reg_table)
 85 |   nodes[seq_len(nrow(reg_table)), 2] <- "Regulator"
 86 |   nodes[(nrow(reg_table) + 1):nrow(nodes), 1] <- colnames(reg_table)
 87 |   nodes[(nrow(reg_table) + 1):nrow(nodes), 2] <- "TargetState"
 88 |   nodes <- as.data.frame(nodes)
 89 | 
 90 |   net <- igraph::graph_from_data_frame(
 91 |     d = links, vertices = nodes, directed = TRUE
 92 |   )
 93 | 
 94 |   igraph::V(net)[which(igraph::V(net)$type == "Regulator")]$shape <- 1
 95 |   igraph::V(net)[which(igraph::V(net)$type == "TargetState")]$shape <- 2
 96 | 
 97 |   igraph::V(net)[which(igraph::V(net)$type == "Regulator")]$type <- 1
 98 |   igraph::V(net)[which(igraph::V(net)$type == "TargetState")]$type <- (
 99 |     seq_len(ncol(reg_table))
100 |   )
101 | 
102 |   colrs <- col
103 |   igraph::V(net)$color <- colrs[as.numeric(igraph::V(net)$type)]
104 | 
105 |   cut.off <- quantile(links$weight, probs = seq(0, 1, 0.1))[no_links]
106 |   net <- igraph::delete_edges(net, igraph::E(net)[links$weight < cut.off])
107 | 
108 |   isolated <- which(igraph::degree(net) == 0)
109 |   net <- igraph::delete_vertices(net, isolated)
110 | 
111 |   igraph::E(net)$arrow.size <- arrow_size
112 |   igraph::V(net)$shape <- c("vrectangle", "circle")[
113 |     as.numeric(igraph::V(net)$shape)
114 |   ]
115 |   igraph::E(net)$width <- igraph::E(net)$weight * edge_scaling
116 | 
117 |   l <- igraph::layout_with_fr(net)
118 | 
119 |   plot(
120 |     net,
121 |     layout = l,
122 |     edge.curved = 0.3,
123 |     vertex.label.cex = .6,
124 |     vertex.label.color = "black",
125 |     alpha = 0.5
126 |   )
127 |   legend(
128 |     x = -1.1,
129 |     y = -0.8,
130 |     c("Activating", "Repressing"),
131 |     pch = 21,
132 |     col = "#777777",
133 |     pt.bg = c("#BD111F", "#2B278C"),
134 |     pt.cex = 2,
135 |     cex = .8,
136 |     bty = "n",
137 |     ncol = 1
138 |   )
139 | }
140 | 
#' Plot predictive \eqn{R^2} per module and regulator importance
#'
#' Draws boxplots of the predictive \eqn{R^2} per module and of the regulator
#' importance, one box per penalization parameter, pooled over all final
#' configurations stored in the fit. Missing values are left out.
#'
#' @param x An object of class `scregclust`.
#' @param ... Unused.
#'
#' @return A ggplot2 plot.
#'
#' @concept plotting
#'
#' @export
plot.scregclust <- function(x, ...) {
  r2_module_data <- do.call(rbind, lapply(x$results, function(r) {
    do.call(rbind, lapply(r$output, function(o) {
      modules <- which(!is.na(o$r2_module))
      # data.frame() cannot recycle the scalar penalization against
      # zero-length columns, so skip configurations without any value.
      if (length(modules) == 0L) {
        return(NULL)
      }

      data.frame(
        penalization = r$penalization,
        module = modules,
        value = o$r2_module[modules]
      )
    }))
  }))
  if (!is.null(r2_module_data)) {
    r2_module_data$variable <- "r2-per-module"
  }

  importance_data <- do.call(rbind, lapply(x$results, function(r) {
    do.call(rbind, lapply(r$output, function(o) {
      do.call(rbind, lapply(seq_len(ncol(o$models)), function(i) {
        available <- !is.na(o$importance[, i])
        if (!any(available)) {
          return(NULL)
        }

        data.frame(
          penalization = r$penalization,
          module = i,
          value = o$importance[available, i]
        )
      }))
    }))
  }))
  if (!is.null(importance_data)) {
    importance_data$variable <- "importance"
  }

  # rbind() ignores NULL parts, so either panel may be missing.
  plot_data <- rbind(r2_module_data, importance_data)
  if (is.null(plot_data)) {
    cli::cli_abort(c(
      "There is nothing to plot.",
      "x" = "All predictive R2 and importance values are missing."
    ))
  }
  plot_data$penalization <- factor(
    plot_data$penalization, levels = x$penalization
  )

  plot_data |>
    ggplot2::ggplot() +
    ggplot2::facet_wrap(
      variable ~ .,
      nrow = 2,
      scales = "free_y",
      strip.position = "left",
      labeller = ggplot2::label_bquote(
        .(
          if (variable == "importance") {
            "Regulator Importance"
          } else {
            "Predictive" ~ R^2 ~ "per module"
          }
        )
      )
    ) +
    ggplot2::geom_boxplot(
      ggplot2::aes(x = .data$penalization, y = .data$value),
      outlier.size = 0.5,
      lwd = 0.25
    ) +
    ggplot2::labs(x = "Penalization", y = NULL) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      axis.line = ggplot2::element_line(
        arrow = grid::arrow(length = grid::unit(1, "mm"))
      ),
      strip.background = ggplot2::element_blank(),
      strip.placement = "outside",
      line = ggplot2::element_line(linewidth = 0.25),
      plot.margin = ggplot2::margin(t = 2, unit = "mm")
    )
}
218 | 
# Collect the silhouette scores of all fits in one long data frame.
#
# For every fit in `list_of_fits`, every element of `fit$results` and every
# final configuration in its `output`, the target genes (those that are not
# regulators) assigned to modules 1, ..., n_modules are listed module by
# module and, within a module, by increasing silhouette score. Genes with any
# other module label (e.g. -1 or NA) are left out.
#
# Returns a data frame with the columns `order` (position within the
# configuration), `gene` (index among the target genes), `silhouette`,
# `module` (factor), `n_modules`, `output` (index of the final configuration)
# and `penalization`. Configurations without any gene in a module contribute
# no rows; NULL is returned if nothing is left at all.
collect_silhouette_data <- function(list_of_fits) {
  do.call(rbind, lapply(list_of_fits, function(fit) {
    do.call(rbind, lapply(fit$results, function(r) {
      do.call(rbind, lapply(seq_along(r$output), function(j) {
        o <- r$output[[j]]
        k <- o$module[!r$is_regulator]

        clustered <- which(k %in% seq_len(r$n_modules))
        # data.frame() cannot combine zero-length columns with the scalar
        # columns below, so an entirely empty configuration is skipped.
        if (length(clustered) == 0L) {
          return(NULL)
        }

        # Sort by module first and by silhouette score within each module;
        # ties keep the original gene order.
        gene <- clustered[order(k[clustered], o$silhouette[clustered])]

        data.frame(
          order = seq_along(gene),
          gene = gene,
          silhouette = o$silhouette[gene],
          module = as.factor(k[gene]),
          n_modules = r$n_modules,
          output = j,
          penalization = r$penalization
        )
      }))
    }))
  }))
}
251 | 
#' Plot individual silhouette scores
#'
#' Shows the silhouette score of every clustered target gene as a horizontal
#' bar, grouped by module, with one panel per supplied fit. The dashed red
#' line marks the average silhouette score of the panel.
#'
#' @param list_of_fits A list of `scregclust` objects each fit to the same
#'                     dataset across a variety of module counts (varying
#'                     `n_modules` when running [`scregclust`]).
#' @param penalization Either a single numeric value requesting the results
#'                     for the same penalty parameter across all fits in
#'                     `list_of_fits`, or one for each individual fit.
#' @param final_config The final configuration that should be visualized.
#'                     Either a single number to be used for all fits in
#'                     `list_of_fits`, or one for each individual fit.
#'
#' @return A ggplot2 plot showing the silhouette scores for each
#'         supplied fit.
#'
#' @concept plotting
#'
#' @export
plot_silhouettes <- function(list_of_fits, penalization, final_config = 1L) {
  if (!(
    is.list(list_of_fits)
    && all(vapply(list_of_fits, inherits, logical(1), what = "scregclust"))
  )) {
    cli::cli_abort(c(
      "{.var list_of_fits} is not supplied correctly.",
      "x" = "It needs to be a list of {.cls scregclust} objects."
    ))
  }

  n_fits <- length(list_of_fits)

  if (!(
    is.numeric(penalization)
    && (
      (
        length(penalization) == 1L
        && all(vapply(list_of_fits, function(fit) {
          penalization %in% fit$penalization
        }, logical(1)))
      ) || (
        length(penalization) == n_fits
        && all(mapply(function(fit, p) {
          p %in% fit$penalization
        }, list_of_fits, penalization))
      )
    )
  )) {
    cli::cli_abort(c(
      "{.var penalization} is not supplied correctly.",
      "x" = "It needs to be one of the following two:",
      "*" = "A single penalization parameter used in all fits.",
      "*" = (
        "A list of penalization parameters, exactly one for each supplied fit."
      )
    ))
  }

  # TODO: Only the shape of `final_config` is validated. Checking that each
  #       requested configuration exists in the respective fit is pending.
  if (!(
    is.numeric(final_config)
    && length(final_config) %in% c(1L, n_fits)
  )) {
    cli::cli_abort(c(
      "{.var final_config} is not supplied correctly.",
      "x" = "It needs to be one of the following two:",
      "*" = "A single final_config parameter used in all fits.",
      "*" = (
        "A list of final_config parameters, exactly one for each supplied fit."
      )
    ))
  }

  silhouette_missing <- unlist(lapply(list_of_fits, function(fit) {
    lapply(fit$results, function(res) {
      vapply(res$output, function(o) is.null(o$silhouette), logical(1))
    })
  }))
  if (any(silhouette_missing)) {
    cli::cli_abort(c(
      "Silhouette scores were not computed during fitting.",
      "i" = "Set `compute_silhouette = TRUE` in `scregclust`"
    ))
  }

  silhouette_data <- collect_silhouette_data(list_of_fits)
  # Fits are told apart by their module count further below.
  module_counts <- vapply(
    list_of_fits, function(fit) fit$results[[1]]$n_modules, numeric(1)
  )

  silhouette_data$n_modules_lbl <- as.factor(
    sprintf("K = %d", silhouette_data$n_modules)
  )

  # Keep the rows where `column` equals the requested value; `values` holds
  # either one value for all fits or one value per fit.
  select_rows <- function(data, column, values) {
    if (length(values) == 1L) {
      return(data[data[[column]] == values, , drop = FALSE])
    }
    do.call(rbind, lapply(seq_along(module_counts), function(i) {
      keep <- (
        data$n_modules == module_counts[i] & data[[column]] == values[i]
      )
      # The trailing comma matters: without it the logical vector would
      # select columns instead of rows.
      data[keep, , drop = FALSE]
    }))
  }

  silhouette_data <- select_rows(silhouette_data, "penalization", penalization)
  silhouette_data <- select_rows(silhouette_data, "output", final_config)

  # Position of the module labels: centre of each module's block of bars
  module_centers <- do.call(rbind, lapply(module_counts, function(n_modules) {
    df <- silhouette_data[silhouette_data$n_modules == n_modules, ]
    if (nrow(df) == 0L) {
      return(NULL)
    }
    contained_modules <- unique(df$module)

    data.frame(
      n_modules = n_modules,
      module = contained_modules,
      order = vapply(as.character(contained_modules), function(cl) {
        mean(df$order[df$module == cl])
      }, numeric(1), USE.NAMES = FALSE)
    )
  }))
  module_centers$n_modules_lbl <- as.factor(
    sprintf("K = %d", module_centers$n_modules)
  )

  avg_silhouette <- data.frame(
    n_modules = module_counts,
    silhouette = vapply(module_counts, function(n_modules) {
      mean(silhouette_data$silhouette[silhouette_data$n_modules == n_modules])
    }, numeric(1))
  )
  avg_silhouette$n_modules_lbl <- as.factor(
    sprintf("K = %d", avg_silhouette$n_modules)
  )

  silhouette_data |>
    ggplot2::ggplot() +
    ggplot2::facet_wrap(n_modules_lbl ~ .) +
    ggplot2::geom_bar(
      ggplot2::aes(x = .data$order, y = .data$silhouette, fill = .data$module),
      stat = "identity"
    ) +
    ggplot2::geom_text(
      ggplot2::aes(x = .data$order, y = -0.1, label = .data$module),
      data = module_centers
    ) +
    ggplot2::geom_hline(
      ggplot2::aes(yintercept = .data$silhouette),
      data = avg_silhouette,
      linetype = "dashed",
      color = "red",
      linewidth = 0.25
    ) +
    ggplot2::coord_flip() +
    ggplot2::scale_fill_discrete(guide = "none") +
    ggplot2::labs(x = "Module", y = "Silhouette score") +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      axis.text.y = ggplot2::element_blank()
    )
}
432 | 
#' Plot average silhouette scores and average predictive \eqn{R^2}
#'
#' For every supplied fit, the silhouette scores and the predictive
#' \eqn{R^2} per module are averaged over all final configurations obtained
#' with the requested penalization parameter and plotted against the number
#' of modules.
#'
#' @param list_of_fits A list of `scregclust` objects each fit to the same
#'                     dataset across a variety of module counts (varying
#'                     `n_modules` while running [`scregclust`]).
#' @param penalization Either a single numeric value requesting the results
#'                     for the same penalty parameter across all fits in
#'                     `list_of_fits`, or one for each individual fit.
#'
#' @return A ggplot2 plot showing the average silhouette score and the
#'         average predictive \eqn{R^2}
#'
#' @concept plotting
#'
#' @export
plot_module_count_helper <- function(list_of_fits, penalization) {
  if (!(
    is.list(list_of_fits)
    && all(vapply(list_of_fits, inherits, logical(1), what = "scregclust"))
  )) {
    cli::cli_abort(c(
      "{.var list_of_fits} is not supplied correctly.",
      "x" = "It needs to be a list of {.cls scregclust} objects."
    ))
  }

  if (!(
    is.numeric(penalization)
    && (
      (
        length(penalization) == 1L
        && all(vapply(list_of_fits, function(fit) {
          penalization %in% fit$penalization
        }, logical(1)))
      ) || (
        length(penalization) == length(list_of_fits)
        && all(mapply(function(fit, p) {
          p %in% fit$penalization
        }, list_of_fits, penalization))
      )
    )
  )) {
    cli::cli_abort(c(
      "{.var penalization} is not supplied correctly.",
      "x" = "It needs to be one of the following two:",
      "*" = "A single penalization parameter used in all fits.",
      "*" = (
        "A list of penalization parameters, exactly one for each supplied fit."
      )
    ))
  }

  silhouette_missing <- unlist(lapply(list_of_fits, function(fit) {
    lapply(fit$results, function(res) {
      vapply(res$output, function(o) is.null(o$silhouette), logical(1))
    })
  }))
  if (any(silhouette_missing)) {
    cli::cli_abort(c(
      "Silhouette scores were not computed during fitting.",
      "i" = "Set `compute_silhouette = TRUE` in `scregclust`"
    ))
  }

  silhouette_data <- collect_silhouette_data(list_of_fits)

  avg_r2_module_data <- do.call(rbind, lapply(list_of_fits, function(fit) {
    do.call(rbind, lapply(fit$results, function(r) {
      # Pool the modules of all final configurations before averaging
      r2_module <- unlist(lapply(r$output, function(o) o$r2_module))

      # If a module is empty then r2_module is NA, so use NA remove
      value <- mean(r2_module, na.rm = TRUE)
      # If all modules turn out to be empty (e.g. too high penalization) then
      # mean(...) above will evaluate to NaN. Do not return a data.frame
      # in that case.
      if (is.nan(value)) {
        return(NULL)
      }

      data.frame(
        n_modules = r$n_modules,
        penalization = r$penalization,
        value = value,
        variable = "avg-r2-module"
      )
    }))
  }))

  # Fits are told apart by their module count further below.
  module_counts <- vapply(
    list_of_fits, function(fit) fit$results[[1]]$n_modules, numeric(1)
  )

  # Keep the rows belonging to the requested penalization parameter(s).
  # `data` may be NULL if no value could be collected at all.
  select_penalization <- function(data) {
    if (is.null(data)) {
      return(NULL)
    }
    if (length(penalization) == 1L) {
      return(data[data$penalization == penalization, , drop = FALSE])
    }
    do.call(rbind, lapply(seq_along(module_counts), function(i) {
      keep <- (
        data$n_modules == module_counts[i]
        & data$penalization == penalization[i]
      )
      # The trailing comma matters: without it the logical vector would
      # select columns instead of rows.
      data[keep, , drop = FALSE]
    }))
  }

  silhouette_data <- select_penalization(silhouette_data)
  avg_r2_module_data <- select_penalization(avg_r2_module_data)

  # Average across genes and across different configurations
  avg_silhouette <- vapply(module_counts, function(n_modules) {
    mean(silhouette_data$silhouette[silhouette_data$n_modules == n_modules])
  }, numeric(1))

  rbind(
    data.frame(
      n_modules = module_counts,
      penalization = penalization,
      value = avg_silhouette,
      variable = "avg-silhouette"
    ),
    avg_r2_module_data
  ) |>
    ggplot2::ggplot() +
    ggplot2::facet_wrap(
      variable ~ .,
      nrow = 2,
      scales = "free_y",
      strip.position = "left",
      labeller = ggplot2::label_bquote(
        .(
          if (variable == "avg-silhouette") {
            "Average silhouette score"
          } else {
            "Avg. pred." ~ R^2 ~ "per module"
          }
        )
      )
    ) +
    ggplot2::geom_line(
      ggplot2::aes(.data$n_modules, .data$value), linewidth = 0.25
    ) +
    ggplot2::geom_point(
      ggplot2::aes(.data$n_modules, .data$value), size = 0.5
    ) +
    ggplot2::labs(x = "# of modules (K)", y = NULL) +
    ggplot2::scale_x_continuous(breaks = module_counts) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      axis.line = ggplot2::element_line(
        arrow = grid::arrow(length = grid::unit(1, "mm"))
      ),
      strip.background = ggplot2::element_blank(),
      strip.placement = "outside",
      line = ggplot2::element_line(linewidth = 0.25),
      plot.margin = ggplot2::margin(t = 2, unit = "mm")
    )
}
607 | 


--------------------------------------------------------------------------------
/datasets/humanTFs.txt:
--------------------------------------------------------------------------------
   1 | AC008770.3
   2 | AC023509.3
   3 | AC092835.1
   4 | AC138696.1
   5 | ADNP
   6 | ADNP2
   7 | AEBP1
   8 | AEBP2
   9 | AHCTF1
  10 | AHDC1
  11 | AHR
  12 | AHRR
  13 | AIRE
  14 | AKAP8
  15 | AKAP8L
  16 | AKNA
  17 | ALX1
  18 | ALX3
  19 | ALX4
  20 | ANHX
  21 | ANKZF1
  22 | AR
  23 | ARGFX
  24 | ARHGAP35
  25 | ARID2
  26 | ARID3A
  27 | ARID3B
  28 | ARID3C
  29 | ARID5A
  30 | ARID5B
  31 | ARNT
  32 | ARNT2
  33 | ARNTL
  34 | ARNTL2
  35 | ARX
  36 | ASCL1
  37 | ASCL2
  38 | ASCL3
  39 | ASCL4
  40 | ASCL5
  41 | ASH1L
  42 | ATF1
  43 | ATF2
  44 | ATF3
  45 | ATF4
  46 | ATF5
  47 | ATF6
  48 | ATF6B
  49 | ATF7
  50 | ATMIN
  51 | ATOH1
  52 | ATOH7
  53 | ATOH8
  54 | BACH1
  55 | BACH2
  56 | BARHL1
  57 | BARHL2
  58 | BARX1
  59 | BARX2
  60 | BATF
  61 | BATF2
  62 | BATF3
  63 | BAZ2A
  64 | BAZ2B
  65 | BBX
  66 | BCL11A
  67 | BCL11B
  68 | BCL6
  69 | BCL6B
  70 | BHLHA15
  71 | BHLHA9
  72 | BHLHE22
  73 | BHLHE23
  74 | BHLHE40
  75 | BHLHE41
  76 | BNC1
  77 | BNC2
  78 | BORCS8-MEF2B
  79 | BPTF
  80 | BRF2
  81 | BSX
  82 | C11orf95
  83 | CAMTA1
  84 | CAMTA2
  85 | CARF
  86 | CASZ1
  87 | CBX2
  88 | CC2D1A
  89 | CCDC169-SOHLH2
  90 | CCDC17
  91 | CDC5L
  92 | CDX1
  93 | CDX2
  94 | CDX4
  95 | CEBPA
  96 | CEBPB
  97 | CEBPD
  98 | CEBPE
  99 | CEBPG
 100 | CEBPZ
 101 | CENPA
 102 | CENPB
 103 | CENPBD1
 104 | CENPS
 105 | CENPT
 106 | CENPX
 107 | CGGBP1
 108 | CHAMP1
 109 | CHCHD3
 110 | CIC
 111 | CLOCK
 112 | CPEB1
 113 | CPXCR1
 114 | CREB1
 115 | CREB3
 116 | CREB3L1
 117 | CREB3L2
 118 | CREB3L3
 119 | CREB3L4
 120 | CREB5
 121 | CREBL2
 122 | CREBZF
 123 | CREM
 124 | CRX
 125 | CSRNP1
 126 | CSRNP2
 127 | CSRNP3
 128 | CTCF
 129 | CTCFL
 130 | CUX1
 131 | CUX2
 132 | CXXC1
 133 | CXXC4
 134 | CXXC5
 135 | DACH1
 136 | DACH2
 137 | DBP
 138 | DBX1
 139 | DBX2
 140 | DDIT3
 141 | DEAF1
 142 | DLX1
 143 | DLX2
 144 | DLX3
 145 | DLX4
 146 | DLX5
 147 | DLX6
 148 | DMBX1
 149 | DMRT1
 150 | DMRT2
 151 | DMRT3
 152 | DMRTA1
 153 | DMRTA2
 154 | DMRTB1
 155 | DMRTC2
 156 | DMTF1
 157 | DNMT1
 158 | DNTTIP1
 159 | DOT1L
 160 | DPF1
 161 | DPF3
 162 | DPRX
 163 | DR1
 164 | DRAP1
 165 | DRGX
 166 | DUX1
 167 | DUX3
 168 | DUX4
 169 | DUXA
 170 | DZIP1
 171 | E2F1
 172 | E2F2
 173 | E2F3
 174 | E2F4
 175 | E2F5
 176 | E2F6
 177 | E2F7
 178 | E2F8
 179 | E4F1
 180 | EBF1
 181 | EBF2
 182 | EBF3
 183 | EBF4
 184 | EEA1
 185 | EGR1
 186 | EGR2
 187 | EGR3
 188 | EGR4
 189 | EHF
 190 | ELF1
 191 | ELF2
 192 | ELF3
 193 | ELF4
 194 | ELF5
 195 | ELK1
 196 | ELK3
 197 | ELK4
 198 | EMX1
 199 | EMX2
 200 | EN1
 201 | EN2
 202 | EOMES
 203 | EPAS1
 204 | ERF
 205 | ERG
 206 | ESR1
 207 | ESR2
 208 | ESRRA
 209 | ESRRB
 210 | ESRRG
 211 | ESX1
 212 | ETS1
 213 | ETS2
 214 | ETV1
 215 | ETV2
 216 | ETV3
 217 | ETV3L
 218 | ETV4
 219 | ETV5
 220 | ETV6
 221 | ETV7
 222 | EVX1
 223 | EVX2
 224 | FAM170A
 225 | FAM200B
 226 | FBXL19
 227 | FERD3L
 228 | FEV
 229 | FEZF1
 230 | FEZF2
 231 | FIGLA
 232 | FIZ1
 233 | FLI1
 234 | FLYWCH1
 235 | FOS
 236 | FOSB
 237 | FOSL1
 238 | FOSL2
 239 | FOXA1
 240 | FOXA2
 241 | FOXA3
 242 | FOXB1
 243 | FOXB2
 244 | FOXC1
 245 | FOXC2
 246 | FOXD1
 247 | FOXD2
 248 | FOXD3
 249 | FOXD4
 250 | FOXD4L1
 251 | FOXD4L3
 252 | FOXD4L4
 253 | FOXD4L5
 254 | FOXD4L6
 255 | FOXE1
 256 | FOXE3
 257 | FOXF1
 258 | FOXF2
 259 | FOXG1
 260 | FOXH1
 261 | FOXI1
 262 | FOXI2
 263 | FOXI3
 264 | FOXJ1
 265 | FOXJ2
 266 | FOXJ3
 267 | FOXK1
 268 | FOXK2
 269 | FOXL1
 270 | FOXL2
 271 | FOXM1
 272 | FOXN1
 273 | FOXN2
 274 | FOXN3
 275 | FOXN4
 276 | FOXO1
 277 | FOXO3
 278 | FOXO4
 279 | FOXO6
 280 | FOXP1
 281 | FOXP2
 282 | FOXP3
 283 | FOXP4
 284 | FOXQ1
 285 | FOXR1
 286 | FOXR2
 287 | FOXS1
 288 | GABPA
 289 | GATA1
 290 | GATA2
 291 | GATA3
 292 | GATA4
 293 | GATA5
 294 | GATA6
 295 | GATAD2A
 296 | GATAD2B
 297 | GBX1
 298 | GBX2
 299 | GCM1
 300 | GCM2
 301 | GFI1
 302 | GFI1B
 303 | GLI1
 304 | GLI2
 305 | GLI3
 306 | GLI4
 307 | GLIS1
 308 | GLIS2
 309 | GLIS3
 310 | GLMP
 311 | GLYR1
 312 | GMEB1
 313 | GMEB2
 314 | GPBP1
 315 | GPBP1L1
 316 | GRHL1
 317 | GRHL2
 318 | GRHL3
 319 | GSC
 320 | GSC2
 321 | GSX1
 322 | GSX2
 323 | GTF2B
 324 | GTF2I
 325 | GTF2IRD1
 326 | GTF2IRD2
 327 | GTF2IRD2B
 328 | GTF3A
 329 | GZF1
 330 | HAND1
 331 | HAND2
 332 | HBP1
 333 | HDX
 334 | HELT
 335 | HES1
 336 | HES2
 337 | HES3
 338 | HES4
 339 | HES5
 340 | HES6
 341 | HES7
 342 | HESX1
 343 | HEY1
 344 | HEY2
 345 | HEYL
 346 | HHEX
 347 | HIC1
 348 | HIC2
 349 | HIF1A
 350 | HIF3A
 351 | HINFP
 352 | HIVEP1
 353 | HIVEP2
 354 | HIVEP3
 355 | HKR1
 356 | HLF
 357 | HLX
 358 | HMBOX1
 359 | HMG20A
 360 | HMG20B
 361 | HMGA1
 362 | HMGA2
 363 | HMGN3
 364 | HMX1
 365 | HMX2
 366 | HMX3
 367 | HNF1A
 368 | HNF1B
 369 | HNF4A
 370 | HNF4G
 371 | HOMEZ
 372 | HOXA1
 373 | HOXA10
 374 | HOXA11
 375 | HOXA13
 376 | HOXA2
 377 | HOXA3
 378 | HOXA4
 379 | HOXA5
 380 | HOXA6
 381 | HOXA7
 382 | HOXA9
 383 | HOXB1
 384 | HOXB13
 385 | HOXB2
 386 | HOXB3
 387 | HOXB4
 388 | HOXB5
 389 | HOXB6
 390 | HOXB7
 391 | HOXB8
 392 | HOXB9
 393 | HOXC10
 394 | HOXC11
 395 | HOXC12
 396 | HOXC13
 397 | HOXC4
 398 | HOXC5
 399 | HOXC6
 400 | HOXC8
 401 | HOXC9
 402 | HOXD1
 403 | HOXD10
 404 | HOXD11
 405 | HOXD12
 406 | HOXD13
 407 | HOXD3
 408 | HOXD4
 409 | HOXD8
 410 | HOXD9
 411 | HSF1
 412 | HSF2
 413 | HSF4
 414 | HSF5
 415 | HSFX1
 416 | HSFX2
 417 | HSFY1
 418 | HSFY2
 419 | IKZF1
 420 | IKZF2
 421 | IKZF3
 422 | IKZF4
 423 | IKZF5
 424 | INSM1
 425 | INSM2
 426 | IRF1
 427 | IRF2
 428 | IRF3
 429 | IRF4
 430 | IRF5
 431 | IRF6
 432 | IRF7
 433 | IRF8
 434 | IRF9
 435 | IRX1
 436 | IRX2
 437 | IRX3
 438 | IRX4
 439 | IRX5
 440 | IRX6
 441 | ISL1
 442 | ISL2
 443 | ISX
 444 | JAZF1
 445 | JDP2
 446 | JRK
 447 | JRKL
 448 | JUN
 449 | JUNB
 450 | JUND
 451 | KAT7
 452 | KCMF1
 453 | KCNIP3
 454 | KDM2A
 455 | KDM2B
 456 | KDM5B
 457 | KIN
 458 | KLF1
 459 | KLF10
 460 | KLF11
 461 | KLF12
 462 | KLF13
 463 | KLF14
 464 | KLF15
 465 | KLF16
 466 | KLF17
 467 | KLF2
 468 | KLF3
 469 | KLF4
 470 | KLF5
 471 | KLF6
 472 | KLF7
 473 | KLF8
 474 | KLF9
 475 | KMT2A
 476 | KMT2B
 477 | L3MBTL1
 478 | L3MBTL3
 479 | L3MBTL4
 480 | LBX1
 481 | LBX2
 482 | LCOR
 483 | LCORL
 484 | LEF1
 485 | LEUTX
 486 | LHX1
 487 | LHX2
 488 | LHX3
 489 | LHX4
 490 | LHX5
 491 | LHX6
 492 | LHX8
 493 | LHX9
 494 | LIN28A
 495 | LIN28B
 496 | LIN54
 497 | LMX1A
 498 | LMX1B
 499 | LTF
 500 | LYL1
 501 | MAF
 502 | MAFA
 503 | MAFB
 504 | MAFF
 505 | MAFG
 506 | MAFK
 507 | MAX
 508 | MAZ
 509 | MBD1
 510 | MBD2
 511 | MBD3
 512 | MBD4
 513 | MBD6
 514 | MBNL2
 515 | MECOM
 516 | MECP2
 517 | MEF2A
 518 | MEF2B
 519 | MEF2C
 520 | MEF2D
 521 | MEIS1
 522 | MEIS2
 523 | MEIS3
 524 | MEOX1
 525 | MEOX2
 526 | MESP1
 527 | MESP2
 528 | MGA
 529 | MITF
 530 | MIXL1
 531 | MKX
 532 | MLX
 533 | MLXIP
 534 | MLXIPL
 535 | MNT
 536 | MNX1
 537 | MSANTD1
 538 | MSANTD3
 539 | MSANTD4
 540 | MSC
 541 | MSGN1
 542 | MSX1
 543 | MSX2
 544 | MTERF1
 545 | MTERF2
 546 | MTERF3
 547 | MTERF4
 548 | MTF1
 549 | MTF2
 550 | MXD1
 551 | MXD3
 552 | MXD4
 553 | MXI1
 554 | MYB
 555 | MYBL1
 556 | MYBL2
 557 | MYC
 558 | MYCL
 559 | MYCN
 560 | MYF5
 561 | MYF6
 562 | MYNN
 563 | MYOD1
 564 | MYOG
 565 | MYPOP
 566 | MYRF
 567 | MYRFL
 568 | MYSM1
 569 | MYT1
 570 | MYT1L
 571 | MZF1
 572 | NACC2
 573 | NAIF1
 574 | NANOG
 575 | NANOGNB
 576 | NANOGP8
 577 | NCOA1
 578 | NCOA2
 579 | NCOA3
 580 | NEUROD1
 581 | NEUROD2
 582 | NEUROD4
 583 | NEUROD6
 584 | NEUROG1
 585 | NEUROG2
 586 | NEUROG3
 587 | NFAT5
 588 | NFATC1
 589 | NFATC2
 590 | NFATC3
 591 | NFATC4
 592 | NFE2
 593 | NFE2L1
 594 | NFE2L2
 595 | NFE2L3
 596 | NFE4
 597 | NFIA
 598 | NFIB
 599 | NFIC
 600 | NFIL3
 601 | NFIX
 602 | NFKB1
 603 | NFKB2
 604 | NFX1
 605 | NFXL1
 606 | NFYA
 607 | NFYB
 608 | NFYC
 609 | NHLH1
 610 | NHLH2
 611 | NKRF
 612 | NKX1-1
 613 | NKX1-2
 614 | NKX2-1
 615 | NKX2-2
 616 | NKX2-3
 617 | NKX2-4
 618 | NKX2-5
 619 | NKX2-6
 620 | NKX2-8
 621 | NKX3-1
 622 | NKX3-2
 623 | NKX6-1
 624 | NKX6-2
 625 | NKX6-3
 626 | NME2
 627 | NOBOX
 628 | NOTO
 629 | NPAS1
 630 | NPAS2
 631 | NPAS3
 632 | NPAS4
 633 | NR0B1
 634 | NR1D1
 635 | NR1D2
 636 | NR1H2
 637 | NR1H3
 638 | NR1H4
 639 | NR1I2
 640 | NR1I3
 641 | NR2C1
 642 | NR2C2
 643 | NR2E1
 644 | NR2E3
 645 | NR2F1
 646 | NR2F2
 647 | NR2F6
 648 | NR3C1
 649 | NR3C2
 650 | NR4A1
 651 | NR4A2
 652 | NR4A3
 653 | NR5A1
 654 | NR5A2
 655 | NR6A1
 656 | NRF1
 657 | NRL
 658 | OLIG1
 659 | OLIG2
 660 | OLIG3
 661 | ONECUT1
 662 | ONECUT2
 663 | ONECUT3
 664 | OSR1
 665 | OSR2
 666 | OTP
 667 | OTX1
 668 | OTX2
 669 | OVOL1
 670 | OVOL2
 671 | OVOL3
 672 | PA2G4
 673 | PATZ1
 674 | PAX1
 675 | PAX2
 676 | PAX3
 677 | PAX4
 678 | PAX5
 679 | PAX6
 680 | PAX7
 681 | PAX8
 682 | PAX9
 683 | PBX1
 684 | PBX2
 685 | PBX3
 686 | PBX4
 687 | PCGF2
 688 | PCGF6
 689 | PDX1
 690 | PEG3
 691 | PGR
 692 | PHF1
 693 | PHF19 
 694 | PHF20
 695 | PHF21A
 696 | PHOX2A
 697 | PHOX2B
 698 | PIN1
 699 | PITX1
 700 | PITX2
 701 | PITX3
 702 | PKNOX1
 703 | PKNOX2
 704 | PLAG1
 705 | PLAGL1
 706 | PLAGL2
 707 | PLSCR1
 708 | POGK
 709 | POU1F1
 710 | POU2AF1
 711 | POU2F1
 712 | POU2F2
 713 | POU2F3
 714 | POU3F1
 715 | POU3F2
 716 | POU3F3
 717 | POU3F4
 718 | POU4F1
 719 | POU4F2
 720 | POU4F3
 721 | POU5F1
 722 | POU5F1B
 723 | POU5F2
 724 | POU6F1
 725 | POU6F2
 726 | PPARA
 727 | PPARD
 728 | PPARG
 729 | PRDM1
 730 | PRDM10
 731 | PRDM12
 732 | PRDM13
 733 | PRDM14
 734 | PRDM15
 735 | PRDM16
 736 | PRDM2
 737 | PRDM4
 738 | PRDM5
 739 | PRDM6
 740 | PRDM8
 741 | PRDM9
 742 | PREB
 743 | PRMT3
 744 | PROP1
 745 | PROX1
 746 | PROX2
 747 | PRR12
 748 | PRRX1
 749 | PRRX2
 750 | PTF1A
 751 | PURA
 752 | PURB
 753 | PURG
 754 | RAG1
 755 | RARA
 756 | RARB
 757 | RARG
 758 | RAX
 759 | RAX2
 760 | RBAK
 761 | RBCK1
 762 | RBPJ
 763 | RBPJL
 764 | RBSN
 765 | REL
 766 | RELA
 767 | RELB
 768 | REPIN1
 769 | REST
 770 | REXO4
 771 | RFX1
 772 | RFX2
 773 | RFX3
 774 | RFX4
 775 | RFX5
 776 | RFX6
 777 | RFX7
 778 | RFX8
 779 | RHOXF1
 780 | RHOXF2
 781 | RHOXF2B
 782 | RLF
 783 | RORA
 784 | RORB
 785 | RORC
 786 | RREB1
 787 | RUNX1
 788 | RUNX2
 789 | RUNX3
 790 | RXRA
 791 | RXRB
 792 | RXRG
 793 | SAFB
 794 | SAFB2
 795 | SALL1
 796 | SALL2
 797 | SALL3
 798 | SALL4
 799 | SATB1
 800 | SATB2
 801 | SCMH1
 802 | SCML4
 803 | SCRT1
 804 | SCRT2
 805 | SCX
 806 | SEBOX
 807 | SETBP1
 808 | SETDB1
 809 | SETDB2
 810 | SGSM2
 811 | SHOX
 812 | SHOX2
 813 | SIM1
 814 | SIM2
 815 | SIX1
 816 | SIX2
 817 | SIX3
 818 | SIX4
 819 | SIX5
 820 | SIX6
 821 | SKI
 822 | SKIL
 823 | SKOR1
 824 | SKOR2
 825 | SLC2A4RG
 826 | SMAD1
 827 | SMAD3
 828 | SMAD4
 829 | SMAD5
 830 | SMAD9
 831 | SMYD3
 832 | SNAI1
 833 | SNAI2
 834 | SNAI3
 835 | SNAPC2
 836 | SNAPC4
 837 | SNAPC5
 838 | SOHLH1
 839 | SOHLH2
 840 | SON
 841 | SOX1
 842 | SOX10
 843 | SOX11
 844 | SOX12
 845 | SOX13
 846 | SOX14
 847 | SOX15
 848 | SOX17
 849 | SOX18
 850 | SOX2
 851 | SOX21
 852 | SOX3
 853 | SOX30
 854 | SOX4
 855 | SOX5
 856 | SOX6
 857 | SOX7
 858 | SOX8
 859 | SOX9
 860 | SP1
 861 | SP100
 862 | SP110
 863 | SP140
 864 | SP140L
 865 | SP2
 866 | SP3
 867 | SP4
 868 | SP5
 869 | SP6
 870 | SP7
 871 | SP8
 872 | SP9
 873 | SPDEF
 874 | SPEN
 875 | SPI1
 876 | SPIB
 877 | SPIC
 878 | SPZ1
 879 | SRCAP
 880 | SREBF1
 881 | SREBF2
 882 | SRF
 883 | SRY
 884 | ST18
 885 | STAT1
 886 | STAT2
 887 | STAT3
 888 | STAT4
 889 | STAT5A
 890 | STAT5B
 891 | STAT6
 892 | T
 893 | TAL1
 894 | TAL2
 895 | TBP
 896 | TBPL1
 897 | TBPL2
 898 | TBR1
 899 | TBX1
 900 | TBX10
 901 | TBX15
 902 | TBX18
 903 | TBX19
 904 | TBX2
 905 | TBX20
 906 | TBX21
 907 | TBX22
 908 | TBX3
 909 | TBX4
 910 | TBX5
 911 | TBX6
 912 | TCF12
 913 | TCF15
 914 | TCF20
 915 | TCF21
 916 | TCF23
 917 | TCF24
 918 | TCF3
 919 | TCF4
 920 | TCF7
 921 | TCF7L1
 922 | TCF7L2
 923 | TCFL5
 924 | TEAD1
 925 | TEAD2
 926 | TEAD3
 927 | TEAD4
 928 | TEF
 929 | TERB1
 930 | TERF1
 931 | TERF2
 932 | TET1
 933 | TET2
 934 | TET3
 935 | TFAP2A
 936 | TFAP2B
 937 | TFAP2C
 938 | TFAP2D
 939 | TFAP2E
 940 | TFAP4
 941 | TFCP2
 942 | TFCP2L1
 943 | TFDP1
 944 | TFDP2
 945 | TFDP3
 946 | TFE3
 947 | TFEB
 948 | TFEC
 949 | TGIF1
 950 | TGIF2
 951 | TGIF2LX
 952 | TGIF2LY
 953 | THAP1
 954 | THAP10
 955 | THAP11
 956 | THAP12
 957 | THAP2
 958 | THAP3
 959 | THAP4
 960 | THAP5
 961 | THAP6
 962 | THAP7
 963 | THAP8
 964 | THAP9
 965 | THRA
 966 | THRB
 967 | THYN1
 968 | TIGD1
 969 | TIGD2
 970 | TIGD3
 971 | TIGD4
 972 | TIGD5
 973 | TIGD6
 974 | TIGD7
 975 | TLX1
 976 | TLX2
 977 | TLX3
 978 | TMF1
 979 | TOPORS
 980 | TP53
 981 | TP63
 982 | TP73
 983 | TPRX1
 984 | TRAFD1
 985 | TRERF1
 986 | TRPS1
 987 | TSC22D1
 988 | TSHZ1
 989 | TSHZ2
 990 | TSHZ3
 991 | TTF1
 992 | TWIST1
 993 | TWIST2
 994 | UBP1
 995 | UNCX
 996 | USF1
 997 | USF2
 998 | USF3
 999 | VAX1
1000 | VAX2
1001 | VDR
1002 | VENTX
1003 | VEZF1
1004 | VSX1
1005 | VSX2
1006 | WIZ
1007 | WT1
1008 | XBP1
1009 | XPA
1010 | YBX1
1011 | YBX2
1012 | YBX3
1013 | YY1
1014 | YY2
1015 | ZBED1
1016 | ZBED2
1017 | ZBED3
1018 | ZBED4
1019 | ZBED5
1020 | ZBED6
1021 | ZBED9
1022 | ZBTB1
1023 | ZBTB10
1024 | ZBTB11
1025 | ZBTB12
1026 | ZBTB14
1027 | ZBTB16
1028 | ZBTB17
1029 | ZBTB18
1030 | ZBTB2
1031 | ZBTB20
1032 | ZBTB21
1033 | ZBTB22
1034 | ZBTB24
1035 | ZBTB25
1036 | ZBTB26
1037 | ZBTB3
1038 | ZBTB32
1039 | ZBTB33
1040 | ZBTB34
1041 | ZBTB37
1042 | ZBTB38
1043 | ZBTB39
1044 | ZBTB4
1045 | ZBTB40
1046 | ZBTB41
1047 | ZBTB42
1048 | ZBTB43
1049 | ZBTB44
1050 | ZBTB45
1051 | ZBTB46
1052 | ZBTB47
1053 | ZBTB48
1054 | ZBTB49
1055 | ZBTB5
1056 | ZBTB6
1057 | ZBTB7A
1058 | ZBTB7B
1059 | ZBTB7C
1060 | ZBTB8A
1061 | ZBTB8B
1062 | ZBTB9
1063 | ZC3H8
1064 | ZEB1
1065 | ZEB2
1066 | ZFAT
1067 | ZFHX2
1068 | ZFHX3
1069 | ZFHX4
1070 | ZFP1
1071 | ZFP14
1072 | ZFP2
1073 | ZFP28
1074 | ZFP3
1075 | ZFP30
1076 | ZFP37
1077 | ZFP41
1078 | ZFP42
1079 | ZFP57
1080 | ZFP62
1081 | ZFP64
1082 | ZFP69
1083 | ZFP69B
1084 | ZFP82
1085 | ZFP90
1086 | ZFP91
1087 | ZFP92
1088 | ZFPM1
1089 | ZFPM2
1090 | ZFX
1091 | ZFY
1092 | ZGLP1
1093 | ZGPAT
1094 | ZHX1
1095 | ZHX2
1096 | ZHX3
1097 | ZIC1
1098 | ZIC2
1099 | ZIC3
1100 | ZIC4
1101 | ZIC5
1102 | ZIK1
1103 | ZIM2
1104 | ZIM3
1105 | ZKSCAN1
1106 | ZKSCAN2
1107 | ZKSCAN3
1108 | ZKSCAN4
1109 | ZKSCAN5
1110 | ZKSCAN7
1111 | ZKSCAN8
1112 | ZMAT1
1113 | ZMAT4
1114 | ZNF10
1115 | ZNF100
1116 | ZNF101
1117 | ZNF107
1118 | ZNF112
1119 | ZNF114
1120 | ZNF117
1121 | ZNF12
1122 | ZNF121
1123 | ZNF124
1124 | ZNF131
1125 | ZNF132
1126 | ZNF133
1127 | ZNF134
1128 | ZNF135
1129 | ZNF136
1130 | ZNF138
1131 | ZNF14
1132 | ZNF140
1133 | ZNF141
1134 | ZNF142
1135 | ZNF143
1136 | ZNF146
1137 | ZNF148
1138 | ZNF154
1139 | ZNF155
1140 | ZNF157
1141 | ZNF16
1142 | ZNF160
1143 | ZNF165
1144 | ZNF169
1145 | ZNF17
1146 | ZNF174
1147 | ZNF175
1148 | ZNF177
1149 | ZNF18
1150 | ZNF180
1151 | ZNF181
1152 | ZNF182
1153 | ZNF184
1154 | ZNF189
1155 | ZNF19
1156 | ZNF195
1157 | ZNF197
1158 | ZNF2
1159 | ZNF20
1160 | ZNF200
1161 | ZNF202
1162 | ZNF205
1163 | ZNF207
1164 | ZNF208
1165 | ZNF211
1166 | ZNF212
1167 | ZNF213
1168 | ZNF214
1169 | ZNF215
1170 | ZNF217
1171 | ZNF219
1172 | ZNF22
1173 | ZNF221
1174 | ZNF222
1175 | ZNF223
1176 | ZNF224
1177 | ZNF225
1178 | ZNF226
1179 | ZNF227
1180 | ZNF229
1181 | ZNF23
1182 | ZNF230
1183 | ZNF232
1184 | ZNF233
1185 | ZNF234
1186 | ZNF235
1187 | ZNF236
1188 | ZNF239
1189 | ZNF24
1190 | ZNF248
1191 | ZNF25
1192 | ZNF250
1193 | ZNF251
1194 | ZNF253
1195 | ZNF254
1196 | ZNF256
1197 | ZNF257
1198 | ZNF26
1199 | ZNF260
1200 | ZNF263
1201 | ZNF264
1202 | ZNF266
1203 | ZNF267
1204 | ZNF268
1205 | ZNF273
1206 | ZNF274
1207 | ZNF275
1208 | ZNF276
1209 | ZNF277
1210 | ZNF28
1211 | ZNF280A
1212 | ZNF280B
1213 | ZNF280C
1214 | ZNF280D
1215 | ZNF281
1216 | ZNF282
1217 | ZNF283
1218 | ZNF284
1219 | ZNF285
1220 | ZNF286A
1221 | ZNF286B
1222 | ZNF287
1223 | ZNF292
1224 | ZNF296
1225 | ZNF3
1226 | ZNF30
1227 | ZNF300
1228 | ZNF302
1229 | ZNF304
1230 | ZNF311
1231 | ZNF316
1232 | ZNF317
1233 | ZNF318
1234 | ZNF319
1235 | ZNF32
1236 | ZNF320
1237 | ZNF322
1238 | ZNF324
1239 | ZNF324B
1240 | ZNF326
1241 | ZNF329
1242 | ZNF331
1243 | ZNF333
1244 | ZNF334
1245 | ZNF335
1246 | ZNF337
1247 | ZNF33A
1248 | ZNF33B
1249 | ZNF34
1250 | ZNF341
1251 | ZNF343
1252 | ZNF345
1253 | ZNF346
1254 | ZNF347
1255 | ZNF35
1256 | ZNF350
1257 | ZNF354A
1258 | ZNF354B
1259 | ZNF354C
1260 | ZNF358
1261 | ZNF362
1262 | ZNF365
1263 | ZNF366
1264 | ZNF367
1265 | ZNF37A
1266 | ZNF382
1267 | ZNF383
1268 | ZNF384
1269 | ZNF385A
1270 | ZNF385B
1271 | ZNF385C
1272 | ZNF385D
1273 | ZNF391
1274 | ZNF394
1275 | ZNF395
1276 | ZNF396
1277 | ZNF397
1278 | ZNF398
1279 | ZNF404
1280 | ZNF407
1281 | ZNF408
1282 | ZNF41
1283 | ZNF410
1284 | ZNF414
1285 | ZNF415
1286 | ZNF416
1287 | ZNF417
1288 | ZNF418
1289 | ZNF419
1290 | ZNF420
1291 | ZNF423
1292 | ZNF425
1293 | ZNF426
1294 | ZNF428
1295 | ZNF429
1296 | ZNF43
1297 | ZNF430
1298 | ZNF431
1299 | ZNF432
1300 | ZNF433
1301 | ZNF436
1302 | ZNF438
1303 | ZNF439
1304 | ZNF44
1305 | ZNF440
1306 | ZNF441
1307 | ZNF442
1308 | ZNF443
1309 | ZNF444
1310 | ZNF445
1311 | ZNF446
1312 | ZNF449
1313 | ZNF45
1314 | ZNF451
1315 | ZNF454
1316 | ZNF460
1317 | ZNF461
1318 | ZNF462
1319 | ZNF467
1320 | ZNF468
1321 | ZNF469
1322 | ZNF470
1323 | ZNF471
1324 | ZNF473
1325 | ZNF474
1326 | ZNF479
1327 | ZNF48
1328 | ZNF480
1329 | ZNF483
1330 | ZNF484
1331 | ZNF485
1332 | ZNF486
1333 | ZNF487
1334 | ZNF488
1335 | ZNF490
1336 | ZNF491
1337 | ZNF492
1338 | ZNF493
1339 | ZNF496
1340 | ZNF497
1341 | ZNF500
1342 | ZNF501
1343 | ZNF502
1344 | ZNF503
1345 | ZNF506
1346 | ZNF507
1347 | ZNF510
1348 | ZNF511
1349 | ZNF512
1350 | ZNF512B
1351 | ZNF513
1352 | ZNF514
1353 | ZNF516
1354 | ZNF517
1355 | ZNF518A
1356 | ZNF518B
1357 | ZNF519
1358 | ZNF521
1359 | ZNF524
1360 | ZNF525
1361 | ZNF526
1362 | ZNF527
1363 | ZNF528
1364 | ZNF529
1365 | ZNF530
1366 | ZNF532
1367 | ZNF534
1368 | ZNF536
1369 | ZNF540
1370 | ZNF541
1371 | ZNF543
1372 | ZNF544
1373 | ZNF546
1374 | ZNF547
1375 | ZNF548
1376 | ZNF549
1377 | ZNF550
1378 | ZNF551
1379 | ZNF552
1380 | ZNF554
1381 | ZNF555
1382 | ZNF556
1383 | ZNF557
1384 | ZNF558
1385 | ZNF559
1386 | ZNF560
1387 | ZNF561
1388 | ZNF562
1389 | ZNF563
1390 | ZNF564
1391 | ZNF565
1392 | ZNF566
1393 | ZNF567
1394 | ZNF568
1395 | ZNF569
1396 | ZNF57
1397 | ZNF570
1398 | ZNF571
1399 | ZNF572
1400 | ZNF573
1401 | ZNF574
1402 | ZNF575
1403 | ZNF576
1404 | ZNF577
1405 | ZNF578
1406 | ZNF579
1407 | ZNF580
1408 | ZNF581
1409 | ZNF582
1410 | ZNF583
1411 | ZNF584
1412 | ZNF585A
1413 | ZNF585B
1414 | ZNF586
1415 | ZNF587
1416 | ZNF587B
1417 | ZNF589
1418 | ZNF592
1419 | ZNF594
1420 | ZNF595
1421 | ZNF596
1422 | ZNF597
1423 | ZNF598
1424 | ZNF599
1425 | ZNF600
1426 | ZNF605
1427 | ZNF606
1428 | ZNF607
1429 | ZNF608
1430 | ZNF609
1431 | ZNF610
1432 | ZNF611
1433 | ZNF613
1434 | ZNF614
1435 | ZNF615
1436 | ZNF616
1437 | ZNF618
1438 | ZNF619
1439 | ZNF620
1440 | ZNF621
1441 | ZNF623
1442 | ZNF624
1443 | ZNF625
1444 | ZNF626
1445 | ZNF627
1446 | ZNF628
1447 | ZNF629
1448 | ZNF630
1449 | ZNF639
1450 | ZNF641
1451 | ZNF644
1452 | ZNF645
1453 | ZNF646
1454 | ZNF648
1455 | ZNF649
1456 | ZNF652
1457 | ZNF653
1458 | ZNF654
1459 | ZNF655
1460 | ZNF658
1461 | ZNF66
1462 | ZNF660
1463 | ZNF662
1464 | ZNF664
1465 | ZNF665
1466 | ZNF667
1467 | ZNF668
1468 | ZNF669
1469 | ZNF670
1470 | ZNF671
1471 | ZNF672
1472 | ZNF674
1473 | ZNF675
1474 | ZNF676
1475 | ZNF677
1476 | ZNF678
1477 | ZNF679
1478 | ZNF680
1479 | ZNF681
1480 | ZNF682
1481 | ZNF683
1482 | ZNF684
1483 | ZNF687
1484 | ZNF688
1485 | ZNF689
1486 | ZNF69
1487 | ZNF691
1488 | ZNF692
1489 | ZNF695
1490 | ZNF696
1491 | ZNF697
1492 | ZNF699
1493 | ZNF7
1494 | ZNF70
1495 | ZNF700
1496 | ZNF701
1497 | ZNF703
1498 | ZNF704
1499 | ZNF705A
1500 | ZNF705B
1501 | ZNF705D
1502 | ZNF705E
1503 | ZNF705G
1504 | ZNF706
1505 | ZNF707
1506 | ZNF708
1507 | ZNF709
1508 | ZNF71
1509 | ZNF710
1510 | ZNF711
1511 | ZNF713
1512 | ZNF714
1513 | ZNF716
1514 | ZNF717
1515 | ZNF718
1516 | ZNF721
1517 | ZNF724
1518 | ZNF726
1519 | ZNF727
1520 | ZNF728
1521 | ZNF729
1522 | ZNF730
1523 | ZNF732
1524 | ZNF735
1525 | ZNF736
1526 | ZNF737
1527 | ZNF74
1528 | ZNF740
1529 | ZNF746
1530 | ZNF747
1531 | ZNF749
1532 | ZNF750
1533 | ZNF75A
1534 | ZNF75D
1535 | ZNF76
1536 | ZNF761
1537 | ZNF763
1538 | ZNF764
1539 | ZNF765
1540 | ZNF766
1541 | ZNF768
1542 | ZNF77
1543 | ZNF770
1544 | ZNF771
1545 | ZNF772
1546 | ZNF773
1547 | ZNF774
1548 | ZNF775
1549 | ZNF776
1550 | ZNF777
1551 | ZNF778
1552 | ZNF780A
1553 | ZNF780B
1554 | ZNF781
1555 | ZNF782
1556 | ZNF783
1557 | ZNF784
1558 | ZNF785
1559 | ZNF786
1560 | ZNF787
1561 | ZNF788
1562 | ZNF789
1563 | ZNF79
1564 | ZNF790
1565 | ZNF791
1566 | ZNF792
1567 | ZNF793
1568 | ZNF799
1569 | ZNF8
1570 | ZNF80
1571 | ZNF800
1572 | ZNF804A
1573 | ZNF804B
1574 | ZNF805
1575 | ZNF808
1576 | ZNF81
1577 | ZNF813
1578 | ZNF814
1579 | ZNF816
1580 | ZNF821
1581 | ZNF823
1582 | ZNF827
1583 | ZNF829
1584 | ZNF83
1585 | ZNF830
1586 | ZNF831
1587 | ZNF835
1588 | ZNF836
1589 | ZNF837
1590 | ZNF84
1591 | ZNF841
1592 | ZNF843
1593 | ZNF844
1594 | ZNF845
1595 | ZNF846
1596 | ZNF85
1597 | ZNF850
1598 | ZNF852
1599 | ZNF853
1600 | ZNF860
1601 | ZNF865
1602 | ZNF878
1603 | ZNF879
1604 | ZNF880
1605 | ZNF883
1606 | ZNF888
1607 | ZNF891
1608 | ZNF90
1609 | ZNF91
1610 | ZNF92
1611 | ZNF93
1612 | ZNF98
1613 | ZNF99
1614 | ZSCAN1
1615 | ZSCAN10
1616 | ZSCAN12
1617 | ZSCAN16
1618 | ZSCAN18
1619 | ZSCAN2
1620 | ZSCAN20
1621 | ZSCAN21
1622 | ZSCAN22
1623 | ZSCAN23
1624 | ZSCAN25
1625 | ZSCAN26
1626 | ZSCAN29
1627 | ZSCAN30
1628 | ZSCAN31
1629 | ZSCAN32
1630 | ZSCAN4
1631 | ZSCAN5A
1632 | ZSCAN5B
1633 | ZSCAN5C
1634 | ZSCAN9
1635 | ZUFSP
1636 | ZXDA
1637 | ZXDB
1638 | ZXDC
1639 | ZZZ3
1640 | 


--------------------------------------------------------------------------------
/datasets/humanTFs_v3.txt:
--------------------------------------------------------------------------------
   1 | SORBS2
   2 | CEBPB
   3 | EBF1
   4 | ETS2
   5 | FOXC1
   6 | ID3
   7 | MEF2C
   8 | NR2F2
   9 | NR4A2
  10 | NR4A3
  11 | SMAD7
  12 | ZFHX3
  13 | ZNF90
  14 | IFI16
  15 | HMGA1
  16 | PRRX1
  17 | KLF5
  18 | FBN1
  19 | PLAGL1
  20 | FOXS1
  21 | HMGB3
  22 | DEPDC1
  23 | FOXM1
  24 | MXD3
  25 | HMGB2
  26 | HMGB1
  27 | E2F7
  28 | EZH2
  29 | HIST1H1B
  30 | HIST1H1D
  31 | MYBL1
  32 | DEK
  33 | MYBL2
  34 | E2F1
  35 | H1FX
  36 | CARHSP1
  37 | HIST1H1A
  38 | HIST1H1C
  39 | HIST1H1E
  40 | LHX2
  41 | PAX6
  42 | POU3F2
  43 | SOX11
  44 | ARX
  45 | CHD9
  46 | FOXJ1
  47 | GSX2
  48 | HES5
  49 | INSM1
  50 | NEUROD1
  51 | OSR1
  52 | PBX1
  53 | POU3F4
  54 | PROX1
  55 | SALL3
  56 | SOX21
  57 | ZMAT1
  58 | ZNF117
  59 | CHD7
  60 | H1F0
  61 | HEY2
  62 | JDP2
  63 | MLXIP
  64 | NFATC1
  65 | OSR2
  66 | SEMA4A
  67 | SKIL
  68 | TSC22D3
  69 | ZNF331
  70 | ZNF503
  71 | DBX2
  72 | RORA
  73 | TCF12
  74 | ZIC1
  75 | NFIB
  76 | NR2F1
  77 | PITX1
  78 | RORB
  79 | STAT1
  80 | STAT2
  81 | MEOX2
  82 | ASCL1
  83 | ETV1
  84 | HES6
  85 | NFIA
  86 | OLIG2
  87 | RFX4
  88 | SOX8
  89 | TCF4
  90 | ZEB1
  91 | ZNF704
  92 | HEY1
  93 | MEIS2
  94 | POU3F3
  95 | SOX2
  96 | MITF
  97 | PAX3
  98 | PLXNC1
  99 | SNAI2
 100 | EPAS1
 101 | MAF
 102 | TBX2
 103 | MET
 104 | PLXNA1
 105 | AHR
 106 | GLIS3
 107 | PAWR
 108 | BMP2
 109 | DRAP1
 110 | ELK3
 111 | FOSL1
 112 | FOXP1
 113 | GTF2F2
 114 | HMGA2
 115 | HOXB2
 116 | ID1
 117 | KLF7
 118 | NR1D1
 119 | PRDM1
 120 | RUNX1
 121 | TBX3
 122 | HES1
 123 | HIC1
 124 | TWIST1
 125 | XBP1
 126 | PLXNA4
 127 | ARID5B
 128 | KLF9
 129 | MACF1
 130 | EGR3
 131 | MYC
 132 | NFIL3
 133 | NR4A1
 134 | ATF3
 135 | CREB5
 136 | EGR1
 137 | EGR2
 138 | FOS
 139 | FOSB
 140 | ID4
 141 | JUN
 142 | JUNB
 143 | JUND
 144 | KLF10
 145 | KLF2
 146 | KLF4
 147 | KLF6
 148 | MAFF
 149 | ZFP36
 150 | ZFP36L1
 151 | ZFP36L2
 152 | DDIT3
 153 | FOSL2
 154 | IRF1
 155 | TIPARP
 156 | TSC22D1
 157 | HOPX
 158 | OLIG1
 159 | TSC22D4
 160 | DPF3
 161 | HES4
 162 | ID2
 163 | SMAD1
 164 | ZBTB20
 165 | BAZ2B
 166 | FAM171B
 167 | SOX9
 168 | TSHZ2
 169 | ZFHX4
 170 | ZMAT3
 171 | NFATC2
 172 | TFAP2B
 173 | TFAP2A
 174 | GPR155
 175 | POU3F1
 176 | RXRG
 177 | SOX10
 178 | SOX4
 179 | SOX6
 180 | ZEB2
 181 | ZNF536
 182 | AC008770.3
 183 | AC023509.3
 184 | AC092835.1
 185 | AC138696.1
 186 | ADNP
 187 | ADNP2
 188 | AEBP1
 189 | AEBP2
 190 | AHCTF1
 191 | AHDC1
 192 | AHR
 193 | AHRR
 194 | AIRE
 195 | AKAP8
 196 | AKAP8L
 197 | AKNA
 198 | ALX1
 199 | ALX3
 200 | ALX4
 201 | ANHX
 202 | ANKZF1
 203 | AR
 204 | ARGFX
 205 | ARHGAP35
 206 | ARID2
 207 | ARID3A
 208 | ARID3B
 209 | ARID3C
 210 | ARID5A
 211 | ARID5B
 212 | ARNT
 213 | ARNT2
 214 | ARNTL
 215 | ARNTL2
 216 | ARX
 217 | ASCL1
 218 | ASCL2
 219 | ASCL3
 220 | ASCL4
 221 | ASCL5
 222 | ASH1L
 223 | ATF1
 224 | ATF2
 225 | ATF3
 226 | ATF4
 227 | ATF5
 228 | ATF6
 229 | ATF6B
 230 | ATF7
 231 | ATMIN
 232 | ATOH1
 233 | ATOH7
 234 | ATOH8
 235 | BACH1
 236 | BACH2
 237 | BARHL1
 238 | BARHL2
 239 | BARX1
 240 | BARX2
 241 | BATF
 242 | BATF2
 243 | BATF3
 244 | BAZ2A
 245 | BAZ2B
 246 | BBX
 247 | BCL11A
 248 | BCL11B
 249 | BCL6
 250 | BCL6B
 251 | BHLHA15
 252 | BHLHA9
 253 | BHLHE22
 254 | BHLHE23
 255 | BHLHE40
 256 | BHLHE41
 257 | BNC1
 258 | BNC2
 259 | BORCS8-MEF2B
 260 | BPTF
 261 | BRF2
 262 | BSX
 263 | C11orf95
 264 | CAMTA1
 265 | CAMTA2
 266 | CARF
 267 | CASZ1
 268 | CBX2
 269 | CC2D1A
 270 | CCDC169-SOHLH2
 271 | CCDC17
 272 | CDC5L
 273 | CDX1
 274 | CDX2
 275 | CDX4
 276 | CEBPA
 277 | CEBPB
 278 | CEBPD
 279 | CEBPE
 280 | CEBPG
 281 | CEBPZ
 282 | CENPA
 283 | CENPB
 284 | CENPBD1
 285 | CENPS
 286 | CENPT
 287 | CENPX
 288 | CGGBP1
 289 | CHAMP1
 290 | CHCHD3
 291 | CIC
 292 | CLOCK
 293 | CPEB1
 294 | CPXCR1
 295 | CREB1
 296 | CREB3
 297 | CREB3L1
 298 | CREB3L2
 299 | CREB3L3
 300 | CREB3L4
 301 | CREB5
 302 | CREBL2
 303 | CREBZF
 304 | CREM
 305 | CRX
 306 | CSRNP1
 307 | CSRNP2
 308 | CSRNP3
 309 | CTCF
 310 | CTCFL
 311 | CUX1
 312 | CUX2
 313 | CXXC1
 314 | CXXC4
 315 | CXXC5
 316 | DACH1
 317 | DACH2
 318 | DBP
 319 | DBX1
 320 | DBX2
 321 | DDIT3
 322 | DEAF1
 323 | DLX1
 324 | DLX2
 325 | DLX3
 326 | DLX4
 327 | DLX5
 328 | DLX6
 329 | DMBX1
 330 | DMRT1
 331 | DMRT2
 332 | DMRT3
 333 | DMRTA1
 334 | DMRTA2
 335 | DMRTB1
 336 | DMRTC2
 337 | DMTF1
 338 | DNMT1
 339 | DNTTIP1
 340 | DOT1L
 341 | DPF1
 342 | DPF3
 343 | DPRX
 344 | DR1
 345 | DRAP1
 346 | DRGX
 347 | DUX1
 348 | DUX3
 349 | DUX4
 350 | DUXA
 351 | DZIP1
 352 | E2F1
 353 | E2F2
 354 | E2F3
 355 | E2F4
 356 | E2F5
 357 | E2F6
 358 | E2F7
 359 | E2F8
 360 | E4F1
 361 | EBF1
 362 | EBF2
 363 | EBF3
 364 | EBF4
 365 | EEA1
 366 | EGR1
 367 | EGR2
 368 | EGR3
 369 | EGR4
 370 | EHF
 371 | ELF1
 372 | ELF2
 373 | ELF3
 374 | ELF4
 375 | ELF5
 376 | ELK1
 377 | ELK3
 378 | ELK4
 379 | EMX1
 380 | EMX2
 381 | EN1
 382 | EN2
 383 | EOMES
 384 | EPAS1
 385 | ERF
 386 | ERG
 387 | ESR1
 388 | ESR2
 389 | ESRRA
 390 | ESRRB
 391 | ESRRG
 392 | ESX1
 393 | ETS1
 394 | ETS2
 395 | ETV1
 396 | ETV2
 397 | ETV3
 398 | ETV3L
 399 | ETV4
 400 | ETV5
 401 | ETV6
 402 | ETV7
 403 | EVX1
 404 | EVX2
 405 | FAM170A
 406 | FAM200B
 407 | FBXL19
 408 | FERD3L
 409 | FEV
 410 | FEZF1
 411 | FEZF2
 412 | FIGLA
 413 | FIZ1
 414 | FLI1
 415 | FLYWCH1
 416 | FOS
 417 | FOSB
 418 | FOSL1
 419 | FOSL2
 420 | FOXA1
 421 | FOXA2
 422 | FOXA3
 423 | FOXB1
 424 | FOXB2
 425 | FOXC1
 426 | FOXC2
 427 | FOXD1
 428 | FOXD2
 429 | FOXD3
 430 | FOXD4
 431 | FOXD4L1
 432 | FOXD4L3
 433 | FOXD4L4
 434 | FOXD4L5
 435 | FOXD4L6
 436 | FOXE1
 437 | FOXE3
 438 | FOXF1
 439 | FOXF2
 440 | FOXG1
 441 | FOXH1
 442 | FOXI1
 443 | FOXI2
 444 | FOXI3
 445 | FOXJ1
 446 | FOXJ2
 447 | FOXJ3
 448 | FOXK1
 449 | FOXK2
 450 | FOXL1
 451 | FOXL2
 452 | FOXM1
 453 | FOXN1
 454 | FOXN2
 455 | FOXN3
 456 | FOXN4
 457 | FOXO1
 458 | FOXO3
 459 | FOXO4
 460 | FOXO6
 461 | FOXP1
 462 | FOXP2
 463 | FOXP3
 464 | FOXP4
 465 | FOXQ1
 466 | FOXR1
 467 | FOXR2
 468 | FOXS1
 469 | GABPA
 470 | GATA1
 471 | GATA2
 472 | GATA3
 473 | GATA4
 474 | GATA5
 475 | GATA6
 476 | GATAD2A
 477 | GATAD2B
 478 | GBX1
 479 | GBX2
 480 | GCM1
 481 | GCM2
 482 | GFI1
 483 | GFI1B
 484 | GLI1
 485 | GLI2
 486 | GLI3
 487 | GLI4
 488 | GLIS1
 489 | GLIS2
 490 | GLIS3
 491 | GLMP
 492 | GLYR1
 493 | GMEB1
 494 | GMEB2
 495 | GPBP1
 496 | GPBP1L1
 497 | GRHL1
 498 | GRHL2
 499 | GRHL3
 500 | GSC
 501 | GSC2
 502 | GSX1
 503 | GSX2
 504 | GTF2B
 505 | GTF2I
 506 | GTF2IRD1
 507 | GTF2IRD2
 508 | GTF2IRD2B
 509 | GTF3A
 510 | GZF1
 511 | HAND1
 512 | HAND2
 513 | HBP1
 514 | HDX
 515 | HELT
 516 | HES1
 517 | HES2
 518 | HES3
 519 | HES4
 520 | HES5
 521 | HES6
 522 | HES7
 523 | HESX1
 524 | HEY1
 525 | HEY2
 526 | HEYL
 527 | HHEX
 528 | HIC1
 529 | HIC2
 530 | HIF1A
 531 | HIF3A
 532 | HINFP
 533 | HIVEP1
 534 | HIVEP2
 535 | HIVEP3
 536 | HKR1
 537 | HLF
 538 | HLX
 539 | HMBOX1
 540 | HMG20A
 541 | HMG20B
 542 | HMGA1
 543 | HMGA2
 544 | HMGN3
 545 | HMX1
 546 | HMX2
 547 | HMX3
 548 | HNF1A
 549 | HNF1B
 550 | HNF4A
 551 | HNF4G
 552 | HOMEZ
 553 | HOXA1
 554 | HOXA10
 555 | HOXA11
 556 | HOXA13
 557 | HOXA2
 558 | HOXA3
 559 | HOXA4
 560 | HOXA5
 561 | HOXA6
 562 | HOXA7
 563 | HOXA9
 564 | HOXB1
 565 | HOXB13
 566 | HOXB2
 567 | HOXB3
 568 | HOXB4
 569 | HOXB5
 570 | HOXB6
 571 | HOXB7
 572 | HOXB8
 573 | HOXB9
 574 | HOXC10
 575 | HOXC11
 576 | HOXC12
 577 | HOXC13
 578 | HOXC4
 579 | HOXC5
 580 | HOXC6
 581 | HOXC8
 582 | HOXC9
 583 | HOXD1
 584 | HOXD10
 585 | HOXD11
 586 | HOXD12
 587 | HOXD13
 588 | HOXD3
 589 | HOXD4
 590 | HOXD8
 591 | HOXD9
 592 | HSF1
 593 | HSF2
 594 | HSF4
 595 | HSF5
 596 | HSFX1
 597 | HSFX2
 598 | HSFY1
 599 | HSFY2
 600 | IKZF1
 601 | IKZF2
 602 | IKZF3
 603 | IKZF4
 604 | IKZF5
 605 | INSM1
 606 | INSM2
 607 | IRF1
 608 | IRF2
 609 | IRF3
 610 | IRF4
 611 | IRF5
 612 | IRF6
 613 | IRF7
 614 | IRF8
 615 | IRF9
 616 | IRX1
 617 | IRX2
 618 | IRX3
 619 | IRX4
 620 | IRX5
 621 | IRX6
 622 | ISL1
 623 | ISL2
 624 | ISX
 625 | JAZF1
 626 | JDP2
 627 | JRK
 628 | JRKL
 629 | JUN
 630 | JUNB
 631 | JUND
 632 | KAT7
 633 | KCMF1
 634 | KCNIP3
 635 | KDM2A
 636 | KDM2B
 637 | KDM5B
 638 | KIN
 639 | KLF1
 640 | KLF10
 641 | KLF11
 642 | KLF12
 643 | KLF13
 644 | KLF14
 645 | KLF15
 646 | KLF16
 647 | KLF17
 648 | KLF2
 649 | KLF3
 650 | KLF4
 651 | KLF5
 652 | KLF6
 653 | KLF7
 654 | KLF8
 655 | KLF9
 656 | KMT2A
 657 | KMT2B
 658 | L3MBTL1
 659 | L3MBTL3
 660 | L3MBTL4
 661 | LBX1
 662 | LBX2
 663 | LCOR
 664 | LCORL
 665 | LEF1
 666 | LEUTX
 667 | LHX1
 668 | LHX2
 669 | LHX3
 670 | LHX4
 671 | LHX5
 672 | LHX6
 673 | LHX8
 674 | LHX9
 675 | LIN28A
 676 | LIN28B
 677 | LIN54
 678 | LMX1A
 679 | LMX1B
 680 | LTF
 681 | LYL1
 682 | MAF
 683 | MAFA
 684 | MAFB
 685 | MAFF
 686 | MAFG
 687 | MAFK
 688 | MAX
 689 | MAZ
 690 | MBD1
 691 | MBD2
 692 | MBD3
 693 | MBD4
 694 | MBD6
 695 | MBNL2
 696 | MECOM
 697 | MECP2
 698 | MEF2A
 699 | MEF2B
 700 | MEF2C
 701 | MEF2D
 702 | MEIS1
 703 | MEIS2
 704 | MEIS3
 705 | MEOX1
 706 | MEOX2
 707 | MESP1
 708 | MESP2
 709 | MGA
 710 | MITF
 711 | MIXL1
 712 | MKX
 713 | MLX
 714 | MLXIP
 715 | MLXIPL
 716 | MNT
 717 | MNX1
 718 | MSANTD1
 719 | MSANTD3
 720 | MSANTD4
 721 | MSC
 722 | MSGN1
 723 | MSX1
 724 | MSX2
 725 | MTERF1
 726 | MTERF2
 727 | MTERF3
 728 | MTERF4
 729 | MTF1
 730 | MTF2
 731 | MXD1
 732 | MXD3
 733 | MXD4
 734 | MXI1
 735 | MYB
 736 | MYBL1
 737 | MYBL2
 738 | MYC
 739 | MYCL
 740 | MYCN
 741 | MYF5
 742 | MYF6
 743 | MYNN
 744 | MYOD1
 745 | MYOG
 746 | MYPOP
 747 | MYRF
 748 | MYRFL
 749 | MYSM1
 750 | MYT1
 751 | MYT1L
 752 | MZF1
 753 | NACC2
 754 | NAIF1
 755 | NANOG
 756 | NANOGNB
 757 | NANOGP8
 758 | NCOA1
 759 | NCOA2
 760 | NCOA3
 761 | NEUROD1
 762 | NEUROD2
 763 | NEUROD4
 764 | NEUROD6
 765 | NEUROG1
 766 | NEUROG2
 767 | NEUROG3
 768 | NFAT5
 769 | NFATC1
 770 | NFATC2
 771 | NFATC3
 772 | NFATC4
 773 | NFE2
 774 | NFE2L1
 775 | NFE2L2
 776 | NFE2L3
 777 | NFE4
 778 | NFIA
 779 | NFIB
 780 | NFIC
 781 | NFIL3
 782 | NFIX
 783 | NFKB1
 784 | NFKB2
 785 | NFX1
 786 | NFXL1
 787 | NFYA
 788 | NFYB
 789 | NFYC
 790 | NHLH1
 791 | NHLH2
 792 | NKRF
 793 | NKX1-1
 794 | NKX1-2
 795 | NKX2-1
 796 | NKX2-2
 797 | NKX2-3
 798 | NKX2-4
 799 | NKX2-5
 800 | NKX2-6
 801 | NKX2-8
 802 | NKX3-1
 803 | NKX3-2
 804 | NKX6-1
 805 | NKX6-2
 806 | NKX6-3
 807 | NME2
 808 | NOBOX
 809 | NOTO
 810 | NPAS1
 811 | NPAS2
 812 | NPAS3
 813 | NPAS4
 814 | NR0B1
 815 | NR1D1
 816 | NR1D2
 817 | NR1H2
 818 | NR1H3
 819 | NR1H4
 820 | NR1I2
 821 | NR1I3
 822 | NR2C1
 823 | NR2C2
 824 | NR2E1
 825 | NR2E3
 826 | NR2F1
 827 | NR2F2
 828 | NR2F6
 829 | NR3C1
 830 | NR3C2
 831 | NR4A1
 832 | NR4A2
 833 | NR4A3
 834 | NR5A1
 835 | NR5A2
 836 | NR6A1
 837 | NRF1
 838 | NRL
 839 | OLIG1
 840 | OLIG2
 841 | OLIG3
 842 | ONECUT1
 843 | ONECUT2
 844 | ONECUT3
 845 | OSR1
 846 | OSR2
 847 | OTP
 848 | OTX1
 849 | OTX2
 850 | OVOL1
 851 | OVOL2
 852 | OVOL3
 853 | PA2G4
 854 | PATZ1
 855 | PAX1
 856 | PAX2
 857 | PAX3
 858 | PAX4
 859 | PAX5
 860 | PAX6
 861 | PAX7
 862 | PAX8
 863 | PAX9
 864 | PBX1
 865 | PBX2
 866 | PBX3
 867 | PBX4
 868 | PCGF2
 869 | PCGF6
 870 | PDX1
 871 | PEG3
 872 | PGR
 873 | PHF1
 874 | PHF19 
 875 | PHF20
 876 | PHF21A
 877 | PHOX2A
 878 | PHOX2B
 879 | PIN1
 880 | PITX1
 881 | PITX2
 882 | PITX3
 883 | PKNOX1
 884 | PKNOX2
 885 | PLAG1
 886 | PLAGL1
 887 | PLAGL2
 888 | PLSCR1
 889 | POGK
 890 | POU1F1
 891 | POU2AF1
 892 | POU2F1
 893 | POU2F2
 894 | POU2F3
 895 | POU3F1
 896 | POU3F2
 897 | POU3F3
 898 | POU3F4
 899 | POU4F1
 900 | POU4F2
 901 | POU4F3
 902 | POU5F1
 903 | POU5F1B
 904 | POU5F2
 905 | POU6F1
 906 | POU6F2
 907 | PPARA
 908 | PPARD
 909 | PPARG
 910 | PRDM1
 911 | PRDM10
 912 | PRDM12
 913 | PRDM13
 914 | PRDM14
 915 | PRDM15
 916 | PRDM16
 917 | PRDM2
 918 | PRDM4
 919 | PRDM5
 920 | PRDM6
 921 | PRDM8
 922 | PRDM9
 923 | PREB
 924 | PRMT3
 925 | PROP1
 926 | PROX1
 927 | PROX2
 928 | PRR12
 929 | PRRX1
 930 | PRRX2
 931 | PTF1A
 932 | PURA
 933 | PURB
 934 | PURG
 935 | RAG1
 936 | RARA
 937 | RARB
 938 | RARG
 939 | RAX
 940 | RAX2
 941 | RBAK
 942 | RBCK1
 943 | RBPJ
 944 | RBPJL
 945 | RBSN
 946 | REL
 947 | RELA
 948 | RELB
 949 | REPIN1
 950 | REST
 951 | REXO4
 952 | RFX1
 953 | RFX2
 954 | RFX3
 955 | RFX4
 956 | RFX5
 957 | RFX6
 958 | RFX7
 959 | RFX8
 960 | RHOXF1
 961 | RHOXF2
 962 | RHOXF2B
 963 | RLF
 964 | RORA
 965 | RORB
 966 | RORC
 967 | RREB1
 968 | RUNX1
 969 | RUNX2
 970 | RUNX3
 971 | RXRA
 972 | RXRB
 973 | RXRG
 974 | SAFB
 975 | SAFB2
 976 | SALL1
 977 | SALL2
 978 | SALL3
 979 | SALL4
 980 | SATB1
 981 | SATB2
 982 | SCMH1
 983 | SCML4
 984 | SCRT1
 985 | SCRT2
 986 | SCX
 987 | SEBOX
 988 | SETBP1
 989 | SETDB1
 990 | SETDB2
 991 | SGSM2
 992 | SHOX
 993 | SHOX2
 994 | SIM1
 995 | SIM2
 996 | SIX1
 997 | SIX2
 998 | SIX3
 999 | SIX4
1000 | SIX5
1001 | SIX6
1002 | SKI
1003 | SKIL
1004 | SKOR1
1005 | SKOR2
1006 | SLC2A4RG
1007 | SMAD1
1008 | SMAD3
1009 | SMAD4
1010 | SMAD5
1011 | SMAD9
1012 | SMYD3
1013 | SNAI1
1014 | SNAI2
1015 | SNAI3
1016 | SNAPC2
1017 | SNAPC4
1018 | SNAPC5
1019 | SOHLH1
1020 | SOHLH2
1021 | SON
1022 | SOX1
1023 | SOX10
1024 | SOX11
1025 | SOX12
1026 | SOX13
1027 | SOX14
1028 | SOX15
1029 | SOX17
1030 | SOX18
1031 | SOX2
1032 | SOX21
1033 | SOX3
1034 | SOX30
1035 | SOX4
1036 | SOX5
1037 | SOX6
1038 | SOX7
1039 | SOX8
1040 | SOX9
1041 | SP1
1042 | SP100
1043 | SP110
1044 | SP140
1045 | SP140L
1046 | SP2
1047 | SP3
1048 | SP4
1049 | SP5
1050 | SP6
1051 | SP7
1052 | SP8
1053 | SP9
1054 | SPDEF
1055 | SPEN
1056 | SPI1
1057 | SPIB
1058 | SPIC
1059 | SPZ1
1060 | SRCAP
1061 | SREBF1
1062 | SREBF2
1063 | SRF
1064 | SRY
1065 | ST18
1066 | STAT1
1067 | STAT2
1068 | STAT3
1069 | STAT4
1070 | STAT5A
1071 | STAT5B
1072 | STAT6
1073 | T
1074 | TAL1
1075 | TAL2
1076 | TBP
1077 | TBPL1
1078 | TBPL2
1079 | TBR1
1080 | TBX1
1081 | TBX10
1082 | TBX15
1083 | TBX18
1084 | TBX19
1085 | TBX2
1086 | TBX20
1087 | TBX21
1088 | TBX22
1089 | TBX3
1090 | TBX4
1091 | TBX5
1092 | TBX6
1093 | TCF12
1094 | TCF15
1095 | TCF20
1096 | TCF21
1097 | TCF23
1098 | TCF24
1099 | TCF3
1100 | TCF4
1101 | TCF7
1102 | TCF7L1
1103 | TCF7L2
1104 | TCFL5
1105 | TEAD1
1106 | TEAD2
1107 | TEAD3
1108 | TEAD4
1109 | TEF
1110 | TERB1
1111 | TERF1
1112 | TERF2
1113 | TET1
1114 | TET2
1115 | TET3
1116 | TFAP2A
1117 | TFAP2B
1118 | TFAP2C
1119 | TFAP2D
1120 | TFAP2E
1121 | TFAP4
1122 | TFCP2
1123 | TFCP2L1
1124 | TFDP1
1125 | TFDP2
1126 | TFDP3
1127 | TFE3
1128 | TFEB
1129 | TFEC
1130 | TGIF1
1131 | TGIF2
1132 | TGIF2LX
1133 | TGIF2LY
1134 | THAP1
1135 | THAP10
1136 | THAP11
1137 | THAP12
1138 | THAP2
1139 | THAP3
1140 | THAP4
1141 | THAP5
1142 | THAP6
1143 | THAP7
1144 | THAP8
1145 | THAP9
1146 | THRA
1147 | THRB
1148 | THYN1
1149 | TIGD1
1150 | TIGD2
1151 | TIGD3
1152 | TIGD4
1153 | TIGD5
1154 | TIGD6
1155 | TIGD7
1156 | TLX1
1157 | TLX2
1158 | TLX3
1159 | TMF1
1160 | TOPORS
1161 | TP53
1162 | TP63
1163 | TP73
1164 | TPRX1
1165 | TRAFD1
1166 | TRERF1
1167 | TRPS1
1168 | TSC22D1
1169 | TSHZ1
1170 | TSHZ2
1171 | TSHZ3
1172 | TTF1
1173 | TWIST1
1174 | TWIST2
1175 | UBP1
1176 | UNCX
1177 | USF1
1178 | USF2
1179 | USF3
1180 | VAX1
1181 | VAX2
1182 | VDR
1183 | VENTX
1184 | VEZF1
1185 | VSX1
1186 | VSX2
1187 | WIZ
1188 | WT1
1189 | XBP1
1190 | XPA
1191 | YBX1
1192 | YBX2
1193 | YBX3
1194 | YY1
1195 | YY2
1196 | ZBED1
1197 | ZBED2
1198 | ZBED3
1199 | ZBED4
1200 | ZBED5
1201 | ZBED6
1202 | ZBED9
1203 | ZBTB1
1204 | ZBTB10
1205 | ZBTB11
1206 | ZBTB12
1207 | ZBTB14
1208 | ZBTB16
1209 | ZBTB17
1210 | ZBTB18
1211 | ZBTB2
1212 | ZBTB20
1213 | ZBTB21
1214 | ZBTB22
1215 | ZBTB24
1216 | ZBTB25
1217 | ZBTB26
1218 | ZBTB3
1219 | ZBTB32
1220 | ZBTB33
1221 | ZBTB34
1222 | ZBTB37
1223 | ZBTB38
1224 | ZBTB39
1225 | ZBTB4
1226 | ZBTB40
1227 | ZBTB41
1228 | ZBTB42
1229 | ZBTB43
1230 | ZBTB44
1231 | ZBTB45
1232 | ZBTB46
1233 | ZBTB47
1234 | ZBTB48
1235 | ZBTB49
1236 | ZBTB5
1237 | ZBTB6
1238 | ZBTB7A
1239 | ZBTB7B
1240 | ZBTB7C
1241 | ZBTB8A
1242 | ZBTB8B
1243 | ZBTB9
1244 | ZC3H8
1245 | ZEB1
1246 | ZEB2
1247 | ZFAT
1248 | ZFHX2
1249 | ZFHX3
1250 | ZFHX4
1251 | ZFP1
1252 | ZFP14
1253 | ZFP2
1254 | ZFP28
1255 | ZFP3
1256 | ZFP30
1257 | ZFP37
1258 | ZFP41
1259 | ZFP42
1260 | ZFP57
1261 | ZFP62
1262 | ZFP64
1263 | ZFP69
1264 | ZFP69B
1265 | ZFP82
1266 | ZFP90
1267 | ZFP91
1268 | ZFP92
1269 | ZFPM1
1270 | ZFPM2
1271 | ZFX
1272 | ZFY
1273 | ZGLP1
1274 | ZGPAT
1275 | ZHX1
1276 | ZHX2
1277 | ZHX3
1278 | ZIC1
1279 | ZIC2
1280 | ZIC3
1281 | ZIC4
1282 | ZIC5
1283 | ZIK1
1284 | ZIM2
1285 | ZIM3
1286 | ZKSCAN1
1287 | ZKSCAN2
1288 | ZKSCAN3
1289 | ZKSCAN4
1290 | ZKSCAN5
1291 | ZKSCAN7
1292 | ZKSCAN8
1293 | ZMAT1
1294 | ZMAT4
1295 | ZNF10
1296 | ZNF100
1297 | ZNF101
1298 | ZNF107
1299 | ZNF112
1300 | ZNF114
1301 | ZNF117
1302 | ZNF12
1303 | ZNF121
1304 | ZNF124
1305 | ZNF131
1306 | ZNF132
1307 | ZNF133
1308 | ZNF134
1309 | ZNF135
1310 | ZNF136
1311 | ZNF138
1312 | ZNF14
1313 | ZNF140
1314 | ZNF141
1315 | ZNF142
1316 | ZNF143
1317 | ZNF146
1318 | ZNF148
1319 | ZNF154
1320 | ZNF155
1321 | ZNF157
1322 | ZNF16
1323 | ZNF160
1324 | ZNF165
1325 | ZNF169
1326 | ZNF17
1327 | ZNF174
1328 | ZNF175
1329 | ZNF177
1330 | ZNF18
1331 | ZNF180
1332 | ZNF181
1333 | ZNF182
1334 | ZNF184
1335 | ZNF189
1336 | ZNF19
1337 | ZNF195
1338 | ZNF197
1339 | ZNF2
1340 | ZNF20
1341 | ZNF200
1342 | ZNF202
1343 | ZNF205
1344 | ZNF207
1345 | ZNF208
1346 | ZNF211
1347 | ZNF212
1348 | ZNF213
1349 | ZNF214
1350 | ZNF215
1351 | ZNF217
1352 | ZNF219
1353 | ZNF22
1354 | ZNF221
1355 | ZNF222
1356 | ZNF223
1357 | ZNF224
1358 | ZNF225
1359 | ZNF226
1360 | ZNF227
1361 | ZNF229
1362 | ZNF23
1363 | ZNF230
1364 | ZNF232
1365 | ZNF233
1366 | ZNF234
1367 | ZNF235
1368 | ZNF236
1369 | ZNF239
1370 | ZNF24
1371 | ZNF248
1372 | ZNF25
1373 | ZNF250
1374 | ZNF251
1375 | ZNF253
1376 | ZNF254
1377 | ZNF256
1378 | ZNF257
1379 | ZNF26
1380 | ZNF260
1381 | ZNF263
1382 | ZNF264
1383 | ZNF266
1384 | ZNF267
1385 | ZNF268
1386 | ZNF273
1387 | ZNF274
1388 | ZNF275
1389 | ZNF276
1390 | ZNF277
1391 | ZNF28
1392 | ZNF280A
1393 | ZNF280B
1394 | ZNF280C
1395 | ZNF280D
1396 | ZNF281
1397 | ZNF282
1398 | ZNF283
1399 | ZNF284
1400 | ZNF285
1401 | ZNF286A
1402 | ZNF286B
1403 | ZNF287
1404 | ZNF292
1405 | ZNF296
1406 | ZNF3
1407 | ZNF30
1408 | ZNF300
1409 | ZNF302
1410 | ZNF304
1411 | ZNF311
1412 | ZNF316
1413 | ZNF317
1414 | ZNF318
1415 | ZNF319
1416 | ZNF32
1417 | ZNF320
1418 | ZNF322
1419 | ZNF324
1420 | ZNF324B
1421 | ZNF326
1422 | ZNF329
1423 | ZNF331
1424 | ZNF333
1425 | ZNF334
1426 | ZNF335
1427 | ZNF337
1428 | ZNF33A
1429 | ZNF33B
1430 | ZNF34
1431 | ZNF341
1432 | ZNF343
1433 | ZNF345
1434 | ZNF346
1435 | ZNF347
1436 | ZNF35
1437 | ZNF350
1438 | ZNF354A
1439 | ZNF354B
1440 | ZNF354C
1441 | ZNF358
1442 | ZNF362
1443 | ZNF365
1444 | ZNF366
1445 | ZNF367
1446 | ZNF37A
1447 | ZNF382
1448 | ZNF383
1449 | ZNF384
1450 | ZNF385A
1451 | ZNF385B
1452 | ZNF385C
1453 | ZNF385D
1454 | ZNF391
1455 | ZNF394
1456 | ZNF395
1457 | ZNF396
1458 | ZNF397
1459 | ZNF398
1460 | ZNF404
1461 | ZNF407
1462 | ZNF408
1463 | ZNF41
1464 | ZNF410
1465 | ZNF414
1466 | ZNF415
1467 | ZNF416
1468 | ZNF417
1469 | ZNF418
1470 | ZNF419
1471 | ZNF420
1472 | ZNF423
1473 | ZNF425
1474 | ZNF426
1475 | ZNF428
1476 | ZNF429
1477 | ZNF43
1478 | ZNF430
1479 | ZNF431
1480 | ZNF432
1481 | ZNF433
1482 | ZNF436
1483 | ZNF438
1484 | ZNF439
1485 | ZNF44
1486 | ZNF440
1487 | ZNF441
1488 | ZNF442
1489 | ZNF443
1490 | ZNF444
1491 | ZNF445
1492 | ZNF446
1493 | ZNF449
1494 | ZNF45
1495 | ZNF451
1496 | ZNF454
1497 | ZNF460
1498 | ZNF461
1499 | ZNF462
1500 | ZNF467
1501 | ZNF468
1502 | ZNF469
1503 | ZNF470
1504 | ZNF471
1505 | ZNF473
1506 | ZNF474
1507 | ZNF479
1508 | ZNF48
1509 | ZNF480
1510 | ZNF483
1511 | ZNF484
1512 | ZNF485
1513 | ZNF486
1514 | ZNF487
1515 | ZNF488
1516 | ZNF490
1517 | ZNF491
1518 | ZNF492
1519 | ZNF493
1520 | ZNF496
1521 | ZNF497
1522 | ZNF500
1523 | ZNF501
1524 | ZNF502
1525 | ZNF503
1526 | ZNF506
1527 | ZNF507
1528 | ZNF510
1529 | ZNF511
1530 | ZNF512
1531 | ZNF512B
1532 | ZNF513
1533 | ZNF514
1534 | ZNF516
1535 | ZNF517
1536 | ZNF518A
1537 | ZNF518B
1538 | ZNF519
1539 | ZNF521
1540 | ZNF524
1541 | ZNF525
1542 | ZNF526
1543 | ZNF527
1544 | ZNF528
1545 | ZNF529
1546 | ZNF530
1547 | ZNF532
1548 | ZNF534
1549 | ZNF536
1550 | ZNF540
1551 | ZNF541
1552 | ZNF543
1553 | ZNF544
1554 | ZNF546
1555 | ZNF547
1556 | ZNF548
1557 | ZNF549
1558 | ZNF550
1559 | ZNF551
1560 | ZNF552
1561 | ZNF554
1562 | ZNF555
1563 | ZNF556
1564 | ZNF557
1565 | ZNF558
1566 | ZNF559
1567 | ZNF560
1568 | ZNF561
1569 | ZNF562
1570 | ZNF563
1571 | ZNF564
1572 | ZNF565
1573 | ZNF566
1574 | ZNF567
1575 | ZNF568
1576 | ZNF569
1577 | ZNF57
1578 | ZNF570
1579 | ZNF571
1580 | ZNF572
1581 | ZNF573
1582 | ZNF574
1583 | ZNF575
1584 | ZNF576
1585 | ZNF577
1586 | ZNF578
1587 | ZNF579
1588 | ZNF580
1589 | ZNF581
1590 | ZNF582
1591 | ZNF583
1592 | ZNF584
1593 | ZNF585A
1594 | ZNF585B
1595 | ZNF586
1596 | ZNF587
1597 | ZNF587B
1598 | ZNF589
1599 | ZNF592
1600 | ZNF594
1601 | ZNF595
1602 | ZNF596
1603 | ZNF597
1604 | ZNF598
1605 | ZNF599
1606 | ZNF600
1607 | ZNF605
1608 | ZNF606
1609 | ZNF607
1610 | ZNF608
1611 | ZNF609
1612 | ZNF610
1613 | ZNF611
1614 | ZNF613
1615 | ZNF614
1616 | ZNF615
1617 | ZNF616
1618 | ZNF618
1619 | ZNF619
1620 | ZNF620
1621 | ZNF621
1622 | ZNF623
1623 | ZNF624
1624 | ZNF625
1625 | ZNF626
1626 | ZNF627
1627 | ZNF628
1628 | ZNF629
1629 | ZNF630
1630 | ZNF639
1631 | ZNF641
1632 | ZNF644
1633 | ZNF645
1634 | ZNF646
1635 | ZNF648
1636 | ZNF649
1637 | ZNF652
1638 | ZNF653
1639 | ZNF654
1640 | ZNF655
1641 | ZNF658
1642 | ZNF66
1643 | ZNF660
1644 | ZNF662
1645 | ZNF664
1646 | ZNF665
1647 | ZNF667
1648 | ZNF668
1649 | ZNF669
1650 | ZNF670
1651 | ZNF671
1652 | ZNF672
1653 | ZNF674
1654 | ZNF675
1655 | ZNF676
1656 | ZNF677
1657 | ZNF678
1658 | ZNF679
1659 | ZNF680
1660 | ZNF681
1661 | ZNF682
1662 | ZNF683
1663 | ZNF684
1664 | ZNF687
1665 | ZNF688
1666 | ZNF689
1667 | ZNF69
1668 | ZNF691
1669 | ZNF692
1670 | ZNF695
1671 | ZNF696
1672 | ZNF697
1673 | ZNF699
1674 | ZNF7
1675 | ZNF70
1676 | ZNF700
1677 | ZNF701
1678 | ZNF703
1679 | ZNF704
1680 | ZNF705A
1681 | ZNF705B
1682 | ZNF705D
1683 | ZNF705E
1684 | ZNF705G
1685 | ZNF706
1686 | ZNF707
1687 | ZNF708
1688 | ZNF709
1689 | ZNF71
1690 | ZNF710
1691 | ZNF711
1692 | ZNF713
1693 | ZNF714
1694 | ZNF716
1695 | ZNF717
1696 | ZNF718
1697 | ZNF721
1698 | ZNF724
1699 | ZNF726
1700 | ZNF727
1701 | ZNF728
1702 | ZNF729
1703 | ZNF730
1704 | ZNF732
1705 | ZNF735
1706 | ZNF736
1707 | ZNF737
1708 | ZNF74
1709 | ZNF740
1710 | ZNF746
1711 | ZNF747
1712 | ZNF749
1713 | ZNF750
1714 | ZNF75A
1715 | ZNF75D
1716 | ZNF76
1717 | ZNF761
1718 | ZNF763
1719 | ZNF764
1720 | ZNF765
1721 | ZNF766
1722 | ZNF768
1723 | ZNF77
1724 | ZNF770
1725 | ZNF771
1726 | ZNF772
1727 | ZNF773
1728 | ZNF774
1729 | ZNF775
1730 | ZNF776
1731 | ZNF777
1732 | ZNF778
1733 | ZNF780A
1734 | ZNF780B
1735 | ZNF781
1736 | ZNF782
1737 | ZNF783
1738 | ZNF784
1739 | ZNF785
1740 | ZNF786
1741 | ZNF787
1742 | ZNF788
1743 | ZNF789
1744 | ZNF79
1745 | ZNF790
1746 | ZNF791
1747 | ZNF792
1748 | ZNF793
1749 | ZNF799
1750 | ZNF8
1751 | ZNF80
1752 | ZNF800
1753 | ZNF804A
1754 | ZNF804B
1755 | ZNF805
1756 | ZNF808
1757 | ZNF81
1758 | ZNF813
1759 | ZNF814
1760 | ZNF816
1761 | ZNF821
1762 | ZNF823
1763 | ZNF827
1764 | ZNF829
1765 | ZNF83
1766 | ZNF830
1767 | ZNF831
1768 | ZNF835
1769 | ZNF836
1770 | ZNF837
1771 | ZNF84
1772 | ZNF841
1773 | ZNF843
1774 | ZNF844
1775 | ZNF845
1776 | ZNF846
1777 | ZNF85
1778 | ZNF850
1779 | ZNF852
1780 | ZNF853
1781 | ZNF860
1782 | ZNF865
1783 | ZNF878
1784 | ZNF879
1785 | ZNF880
1786 | ZNF883
1787 | ZNF888
1788 | ZNF891
1789 | ZNF90
1790 | ZNF91
1791 | ZNF92
1792 | ZNF93
1793 | ZNF98
1794 | ZNF99
1795 | ZSCAN1
1796 | ZSCAN10
1797 | ZSCAN12
1798 | ZSCAN16
1799 | ZSCAN18
1800 | ZSCAN2
1801 | ZSCAN20
1802 | ZSCAN21
1803 | ZSCAN22
1804 | ZSCAN23
1805 | ZSCAN25
1806 | ZSCAN26
1807 | ZSCAN29
1808 | ZSCAN30
1809 | ZSCAN31
1810 | ZSCAN32
1811 | ZSCAN4
1812 | ZSCAN5A
1813 | ZSCAN5B
1814 | ZSCAN5C
1815 | ZSCAN9
1816 | ZUFSP
1817 | ZXDA
1818 | ZXDB
1819 | ZXDC
1820 | ZZZ3
1821 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | #' Compute OLS coefficients
  2 | #'
  3 | #' If the design matrix has full column-rank, then use the normal
  4 | #' least squares estimate. Otherwise, use the Moore-Penrose inverse
  5 | #' to compute the least squares estimate.
  6 | #'
  7 | #' @param y Target vector (n x 1)/matrix (n x m)
  8 | #' @param x Design matrix (n x p)
  9 | #'
 10 | #' @return Vector of OLS coefficients
 11 | #'
 12 | #' @keywords internal
 13 | coef_ols <- function(y, x) {
 14 |   # Pre-compute quantities
 15 |   n <- nrow(x)
 16 |   p <- ncol(x)
 17 |   xtx <- crossprod(x)
 18 |   xty <- crossprod(x, y)
 19 | 
 20 |   if (n < p) {
 21 |     # Compute the pseudo-inverse of xtx
 22 |     xtx_svd <- svd(xtx)
 23 |     d <- xtx_svd$d
 24 |     idx <- which(d > .Machine$double.eps * p * max(d))
 25 |     d[idx] <- 1 / d[idx]
 26 |     d[setdiff(seq_len(p), idx)] <- 0
 27 |     xtx_inv <- xtx_svd$v %*% diag(d, nrow = p, ncol = p) %*% t(xtx_svd$u)
 28 | 
 29 |     # Compute least squares solution using pseudo-inverse
 30 |     beta_ols <- xtx_inv %*% xty
 31 |   } else {
 32 |     # Compute least squares solution directly
 33 |     beta_ols <- solve(xtx, xty)
 34 |   }
 35 | 
 36 |   beta_ols
 37 | }
 38 | 
 39 | #' Compute ridge regression coefficients
 40 | #'
 41 | #'
 42 | #' @param y Target vector (n x 1)/matrix (n x m)
 43 | #' @param x Design matrix (n x p)
 44 | #' @param lambda Positive parameter for ridge penalty
 45 | #'
 46 | #' @return Vector of ridge regression coefficients
 47 | #'
 48 | #' @keywords internal
 49 | coef_ridge <- function(y, x, lambda) {
 50 |   # Pre-compute quantities
 51 |   p <- ncol(x)
 52 |   xtx <- crossprod(x)
 53 |   xty <- crossprod(x, y)
 54 | 
 55 |   # Compute ridge regression solution directly
 56 |   solve(xtx + diag(lambda, p, p), xty)
 57 | }
 58 | 
 59 | #' Quick'n'dirty progress bar
 60 | #'
 61 | #' Creates a progress bar and returns it as a string.
 62 | #'
 63 | #' @param step current step being worked on
 64 | #' @param n_steps total number of steps
 65 | #' @param name name of the process
 66 | #' @param finished whether the process is finished
 67 | #' @param progress_length length of the progress bar in ascii signs
 68 | #'
 69 | #' @return A string formatted as a progress bar
 70 | #'
 71 | #' @keywords internal
 72 | progstr <- function(step, n_steps, name,
 73 |                     finished = FALSE, progress_length = 20L) {
 74 |   steps_done <- floor(progress_length * (step - 1) / n_steps)
 75 | 
 76 |   parts <- c("|", rep.int(cli::col_blue(cli::symbol$square), steps_done))
 77 |   if (!finished) {
 78 |     parts <- c(
 79 |       parts,
 80 |       rep.int(cli::symbol$line, progress_length - steps_done)
 81 |     )
 82 |   } else {
 83 |     parts <- c(parts, rep.int(
 84 |       cli::col_blue(cli::symbol$square), progress_length - steps_done
 85 |     ))
 86 |   }
 87 |   parts <- c(parts, "| ", cli::col_grey("%d/%d ", name))
 88 | 
 89 |   sprintf(paste0(parts, collapse = ""), step, n_steps)
 90 | }
 91 | 
 92 | #' Format count table nicely
 93 | #'
 94 | #' @param counts a list of count vectors with `1 + n_cl` entries each.
 95 | #'               `NA` values are replaced with `-`
 96 | #' @param title title above the table
 97 | #' @param row_names a vector of row names, one for each count vector
 98 | #' @param col_width minimum width for columns
 99 | #'
100 | #' @return A string formatted as a table
101 | #'
102 | #' @keywords internal
103 | count_table <- function(counts,
104 |                         title,
105 |                         row_names,
106 |                         col_width = 5) {
107 |   nms <- c("Noise", as.character(seq_len(length(counts[[1]]) - 1)))
108 |   counts_chr <- lapply(counts, as.character)
109 |   # Replace NA with `-`
110 |   counts_chr <- lapply(counts_chr, function(cn) {
111 |     cn[is.na(cn)] <- "-"
112 |     cn
113 |   })
114 | 
115 |   stopifnot(length(row_names) == length(counts_chr))
116 | 
117 |   cws <- c(
118 |     max(
119 |       c(
120 |         nchar("Noise"),
121 |         sapply(counts_chr, function(cn) nchar(cn[1])),
122 |         col_width
123 |       )
124 |     ),
125 |     do.call(pmax, c(
126 |       list(unname(sapply(nms[-1], nchar))),
127 |       lapply(counts_chr, function(cn) unname(sapply(cn[-1], nchar))),
128 |       list(col_width)
129 |     ))
130 |   )
131 | 
132 |   # longest row name
133 |   width_row_nms <- max(sapply(c("Module", row_names), nchar))
134 | 
135 |   fmt_strs <- sprintf("%%%ds", cws)
136 |   fmt_row_str <- sprintf("  %%%ds  ", width_row_nms)
137 | 
138 |   width <- cli::console_width()
139 | 
140 |   # Two spaces + longest row name + two spaces
141 |   tbl_widths <- (
142 |     2 + width_row_nms + 2 + cumsum(cws + c(0, rep(3, length(cws) - 1)))
143 |   )
144 |   tbl_rows <- list(which(tbl_widths <= width))
145 |   cws_tmp <- cws[tbl_widths > width]
146 |   while (length(cws_tmp) > 0) {
147 |     tbl_widths <- (
148 |       2 + width_row_nms + 2
149 |       + cumsum(cws_tmp + c(0, rep(3, length(cws_tmp) - 1)))
150 |     )
151 | 
152 |     tbl_rows <- c(tbl_rows, list(
153 |       max(tbl_rows[[length(tbl_rows)]]) + which(tbl_widths <= width))
154 |     )
155 |     cws_tmp <- cws_tmp[tbl_widths > width]
156 |   }
157 | 
158 |   # Grey-out `-` and `0`s
159 |   counts_chr <- lapply(counts_chr, function(cn) {
160 |     cn_out <- sprintf(fmt_strs, cn)
161 |     cn_out[cn == "-"] <- cli::col_grey(sprintf(fmt_strs[cn == "-"], "-"))
162 |     cn_out[cn == "0"] <- cli::col_grey(sprintf(fmt_strs[cn == "0"], "0"))
163 |     cn_out
164 |   })
165 | 
166 |   do.call(function(...) paste(..., sep = "\n"), c(
167 |     list(cli::col_grey(sprintf("# %s", title))),
168 |     lapply(
169 |       tbl_rows,
170 |       function(elems) {
171 |         paste0(paste(
172 |           paste0(
173 |             cli::col_blue(sprintf(fmt_row_str, "Module")),
174 |             cli::col_grey(
175 |               paste0(
176 |                 sprintf(fmt_strs[elems], nms[elems]),
177 |                 collapse = cli::col_grey(" | ")
178 |               )
179 |             )
180 |           ),
181 |           do.call(
182 |             function(...) paste(..., sep = "\n"),
183 |             lapply(seq_along(counts_chr), function(i) {
184 |               paste0(
185 |                 cli::col_blue(sprintf(fmt_row_str, row_names[i])),
186 |                 paste0(
187 |                   sprintf(fmt_strs[elems], counts_chr[[i]][elems]),
188 |                   collapse = cli::col_grey(" | ")
189 |                 )
190 |               )
191 |             })
192 |           ),
193 |           sep = "\n"
194 |         ), "\n")
195 |       }
196 |     )
197 |   ))
198 | }
199 | 
#' Compute indicator matrix of pairwise distances smaller than threshold
#'
#' Computes the Jaccard distance between rows of a matrix and returns a
#' sparse symmetric indicator matrix containing the entries with a distance
#' of less than a given upper bound. Note that the diagonal is always 1.
#'
#' @param x the input matrix with vectors to be compared in the rows.
#' @param upper_bnd pairs with a Jaccard distance below this upper bound are
#'                  returned as 1 while all others receive the entry 0.
#'
#' @return A sparse symmetric pattern matrix (class `ngCMatrix`) with as
#'         many rows and columns as `x` has rows.
#'
#' @keywords internal
jaccard_indicator <- function(x, upper_bnd = 0.8) {
  # Treat matrix as sparse pattern matrix
  x <- methods::as(x, "ngCMatrix")

  # Dimension along which pairwise distances are computed
  n <- x@Dim[1]

  # Retrieve row and column indices of non-zero entries.
  # Both are 1-based in the summary of a sparse matrix.
  xs <- Matrix::summary(x)

  # Split column indices by row indices
  # -> jsplit has exactly n entries, entry l belonging to row l of x
  # -> Using a factor with all n levels ensures that rows with zero ones
  #    result in an empty integer vector instead of being dropped.
  jsplit <- unname(split(xs$j, factor(xs$i, levels = seq_len(n))))

  # Run actual computation of Jaccard distances and save those
  # entries that have distance below the upper_bnd.
  # NOTE(review): the returned indices are passed to sparseMatrix() as-is,
  # so they are assumed to be 1-based positions in jsplit -- confirm
  # against the implementation of jaccard_indicator_comp.
  out <- jaccard_indicator_comp(
    jsplit,
    eps = upper_bnd
  )

  # Form the indicator matrix: mirror the returned pairs to obtain a
  # symmetric pattern and add the diagonal.
  methods::as(Matrix::sparseMatrix(
    c(out$i, out$j),
    c(out$j, out$i),
    dims = c(n, n)
  ) + Matrix::Diagonal(n), "ngCMatrix")
}
266 | 
#' Determine initial centers for the kmeans++ algorithm
#'
#' The first center is drawn uniformly at random. Every further center is
#' drawn from the observations that are not yet centers, with probability
#' proportional to the squared distance to the closest center chosen so far.
#' Exactly one of `x` and `dm` needs to be supplied.
#'
#' @param n_cluster number of initial centers to select (between 1 and the
#'                  number of observations)
#' @param x data matrix to be clustered
#' @param dm distance matrix (between rows of x; of class "dist")
#'
#' @return Row indices of initial cluster centers of x
#'
#' @keywords internal
kmeanspp_init <- function(n_cluster, x = NULL, dm = NULL) {
  if (sum(c(is.null(x), is.null(dm))) %in% c(0L, 2L)) {
    stop("Exactly one of x or dm needs to be supplied")
  }

  if (!is.null(x)) {
    dm <- stats::dist(x)
  }

  n <- attr(dm, "Size")

  if (n_cluster < 1L || n_cluster > n) {
    stop("n_cluster needs to be between 1 and the number of observations")
  }

  centers <- sample(n, size = 1L)
  # seq_len() instead of 2:n_cluster, which would count down (2, 1) and
  # select surplus centers when n_cluster is 1.
  for (i in seq_len(n_cluster - 1L)) {
    remaining_obs <- setdiff(seq_len(n), centers)

    # Distances of all remaining observations (rows) to each center
    # (columns), read directly from the packed "dist" vector: for a < b
    # the distance is stored at n * (a - 1) - a * (a - 1) / 2 + b - a.
    center_dists <- do.call(
      cbind, lapply(
        centers,
        function(center) {
          lower_idx <- remaining_obs[remaining_obs < center]
          upper_idx <- remaining_obs[remaining_obs > center]

          c(
            dm[
              n * (lower_idx - 1)
              - lower_idx * (lower_idx - 1) / 2
              + center
              - lower_idx
            ],
            dm[
              n * (center - 1)
              - center * (center - 1) / 2
              + upper_idx
              - center
            ]
          )
        }
      )
    )

    # Log of the squared distance to the closest center
    log_ws_sq <- log(apply(center_dists, 1, min)^2)
    max_log_ws_sq <- max(log_ws_sq)
    if (identical(max_log_ws_sq, -Inf)) {
      # All remaining observations coincide with a center. The weights
      # would be 0 / 0, so sample uniformly instead.
      ps <- NULL
    } else {
      # Normalise on the log-scale, shifted by the maximum
      ps <- (
        exp(log_ws_sq - max_log_ws_sq) / sum(exp(log_ws_sq - max_log_ws_sq))
      )
    }
    centers <- c(centers, remaining_obs[
      sample.int(length(remaining_obs), size = 1, prob = ps)
    ])
  }

  centers
}
326 | 
#' Perform the k-means++ algorithm
#'
#' Performs the k-means++ algorithm to cluster the rows of the input matrix.
#'
#' Estimation is repeated `n_init_clusterings` times, each time starting
#' from a new random k-means++ initialization. The clustering with the
#' smallest total within-cluster sum of squares is returned.
#'
#' @param x Input matrix (n x p)
#' @param n_cluster Number of clusters
#' @param n_init_clusterings Number of repeated random initializations
#'                           to perform
#' @param n_max_iter Number of maximum iterations to perform in the k-means
#'                   algorithm
#'
#' @return A vector of cluster assignments for the rows of `x`, i.e. the
#'         `cluster` component of the best [`stats::kmeans`] fit.
#'
#' @references
#' David Arthur and Sergei Vassilvitskii. K-Means++: The advantages
#' of careful seeding. In Proceedings of the Eighteenth Annual ACM-SIAM
#' Symposium on Discrete Algorithms, SODA '07, pages 1027--1035.
#' Society for Industrial and Applied Mathematics, 2007.
#'
#' @concept helpers
#'
#' @export
kmeanspp <- function(x, n_cluster, n_init_clusterings = 10L, n_max_iter = 10L) {
  # The distance matrix is computed once and shared by all initializations
  dm <- stats::dist(x)
  initial_center_indices <- lapply(
    seq_len(n_init_clusterings),
    function(i) {
      kmeanspp_init(n_cluster, dm = dm)
    }
  )
  # Remove reference to dm
  dm <- NULL

  # Run k-means once per set of initial centers
  clusterings <- lapply(
    initial_center_indices,
    function(center_idx) {
      stats::kmeans(
        x,
        centers = x[center_idx, , drop = FALSE],
        iter.max = n_max_iter
      )
    }
  )

  # Keep the clustering with the smallest total within-cluster sum of squares
  min_idx <- which.min(
    vapply(clusterings, function(cl) cl$tot.withinss, numeric(1))
  )
  clusterings[[min_idx]]$cluster
}
376 | 
#' Determine module sizes
#'
#' Counts how many entries of `module` belong to the noise module (index
#' `-1`) and to each of the modules `1, ..., n_modules`.
#'
#' @param module Vector of module indices
#' @param n_modules Total number of modules
#'
#' @return A named vector containing the name of the module (its index or
#'         `"Noise"`) and the number of elements in that module
#'
#' @concept helpers
#'
#' @export
find_module_sizes <- function(module, n_modules) {
  module_ids <- seq_len(n_modules)

  # Noise module first, followed by the regular modules
  sizes <- vapply(
    c(-1L, module_ids),
    function(id) sum(module == id),
    integer(1)
  )
  names(sizes) <- c("Noise", module_ids)

  sizes
}
399 | 
#' Remove empty modules
#'
#' @details
#' Only positive module indices are relabelled; the noise module (and any
#' other non-positive index) is left untouched. If the largest index exceeds
#' the number of distinct positive indices, the positive indices are
#' replaced by `1, 2, ...` in order of their first appearance in `module`.
#'
#' @param module Vector of module indices
#'
#' @return The updated vector of module indices with empty modules removed.
#'
#' @keywords internal
remove_empty_modules <- function(module) {
  present <- unique(module[module > 0])

  # Labels are already gap-free: nothing to do
  if (max(module) <= length(present)) {
    return(module)
  }

  # New label = position of the old label among the distinct ones
  relabelled <- module
  positive <- which(module > 0)
  relabelled[positive] <- match(module[positive], present)

  relabelled
}
422 | 
#' Extract target gene modules for given penalization parameters
#'
#' @param fit An object of class `scregclust`
#' @param penalization A numeric vector of penalization parameters.
#'                     The penalization parameters specified here must have
#'                     been used during fitting of the `fit` object.
#'                     If `NULL`, all fitted penalization parameters are used.
#'
#' @return A list of lists of final target modules. One list for each
#'         selected penalization parameter, in the order in which they are
#'         stored in `fit`. The lists contain the modules of
#'         target genes for each final configuration.
#'
#' @concept utilities
#'
#' @export
get_target_gene_modules <- function(fit, penalization = NULL) {
  not_fitted <- !(penalization %in% fit$penalization)
  if (any(not_fitted)) {
    cli::cli_abort(c(
      "Not all parameter values in {.var penalization} have been fitted.",
      "i" = paste(
        "Penalization parameters in {.class scregclust} object:",
        "{fit$penalization}"
      ),
      "i" = "Penalization parameters provided: {penalization}"
    ))
  }

  # Positions of the selected penalization parameters within the fit
  idx <- if (is.null(penalization)) {
    seq_along(fit$penalization)
  } else {
    which(fit$penalization %in% penalization)
  }

  lapply(idx, function(i) {
    res <- fit$results[[i]]
    # Drop the regulators, keeping only the target genes' module labels
    is_target <- !res$is_regulator

    lapply(res$output, function(config) config$module[is_target])
  })
}
464 | 
#' Create a table of module overlap for two clusterings
#'
#' Compares two clusterings and creates a table of overlap between them.
#' Module labels do not have to match.
#'
#' @param k1 First clustering
#' @param k2 Second clustering
#'
#' @return A matrix showing the module overlap with the labels of `k1` in
#'         the columns and the labels of `k2` in the rows. Each entry is
#'         the number of elements carrying the respective pair of labels.
#'
#' @concept helpers
#'
#' @export
cluster_overlap <- function(k1, k2) {
  if (length(k1) != length(k2)) {
    cli::cli_abort(c(
      "Clusterings are not the same length.",
      "i" = "Length of {.var k1}: {length(k1)}",
      "i" = "Length of {.var k2}: {length(k2)}"
    ))
  }

  labels1 <- sort(unique(k1))
  labels2 <- sort(unique(k2))

  # One column of the overlap table: the members of a single module of k1
  # counted by their label in k2
  overlap_column <- function(label1) {
    in_module <- k1 == label1
    column <- vapply(
      labels2,
      function(label2) sum(in_module & (k2 == label2)),
      integer(1)
    )
    names(column) <- labels2
    column
  }

  overlap <- do.call(cbind, lapply(labels1, overlap_column))
  colnames(overlap) <- labels1

  overlap
}
500 | 
#' Extract final configurations into a data frame
#'
#' @param obj An object of class `scregclust`
#'
#' @return A [`data.frame`] containing penalization parameters
#'         (`penalization`) and the number of final configurations for
#'         those penalizations (`final_configurations`).
#'
#' @concept helpers
#'
#' @export
available_results <- function(obj) {
  # vapply() guarantees an integer column even when there are no results;
  # sapply() would return an empty list and the column would be dropped.
  n_configs <- vapply(
    obj$results, function(res) length(res$output), integer(1)
  )

  data.frame(
    penalization = obj$penalization,
    final_configurations = n_configs
  )
}
517 | 
#' Fast computation of correlation
#'
#' This uses a more memory-intensive but much faster algorithm than
#' the built-in `cor` function.
#'
#' Computes the correlation between the columns of `x` and `y`. Both inputs
#' are column-centered; the cross-products of the centered columns are then
#' divided by the square root of the product of their sums of squares.
#' The result is clamped to the interval from -1 to 1.
#'
#' @param x first input matrix
#' @param y second input matrix
#'
#' @return Correlations matrix between the columns of `x` and `y`
#'
#' @concept helpers
#'
#' @export
fast_cor <- function(x, y) {
  # Subtract the column means
  center_columns <- function(m) {
    m <- as.matrix(m)
    sweep(m, 2L, colMeans(m, na.rm = TRUE), check.margin = FALSE)
  }

  x_centered <- center_columns(x)
  y_centered <- center_columns(y)

  # Sums of squares of the centered columns
  x_ss <- colSums(x_centered^2)
  y_ss <- colSums(y_centered^2)

  cors <- crossprod(x_centered, y_centered) / sqrt(outer(x_ss, y_ss))

  # Rounding errors can push values marginally outside of [-1, 1]
  pmax(pmin(cors, 1), -1)
}
542 | 
#' Return the number of final configurations
#'
#' Returns the number of final configurations per penalization parameter in
#' an `scregclust` object.
#'
#' @param fit An object of class `scregclust`
#'
#' @return An integer vector containing the number of final configurations
#'         for each penalization parameter.
#'
#' @concept utilities
#'
#' @export
get_num_final_configs <- function(fit) {
  # vapply() keeps the documented integer return type even if `fit`
  # contains no results; sapply() would return an empty list.
  vapply(fit$results, function(r) length(r$output), integer(1))
}
559 | 
#' Get the average number of active regulators per module
#'
#' For every penalization parameter, the column sums of the `models` matrix
#' of each final configuration are computed and then averaged over all
#' final configurations.
#'
#' @param fit An object of class `scRegClust`
#'
#' @return A [`data.frame`] containing the average number of active regulators
#'         per module for each penalization parameter.
#'
#' @concept utilities
#'
#' @export
get_avg_num_regulators <- function(fit) {
  per_penalization <- lapply(fit$results, function(res) {
    # One vector of column sums per final configuration, named by module
    config_counts <- lapply(res$output, function(config) {
      n_active <- colSums(config$models)
      names(n_active) <- seq_len(ncol(config$models))
      n_active
    })

    # Average over the final configurations (rows)
    avg_counts <- colMeans(do.call(rbind, config_counts))

    c(penalization = res$penalization, avg_counts)
  })

  as.data.frame(do.call(rbind, per_penalization))
}
585 | 
#' Compute the Rand index
#'
#' The Rand index is the proportion of pairs of observations on which the
#' two clusterings agree, i.e. pairs that are placed in the same cluster by
#' both clusterings or in different clusters by both clusterings. The pairs
#' are counted from the contingency table of the two clusterings instead of
#' enumerating all pairs explicitly.
#'
#' @param k1 First clustering as vector of integers
#' @param k2 Second clustering as vector of integers
#'
#' @return The Rand index as a numeric value
#'
#' @references
#' W. M. Rand (1971). "Objective criteria for the evaluation of clustering
#' methods". Journal of the American Statistical Association 66 (336): 846–850.
#' DOI:10.2307/2284239
#'
#' @keywords internal
compute_rand_index <- function(k1, k2) {
  n <- length(k1)

  # Assertion
  stopifnot(length(k2) == n)
  stopifnot(is.numeric(k1), all(as.integer(k1) == k1))
  stopifnot(is.numeric(k2), all(as.integer(k2) == k2))

  # Construct contingency table
  ct <- table(k1, k2)

  # Number of pairs that are together in k1, in k2 and in both
  sum_as <- sum(choose(rowSums(ct), 2))
  sum_bs <- sum(choose(colSums(ct), 2))
  sum_ns <- sum(choose(as.vector(ct), 2))
  n_pairs <- choose(n, 2)

  # Agreeing pairs = together in both + separated in both, where
  # separated in both = n_pairs - sum_as - sum_bs + sum_ns
  (n_pairs + 2 * sum_ns - sum_as - sum_bs) / n_pairs
}
618 | 
#' Compute Hubert's and Arabie's Adjusted Rand index
#'
#' If both clusterings are trivial in the same way (both place all
#' observations in a single cluster, or both place every observation in
#' its own cluster), the index is undefined (0 / 0). Since the two
#' partitions are identical in these cases, 1 is returned.
#'
#' @param k1 First clustering as vector of integers
#' @param k2 Second clustering as vector of integers
#'
#' @return The Adjusted Rand index as a numeric value
#'
#' @references
#' Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions".
#' Journal of Classification. 2 (1): 193–218. DOI:10.1007/BF01908075
#'
#' @keywords internal
compute_adjusted_rand_index <- function(k1, k2) {
  n <- length(k1)

  # Assertion
  stopifnot(length(k2) == n)
  stopifnot(is.numeric(k1), all(as.integer(k1) == k1))
  stopifnot(is.numeric(k2), all(as.integer(k2) == k2))

  # Construct contingency table
  ct <- table(k1, k2)

  # Compute binomial pair sums
  sum_as <- sum(choose(rowSums(ct), 2))
  sum_bs <- sum(choose(colSums(ct), 2))
  sum_ns <- sum(choose(as.vector(ct), 2))
  denom <- choose(n, 2)

  # The denominator below vanishes exactly if both clusterings have no
  # co-clustered pairs at all or if both co-cluster every pair. These
  # counts are whole numbers, so exact comparison is safe.
  if (sum_as == sum_bs && (sum_as == 0 || sum_as == denom)) {
    return(1)
  }

  # Compute adjusted Rand index
  expected_index <- (sum_as * sum_bs) / denom
  max_index <- 0.5 * (sum_as + sum_bs)

  (sum_ns - expected_index) / (max_index - expected_index)
}
657 | 
#' Compute Rand indices
#'
#' Compute Rand indices for fitted scregclust object
#'
#' Target genes assigned to the noise module (index `-1`) are excluded
#' before an index is computed. The index is then multiplied by the
#' proportion of target genes that were not marked as noise. If all target
#' genes are marked as noise, the index is set to 0.
#'
#' @param fit An object of class `scregclust`
#' @param groundtruth A known clustering of the target genes (integer vector)
#' @param adjusted If TRUE, the Adjusted Rand index is computed. Otherwise the
#'                 ordinary Rand index is computed.
#'
#' @return A [`data.frame`] containing the Rand indices. Since there can
#'         be more than one final configuration for some penalization
#'         parameters, Rand indices are averaged for each fixed penalization
#'         parameter. Returned are the mean, standard deviation and number
#'         of final configurations that were averaged.
#'
#' @references
#' W. M. Rand (1971). "Objective criteria for the evaluation of clustering
#' methods". Journal of the American Statistical Association 66 (336): 846–850.
#' DOI:10.2307/2284239
#'
#' Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions".
#' Journal of Classification. 2 (1): 193–218. DOI:10.1007/BF01908075
#'
#' @concept utilities
#'
#' @export
get_rand_indices <- function(fit, groundtruth, adjusted = TRUE) {
  df <- do.call(rbind, lapply(get_target_gene_modules(fit), function(cs) {
    # One index per final configuration. vapply() keeps the result numeric
    # even if there are no final configurations.
    indices <- vapply(cs, function(cl) {
      # The ground truth needs to describe the same target genes
      stopifnot(length(groundtruth) == length(cl))

      noise_idx <- which(cl == -1)
      if (length(noise_idx) > 0) {
        cl_ <- cl[-noise_idx]
        gt_ <- groundtruth[-noise_idx]
      } else {
        cl_ <- cl
        gt_ <- groundtruth
      }

      # Everything was marked as noise
      if (length(cl_) == 0) {
        return(0)
      }

      index <- if (adjusted) {
        compute_adjusted_rand_index(gt_, cl_)
      } else {
        compute_rand_index(gt_, cl_)
      }

      # Weight by the proportion of target genes not marked as noise
      index * length(cl_) / length(cl)
    }, numeric(1))

    data.frame(
      mean = mean(indices),
      sd = stats::sd(indices),
      n = length(indices)
    )
  }))

  cbind(data.frame(penalization = fit$penalization), df)
}
712 | 


--------------------------------------------------------------------------------