├── src ├── .gitignore ├── Makevars ├── Makevars.win ├── utils.cpp ├── jaccard.cpp ├── allocation.cpp ├── RcppExports.cpp └── optim.cpp ├── .github ├── .gitignore └── workflows │ ├── close-inactive-issues.yml │ ├── pkgdown.yaml │ └── rhub.yaml ├── vignettes └── articles │ ├── .gitignore │ ├── mouse.Rmd │ └── pbmc.Rmd ├── .gitattributes ├── cran-comments.md ├── R ├── sysdata.rda ├── scregclust-package.R ├── RcppExports.R ├── plotting.R └── utils.R ├── man ├── figures │ └── overview_fig1A_bg.png ├── reset_array.Rd ├── coef_ridge.Rd ├── available_results.Rd ├── kmeanspp_init.Rd ├── find_module_sizes.Rd ├── remove_empty_modules.Rd ├── coef_ols.Rd ├── get_avg_num_regulators.Rd ├── get_num_final_configs.Rd ├── alloc_array.Rd ├── get_regulator_list.Rd ├── fast_cor.Rd ├── cluster_overlap.Rd ├── count_table.Rd ├── progstr.Rd ├── compute_rand_index.Rd ├── compute_adjusted_rand_index.Rd ├── get_target_gene_modules.Rd ├── jaccard_indicator.Rd ├── plot_module_count_helper.Rd ├── scregclust_format.Rd ├── scregclust-package.Rd ├── plot_silhouettes.Rd ├── kmeanspp.Rd ├── coef_nnls.Rd ├── plot_regulator_network.Rd ├── get_rand_indices.Rd ├── jaccard_indicator_comp.Rd ├── split_sample.Rd ├── coop_lasso.Rd └── scregclust.Rd ├── .gitignore ├── pkgdown └── assets │ └── overview_fig1A_bg.png ├── datasets ├── mouse_scregclust.rds ├── pbmc_scregclust.rds ├── humanKinases.txt ├── humanTFs_v2.txt ├── humanTFs.txt └── humanTFs_v3.txt ├── .clang-format ├── .Rbuildignore ├── tests ├── testthat │ ├── test-fast-cor.R │ └── test-constant-genes.R └── testthat.R ├── inst └── CITATION ├── scripts └── update-sysdata.R ├── index.md ├── _pkgdown.yml ├── NEWS.md ├── README.md ├── DESCRIPTION └── NAMESPACE /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | 
*.html 2 | -------------------------------------------------------------------------------- /vignettes/articles/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.rds filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | * Removed vignettes 2 | * Addressed CRAN issues 3 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scmethods/scregclust/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 2 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) 3 | -------------------------------------------------------------------------------- /man/figures/overview_fig1A_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scmethods/scregclust/HEAD/man/figures/overview_fig1A_bg.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | inst/doc 3 | /doc/ 4 | /Meta/ 5 | 
docs 6 | CRAN-SUBMISSION 7 | .cache 8 | compile_commands.json 9 | -------------------------------------------------------------------------------- /pkgdown/assets/overview_fig1A_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scmethods/scregclust/HEAD/pkgdown/assets/overview_fig1A_bg.png -------------------------------------------------------------------------------- /datasets/mouse_scregclust.rds: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f357f3aef9f6f48346394458643ff2d345542eed6ac81a53ad9f8ed239caead8 3 | size 7995810 4 | -------------------------------------------------------------------------------- /datasets/pbmc_scregclust.rds: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3c92b65257337e04b370addc88f029fc16533db5cda7a9647cbb330d8d95a3ff 3 | size 15045381 4 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 90 3 | DerivePointerAlignment: false 4 | PointerAlignment: Left 5 | AccessModifierOffset: 0 6 | NamespaceIndentation: All 7 | IncludeBlocks: Preserve 8 | TabWidth: 4 9 | IndentWidth: 4 10 | UseTab: Always 11 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^datasets$ 2 | ^scripts$ 3 | ^LICENSE\.md$ 4 | \.RData$ 5 | ^_pkgdown\.yml$ 6 | ^docs$ 7 | ^pkgdown$ 8 | ^index\.md$ 9 | ^\.github$ 10 | ^\.clang-format$ 11 | ^CRAN-SUBMISSION$ 12 | ^cran-comments\.md$ 13 | \.cache 14 | compile_commands\.json 15 | ^vignettes/articles$ 16 | 
-------------------------------------------------------------------------------- /tests/testthat/test-fast-cor.R: -------------------------------------------------------------------------------- 1 | test_that("fast correlation computation", { 2 | pt <- 50 3 | pr <- 10 4 | n <- 200 5 | 6 | zt <- matrix(rnorm(pt * n), ncol = pt) 7 | zr <- matrix(rnorm(pr * n), ncol = pr) 8 | 9 | c_ref <- cor(zt, zr) 10 | c2 <- fast_cor(zt, zr) 11 | 12 | expect_equal( 13 | c_ref, 14 | c2 15 | ) 16 | }) 17 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(scregclust) 11 | 12 | test_check("scregclust") 13 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Article", 3 | title = "Reconstructing the regulatory programs underlying the phenotypic plasticity of neural cancers", 4 | author = "Ida Larsson, Felix Held, Gergana Popova, Alper Koc, Soumi Kundu, Rebecka Jörnsten, Sven Nelander", 5 | journal = "Nature Communications", 6 | year = "2024", 7 | volume = "15", 8 | number = "9699", 9 | doi = "10.1038/s41467-024-53954-3" 10 | ) 11 | -------------------------------------------------------------------------------- /man/reset_array.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | 
\name{reset_array} 4 | \alias{reset_array} 5 | \title{Reset input 3d-array by filling matrix along first dimension} 6 | \usage{ 7 | reset_array(arr, input) 8 | } 9 | \arguments{ 10 | \item{arr}{The 3d-array of dimension \verb{n_cl x n_obs x n_genes}} 11 | 12 | \item{input}{The matrix of size \verb{n_obs x n_genes}} 13 | } 14 | \description{ 15 | Reset input 3d-array by filling matrix along first dimension 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/coef_ridge.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{coef_ridge} 4 | \alias{coef_ridge} 5 | \title{Compute ridge regression coefficients} 6 | \usage{ 7 | coef_ridge(y, x, lambda) 8 | } 9 | \arguments{ 10 | \item{y}{Target vector (n x 1)/matrix (n x m)} 11 | 12 | \item{x}{Design matrix (n x p)} 13 | 14 | \item{lambda}{Positive parameter for ridge penalty} 15 | } 16 | \value{ 17 | Vector of ridge regression coefficients 18 | } 19 | \description{ 20 | Compute ridge regression coefficients 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/available_results.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{available_results} 4 | \alias{available_results} 5 | \title{Extract final configurations into a data frame} 6 | \usage{ 7 | available_results(obj) 8 | } 9 | \arguments{ 10 | \item{obj}{An object of class \code{scregclust}} 11 | } 12 | \value{ 13 | A \code{\link{data.frame}} containing penalization parameters and 14 | final configurations for those penalizations. 
15 | } 16 | \description{ 17 | Extract final configurations into a data frame 18 | } 19 | \concept{helpers} 20 | -------------------------------------------------------------------------------- /man/kmeanspp_init.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{kmeanspp_init} 4 | \alias{kmeanspp_init} 5 | \title{Determine initial centers for the kmeans++ algorithm} 6 | \usage{ 7 | kmeanspp_init(n_cluster, x = NULL, dm = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{data matrix to be clustered} 11 | 12 | \item{dm}{distance matrix (between rows of x; of class "dist")} 13 | } 14 | \value{ 15 | Row indices of initial cluster centers of x 16 | } 17 | \description{ 18 | Determine initial centers for the kmeans++ algorithm 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/find_module_sizes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{find_module_sizes} 4 | \alias{find_module_sizes} 5 | \title{Determine module sizes} 6 | \usage{ 7 | find_module_sizes(module, n_modules) 8 | } 9 | \arguments{ 10 | \item{module}{Vector of module indices} 11 | 12 | \item{n_modules}{Total number of modules} 13 | } 14 | \value{ 15 | A named vector containing the name of the module (its index or 16 | \code{"Noise"}) and the number of elements in that module 17 | } 18 | \description{ 19 | Determine module sizes 20 | } 21 | \concept{helpers} 22 | -------------------------------------------------------------------------------- /man/remove_empty_modules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | 
\name{remove_empty_modules} 4 | \alias{remove_empty_modules} 5 | \title{Remove empty modules} 6 | \usage{ 7 | remove_empty_modules(module) 8 | } 9 | \arguments{ 10 | \item{module}{Vector of module indices} 11 | } 12 | \value{ 13 | The updated vector of module indices with empty modules removed. 14 | } 15 | \description{ 16 | Remove empty modules 17 | } 18 | \details{ 19 | Only iterates through modules with positive index, leaving the noise 20 | module untouched. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/coef_ols.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{coef_ols} 4 | \alias{coef_ols} 5 | \title{Compute OLS coefficients} 6 | \usage{ 7 | coef_ols(y, x) 8 | } 9 | \arguments{ 10 | \item{y}{Target vector (n x 1)/matrix (n x m)} 11 | 12 | \item{x}{Design matrix (n x p)} 13 | } 14 | \value{ 15 | Vector of OLS coefficients 16 | } 17 | \description{ 18 | If the design matrix has full column-rank, then use the normal 19 | least squares estimate. Otherwise, use the Moore-Penrose inverse 20 | to compute the least squares estimate. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/get_avg_num_regulators.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_avg_num_regulators} 4 | \alias{get_avg_num_regulators} 5 | \title{Get the average number of active regulators per module} 6 | \usage{ 7 | get_avg_num_regulators(fit) 8 | } 9 | \arguments{ 10 | \item{fit}{An object of class \code{scRegClust}} 11 | } 12 | \value{ 13 | A \code{\link{data.frame}} containing the average number of active regulators 14 | per module for each penalization parameter. 
15 | } 16 | \description{ 17 | Get the average number of active regulators per module 18 | } 19 | \concept{utilities} 20 | -------------------------------------------------------------------------------- /man/get_num_final_configs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_num_final_configs} 4 | \alias{get_num_final_configs} 5 | \title{Return the number of final configurations} 6 | \usage{ 7 | get_num_final_configs(fit) 8 | } 9 | \arguments{ 10 | \item{fit}{An object of class \code{scRegClust}} 11 | } 12 | \value{ 13 | An integer vector containing the number of final configurations 14 | for each penalization parameter. 15 | } 16 | \description{ 17 | Returns the number of final configurations per penalization parameter in an 18 | scRegClust object. 19 | } 20 | \concept{utilities} 21 | -------------------------------------------------------------------------------- /man/alloc_array.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{alloc_array} 4 | \alias{alloc_array} 5 | \title{Allocate 3d-array and fill with matrix along first dimension} 6 | \usage{ 7 | alloc_array(input, n_cl) 8 | } 9 | \arguments{ 10 | \item{input}{the matrix of size \verb{n_obs x n_genes}} 11 | 12 | \item{n_cl}{the size of the three-dimensional array's first dimension} 13 | } 14 | \value{ 15 | The allocated and filled array of size \verb{n_cl x n_obs x n_genes} 16 | } 17 | \description{ 18 | Allocate 3d-array and fill with matrix along first dimension 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/get_regulator_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do 
not edit by hand 2 | % Please edit documentation in R/scregclust.R 3 | \name{get_regulator_list} 4 | \alias{get_regulator_list} 5 | \title{Return list of regulator genes} 6 | \usage{ 7 | get_regulator_list(mode = c("TF", "kinase")) 8 | } 9 | \arguments{ 10 | \item{mode}{Determines which genes are considered to be regulators. 11 | Currently supports TF=transcription factors and kinases.} 12 | } 13 | \value{ 14 | a list of gene symbols 15 | } 16 | \description{ 17 | Return list of regulator genes 18 | } 19 | \seealso{ 20 | \code{\link[=scregclust_format]{scregclust_format()}} 21 | } 22 | \concept{utilities} 23 | -------------------------------------------------------------------------------- /man/fast_cor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{fast_cor} 4 | \alias{fast_cor} 5 | \title{Fast computation of correlation} 6 | \usage{ 7 | fast_cor(x, y) 8 | } 9 | \arguments{ 10 | \item{x}{first input matrix} 11 | 12 | \item{y}{second input matrix} 13 | } 14 | \value{ 15 | Correlation matrix between the columns of \code{x} and \code{y} 16 | } 17 | \description{ 18 | This uses a more memory-intensive but much faster algorithm than 19 | the built-in \code{cor} function. 20 | } 21 | \details{ 22 | Computes the correlation between the columns of \code{x} and \code{y}. 
23 | } 24 | \concept{helpers} 25 | -------------------------------------------------------------------------------- /man/cluster_overlap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{cluster_overlap} 4 | \alias{cluster_overlap} 5 | \title{Create a table of module overlap for two clusterings} 6 | \usage{ 7 | cluster_overlap(k1, k2) 8 | } 9 | \arguments{ 10 | \item{k1}{First clustering} 11 | 12 | \item{k2}{Second clustering} 13 | } 14 | \value{ 15 | A matrix showing the module overlap with the labels of \code{k1} in 16 | the columns and the labels of \code{k2} in the rows. 17 | } 18 | \description{ 19 | Compares two clusterings and creates a table of overlap between them. 20 | Module labels do not have to match. 21 | } 22 | \concept{helpers} 23 | -------------------------------------------------------------------------------- /tests/testthat/test-constant-genes.R: -------------------------------------------------------------------------------- 1 | test_that("constant genes are discarded correctly", { 2 | expression <- rbind( 3 | rep.int(1, 100), 4 | matrix(rnorm(500), nrow = 5), 5 | rep.int(0.5, 100), 6 | rnorm(100) 7 | ) 8 | 9 | genesymbols <- c("T1", "T2", "T3", "T4", "T5", "T6", "R1", "R2") 10 | is_regulator <- c(0, 0, 0, 0, 0, 0, 1, 1) 11 | 12 | fit <- scregclust( 13 | expression, genesymbols, is_regulator, 0.1, 2, verbose = FALSE 14 | ) 15 | 16 | expect_equal( 17 | fit$results[[1]]$genesymbols, 18 | c("T2", "T3", "T4", "T5", "T6", "R2") 19 | ) 20 | expect_equal( 21 | fit$results[[1]]$is_regulator, 22 | c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) 23 | ) 24 | }) 25 | -------------------------------------------------------------------------------- /man/count_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit 
documentation in R/utils.R 3 | \name{count_table} 4 | \alias{count_table} 5 | \title{Format count table nicely} 6 | \usage{ 7 | count_table(counts, title, row_names, col_width = 5) 8 | } 9 | \arguments{ 10 | \item{counts}{a list of count vectors with \code{1 + n_cl} entries each. 11 | \code{NA} values are replaced with \code{-}} 12 | 13 | \item{title}{title above the table} 14 | 15 | \item{row_names}{a vector of row names, one for each count vector} 16 | 17 | \item{col_width}{minimum width for columns} 18 | } 19 | \value{ 20 | A string formatted as a table 21 | } 22 | \description{ 23 | Format count table nicely 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/progstr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{progstr} 4 | \alias{progstr} 5 | \title{Quick'n'dirty progress bar} 6 | \usage{ 7 | progstr(step, n_steps, name, finished = FALSE, progress_length = 20L) 8 | } 9 | \arguments{ 10 | \item{step}{current step being worked on} 11 | 12 | \item{n_steps}{total number of steps} 13 | 14 | \item{name}{name of the process} 15 | 16 | \item{finished}{whether the process is finished} 17 | 18 | \item{progress_length}{length of the progress bar in ascii signs} 19 | } 20 | \value{ 21 | A string formatted as a progress bar 22 | } 23 | \description{ 24 | Creates a progress bar and returns it as a string. 
25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/compute_rand_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{compute_rand_index} 4 | \alias{compute_rand_index} 5 | \title{Compute the Rand index} 6 | \usage{ 7 | compute_rand_index(k1, k2) 8 | } 9 | \arguments{ 10 | \item{k1}{First clustering as vector of integers} 11 | 12 | \item{k2}{Second clustering as vector of integers} 13 | } 14 | \value{ 15 | The Rand index as a numeric value 16 | } 17 | \description{ 18 | Compute the Rand index 19 | } 20 | \references{ 21 | W. M. Rand (1971). "Objective criteria for the evaluation of clustering 22 | methods". Journal of the American Statistical Association 66 (336): 846–850. 23 | DOI:10.2307/2284239 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /scripts/update-sysdata.R: -------------------------------------------------------------------------------- 1 | # Load data to be used inside package 2 | human_tfs <- read.csv("datasets/humanTFs.txt", header = FALSE)[, 1] 3 | human_tfs_v2 <- read.csv("datasets/humanTFs_v2.txt", header = FALSE)[, 1] 4 | human_tfs_v3 <- read.csv("datasets/humanTFs_v3.txt", header = FALSE)[, 1] 5 | 6 | human_kinases <- read.csv("datasets/humanKinases.txt", header = FALSE)[, 1] 7 | 8 | human_regulators <- read.csv("datasets/humanRegulators.txt", header = FALSE)[, 1] 9 | 10 | # Create R/sysdata.rda with those datasets 11 | usethis::use_data( 12 | human_tfs, 13 | human_tfs_v2, 14 | human_tfs_v3, 15 | human_kinases, 16 | human_regulators, 17 | internal = TRUE, 18 | overwrite = TRUE 19 | ) 20 | 21 | # Use them with e.g. 
scregclust:::human_tfs 22 | # Note that there are three `:` -------------------------------------------------------------------------------- /man/compute_adjusted_rand_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{compute_adjusted_rand_index} 4 | \alias{compute_adjusted_rand_index} 5 | \title{Compute Hubert's and Arabie's Adjusted Rand index} 6 | \usage{ 7 | compute_adjusted_rand_index(k1, k2) 8 | } 9 | \arguments{ 10 | \item{k1}{First clustering as vector of integers} 11 | 12 | \item{k2}{Second clustering as vector of integers} 13 | } 14 | \value{ 15 | The Adjusted Rand index as a numeric value 16 | } 17 | \description{ 18 | Compute Hubert's and Arabie's Adjusted Rand index 19 | } 20 | \references{ 21 | Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions". 22 | Journal of Classification. 2 (1): 193–218. DOI:10.1007/BF01908075 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /.github/workflows/close-inactive-issues.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "30 1 * * *" 5 | 6 | jobs: 7 | close-issues: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | pull-requests: write 12 | steps: 13 | - uses: actions/stale@v5 14 | with: 15 | days-before-issue-stale: 30 16 | days-before-issue-close: 30 17 | stale-issue-label: "stale" 18 | stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." 19 | close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale." 
20 | days-before-pr-stale: -1 21 | days-before-pr-close: -1 22 | repo-token: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /man/get_target_gene_modules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_target_gene_modules} 4 | \alias{get_target_gene_modules} 5 | \title{Extract target gene modules for given penalization parameters} 6 | \usage{ 7 | get_target_gene_modules(fit, penalization = NULL) 8 | } 9 | \arguments{ 10 | \item{fit}{An object of class \code{scregclust}} 11 | 12 | \item{penalization}{A numeric vector of penalization parameters. 13 | The penalization parameters specified here must have 14 | been used during fitting of the \code{fit} object.} 15 | } 16 | \value{ 17 | A list of lists of final target modules. One list for each 18 | parameter in \code{penalization}. The lists contain the modules of 19 | target genes for each final configuration. 20 | } 21 | \description{ 22 | Extract target gene modules for given penalization parameters 23 | } 24 | \concept{utilities} 25 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | # Single-cell Regulatory-driven Clustering (scregclust) 2 | 3 | 4 | 5 | 6 | 7 | ![A schematic overview of the steps involved in the scregclust algorithm.](overview_fig1A_bg.png "Overview of the scregclust algorithm") 8 | 9 | ## Introduction 10 | 11 | The goal of *scregclust* is to cluster genes by regulatory programs. To do so, genes are clustered into modules which in turn are associated with regulators. The algorithm alternates between associating regulators to modules and reallocating target genes into modules. 
12 | 13 | A detailed description of the algorithm and an in-depth evaluation of its properties can be found in our original research article [Larsson, Held, et al. (2024) Reconstructing the regulatory programs underlying the phenotypic plasticity of neural cancers. Nature Communications 15, 9699 DOI 10.1038/s41467-024-53954-3](https://doi.org/10.1038/s41467-024-53954-3) -------------------------------------------------------------------------------- /man/jaccard_indicator.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{jaccard_indicator} 4 | \alias{jaccard_indicator} 5 | \title{Compute indicator matrix of pairwise distances smaller than threshold} 6 | \usage{ 7 | jaccard_indicator(x, upper_bnd = 0.8) 8 | } 9 | \arguments{ 10 | \item{x}{the input matrix with vectors to be compared in the rows.} 11 | 12 | \item{upper_bnd}{pairs with a Jaccard distance below this upper bound are 13 | returned as 1 while all others receive the entry 0.} 14 | } 15 | \value{ 16 | A list of vectors describing a sparse lower triangular pattern matrix 17 | \item{i}{Row indices} 18 | \item{j}{Column indices} 19 | } 20 | \description{ 21 | Computes the Jaccard distance between rows of a matrix and returns a 22 | sparse symmetric indicator matrix containing the entries with a distance 23 | of less than a given upper bound. Note that the diagonal is always 1. 
24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/plot_module_count_helper.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotting.R 3 | \name{plot_module_count_helper} 4 | \alias{plot_module_count_helper} 5 | \title{Plot average silhouette scores and average predictive \eqn{R^2}} 6 | \usage{ 7 | plot_module_count_helper(list_of_fits, penalization) 8 | } 9 | \arguments{ 10 | \item{list_of_fits}{A list of \code{scregclust} objects each fit to the same 11 | dataset across a variety of module counts (varying 12 | \code{n_modules} while running \code{\link{scregclust}}).} 13 | 14 | \item{penalization}{Either a single numeric value requesting the results 15 | for the same penalty parameter across all fits in 16 | \code{list_of_fits}, or one for each individual fit.} 17 | } 18 | \value{ 19 | A ggplot2 plot showing the average silhouette score and the 20 | average predictive \eqn{R^2} 21 | } 22 | \description{ 23 | Plot average silhouette scores and average predictive \eqn{R^2} 24 | } 25 | \concept{plotting} 26 | -------------------------------------------------------------------------------- /man/scregclust_format.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scregclust.R 3 | \name{scregclust_format} 4 | \alias{scregclust_format} 5 | \title{Package data before clustering} 6 | \usage{ 7 | scregclust_format(expression_matrix, mode = c("TF", "kinase")) 8 | } 9 | \arguments{ 10 | \item{expression_matrix}{The p x n gene expression matrix with gene symbols 11 | as rownames.} 12 | 13 | \item{mode}{Determines which genes are considered to be regulators.} 14 | } 15 | \value{ 16 | A list with 17 | \item{genesymbols}{The gene symbols extracted from the expression 
matrix} 18 | \item{sample_assignment}{A vector filled with \code{1}'s of the same length as 19 | there are columns in the gene expression matrix.} 20 | \item{is_regulator}{Whether a gene is considered to be a regulator or not, 21 | determined dependent on \code{mode}.} 22 | } 23 | \description{ 24 | Package data before clustering 25 | } 26 | \seealso{ 27 | \code{\link[=get_regulator_list]{get_regulator_list()}} 28 | } 29 | \concept{main} 30 | -------------------------------------------------------------------------------- /man/scregclust-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scregclust-package.R 3 | \docType{package} 4 | \name{scregclust-package} 5 | \alias{scregclust-package} 6 | \title{scregclust: Reconstructing the Regulatory Programs of Target Genes in scRNA-Seq Data} 7 | \description{ 8 | Implementation of the scregclust algorithm described in Larsson, Held, et al. (2024) \doi{10.1038/s41467-024-53954-3} which reconstructs regulatory programs of target genes in scRNA-seq data. Target genes are clustered into modules and each module is associated with a linear model describing the regulatory program. 
9 | } 10 | \details{ 11 | Computational methods for the scregclust algorithm 12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://scmethods.github.io/scregclust/} 17 | \item \url{https://github.com/scmethods/scregclust/} 18 | \item Report bugs at \url{https://github.com/scmethods/scregclust/issues} 19 | } 20 | 21 | } 22 | \author{ 23 | Ida Larsson, Felix Held, Sven Nelander 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /R/scregclust-package.R: -------------------------------------------------------------------------------- 1 | #' @details 2 | #' Computational methods for the scregclust algorithm 3 | #' @keywords internal 4 | #' @aliases scregclust-package 5 | #' @author Ida Larsson, Felix Held, Sven Nelander 6 | #' @import Rcpp 7 | #' @import cli 8 | #' @import ggplot2 9 | #' @importFrom rlang .data 10 | #' @importFrom prettyunits pretty_dt 11 | #' @importFrom Matrix Matrix sparseMatrix Diagonal t rowSums colSums summary 12 | #' @importFrom stats cor coef predict na.omit kmeans quantile sd dist setNames 13 | #' @importFrom utils read.table head tail globalVariables 14 | #' @importFrom graphics legend 15 | #' @importFrom methods is as 16 | #' @importFrom reshape melt 17 | #' @importFrom igraph graph_from_data_frame delete_edges delete_vertices layout_with_fr V E degree 18 | #' @importFrom grid arrow unit 19 | #' @useDynLib scregclust, .registration = TRUE 20 | "_PACKAGE" 21 | 22 | .onUnload <- function(libpath) { 23 | library.dynam.unload("scregclust", libpath) 24 | } 25 | 26 | # Shut up some annoying `R CMD check` warnings 27 | utils::globalVariables(c(".", "variable")) 28 | -------------------------------------------------------------------------------- /man/plot_silhouettes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotting.R 3 | 
\name{plot_silhouettes} 4 | \alias{plot_silhouettes} 5 | \title{Plot individual silhouette scores} 6 | \usage{ 7 | plot_silhouettes(list_of_fits, penalization, final_config = 1L) 8 | } 9 | \arguments{ 10 | \item{list_of_fits}{A list of \code{scregclust} objects each fit to the same 11 | dataset across a variety of module counts (varying 12 | \code{n_modules} when running \code{\link{scregclust}}).} 13 | 14 | \item{penalization}{Either a single numeric value requesting the results 15 | for the same penalty parameter across all fits in 16 | \code{list_of_fits}, or one for each individual fit.} 17 | 18 | \item{final_config}{The final configuration that should be visualized. 19 | Either a single number to be used for all fits in 20 | \code{list_of_fits}, or one for each individual fit.} 21 | } 22 | \value{ 23 | A ggplot2 plot showing the silhouette scores for each 24 | supplied fit. 25 | } 26 | \description{ 27 | Plot individual silhouette scores 28 | } 29 | \concept{plotting} 30 | -------------------------------------------------------------------------------- /man/kmeanspp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{kmeanspp} 4 | \alias{kmeanspp} 5 | \title{Perform the k-means++ algorithm} 6 | \usage{ 7 | kmeanspp(x, n_cluster, n_init_clusterings = 10L, n_max_iter = 10L) 8 | } 9 | \arguments{ 10 | \item{x}{Input matrix (n x p)} 11 | 12 | \item{n_cluster}{Number of clusters} 13 | 14 | \item{n_init_clusterings}{Number of repeated random initializations 15 | to perform} 16 | 17 | \item{n_max_iter}{Maximum number of iterations to perform in the k-means 18 | algorithm} 19 | } 20 | \value{ 21 | An object of class \code{\link[stats:kmeans]{stats::kmeans}}. 22 | } 23 | \description{ 24 | Performs the k-means++ algorithm to cluster the rows of the input matrix.
25 | } 26 | \details{ 27 | Estimation is repeated for \code{n_init_clusterings} random initializations. 28 | } 29 | \references{ 30 | David Arthur and Sergei Vassilvitskii. K-Means++: The advantages 31 | of careful seeding. In Proceedings of the Eighteenth Annual ACM-SIAM 32 | Symposium on Discrete Algorithms, SODA '07, pages 1027–1035. 33 | Society for Industrial and Applied Mathematics, 2007. 34 | } 35 | \concept{helpers} 36 | -------------------------------------------------------------------------------- /man/coef_nnls.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{coef_nnls} 4 | \alias{coef_nnls} 5 | \title{Compute NNLS coefficients} 6 | \usage{ 7 | coef_nnls(x, y, eps = 1e-12, max_iter = 1000L) 8 | } 9 | \arguments{ 10 | \item{x}{Coefficient matrix (p x n matrix)} 11 | 12 | \item{y}{Right hand side (p x m matrix)} 13 | 14 | \item{eps}{Convergence tolerance} 15 | 16 | \item{max_iter}{Maximum number of iterations} 17 | } 18 | \value{ 19 | A list containing 20 | \item{beta}{The estimated coefficient matrix} 21 | \item{iterations}{A vector containing the number of iterations needed 22 | for the \code{i}-th column in \code{y} in the \code{i}-th entry.} 23 | } 24 | \description{ 25 | Computes non-negative least squares coefficients with a matrix 26 | right hand side. 27 | } 28 | \references{ 29 | Duy Khuong Nguyen and Tu Bao Ho. Accelerated anti-lopsided algorithm 30 | for nonnegative least squares. International Journal of Data Science 31 | and Analytics, 3(1):23–34, 2017.
32 | 33 | Adapted from \url{https://github.com/khuongnd/nnls_antilopsided} 34 | } 35 | \keyword{internal} 36 | -------------------------------------------------------------------------------- /datasets/humanKinases.txt: -------------------------------------------------------------------------------- 1 | MYLK 2 | CRIM1 3 | CAMK2G 4 | DGKB 5 | PRKG1 6 | ROCK1 7 | CCND1 8 | PDGFRB 9 | PLK2 10 | PDGFRL 11 | PKDCC 12 | CIT 13 | CALM2 14 | DTYMK 15 | AURKA 16 | AURKB 17 | BUB1B 18 | CDK1 19 | PBK 20 | TTK 21 | BUB1 22 | CCNB1 23 | CCNB2 24 | NEK2 25 | PLK1 26 | TK1 27 | WEE1 28 | CCNE1 29 | UGP2 30 | HSP90AA1 31 | CAMK2A 32 | EPHA4 33 | FGFR3 34 | LRRK2 35 | NTRK3 36 | PHKG1 37 | TNK2 38 | PIM3 39 | DGKI 40 | ERBB4 41 | CDK6 42 | EPHA3 43 | NRBP2 44 | RPS6KA2 45 | AK4 46 | MAP3K1 47 | PDGFRA 48 | CKB 49 | KIT 50 | SGK1 51 | STK32A 52 | REV3L 53 | MET 54 | TGFBR2 55 | DCLK1 56 | EFEMP1 57 | ABL2 58 | CCL2 59 | CDKN1A 60 | EPHA2 61 | FAM20C 62 | HMGA2 63 | LTBP1 64 | MAP4K4 65 | NRP1 66 | PAK3 67 | DCX 68 | DDR2 69 | KALRN 70 | TRIB3 71 | AXL 72 | FGFR1 73 | SQSTM1 74 | HBEGF 75 | IRS2 76 | NRP2 77 | STK17A 78 | TRIB1 79 | PGM2L1 80 | ITPKC 81 | PFKFB3 82 | PIM1 83 | PLK3 84 | RASSF2 85 | EGFR 86 | CAMK2B 87 | DCLK2 88 | NTRK2 89 | PDGFA 90 | TGFB2 91 | DDR1 92 | MAPK10 93 | SOX9 94 | TRIO 95 | TGFBR3 96 | SPHK1 97 | AATK 98 | CDK18 99 | ERBB3 -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://scmethods.github.io/scregclust/ 2 | template: 3 | bootstrap: 5 4 | navbar: 5 | title: scregclust 6 | left: 7 | - text: Reference 8 | href: reference/index.html 9 | - text: Articles 10 | menu: 11 | - text: Demonstration of workflow 12 | href: articles/pbmc.html 13 | - text: Custom regulator list 14 | href: articles/mouse.html 15 | right: 16 | - icon: fa-github 17 | href: https://github.com/scmethods/scregclust 18 | aria-label: GitHub 19 | 
reference: 20 | - title: Setting up and performing clustering 21 | desc: | 22 | Functions to prepare the input data and to perform single-cell regulatory-driven clustering. 23 | - contents: 24 | - has_concept("main") 25 | - title: Plotting and evaluation 26 | desc: | 27 | Functions which help in plotting and evaluating results. 28 | - contents: 29 | - has_concept("plotting") 30 | - title: Utility functions 31 | desc: | 32 | Functions that make accessing aspects of the results easier. 33 | - contents: 34 | - has_concept("utilities") 35 | - title: Other helpers 36 | - contents: 37 | - has_concept("helpers") 38 | -------------------------------------------------------------------------------- /man/plot_regulator_network.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotting.R 3 | \name{plot_regulator_network} 4 | \alias{plot_regulator_network} 5 | \title{Plotting the regulatory table from scregclust as a directed graph} 6 | \usage{ 7 | plot_regulator_network( 8 | output, 9 | arrow_size = 0.3, 10 | edge_scaling = 30, 11 | no_links = 6, 12 | col = c("gray80", "#FC7165", "#BD828C", "#9D8A9F", "#7D92B2", "#BDA88C", "#FCBD65", 13 | "#F2BB90", "#E7B9BA", "#BDB69C", "#92B27D", "#9B8BA5", "#9D7DB2", "#94A5BF") 14 | ) 15 | } 16 | \arguments{ 17 | \item{output}{Object of type \code{scregclust_output} from a fit of the 18 | scregclust algorithm.} 19 | 20 | \item{arrow_size}{Size of arrow head} 21 | 22 | \item{edge_scaling}{Scaling factor for edge width} 23 | 24 | \item{no_links}{Threshold value (0-10) for number of edges to show, 25 | higher value = more stringent threshold = fewer edges} 26 | 27 | \item{col}{color} 28 | } 29 | \value{ 30 | Graph with gene modules and regulators as nodes 31 | } 32 | \description{ 33 | Plotting the regulatory table from scregclust as a directed graph 34 | } 35 | \concept{plotting} 36 |
-------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # scregclust 0.2.2 2 | 3 | - Vignettes relied heavily on downloading data. Make vignettes articles 4 | that appear on the website but not in the actual package, since it is expected 5 | that anybody profiting from the vignettes will have an active internet 6 | connection anyways. 7 | 8 | # scregclust 0.2.1 9 | 10 | - Added a vignette illustrating how to supply your own regulator list 11 | 12 | # scregclust 0.2.0-1 13 | 14 | - Regulator importance was computed unnecessarily if there is only one 15 | regulator. A single regulator is always the most important regulator 16 | for a cluster. 17 | 18 | # scregclust 0.2.0 19 | 20 | ## New features 21 | 22 | - Quick Mode: Instead of trying to re-allocate all target genes that were 23 | allocated into the noise cluster, only a certain (random) percentage of 24 | these target genes is attempted to be re-allocated. 25 | 26 | `quick_mode = TRUE` has to be supplied as an argument to `scregclust` to 27 | activate this feature (off by default) and the percentage of 28 | noise target genes to re-allocate is given by `quick_mode_percent`, 29 | a number in [0, 1). 
30 | 31 | ## Minor changes 32 | 33 | - Added CRAN install instructions to the README 34 | 35 | # scregclust 0.1.0 36 | 37 | - First release on CRAN 38 | -------------------------------------------------------------------------------- /man/get_rand_indices.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_rand_indices} 4 | \alias{get_rand_indices} 5 | \title{Compute Rand indices} 6 | \usage{ 7 | get_rand_indices(fit, groundtruth, adjusted = TRUE) 8 | } 9 | \arguments{ 10 | \item{fit}{An object of class \code{scregclust}} 11 | 12 | \item{groundtruth}{A known clustering of the target genes (integer vector)} 13 | 14 | \item{adjusted}{If TRUE, the Adjusted Rand index is computed. Otherwise the 15 | ordinary Rand index is computed.} 16 | } 17 | \value{ 18 | A \code{\link{data.frame}} containing the Rand indices. Since there can 19 | be more than one final configuration for some penalization 20 | parameters, Rand indices are averaged for each fixed penalization 21 | parameter. Returned are the mean, standard deviation and number 22 | of final configurations that were averaged. 23 | } 24 | \description{ 25 | Compute Rand indices for fitted scregclust object 26 | } 27 | \references{ 28 | W. M. Rand (1971). "Objective criteria for the evaluation of clustering 29 | methods". Journal of the American Statistical Association 66 (336): 846–850. 30 | DOI:10.2307/2284239 31 | 32 | Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions". 33 | Journal of Classification. 2 (1): 193–218. 
DOI:10.1007/BF01908075 34 | } 35 | \concept{utilities} 36 | -------------------------------------------------------------------------------- /man/jaccard_indicator_comp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{jaccard_indicator_comp} 4 | \alias{jaccard_indicator_comp} 5 | \title{Perform the computations for thresholded Jaccard distance} 6 | \usage{ 7 | jaccard_indicator_comp(gs, eps) 8 | } 9 | \arguments{ 10 | \item{gs}{a list of integer vectors, one for each row, giving the column 11 | indices of the non-zero elements of the row or \code{NULL} if the 12 | whole row is empty.} 13 | 14 | \item{eps}{an upper bound on the Jaccard distance (\code{1 - eps} becomes a 15 | lower bound on the Jaccard similarity)} 16 | } 17 | \value{ 18 | A list with row and column indices in the #row x #row indicator 19 | matrix specifying which rows in the original matrix had a distance 20 | of at most \code{eps}. 21 | } 22 | \description{ 23 | Perform the computations for thresholded Jaccard distance 24 | } 25 | \details{ 26 | This function is optimized for sparse matrices and computes the pairwise 27 | Jaccard distances between the rows of the input matrix. Note that the 28 | actual distance is not saved. Instead, a threshold (\code{eps}) is supplied 29 | and an indicator matrix is returned, with a one indicating that the 30 | distance is smaller than \code{eps} (equivalently, the Jaccard similarity 31 | is larger than \code{1 - eps}). 32 | } 33 | \keyword{internal} 34 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.5.0 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /man/split_sample.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scregclust.R 3 | \name{split_sample} 4 | \alias{split_sample} 5 | \title{Split Sample} 6 | \usage{ 7 | split_sample( 8 | z, 9 | stratification, 10 | is_regulator, 11 | split_indices, 12 | split1_proportion, 13 | total_proportion, 14 | center 15 | ) 16 | } 17 | \arguments{ 18 | \item{z}{matrix of single cell data with rows as genes and columns as cells.} 19 | 20 | \item{stratification}{a vector by which the sampling will be stratified 21 | of length \code{ncol(z)}} 22 | 23 | \item{is_regulator}{an indicator vector, telling which 
rows in \code{z} are 24 | candidate regulators} 25 | 26 | \item{split_indices}{a vector of given split indices. can be \code{NULL}} 27 | 28 | \item{split1_proportion}{proportion to include in first data split} 29 | 30 | \item{total_proportion}{proportion of data to include overall in splitting} 31 | 32 | \item{center}{TRUE if data should be row-centered. Set to FALSE otherwise.} 33 | } 34 | \value{ 35 | a list containing 36 | \item{z1_reg}{first data split, TF-part} 37 | \item{z2_reg}{second data split, TF-part} 38 | \item{z1_target}{first data split, non-TF part} 39 | \item{z2_target}{second data split, non-TF part} 40 | \item{split_indices}{either verbatim the vector given as input or 41 | a vector encoding the splits as NA = not included, 42 | 1 = split 1 or 2 = split 2. Allows reproducibility 43 | of data splits.} 44 | } 45 | \description{ 46 | Splits sample in train and test set 47 | } 48 | \keyword{internal} 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single-cell Regulatory-driven Clustering (scregclust) 2 | 3 | 4 | 5 | [![CRAN status](https://www.r-pkg.org/badges/version/scregclust)](https://CRAN.R-project.org/package=scregclust) 6 | 7 | 8 | ![A diagram illustrating the *scregclust* algorithm.](man/figures/overview_fig1A_bg.png "Illustration of the scregclust algorithm") 9 | 10 | The goal of *scregclust* is to cluster genes by regulatory programs. To do so, genes are clustered into modules which in turn are associated with regulators. The algorithm alternates between associating regulators to modules and reallocating target genes into modules. 
11 | 12 | - The documentation for this package can be found at [https://scmethods.github.io/scregclust/](https://scmethods.github.io/scregclust/) 13 | - A detailed description of the algorithm and an in-depth evaluation of its properties can be found in our original research article [Larsson, Held, et al. (2024) Reconstructing the regulatory programs underlying the phenotypic plasticity of neural cancers. Nature Communications 15, 9699 DOI 10.1038/s41467-024-53954-3](https://doi.org/10.1038/s41467-024-53954-3) 14 | 15 | ## Installation 16 | 17 | You can install the stable version of *scregclust* from [CRAN](https://cran.r-project.org/package=scregclust) with 18 | 19 | ```r 20 | install.packages("scregclust") 21 | ``` 22 | 23 | You can install the current development version of *scregclust* from [GitHub](https://github.com/scmethods/scregclust) with: 24 | 25 | ```r 26 | # install.packages("devtools") 27 | devtools::install_github("scmethods/scregclust") 28 | ``` 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: scregclust 2 | Title: Reconstructing the Regulatory Programs of Target Genes in scRNA-Seq Data 3 | Version: 0.2.2 4 | Authors@R: c( 5 | person("Felix", "Held", ,"felix.held@gmail.com", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-7679-7752")), 7 | person("Ida", "Larsson", ,"ida.larsson@igp.uu.se", role = c("aut"), 8 | comment = c(ORCID = "0000-0001-5422-4243")), 9 | person("Sven", "Nelander", ,"sven.nelander@igp.uu.se", role = c("aut"), 10 | comment = c(ORCID = "0000-0003-1758-1262")), 11 | person("André", "Armatowski", role = c("ctb"))) 12 | Description: Implementation of the scregclust algorithm 13 | described in Larsson, Held, et al. (2024) 14 | which reconstructs regulatory programs of target genes in scRNA-seq data. 
15 | Target genes are clustered into modules and each module is associated with a linear 16 | model describing the regulatory program. 17 | Encoding: UTF-8 18 | Depends: R (>= 4.1.0) 19 | Imports: 20 | Matrix, 21 | stats, 22 | methods, 23 | utils, 24 | reshape, 25 | igraph, 26 | graphics, 27 | grid, 28 | cli, 29 | prettyunits, 30 | ggplot2, 31 | rlang, 32 | Rcpp (>= 1.0.8) 33 | Suggests: 34 | testthat (>= 3.0.0), 35 | hdf5r, 36 | glmGamPoi, 37 | Seurat, 38 | GEOquery 39 | LinkingTo: Rcpp, RcppEigen 40 | Roxygen: list(markdown = TRUE) 41 | RoxygenNote: 7.3.2 42 | License: GPL (>= 3) 43 | Config/testthat/edition: 3 44 | URL: https://scmethods.github.io/scregclust/, https://github.com/scmethods/scregclust/ 45 | BugReports: https://github.com/scmethods/scregclust/issues 46 | Config/Needs/website: rmarkdown 47 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(format,scregclust) 4 | S3method(format,scregclust_output) 5 | S3method(format,scregclust_result) 6 | S3method(plot,scregclust) 7 | S3method(print,scregclust) 8 | S3method(print,scregclust_output) 9 | S3method(print,scregclust_result) 10 | export(available_results) 11 | export(cluster_overlap) 12 | export(fast_cor) 13 | export(find_module_sizes) 14 | export(get_avg_num_regulators) 15 | export(get_num_final_configs) 16 | export(get_rand_indices) 17 | export(get_regulator_list) 18 | export(get_target_gene_modules) 19 | export(kmeanspp) 20 | export(plot_module_count_helper) 21 | export(plot_regulator_network) 22 | export(plot_silhouettes) 23 | export(scregclust) 24 | export(scregclust_format) 25 | import(Rcpp) 26 | import(cli) 27 | import(ggplot2) 28 | importFrom(Matrix,Diagonal) 29 | importFrom(Matrix,Matrix) 30 | importFrom(Matrix,colSums) 31 | importFrom(Matrix,rowSums) 32 | importFrom(Matrix,sparseMatrix) 33 | 
importFrom(Matrix,summary) 34 | importFrom(Matrix,t) 35 | importFrom(graphics,legend) 36 | importFrom(grid,arrow) 37 | importFrom(grid,unit) 38 | importFrom(igraph,E) 39 | importFrom(igraph,V) 40 | importFrom(igraph,degree) 41 | importFrom(igraph,delete_edges) 42 | importFrom(igraph,delete_vertices) 43 | importFrom(igraph,graph_from_data_frame) 44 | importFrom(igraph,layout_with_fr) 45 | importFrom(methods,as) 46 | importFrom(methods,is) 47 | importFrom(prettyunits,pretty_dt) 48 | importFrom(reshape,melt) 49 | importFrom(rlang,.data) 50 | importFrom(stats,coef) 51 | importFrom(stats,cor) 52 | importFrom(stats,dist) 53 | importFrom(stats,kmeans) 54 | importFrom(stats,na.omit) 55 | importFrom(stats,predict) 56 | importFrom(stats,quantile) 57 | importFrom(stats,sd) 58 | importFrom(stats,setNames) 59 | importFrom(utils,globalVariables) 60 | importFrom(utils,head) 61 | importFrom(utils,read.table) 62 | importFrom(utils,tail) 63 | useDynLib(scregclust, .registration = TRUE) 64 | -------------------------------------------------------------------------------- /man/coop_lasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{coop_lasso} 4 | \alias{coop_lasso} 5 | \title{ADMM algorithm for solving the group-penalized least squares problem} 6 | \usage{ 7 | coop_lasso( 8 | y, 9 | x, 10 | lambda, 11 | weights, 12 | beta_0 = NULL, 13 | rho_0 = 0.2, 14 | alpha_0 = 1.5, 15 | n_update = 2L, 16 | eps_corr = 0.2, 17 | max_iter = 1000L, 18 | eps_rel = 1e-08, 19 | eps_abs = 1e-12, 20 | verbose = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{y}{Target (n x m)} 25 | 26 | \item{x}{Design matrix (n x p)} 27 | 28 | \item{lambda}{Penalization parameter} 29 | 30 | \item{weights}{A specific weight for each group (typically this is 31 | \verb{sqrt(group size)}).} 32 | 33 | \item{beta_0}{Initial value for coefficients, allowing for warm start. 
34 | Can be set to NULL, which results in the initial \code{beta} 35 | being a zero matrix.} 36 | 37 | \item{rho_0}{Initial ADMM step-size} 38 | 39 | \item{alpha_0}{Initial ADMM relaxation parameter} 40 | 41 | \item{n_update}{Number of steps in-between updates of the 42 | step-size/adaptation parameters} 43 | 44 | \item{eps_corr}{Lower bound for the correlation in the step-size 45 | update steps} 46 | 47 | \item{max_iter}{Maximum number of iterations} 48 | 49 | \item{eps_rel}{Relative tolerance for convergence check} 50 | 51 | \item{eps_abs}{Absolute tolerance for convergence check} 52 | 53 | \item{verbose}{Whether or not information about the optimization process 54 | should be printed to the terminal} 55 | } 56 | \value{ 57 | A list containing 58 | \item{beta}{The coefficients at convergence} 59 | \item{iterations}{Number of iterations} 60 | } 61 | \description{ 62 | Implements estimation of the coop-lasso problem. 63 | } 64 | \references{ 65 | Xu et al. (2017) Adaptive relaxed ADMM: Convergence theory and 66 | practical implementation. 
DOI 10.1109/CVPR.2017.765 67 | } 68 | \keyword{internal} 69 | -------------------------------------------------------------------------------- /datasets/humanTFs_v2.txt: -------------------------------------------------------------------------------- 1 | SORBS2 2 | CEBPB 3 | EBF1 4 | ETS2 5 | FOXC1 6 | ID3 7 | MEF2C 8 | NR2F2 9 | NR4A2 10 | NR4A3 11 | SMAD7 12 | ZFHX3 13 | ZNF90 14 | IFI16 15 | HMGA1 16 | PRRX1 17 | KLF5 18 | FBN1 19 | PLAGL1 20 | FOXS1 21 | HMGB3 22 | DEPDC1 23 | FOXM1 24 | MXD3 25 | HMGB2 26 | HMGB1 27 | E2F7 28 | EZH2 29 | HIST1H1B 30 | HIST1H1D 31 | MYBL1 32 | DEK 33 | MYBL2 34 | E2F1 35 | H1FX 36 | CARHSP1 37 | HIST1H1A 38 | HIST1H1C 39 | HIST1H1E 40 | LHX2 41 | PAX6 42 | POU3F2 43 | SOX11 44 | ARX 45 | CHD9 46 | FOXJ1 47 | GSX2 48 | HES5 49 | INSM1 50 | NEUROD1 51 | OSR1 52 | PBX1 53 | POU3F4 54 | PROX1 55 | SALL3 56 | SOX21 57 | ZMAT1 58 | ZNF117 59 | CHD7 60 | H1F0 61 | HEY2 62 | JDP2 63 | MLXIP 64 | NFATC1 65 | OSR2 66 | SEMA4A 67 | SKIL 68 | TSC22D3 69 | ZNF331 70 | ZNF503 71 | DBX2 72 | RORA 73 | TCF12 74 | ZIC1 75 | NFIB 76 | NR2F1 77 | PITX1 78 | RORB 79 | STAT1 80 | STAT2 81 | MEOX2 82 | ASCL1 83 | ETV1 84 | HES6 85 | NFIA 86 | OLIG2 87 | RFX4 88 | SOX8 89 | TCF4 90 | ZEB1 91 | ZNF704 92 | HEY1 93 | MEIS2 94 | POU3F3 95 | SOX2 96 | MITF 97 | PAX3 98 | PLXNC1 99 | SNAI2 100 | EPAS1 101 | MAF 102 | TBX2 103 | MET 104 | PLXNA1 105 | AHR 106 | GLIS3 107 | PAWR 108 | BMP2 109 | DRAP1 110 | ELK3 111 | FOSL1 112 | FOXP1 113 | GTF2F2 114 | HMGA2 115 | HOXB2 116 | ID1 117 | KLF7 118 | NR1D1 119 | PRDM1 120 | RUNX1 121 | TBX3 122 | HES1 123 | HIC1 124 | TWIST1 125 | XBP1 126 | PLXNA4 127 | ARID5B 128 | KLF9 129 | MACF1 130 | EGR3 131 | MYC 132 | NFIL3 133 | NR4A1 134 | ATF3 135 | CREB5 136 | EGR1 137 | EGR2 138 | FOS 139 | FOSB 140 | ID4 141 | JUN 142 | JUNB 143 | JUND 144 | KLF10 145 | KLF2 146 | KLF4 147 | KLF6 148 | MAFF 149 | ZFP36 150 | ZFP36L1 151 | ZFP36L2 152 | DDIT3 153 | FOSL2 154 | IRF1 155 | TIPARP 156 | TSC22D1 157 | HOPX 158 | 
OLIG1 159 | TSC22D4 160 | DPF3 161 | HES4 162 | ID2 163 | SMAD1 164 | ZBTB20 165 | BAZ2B 166 | FAM171B 167 | SOX9 168 | TSHZ2 169 | ZFHX4 170 | ZMAT3 171 | NFATC2 172 | TFAP2B 173 | TFAP2A 174 | GPR155 175 | POU3F1 176 | RXRG 177 | SOX10 178 | SOX4 179 | SOX6 180 | ZEB2 181 | ZNF536 182 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | //' Allocate 3d-array and fill with matrix along first dimension 4 | //' 5 | //' @param input the matrix of size `n_obs x n_genes` 6 | //' @param n_cl the size of the three-dimensional array's first dimension 7 | //' 8 | //' @return The allocated and filled array of size `n_cl x n_obs x n_genes` 9 | //' 10 | //' @keywords internal 11 | // [[Rcpp::export]] 12 | SEXP alloc_array(SEXP input, R_xlen_t n_cl) { 13 | const auto n_obs = static_cast(Rf_nrows(input)); 14 | const auto n_genes = static_cast(Rf_ncols(input)); 15 | 16 | const double* const pinput = REAL(input); /* assumes input holds doubles (REALSXP); the type is not checked here -- TODO confirm against caller */ 17 | 18 | const auto n_total = n_cl * n_obs * n_genes; 19 | if (n_total > R_XLEN_T_MAX) { /* NOTE(review): the product is formed before this check, so a multiplication that overflows would not be caught here -- confirm */ 20 | Rcpp::stop("alloc_array: requested allocation too large"); 21 | } 22 | 23 | SEXP arr = PROTECT(Rf_allocVector(REALSXP, n_total)); 24 | double* const parr = REAL(arr); 25 | 26 | for (R_xlen_t i = 0, ub = n_obs * n_genes; i < ub; i++) { 27 | for (R_xlen_t j = 0; j < n_cl; j++) { 28 | parr[i * n_cl + j] = pinput[i]; /* element i of input is copied into the n_cl consecutive slots i * n_cl .. i * n_cl + n_cl - 1 */ 29 | } 30 | } 31 | 32 | UNPROTECT(1); 33 | return arr; /* returned as a plain vector of length n_total; no dim attribute is attached in this function */ 34 | } 35 | 36 | //' Reset input 3d-array by filling matrix along first dimension 37 | //' 38 | //' @param arr The 3d-array of dimension `n_cl x n_obs x n_genes` 39 | //' @param input The matrix of size `n_obs x n_genes` 40 | //' 41 | //' @keywords internal 42 | // [[Rcpp::export]] 43 | void reset_array(SEXP arr, SEXP input) { 44 | const int* const dims = INTEGER(PROTECT(Rf_getAttrib(arr, R_DimSymbol))); 45 | const auto n_cl = static_cast(dims[0]); 46 | const auto n_obs
= static_cast(dims[1]); 47 | const auto n_genes = static_cast(dims[2]); 48 | UNPROTECT(1); /* NOTE(review): dims[0..2] are read without checking that arr has a dim attribute of length 3 -- confirm callers guarantee this */ 49 | 50 | if (static_cast(Rf_nrows(input)) != n_obs || 51 | static_cast(Rf_ncols(input)) != n_genes) { 52 | Rcpp::stop("reset_array: input has wrong dimensions"); 53 | } 54 | 55 | const double* const pinput = REAL(input); 56 | double* const parr = REAL(arr); /* arr is overwritten in place; nothing is returned */ 57 | 58 | for (R_xlen_t i = 0, ub = n_obs * n_genes; i < ub; i++) { 59 | for (R_xlen_t j = 0; j < n_cl; j++) { 60 | parr[i * n_cl + j] = pinput[i]; /* same fill pattern as in alloc_array: element i goes to slots i * n_cl + j */ 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /src/jaccard.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using length_type = std::vector::size_type; 4 | 5 | static length_type len_intersect(const std::vector& x, const std::vector& y) { /* counts the elements common to x and y by advancing one index into each vector; assumes both are sorted ascending -- TODO confirm against caller */ 6 | length_type i = 0; 7 | length_type j = 0; 8 | length_type result = 0; 9 | 10 | while (i < x.size() && j < y.size()) { 11 | if (x[i] < y[j]) { 12 | i++; 13 | } else if (y[j] < x[i]) { 14 | j++; 15 | } else { 16 | result++; /* x[i] and y[j] are equal: count the match and advance both indices */ 17 | 18 | i++; 19 | j++; 20 | } 21 | } 22 | 23 | return result; 24 | } 25 | 26 | //' Perform the computations for thresholded Jaccard distance 27 | //' 28 | //' @details 29 | //' This function is optimized for sparse matrices and computes the pairwise 30 | //' Jaccard distances between the rows of the input matrix. Note that the 31 | //' actual distance is not saved. Instead, a threshold (`eps`) is supplied 32 | //' and an indicator matrix is returned, with a one indicating that the 33 | //' distance is smaller than `eps` (equivalently, the Jaccard similarity 34 | //' is larger than `1 - eps`). 35 | //' 36 | //' @param gs a list of integer vectors, one for each row, giving the column 37 | //' indices of the non-zero elements of the row or `NULL` if the 38 | //' whole row is empty.
39 | //' @param eps an upper bound on the Jaccard distance (`1 - eps` becomes a 40 | //' lower bound on the Jaccard similarity) 41 | //' 42 | //' @return A list with row and column indices in the #row x #row indicator 43 | //' matrix specifying which rows in the original matrix had a distance 44 | //' of less than `eps`. 45 | //' 46 | //' @keywords internal 47 | // [[Rcpp::export]] 48 | Rcpp::List jaccard_indicator_comp(Rcpp::List gs, double eps) { 49 | const auto n = static_cast(gs.length()); 50 | 51 | if (eps > 1.0 || eps < 0.0) { /* NOTE(review): a NaN eps makes both comparisons false and so passes this check -- confirm whether that matters */ 52 | Rcpp::stop("0 <= eps <= 1 needs to hold"); 53 | } 54 | 55 | std::vector> varr; 56 | varr.reserve(n); 57 | std::transform(gs.begin(), gs.end(), std::back_inserter(varr), 58 | Rcpp::as>); /* converts every list element up front so the pair loop below works on plain vectors */ 59 | 60 | const auto eps_ = 1.0 - eps; /* similarity threshold corresponding to the distance threshold eps */ 61 | 62 | std::vector ipairs; 63 | std::vector jpairs; 64 | 65 | for (length_type i = 1; i < n; i++) { 66 | for (length_type j = 0; j < i; j++) { /* only j < i is visited, so each unordered pair of rows is tested once and no row is paired with itself */ 67 | const auto len_inter = len_intersect(varr[i], varr[j]); 68 | const auto len_union = varr[i].size() + varr[j].size() - len_inter; 69 | 70 | if (static_cast(len_union) * eps_ < 71 | static_cast(len_inter)) { /* same test as len_inter / len_union > 1 - eps, written without a division; two empty rows give 0 < 0 and are not reported */ 72 | ipairs.emplace_back(i + 1); /* i and j are 0-based loop indices; the stored indices are shifted to 1-based */ 73 | jpairs.emplace_back(j + 1); 74 | } 75 | } 76 | } 77 | 78 | Rcpp::List out; 79 | out["i"] = ipairs; 80 | out["j"] = jpairs; 81 | 82 | return out; 83 | } 84 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. Its canonical location is at 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually.
8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/actions/setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/actions/checkout@v1 55 | - uses: r-hub/actions/platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/actions/setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/actions/run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - 
uses: r-hub/actions/checkout@v1 80 | - uses: r-hub/actions/setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/actions/platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/actions/setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/actions/run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /src/allocation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Assumes that 5 | // - update_order is a permutation of 0:(length(k_) - 1) 6 | // - indices in prior_indicators elements are within 0:(length(k_) - 1) 7 | // [[Rcpp::export]] 8 | Rcpp::IntegerVector allocate_into_modules(SEXP resid_array, 9 | Eigen::Map resid_var, 10 | Rcpp::List prior_indicator, 11 | Rcpp::IntegerVector k_, 12 | Rcpp::IntegerVector update_order, 13 | double prior_baseline, double prior_weight) { 14 | double* arr = REAL(resid_array); 15 | int* dims = INTEGER(PROTECT(Rf_getAttrib(resid_array, R_DimSymbol))); 16 | const auto n_modules = static_cast(dims[0]); 17 | const auto n_obs = static_cast(dims[1]); 18 | // const auto n_genes = static_cast(dims[2]); 19 | UNPROTECT(1); 20 | 21 | const auto n_total = n_modules * n_obs; 22 | 23 | Rcpp::IntegerVector k(k_ - 1); 24 | 25 | Eigen::ArrayXd module_totals = Eigen::ArrayXd::Zero(n_modules); 26 | for (const auto& idx : k) { 27 | if (idx != -2) { 28 | module_totals[idx] += 1; 29 | } 30 | } 31 | 32 | // Iterate over genes in given order 33 | for (const auto& j : update_order) { 34 | // Load residuals for current gene 35 | const Eigen::Map resid(arr + j * n_total, n_modules, n_obs); 36 | 37 | // Compute fraction of 
genes that gene j interacts with in each module, 38 | // according to prior information 39 | const Rcpp::IntegerVector prior_indices = 40 | Rcpp::as(prior_indicator[j]); 41 | Eigen::ArrayXd prior_frac = Eigen::ArrayXd::Zero(n_modules); 42 | for (const auto& idx : prior_indices) { 43 | if (k[idx] != -2) { 44 | prior_frac[k[idx]] += 1; 45 | } 46 | } 47 | 48 | for (Eigen::Index idx = 0; idx < n_modules; idx++) { 49 | if (module_totals[idx] > 0) { 50 | prior_frac[idx] /= module_totals[idx]; 51 | } 52 | } 53 | // Add baseline to avoid numerical problems if a module is empty 54 | prior_frac += prior_baseline; 55 | // Convert to probabilities 56 | Eigen::ArrayXd prior_log_prob = prior_frac.log() - log(prior_frac.sum()); 57 | 58 | // Compute model likelihood from residuals 59 | Eigen::MatrixXd model_log_likelihood = 60 | ((-resid).array().colwise() / (2.0 * resid_var.row(j).transpose().array())) 61 | .colwise() - 62 | 0.5 * (2.0 * M_PI * resid_var.row(j).transpose()).array().log(); 63 | 64 | // // Normalise to convert to probabilities 65 | // Eigen::RowVectorXd max_model_ll = model_log_likelihood.colwise().maxCoeff(); 66 | // Eigen::MatrixXd model_ll_minus_max = 67 | // model_log_likelihood.rowwise() - max_model_ll; 68 | // Eigen::MatrixXd model_log_prob = 69 | // model_ll_minus_max.rowwise() - 70 | // model_ll_minus_max.array().exp().colwise().sum().log().matrix(); 71 | 72 | // Compute total scores by weighting the model likelihood 73 | // and the prior probabilities. 
74 | const Eigen::MatrixXd total_model_log_scores = 75 | // (((1.0 - prior_weight) * model_log_prob.array()).colwise() + 76 | (((1.0 - prior_weight) * model_log_likelihood.array()).colwise() + 77 | (prior_weight * prior_log_prob)); 78 | 79 | // Compute votes for each of the n_modules modules 80 | Eigen::ArrayXi votes = Eigen::ArrayXi::Zero(n_modules); 81 | for (Eigen::Index i = 0; i < n_obs; i++) { 82 | int max_idx = -1; 83 | total_model_log_scores.col(i).maxCoeff(&max_idx); 84 | votes[max_idx] += 1; 85 | } 86 | 87 | // Move gene j to the new best module 88 | int best_cl = -1; 89 | votes.maxCoeff(&best_cl); 90 | if (k[j] != -2) { 91 | module_totals[k[j]] -= 1; 92 | } 93 | module_totals[best_cl] += 1; 94 | k[j] = best_cl; 95 | 96 | Rcpp::checkUserInterrupt(); 97 | } 98 | 99 | return k + 1; 100 | } 101 | -------------------------------------------------------------------------------- /vignettes/articles/mouse.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Manually assigning regulators" 3 | --- 4 | 5 | ```{r, include = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "#>" 9 | ) 10 | ``` 11 | 12 | The purpose of this vignette is to show how to manually configure the 13 | `is_regulator` vector, e.g. when you want to run *scregclust* on a custom set of 14 | regulators (not TFs or kinases), or if your data is from an organism other than 15 | human, e.g. mouse. This vignette will show how to do this using a data set from 16 | the mouse brain ([GSE60361](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60361)), and a 17 | [list of mouse TFs provided by Aertslab](https://resources.aertslab.org/cistarget/tf_lists/). 18 | 19 | We use [Seurat](https://satijalab.org/seurat/) for pre-processing of the data. 
20 | 21 | ```{r load-packages, results='hide', message=FALSE} 22 | # Load required packages 23 | library(GEOquery) 24 | library(Seurat) 25 | library(scregclust) 26 | ``` 27 | 28 | Read in the data and preprocess it in *Seurat*. Here, we simply use the 29 | full dataset. In practice, you would perform additional quality checks and, 30 | e.g., investigate PCA, UMAP, or t-SNE plots of the data. We use the 31 | package *GEOquery* to download metadata for the data. 32 | 33 | ```{r load-data, results='hide', message=FALSE, warning=FALSE} 34 | # Download the gene expression data 35 | url <- paste0( 36 | "https://www.ncbi.nlm.nih.gov/geo/download/", 37 | "?acc=GSE60361&format=file&", 38 | "file=GSE60361%5FC1%2D3005%2DExpression%2Etxt%2Egz" 39 | ) 40 | expr_path <- file.path(tempdir(), "Expression.txt.gz") 41 | download.file(url, expr_path, cacheOK = FALSE, mode = "wb") 42 | 43 | # Load the gene expression data 44 | expr <- read.table( 45 | expr_path, 46 | header = TRUE, 47 | sep = "\t", 48 | stringsAsFactors = FALSE, 49 | fill = TRUE 50 | ) 51 | 52 | # A few gene symbols appear as duplicates, make unique. 53 | gene_symbols <- make.unique(expr[, 1], sep = "-") 54 | expr <- expr[, -1] 55 | rownames(expr) <- gene_symbols 56 | 57 | # Download meta data 58 | gse <- getGEO("GSE60361") 59 | meta_data <- pData(phenoData(gse[[1]])) 60 | # Sample names are stored in the meta data's row names 61 | sample_names <- rownames(meta_data) 62 | colnames(expr) <- sample_names 63 | 64 | # Create Seurat object and preprocess the data using SCTransform 65 | mouse <- CreateSeuratObject( 66 | counts = expr, 67 | min.cells = 3, 68 | min.features = 500, 69 | meta.data = meta_data 70 | ) 71 | mouse <- SCTransform(mouse, verbose = TRUE) 72 | ``` 73 | 74 | The built-in regulator lists in *scregclust* cover human 75 | transcription factors (TFs) and kinases. Download and read in a list of 76 | mouse-specific TFs. 
77 | 78 | ```{r load-tfs, results='hide', message=FALSE} 79 | url <- "https://resources.aertslab.org/cistarget/tf_lists/allTFs_mm.txt" 80 | tfs_path <- file.path(tempdir(), "allTFs_mm.txt") 81 | download.file(url, tfs_path, cacheOK = FALSE, mode = "w") 82 | tfs <- read.table( 83 | tfs_path, 84 | header = FALSE, 85 | sep = "\t", 86 | stringsAsFactors = FALSE 87 | ) 88 | tfs <- tfs[, 1] 89 | ``` 90 | 91 | Extract `gene x cells` table 92 | 93 | ```{r extract-gene-cells-table} 94 | z <- GetAssayData(mouse, layer = "scale.data") 95 | dim(z) 96 | ``` 97 | 98 | Make sure data is in the format for *scregclust* 99 | 100 | ```{r scregclust-format} 101 | out <- scregclust_format(z, mode = "TF") 102 | 103 | genesymbols <- out$genesymbols 104 | sample_assignment <- out$sample_assignment 105 | ``` 106 | 107 | Manually create the indicator vector `is_regulator` 108 | 109 | ```{r manual-is-regulator} 110 | is_regulator <- rep(0, length = length(genesymbols)) 111 | is_regulator[which(genesymbols %in% tfs)] <- 1 112 | ``` 113 | 114 | Finally, run scregclust to estimate the model. The run can be reproduced with 115 | the command below. A pre-fitted model can be downloaded from 116 | [GitHub](https://github.com/scmethods/scregclust/raw/main/datasets/mouse_scregclust.rds) 117 | for convenience. 
118 | 119 | ```{r run-scregclust} 120 | # # Run scregclust 121 | # set.seed(8374) 122 | # fit <- scregclust( 123 | # z, genesymbols, is_regulator, penalization = seq(0.1, 0.5, 0.05), 124 | # n_modules = 10L, n_cycles = 50L, noise_threshold = 0.05 125 | # ) 126 | # saveRDS(fit, file = "datasets/mouse_scregclust.rds") 127 | 128 | url <- paste0( 129 | "https://github.com/scmethods/scregclust/raw/main/datasets/", 130 | "mouse_scregclust.rds" 131 | ) 132 | fit_path <- file.path(tempdir(), "mouse_scregclust.rds") 133 | download.file(url, fit_path) 134 | fit <- readRDS(fit_path) 135 | ``` 136 | 137 | Visualize the fit 138 | 139 | ```{r viz-fit, fig.width=7, fig.height=4, fig.dpi=100} 140 | #| fig.alt: > 141 | #| Boxplots of predictive R^2 per module (bottom) and 142 | #| regulator importance (top) over the penalization parameters 143 | #| specified during model estimation. A decreasing trend can 144 | #| be seen in R^2 per module until about 0.35 with a drop from 0.4. 145 | #| In addition, a slow and steady increase in regulator importance 146 | #| is followed by an increase from around 0.4 penalization. 147 | plot(fit) 148 | ``` 149 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | allocate_into_modules <- function(resid_array, resid_var, prior_indicator, k_, update_order, prior_baseline, prior_weight) { 5 | .Call(`_scregclust_allocate_into_modules`, resid_array, resid_var, prior_indicator, k_, update_order, prior_baseline, prior_weight) 6 | } 7 | 8 | #' Perform the computations for thresholded Jaccard distance 9 | #' 10 | #' @details 11 | #' This function is optimized for sparse matrices and computes the pairwise 12 | #' Jaccard distances between the rows of the input matrix. 
Note that the 13 | #' actual distance is not saved. Instead, a threshold (`eps`) is supplied 14 | #' and an indicator matrix is returned, with a one indicating that the 15 | #' distance is smaller than `eps` (equivalently, the Jaccard similarity 16 | #' is larger than `1 - eps`). 17 | #' 18 | #' @param gs a list of integer vectors, one for each row, giving the column 19 | #' indices of the non-zero elements of the row or `NULL` if the 20 | #' whole row is empty. 21 | #' @param eps an upper bound on the Jaccard distance (`1 - eps` becomes a 22 | #' lower bound on the Jaccard similarity) 23 | #' 24 | #' @return A list with row and column indices in the #row x #row indicator 25 | #' matrix specifying which rows in the original matrix had a distance 26 | #' of at most `eps`. 27 | #' 28 | #' @keywords internal 29 | jaccard_indicator_comp <- function(gs, eps) { 30 | .Call(`_scregclust_jaccard_indicator_comp`, gs, eps) 31 | } 32 | 33 | #' ADMM algorithm for solving the group-penalized least squares problem 34 | #' 35 | #' Implements estimation of the coop-lasso problem. 36 | #' 37 | #' @param y Target (n x m) 38 | #' @param x Design matrix (n x p) 39 | #' @param lambda Penalization parameter 40 | #' @param weights A specific weight for each group (typically this is 41 | #' `sqrt(group size)`). 42 | #' @param beta_0 Initial value for coefficients, allowing for warm start. 43 | #' Can be set to NULL, which results in the initial `beta` 44 | #' being a zero matrix. 
45 | #' @param rho_0 Initial ADMM step-size 46 | #' @param alpha_0 Initial ADMM relaxation parameter 47 | #' @param n_update Number of steps in-between updates of the 48 | #' step-size/adaptation parameters 49 | #' @param eps_corr Lower bound for the correlation in the step-size 50 | #' update steps 51 | #' @param max_iter Maximum number of iterations 52 | #' @param eps_rel Relative tolerance for convergence check 53 | #' @param eps_abs Absolute tolerance for convergence check 54 | #' @param verbose Whether or not information about the optimization process 55 | #' should be printed to the terminal 56 | #' 57 | #' @return A list containing 58 | #' \item{beta}{The coefficients at convergence} 59 | #' \item{iterations}{Number of iterations} 60 | #' 61 | #' @references 62 | #' Xu et al. (2017) Adaptive relaxed ADMM: Convergence theory and 63 | #' practical implementation. DOI 10.1109/CVPR.2017.765 64 | #' 65 | #' @keywords internal 66 | coop_lasso <- function(y, x, lambda, weights, beta_0 = NULL, rho_0 = 0.2, alpha_0 = 1.5, n_update = 2L, eps_corr = 0.2, max_iter = 1000L, eps_rel = 1e-8, eps_abs = 1e-12, verbose = FALSE) { 67 | .Call(`_scregclust_coop_lasso`, y, x, lambda, weights, beta_0, rho_0, alpha_0, n_update, eps_corr, max_iter, eps_rel, eps_abs, verbose) 68 | } 69 | 70 | #' Compute NNLS coefficients 71 | #' 72 | #' Computes non-negative least squares coefficients with a matrix 73 | #' right hand side. 74 | #' 75 | #' @param x Coefficient matrix (p x n matrix) 76 | #' @param y Right hand side (p x m matrix) 77 | #' @param eps Convergence tolerance 78 | #' @param max_iter Maximum number of iterations 79 | #' 80 | #' @return A list containing 81 | #' \item{beta}{The estimated coefficient matrix} 82 | #' \item{iterations}{A vector containing the number of iterations needed 83 | #' for the `i`-th column in `y` in the `i`-th entry.} 84 | #' 85 | #' @references 86 | #' Duy Khuong Nguyen and Tu Bao Ho. 
Accelerated anti-lopsided algorithm 87 | #' for nonnegative least squares. International Journal of Data Science 88 | #' and Analytics, 3(1):23–34, 2017. 89 | #' 90 | #' Adapted from 91 | #' 92 | #' @keywords internal 93 | coef_nnls <- function(x, y, eps = 1e-12, max_iter = 1000L) { 94 | .Call(`_scregclust_coef_nnls`, x, y, eps, max_iter) 95 | } 96 | 97 | #' Allocate 3d-array and fill with matrix along first dimension 98 | #' 99 | #' @param input the matrix of size `n_obs x n_genes` 100 | #' @param n_cl the size of the three-dimensional array's first dimension 101 | #' 102 | #' @return The allocated and filled array of size `n_cl x n_obs x n_genes` 103 | #' 104 | #' @keywords internal 105 | alloc_array <- function(input, n_cl) { 106 | .Call(`_scregclust_alloc_array`, input, n_cl) 107 | } 108 | 109 | #' Reset input 3d-array by filling matrix along first dimension 110 | #' 111 | #' @param arr The 3d-array of dimension `n_cl x n_obs x n_genes` 112 | #' @param input The matrix of size `n_obs x n_genes` 113 | #' 114 | #' @keywords internal 115 | reset_array <- function(arr, input) { 116 | invisible(.Call(`_scregclust_reset_array`, arr, input)) 117 | } 118 | 119 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // allocate_into_modules 15 | Rcpp::IntegerVector allocate_into_modules(SEXP resid_array, Eigen::Map resid_var, Rcpp::List prior_indicator, Rcpp::IntegerVector k_, Rcpp::IntegerVector update_order, double prior_baseline, double prior_weight); 16 | RcppExport SEXP 
_scregclust_allocate_into_modules(SEXP resid_arraySEXP, SEXP resid_varSEXP, SEXP prior_indicatorSEXP, SEXP k_SEXP, SEXP update_orderSEXP, SEXP prior_baselineSEXP, SEXP prior_weightSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< SEXP >::type resid_array(resid_arraySEXP); 21 | Rcpp::traits::input_parameter< Eigen::Map >::type resid_var(resid_varSEXP); 22 | Rcpp::traits::input_parameter< Rcpp::List >::type prior_indicator(prior_indicatorSEXP); 23 | Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type k_(k_SEXP); 24 | Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type update_order(update_orderSEXP); 25 | Rcpp::traits::input_parameter< double >::type prior_baseline(prior_baselineSEXP); 26 | Rcpp::traits::input_parameter< double >::type prior_weight(prior_weightSEXP); 27 | rcpp_result_gen = Rcpp::wrap(allocate_into_modules(resid_array, resid_var, prior_indicator, k_, update_order, prior_baseline, prior_weight)); 28 | return rcpp_result_gen; 29 | END_RCPP 30 | } 31 | // jaccard_indicator_comp 32 | Rcpp::List jaccard_indicator_comp(Rcpp::List gs, double eps); 33 | RcppExport SEXP _scregclust_jaccard_indicator_comp(SEXP gsSEXP, SEXP epsSEXP) { 34 | BEGIN_RCPP 35 | Rcpp::RObject rcpp_result_gen; 36 | Rcpp::RNGScope rcpp_rngScope_gen; 37 | Rcpp::traits::input_parameter< Rcpp::List >::type gs(gsSEXP); 38 | Rcpp::traits::input_parameter< double >::type eps(epsSEXP); 39 | rcpp_result_gen = Rcpp::wrap(jaccard_indicator_comp(gs, eps)); 40 | return rcpp_result_gen; 41 | END_RCPP 42 | } 43 | // coop_lasso 44 | Rcpp::List coop_lasso(Eigen::Map y, Eigen::Map x, double lambda, Eigen::Map weights, Rcpp::Nullable beta_0, double rho_0, double alpha_0, int n_update, double eps_corr, int max_iter, double eps_rel, double eps_abs, bool verbose); 45 | RcppExport SEXP _scregclust_coop_lasso(SEXP ySEXP, SEXP xSEXP, SEXP lambdaSEXP, SEXP weightsSEXP, SEXP beta_0SEXP, SEXP rho_0SEXP, SEXP 
alpha_0SEXP, SEXP n_updateSEXP, SEXP eps_corrSEXP, SEXP max_iterSEXP, SEXP eps_relSEXP, SEXP eps_absSEXP, SEXP verboseSEXP) { 46 | BEGIN_RCPP 47 | Rcpp::RObject rcpp_result_gen; 48 | Rcpp::RNGScope rcpp_rngScope_gen; 49 | Rcpp::traits::input_parameter< Eigen::Map >::type y(ySEXP); 50 | Rcpp::traits::input_parameter< Eigen::Map >::type x(xSEXP); 51 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 52 | Rcpp::traits::input_parameter< Eigen::Map >::type weights(weightsSEXP); 53 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type beta_0(beta_0SEXP); 54 | Rcpp::traits::input_parameter< double >::type rho_0(rho_0SEXP); 55 | Rcpp::traits::input_parameter< double >::type alpha_0(alpha_0SEXP); 56 | Rcpp::traits::input_parameter< int >::type n_update(n_updateSEXP); 57 | Rcpp::traits::input_parameter< double >::type eps_corr(eps_corrSEXP); 58 | Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP); 59 | Rcpp::traits::input_parameter< double >::type eps_rel(eps_relSEXP); 60 | Rcpp::traits::input_parameter< double >::type eps_abs(eps_absSEXP); 61 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 62 | rcpp_result_gen = Rcpp::wrap(coop_lasso(y, x, lambda, weights, beta_0, rho_0, alpha_0, n_update, eps_corr, max_iter, eps_rel, eps_abs, verbose)); 63 | return rcpp_result_gen; 64 | END_RCPP 65 | } 66 | // coef_nnls 67 | Rcpp::List coef_nnls(Eigen::Map x, Eigen::Map y, double eps, int max_iter); 68 | RcppExport SEXP _scregclust_coef_nnls(SEXP xSEXP, SEXP ySEXP, SEXP epsSEXP, SEXP max_iterSEXP) { 69 | BEGIN_RCPP 70 | Rcpp::RObject rcpp_result_gen; 71 | Rcpp::RNGScope rcpp_rngScope_gen; 72 | Rcpp::traits::input_parameter< Eigen::Map >::type x(xSEXP); 73 | Rcpp::traits::input_parameter< Eigen::Map >::type y(ySEXP); 74 | Rcpp::traits::input_parameter< double >::type eps(epsSEXP); 75 | Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP); 76 | rcpp_result_gen = Rcpp::wrap(coef_nnls(x, y, eps, max_iter)); 77 | return 
rcpp_result_gen; 78 | END_RCPP 79 | } 80 | // alloc_array 81 | SEXP alloc_array(SEXP input, R_xlen_t n_cl); 82 | RcppExport SEXP _scregclust_alloc_array(SEXP inputSEXP, SEXP n_clSEXP) { 83 | BEGIN_RCPP 84 | Rcpp::RObject rcpp_result_gen; 85 | Rcpp::RNGScope rcpp_rngScope_gen; 86 | Rcpp::traits::input_parameter< SEXP >::type input(inputSEXP); 87 | Rcpp::traits::input_parameter< R_xlen_t >::type n_cl(n_clSEXP); 88 | rcpp_result_gen = Rcpp::wrap(alloc_array(input, n_cl)); 89 | return rcpp_result_gen; 90 | END_RCPP 91 | } 92 | // reset_array 93 | void reset_array(SEXP arr, SEXP input); 94 | RcppExport SEXP _scregclust_reset_array(SEXP arrSEXP, SEXP inputSEXP) { 95 | BEGIN_RCPP 96 | Rcpp::RNGScope rcpp_rngScope_gen; 97 | Rcpp::traits::input_parameter< SEXP >::type arr(arrSEXP); 98 | Rcpp::traits::input_parameter< SEXP >::type input(inputSEXP); 99 | reset_array(arr, input); 100 | return R_NilValue; 101 | END_RCPP 102 | } 103 | 104 | static const R_CallMethodDef CallEntries[] = { 105 | {"_scregclust_allocate_into_modules", (DL_FUNC) &_scregclust_allocate_into_modules, 7}, 106 | {"_scregclust_jaccard_indicator_comp", (DL_FUNC) &_scregclust_jaccard_indicator_comp, 2}, 107 | {"_scregclust_coop_lasso", (DL_FUNC) &_scregclust_coop_lasso, 13}, 108 | {"_scregclust_coef_nnls", (DL_FUNC) &_scregclust_coef_nnls, 4}, 109 | {"_scregclust_alloc_array", (DL_FUNC) &_scregclust_alloc_array, 2}, 110 | {"_scregclust_reset_array", (DL_FUNC) &_scregclust_reset_array, 2}, 111 | {NULL, NULL, 0} 112 | }; 113 | 114 | RcppExport void R_init_scregclust(DllInfo *dll) { 115 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 116 | R_useDynamicSymbols(dll, FALSE); 117 | } 118 | -------------------------------------------------------------------------------- /vignettes/articles/pbmc.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Demonstration of workflow" 3 | --- 4 | 5 | ```{r, include = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = 
TRUE, 8 | comment = "#>" 9 | ) 10 | rlang::local_options(lifecycle_verbosity = "quiet") 11 | ``` 12 | 13 | The methods below are described in our article 14 | 15 | > Larsson, Held, et al. (2024) Reconstructing the regulatory programs 16 | > underlying the phenotypic plasticity of neural cancers. 17 | > Nature Communications 15, 9699 18 | > DOI [10.1038/s41467-024-53954-3](https://doi.org/10.1038/s41467-024-53954-3) 19 | 20 | Here we demonstrate the scregclust workflow using the PBMC data from 21 | 10X Genomics (available [here](https://www.10xgenomics.com/resources/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0)). 22 | This is the same data used in an [introductory vignette](https://satijalab.org/seurat/articles/pbmc3k_tutorial) 23 | for the Seurat package. We use [Seurat](https://satijalab.org/seurat/) for 24 | pre-processing of the data. 25 | 26 | ```{r load-packages, results='hide', message=FALSE} 27 | # Load required packages 28 | library(Seurat) 29 | library(scregclust) 30 | ``` 31 | 32 | # Download the data 33 | 34 | We are focusing here on the filtered feature barcode matrix available as an 35 | HDF5 file from the website linked above. The data can be downloaded manually 36 | or using R. 37 | 38 | However you obtain the data, the code below assumes that the HDF5 file 39 | containing it is placed in the same folder as this script with the name 40 | `pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5`. 
41 | 42 | ```{r download-data} 43 | url <- paste0( 44 | "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/", 45 | "pbmc_granulocyte_sorted_3k/", 46 | "pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5" 47 | ) 48 | data_path <- file.path( 49 | tempdir(), "pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5" 50 | ) 51 | 52 | download.file(url, data_path, cacheOK = FALSE, mode = "wb") 53 | ``` 54 | 55 | # Load the data in Seurat and preprocess 56 | 57 | To perform preprocessing use Seurat to load the data. The file ships with 58 | two modalities, "Gene Expression" and "Peaks". We only use the former. 59 | 60 | ```{r load-h5} 61 | pbmc_data <- Read10X_h5( 62 | data_path, 63 | use.names = TRUE, 64 | unique.features = TRUE 65 | )[["Gene Expression"]] 66 | ``` 67 | 68 | We create a Seurat object and follow the Seurat vignette to subset the 69 | cells and features (genes). 70 | 71 | ```{r create-seurat-object} 72 | pbmc <- CreateSeuratObject( 73 | counts = pbmc_data, min.cells = 3, min.features = 200 74 | ) 75 | 76 | pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT.") 77 | pbmc <- subset(pbmc, subset = percent.mt < 30 & nFeature_RNA < 6000) 78 | ``` 79 | 80 | [SCTransform](https://satijalab.org/seurat/articles/sctransform_vignette) is 81 | used for variance stabilization of the data and Pearson residuals for the 82 | 6000 most variable genes are extracted as matrix `z`. 83 | 84 | ```{r apply-var-stabilization} 85 | pbmc <- SCTransform(pbmc, variable.features.n = 6000) 86 | 87 | z <- GetAssayData(pbmc, layer = "scale.data") 88 | dim(z) 89 | ``` 90 | 91 | # Use scregclust for clustering target genes into modules 92 | 93 | We then use `scregclust_format` which extracts gene symbols from the 94 | expression matrix and determines which genes are considered regulators. 95 | By default, transcription factors are used as regulators. Setting `mode` 96 | to `"kinase"` uses kinases instead of transcription factors. 
A list of the 97 | regulators used internally is returned by `get_regulator_list()`. 98 | 99 | ```{r prep-scregclust} 100 | out <- scregclust_format(z, mode = "TF") 101 | ``` 102 | 103 | The output of `scregclust_format` is a list with three elements. 104 | 105 | 1. `genesymbols` contains the rownames of `z` 106 | 2. `sample_assignment` is initialized to be a vector of `1`s of length `ncol(z)` 107 | and can be filled with a known sample grouping. Here, we do not use it and 108 | just keep it uniform across all cells. 109 | 3. `is_regulator` is an indicator vector (elements are 0 or 1) corresponding to 110 | the entries of `genesymbols` with 1 marking that the genesymbol is selected 111 | as a regulator according to the `mode` of `scregclust_format` (`"TF"` or 112 | `"kinase"`) and 0 otherwise. 113 | 114 | ```{r extract-scregclust-arguments} 115 | genesymbols <- out$genesymbols 116 | sample_assignment <- out$sample_assignment 117 | is_regulator <- out$is_regulator 118 | ``` 119 | 120 | Run `scregclust` with the number of initial modules set to 10 and test 121 | several penalties. The penalties provided to `penalization` are used during 122 | selection of regulators associated with each module. An increasing penalty 123 | implies the selection of fewer regulators. 124 | `noise_threshold` controls the minimum $R^2$ a gene has to achieve across 125 | modules. Otherwise the gene is marked as noise. 126 | The run can be reproduced with the command below. A pre-fitted model can be 127 | downloaded from [GitHub](https://github.com/scmethods/scregclust/raw/main/datasets/pbmc_scregclust.rds) 128 | for convenience. 
129 | 130 | ```{r run-scregclust} 131 | # set.seed(8374) 132 | # fit <- scregclust( 133 | # z, genesymbols, is_regulator, penalization = seq(0.1, 0.5, 0.05), 134 | # n_modules = 10L, n_cycles = 50L, noise_threshold = 0.05 135 | # ) 136 | # saveRDS(fit, file = "datasets/pbmc_scregclust.rds") 137 | 138 | url <- paste0( 139 | "https://github.com/scmethods/scregclust/raw/main/datasets/", 140 | "pbmc_scregclust.rds" 141 | ) 142 | fit_path <- file.path(tempdir(), "pbmc_scregclust.rds") 143 | download.file(url, fit_path) 144 | fit <- readRDS(fit_path) 145 | ``` 146 | 147 | # Analysis of results 148 | 149 | Results can be visualized easily using built-in functions. 150 | Metrics for helping in choosing an optimal penalty can be plotted by calling 151 | `plot` on the object returned from `scregclust`. 152 | 153 | ```{r viz-metrics, fig.width=7, fig.height=4, fig.dpi=100} 154 | #| fig.alt: > 155 | #| Boxplots of predictive R^2 per module (bottom) and 156 | #| regulator importance (top) over the penalization parameters 157 | #| specified during model estimation. A decreasing trend can 158 | #| be seen in R^2 per module and a slow and steady increase in 159 | #| regulator importance is followed by an explosive increase from 160 | #| around 0.4 penalization. 161 | plot(fit) 162 | ``` 163 | 164 | The results for each penalization parameter are placed in a list, `results`, 165 | attached to the `fit` object. So `fit$results[[1]]` contains the results 166 | of running `scregclust` with `penalization = 0.1`. For each penalization 167 | parameter, the algorithm might end up finding multiple optimal configurations. 168 | Each configuration describes target genes module assignments and which 169 | regulators are associated with which modules. 170 | The results for each such configuration are contained in the list `output`. 171 | This means that `fit$results[[1]]$output[[1]]` contains the results for 172 | the first final configuration. More than one may be available. 
173 | 174 | ```{r n-configs} 175 | sapply(fit$results, function(r) length(r$output)) 176 | ``` 177 | 178 | In this example, at most two final configurations were found for each 179 | penalization parameter. 180 | 181 | To plot the regulator network of the first configuration for 182 | `penalization = 0.1`, the function `plot_regulator_network` can be used. 183 | 184 | ```{r viz-reg-network, fig.width=7, fig.height=7, fig.dpi=100} 185 | #| fig.alt: > 186 | #| Network visualization of modules (colorful circles) and their top 187 | #| regulators (grey rectangles). Arrows indicate regulation and their 188 | #| thickness represents regulation strength. Red arrows indicate positive 189 | #| regulation and blue arrows indicate negative regulation. 190 | plot_regulator_network(fit$results[[1]]$output[[1]]) 191 | ``` 192 | -------------------------------------------------------------------------------- /man/scregclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scregclust.R 3 | \name{scregclust} 4 | \alias{scregclust} 5 | \title{Uncover gene modules and their regulatory programs from single-cell data} 6 | \usage{ 7 | scregclust( 8 | expression, 9 | genesymbols, 10 | is_regulator, 11 | penalization, 12 | n_modules, 13 | initial_target_modules = NULL, 14 | sample_assignment = NULL, 15 | center = TRUE, 16 | split1_proportion = 0.5, 17 | total_proportion = 1, 18 | split_indices = NULL, 19 | prior_indicator = NULL, 20 | prior_genesymbols = NULL, 21 | prior_baseline = 1e-06, 22 | prior_weight = 0.5, 23 | min_module_size = 0L, 24 | allocate_per_obs = TRUE, 25 | noise_threshold = 0.025, 26 | n_cycles = 50L, 27 | use_kmeanspp_init = TRUE, 28 | n_initializations = 50L, 29 | max_optim_iter = 10000L, 30 | tol_coop_rel = 1e-08, 31 | tol_coop_abs = 1e-12, 32 | tol_nnls = 1e-04, 33 | compute_predictive_r2 = TRUE, 34 | compute_silhouette = FALSE, 35 | nowarnings = 
FALSE, 36 | verbose = TRUE, 37 | quick_mode = FALSE, 38 | quick_mode_percent = 0.1 39 | ) 40 | } 41 | \arguments{ 42 | \item{expression}{\verb{p x n} matrix of pre-processed single cell expression 43 | data with \code{p} rows of genes and \code{n} columns of cells.} 44 | 45 | \item{genesymbols}{A vector of gene names corresponding to rows of 46 | \code{expression}. Has to be of length \code{p}.} 47 | 48 | \item{is_regulator}{An indicator vector where \code{1} indicates that the 49 | corresponding row in \code{expression} is a candidate 50 | regulator. All other rows represent target genes. 51 | Has to be of length \code{p}.} 52 | 53 | \item{penalization}{Sparsity penalty related to the amount of regulators 54 | associated with each module. Either a single positive 55 | number or a vector of positive numbers.} 56 | 57 | \item{n_modules}{Requested number of modules (integer). 58 | If this is provided without specifying \code{initial_target_modules}, 59 | then an initial module allocation is performed on the 60 | cross-correlation matrix of targets and genes on the first 61 | dataset after data splitting.} 62 | 63 | \item{initial_target_modules}{The initial assignment of target genes to 64 | modules of length \code{sum(is_regulator == 0L)}. 65 | If this is not specified, then see \code{n_modules} regarding 66 | module initialization. If provided, \code{use_kmeanspp_init} 67 | and \code{n_initializations} are ignored.} 68 | 69 | \item{sample_assignment}{A vector of sample assignment for each cell, can 70 | be used to perform the data splitting with 71 | stratification. Has to be of length \code{n}. 72 | No stratification if \code{NULL} is supplied.} 73 | 74 | \item{center}{Whether or not genes should be centered within each subgroup 75 | defined in \code{sample_assignment}.} 76 | 77 | \item{split1_proportion}{The proportion to use for the first dataset during 78 | data splitting. The proportion for the second 79 | dataset is \code{1 - split1_proportion}. 
If stratification 80 | with \code{sample_assignment} is used, then the proportion 81 | of each stratum is controlled.} 82 | 83 | \item{total_proportion}{Can be used to only use a proportion of the supplied 84 | observations. The proportion of the first dataset 85 | during data splitting in relation to the full 86 | dataset will be 87 | \code{total_proportion * split1_proportion}.} 88 | 89 | \item{split_indices}{Can be used to provide an explicit data split. If this 90 | is supplied then \code{split1_proportion} and 91 | \code{total_proportion} are ignored. 92 | Note that if \code{sample_assignment} is provided and 93 | \code{center == TRUE}, then subgroup centering will be 94 | performed as in the case of random splitting. 95 | A vector of length \code{n} containing entries 1 for cells 96 | in the first data split, 2 for cells in the second 97 | data split and \code{NA} for cells that should be excluded 98 | from the computations.} 99 | 100 | \item{prior_indicator}{An indicator matrix (sparse or dense) of size \verb{q x q} 101 | that indicates whether there is a known functional 102 | relationship between two genes. Ideally, this is 103 | supplied as a sparse matrix (\code{sparseMatrix} 104 | in the \code{Matrix} package). If not, then the matrix 105 | is converted to one.} 106 | 107 | \item{prior_genesymbols}{A vector of gene names of length q corresponding 108 | to the rows/columns in \code{prior_indicator}. Does not 109 | have to be the same as \code{genesymbols}, but only 110 | useful if there is overlap.} 111 | 112 | \item{prior_baseline}{A positive baseline for the network prior. The larger 113 | this parameter is, the less impact the network prior 114 | will have.} 115 | 116 | \item{prior_weight}{A number between 0 and 1 indicating the strength of the 117 | prior in relation to the data. 0 ignores the prior and 118 | makes the algorithm completely data-driven. 
1 uses only 119 | the prior during module allocation.} 120 | 121 | \item{min_module_size}{Minimum required size of target genes in a module. 122 | Smaller modules are emptied.} 123 | 124 | \item{allocate_per_obs}{Whether module allocation should be performed for 125 | each observation in the second data split separately. 126 | If \code{FALSE}, target genes are allocated into modules 127 | on the aggregate sum of squares across all 128 | observations in the second data split.} 129 | 130 | \item{noise_threshold}{Threshold for the best \eqn{R^2} of a target gene 131 | before it gets identified as noise.} 132 | 133 | \item{n_cycles}{Number of maximum algorithmic cycles.} 134 | 135 | \item{use_kmeanspp_init}{Use kmeans++ for module initialization if 136 | \code{initial_target_modules} is a single integer; 137 | otherwise use kmeans with random initial cluster 138 | centers} 139 | 140 | \item{n_initializations}{Number of kmeans(++) initialization runs.} 141 | 142 | \item{max_optim_iter}{Maximum number of iterations during optimization 143 | in the coop-Lasso and NNLS steps.} 144 | 145 | \item{tol_coop_rel}{Relative convergence tolerance during optimization 146 | in the coop-Lasso step.} 147 | 148 | \item{tol_coop_abs}{Absolute convergence tolerance during optimization 149 | in the coop-Lasso step.} 150 | 151 | \item{tol_nnls}{Convergence tolerance during optimization in the NNLS step.} 152 | 153 | \item{compute_predictive_r2}{Whether to compute predictive \eqn{R^2} per 154 | module as well as regulator importance.} 155 | 156 | \item{compute_silhouette}{Whether to compute silhouette scores for each 157 | target gene.} 158 | 159 | \item{nowarnings}{When turned on then no warning messages are shown.} 160 | 161 | \item{verbose}{Whether to print progress.} 162 | 163 | \item{quick_mode}{Whether to use a reduced number of noise targets to speed 164 | up computations.} 165 | 166 | \item{quick_mode_percent}{A number in [0, 1) indicating the amount of 167 | noise targets to use in 
the re-allocation process 168 | if \code{quick_mode = TRUE}.} 169 | } 170 | \value{ 171 | A list with S3 class \code{scregclust} containing 172 | \item{penalization}{The supplied \code{penalization} parameters} 173 | \item{results}{A list of result lists (each with S3 class 174 | \code{scregclust_result}), one for each supplied \code{penalization} 175 | parameter. See below.} 176 | \item{initial_target_modules}{Initial allocation of target genes into 177 | modules.} 178 | \item{split_indices}{either verbatim the vector given as input or 179 | a vector encoding the splits as NA = not included, 180 | 1 = split 1 or 2 = split 2. Allows reproducibility 181 | of data splits.} 182 | 183 | For each supplied penalization parameter, \code{results} contains a list with 184 | \itemize{ 185 | \item the current \code{penalization} parameter, 186 | \item the supplied \code{genesymbols} after filtering (as used during fitting), 187 | \item the supplied \code{is_regulator} vector after filtering (as used during 188 | fitting), 189 | \item the number of fitted modules \code{n_modules}, 190 | \item whether the current run \code{converged} to a single configuration (as a 191 | boolean), 192 | \item as well as an \code{output} object containing the numeric results for each 193 | final configuration. 194 | } 195 | 196 | It is possible that the algorithm ends in a finite cycle of configurations 197 | instead of a unique final configuration. 198 | Therefore, \code{output} is a list with each element itself being a list 199 | with the following contents: 200 | \describe{ 201 | \item{\code{reg_table}}{a regulator table, a matrix of weights for each 202 | regulator and module} 203 | \item{\code{module}}{vector of same length as \code{genesymbols} containing the 204 | module assignments for all genes with regulators 205 | marked as \code{NA}. 
Genes considered noise are marked as \code{-1}.} 206 | \item{\code{module_all}}{same as \code{module}, however, genes that were marked as 207 | noise (-1 in \code{module}) are assigned to the 208 | module in which it has the largest \eqn{R^2}, 209 | even if it is below \code{noise_threshold}.} 210 | \item{\code{r2}}{matrix of predictive \eqn{R^2} value for each target gene and 211 | module} 212 | \item{\code{best_r2}}{vector of best predictive \eqn{R^2} for each gene 213 | (regulators marked with NA)} 214 | \item{\code{best_r2_idx}}{module index corresponding to best predictive 215 | \eqn{R^2} for each gene (regulators marked with NA)} 216 | \item{\code{r2_module}}{a vector of predictive \eqn{R^2} values for each 217 | module (included if \code{compute_predictive_r2 == TRUE})} 218 | \item{\code{importance}}{a matrix of importance values for each regulator (rows) 219 | and module (columns) (included if 220 | \code{compute_predictive_r2 == TRUE})} 221 | \item{\code{r2_cross_module_per_target}}{a matrix of cross module \eqn{R^2} 222 | values for each target gene (rows) 223 | and each module (columns) (included 224 | if \code{compute_silhouette == TRUE})} 225 | \item{\code{silhouette}}{a vector of silhouette scores for each target gene 226 | (included if \code{compute_silhouette == TRUE})} 227 | \item{\code{models}}{regulator selection for each module as a matrix with 228 | regulators in rows and modules in columns} 229 | \item{\code{signs}}{regulator signs for each module as a matrix with 230 | regulators in rows and modules in columns} 231 | \item{\code{weights}}{average regulator coefficient for each module} 232 | \item{\code{coeffs}}{list of regulator coefficient matrices for each module 233 | for all target genes as re-estimated in the NNLS step} 234 | \item{\code{sigmas}}{matrix of residual variances, one per target gene 235 | in each module; derived from the residuals in NNLS step} 236 | } 237 | } 238 | \description{ 239 | Use the scRegClust algorithm to 
determine gene modules and their 240 | regulatory programs from single-cell data. 241 | } 242 | \concept{main} 243 | -------------------------------------------------------------------------------- /src/optim.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using Arr1d = Eigen::ArrayXd; 12 | using Arr2d = Eigen::ArrayXXd; 13 | using Matd = Eigen::MatrixXd; 14 | using Vecd = Eigen::VectorXd; 15 | using Veci = Eigen::VectorXi; 16 | 17 | static Matd compute_xtx(const Matd& x) { 18 | const auto p = x.cols(); 19 | 20 | Matd xtx = Eigen::MatrixXd::Zero(p, p); 21 | if (p > 0) { 22 | xtx.selfadjointView().rankUpdate(x.transpose()); 23 | xtx.triangularView() = xtx.transpose(); 24 | } 25 | 26 | return xtx; 27 | } 28 | 29 | //' ADMM algorithm for solving the group-penalized least squares problem 30 | //' 31 | //' Implements estimation of the coop-lasso problem. 32 | //' 33 | //' @param y Target (n x m) 34 | //' @param x Design matrix (n x p) 35 | //' @param lambda Penalization parameter 36 | //' @param weights A specific weight for each group (typically this is 37 | //' `sqrt(group size)`). 38 | //' @param beta_0 Initial value for coefficients, allowing for warm start. 39 | //' Can be set to NULL, which results in the initial `beta` 40 | //' being a zero matrix. 
//' @param rho_0 Initial ADMM step-size (must be strictly positive)
//' @param alpha_0 Initial ADMM relaxation parameter
//' @param n_update Number of steps in-between updates of the
//'                 step-size/adaptation parameters
//' @param eps_corr Lower bound for the correlation in the step-size
//'                 update steps
//' @param max_iter Maximum number of iterations
//' @param eps_rel Relative tolerance for convergence check
//' @param eps_abs Absolute tolerance for convergence check
//' @param verbose Whether or not information about the optimization process
//'                should be printed to the terminal
//'
//' @return A list containing
//' \item{beta}{The coefficients at convergence (or the last iterate if
//'             `max_iter` was reached)}
//' \item{iterations}{Number of iterations}
//'
//' @references
//' Xu et al. (2017) Adaptive relaxed ADMM: Convergence theory and
//' practical implementation. DOI 10.1109/CVPR.2017.765
//'
//' @keywords internal
// [[Rcpp::export]]
Rcpp::List coop_lasso(
    Eigen::Map y, Eigen::Map x, double lambda,
    Eigen::Map weights,
    Rcpp::Nullable beta_0 = R_NilValue,  // Initialization
    double rho_0 = 0.2, double alpha_0 = 1.5, int n_update = 2,
    double eps_corr = 0.2,  // Step-size
    int max_iter = 1000, double eps_rel = 1e-8, double eps_abs = 1e-12,  // Convergence
    bool verbose = false) {
    // Record sizes
    const auto n = y.rows();
    const auto m = y.cols();
    const auto p = x.cols();

    // Reject empty inputs up front; everything below assumes at least one
    // row and column in both matrices.
    if (n <= 0 || m <= 0 || p <= 0 || x.rows() <= 0) {
        Rcpp::stop("COOP LASSO: Matrix dimensions of y and x need to be positive.");
    }

    if (x.rows() != n) {
        Rcpp::stop("y and x need to have the same number of rows.");
    }

    // Start from zero unless a warm start of matching size (p x m) is given
    Matd beta = Eigen::MatrixXd::Zero(p, m);
    if (beta_0.isUsable()) {
        if (static_cast(beta_0.as().nrow()) != p ||
            static_cast(beta_0.as().ncol()) != m) {
            Rcpp::stop("beta_0 needs to be of size p x m");
        }

        beta =
            Eigen::Map(beta_0.as().begin(), beta_0.as().nrow(), beta_0.as().ncol());
    }

    // Pre-compute some quantities to speed up computation
    // Precompute X^T X
    const Matd xtx = compute_xtx(x);
    // Precompute X^T Y
    const Matd xty = x.transpose() * y;

    // Ensure we are starting with a feasible point (i.e. zeta == beta)
    Matd zeta = beta;
    // For adaptive ADMM it is necessary to use unscaled multipliers
    // to get the computations right
    Matd mult = Eigen::MatrixXd::Zero(p, m);

    // We need to save one old iterate for step-size/relaxation
    // parameter estimation
    Matd beta_old = beta;
    Matd zeta_old = zeta;
    Matd mult_old = mult;
    Matd mult_hat_old = mult;

    // Initialize step-size and relaxation parameter
    if (rho_0 <= 0.0) {
        Rcpp::stop("rho_0 > 0 needs to hold");
    }
    auto rho = rho_0;
    double rho_old = 0.0;  // Ensure rho_old != rho in first iteration
    // NOTE(review): this condition accepts alpha_0 == 2.0 although the
    // message below states a strict upper bound - confirm which is intended.
    if (alpha_0 <= 0.0 || alpha_0 > 2.0) {
        Rcpp::stop("0 < alpha_0 < 2 needs to hold");
    }
    auto alpha = alpha_0;

    // Iteration counter
    int it = 1;
    // Parameter update counter
    int pc = 0;

    // Holds X^T X + rho * I; only its diagonal changes when rho changes
    Matd xtx_rho_eye = xtx;
    Eigen::LDLT ldlt;

    // ADMM algorithm
    while (true) {
        // Precompute if necessary
        // NOTE(review): rho_old is only assigned in the parameter-update
        // branch further down, so once rho differs from rho_old (including
        // the initial rho_old = 0.0) this test stays true on the following
        // iterations as well and the factorization is recomputed each time.
        if (pc == 0 || rho != rho_old) {
            // Only diagonal needs to be updated
            xtx_rho_eye.diagonal() = xtx.diagonal().array() + rho;
            ldlt.compute(xtx_rho_eye);
        }

        // Step 1: Update beta using pre-computed quantities
        // Solves (X^T X + rho I) beta = X^T Y + rho * zeta + mult
        beta = ldlt.solve(xty + rho * zeta + mult);

        // Relaxation step
        const Matd beta_relaxed = alpha * beta + (1.0 - alpha) * zeta;

        // Step 2: Update zeta
        Matd zeta_new = (beta_relaxed - mult / rho);

        // One shrinkage factor per row (group) and sign: the factor is
        // computed from the norm of the row's positive part and of its
        // negative part separately, then clipped at zero.
        // NOTE(review): a row part with zero norm gives a division by zero;
        // the result is clipped to 0 when lambda * weights(i) > 0, but is
        // 0/0 when that product is 0 - confirm callers never pass that.
        const Arr1d shrink_pos =
            (1.0 -
             lambda * weights / (rho * (zeta_new.cwiseMax(0.0).rowwise().norm()).array()))
                .max(0.0);
        const Arr1d shrink_neg =
            (1.0 -
             lambda * weights / (rho * (zeta_new.cwiseMin(0.0).rowwise().norm()).array()))
                .max(0.0);

        // Apply the factor matching the sign of each entry
        for (Eigen::Index j = 0; j < m; j++) {
            for (Eigen::Index i = 0; i < p; i++) {
                if (zeta_new(i, j) >= 0.0) {
                    zeta_new(i, j) *= shrink_pos(i);
                } else {
                    zeta_new(i, j) *= shrink_neg(i);
                }
            }
        }

        // Step 3: Update multipliers
        mult += rho * (-beta_relaxed + zeta_new);

        // Convergence check
        // Compute primal and dual residuals
        const Matd primal_resid = -beta + zeta_new;
        const Matd dual_resid = rho * (zeta - zeta_new);

        if (verbose) {
            // Single status line, rewritten in place via the leading "\r"
            std::stringstream ss;
            ss << "\r#" << std::setw(5) << it << std::scientific << std::setprecision(4)
               << " rho " << std::setw(5) << rho << " alpha " << std::setw(5) << alpha
               << " prim_res " << std::setw(5) << primal_resid.norm() << " bnd "
               << std::setw(5)
               << fmax(eps_rel * fmax(beta.norm(), zeta_new.norm()), eps_abs)
               << " dual_res " << std::setw(5) << dual_resid.norm() << " bnd "
               << std::setw(5) << fmax(eps_rel * mult.norm(), eps_abs);

            Rcpp::Rcout << ss.str();
        }

        // Check residual convergence
        // Both residual norms must fall below max(relative bound, eps_abs)
        if ((primal_resid.norm() <=
             fmax(eps_rel * fmax(beta.norm(), zeta_new.norm()), eps_abs)) &&
            (dual_resid.norm() <= fmax(eps_rel * mult.norm(), eps_abs))) {
            break;
        }

        // Step-size/relaxation parameter update
        pc++;
        if (pc == n_update) {
            // The hatted multipliers use non-relaxed beta and the zetas from
            // the previous iteration
            const Matd mult_hat = mult + rho * (-beta + zeta);

            // Differences against the iterates stored at the last update
            const Matd delta_mult_hat = mult_hat - mult_hat_old;
            const Matd delta_h_hat = beta - beta_old;

            const Matd delta_mult = mult - mult_old;
            const Matd delta_g_hat = zeta_old - zeta;

            const auto norm_delta_mult_hat = delta_mult_hat.norm();
            const auto norm_delta_h_hat = delta_h_hat.norm();
            const auto norm_delta_mult = delta_mult.norm();
            const auto norm_delta_g_hat = delta_g_hat.norm();

            // a / a_corr stay 0 when either difference vanishes, which makes
            // the correlation tests below fail for this term
            double a = 0.0;
            double a_corr = 0.0;

            if (norm_delta_mult_hat > 0.0 && norm_delta_h_hat > 0.0) {
                // Estimate local slope for h
                const auto delta_h_hat_delta_mult_hat =
                    (delta_h_hat.array() * delta_mult_hat.array()).sum();
                const auto a_sd =
                    delta_mult_hat.squaredNorm() / delta_h_hat_delta_mult_hat;
                const auto a_mg = delta_h_hat_delta_mult_hat / delta_h_hat.squaredNorm();

                // Combine the two slope estimates
                if (2.0 * a_mg > a_sd) {
                    a = a_mg;
                } else {
                    a = a_sd - a_mg / 2.0;
                }

                // Cosine between the two difference matrices
                a_corr =
                    delta_h_hat_delta_mult_hat / (norm_delta_h_hat * norm_delta_mult_hat);
            }

            double b = 0.0;
            double b_corr = 0.0;

            if (norm_delta_mult > 0.0 && norm_delta_g_hat > 0.0) {
                // Estimate local slope for g
                const auto delta_g_hat_delta_mult =
                    (delta_g_hat.array() * delta_mult.array()).sum();
                const auto b_sd = delta_mult.squaredNorm() / delta_g_hat_delta_mult;
                const auto b_mg = delta_g_hat_delta_mult / delta_g_hat.squaredNorm();

                if (2.0 * b_mg > b_sd) {
                    b = b_mg;
                } else {
                    b = b_sd - b_mg / 2.0;
                }

                b_corr = delta_g_hat_delta_mult / (norm_delta_g_hat * norm_delta_mult);
            }

            // Store old rho to check whether it changed and we need to
            // update pre-computed quantities
            rho_old = rho;

            // Update step-size if appropriate
            // An estimate is only trusted when its correlation exceeds eps_corr
            if (a_corr > eps_corr && b_corr > eps_corr) {
                rho = sqrt(a * b);
            } else if (a_corr > eps_corr && b_corr <= eps_corr) {
                rho = a;
            } else if (a_corr <= eps_corr && b_corr > eps_corr) {
                rho = b;
            }
            // Else: Leave rho as is

            // Update relaxation parameter if appropriate
            if (a_corr > eps_corr && b_corr > eps_corr) {
                alpha = 1.0 + 2.0 / (sqrt(a * b) * (1.0 / a + 1.0 / b));
            } else if (a_corr > eps_corr && b_corr <= eps_corr) {
                alpha = 1.9;
            } else if (a_corr <= eps_corr && b_corr > eps_corr) {
                alpha = 1.1;
            } else {
                alpha = 1.5;
            }

            // House-keeping
            beta_old = beta;
            zeta_old = zeta_new;
            mult_old = mult;
            mult_hat_old = mult_hat;

            // Reset counter
            pc = 0;
        }

        // House-keeping
        zeta = zeta_new;

        // Check iteration limit
        // NOTE(review): `it` is incremented before the test, so when the
        // limit is hit the reported `iterations` is max_iter + 1.
        it++;
        if (it > max_iter) {
            if (verbose) {
                Rcpp::Rcout << std::endl;
            }
            Rcpp::Rcout << "Coop-Lasso: Maximum number of iterations reached";
            if (!verbose) {
                Rcpp::Rcout << std::endl;
            }
            break;
        }

        Rcpp::checkUserInterrupt();
    }

    if (verbose) {
        Rcpp::Rcout << std::endl;
    }

    // Copy the result into an R matrix and return it with the counter
    Rcpp::List out;
    Rcpp::NumericMatrix beta_(beta.rows(), beta.cols(), beta.data());
    out["beta"] = beta_;
    out["iterations"] = it;

    return out;
}

// static void remove_kkt_elements(const Matd& beta, const Matd& grad, Matd& grad_bar) {
//     const auto n = beta.rows();
//     const auto m = beta.cols();

//     for (Eigen::Index j = 0; j < m; j++) {
//         for (Eigen::Index i = 0; i < n; i++) {
//             if ((beta(i, j) == 0.0) && (grad(i, j) > 0.0)) {
//                 grad_bar(i, j) = 0.0;
//             }
//         }
//     }
// }

// Greedy coordinate descent, applied independently to every column of
// `beta`. `beta` and `grad` are updated in place; `Q` is read only.
//
// In each sweep, for each column the coordinate with the largest absolute
// gradient among entries with beta > 0 or grad < 0 is selected and updated
// by a coordinate step clipped at zero. At most `beta.rows()` sweeps are
// made; the loop stops early once no column has a candidate coordinate left.
static void greedy_coord_descent(const Matd& Q, Matd& beta, Matd& grad) {
    const auto n = beta.rows();
    const auto m = beta.cols();

    for (Eigen::Index t = 0; t < n; t++) {
        // Number of columns without any candidate coordinate in this sweep
        Eigen::Index empty_passive_sets = 0;

        for (Eigen::Index j = 0; j < m; j++) {
            // Determine maximum absolute gradient over passive set
            // Entries outside the set are masked to 0 before taking the max
            Eigen::Index p = -1;
            auto max_val = (((beta.col(j).array() > 0.0) || (grad.col(j).array() < 0.0))
                                .cast() *
                            grad.col(j).array().abs())
                               .maxCoeff(&p);

            // Eigen::Index p = -1;
            // double max_val = 0.0;
            // for (Eigen::Index i = 0; i < n; i++) {
            //     if ((beta(i, j) > 0.0) || (grad(i, j) < 0.0)) {
            //         auto abs_grad = fabs(grad(i, j));
            //         if (abs_grad > max_val) {
            //             max_val = abs_grad;
            //             p = i;
            //         }
            //     }
            // }

            // Perform coordinate descent on the selected coefficient
            if (max_val == 0.0) {
                empty_passive_sets++;
                continue;
            }

            // Step along coordinate p, clipped so that beta(p, j) stays >= 0;
            // the gradient column is then updated with the same step
            const auto dbeta = fmax(0.0, beta(p, j) - grad(p, j) / Q(p, p)) - beta(p, j);
            beta(p, j) += dbeta;
            grad.col(j) += dbeta * Q.col(p);
        }

        if (empty_passive_sets == m) {
            break;
        }
    }
}

//' Compute NNLS coefficients
//'
//' Computes non-negative least squares coefficients with a matrix
//' right hand side.
//'
//' @param x Coefficient matrix (p x n matrix)
//' @param y Right hand side (p x m matrix)
//' @param eps Convergence tolerance
//' @param max_iter Maximum number of iterations
//'
//' @return A list containing
//' \item{beta}{The estimated coefficient matrix}
//' \item{iterations}{A vector containing the number of iterations needed
//'                   for the `i`-th column in `y` in the `i`-th entry.}
//'
//' @references
//' Duy Khuong Nguyen and Tu Bao Ho. Accelerated anti-lopsided algorithm
//' for nonnegative least squares. International Journal of Data Science
//' and Analytics, 3(1):23–34, 2017.
//'
//' Adapted from
//'
//' @keywords internal
// [[Rcpp::export]]
Rcpp::List coef_nnls(Eigen::Map x, Eigen::Map y,
                     double eps = 1e-12, int max_iter = 1000L) {
    const auto n = x.cols();
    auto m = y.cols();  // Will be reduced whenever right-hand sides reach convergence

    if (n <= 0 || m <= 0 || x.rows() <= 0 || y.rows() <= 0) {
        Rcpp::stop("NNLS: Matrix dimensions of y and x need to be positive.");
    }

    // Pre-compute some quantities to speed up computation
    // Precompute X^T X
    const Matd xtx = compute_xtx(x);

    // Scaling by 1 / sqrt(diag(X^T X)); it is undone on the coefficients
    // just before returning.
    // NOTE(review): a zero diagonal entry yields a non-finite scale -
    // presumably callers filter such columns; verify.
    const Vecd inv_sqrt_diag_xtx = 1.0 / xtx.diagonal().array().sqrt();

    // Q is X^T X scaled on both sides by the vector above
    const Matd Q =
        xtx.array() * (inv_sqrt_diag_xtx * inv_sqrt_diag_xtx.transpose()).array();
    // Multiply -x^T y row-wise by the elements in inv_sqrt_diag_xtx
    Matd grad = (-x.transpose() * y).array().colwise() * inv_sqrt_diag_xtx.array();

    // beta_final collects finished columns in their original positions;
    // beta/grad/grad_bar only hold the columns still being iterated.
    Matd beta_final = Eigen::MatrixXd::Zero(n, m);
    Matd beta = Eigen::MatrixXd::Zero(n, m);
    Matd grad_bar = grad;
    // remove_kkt_elements(beta, grad, grad_bar);
    // Zero out gradient entries where beta == 0 and grad > 0
    grad_bar.array() *=
        (1 - ((beta.array() == 0.0) && (grad.array() > 0.0))).cast();

    // Save necessary number of iterations
    // Pre-filled with max_iter; overwritten for columns that converge
    std::vector iterations(static_cast::size_type>(m));
    std::fill(iterations.begin(), iterations.end(), max_iter);

    // Original column index of each column still present in beta/grad
    std::list remaining_obs(static_cast::size_type>(m));
    std::iota(remaining_obs.begin(), remaining_obs.end(), 0);

    for (int l = 0; l < max_iter; l++) {
        const Matd beta_save = beta;
        const Matd grad_save = grad;

        // Exact line search algorithm over passive variables
        const Matd Q_grad_bar = Q * grad_bar;
        const Arr1d alpha1 = (grad_bar.colwise().squaredNorm()).array() /
                             (grad_bar.array() * Q_grad_bar.array()).colwise().sum();

        for (Eigen::Index j = 0; j < m; j++) {
            const auto a = alpha1(j);
            // Skip NaN (a != a) and step sizes outside [1e-20, 1e30)
            if ((a == a) && fabs(a) >= 1e-20 && fabs(a) < 1e30) {
                beta.col(j) -= a * grad_bar.col(j);
                grad.col(j) -= a * Q_grad_bar.col(j);
                for (Eigen::Index i = 0; i < n; i++) {
                    if (beta(i, j) < 0.0) {
                        // Correct for negative elements
                        grad.col(j) -= beta(i, j) * Q.col(i);
                        beta(i, j) = 0.0;  // Remove them from updated iterate
                    }
                }
            }
        }

        // Greedy coordinate descent algorithm (First time)
        greedy_coord_descent(Q, beta, grad);

        // Accelerated search
        // Search along the change made since the start of this iteration
        const Matd dbeta = beta_save - beta;
        const Matd Q_dbeta = Q * dbeta;
        // NOTE(review): the numerator is the column sum of *squared*
        // element-wise products of grad and dbeta, whereas an exact line
        // search along dbeta (as done for alpha1 above) would use the plain
        // inner product - confirm against the referenced algorithm.
        const Arr1d alpha2 = (grad.array() * dbeta.array()).square().colwise().sum() /
                             (dbeta.array() * Q_dbeta.array()).colwise().sum();

        for (Eigen::Index j = 0; j < m; j++) {
            const auto a = alpha2(j);
            if ((a == a) && fabs(a) >= 1e-20 && fabs(a) < 1e30) {
                beta.col(j) -= a * dbeta.col(j);
                grad.col(j) -= a * Q_dbeta.col(j);
                for (Eigen::Index i = 0; i < n; i++) {
                    if (beta(i, j) < 0) {
                        // Correct for negative elements
                        grad.col(j) -= beta(i, j) * Q.col(i);
                        beta(i, j) = 0.0;  // Remove them from updated iterate
                    }
                }
            }
        }

        // Greedy coordinate descent algorithm (Second time)
        greedy_coord_descent(Q, beta, grad);

        // Compute error
        grad_bar = grad;
        // remove_kkt_elements(beta, grad, grad_bar);
        grad_bar.array() *=
            (1 - ((beta.array() == 0.0) && (grad.array() > 0.0))).cast();

        // Check for which rhs convergence has been achieved
        const Arr1d grad_bar_norms = grad_bar.colwise().norm();
        std::vector kept_cols;
        kept_cols.reserve(remaining_obs.size());

        // Walk the working columns and the list of original indices in
        // lock-step; converged columns are copied out and dropped from the list
        auto it = remaining_obs.begin();
        for (Eigen::Index i = 0; i < m; i++) {
            if (grad_bar_norms(i) < eps) {
                beta_final.col(*it) = beta.col(i);
                iterations[static_cast::size_type>(*it)] = l + 1;
                it = remaining_obs.erase(it);
            } else {
                kept_cols.push_back(i);
                it++;
            }
        }

        // Reduce problem to those rhs where convergence has not yet occurred
        m = static_cast(kept_cols.size());
        if (m > static_cast(0)) {
            // Use that columns in kept_cols are sorted by construction
            // (so compacting to the left never overwrites a column that is
            // still to be moved)
            Eigen::Index j = 0;
            for (auto& i : kept_cols) {
                if (j != i) {
                    beta.col(j) = beta.col(i);
                    grad.col(j) = grad.col(i);
                    grad_bar.col(j) = grad_bar.col(i);
                }
                j++;
            }

            beta.conservativeResize(Eigen::NoChange, m);
            grad.conservativeResize(Eigen::NoChange, m);
            grad_bar.conservativeResize(Eigen::NoChange, m);
        } else {
            break;
        }

        if (l == max_iter - 1) {
            Rcpp::Rcout << "NNLS: Maximum number of iterations reached" << std::endl;
        }

        Rcpp::checkUserInterrupt();
    }

    // NOTE(review): beta_final is only written in the convergence branch
    // above, so columns that have not converged when the loop ends keep
    // their initial zeros rather than their last iterate - confirm this is
    // intended.

    // Re-scale to original scale
    beta_final.array().colwise() *= inv_sqrt_diag_xtx.array();

    Rcpp::List out;
    Rcpp::NumericMatrix beta_final_(beta_final.rows(), beta_final.cols(),
                                    beta_final.data());
    out["beta"] = beta_final_;
    out["iterations"] = iterations;

    return out;
}
#' Plotting the regulatory table from scregclust as a directed graph
#'
#' Draws regulators and modules as the vertices of a directed graph, with one
#' edge per non-zero entry of the regulator table. For every module only the
#' regulators ranked first and last by weight (as returned by `head()` and
#' `tail()`) are considered. Modules whose weights contain `NA` are ignored.
#'
#' @param output Object of type `scregclust_output` from a fit of the
#'               scregclust algorithm.
#' @param arrow_size Size of arrow head
#' @param edge_scaling Scaling factor for edge width
#' @param no_links Threshold value (0-10) for number of edges to show,
#'                 higher value = more stringent threshold = less edges.
#'                 Used as an index into the deciles of the absolute edge
#'                 weights; edges below the selected decile are removed.
#' @param col Vector of vertex colours, indexed by vertex type: regulators
#'            use `col[1]` and the `k`-th retained module uses `col[k]`.
#'
#' @return Called for its side effect of drawing the graph and a legend on
#'         the current graphics device; the value of the final `legend()`
#'         call is returned.
#'
#' @concept plotting
#'
#' @export
plot_regulator_network <- function(output,
                                   arrow_size = 0.3,
                                   edge_scaling = 30,
                                   no_links = 6,
                                   col = c(
                                     "gray80",
                                     "#FC7165",
                                     "#BD828C",
                                     "#9D8A9F",
                                     "#7D92B2",
                                     "#BDA88C",
                                     "#FCBD65",
                                     "#F2BB90",
                                     "#E7B9BA",
                                     "#BDB69C",
                                     "#92B27D",
                                     "#9B8BA5",
                                     "#9D7DB2",
                                     "#94A5BF"
                                   )) {
  # Keep only modules whose weights are free of NA. `drop = FALSE` keeps the
  # table two-dimensional even if a single module remains.
  reg_table <- output$reg_table
  reg_table <- reg_table[, !is.na(colSums(reg_table)), drop = FALSE]

  if (ncol(reg_table) == 0L) {
    cli::cli_abort(
      "{.var output} contains no module with complete regulator weights."
    )
  }

  # Per module: the regulators ranked first and last by weight
  top_regulators <- unique(unlist(lapply(
    seq_len(ncol(reg_table)),
    function(i) {
      ranked <- rownames(reg_table)[order(reg_table[, i], decreasing = TRUE)]
      c(head(ranked), tail(ranked))
    }
  )))
  reg_table <- reg_table[
    rownames(reg_table) %in% top_regulators, ,
    drop = FALSE
  ]

  # Long format: one row per (regulator, module) pair. Work on a copy so that
  # `reg_table` keeps its row names and shape for building the node list.
  wide <- as.data.frame(reg_table)
  wide$regulator <- rownames(reg_table)
  rownames(wide) <- NULL

  links <- reshape::melt(wide, id.vars = "regulator")
  colnames(links) <- c("from", "to", "weight")

  # Logical subsetting instead of `links[-which(...), ]`: the negative-index
  # form selects *no* rows when there is nothing to remove. Weights are
  # NA-free here because NA-containing modules were dropped above.
  links <- links[links$weight != 0, , drop = FALSE]

  is_repressing <- links$weight < 0
  links$mode <- ifelse(is_repressing, "Repress", "Activate")
  links$color <- ifelse(is_repressing, "#2B278C", "#BD111F")
  links$weight <- abs(links$weight)

  # Vertices: regulators first, then modules (in column order)
  n_regulators <- nrow(reg_table)
  n_modules <- ncol(reg_table)
  nodes <- data.frame(
    id = c(rownames(reg_table), colnames(reg_table)),
    type = rep(c("Regulator", "TargetState"), times = c(n_regulators, n_modules)),
    stringsAsFactors = FALSE
  )

  net <- igraph::graph_from_data_frame(
    d = links, vertices = nodes, directed = TRUE
  )

  # Shape and colour by vertex type.
  # NOTE(review): regulators and the first module both use `col[1]` -
  # confirm whether modules were meant to start at `col[2]`.
  is_regulator <- igraph::V(net)$type == "Regulator"
  colour_index <- integer(length(is_regulator))
  colour_index[is_regulator] <- 1L
  colour_index[!is_regulator] <- seq_len(n_modules)

  igraph::V(net)$shape <- ifelse(is_regulator, "vrectangle", "circle")
  igraph::V(net)$color <- col[colour_index]

  # Remove weak edges, then vertices left without any edge
  cut_off <- quantile(links$weight, probs = seq(0, 1, 0.1))[no_links]
  net <- igraph::delete_edges(net, igraph::E(net)[links$weight < cut_off])
  net <- igraph::delete_vertices(net, which(igraph::degree(net) == 0))

  igraph::E(net)$arrow.size <- arrow_size
  igraph::E(net)$width <- igraph::E(net)$weight * edge_scaling

  plot(
    net,
    layout = igraph::layout_with_fr(net),
    edge.curved = 0.3,
    vertex.label.cex = .6,
    vertex.label.color = "black",
    alpha = 0.5
  )
  legend(
    x = -1.1,
    y = -0.8,
    c("Activating", "Repressing"),
    pch = 21,
    col = "#777777",
    pt.bg = c("#BD111F", "#2B278C"),
    pt.cex = 2,
    cex = .8,
    bty = "n",
    ncol = 1
  )
}

#' @concept plotting
#'
#' @export
plot.scregclust <- function(x, ...) {
  # Builds two long tables - predictive R^2 per module and regulator
  # importance - across all penalization values and final configurations in
  # `x$results`, and shows them as boxplots per penalization value in two
  # stacked facets.
  r2_module_data <- do.call(rbind, lapply(x$results, function(r) {
    do.call(rbind, lapply(r$output, function(o) {
      # Modules without an R^2 value are left out
      idx <- !is.na(o$r2_module)

      data.frame(
        penalization = r$penalization,
        module = seq_along(o$r2_module)[idx],
        value = o$r2_module[idx]
      )
    }))
  }))
  r2_module_data$penalization <- factor(
    r2_module_data$penalization, levels = x$penalization
  )
  r2_module_data$variable <- "r2-per-module"

  importance_data <- do.call(rbind, lapply(x$results, function(r) {
    do.call(rbind, lapply(seq_along(r$output), function(j) {
      o <- r$output[[j]]
      do.call(rbind, lapply(seq_len(ncol(o$models)), function(i) {
        # Skip modules for which no regulator has an importance value
        idx <- !is.na(o$importance[, i])
        if (sum(idx) == 0) {
          return(NULL)
        }

        data.frame(
          penalization = r$penalization,
          module = i,
          value = o$importance[idx, i]
        )
      }))
    }))
  }))
  importance_data$penalization <- factor(
    importance_data$penalization, levels = x$penalization
  )
  importance_data$variable <- "importance"

  rbind(r2_module_data, importance_data) |>
    ggplot2::ggplot() +
    ggplot2::facet_wrap(
      variable ~ .,
      nrow = 2,
      scales = "free_y",
      strip.position = "left",
      labeller = ggplot2::label_bquote(
        .(
          if (variable == "importance") {
            "Regulator Importance"
          } else {
            "Predictive" ~ R^2 ~ "per module"
          }
        )
      ),
    ) +
    ggplot2::geom_boxplot(
      ggplot2::aes(x = .data$penalization, y = .data$value),
      outlier.size = 0.5,
      lwd = 0.25,
    ) +
    ggplot2::labs(x = "Penalization", y = NULL) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      axis.line = ggplot2::element_line(
        arrow = grid::arrow(length = grid::unit(1, "mm")),
      ),
      strip.background = ggplot2::element_blank(),
      strip.placement = "outside",
      line = ggplot2::element_line(linewidth = 0.25),
      plot.margin = ggplot2::margin(t = 2, unit = "mm"),
    )
}

#' Collect silhouette scores of several fits in one long data frame
#'
#' For every fit, penalization value and final configuration, the target
#' genes assigned to modules `1, ..., n_modules` are listed module by module,
#' sorted by increasing silhouette score within each module. Genes whose
#' module is not in that range (e.g. `-1`) do not appear.
#'
#' @param list_of_fits A list of `scregclust` objects.
#'
#' @return A data frame with columns `order` (running index within a
#'         configuration), `gene` (index among target genes), `silhouette`,
#'         `module` (factor), `n_modules`, `output` (index of the final
#'         configuration) and `penalization`.
#'
#' @noRd
collect_silhouette_data <- function(list_of_fits) {
  do.call(rbind, lapply(list_of_fits, function(fit) {
    do.call(rbind, lapply(seq_along(fit$results), function(i) {
      r <- fit$results[[i]]
      do.call(rbind, lapply(seq_along(r$output), function(j) {
        o <- r$output[[j]]
        # Module labels of the target genes only
        k <- o$module[!r$is_regulator]

        # Within-module ordering by increasing silhouette score
        order_list <- lapply(seq_len(r$n_modules), function(cl) {
          if (sum(k == cl) > 0) {
            order(o$silhouette[k == cl])
          } else {
            integer(0)
          }
        })
        # Target-gene indices, module by module, in that ordering
        gene <- do.call(c, lapply(seq_len(r$n_modules), function(cl) {
          seq_along(k)[k == cl][order_list[[cl]]]
        }))

        data.frame(
          order = seq_len(sum(k != -1)),
          gene = gene,
          silhouette = o$silhouette[gene],
          module = as.factor(k[gene]),
          n_modules = r$n_modules,
          output = j,
          penalization = r$penalization
        )
      }))
    }))
  }))
}

#' Plot individual silhouette scores
#'
#' @param list_of_fits A list of `scregclust` objects each fit to the same
#'                     dataset across a variety of module counts (varying
#'                     `n_modules` when running [`scregclust`]).
#' @param penalization Either a single numeric value requesting the results
#'                     for the same penalty parameter across all fits in
#'                     `list_of_fits`, or one for each individual fit.
#' @param final_config The final configuration that should be visualized.
#'                     Either a single number to be used for all fits in
#'                     `list_of_fits`, or one for each individual fit.
#'
#' @return A ggplot2 plot showing the silhouette scores for each
#'         supplied fit.
#'
#' @concept plotting
#'
#' @export
plot_silhouettes <- function(list_of_fits, penalization, final_config = 1L) {
  # Same input contract as plot_module_count_helper(): a list of fits.
  if (!(
    is.list(list_of_fits)
    && all(vapply(
      list_of_fits, function(f) inherits(f, "scregclust"), logical(1)
    ))
  )) {
    cli::cli_abort(c(
      "{.var list_of_fits} is not supplied correctly.",
      "x" = "It needs to be a list of {.class scregclust} objects."
    ))
  }

  # `penalization` is either one value present in every fit, or one value per
  # fit that is present in the corresponding fit.
  penalization_ok <- is.numeric(penalization) && (
    (
      length(penalization) == 1L
      && all(vapply(
        list_of_fits,
        function(fit) penalization %in% fit$penalization,
        logical(1)
      ))
    ) || (
      length(penalization) == length(list_of_fits)
      && all(vapply(
        seq_along(list_of_fits),
        function(i) penalization[i] %in% list_of_fits[[i]]$penalization,
        logical(1)
      ))
    )
  )
  if (!penalization_ok) {
    cli::cli_abort(c(
      "{.var penalization} is not supplied correctly.",
      "x" = "It needs to be one of the following two:",
      "*" = "A single penalization parameter used in all fits.",
      "*" = (
        "A list of penalization parameters, exactly one for each supplied fit."
      )
    ))
  }

  # `final_config` must be whole numbers, either one for all fits or one per
  # fit.
  # TODO: additionally verify that the requested configuration exists in
  # every individual fit (a request that matches nothing is caught below).
  final_config_ok <- (
    is.numeric(final_config)
    && !anyNA(final_config)
    && all(final_config %% 1 == 0)
    && length(final_config) %in% c(1L, length(list_of_fits))
  )
  if (!final_config_ok) {
    cli::cli_abort(c(
      "{.var final_config} is not supplied correctly.",
      "x" = "It needs to be one of the following two:",
      "*" = "A single final_config parameter used in all fits.",
      "*" = (
        "A list of final_config parameters, exactly one for each supplied fit."
      )
    ))
  }

  silhouette_missing <- unlist(lapply(list_of_fits, function(fit) {
    lapply(fit$results, function(res) {
      vapply(res$output, function(o) is.null(o$silhouette), logical(1))
    })
  }))
  if (any(silhouette_missing)) {
    cli::cli_abort(c(
      "Silhouette scores were not computed during fitting.",
      "i" = "Set `compute_silhouette = TRUE` in `scregclust`"
    ))
  }

  silhouette_data <- collect_silhouette_data(list_of_fits)
  module_counts <- vapply(
    list_of_fits, function(fit) fit$results[[1]]$n_modules, numeric(1)
  )

  silhouette_data$n_modules_lbl <- as.factor(
    sprintf("K = %d", silhouette_data$n_modules)
  )

  # Keep the rows whose `column` equals the requested value: one shared value,
  # or the i-th value for the fit with the i-th module count. The trailing
  # comma in `[rows, ]` matters; without it a data frame is indexed by column.
  select_rows <- function(data, column, wanted) {
    if (length(wanted) == 1L) {
      return(data[data[[column]] == wanted, ])
    }
    do.call(rbind, lapply(seq_along(module_counts), function(i) {
      per_fit <- data[data$n_modules == module_counts[i], ]
      per_fit[per_fit[[column]] == wanted[i], ]
    }))
  }
  silhouette_data <- select_rows(silhouette_data, "penalization", penalization)
  silhouette_data <- select_rows(silhouette_data, "output", final_config)

  if (nrow(silhouette_data) == 0L) {
    cli::cli_abort(c(
      "No silhouette scores are available for the requested selection.",
      "i" = "Check {.var penalization} and {.var final_config}."
    ))
  }

  # Mean position of every module within its panel; used to place the labels.
  module_centers <- do.call(rbind, lapply(module_counts, function(n_modules) {
    df <- silhouette_data[silhouette_data$n_modules == n_modules, ]
    contained_modules <- unique(df$module)
    if (length(contained_modules) == 0L) {
      return(NULL)
    }

    data.frame(
      n_modules = n_modules,
      module = contained_modules,
      order = vapply(
        seq_along(contained_modules),
        function(idx) mean(df$order[df$module == contained_modules[idx]]),
        numeric(1)
      )
    )
  }))
  module_centers$n_modules_lbl <- as.factor(
    sprintf("K = %d", module_centers$n_modules)
  )

  # Average silhouette score per module count (dashed reference line).
  avg_silhouette <- data.frame(
    n_modules = module_counts,
    silhouette = vapply(module_counts, function(n_modules) {
      mean(silhouette_data$silhouette[silhouette_data$n_modules == n_modules])
    }, numeric(1))
  )
  avg_silhouette$n_modules_lbl <- as.factor(
    sprintf("K = %d", avg_silhouette$n_modules)
  )

  silhouette_data |>
    ggplot2::ggplot() +
    ggplot2::facet_wrap(n_modules_lbl ~ .) +
    ggplot2::geom_bar(
      ggplot2::aes(x = .data$order, y = .data$silhouette, fill = .data$module),
      stat = "identity"
    ) +
    ggplot2::geom_text(
      ggplot2::aes(x = .data$order, y = -0.1, label = .data$module),
      data = module_centers
    ) +
    ggplot2::geom_hline(
      ggplot2::aes(yintercept = .data$silhouette),
      data = avg_silhouette,
      linetype = "dashed",
      color = "red",
      linewidth = 0.25
    ) +
    ggplot2::coord_flip() +
    ggplot2::scale_fill_discrete(guide = "none") +
    ggplot2::labs(x = "Module", y = "Silhouette score") +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      axis.text.y = ggplot2::element_blank()
    )
}

#' Plot average silhouette scores and average predictive \eqn{R^2}
#'
#' @param list_of_fits A list of `scregclust` objects each fit to the same
#'                     dataset across a variety of module counts (varying
#'                     `n_modules` while running [`scregclust`]).
#' @param penalization Either a single numeric value requesting the results
#'                     for the same penalty parameter across all fits in
#'                     `list_of_fits`, or one for each individual fit.
#'
#' @return A ggplot2 plot showing the average silhouette score and the
#'         average predictive \eqn{R^2}
#'
#' @concept plotting
#'
#' @export
plot_module_count_helper <- function(list_of_fits, penalization) {
  if (!(
    is.list(list_of_fits)
    && all(vapply(
      list_of_fits, function(f) inherits(f, "scregclust"), logical(1)
    ))
  )) {
    cli::cli_abort(c(
      "{.var list_of_fits} is not supplied correctly.",
      "x" = "It needs to be a list of {.class scregclust} objects."
    ))
  }

  # `penalization` is either one value present in every fit, or one value per
  # fit that is present in the corresponding fit.
  penalization_ok <- is.numeric(penalization) && (
    (
      length(penalization) == 1L
      && all(vapply(
        list_of_fits,
        function(fit) penalization %in% fit$penalization,
        logical(1)
      ))
    ) || (
      length(penalization) == length(list_of_fits)
      && all(vapply(
        seq_along(list_of_fits),
        function(i) penalization[i] %in% list_of_fits[[i]]$penalization,
        logical(1)
      ))
    )
  )
  if (!penalization_ok) {
    cli::cli_abort(c(
      "{.var penalization} is not supplied correctly.",
      "x" = "It needs to be one of the following two:",
      "*" = "A single penalization parameter used in all fits.",
      "*" = (
        "A list of penalization parameters, exactly one for each supplied fit."
      )
    ))
  }

  silhouette_missing <- unlist(lapply(list_of_fits, function(fit) {
    lapply(fit$results, function(res) {
      vapply(res$output, function(o) is.null(o$silhouette), logical(1))
    })
  }))
  if (any(silhouette_missing)) {
    cli::cli_abort(c(
      "Silhouette scores were not computed during fitting.",
      "i" = "Set `compute_silhouette = TRUE` in `scregclust`"
    ))
  }

  silhouette_data <- collect_silhouette_data(list_of_fits)

  avg_r2_module_data <- do.call(rbind, lapply(list_of_fits, function(fit) {
    do.call(rbind, lapply(seq_along(fit$results), function(i) {
      r <- fit$results[[i]]
      # Pool the per-module R^2 of all configurations before averaging
      r2_module <- unlist(
        lapply(r$output, function(o) o$r2_module), use.names = FALSE
      )

      # If a module is empty then r2_module is NA, so use NA remove
      value <- mean(r2_module, na.rm = TRUE)
      # If all modules turn out to be empty (e.g. too high penalization) then
      # mean(...) above will evaluate to NaN. Do not return a data.frame
      # in that case.
      if (is.nan(value)) {
        return(NULL)
      }

      data.frame(
        n_modules = r$n_modules,
        penalization = r$penalization,
        value = value,
        variable = "avg-r2-module"
      )
    }))
  }))

  module_counts <- vapply(
    list_of_fits, function(fit) fit$results[[1]]$n_modules, numeric(1)
  )

  # Keep the rows matching the requested penalization: one shared value, or
  # the i-th value for the fit with the i-th module count. `data` may be NULL
  # (no fit produced a usable average), in which case nothing is filtered.
  # The trailing comma in `[rows, ]` matters; without it a data frame is
  # indexed by column.
  keep_penalization <- function(data) {
    if (is.null(data)) {
      return(NULL)
    }
    if (length(penalization) == 1L) {
      return(data[data$penalization == penalization, ])
    }
    do.call(rbind, lapply(seq_along(module_counts), function(i) {
      per_fit <- data[data$n_modules == module_counts[i], ]
      per_fit[per_fit$penalization == penalization[i], ]
    }))
  }
  silhouette_data <- keep_penalization(silhouette_data)
  avg_r2_module_data <- keep_penalization(avg_r2_module_data)

  # Average across genes and across different configurations
  avg_silhouette <- vapply(seq_along(module_counts), function(i) {
    mean(
      silhouette_data$silhouette[silhouette_data$n_modules == module_counts[i]]
    )
  }, numeric(1))

  rbind(
    data.frame(
      n_modules = module_counts,
      penalization = penalization,
      value = avg_silhouette,
      variable = "avg-silhouette"
    ),
    avg_r2_module_data
  ) |>
    ggplot2::ggplot() +
    ggplot2::facet_wrap(
      variable ~ .,
      nrow = 2,
      scales = "free_y",
      strip.position = "left",
      labeller = ggplot2::label_bquote(
        .(
          if (variable == "avg-silhouette") {
            "Average silhouette score"
          } else {
            "Avg. pred." ~ R^2 ~ "per module"
          }
        )
      )
    ) +
    ggplot2::geom_line(
      ggplot2::aes(.data$n_modules, .data$value), linewidth = 0.25
    ) +
    ggplot2::geom_point(
      ggplot2::aes(.data$n_modules, .data$value), size = 0.5
    ) +
    ggplot2::labs(x = "# of modules (K)", y = NULL) +
    ggplot2::scale_x_continuous(breaks = module_counts) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      panel.grid = ggplot2::element_blank(),
      axis.line = ggplot2::element_line(
        arrow = grid::arrow(length = grid::unit(1, "mm"))
      ),
      strip.background = ggplot2::element_blank(),
      strip.placement = "outside",
      line = ggplot2::element_line(linewidth = 0.25),
      plot.margin = ggplot2::margin(t = 2, unit = "mm")
    )
}

-------------------------------------------------------------------------------- /datasets/humanTFs.txt: -------------------------------------------------------------------------------- 1 | AC008770.3 2 | AC023509.3 3 | AC092835.1 4 | AC138696.1 5 | ADNP 6 | ADNP2 7 | AEBP1 8 | AEBP2 9 | AHCTF1 10 | AHDC1 11 | AHR 12 | AHRR 13 | AIRE 14 | AKAP8 15 | AKAP8L 16 | AKNA 17 | ALX1 18 | ALX3 19 | ALX4 20 | ANHX 21 | ANKZF1 22 | AR 23 | ARGFX 24 | ARHGAP35 25 | ARID2 26 | ARID3A 27 | ARID3B 28 | ARID3C 29 | ARID5A 30 | ARID5B 31 | ARNT 32 | ARNT2 33 | ARNTL 34 | ARNTL2 35 | ARX 36 | ASCL1 37 | ASCL2 38 | ASCL3 39 | ASCL4 40 | ASCL5 41 | ASH1L 42 | ATF1 43 | ATF2 44 | ATF3 45 | ATF4 46 | ATF5 47 | ATF6 48 | ATF6B 49 | ATF7 50 | ATMIN 51 | ATOH1 52 | ATOH7 53 | ATOH8 54 | BACH1 55 | BACH2 56 | BARHL1 57 | BARHL2 58 | BARX1 59 | BARX2 60 | BATF 61 | BATF2 62 | BATF3 63 | BAZ2A 64 | BAZ2B 65 | BBX 66 | BCL11A 67 | BCL11B 68 | BCL6 69 | BCL6B 70 | BHLHA15 71 | BHLHA9 72 | BHLHE22 73 | BHLHE23 74 | BHLHE40 75 | BHLHE41 76 | BNC1 77 | BNC2 78 | BORCS8-MEF2B 79 | BPTF 80 | BRF2 81 | BSX 82 | C11orf95 83 | CAMTA1 84 | CAMTA2 85 | CARF 86 | CASZ1 87 | CBX2 88 | CC2D1A 89 | CCDC169-SOHLH2 90 |
CCDC17 91 | CDC5L 92 | CDX1 93 | CDX2 94 | CDX4 95 | CEBPA 96 | CEBPB 97 | CEBPD 98 | CEBPE 99 | CEBPG 100 | CEBPZ 101 | CENPA 102 | CENPB 103 | CENPBD1 104 | CENPS 105 | CENPT 106 | CENPX 107 | CGGBP1 108 | CHAMP1 109 | CHCHD3 110 | CIC 111 | CLOCK 112 | CPEB1 113 | CPXCR1 114 | CREB1 115 | CREB3 116 | CREB3L1 117 | CREB3L2 118 | CREB3L3 119 | CREB3L4 120 | CREB5 121 | CREBL2 122 | CREBZF 123 | CREM 124 | CRX 125 | CSRNP1 126 | CSRNP2 127 | CSRNP3 128 | CTCF 129 | CTCFL 130 | CUX1 131 | CUX2 132 | CXXC1 133 | CXXC4 134 | CXXC5 135 | DACH1 136 | DACH2 137 | DBP 138 | DBX1 139 | DBX2 140 | DDIT3 141 | DEAF1 142 | DLX1 143 | DLX2 144 | DLX3 145 | DLX4 146 | DLX5 147 | DLX6 148 | DMBX1 149 | DMRT1 150 | DMRT2 151 | DMRT3 152 | DMRTA1 153 | DMRTA2 154 | DMRTB1 155 | DMRTC2 156 | DMTF1 157 | DNMT1 158 | DNTTIP1 159 | DOT1L 160 | DPF1 161 | DPF3 162 | DPRX 163 | DR1 164 | DRAP1 165 | DRGX 166 | DUX1 167 | DUX3 168 | DUX4 169 | DUXA 170 | DZIP1 171 | E2F1 172 | E2F2 173 | E2F3 174 | E2F4 175 | E2F5 176 | E2F6 177 | E2F7 178 | E2F8 179 | E4F1 180 | EBF1 181 | EBF2 182 | EBF3 183 | EBF4 184 | EEA1 185 | EGR1 186 | EGR2 187 | EGR3 188 | EGR4 189 | EHF 190 | ELF1 191 | ELF2 192 | ELF3 193 | ELF4 194 | ELF5 195 | ELK1 196 | ELK3 197 | ELK4 198 | EMX1 199 | EMX2 200 | EN1 201 | EN2 202 | EOMES 203 | EPAS1 204 | ERF 205 | ERG 206 | ESR1 207 | ESR2 208 | ESRRA 209 | ESRRB 210 | ESRRG 211 | ESX1 212 | ETS1 213 | ETS2 214 | ETV1 215 | ETV2 216 | ETV3 217 | ETV3L 218 | ETV4 219 | ETV5 220 | ETV6 221 | ETV7 222 | EVX1 223 | EVX2 224 | FAM170A 225 | FAM200B 226 | FBXL19 227 | FERD3L 228 | FEV 229 | FEZF1 230 | FEZF2 231 | FIGLA 232 | FIZ1 233 | FLI1 234 | FLYWCH1 235 | FOS 236 | FOSB 237 | FOSL1 238 | FOSL2 239 | FOXA1 240 | FOXA2 241 | FOXA3 242 | FOXB1 243 | FOXB2 244 | FOXC1 245 | FOXC2 246 | FOXD1 247 | FOXD2 248 | FOXD3 249 | FOXD4 250 | FOXD4L1 251 | FOXD4L3 252 | FOXD4L4 253 | FOXD4L5 254 | FOXD4L6 255 | FOXE1 256 | FOXE3 257 | FOXF1 258 | FOXF2 259 | FOXG1 260 | FOXH1 261 | 
FOXI1 262 | FOXI2 263 | FOXI3 264 | FOXJ1 265 | FOXJ2 266 | FOXJ3 267 | FOXK1 268 | FOXK2 269 | FOXL1 270 | FOXL2 271 | FOXM1 272 | FOXN1 273 | FOXN2 274 | FOXN3 275 | FOXN4 276 | FOXO1 277 | FOXO3 278 | FOXO4 279 | FOXO6 280 | FOXP1 281 | FOXP2 282 | FOXP3 283 | FOXP4 284 | FOXQ1 285 | FOXR1 286 | FOXR2 287 | FOXS1 288 | GABPA 289 | GATA1 290 | GATA2 291 | GATA3 292 | GATA4 293 | GATA5 294 | GATA6 295 | GATAD2A 296 | GATAD2B 297 | GBX1 298 | GBX2 299 | GCM1 300 | GCM2 301 | GFI1 302 | GFI1B 303 | GLI1 304 | GLI2 305 | GLI3 306 | GLI4 307 | GLIS1 308 | GLIS2 309 | GLIS3 310 | GLMP 311 | GLYR1 312 | GMEB1 313 | GMEB2 314 | GPBP1 315 | GPBP1L1 316 | GRHL1 317 | GRHL2 318 | GRHL3 319 | GSC 320 | GSC2 321 | GSX1 322 | GSX2 323 | GTF2B 324 | GTF2I 325 | GTF2IRD1 326 | GTF2IRD2 327 | GTF2IRD2B 328 | GTF3A 329 | GZF1 330 | HAND1 331 | HAND2 332 | HBP1 333 | HDX 334 | HELT 335 | HES1 336 | HES2 337 | HES3 338 | HES4 339 | HES5 340 | HES6 341 | HES7 342 | HESX1 343 | HEY1 344 | HEY2 345 | HEYL 346 | HHEX 347 | HIC1 348 | HIC2 349 | HIF1A 350 | HIF3A 351 | HINFP 352 | HIVEP1 353 | HIVEP2 354 | HIVEP3 355 | HKR1 356 | HLF 357 | HLX 358 | HMBOX1 359 | HMG20A 360 | HMG20B 361 | HMGA1 362 | HMGA2 363 | HMGN3 364 | HMX1 365 | HMX2 366 | HMX3 367 | HNF1A 368 | HNF1B 369 | HNF4A 370 | HNF4G 371 | HOMEZ 372 | HOXA1 373 | HOXA10 374 | HOXA11 375 | HOXA13 376 | HOXA2 377 | HOXA3 378 | HOXA4 379 | HOXA5 380 | HOXA6 381 | HOXA7 382 | HOXA9 383 | HOXB1 384 | HOXB13 385 | HOXB2 386 | HOXB3 387 | HOXB4 388 | HOXB5 389 | HOXB6 390 | HOXB7 391 | HOXB8 392 | HOXB9 393 | HOXC10 394 | HOXC11 395 | HOXC12 396 | HOXC13 397 | HOXC4 398 | HOXC5 399 | HOXC6 400 | HOXC8 401 | HOXC9 402 | HOXD1 403 | HOXD10 404 | HOXD11 405 | HOXD12 406 | HOXD13 407 | HOXD3 408 | HOXD4 409 | HOXD8 410 | HOXD9 411 | HSF1 412 | HSF2 413 | HSF4 414 | HSF5 415 | HSFX1 416 | HSFX2 417 | HSFY1 418 | HSFY2 419 | IKZF1 420 | IKZF2 421 | IKZF3 422 | IKZF4 423 | IKZF5 424 | INSM1 425 | INSM2 426 | IRF1 427 | IRF2 428 | IRF3 429 
| IRF4 430 | IRF5 431 | IRF6 432 | IRF7 433 | IRF8 434 | IRF9 435 | IRX1 436 | IRX2 437 | IRX3 438 | IRX4 439 | IRX5 440 | IRX6 441 | ISL1 442 | ISL2 443 | ISX 444 | JAZF1 445 | JDP2 446 | JRK 447 | JRKL 448 | JUN 449 | JUNB 450 | JUND 451 | KAT7 452 | KCMF1 453 | KCNIP3 454 | KDM2A 455 | KDM2B 456 | KDM5B 457 | KIN 458 | KLF1 459 | KLF10 460 | KLF11 461 | KLF12 462 | KLF13 463 | KLF14 464 | KLF15 465 | KLF16 466 | KLF17 467 | KLF2 468 | KLF3 469 | KLF4 470 | KLF5 471 | KLF6 472 | KLF7 473 | KLF8 474 | KLF9 475 | KMT2A 476 | KMT2B 477 | L3MBTL1 478 | L3MBTL3 479 | L3MBTL4 480 | LBX1 481 | LBX2 482 | LCOR 483 | LCORL 484 | LEF1 485 | LEUTX 486 | LHX1 487 | LHX2 488 | LHX3 489 | LHX4 490 | LHX5 491 | LHX6 492 | LHX8 493 | LHX9 494 | LIN28A 495 | LIN28B 496 | LIN54 497 | LMX1A 498 | LMX1B 499 | LTF 500 | LYL1 501 | MAF 502 | MAFA 503 | MAFB 504 | MAFF 505 | MAFG 506 | MAFK 507 | MAX 508 | MAZ 509 | MBD1 510 | MBD2 511 | MBD3 512 | MBD4 513 | MBD6 514 | MBNL2 515 | MECOM 516 | MECP2 517 | MEF2A 518 | MEF2B 519 | MEF2C 520 | MEF2D 521 | MEIS1 522 | MEIS2 523 | MEIS3 524 | MEOX1 525 | MEOX2 526 | MESP1 527 | MESP2 528 | MGA 529 | MITF 530 | MIXL1 531 | MKX 532 | MLX 533 | MLXIP 534 | MLXIPL 535 | MNT 536 | MNX1 537 | MSANTD1 538 | MSANTD3 539 | MSANTD4 540 | MSC 541 | MSGN1 542 | MSX1 543 | MSX2 544 | MTERF1 545 | MTERF2 546 | MTERF3 547 | MTERF4 548 | MTF1 549 | MTF2 550 | MXD1 551 | MXD3 552 | MXD4 553 | MXI1 554 | MYB 555 | MYBL1 556 | MYBL2 557 | MYC 558 | MYCL 559 | MYCN 560 | MYF5 561 | MYF6 562 | MYNN 563 | MYOD1 564 | MYOG 565 | MYPOP 566 | MYRF 567 | MYRFL 568 | MYSM1 569 | MYT1 570 | MYT1L 571 | MZF1 572 | NACC2 573 | NAIF1 574 | NANOG 575 | NANOGNB 576 | NANOGP8 577 | NCOA1 578 | NCOA2 579 | NCOA3 580 | NEUROD1 581 | NEUROD2 582 | NEUROD4 583 | NEUROD6 584 | NEUROG1 585 | NEUROG2 586 | NEUROG3 587 | NFAT5 588 | NFATC1 589 | NFATC2 590 | NFATC3 591 | NFATC4 592 | NFE2 593 | NFE2L1 594 | NFE2L2 595 | NFE2L3 596 | NFE4 597 | NFIA 598 | NFIB 599 | NFIC 600 | NFIL3 
601 | NFIX 602 | NFKB1 603 | NFKB2 604 | NFX1 605 | NFXL1 606 | NFYA 607 | NFYB 608 | NFYC 609 | NHLH1 610 | NHLH2 611 | NKRF 612 | NKX1-1 613 | NKX1-2 614 | NKX2-1 615 | NKX2-2 616 | NKX2-3 617 | NKX2-4 618 | NKX2-5 619 | NKX2-6 620 | NKX2-8 621 | NKX3-1 622 | NKX3-2 623 | NKX6-1 624 | NKX6-2 625 | NKX6-3 626 | NME2 627 | NOBOX 628 | NOTO 629 | NPAS1 630 | NPAS2 631 | NPAS3 632 | NPAS4 633 | NR0B1 634 | NR1D1 635 | NR1D2 636 | NR1H2 637 | NR1H3 638 | NR1H4 639 | NR1I2 640 | NR1I3 641 | NR2C1 642 | NR2C2 643 | NR2E1 644 | NR2E3 645 | NR2F1 646 | NR2F2 647 | NR2F6 648 | NR3C1 649 | NR3C2 650 | NR4A1 651 | NR4A2 652 | NR4A3 653 | NR5A1 654 | NR5A2 655 | NR6A1 656 | NRF1 657 | NRL 658 | OLIG1 659 | OLIG2 660 | OLIG3 661 | ONECUT1 662 | ONECUT2 663 | ONECUT3 664 | OSR1 665 | OSR2 666 | OTP 667 | OTX1 668 | OTX2 669 | OVOL1 670 | OVOL2 671 | OVOL3 672 | PA2G4 673 | PATZ1 674 | PAX1 675 | PAX2 676 | PAX3 677 | PAX4 678 | PAX5 679 | PAX6 680 | PAX7 681 | PAX8 682 | PAX9 683 | PBX1 684 | PBX2 685 | PBX3 686 | PBX4 687 | PCGF2 688 | PCGF6 689 | PDX1 690 | PEG3 691 | PGR 692 | PHF1 693 | PHF19 694 | PHF20 695 | PHF21A 696 | PHOX2A 697 | PHOX2B 698 | PIN1 699 | PITX1 700 | PITX2 701 | PITX3 702 | PKNOX1 703 | PKNOX2 704 | PLAG1 705 | PLAGL1 706 | PLAGL2 707 | PLSCR1 708 | POGK 709 | POU1F1 710 | POU2AF1 711 | POU2F1 712 | POU2F2 713 | POU2F3 714 | POU3F1 715 | POU3F2 716 | POU3F3 717 | POU3F4 718 | POU4F1 719 | POU4F2 720 | POU4F3 721 | POU5F1 722 | POU5F1B 723 | POU5F2 724 | POU6F1 725 | POU6F2 726 | PPARA 727 | PPARD 728 | PPARG 729 | PRDM1 730 | PRDM10 731 | PRDM12 732 | PRDM13 733 | PRDM14 734 | PRDM15 735 | PRDM16 736 | PRDM2 737 | PRDM4 738 | PRDM5 739 | PRDM6 740 | PRDM8 741 | PRDM9 742 | PREB 743 | PRMT3 744 | PROP1 745 | PROX1 746 | PROX2 747 | PRR12 748 | PRRX1 749 | PRRX2 750 | PTF1A 751 | PURA 752 | PURB 753 | PURG 754 | RAG1 755 | RARA 756 | RARB 757 | RARG 758 | RAX 759 | RAX2 760 | RBAK 761 | RBCK1 762 | RBPJ 763 | RBPJL 764 | RBSN 765 | REL 766 | RELA 767 | 
RELB 768 | REPIN1 769 | REST 770 | REXO4 771 | RFX1 772 | RFX2 773 | RFX3 774 | RFX4 775 | RFX5 776 | RFX6 777 | RFX7 778 | RFX8 779 | RHOXF1 780 | RHOXF2 781 | RHOXF2B 782 | RLF 783 | RORA 784 | RORB 785 | RORC 786 | RREB1 787 | RUNX1 788 | RUNX2 789 | RUNX3 790 | RXRA 791 | RXRB 792 | RXRG 793 | SAFB 794 | SAFB2 795 | SALL1 796 | SALL2 797 | SALL3 798 | SALL4 799 | SATB1 800 | SATB2 801 | SCMH1 802 | SCML4 803 | SCRT1 804 | SCRT2 805 | SCX 806 | SEBOX 807 | SETBP1 808 | SETDB1 809 | SETDB2 810 | SGSM2 811 | SHOX 812 | SHOX2 813 | SIM1 814 | SIM2 815 | SIX1 816 | SIX2 817 | SIX3 818 | SIX4 819 | SIX5 820 | SIX6 821 | SKI 822 | SKIL 823 | SKOR1 824 | SKOR2 825 | SLC2A4RG 826 | SMAD1 827 | SMAD3 828 | SMAD4 829 | SMAD5 830 | SMAD9 831 | SMYD3 832 | SNAI1 833 | SNAI2 834 | SNAI3 835 | SNAPC2 836 | SNAPC4 837 | SNAPC5 838 | SOHLH1 839 | SOHLH2 840 | SON 841 | SOX1 842 | SOX10 843 | SOX11 844 | SOX12 845 | SOX13 846 | SOX14 847 | SOX15 848 | SOX17 849 | SOX18 850 | SOX2 851 | SOX21 852 | SOX3 853 | SOX30 854 | SOX4 855 | SOX5 856 | SOX6 857 | SOX7 858 | SOX8 859 | SOX9 860 | SP1 861 | SP100 862 | SP110 863 | SP140 864 | SP140L 865 | SP2 866 | SP3 867 | SP4 868 | SP5 869 | SP6 870 | SP7 871 | SP8 872 | SP9 873 | SPDEF 874 | SPEN 875 | SPI1 876 | SPIB 877 | SPIC 878 | SPZ1 879 | SRCAP 880 | SREBF1 881 | SREBF2 882 | SRF 883 | SRY 884 | ST18 885 | STAT1 886 | STAT2 887 | STAT3 888 | STAT4 889 | STAT5A 890 | STAT5B 891 | STAT6 892 | T 893 | TAL1 894 | TAL2 895 | TBP 896 | TBPL1 897 | TBPL2 898 | TBR1 899 | TBX1 900 | TBX10 901 | TBX15 902 | TBX18 903 | TBX19 904 | TBX2 905 | TBX20 906 | TBX21 907 | TBX22 908 | TBX3 909 | TBX4 910 | TBX5 911 | TBX6 912 | TCF12 913 | TCF15 914 | TCF20 915 | TCF21 916 | TCF23 917 | TCF24 918 | TCF3 919 | TCF4 920 | TCF7 921 | TCF7L1 922 | TCF7L2 923 | TCFL5 924 | TEAD1 925 | TEAD2 926 | TEAD3 927 | TEAD4 928 | TEF 929 | TERB1 930 | TERF1 931 | TERF2 932 | TET1 933 | TET2 934 | TET3 935 | TFAP2A 936 | TFAP2B 937 | TFAP2C 938 | TFAP2D 939 | 
TFAP2E 940 | TFAP4 941 | TFCP2 942 | TFCP2L1 943 | TFDP1 944 | TFDP2 945 | TFDP3 946 | TFE3 947 | TFEB 948 | TFEC 949 | TGIF1 950 | TGIF2 951 | TGIF2LX 952 | TGIF2LY 953 | THAP1 954 | THAP10 955 | THAP11 956 | THAP12 957 | THAP2 958 | THAP3 959 | THAP4 960 | THAP5 961 | THAP6 962 | THAP7 963 | THAP8 964 | THAP9 965 | THRA 966 | THRB 967 | THYN1 968 | TIGD1 969 | TIGD2 970 | TIGD3 971 | TIGD4 972 | TIGD5 973 | TIGD6 974 | TIGD7 975 | TLX1 976 | TLX2 977 | TLX3 978 | TMF1 979 | TOPORS 980 | TP53 981 | TP63 982 | TP73 983 | TPRX1 984 | TRAFD1 985 | TRERF1 986 | TRPS1 987 | TSC22D1 988 | TSHZ1 989 | TSHZ2 990 | TSHZ3 991 | TTF1 992 | TWIST1 993 | TWIST2 994 | UBP1 995 | UNCX 996 | USF1 997 | USF2 998 | USF3 999 | VAX1 1000 | VAX2 1001 | VDR 1002 | VENTX 1003 | VEZF1 1004 | VSX1 1005 | VSX2 1006 | WIZ 1007 | WT1 1008 | XBP1 1009 | XPA 1010 | YBX1 1011 | YBX2 1012 | YBX3 1013 | YY1 1014 | YY2 1015 | ZBED1 1016 | ZBED2 1017 | ZBED3 1018 | ZBED4 1019 | ZBED5 1020 | ZBED6 1021 | ZBED9 1022 | ZBTB1 1023 | ZBTB10 1024 | ZBTB11 1025 | ZBTB12 1026 | ZBTB14 1027 | ZBTB16 1028 | ZBTB17 1029 | ZBTB18 1030 | ZBTB2 1031 | ZBTB20 1032 | ZBTB21 1033 | ZBTB22 1034 | ZBTB24 1035 | ZBTB25 1036 | ZBTB26 1037 | ZBTB3 1038 | ZBTB32 1039 | ZBTB33 1040 | ZBTB34 1041 | ZBTB37 1042 | ZBTB38 1043 | ZBTB39 1044 | ZBTB4 1045 | ZBTB40 1046 | ZBTB41 1047 | ZBTB42 1048 | ZBTB43 1049 | ZBTB44 1050 | ZBTB45 1051 | ZBTB46 1052 | ZBTB47 1053 | ZBTB48 1054 | ZBTB49 1055 | ZBTB5 1056 | ZBTB6 1057 | ZBTB7A 1058 | ZBTB7B 1059 | ZBTB7C 1060 | ZBTB8A 1061 | ZBTB8B 1062 | ZBTB9 1063 | ZC3H8 1064 | ZEB1 1065 | ZEB2 1066 | ZFAT 1067 | ZFHX2 1068 | ZFHX3 1069 | ZFHX4 1070 | ZFP1 1071 | ZFP14 1072 | ZFP2 1073 | ZFP28 1074 | ZFP3 1075 | ZFP30 1076 | ZFP37 1077 | ZFP41 1078 | ZFP42 1079 | ZFP57 1080 | ZFP62 1081 | ZFP64 1082 | ZFP69 1083 | ZFP69B 1084 | ZFP82 1085 | ZFP90 1086 | ZFP91 1087 | ZFP92 1088 | ZFPM1 1089 | ZFPM2 1090 | ZFX 1091 | ZFY 1092 | ZGLP1 1093 | ZGPAT 1094 | ZHX1 1095 | ZHX2 1096 | ZHX3 1097 | ZIC1 
1098 | ZIC2 1099 | ZIC3 1100 | ZIC4 1101 | ZIC5 1102 | ZIK1 1103 | ZIM2 1104 | ZIM3 1105 | ZKSCAN1 1106 | ZKSCAN2 1107 | ZKSCAN3 1108 | ZKSCAN4 1109 | ZKSCAN5 1110 | ZKSCAN7 1111 | ZKSCAN8 1112 | ZMAT1 1113 | ZMAT4 1114 | ZNF10 1115 | ZNF100 1116 | ZNF101 1117 | ZNF107 1118 | ZNF112 1119 | ZNF114 1120 | ZNF117 1121 | ZNF12 1122 | ZNF121 1123 | ZNF124 1124 | ZNF131 1125 | ZNF132 1126 | ZNF133 1127 | ZNF134 1128 | ZNF135 1129 | ZNF136 1130 | ZNF138 1131 | ZNF14 1132 | ZNF140 1133 | ZNF141 1134 | ZNF142 1135 | ZNF143 1136 | ZNF146 1137 | ZNF148 1138 | ZNF154 1139 | ZNF155 1140 | ZNF157 1141 | ZNF16 1142 | ZNF160 1143 | ZNF165 1144 | ZNF169 1145 | ZNF17 1146 | ZNF174 1147 | ZNF175 1148 | ZNF177 1149 | ZNF18 1150 | ZNF180 1151 | ZNF181 1152 | ZNF182 1153 | ZNF184 1154 | ZNF189 1155 | ZNF19 1156 | ZNF195 1157 | ZNF197 1158 | ZNF2 1159 | ZNF20 1160 | ZNF200 1161 | ZNF202 1162 | ZNF205 1163 | ZNF207 1164 | ZNF208 1165 | ZNF211 1166 | ZNF212 1167 | ZNF213 1168 | ZNF214 1169 | ZNF215 1170 | ZNF217 1171 | ZNF219 1172 | ZNF22 1173 | ZNF221 1174 | ZNF222 1175 | ZNF223 1176 | ZNF224 1177 | ZNF225 1178 | ZNF226 1179 | ZNF227 1180 | ZNF229 1181 | ZNF23 1182 | ZNF230 1183 | ZNF232 1184 | ZNF233 1185 | ZNF234 1186 | ZNF235 1187 | ZNF236 1188 | ZNF239 1189 | ZNF24 1190 | ZNF248 1191 | ZNF25 1192 | ZNF250 1193 | ZNF251 1194 | ZNF253 1195 | ZNF254 1196 | ZNF256 1197 | ZNF257 1198 | ZNF26 1199 | ZNF260 1200 | ZNF263 1201 | ZNF264 1202 | ZNF266 1203 | ZNF267 1204 | ZNF268 1205 | ZNF273 1206 | ZNF274 1207 | ZNF275 1208 | ZNF276 1209 | ZNF277 1210 | ZNF28 1211 | ZNF280A 1212 | ZNF280B 1213 | ZNF280C 1214 | ZNF280D 1215 | ZNF281 1216 | ZNF282 1217 | ZNF283 1218 | ZNF284 1219 | ZNF285 1220 | ZNF286A 1221 | ZNF286B 1222 | ZNF287 1223 | ZNF292 1224 | ZNF296 1225 | ZNF3 1226 | ZNF30 1227 | ZNF300 1228 | ZNF302 1229 | ZNF304 1230 | ZNF311 1231 | ZNF316 1232 | ZNF317 1233 | ZNF318 1234 | ZNF319 1235 | ZNF32 1236 | ZNF320 1237 | ZNF322 1238 | ZNF324 1239 | ZNF324B 1240 | ZNF326 1241 | ZNF329 1242 
| ZNF331 1243 | ZNF333 1244 | ZNF334 1245 | ZNF335 1246 | ZNF337 1247 | ZNF33A 1248 | ZNF33B 1249 | ZNF34 1250 | ZNF341 1251 | ZNF343 1252 | ZNF345 1253 | ZNF346 1254 | ZNF347 1255 | ZNF35 1256 | ZNF350 1257 | ZNF354A 1258 | ZNF354B 1259 | ZNF354C 1260 | ZNF358 1261 | ZNF362 1262 | ZNF365 1263 | ZNF366 1264 | ZNF367 1265 | ZNF37A 1266 | ZNF382 1267 | ZNF383 1268 | ZNF384 1269 | ZNF385A 1270 | ZNF385B 1271 | ZNF385C 1272 | ZNF385D 1273 | ZNF391 1274 | ZNF394 1275 | ZNF395 1276 | ZNF396 1277 | ZNF397 1278 | ZNF398 1279 | ZNF404 1280 | ZNF407 1281 | ZNF408 1282 | ZNF41 1283 | ZNF410 1284 | ZNF414 1285 | ZNF415 1286 | ZNF416 1287 | ZNF417 1288 | ZNF418 1289 | ZNF419 1290 | ZNF420 1291 | ZNF423 1292 | ZNF425 1293 | ZNF426 1294 | ZNF428 1295 | ZNF429 1296 | ZNF43 1297 | ZNF430 1298 | ZNF431 1299 | ZNF432 1300 | ZNF433 1301 | ZNF436 1302 | ZNF438 1303 | ZNF439 1304 | ZNF44 1305 | ZNF440 1306 | ZNF441 1307 | ZNF442 1308 | ZNF443 1309 | ZNF444 1310 | ZNF445 1311 | ZNF446 1312 | ZNF449 1313 | ZNF45 1314 | ZNF451 1315 | ZNF454 1316 | ZNF460 1317 | ZNF461 1318 | ZNF462 1319 | ZNF467 1320 | ZNF468 1321 | ZNF469 1322 | ZNF470 1323 | ZNF471 1324 | ZNF473 1325 | ZNF474 1326 | ZNF479 1327 | ZNF48 1328 | ZNF480 1329 | ZNF483 1330 | ZNF484 1331 | ZNF485 1332 | ZNF486 1333 | ZNF487 1334 | ZNF488 1335 | ZNF490 1336 | ZNF491 1337 | ZNF492 1338 | ZNF493 1339 | ZNF496 1340 | ZNF497 1341 | ZNF500 1342 | ZNF501 1343 | ZNF502 1344 | ZNF503 1345 | ZNF506 1346 | ZNF507 1347 | ZNF510 1348 | ZNF511 1349 | ZNF512 1350 | ZNF512B 1351 | ZNF513 1352 | ZNF514 1353 | ZNF516 1354 | ZNF517 1355 | ZNF518A 1356 | ZNF518B 1357 | ZNF519 1358 | ZNF521 1359 | ZNF524 1360 | ZNF525 1361 | ZNF526 1362 | ZNF527 1363 | ZNF528 1364 | ZNF529 1365 | ZNF530 1366 | ZNF532 1367 | ZNF534 1368 | ZNF536 1369 | ZNF540 1370 | ZNF541 1371 | ZNF543 1372 | ZNF544 1373 | ZNF546 1374 | ZNF547 1375 | ZNF548 1376 | ZNF549 1377 | ZNF550 1378 | ZNF551 1379 | ZNF552 1380 | ZNF554 1381 | ZNF555 1382 | ZNF556 1383 | ZNF557 1384 | ZNF558 
1385 | ZNF559 1386 | ZNF560 1387 | ZNF561 1388 | ZNF562 1389 | ZNF563 1390 | ZNF564 1391 | ZNF565 1392 | ZNF566 1393 | ZNF567 1394 | ZNF568 1395 | ZNF569 1396 | ZNF57 1397 | ZNF570 1398 | ZNF571 1399 | ZNF572 1400 | ZNF573 1401 | ZNF574 1402 | ZNF575 1403 | ZNF576 1404 | ZNF577 1405 | ZNF578 1406 | ZNF579 1407 | ZNF580 1408 | ZNF581 1409 | ZNF582 1410 | ZNF583 1411 | ZNF584 1412 | ZNF585A 1413 | ZNF585B 1414 | ZNF586 1415 | ZNF587 1416 | ZNF587B 1417 | ZNF589 1418 | ZNF592 1419 | ZNF594 1420 | ZNF595 1421 | ZNF596 1422 | ZNF597 1423 | ZNF598 1424 | ZNF599 1425 | ZNF600 1426 | ZNF605 1427 | ZNF606 1428 | ZNF607 1429 | ZNF608 1430 | ZNF609 1431 | ZNF610 1432 | ZNF611 1433 | ZNF613 1434 | ZNF614 1435 | ZNF615 1436 | ZNF616 1437 | ZNF618 1438 | ZNF619 1439 | ZNF620 1440 | ZNF621 1441 | ZNF623 1442 | ZNF624 1443 | ZNF625 1444 | ZNF626 1445 | ZNF627 1446 | ZNF628 1447 | ZNF629 1448 | ZNF630 1449 | ZNF639 1450 | ZNF641 1451 | ZNF644 1452 | ZNF645 1453 | ZNF646 1454 | ZNF648 1455 | ZNF649 1456 | ZNF652 1457 | ZNF653 1458 | ZNF654 1459 | ZNF655 1460 | ZNF658 1461 | ZNF66 1462 | ZNF660 1463 | ZNF662 1464 | ZNF664 1465 | ZNF665 1466 | ZNF667 1467 | ZNF668 1468 | ZNF669 1469 | ZNF670 1470 | ZNF671 1471 | ZNF672 1472 | ZNF674 1473 | ZNF675 1474 | ZNF676 1475 | ZNF677 1476 | ZNF678 1477 | ZNF679 1478 | ZNF680 1479 | ZNF681 1480 | ZNF682 1481 | ZNF683 1482 | ZNF684 1483 | ZNF687 1484 | ZNF688 1485 | ZNF689 1486 | ZNF69 1487 | ZNF691 1488 | ZNF692 1489 | ZNF695 1490 | ZNF696 1491 | ZNF697 1492 | ZNF699 1493 | ZNF7 1494 | ZNF70 1495 | ZNF700 1496 | ZNF701 1497 | ZNF703 1498 | ZNF704 1499 | ZNF705A 1500 | ZNF705B 1501 | ZNF705D 1502 | ZNF705E 1503 | ZNF705G 1504 | ZNF706 1505 | ZNF707 1506 | ZNF708 1507 | ZNF709 1508 | ZNF71 1509 | ZNF710 1510 | ZNF711 1511 | ZNF713 1512 | ZNF714 1513 | ZNF716 1514 | ZNF717 1515 | ZNF718 1516 | ZNF721 1517 | ZNF724 1518 | ZNF726 1519 | ZNF727 1520 | ZNF728 1521 | ZNF729 1522 | ZNF730 1523 | ZNF732 1524 | ZNF735 1525 | ZNF736 1526 | ZNF737 1527 | 
ZNF74 1528 | ZNF740 1529 | ZNF746 1530 | ZNF747 1531 | ZNF749 1532 | ZNF750 1533 | ZNF75A 1534 | ZNF75D 1535 | ZNF76 1536 | ZNF761 1537 | ZNF763 1538 | ZNF764 1539 | ZNF765 1540 | ZNF766 1541 | ZNF768 1542 | ZNF77 1543 | ZNF770 1544 | ZNF771 1545 | ZNF772 1546 | ZNF773 1547 | ZNF774 1548 | ZNF775 1549 | ZNF776 1550 | ZNF777 1551 | ZNF778 1552 | ZNF780A 1553 | ZNF780B 1554 | ZNF781 1555 | ZNF782 1556 | ZNF783 1557 | ZNF784 1558 | ZNF785 1559 | ZNF786 1560 | ZNF787 1561 | ZNF788 1562 | ZNF789 1563 | ZNF79 1564 | ZNF790 1565 | ZNF791 1566 | ZNF792 1567 | ZNF793 1568 | ZNF799 1569 | ZNF8 1570 | ZNF80 1571 | ZNF800 1572 | ZNF804A 1573 | ZNF804B 1574 | ZNF805 1575 | ZNF808 1576 | ZNF81 1577 | ZNF813 1578 | ZNF814 1579 | ZNF816 1580 | ZNF821 1581 | ZNF823 1582 | ZNF827 1583 | ZNF829 1584 | ZNF83 1585 | ZNF830 1586 | ZNF831 1587 | ZNF835 1588 | ZNF836 1589 | ZNF837 1590 | ZNF84 1591 | ZNF841 1592 | ZNF843 1593 | ZNF844 1594 | ZNF845 1595 | ZNF846 1596 | ZNF85 1597 | ZNF850 1598 | ZNF852 1599 | ZNF853 1600 | ZNF860 1601 | ZNF865 1602 | ZNF878 1603 | ZNF879 1604 | ZNF880 1605 | ZNF883 1606 | ZNF888 1607 | ZNF891 1608 | ZNF90 1609 | ZNF91 1610 | ZNF92 1611 | ZNF93 1612 | ZNF98 1613 | ZNF99 1614 | ZSCAN1 1615 | ZSCAN10 1616 | ZSCAN12 1617 | ZSCAN16 1618 | ZSCAN18 1619 | ZSCAN2 1620 | ZSCAN20 1621 | ZSCAN21 1622 | ZSCAN22 1623 | ZSCAN23 1624 | ZSCAN25 1625 | ZSCAN26 1626 | ZSCAN29 1627 | ZSCAN30 1628 | ZSCAN31 1629 | ZSCAN32 1630 | ZSCAN4 1631 | ZSCAN5A 1632 | ZSCAN5B 1633 | ZSCAN5C 1634 | ZSCAN9 1635 | ZUFSP 1636 | ZXDA 1637 | ZXDB 1638 | ZXDC 1639 | ZZZ3 1640 | -------------------------------------------------------------------------------- /datasets/humanTFs_v3.txt: -------------------------------------------------------------------------------- 1 | SORBS2 2 | CEBPB 3 | EBF1 4 | ETS2 5 | FOXC1 6 | ID3 7 | MEF2C 8 | NR2F2 9 | NR4A2 10 | NR4A3 11 | SMAD7 12 | ZFHX3 13 | ZNF90 14 | IFI16 15 | HMGA1 16 | PRRX1 17 | KLF5 18 | FBN1 19 | PLAGL1 20 | FOXS1 21 | HMGB3 22 | DEPDC1 23 
| FOXM1 24 | MXD3 25 | HMGB2 26 | HMGB1 27 | E2F7 28 | EZH2 29 | HIST1H1B 30 | HIST1H1D 31 | MYBL1 32 | DEK 33 | MYBL2 34 | E2F1 35 | H1FX 36 | CARHSP1 37 | HIST1H1A 38 | HIST1H1C 39 | HIST1H1E 40 | LHX2 41 | PAX6 42 | POU3F2 43 | SOX11 44 | ARX 45 | CHD9 46 | FOXJ1 47 | GSX2 48 | HES5 49 | INSM1 50 | NEUROD1 51 | OSR1 52 | PBX1 53 | POU3F4 54 | PROX1 55 | SALL3 56 | SOX21 57 | ZMAT1 58 | ZNF117 59 | CHD7 60 | H1F0 61 | HEY2 62 | JDP2 63 | MLXIP 64 | NFATC1 65 | OSR2 66 | SEMA4A 67 | SKIL 68 | TSC22D3 69 | ZNF331 70 | ZNF503 71 | DBX2 72 | RORA 73 | TCF12 74 | ZIC1 75 | NFIB 76 | NR2F1 77 | PITX1 78 | RORB 79 | STAT1 80 | STAT2 81 | MEOX2 82 | ASCL1 83 | ETV1 84 | HES6 85 | NFIA 86 | OLIG2 87 | RFX4 88 | SOX8 89 | TCF4 90 | ZEB1 91 | ZNF704 92 | HEY1 93 | MEIS2 94 | POU3F3 95 | SOX2 96 | MITF 97 | PAX3 98 | PLXNC1 99 | SNAI2 100 | EPAS1 101 | MAF 102 | TBX2 103 | MET 104 | PLXNA1 105 | AHR 106 | GLIS3 107 | PAWR 108 | BMP2 109 | DRAP1 110 | ELK3 111 | FOSL1 112 | FOXP1 113 | GTF2F2 114 | HMGA2 115 | HOXB2 116 | ID1 117 | KLF7 118 | NR1D1 119 | PRDM1 120 | RUNX1 121 | TBX3 122 | HES1 123 | HIC1 124 | TWIST1 125 | XBP1 126 | PLXNA4 127 | ARID5B 128 | KLF9 129 | MACF1 130 | EGR3 131 | MYC 132 | NFIL3 133 | NR4A1 134 | ATF3 135 | CREB5 136 | EGR1 137 | EGR2 138 | FOS 139 | FOSB 140 | ID4 141 | JUN 142 | JUNB 143 | JUND 144 | KLF10 145 | KLF2 146 | KLF4 147 | KLF6 148 | MAFF 149 | ZFP36 150 | ZFP36L1 151 | ZFP36L2 152 | DDIT3 153 | FOSL2 154 | IRF1 155 | TIPARP 156 | TSC22D1 157 | HOPX 158 | OLIG1 159 | TSC22D4 160 | DPF3 161 | HES4 162 | ID2 163 | SMAD1 164 | ZBTB20 165 | BAZ2B 166 | FAM171B 167 | SOX9 168 | TSHZ2 169 | ZFHX4 170 | ZMAT3 171 | NFATC2 172 | TFAP2B 173 | TFAP2A 174 | GPR155 175 | POU3F1 176 | RXRG 177 | SOX10 178 | SOX4 179 | SOX6 180 | ZEB2 181 | ZNF536 182 | AC008770.3 183 | AC023509.3 184 | AC092835.1 185 | AC138696.1 186 | ADNP 187 | ADNP2 188 | AEBP1 189 | AEBP2 190 | AHCTF1 191 | AHDC1 192 | AHR 193 | AHRR 194 | AIRE 195 | AKAP8 196 | AKAP8L 197 | 
AKNA 198 | ALX1 199 | ALX3 200 | ALX4 201 | ANHX 202 | ANKZF1 203 | AR 204 | ARGFX 205 | ARHGAP35 206 | ARID2 207 | ARID3A 208 | ARID3B 209 | ARID3C 210 | ARID5A 211 | ARID5B 212 | ARNT 213 | ARNT2 214 | ARNTL 215 | ARNTL2 216 | ARX 217 | ASCL1 218 | ASCL2 219 | ASCL3 220 | ASCL4 221 | ASCL5 222 | ASH1L 223 | ATF1 224 | ATF2 225 | ATF3 226 | ATF4 227 | ATF5 228 | ATF6 229 | ATF6B 230 | ATF7 231 | ATMIN 232 | ATOH1 233 | ATOH7 234 | ATOH8 235 | BACH1 236 | BACH2 237 | BARHL1 238 | BARHL2 239 | BARX1 240 | BARX2 241 | BATF 242 | BATF2 243 | BATF3 244 | BAZ2A 245 | BAZ2B 246 | BBX 247 | BCL11A 248 | BCL11B 249 | BCL6 250 | BCL6B 251 | BHLHA15 252 | BHLHA9 253 | BHLHE22 254 | BHLHE23 255 | BHLHE40 256 | BHLHE41 257 | BNC1 258 | BNC2 259 | BORCS8-MEF2B 260 | BPTF 261 | BRF2 262 | BSX 263 | C11orf95 264 | CAMTA1 265 | CAMTA2 266 | CARF 267 | CASZ1 268 | CBX2 269 | CC2D1A 270 | CCDC169-SOHLH2 271 | CCDC17 272 | CDC5L 273 | CDX1 274 | CDX2 275 | CDX4 276 | CEBPA 277 | CEBPB 278 | CEBPD 279 | CEBPE 280 | CEBPG 281 | CEBPZ 282 | CENPA 283 | CENPB 284 | CENPBD1 285 | CENPS 286 | CENPT 287 | CENPX 288 | CGGBP1 289 | CHAMP1 290 | CHCHD3 291 | CIC 292 | CLOCK 293 | CPEB1 294 | CPXCR1 295 | CREB1 296 | CREB3 297 | CREB3L1 298 | CREB3L2 299 | CREB3L3 300 | CREB3L4 301 | CREB5 302 | CREBL2 303 | CREBZF 304 | CREM 305 | CRX 306 | CSRNP1 307 | CSRNP2 308 | CSRNP3 309 | CTCF 310 | CTCFL 311 | CUX1 312 | CUX2 313 | CXXC1 314 | CXXC4 315 | CXXC5 316 | DACH1 317 | DACH2 318 | DBP 319 | DBX1 320 | DBX2 321 | DDIT3 322 | DEAF1 323 | DLX1 324 | DLX2 325 | DLX3 326 | DLX4 327 | DLX5 328 | DLX6 329 | DMBX1 330 | DMRT1 331 | DMRT2 332 | DMRT3 333 | DMRTA1 334 | DMRTA2 335 | DMRTB1 336 | DMRTC2 337 | DMTF1 338 | DNMT1 339 | DNTTIP1 340 | DOT1L 341 | DPF1 342 | DPF3 343 | DPRX 344 | DR1 345 | DRAP1 346 | DRGX 347 | DUX1 348 | DUX3 349 | DUX4 350 | DUXA 351 | DZIP1 352 | E2F1 353 | E2F2 354 | E2F3 355 | E2F4 356 | E2F5 357 | E2F6 358 | E2F7 359 | E2F8 360 | E4F1 361 | EBF1 362 | EBF2 363 | EBF3 
364 | EBF4 365 | EEA1 366 | EGR1 367 | EGR2 368 | EGR3 369 | EGR4 370 | EHF 371 | ELF1 372 | ELF2 373 | ELF3 374 | ELF4 375 | ELF5 376 | ELK1 377 | ELK3 378 | ELK4 379 | EMX1 380 | EMX2 381 | EN1 382 | EN2 383 | EOMES 384 | EPAS1 385 | ERF 386 | ERG 387 | ESR1 388 | ESR2 389 | ESRRA 390 | ESRRB 391 | ESRRG 392 | ESX1 393 | ETS1 394 | ETS2 395 | ETV1 396 | ETV2 397 | ETV3 398 | ETV3L 399 | ETV4 400 | ETV5 401 | ETV6 402 | ETV7 403 | EVX1 404 | EVX2 405 | FAM170A 406 | FAM200B 407 | FBXL19 408 | FERD3L 409 | FEV 410 | FEZF1 411 | FEZF2 412 | FIGLA 413 | FIZ1 414 | FLI1 415 | FLYWCH1 416 | FOS 417 | FOSB 418 | FOSL1 419 | FOSL2 420 | FOXA1 421 | FOXA2 422 | FOXA3 423 | FOXB1 424 | FOXB2 425 | FOXC1 426 | FOXC2 427 | FOXD1 428 | FOXD2 429 | FOXD3 430 | FOXD4 431 | FOXD4L1 432 | FOXD4L3 433 | FOXD4L4 434 | FOXD4L5 435 | FOXD4L6 436 | FOXE1 437 | FOXE3 438 | FOXF1 439 | FOXF2 440 | FOXG1 441 | FOXH1 442 | FOXI1 443 | FOXI2 444 | FOXI3 445 | FOXJ1 446 | FOXJ2 447 | FOXJ3 448 | FOXK1 449 | FOXK2 450 | FOXL1 451 | FOXL2 452 | FOXM1 453 | FOXN1 454 | FOXN2 455 | FOXN3 456 | FOXN4 457 | FOXO1 458 | FOXO3 459 | FOXO4 460 | FOXO6 461 | FOXP1 462 | FOXP2 463 | FOXP3 464 | FOXP4 465 | FOXQ1 466 | FOXR1 467 | FOXR2 468 | FOXS1 469 | GABPA 470 | GATA1 471 | GATA2 472 | GATA3 473 | GATA4 474 | GATA5 475 | GATA6 476 | GATAD2A 477 | GATAD2B 478 | GBX1 479 | GBX2 480 | GCM1 481 | GCM2 482 | GFI1 483 | GFI1B 484 | GLI1 485 | GLI2 486 | GLI3 487 | GLI4 488 | GLIS1 489 | GLIS2 490 | GLIS3 491 | GLMP 492 | GLYR1 493 | GMEB1 494 | GMEB2 495 | GPBP1 496 | GPBP1L1 497 | GRHL1 498 | GRHL2 499 | GRHL3 500 | GSC 501 | GSC2 502 | GSX1 503 | GSX2 504 | GTF2B 505 | GTF2I 506 | GTF2IRD1 507 | GTF2IRD2 508 | GTF2IRD2B 509 | GTF3A 510 | GZF1 511 | HAND1 512 | HAND2 513 | HBP1 514 | HDX 515 | HELT 516 | HES1 517 | HES2 518 | HES3 519 | HES4 520 | HES5 521 | HES6 522 | HES7 523 | HESX1 524 | HEY1 525 | HEY2 526 | HEYL 527 | HHEX 528 | HIC1 529 | HIC2 530 | HIF1A 531 | HIF3A 532 | HINFP 533 | HIVEP1 534 
| HIVEP2 535 | HIVEP3 536 | HKR1 537 | HLF 538 | HLX 539 | HMBOX1 540 | HMG20A 541 | HMG20B 542 | HMGA1 543 | HMGA2 544 | HMGN3 545 | HMX1 546 | HMX2 547 | HMX3 548 | HNF1A 549 | HNF1B 550 | HNF4A 551 | HNF4G 552 | HOMEZ 553 | HOXA1 554 | HOXA10 555 | HOXA11 556 | HOXA13 557 | HOXA2 558 | HOXA3 559 | HOXA4 560 | HOXA5 561 | HOXA6 562 | HOXA7 563 | HOXA9 564 | HOXB1 565 | HOXB13 566 | HOXB2 567 | HOXB3 568 | HOXB4 569 | HOXB5 570 | HOXB6 571 | HOXB7 572 | HOXB8 573 | HOXB9 574 | HOXC10 575 | HOXC11 576 | HOXC12 577 | HOXC13 578 | HOXC4 579 | HOXC5 580 | HOXC6 581 | HOXC8 582 | HOXC9 583 | HOXD1 584 | HOXD10 585 | HOXD11 586 | HOXD12 587 | HOXD13 588 | HOXD3 589 | HOXD4 590 | HOXD8 591 | HOXD9 592 | HSF1 593 | HSF2 594 | HSF4 595 | HSF5 596 | HSFX1 597 | HSFX2 598 | HSFY1 599 | HSFY2 600 | IKZF1 601 | IKZF2 602 | IKZF3 603 | IKZF4 604 | IKZF5 605 | INSM1 606 | INSM2 607 | IRF1 608 | IRF2 609 | IRF3 610 | IRF4 611 | IRF5 612 | IRF6 613 | IRF7 614 | IRF8 615 | IRF9 616 | IRX1 617 | IRX2 618 | IRX3 619 | IRX4 620 | IRX5 621 | IRX6 622 | ISL1 623 | ISL2 624 | ISX 625 | JAZF1 626 | JDP2 627 | JRK 628 | JRKL 629 | JUN 630 | JUNB 631 | JUND 632 | KAT7 633 | KCMF1 634 | KCNIP3 635 | KDM2A 636 | KDM2B 637 | KDM5B 638 | KIN 639 | KLF1 640 | KLF10 641 | KLF11 642 | KLF12 643 | KLF13 644 | KLF14 645 | KLF15 646 | KLF16 647 | KLF17 648 | KLF2 649 | KLF3 650 | KLF4 651 | KLF5 652 | KLF6 653 | KLF7 654 | KLF8 655 | KLF9 656 | KMT2A 657 | KMT2B 658 | L3MBTL1 659 | L3MBTL3 660 | L3MBTL4 661 | LBX1 662 | LBX2 663 | LCOR 664 | LCORL 665 | LEF1 666 | LEUTX 667 | LHX1 668 | LHX2 669 | LHX3 670 | LHX4 671 | LHX5 672 | LHX6 673 | LHX8 674 | LHX9 675 | LIN28A 676 | LIN28B 677 | LIN54 678 | LMX1A 679 | LMX1B 680 | LTF 681 | LYL1 682 | MAF 683 | MAFA 684 | MAFB 685 | MAFF 686 | MAFG 687 | MAFK 688 | MAX 689 | MAZ 690 | MBD1 691 | MBD2 692 | MBD3 693 | MBD4 694 | MBD6 695 | MBNL2 696 | MECOM 697 | MECP2 698 | MEF2A 699 | MEF2B 700 | MEF2C 701 | MEF2D 702 | MEIS1 703 | MEIS2 704 | MEIS3 705 | 
MEOX1 706 | MEOX2 707 | MESP1 708 | MESP2 709 | MGA 710 | MITF 711 | MIXL1 712 | MKX 713 | MLX 714 | MLXIP 715 | MLXIPL 716 | MNT 717 | MNX1 718 | MSANTD1 719 | MSANTD3 720 | MSANTD4 721 | MSC 722 | MSGN1 723 | MSX1 724 | MSX2 725 | MTERF1 726 | MTERF2 727 | MTERF3 728 | MTERF4 729 | MTF1 730 | MTF2 731 | MXD1 732 | MXD3 733 | MXD4 734 | MXI1 735 | MYB 736 | MYBL1 737 | MYBL2 738 | MYC 739 | MYCL 740 | MYCN 741 | MYF5 742 | MYF6 743 | MYNN 744 | MYOD1 745 | MYOG 746 | MYPOP 747 | MYRF 748 | MYRFL 749 | MYSM1 750 | MYT1 751 | MYT1L 752 | MZF1 753 | NACC2 754 | NAIF1 755 | NANOG 756 | NANOGNB 757 | NANOGP8 758 | NCOA1 759 | NCOA2 760 | NCOA3 761 | NEUROD1 762 | NEUROD2 763 | NEUROD4 764 | NEUROD6 765 | NEUROG1 766 | NEUROG2 767 | NEUROG3 768 | NFAT5 769 | NFATC1 770 | NFATC2 771 | NFATC3 772 | NFATC4 773 | NFE2 774 | NFE2L1 775 | NFE2L2 776 | NFE2L3 777 | NFE4 778 | NFIA 779 | NFIB 780 | NFIC 781 | NFIL3 782 | NFIX 783 | NFKB1 784 | NFKB2 785 | NFX1 786 | NFXL1 787 | NFYA 788 | NFYB 789 | NFYC 790 | NHLH1 791 | NHLH2 792 | NKRF 793 | NKX1-1 794 | NKX1-2 795 | NKX2-1 796 | NKX2-2 797 | NKX2-3 798 | NKX2-4 799 | NKX2-5 800 | NKX2-6 801 | NKX2-8 802 | NKX3-1 803 | NKX3-2 804 | NKX6-1 805 | NKX6-2 806 | NKX6-3 807 | NME2 808 | NOBOX 809 | NOTO 810 | NPAS1 811 | NPAS2 812 | NPAS3 813 | NPAS4 814 | NR0B1 815 | NR1D1 816 | NR1D2 817 | NR1H2 818 | NR1H3 819 | NR1H4 820 | NR1I2 821 | NR1I3 822 | NR2C1 823 | NR2C2 824 | NR2E1 825 | NR2E3 826 | NR2F1 827 | NR2F2 828 | NR2F6 829 | NR3C1 830 | NR3C2 831 | NR4A1 832 | NR4A2 833 | NR4A3 834 | NR5A1 835 | NR5A2 836 | NR6A1 837 | NRF1 838 | NRL 839 | OLIG1 840 | OLIG2 841 | OLIG3 842 | ONECUT1 843 | ONECUT2 844 | ONECUT3 845 | OSR1 846 | OSR2 847 | OTP 848 | OTX1 849 | OTX2 850 | OVOL1 851 | OVOL2 852 | OVOL3 853 | PA2G4 854 | PATZ1 855 | PAX1 856 | PAX2 857 | PAX3 858 | PAX4 859 | PAX5 860 | PAX6 861 | PAX7 862 | PAX8 863 | PAX9 864 | PBX1 865 | PBX2 866 | PBX3 867 | PBX4 868 | PCGF2 869 | PCGF6 870 | PDX1 871 | PEG3 872 | PGR 873 | 
PHF1 874 | PHF19 875 | PHF20 876 | PHF21A 877 | PHOX2A 878 | PHOX2B 879 | PIN1 880 | PITX1 881 | PITX2 882 | PITX3 883 | PKNOX1 884 | PKNOX2 885 | PLAG1 886 | PLAGL1 887 | PLAGL2 888 | PLSCR1 889 | POGK 890 | POU1F1 891 | POU2AF1 892 | POU2F1 893 | POU2F2 894 | POU2F3 895 | POU3F1 896 | POU3F2 897 | POU3F3 898 | POU3F4 899 | POU4F1 900 | POU4F2 901 | POU4F3 902 | POU5F1 903 | POU5F1B 904 | POU5F2 905 | POU6F1 906 | POU6F2 907 | PPARA 908 | PPARD 909 | PPARG 910 | PRDM1 911 | PRDM10 912 | PRDM12 913 | PRDM13 914 | PRDM14 915 | PRDM15 916 | PRDM16 917 | PRDM2 918 | PRDM4 919 | PRDM5 920 | PRDM6 921 | PRDM8 922 | PRDM9 923 | PREB 924 | PRMT3 925 | PROP1 926 | PROX1 927 | PROX2 928 | PRR12 929 | PRRX1 930 | PRRX2 931 | PTF1A 932 | PURA 933 | PURB 934 | PURG 935 | RAG1 936 | RARA 937 | RARB 938 | RARG 939 | RAX 940 | RAX2 941 | RBAK 942 | RBCK1 943 | RBPJ 944 | RBPJL 945 | RBSN 946 | REL 947 | RELA 948 | RELB 949 | REPIN1 950 | REST 951 | REXO4 952 | RFX1 953 | RFX2 954 | RFX3 955 | RFX4 956 | RFX5 957 | RFX6 958 | RFX7 959 | RFX8 960 | RHOXF1 961 | RHOXF2 962 | RHOXF2B 963 | RLF 964 | RORA 965 | RORB 966 | RORC 967 | RREB1 968 | RUNX1 969 | RUNX2 970 | RUNX3 971 | RXRA 972 | RXRB 973 | RXRG 974 | SAFB 975 | SAFB2 976 | SALL1 977 | SALL2 978 | SALL3 979 | SALL4 980 | SATB1 981 | SATB2 982 | SCMH1 983 | SCML4 984 | SCRT1 985 | SCRT2 986 | SCX 987 | SEBOX 988 | SETBP1 989 | SETDB1 990 | SETDB2 991 | SGSM2 992 | SHOX 993 | SHOX2 994 | SIM1 995 | SIM2 996 | SIX1 997 | SIX2 998 | SIX3 999 | SIX4 1000 | SIX5 1001 | SIX6 1002 | SKI 1003 | SKIL 1004 | SKOR1 1005 | SKOR2 1006 | SLC2A4RG 1007 | SMAD1 1008 | SMAD3 1009 | SMAD4 1010 | SMAD5 1011 | SMAD9 1012 | SMYD3 1013 | SNAI1 1014 | SNAI2 1015 | SNAI3 1016 | SNAPC2 1017 | SNAPC4 1018 | SNAPC5 1019 | SOHLH1 1020 | SOHLH2 1021 | SON 1022 | SOX1 1023 | SOX10 1024 | SOX11 1025 | SOX12 1026 | SOX13 1027 | SOX14 1028 | SOX15 1029 | SOX17 1030 | SOX18 1031 | SOX2 1032 | SOX21 1033 | SOX3 1034 | SOX30 1035 | SOX4 1036 | SOX5 1037 | SOX6 
1038 | SOX7 1039 | SOX8 1040 | SOX9 1041 | SP1 1042 | SP100 1043 | SP110 1044 | SP140 1045 | SP140L 1046 | SP2 1047 | SP3 1048 | SP4 1049 | SP5 1050 | SP6 1051 | SP7 1052 | SP8 1053 | SP9 1054 | SPDEF 1055 | SPEN 1056 | SPI1 1057 | SPIB 1058 | SPIC 1059 | SPZ1 1060 | SRCAP 1061 | SREBF1 1062 | SREBF2 1063 | SRF 1064 | SRY 1065 | ST18 1066 | STAT1 1067 | STAT2 1068 | STAT3 1069 | STAT4 1070 | STAT5A 1071 | STAT5B 1072 | STAT6 1073 | T 1074 | TAL1 1075 | TAL2 1076 | TBP 1077 | TBPL1 1078 | TBPL2 1079 | TBR1 1080 | TBX1 1081 | TBX10 1082 | TBX15 1083 | TBX18 1084 | TBX19 1085 | TBX2 1086 | TBX20 1087 | TBX21 1088 | TBX22 1089 | TBX3 1090 | TBX4 1091 | TBX5 1092 | TBX6 1093 | TCF12 1094 | TCF15 1095 | TCF20 1096 | TCF21 1097 | TCF23 1098 | TCF24 1099 | TCF3 1100 | TCF4 1101 | TCF7 1102 | TCF7L1 1103 | TCF7L2 1104 | TCFL5 1105 | TEAD1 1106 | TEAD2 1107 | TEAD3 1108 | TEAD4 1109 | TEF 1110 | TERB1 1111 | TERF1 1112 | TERF2 1113 | TET1 1114 | TET2 1115 | TET3 1116 | TFAP2A 1117 | TFAP2B 1118 | TFAP2C 1119 | TFAP2D 1120 | TFAP2E 1121 | TFAP4 1122 | TFCP2 1123 | TFCP2L1 1124 | TFDP1 1125 | TFDP2 1126 | TFDP3 1127 | TFE3 1128 | TFEB 1129 | TFEC 1130 | TGIF1 1131 | TGIF2 1132 | TGIF2LX 1133 | TGIF2LY 1134 | THAP1 1135 | THAP10 1136 | THAP11 1137 | THAP12 1138 | THAP2 1139 | THAP3 1140 | THAP4 1141 | THAP5 1142 | THAP6 1143 | THAP7 1144 | THAP8 1145 | THAP9 1146 | THRA 1147 | THRB 1148 | THYN1 1149 | TIGD1 1150 | TIGD2 1151 | TIGD3 1152 | TIGD4 1153 | TIGD5 1154 | TIGD6 1155 | TIGD7 1156 | TLX1 1157 | TLX2 1158 | TLX3 1159 | TMF1 1160 | TOPORS 1161 | TP53 1162 | TP63 1163 | TP73 1164 | TPRX1 1165 | TRAFD1 1166 | TRERF1 1167 | TRPS1 1168 | TSC22D1 1169 | TSHZ1 1170 | TSHZ2 1171 | TSHZ3 1172 | TTF1 1173 | TWIST1 1174 | TWIST2 1175 | UBP1 1176 | UNCX 1177 | USF1 1178 | USF2 1179 | USF3 1180 | VAX1 1181 | VAX2 1182 | VDR 1183 | VENTX 1184 | VEZF1 1185 | VSX1 1186 | VSX2 1187 | WIZ 1188 | WT1 1189 | XBP1 1190 | XPA 1191 | YBX1 1192 | YBX2 1193 | YBX3 1194 | YY1 1195 | YY2 1196 | 
ZBED1 1197 | ZBED2 1198 | ZBED3 1199 | ZBED4 1200 | ZBED5 1201 | ZBED6 1202 | ZBED9 1203 | ZBTB1 1204 | ZBTB10 1205 | ZBTB11 1206 | ZBTB12 1207 | ZBTB14 1208 | ZBTB16 1209 | ZBTB17 1210 | ZBTB18 1211 | ZBTB2 1212 | ZBTB20 1213 | ZBTB21 1214 | ZBTB22 1215 | ZBTB24 1216 | ZBTB25 1217 | ZBTB26 1218 | ZBTB3 1219 | ZBTB32 1220 | ZBTB33 1221 | ZBTB34 1222 | ZBTB37 1223 | ZBTB38 1224 | ZBTB39 1225 | ZBTB4 1226 | ZBTB40 1227 | ZBTB41 1228 | ZBTB42 1229 | ZBTB43 1230 | ZBTB44 1231 | ZBTB45 1232 | ZBTB46 1233 | ZBTB47 1234 | ZBTB48 1235 | ZBTB49 1236 | ZBTB5 1237 | ZBTB6 1238 | ZBTB7A 1239 | ZBTB7B 1240 | ZBTB7C 1241 | ZBTB8A 1242 | ZBTB8B 1243 | ZBTB9 1244 | ZC3H8 1245 | ZEB1 1246 | ZEB2 1247 | ZFAT 1248 | ZFHX2 1249 | ZFHX3 1250 | ZFHX4 1251 | ZFP1 1252 | ZFP14 1253 | ZFP2 1254 | ZFP28 1255 | ZFP3 1256 | ZFP30 1257 | ZFP37 1258 | ZFP41 1259 | ZFP42 1260 | ZFP57 1261 | ZFP62 1262 | ZFP64 1263 | ZFP69 1264 | ZFP69B 1265 | ZFP82 1266 | ZFP90 1267 | ZFP91 1268 | ZFP92 1269 | ZFPM1 1270 | ZFPM2 1271 | ZFX 1272 | ZFY 1273 | ZGLP1 1274 | ZGPAT 1275 | ZHX1 1276 | ZHX2 1277 | ZHX3 1278 | ZIC1 1279 | ZIC2 1280 | ZIC3 1281 | ZIC4 1282 | ZIC5 1283 | ZIK1 1284 | ZIM2 1285 | ZIM3 1286 | ZKSCAN1 1287 | ZKSCAN2 1288 | ZKSCAN3 1289 | ZKSCAN4 1290 | ZKSCAN5 1291 | ZKSCAN7 1292 | ZKSCAN8 1293 | ZMAT1 1294 | ZMAT4 1295 | ZNF10 1296 | ZNF100 1297 | ZNF101 1298 | ZNF107 1299 | ZNF112 1300 | ZNF114 1301 | ZNF117 1302 | ZNF12 1303 | ZNF121 1304 | ZNF124 1305 | ZNF131 1306 | ZNF132 1307 | ZNF133 1308 | ZNF134 1309 | ZNF135 1310 | ZNF136 1311 | ZNF138 1312 | ZNF14 1313 | ZNF140 1314 | ZNF141 1315 | ZNF142 1316 | ZNF143 1317 | ZNF146 1318 | ZNF148 1319 | ZNF154 1320 | ZNF155 1321 | ZNF157 1322 | ZNF16 1323 | ZNF160 1324 | ZNF165 1325 | ZNF169 1326 | ZNF17 1327 | ZNF174 1328 | ZNF175 1329 | ZNF177 1330 | ZNF18 1331 | ZNF180 1332 | ZNF181 1333 | ZNF182 1334 | ZNF184 1335 | ZNF189 1336 | ZNF19 1337 | ZNF195 1338 | ZNF197 1339 | ZNF2 1340 | ZNF20 1341 | ZNF200 1342 | ZNF202 1343 | ZNF205 1344 | ZNF207 
1345 | ZNF208 1346 | ZNF211 1347 | ZNF212 1348 | ZNF213 1349 | ZNF214 1350 | ZNF215 1351 | ZNF217 1352 | ZNF219 1353 | ZNF22 1354 | ZNF221 1355 | ZNF222 1356 | ZNF223 1357 | ZNF224 1358 | ZNF225 1359 | ZNF226 1360 | ZNF227 1361 | ZNF229 1362 | ZNF23 1363 | ZNF230 1364 | ZNF232 1365 | ZNF233 1366 | ZNF234 1367 | ZNF235 1368 | ZNF236 1369 | ZNF239 1370 | ZNF24 1371 | ZNF248 1372 | ZNF25 1373 | ZNF250 1374 | ZNF251 1375 | ZNF253 1376 | ZNF254 1377 | ZNF256 1378 | ZNF257 1379 | ZNF26 1380 | ZNF260 1381 | ZNF263 1382 | ZNF264 1383 | ZNF266 1384 | ZNF267 1385 | ZNF268 1386 | ZNF273 1387 | ZNF274 1388 | ZNF275 1389 | ZNF276 1390 | ZNF277 1391 | ZNF28 1392 | ZNF280A 1393 | ZNF280B 1394 | ZNF280C 1395 | ZNF280D 1396 | ZNF281 1397 | ZNF282 1398 | ZNF283 1399 | ZNF284 1400 | ZNF285 1401 | ZNF286A 1402 | ZNF286B 1403 | ZNF287 1404 | ZNF292 1405 | ZNF296 1406 | ZNF3 1407 | ZNF30 1408 | ZNF300 1409 | ZNF302 1410 | ZNF304 1411 | ZNF311 1412 | ZNF316 1413 | ZNF317 1414 | ZNF318 1415 | ZNF319 1416 | ZNF32 1417 | ZNF320 1418 | ZNF322 1419 | ZNF324 1420 | ZNF324B 1421 | ZNF326 1422 | ZNF329 1423 | ZNF331 1424 | ZNF333 1425 | ZNF334 1426 | ZNF335 1427 | ZNF337 1428 | ZNF33A 1429 | ZNF33B 1430 | ZNF34 1431 | ZNF341 1432 | ZNF343 1433 | ZNF345 1434 | ZNF346 1435 | ZNF347 1436 | ZNF35 1437 | ZNF350 1438 | ZNF354A 1439 | ZNF354B 1440 | ZNF354C 1441 | ZNF358 1442 | ZNF362 1443 | ZNF365 1444 | ZNF366 1445 | ZNF367 1446 | ZNF37A 1447 | ZNF382 1448 | ZNF383 1449 | ZNF384 1450 | ZNF385A 1451 | ZNF385B 1452 | ZNF385C 1453 | ZNF385D 1454 | ZNF391 1455 | ZNF394 1456 | ZNF395 1457 | ZNF396 1458 | ZNF397 1459 | ZNF398 1460 | ZNF404 1461 | ZNF407 1462 | ZNF408 1463 | ZNF41 1464 | ZNF410 1465 | ZNF414 1466 | ZNF415 1467 | ZNF416 1468 | ZNF417 1469 | ZNF418 1470 | ZNF419 1471 | ZNF420 1472 | ZNF423 1473 | ZNF425 1474 | ZNF426 1475 | ZNF428 1476 | ZNF429 1477 | ZNF43 1478 | ZNF430 1479 | ZNF431 1480 | ZNF432 1481 | ZNF433 1482 | ZNF436 1483 | ZNF438 1484 | ZNF439 1485 | ZNF44 1486 | ZNF440 1487 | 
ZNF441 1488 | ZNF442 1489 | ZNF443 1490 | ZNF444 1491 | ZNF445 1492 | ZNF446 1493 | ZNF449 1494 | ZNF45 1495 | ZNF451 1496 | ZNF454 1497 | ZNF460 1498 | ZNF461 1499 | ZNF462 1500 | ZNF467 1501 | ZNF468 1502 | ZNF469 1503 | ZNF470 1504 | ZNF471 1505 | ZNF473 1506 | ZNF474 1507 | ZNF479 1508 | ZNF48 1509 | ZNF480 1510 | ZNF483 1511 | ZNF484 1512 | ZNF485 1513 | ZNF486 1514 | ZNF487 1515 | ZNF488 1516 | ZNF490 1517 | ZNF491 1518 | ZNF492 1519 | ZNF493 1520 | ZNF496 1521 | ZNF497 1522 | ZNF500 1523 | ZNF501 1524 | ZNF502 1525 | ZNF503 1526 | ZNF506 1527 | ZNF507 1528 | ZNF510 1529 | ZNF511 1530 | ZNF512 1531 | ZNF512B 1532 | ZNF513 1533 | ZNF514 1534 | ZNF516 1535 | ZNF517 1536 | ZNF518A 1537 | ZNF518B 1538 | ZNF519 1539 | ZNF521 1540 | ZNF524 1541 | ZNF525 1542 | ZNF526 1543 | ZNF527 1544 | ZNF528 1545 | ZNF529 1546 | ZNF530 1547 | ZNF532 1548 | ZNF534 1549 | ZNF536 1550 | ZNF540 1551 | ZNF541 1552 | ZNF543 1553 | ZNF544 1554 | ZNF546 1555 | ZNF547 1556 | ZNF548 1557 | ZNF549 1558 | ZNF550 1559 | ZNF551 1560 | ZNF552 1561 | ZNF554 1562 | ZNF555 1563 | ZNF556 1564 | ZNF557 1565 | ZNF558 1566 | ZNF559 1567 | ZNF560 1568 | ZNF561 1569 | ZNF562 1570 | ZNF563 1571 | ZNF564 1572 | ZNF565 1573 | ZNF566 1574 | ZNF567 1575 | ZNF568 1576 | ZNF569 1577 | ZNF57 1578 | ZNF570 1579 | ZNF571 1580 | ZNF572 1581 | ZNF573 1582 | ZNF574 1583 | ZNF575 1584 | ZNF576 1585 | ZNF577 1586 | ZNF578 1587 | ZNF579 1588 | ZNF580 1589 | ZNF581 1590 | ZNF582 1591 | ZNF583 1592 | ZNF584 1593 | ZNF585A 1594 | ZNF585B 1595 | ZNF586 1596 | ZNF587 1597 | ZNF587B 1598 | ZNF589 1599 | ZNF592 1600 | ZNF594 1601 | ZNF595 1602 | ZNF596 1603 | ZNF597 1604 | ZNF598 1605 | ZNF599 1606 | ZNF600 1607 | ZNF605 1608 | ZNF606 1609 | ZNF607 1610 | ZNF608 1611 | ZNF609 1612 | ZNF610 1613 | ZNF611 1614 | ZNF613 1615 | ZNF614 1616 | ZNF615 1617 | ZNF616 1618 | ZNF618 1619 | ZNF619 1620 | ZNF620 1621 | ZNF621 1622 | ZNF623 1623 | ZNF624 1624 | ZNF625 1625 | ZNF626 1626 | ZNF627 1627 | ZNF628 1628 | ZNF629 1629 | ZNF630 
1630 | ZNF639 1631 | ZNF641 1632 | ZNF644 1633 | ZNF645 1634 | ZNF646 1635 | ZNF648 1636 | ZNF649 1637 | ZNF652 1638 | ZNF653 1639 | ZNF654 1640 | ZNF655 1641 | ZNF658 1642 | ZNF66 1643 | ZNF660 1644 | ZNF662 1645 | ZNF664 1646 | ZNF665 1647 | ZNF667 1648 | ZNF668 1649 | ZNF669 1650 | ZNF670 1651 | ZNF671 1652 | ZNF672 1653 | ZNF674 1654 | ZNF675 1655 | ZNF676 1656 | ZNF677 1657 | ZNF678 1658 | ZNF679 1659 | ZNF680 1660 | ZNF681 1661 | ZNF682 1662 | ZNF683 1663 | ZNF684 1664 | ZNF687 1665 | ZNF688 1666 | ZNF689 1667 | ZNF69 1668 | ZNF691 1669 | ZNF692 1670 | ZNF695 1671 | ZNF696 1672 | ZNF697 1673 | ZNF699 1674 | ZNF7 1675 | ZNF70 1676 | ZNF700 1677 | ZNF701 1678 | ZNF703 1679 | ZNF704 1680 | ZNF705A 1681 | ZNF705B 1682 | ZNF705D 1683 | ZNF705E 1684 | ZNF705G 1685 | ZNF706 1686 | ZNF707 1687 | ZNF708 1688 | ZNF709 1689 | ZNF71 1690 | ZNF710 1691 | ZNF711 1692 | ZNF713 1693 | ZNF714 1694 | ZNF716 1695 | ZNF717 1696 | ZNF718 1697 | ZNF721 1698 | ZNF724 1699 | ZNF726 1700 | ZNF727 1701 | ZNF728 1702 | ZNF729 1703 | ZNF730 1704 | ZNF732 1705 | ZNF735 1706 | ZNF736 1707 | ZNF737 1708 | ZNF74 1709 | ZNF740 1710 | ZNF746 1711 | ZNF747 1712 | ZNF749 1713 | ZNF750 1714 | ZNF75A 1715 | ZNF75D 1716 | ZNF76 1717 | ZNF761 1718 | ZNF763 1719 | ZNF764 1720 | ZNF765 1721 | ZNF766 1722 | ZNF768 1723 | ZNF77 1724 | ZNF770 1725 | ZNF771 1726 | ZNF772 1727 | ZNF773 1728 | ZNF774 1729 | ZNF775 1730 | ZNF776 1731 | ZNF777 1732 | ZNF778 1733 | ZNF780A 1734 | ZNF780B 1735 | ZNF781 1736 | ZNF782 1737 | ZNF783 1738 | ZNF784 1739 | ZNF785 1740 | ZNF786 1741 | ZNF787 1742 | ZNF788 1743 | ZNF789 1744 | ZNF79 1745 | ZNF790 1746 | ZNF791 1747 | ZNF792 1748 | ZNF793 1749 | ZNF799 1750 | ZNF8 1751 | ZNF80 1752 | ZNF800 1753 | ZNF804A 1754 | ZNF804B 1755 | ZNF805 1756 | ZNF808 1757 | ZNF81 1758 | ZNF813 1759 | ZNF814 1760 | ZNF816 1761 | ZNF821 1762 | ZNF823 1763 | ZNF827 1764 | ZNF829 1765 | ZNF83 1766 | ZNF830 1767 | ZNF831 1768 | ZNF835 1769 | ZNF836 1770 | ZNF837 1771 | ZNF84 1772 | ZNF841 1773 
| ZNF843 1774 | ZNF844 1775 | ZNF845 1776 | ZNF846 1777 | ZNF85 1778 | ZNF850 1779 | ZNF852 1780 | ZNF853 1781 | ZNF860 1782 | ZNF865 1783 | ZNF878 1784 | ZNF879 1785 | ZNF880 1786 | ZNF883 1787 | ZNF888 1788 | ZNF891 1789 | ZNF90 1790 | ZNF91 1791 | ZNF92 1792 | ZNF93 1793 | ZNF98 1794 | ZNF99 1795 | ZSCAN1 1796 | ZSCAN10 1797 | ZSCAN12 1798 | ZSCAN16 1799 | ZSCAN18 1800 | ZSCAN2 1801 | ZSCAN20 1802 | ZSCAN21 1803 | ZSCAN22 1804 | ZSCAN23 1805 | ZSCAN25 1806 | ZSCAN26 1807 | ZSCAN29 1808 | ZSCAN30 1809 | ZSCAN31 1810 | ZSCAN32 1811 | ZSCAN4 1812 | ZSCAN5A 1813 | ZSCAN5B 1814 | ZSCAN5C 1815 | ZSCAN9 1816 | ZUFSP 1817 | ZXDA 1818 | ZXDB 1819 | ZXDC 1820 | ZZZ3 1821 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' Compute OLS coefficients 2 | #' 3 | #' If the design matrix has full column-rank, then use the normal 4 | #' least squares estimate. Otherwise, use the Moore-Penrose inverse 5 | #' to compute the least squares estimate. 
#'
#' @param y Target vector (n x 1)/matrix (n x m)
#' @param x Design matrix (n x p)
#'
#' @details
#' The pseudo-inverse is used when `x` has fewer rows than columns and as a
#' fallback whenever the normal equations cannot be solved directly (e.g.
#' because of collinear columns). Rank-deficient designs with `n >= p`
#' therefore yield the minimum-norm least squares solution instead of an
#' error.
#'
#' @return Matrix (p x m) of OLS coefficients; a single column if `y` is a
#'   vector
#'
#' @keywords internal
coef_ols <- function(y, x) {
  # Pre-compute quantities
  n <- nrow(x)
  p <- ncol(x)
  xtx <- crossprod(x)
  xty <- crossprod(x, y)

  # Least squares solution using the pseudo-inverse of xtx
  solve_pinv <- function() {
    xtx_svd <- svd(xtx)
    d <- xtx_svd$d
    # Only invert singular values that are numerically non-zero
    keep <- d > .Machine$double.eps * p * max(d)
    d_inv <- numeric(length(d))
    d_inv[keep] <- 1 / d[keep]
    xtx_inv <- (
      xtx_svd$v %*% diag(d_inv, nrow = p, ncol = p) %*% t(xtx_svd$u)
    )

    xtx_inv %*% xty
  }

  if (n < p) {
    # xtx cannot have full rank
    return(solve_pinv())
  }

  # Solve the normal equations directly; `solve` signals an error if xtx is
  # singular, in which case the pseudo-inverse is used instead
  tryCatch(
    solve(xtx, xty),
    error = function(e) solve_pinv()
  )
}

#' Compute ridge regression coefficients
#'
#' Solves the penalised normal equations `(X'X + lambda * I) beta = X'y`.
#'
#' @param y Target vector (n x 1)/matrix (n x m)
#' @param x Design matrix (n x p)
#' @param lambda Positive parameter for ridge penalty
#'
#' @return Matrix (p x m) of ridge regression coefficients; a single column
#'   if `y` is a vector
#'
#' @keywords internal
coef_ridge <- function(y, x, lambda) {
  n_coef <- ncol(x)
  penalised_gram <- crossprod(x) + diag(lambda, n_coef, n_coef)

  # Compute ridge regression solution directly
  solve(penalised_gram, crossprod(x, y))
}

#' Quick'n'dirty progress bar
#'
#' Creates a progress bar and returns it as a string.
#'
#' @param step current step being worked on
#' @param n_steps total number of steps
#' @param name name of the process; inserted verbatim, i.e. it is not
#'   interpreted as a format string
#' @param finished whether the process is finished
#' @param progress_length length of the progress bar in ascii signs
#'
#' @return A string formatted as a progress bar
#'
#' @keywords internal
progstr <- function(step, n_steps, name,
                    finished = FALSE, progress_length = 20L) {
  steps_done <- floor(progress_length * (step - 1) / n_steps)
  # Keep the bar within its bounds for out-of-range steps
  steps_done <- min(max(steps_done, 0), progress_length)

  # The not yet completed part is drawn as a line while running and is
  # filled up once the process has finished
  if (finished) {
    rest_symbol <- cli::col_blue(cli::symbol$square)
  } else {
    rest_symbol <- cli::symbol$line
  }

  parts <- c(
    "|",
    rep.int(cli::col_blue(cli::symbol$square), steps_done),
    rep.int(rest_symbol, progress_length - steps_done),
    "| ",
    # Only the counter is formatted; `name` is appended as-is so that
    # a `%` in the name cannot break the format string
    cli::col_grey(sprintf("%d/%d ", step, n_steps), name)
  )

  paste0(parts, collapse = "")
}

#' Format count table nicely
#'
#' @param counts a list of count vectors with `1 + n_cl` entries each.
#'   `NA` values are replaced with `-`
#' @param title title above the table
#' @param row_names a vector of row names, one for each count vector
#' @param col_width minimum width for columns
#'
#' @details
#' If the table is wider than the console, its columns are split into
#' several sub-tables that are placed below each other. Each sub-table
#' contains at least one column, even if the console is narrower than that
#' column.
#'
#' @return A string formatted as a table
#'
#' @keywords internal
count_table <- function(counts,
                        title,
                        row_names,
                        col_width = 5) {
  nms <- c("Noise", as.character(seq_len(length(counts[[1]]) - 1)))
  # Convert to strings and replace NA with `-`
  counts_chr <- lapply(counts, function(cn) {
    cn <- as.character(cn)
    cn[is.na(cn)] <- "-"
    cn
  })

  stopifnot(length(row_names) == length(counts_chr))

  # Column widths: widest entry per column, but at least `col_width`
  cws <- c(
    max(
      nchar("Noise"),
      vapply(counts_chr, function(cn) nchar(cn[1]), integer(1)),
      col_width
    ),
    do.call(pmax, c(
      list(nchar(nms[-1])),
      unname(lapply(counts_chr, function(cn) nchar(cn[-1]))),
      list(col_width)
    ))
  )

  # longest row name
  width_row_nms <- max(nchar(c("Module", row_names)))

  fmt_strs <- sprintf("%%%ds", cws)
  # Two spaces + longest row name + two spaces (matches the width
  # computation below)
  fmt_row_str <- sprintf("  %%%ds  ", width_row_nms)

  width <- cli::console_width()

  # Distribute the columns over sub-tables that fit into the console.
  # Every sub-table takes at least one column so that the loop always
  # terminates, even for very narrow consoles.
  n_cols <- length(cws)
  tbl_rows <- list()
  start <- 1L
  while (start <= n_cols) {
    rest <- cws[start:n_cols]
    # Row label + columns, each but the first preceded by a 3 char separator
    tbl_widths <- (
      2 + width_row_nms + 2
      + cumsum(rest + c(0, rep(3, length(rest) - 1L)))
    )
    n_fit <- max(1L, sum(tbl_widths <= width))
    tbl_rows[[length(tbl_rows) + 1L]] <- seq.int(start, length.out = n_fit)
    start <- start + n_fit
  }

  # Pad all entries to their column width and grey-out `-` and `0`s
  counts_fmt <- lapply(counts_chr, function(cn) {
    cn_out <- sprintf(fmt_strs, cn)
    is_dash <- cn == "-"
    is_zero <- cn == "0"
    cn_out[is_dash] <- cli::col_grey(sprintf(fmt_strs[is_dash], "-"))
    cn_out[is_zero] <- cli::col_grey(sprintf(fmt_strs[is_zero], "0"))
    cn_out
  })

  col_sep <- cli::col_grey(" | ")

  # Render one sub-table (header line + one line per count vector)
  sub_tables <- vapply(tbl_rows, function(elems) {
    header <- paste0(
      cli::col_blue(sprintf(fmt_row_str, "Module")),
      cli::col_grey(
        paste0(sprintf(fmt_strs[elems], nms[elems]), collapse = col_sep)
      )
    )
    body <- vapply(seq_along(counts_fmt), function(i) {
      paste0(
        cli::col_blue(sprintf(fmt_row_str, row_names[i])),
        paste0(counts_fmt[[i]][elems], collapse = col_sep)
      )
    }, character(1))

    paste0(paste(c(header, body), collapse = "\n"), "\n")
  }, character(1))

  paste(
    cli::col_grey(sprintf("# %s", title)),
    paste(sub_tables, collapse = "\n"),
    sep = "\n"
  )
}

#' Compute indicator matrix of pairwise distances smaller than threshold
#'
#' Computes the Jaccard distance between rows of a matrix and returns a
#' sparse symmetric indicator matrix containing the entries with a distance
#' of less than a given upper bound. Note that the diagonal is always 1.
#'
#' @param x the input matrix with vectors to be compared in the rows.
#' @param upper_bnd pairs with a Jaccard distance below this upper bound are
#'   returned as 1 while all others receive the entry 0.
#'
#' @return A sparse symmetric pattern matrix (class `ngCMatrix`) with as many
#'   rows and columns as `x` has rows. Entry (a, b) is set if rows a and b
#'   of `x` were reported as closer than `upper_bnd`; the diagonal is always
#'   set.
#'
#' @keywords internal
jaccard_indicator <- function(x, upper_bnd = 0.8) {
  # Treat matrix as sparse pattern matrix
  x <- methods::as(x, "ngCMatrix")

  # Dimension along which pairwise distances are computed
  n <- x@Dim[1]

  # Retrieve row and column indices of non-zero entries
  # (`Matrix::summary` reports them 1-based)
  xs <- Matrix::summary(x)

  # Split column indices by row indices
  # -> jsplit has exactly n entries, entry l holding the column indices of
  #    the non-zero entries in row l of x
  # -> Using a factor with all n row indices as levels makes sure that rows
  #    without non-zero entries result in an empty vector instead of being
  #    left out.
  jsplit <- unname(split(xs$j, factor(xs$i, levels = seq_len(n))))

  # Run actual computation of Jaccard distances and save those
  # entries that have distance below the upper_bnd.
  # NOTE(review): assumes that the returned `i` and `j` are 1-based
  # positions in `jsplit`, as required by `sparseMatrix` below -- confirm
  # against the implementation of `jaccard_indicator_comp`.
  out <- jaccard_indicator_comp(
    jsplit,
    eps = upper_bnd
  )

  # Form the indicator matrix
  methods::as(Matrix::sparseMatrix(
    c(out$i, out$j),
    c(out$j, out$i),
    dims = c(n, n)
  ) + Matrix::Diagonal(n), "ngCMatrix")
}

#' Determine initial centers for the kmeans++ algorithm
#'
#' The first center is drawn uniformly at random. Each further center is
#' drawn from the remaining observations with probability proportional to
#' the squared distance to the closest center chosen so far. If all
#' remaining observations coincide with a center, the next center is drawn
#' uniformly from them.
#'
#' @param n_cluster number of initial centers to determine; between 1 and
#'   the number of observations
#' @param x data matrix to be clustered
#' @param dm distance matrix (between rows of x; of class "dist")
#'
#' @return Row indices of initial cluster centers of x
#'
#' @keywords internal
kmeanspp_init <- function(n_cluster, x = NULL, dm = NULL) {
  if (is.null(x) == is.null(dm)) {
    stop("Exactly one of x or dm needs to be supplied")
  }

  if (!is.null(x)) {
    dm <- stats::dist(x)
  }

  n <- attr(dm, "Size")

  if (n_cluster < 1L || n_cluster > n) {
    stop("n_cluster needs to be between 1 and the number of observations")
  }

  # Distances between observation `center` and all n observations, read
  # directly from the lower triangle stored column-wise in `dm`. This
  # avoids expanding `dm` to a full n x n matrix.
  dist_from <- function(center) {
    lower_idx <- seq_len(center - 1L)
    upper_idx <- center + seq_len(n - center)

    out <- numeric(n)
    out[lower_idx] <- dm[
      n * (lower_idx - 1)
      - lower_idx * (lower_idx - 1) / 2
      + center
      - lower_idx
    ]
    out[upper_idx] <- dm[
      n * (center - 1)
      - center * (center - 1) / 2
      + upper_idx
      - center
    ]
    out
  }

  centers <- sample(n, size = 1L)
  # Distance of each observation to its closest center
  min_dist <- dist_from(centers)

  for (k in seq_len(n_cluster - 1L)) {
    remaining_obs <- setdiff(seq_len(n), centers)

    # Normalise the squared distances on the log-scale
    log_ws_sq <- log(min_dist[remaining_obs]^2)
    max_log_ws_sq <- max(log_ws_sq)
    if (is.finite(max_log_ws_sq)) {
      ps <- exp(log_ws_sq - max_log_ws_sq)
      ps <- ps / sum(ps)
    } else {
      # No usable weights, e.g. all remaining observations coincide
      # with a center
      ps <- rep(1 / length(remaining_obs), length(remaining_obs))
    }

    new_center <- remaining_obs[
      sample.int(length(remaining_obs), size = 1, prob = ps)
    ]
    centers <- c(centers, new_center)
    min_dist <- pmin(min_dist, dist_from(new_center))
  }

  centers
}

#' Perform the k-means++ algorithm
#'
#' Performs the k-means++ algorithm to cluster the rows of the input
#' matrix.
#'
#' The clustering is repeated for `n_init_clusterings` random k-means++
#' initialisations and the run with the smallest total within-cluster sum
#' of squares is kept.
#'
#' @param x Input matrix (n x p)
#' @param n_cluster Number of clusters
#' @param n_init_clusterings Number of repeated random initializations
#'   to perform
#' @param n_max_iter Number of maximum iterations to perform in the k-means
#'   algorithm
#'
#' @return An integer vector of length n with the cluster assignment of each
#'   row of `x`, i.e. the `cluster` component of the best
#'   [`stats::kmeans`] fit.
#'
#' @references
#' David Arthur and Sergei Vassilvitskii. K-Means++: The advantages
#' of careful seeding. In Proceedings of the Eighteenth Annual ACM-SIAM
#' Symposium on Discrete Algorithms, SODA '07, pages 1027–1035.
#' Society for Industrial and Applied Mathematics, 2007.
#'
#' @concept helpers
#'
#' @export
kmeanspp <- function(x, n_cluster, n_init_clusterings = 10L, n_max_iter = 10L) {
  # The distance matrix is computed once and shared by all initialisations
  dm <- stats::dist(x)
  initial_center_indices <- lapply(
    seq_len(n_init_clusterings),
    function(i) {
      kmeanspp_init(n_cluster, dm = dm)
    }
  )
  # Remove reference to dm
  dm <- NULL

  clusterings <- lapply(
    initial_center_indices,
    function(center_idx) {
      stats::kmeans(
        x,
        centers = x[center_idx, , drop = FALSE],
        iter.max = n_max_iter
      )
    }
  )

  # Keep the run with the smallest total within-cluster sum of squares
  tot_withinss <- vapply(
    clusterings, function(cl) cl$tot.withinss, numeric(1)
  )
  clusterings[[which.min(tot_withinss)]]$cluster
}

#' Determine module sizes
#'
#' @param module Vector of module indices
#' @param n_modules Total number of modules
#'
#' @return A named vector containing the name of the module (its index or
#'   `"Noise"`) and the number of elements in that module
#'
#' @concept helpers
#'
#' @export
find_module_sizes <- function(module, n_modules) {
  # The noise module is encoded by the index -1 and is always listed first
  module_ids <- c(-1L, seq_len(n_modules))
  sizes <- vapply(module_ids, function(id) sum(module == id), integer(1))

  stats::setNames(sizes, c("Noise", seq_len(n_modules)))
}

#' Remove empty modules
#'
#' @details
#' Only iterates through modules with positive index, leaving the noise
#' module untouched. Non-empty modules are renumbered consecutively in the
#' order in which they first appear in `module`.
#'
#' @param module Vector of module indices
#'
#' @return The updated vector of module indices with empty modules removed.
#'
#' @keywords internal
remove_empty_modules <- function(module) {
  positive_labels <- unique(module[module > 0])

  # Nothing to do if the positive labels are already 1, ..., max(module)
  if (max(module) <= length(positive_labels)) {
    return(module)
  }

  # Replace each positive label by its position among the labels in use
  relabelled <- module
  positive_idx <- which(module > 0)
  relabelled[positive_idx] <- match(module[positive_idx], positive_labels)

  relabelled
}

#' Extract target gene modules for given penalization parameters
#'
#' @param fit An object of class `scregclust`
#' @param penalization A numeric vector of penalization parameters.
#'   The penalization parameters specified here must have
#'   been used during fitting of the `fit` object.
#'
#' @return A list of lists of final target modules. One list for each
#'   parameter in `penalization`. The lists contain the modules of
#'   target genes for each final configuration.
#'
#' @concept utilities
#'
#' @export
get_target_gene_modules <- function(fit, penalization = NULL) {
  fitted_penalization <- fit$penalization

  # Every requested parameter has to be among the fitted ones
  is_fitted <- penalization %in% fitted_penalization
  if (any(!is_fitted)) {
    cli::cli_abort(c(
      "Not all parameter values in {.var penalization} have been fitted.",
      "i" = paste(
        "Penalization parameters in {.class scregclust} object:",
        "{fit$penalization}"
      ),
      "i" = "Penalization parameters provided: {penalization}"
    ))
  }

  # Without an explicit selection all fitted parameters are used. Results are
  # returned in the order of the fitted parameters.
  selected <- if (is.null(penalization)) {
    seq_along(fitted_penalization)
  } else {
    which(fitted_penalization %in% penalization)
  }

  modules <- vector("list", length(selected))
  for (pos in seq_along(selected)) {
    res <- fit$results[[selected[pos]]]
    # Keep only the module assignments of the target genes
    modules[[pos]] <- lapply(res$output, function(config) {
      config$module[!res$is_regulator]
    })
  }

  modules
}

#' Create a table of module overlap for two clusterings
#'
#' Compares two clusterings and creates a table of overlap between them.
#' Module labels do not have to match.
#'
#' @param k1 First clustering
#' @param k2 Second clustering
#'
#' @return A matrix showing the module overlap with the labels of `k1` in
#'   the columns and the labels of `k2` in the rows.
#'
#' @concept helpers
#'
#' @export
cluster_overlap <- function(k1, k2) {
  if (length(k1) != length(k2)) {
    cli::cli_abort(c(
      "Clusterings are not the same length.",
      "i" = "Length of {.var k1}: {length(k1)}",
      "i" = "Length of {.var k2}: {length(k2)}"
    ))
  }

  labels_k1 <- sort(unique(k1))
  labels_k2 <- sort(unique(k2))

  # Without any label in `k1` (e.g. empty clusterings) there is no column
  # to build. Return an empty overlap table instead of failing when the
  # column names are set.
  if (length(labels_k1) == 0L) {
    out <- matrix(integer(0), nrow = length(labels_k2), ncol = 0L)
    if (length(labels_k2) > 0L) {
      rownames(out) <- labels_k2
    }
    return(out)
  }

  # One column per label of `k1`; each column counts, for every label of
  # `k2`, the observations carrying both labels.
  count_column <- function(label_k1) {
    in_k1 <- k1 == label_k1
    counts <- vapply(labels_k2, function(label_k2) {
      sum(in_k1 & (k2 == label_k2))
    }, 1L)
    stats::setNames(counts, labels_k2)
  }

  out <- do.call(cbind, lapply(labels_k1, count_column))
  colnames(out) <- labels_k1

  out
}

#' Extract final configurations into a data frame
#'
#' @param obj An object of class `scregclust`
#'
#' @return A [`data.frame`] containing penalization parameters and
#'   the number of final configurations for those penalizations.
#'
#' @concept helpers
#'
#' @export
available_results <- function(obj) {
  # `vapply()` keeps the column an integer vector even if there are no results
  n_configs <- vapply(
    obj$results, function(res) length(res$output), integer(1)
  )

  data.frame(
    penalization = obj$penalization,
    final_configurations = n_configs
  )
}

#' Fast computation of correlation
#'
#' This uses a more memory-intensive but much faster algorithm than
#' the built-in `cor` function.
#'
#' Computes the correlation between the columns of `x` and `y`.
#'
#' @param x first input matrix
#' @param y second input matrix (with the same number of rows as `x`)
#'
#' @return Correlation matrix between the columns of `x` and `y`
#'
#' @concept helpers
#'
#' @export
fast_cor <- function(x, y) {
  # Center the columns; the correlations are then the cross-products of the
  # centered columns divided by the product of their norms.
  x_centered <- scale(x, center = TRUE, scale = FALSE)
  y_centered <- scale(y, center = TRUE, scale = FALSE)
  x_sum_sq <- colSums(x_centered * x_centered)
  y_sum_sq <- colSums(y_centered * y_centered)
  rho <- crossprod(x_centered, y_centered) / sqrt(outer(x_sum_sq, y_sum_sq))

  # Rounding errors can push values slightly outside of [-1, 1]
  pmax(pmin(rho, 1), -1)
}

#' Return the number of final configurations
#'
#' Returns the number of final configurations per penalization parameter in an
#' scRegClust object.
#'
#' @param fit An object of class `scRegClust`
#'
#' @return An integer vector containing the number of final configurations
#'   for each penalization parameter.
#'
#' @concept utilities
#'
#' @export
get_num_final_configs <- function(fit) {
  # `vapply()` guarantees an integer vector, also when there are no results
  vapply(fit$results, function(r) length(r$output), integer(1))
}

#' Get the average number of active regulators per module
#'
#' @param fit An object of class `scRegClust`
#'
#' @return A [`data.frame`] containing the average number of active regulators
#'   per module for each penalization parameter.
#'
#' @concept utilities
#'
#' @export
get_avg_num_regulators <- function(fit) {
  rows <- lapply(fit$results, function(r) {
    # One row per final configuration holding the number of active
    # regulators of each module (column sums of the `models` indicator)
    per_config <- lapply(r$output, function(o) {
      stats::setNames(
        colSums(o$models),
        seq_len(ncol(o$models))
      )
    })

    # Average over the final configurations of this penalization parameter
    c(
      penalization = r$penalization,
      colMeans(do.call(rbind, per_config))
    )
  })

  as.data.frame(do.call(rbind, rows))
}

#' Compute the Rand index
#'
#' @param k1 First clustering as vector of integers
#' @param k2 Second clustering as vector of integers
#'
#' @return The Rand index as a numeric value
#'
#' @references
#' W. M. Rand (1971). "Objective criteria for the evaluation of clustering
#' methods". Journal of the American Statistical Association 66 (336): 846–850.
#' DOI:10.2307/2284239
#'
#' @keywords internal
compute_rand_index <- function(k1, k2) {
  n <- length(k1)

  # Assertion
  stopifnot(length(k2) == n)
  stopifnot(is.numeric(k1), all(as.integer(k1) == k1))
  stopifnot(is.numeric(k2), all(as.integer(k2) == k2))

  # Count pairs through the contingency table instead of enumerating all
  # n * (n - 1) / 2 pairs explicitly, which needs memory quadratic in n.
  ct <- table(k1, k2)
  n_pairs <- choose(n, 2)

  # Pairs placed in the same cluster by both, by k1 and by k2, respectively
  same_in_both <- sum(choose(as.vector(ct), 2))
  same_in_k1 <- sum(choose(rowSums(ct), 2))
  same_in_k2 <- sum(choose(colSums(ct), 2))

  # Pairs separated by both clusterings (inclusion-exclusion)
  separated_in_both <- n_pairs - same_in_k1 - same_in_k2 + same_in_both

  # Compute Rand index: fraction of pairs on which the clusterings agree
  (same_in_both + separated_in_both) / n_pairs
}

#' Compute Hubert's and Arabie's Adjusted Rand index
#'
#' @param k1 First clustering as vector of integers
#' @param k2 Second clustering as vector of integers
#'
#' @return The Adjusted Rand index as a numeric value
#'
#' @references
#' Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions".
#' Journal of Classification. 2 (1): 193–218.
#' DOI:10.1007/BF01908075
#'
#' @keywords internal
compute_adjusted_rand_index <- function(k1, k2) {
  n <- length(k1)

  # Assertion
  stopifnot(length(k2) == n)
  stopifnot(is.numeric(k1), all(as.integer(k1) == k1))
  stopifnot(is.numeric(k2), all(as.integer(k2) == k2))

  # Construct contingency table
  ct <- table(k1, k2)

  # Degenerate case: both clusterings consist of a single cluster or both
  # consist of singletons only. The partitions are then identical, but the
  # formula below evaluates to 0 / 0. Report perfect agreement instead of NaN.
  if (all(dim(ct) == 1L) || all(dim(ct) == n)) {
    return(1)
  }

  # Compute binomial pair sums
  sum_as <- sum(choose(rowSums(ct), 2))
  sum_bs <- sum(choose(colSums(ct), 2))
  sum_ns <- sum(choose(as.vector(ct), 2))
  denom <- choose(n, 2)

  # Expected value of the index under random labelling and its maximum
  expected_index <- (sum_as * sum_bs) / denom
  max_index <- 0.5 * (sum_as + sum_bs)

  # Compute adjusted Rand index
  (sum_ns - expected_index) / (max_index - expected_index)
}

#' Compute Rand indices
#'
#' Compute Rand indices for fitted scregclust object
#'
#' @param fit An object of class `scregclust`
#' @param groundtruth A known clustering of the target genes (integer vector)
#' @param adjusted If TRUE, the Adjusted Rand index is computed. Otherwise the
#'   ordinary Rand index is computed.
#'
#' @return A [`data.frame`] containing the Rand indices. Since there can
#'   be more than one final configuration for some penalization
#'   parameters, Rand indices are averaged for each fixed penalization
#'   parameter. Returned are the mean, standard deviation and number
#'   of final configurations that were averaged.
#'
#' @references
#' W. M. Rand (1971). "Objective criteria for the evaluation of clustering
#' methods". Journal of the American Statistical Association 66 (336): 846–850.
#' DOI:10.2307/2284239
#'
#' Lawrence Hubert and Phipps Arabie (1985). "Comparing partitions".
#' Journal of Classification. 2 (1): 193–218.
#' DOI:10.1007/BF01908075
#'
#' @concept utilities
#'
#' @export
get_rand_indices <- function(fit, groundtruth, adjusted = TRUE) {
  index_fun <- if (adjusted) {
    compute_adjusted_rand_index
  } else {
    compute_rand_index
  }

  summaries <- lapply(get_target_gene_modules(fit), function(cs) {
    # One index per final configuration. `vapply()` keeps the result numeric
    # even if a penalization parameter has no final configuration.
    indices <- vapply(cs, function(cl) {
      stopifnot(length(groundtruth) == length(cl))

      # Genes in the noise module (-1) are excluded from the comparison
      keep <- !(cl %in% -1)
      n_kept <- sum(keep)
      if (n_kept == 0L) {
        return(0)
      }

      # Down-weight the index by the fraction of genes that were clustered
      index_fun(groundtruth[keep], cl[keep]) * n_kept / length(cl)
    }, numeric(1))

    data.frame(
      mean = mean(indices),
      sd = stats::sd(indices),
      n = length(indices)
    )
  })

  cbind(
    data.frame(penalization = fit$penalization),
    do.call(rbind, summaries)
  )
}