├── R ├── AllGenerics.R ├── AllClasses.R ├── GeneMatrix-methods.R ├── utilities.R ├── network.R ├── vectorized_tests.R ├── CompareY.R ├── GOMembershipMatrix.R ├── TestAssociation.R └── treemap.R ├── .Rbuildignore ├── LICENSE ├── .gitignore ├── tests ├── test-all.R └── testthat │ └── test-GSEAL.R ├── data └── factorial.rda ├── _vignettes ├── GSEAMA.pdf ├── brauer.Rmd └── GSEAMA.Rnw ├── GSEAMA.Rproj ├── man ├── CompareTopSets.Rd ├── trim_ellipses.Rd ├── mean_difference.Rd ├── GenerateNetwork.Rd ├── sparse_cast_.Rd ├── find_minimal_rooted_tree.Rd ├── assign_genes_to_paths.Rd ├── GetGORelatives.Rd ├── PlotNetwork.Rd ├── ExtractGenes.Rd ├── vectorized_wilcoxon_test.Rd ├── get_ancestry_matrix.Rd ├── CompareTopColumns.Rd ├── vectorized_t_test.Rd ├── GetEdgesTable.Rd ├── CompareY.Rd ├── ThresholdSets.Rd ├── sparse_cast.Rd ├── GenerateTreemap.Rd ├── TestAssociation.Rd └── GOMembershipMatrix.Rd ├── README.mkd ├── DESCRIPTION ├── NAMESPACE └── inst └── tests └── test-wilcoxon.R /R/AllGenerics.R: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016 2 | COPYRIGHT HOLDER: David Robinson 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | inst/doc 5 | -------------------------------------------------------------------------------- /tests/test-all.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | test_check("GSEAMA") 4 | -------------------------------------------------------------------------------- /data/factorial.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgrtwo/GSEAMA/HEAD/data/factorial.rda -------------------------------------------------------------------------------- /_vignettes/GSEAMA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgrtwo/GSEAMA/HEAD/_vignettes/GSEAMA.pdf -------------------------------------------------------------------------------- /GSEAMA.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /man/CompareTopSets.Rd: -------------------------------------------------------------------------------- 1 | \name{CompareTopSets} 2 | \alias{CompareTopSets} 3 | \title{Create a graph comparing the top n columns (in terms of association with y)} 4 | \usage{ 5 | CompareTopSets(m, n = 9, ...) 6 | } 7 | \arguments{ 8 | \item{m}{GeneMatrix object} 9 | 10 | \item{n}{Number of genes to compare} 11 | 12 | \item{...}{Additional arguments to be passed to CompareY} 13 | } 14 | \description{ 15 | Create a graph comparing the top n columns (in terms of 16 | association with y) 17 | } 18 | 19 | -------------------------------------------------------------------------------- /man/trim_ellipses.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{trim_ellipses} 4 | \alias{trim_ellipses} 5 | \title{Trim the length of a character vector, adding ellipses to trimmed elements} 6 | \usage{ 7 | trim_ellipses(x, width) 8 | } 9 | \arguments{ 10 | \item{x}{character vector} 11 | 12 | \item{width}{length of string to trim to} 13 | } 14 | \description{ 15 | This utility function is used to shorten column names so they can appear 16 | on a figure. 17 | } 18 | 19 | -------------------------------------------------------------------------------- /man/mean_difference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TestAssociation.R 3 | \name{mean_difference} 4 | \alias{mean_difference} 5 | \title{Compute mean differences between "in a set" and "not in a set"} 6 | \usage{ 7 | mean_difference(m) 8 | } 9 | \arguments{ 10 | \item{m}{GeneMatrix object} 11 | } 12 | \description{ 13 | For each set in a GeneMatrix, compute the difference in y between 14 | "in the set" 15 | and "outside the set". It will be set in the MeanDifference column 16 | } 17 | 18 | -------------------------------------------------------------------------------- /R/AllClasses.R: -------------------------------------------------------------------------------- 1 | ################################## 2 | ######## AllClasses.R 3 | ######## 4 | ######## all classes in GSEAMA 5 | ################################## 6 | #################################################################### 7 | 8 | setClass("GeneMatrix", 9 | representation( 10 | matrix = "Matrix", 11 | colData = "ANY", 12 | geneData = "ANY", 13 | fit = "ANY", 14 | rankingMetric = "character", 15 | effectMetric = "character", 16 | plottingMetric = "character", 17 | assocMethod = "character" 18 | ) 19 | ) 20 | -------------------------------------------------------------------------------- /README.mkd: -------------------------------------------------------------------------------- 1 | Gene Set Enrichment Analysis Made Awesome 2 | ============================= 3 | 4 | Installation 5 | ------------- 6 | 7 | First install the Bioconductor dependencies: 8 | 9 | source("http://bioconductor.org/biocLite.R") 10 | biocLite(c("GO.db", "AnnotationDbi", "qvalue")) 11 | 12 | Then install the [devtools](https://github.com/hadley/devtools) package, and use it to install GSEAMA: 13 | 14 | install.packages("devtools") 15 | devtools::install_github("dgrtwo/GSEAMA") 16 | 17 | The package vignette can be downloaded [here](https://github.com/dgrtwo/GSEAMA/blob/master/_vignettes/GSEAMA.pdf?raw=true). 18 | -------------------------------------------------------------------------------- /man/GenerateNetwork.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/network.R 3 | \name{GenerateNetwork} 4 | \alias{GenerateNetwork} 5 | \title{Create an igraph from a GeneMatrix representing GO terms} 6 | \usage{ 7 | GenerateNetwork(m, edges = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{m}{A GeneMatrix object} 11 | 12 | \item{edges}{A table of edges to plot. If not given, compute with 13 | \code{\link{GetEdgesTable}}} 14 | 15 | \item{...}{Extra arguments passed on to GetEdgesTable} 16 | } 17 | \value{ 18 | An igraph object 19 | } 20 | \description{ 21 | Create an igraph from a GeneMatrix representing GO terms 22 | } 23 | 24 | -------------------------------------------------------------------------------- /man/sparse_cast_.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{sparse_cast_} 4 | \alias{sparse_cast_} 5 | \title{Standard-evaluation version of sparse_cast} 6 | \usage{ 7 | sparse_cast_(data, row_col, column_col, value_col = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A tbl} 11 | 12 | \item{row_col}{String version of column to use as row names} 13 | 14 | \item{column_col}{String version of column to use as column names} 15 | 16 | \item{value_col}{String version of column to use as sparse matrix values, 17 | or a numeric value to use} 18 | } 19 | \description{ 20 | Standard-evaluation version of sparse_cast 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/find_minimal_rooted_tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treemap.R 3 | \name{find_minimal_rooted_tree} 4 | \alias{find_minimal_rooted_tree} 5 | \title{Find Minimal Rooted Tree} 6 | \usage{ 7 | find_minimal_rooted_tree(nodes, edges) 8 | } 9 | \arguments{ 10 | \item{nodes}{data_frame significance of gene sets} 11 | 12 | \item{edges}{data_frame go_id1 are parents of go_id2} 13 | } 14 | \value{ 15 | an igraph object of the gene set minimal spanning tree 16 | } 17 | \description{ 18 | Removes all loops so that gene sets can be nested within one another: creates an optimal directed acyclic graph 19 | pointing from general GO terms down to specific ones. 20 | } 21 | 22 | -------------------------------------------------------------------------------- /man/assign_genes_to_paths.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treemap.R 3 | \name{assign_genes_to_paths} 4 | \alias{assign_genes_to_paths} 5 | \title{Assign Genes to Paths} 6 | \usage{ 7 | assign_genes_to_paths(m, minimal_edge_set, ...) 8 | } 9 | \arguments{ 10 | \item{m}{GeneMatrix object representing GO terms} 11 | 12 | \item{minimal_edge_set}{a network which is a weighted directed minimal spanning tree of gene ontologies} 13 | 14 | \item{...}{Extra arguments passed on to threshold_sets} 15 | } 16 | \value{ 17 | a data_frame which contains which terminal gene set each gene is assigned to 18 | } 19 | \description{ 20 | Assign genes to a GO hierarchy which has the greatest signal 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/GetGORelatives.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GOMembershipMatrix.R 3 | \name{GetGORelatives} 4 | \alias{GetGORelatives} 5 | \title{Get all GO IDs in all_IDs that are ancestors/descendants of those in IDs} 6 | \usage{ 7 | GetGORelatives(IDs, all_IDs, ancestors = TRUE, direct = FALSE, 8 | combine = TRUE) 9 | } 10 | \arguments{ 11 | \item{all_IDs}{A vector of GO IDs to look for relatives of x} 12 | 13 | \item{ancestors}{If true look among ancestors, if false look for children} 14 | 15 | \item{combine}{Whether to combine it with the sets in the IDs parameter} 16 | 17 | \item{x}{A vector of GO IDs} 18 | } 19 | \description{ 20 | Get all GO IDs in all_IDs that are ancestors/descendants of those in IDs 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/PlotNetwork.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/network.R 3 | \name{PlotNetwork} 4 | \alias{PlotNetwork} 5 | \title{Plot a network from an igraph representation} 6 | \usage{ 7 | PlotNetwork(g, algorithm = "kk", arrow = grid::arrow(length = 8 | grid::unit(0.1, "inches"))) 9 | } 10 | \arguments{ 11 | \item{g}{An igraph object, generally computed by \link{\code{GenerateNetwork}}} 12 | 13 | \item{algorithm}{Algorithm to use for layout} 14 | 15 | \item{arrow}{Arrow} 16 | } 17 | \description{ 18 | Return a ggplot2 object representing a network of sets. 19 | This is meant to be a sensible default for plotting 20 | GO networks, it will not cover all cases. We recommend learning 21 | to use ggraph or another igraph plotting functionality. 22 | } 23 | 24 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: GSEAMA 2 | Type: Package 3 | Title: Gene set enrichment analysis made awesome 4 | Version: 0.99.0 5 | Date: 2016-01-21 6 | Authors@R: c( 7 | person("David", "Robinson", email = "admiral.david@gmail.com", role = c("aut", "cre")), 8 | person("Sean", "Hackett", email = "seanmchackett@gmail.com", role = c("aut")) 9 | ) 10 | Description: Efficient and interpretable gene set enrichment analysis methods 11 | based on a membership matrix. 12 | VignetteBuilder: knitr 13 | Imports: 14 | dplyr, 15 | Matrix, 16 | ggplot2, 17 | AnnotationDbi, 18 | qvalue 19 | Suggests: 20 | glmnet, 21 | igraph, 22 | graph, 23 | RBGL, 24 | knitr, 25 | testthat 26 | URL: http://github.com/dgrtwo/GSEAMA 27 | License: MIT + file LICENSE 28 | RoxygenNote: 5.0.1 29 | -------------------------------------------------------------------------------- /man/ExtractGenes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CompareY.R 3 | \name{ExtractGenes} 4 | \alias{ExtractGenes} 5 | \title{Return all genes, along with the saved metric in each of the given sets} 6 | \usage{ 7 | ExtractGenes(m, columns, char_limit = NULL, add_count = FALSE, 8 | overall = FALSE) 9 | } 10 | \arguments{ 11 | \item{m}{a GeneMatrix object} 12 | 13 | \item{columns}{vector of columns (e.g. gene sets) whose genes should be extracted} 14 | 15 | \item{char_limit}{optionally shorten the Term field to the desired length} 16 | 17 | \item{add_count}{whether the count within each set should be added to the 18 | Term field (like "Metabolism (360)")} 19 | } 20 | \description{ 21 | Return all genes, along with the saved metric in each of the given sets 22 | } 23 | 24 | -------------------------------------------------------------------------------- /man/vectorized_wilcoxon_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/vectorized_tests.R 3 | \name{vectorized_wilcoxon_test} 4 | \alias{vectorized_wilcoxon_test} 5 | \title{Perform a Wilcoxon rank-sum test comparing a metric to each column of a matrix} 6 | \usage{ 7 | vectorized_wilcoxon_test(m, y, alternative = "two.sided", tbl = FALSE) 8 | } 9 | \arguments{ 10 | \item{m}{binary matrix} 11 | 12 | \item{y}{numeric vector to test each column of m against} 13 | 14 | \item{alternative}{whether to use a one-sided test, and if so which way} 15 | 16 | \item{tbl}{whether to return a data.frame rather than a vector} 17 | } 18 | \description{ 19 | This function is similar to \code{apply(m, 2, function(col) wilcox.test(y ~ col)$p.value)}, 20 | but is vectorized to make it much faster 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/get_ancestry_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GOMembershipMatrix.R 3 | \name{get_ancestry_matrix} 4 | \alias{get_ancestry_matrix} 5 | \title{Build an offspring matrix of GO terms} 6 | \usage{ 7 | get_ancestry_matrix(terms, ontology = c("BP", "MF", "CC"), 8 | type = "OFFSPRING", upward = TRUE, tbl = FALSE) 9 | } 10 | \arguments{ 11 | \item{terms}{IDs of GO terms that should be included in the ancestry matrix} 12 | 13 | \item{ontology}{Ontologies to use} 14 | 15 | \item{type}{Either OFFSPRING (default), CHILDREN, ANCESTOR, or PARENT} 16 | } 17 | \description{ 18 | A sparse binary Matrix object with one row and column for each pair 19 | of GO terms provided, where each row represents an ancestor and each column 20 | represents a descendant, with 1 marking ancestor/descendant pairs. 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/CompareTopColumns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CompareY.R 3 | \name{CompareTopColumns} 4 | \alias{CompareTopColumns} 5 | \title{Create a graph comparing the top n columns (in terms of association with y)} 6 | \usage{ 7 | CompareTopColumns(m, n = 9, ...) 8 | } 9 | \arguments{ 10 | \item{m}{GeneMatrix object} 11 | 12 | \item{n}{Number of genes to compare} 13 | 14 | \item{...}{Additional arguments to be passed to CompareY} 15 | } 16 | \description{ 17 | Plot a bocplot, violin plot, or faceted histogram with the distributions 18 | of y within the most associated gene sets. 19 | } 20 | \details{ 21 | Most methods ("t.test", "wilcoxon", "hypergeometric") use the 22 | p-value to find the most significant columns. The "lasso" uses the order 23 | in which the terms were added to the regression 24 | } 25 | 26 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(CompareTopColumns) 4 | export(CompareY) 5 | export(ExtractGenes) 6 | export(GOMembershipMatrix) 7 | export(GenerateNetwork) 8 | export(GenerateTreemap) 9 | export(GetEdgesTable) 10 | export(PlotNetwork) 11 | export(TestAssociation) 12 | export(get_ancestry_matrix) 13 | export(mean_difference) 14 | export(trim_ellipses) 15 | export(vectorized_t_test) 16 | export(vectorized_wilcoxon_test) 17 | import(AnnotationDbi) 18 | import(GO.db) 19 | import(Matrix) 20 | import(ggplot2) 21 | import(ggraph) 22 | importFrom(dplyr,"%>%") 23 | importFrom(dplyr,arrange) 24 | importFrom(dplyr,data_frame) 25 | importFrom(dplyr,distinct_) 26 | importFrom(dplyr,do) 27 | importFrom(dplyr,filter) 28 | importFrom(dplyr,group_by) 29 | importFrom(dplyr,inner_join) 30 | importFrom(dplyr,mutate) 31 | importFrom(dplyr,tbl_df) 32 | -------------------------------------------------------------------------------- /man/vectorized_t_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/vectorized_tests.R 3 | \name{vectorized_t_test} 4 | \alias{vectorized_t_test} 5 | \title{Perform a Student's T-test comparing a metric to each column of a matrix} 6 | \usage{ 7 | vectorized_t_test(m, y, var.equal = FALSE, alternative = "two.sided", 8 | tbl = FALSE) 9 | } 10 | \arguments{ 11 | \item{m}{binary matrix} 12 | 13 | \item{y}{numeric vector to test each column of m against} 14 | 15 | \item{var.equal}{assume that the variances are equal} 16 | 17 | \item{alternative}{whether to use a one-sided test, and if so which way} 18 | 19 | \item{tbl}{Whether to return as a data frame} 20 | } 21 | \description{ 22 | This function is similar to \code{apply(m, 2, function(col) t.test(y ~ col)$p.value)}, 23 | but is vectorized to make it much faster 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/GetEdgesTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/network.R 3 | \name{GetEdgesTable} 4 | \alias{GetEdgesTable} 5 | \title{Create a data frame of edges from a GO term matrix} 6 | \usage{ 7 | GetEdgesTable(m, sets = NULL, ancestors = TRUE, ontology = c("BP", "MF", 8 | "CC"), ...) 9 | } 10 | \arguments{ 11 | \item{m}{GeneMatrix object representing GO terms} 12 | 13 | \item{sets}{Gene sets to use in the network. If NULL, use 14 | \code{\link{ThresholdSets}} to find them} 15 | 16 | \item{ancestors}{Whether to include all ancestors of significant 17 | in the matrix} 18 | 19 | \item{ontology}{Which of BP, MF, and CC to use (default all three)} 20 | 21 | \item{...}{Extra arguments passed on to threshold_sets} 22 | } 23 | \value{ 24 | An igraph 25 | } 26 | \description{ 27 | Create a data frame of edges from a GO term matrix 28 | } 29 | 30 | -------------------------------------------------------------------------------- /man/CompareY.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CompareY.R 3 | \name{CompareY} 4 | \alias{CompareY} 5 | \title{Create a violin plot, boxplot or histogram looking at the metric of interest 6 | over one or more columns} 7 | \usage{ 8 | CompareY(m, columns, mode = "boxplot", char_limit = 35) 9 | } 10 | \arguments{ 11 | \item{m}{a GeneMatrix} 12 | 13 | \item{columns}{The ID of one or more columns to plot} 14 | 15 | \item{mode}{Type of plot to create: either "violin", "boxplot" or "histogram"} 16 | 17 | \item{char_limit}{Maximum number of characters to include in column names} 18 | } 19 | \description{ 20 | This compares the metric of interest (y) across the given genes, as well as 21 | comparing it to the overall distribution. This shows a visual representation 22 | of how a column (a gene set, a motif, a transcription factor, etc) is associated 23 | with the metric. 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/ThresholdSets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TestAssociation.R 3 | \name{ThresholdSets} 4 | \alias{ThresholdSets} 5 | \title{Return a vector of genes that pass a threshold for includions} 6 | \usage{ 7 | ThresholdSets(m, alpha = 0.05, method = "fdr", ...) 8 | } 9 | \arguments{ 10 | \item{m}{GeneMatrix object} 11 | 12 | \item{alpha}{Threshold for (corrected) p-values, not used in LASSO} 13 | 14 | \item{method}{Method, passed to \code{\link{p.adjust}}, or "qvalue" 15 | to use qvalue. Not used in LASSO} 16 | 17 | \item{...}{Extra arguments to pass on to correction method} 18 | } 19 | \description{ 20 | Return a vector of genes that pass a threshold of "significance" 21 | or other kind of inclusion. For tests with a p-value (t-test, 22 | Wilcoxon, hypergeometric), this is FDR-controlled p-values. 23 | For LASSO, this is all cases where beta1sd != 0. 24 | } 25 | \seealso{ 26 | \link{p.adjust}, \link{p.adjust.methods} 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/sparse_cast.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{sparse_cast} 4 | \alias{sparse_cast} 5 | \title{Create a sparse matrix from row names, column names, and values 6 | in a table.} 7 | \usage{ 8 | sparse_cast(data, row, column, value) 9 | } 10 | \arguments{ 11 | \item{data}{A tbl} 12 | 13 | \item{row}{A bare column name to use as row names in sparse matrix} 14 | 15 | \item{column}{A bare column name to use as column names in sparse matrix} 16 | 17 | \item{value}{A bare column name to use as sparse matrix values, default 1} 18 | } 19 | \value{ 20 | A sparse Matrix object, with one row for each unique value in 21 | the \code{row} column, one column for each unique value in the \code{column} 22 | column, and with as many non-zero values as there are rows in \code{data}. 23 | } 24 | \description{ 25 | Create a sparse matrix from row names, column names, and values 26 | in a table. 27 | } 28 | \examples{ 29 | 30 | dat <- data.frame(a = c("row1", "row1", "row2", "row2", "row2"), 31 | b = c("col1", "col2", "col1", "col3", "col4"), 32 | val = 2) 33 | 34 | sparse_cast(dat, a, b) 35 | 36 | sparse_cast(dat, a, b, val) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /man/GenerateTreemap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/treemap.R 3 | \name{GenerateTreemap} 4 | \alias{GenerateTreemap} 5 | \title{Generate Treemap} 6 | \usage{ 7 | GenerateTreemap(m, edges = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{m}{GeneMatrix object representing GO terms} 11 | 12 | \item{...}{Extra arguments passed on to threshold_sets} 13 | } 14 | \value{ 15 | Generates a treemap hierarchy 16 | Treemap GO Plot 17 | 18 | Generate a treemap plot using the minimal spanning GO tree with genes assigned to individual GO terms. 19 | 20 | Default color based on GO significance and effect sizes of individual genes 21 | } 22 | \description{ 23 | Visualize genes nested within specific and general GO sets using treemaps 24 | } 25 | \details{ 26 | Each gene should maximize the absolute sum of relevant node weights but every GO category will 27 | not be included since a gene can only be assigned to a single ancestry. A small GO category may 28 | fall into multiple ancestral paths some which are parents of other meaningful categories and others 29 | that are dead-ends. Should favor attachment to pathways with the most informative children. 30 | To allow this to occur, three edge weights are used 31 | 1) the path with the highest overall absolute score * number of genes affected is chosen 32 | 2) ancestors of LASSO nodes all get a smaller score: [min(abs(beta))]*0.01*N_lasso_children 33 | - applied to edge where ancestors are children 34 | 3) all remaining edges that are not predictive recieve a tiny weight so that they are not filtered: [min(abs(beta))]*1e-4 35 | } 36 | 37 | -------------------------------------------------------------------------------- /man/TestAssociation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TestAssociation.R 3 | \name{TestAssociation} 4 | \alias{TestAssociation} 5 | \title{Test association of a per-gene attribute with each column in a GeneMatrix} 6 | \usage{ 7 | TestAssociation(m, genes, y, method = "lasso", ...) 8 | } 9 | \arguments{ 10 | \item{m}{A GeneMatrix object} 11 | 12 | \item{genes}{A vector of gene names (must match the gene names in the GeneMatrix)} 13 | 14 | \item{y}{A per-gene metric of the same length as the vector of gene names} 15 | 16 | \item{method}{Method used to test association between the per-gene metric and 17 | each column - either c("lasso", "t.test", "wilcoxon" or "hypergeometric")} 18 | 19 | \item{...}{Additional arguments that will be given to the test in question} 20 | } 21 | \description{ 22 | Test the association of a per-gene attribute y with each column, where a column 23 | can represent a gene set, a motif, transcription factor targets, or other genes 24 | that are functionally related 25 | } 26 | \examples{ 27 | 28 | library(org.Sc.sgd.db) 29 | 30 | n_genes <- 200 31 | genes <- sample(mappedkeys(org.Sc.sgdGO), n_genes) 32 | 33 | mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5, 34 | max_size = 50, chosen_genes = genes) 35 | # some genes are dropped because they do not have GO categories 36 | 37 | n_genes <- nrow(mm) 38 | genes <- mm@geneData$ID 39 | 40 | beta = c(20:1, rep(0, times = nrow(mm@colData)-20)) 41 | y = c(as.matrix(mm) \%*\% beta + rnorm(n_genes, 0, 2)) 42 | 43 | results <- TestAssociation(mm, genes, y) 44 | View(results@colData) 45 | 46 | } 47 | 48 | -------------------------------------------------------------------------------- /inst/tests/test-wilcoxon.R: -------------------------------------------------------------------------------- 1 | test_that("Wilcoxon rank-sum test generates correct p-values", { 2 | data(factorial) 3 | mm = MembershipMatrix(organism = "Sc.sgd", ontology = "BP", min.size = 5, max.size = 250) 4 | 5 | # perform three hypothesis tests, one for each alternative 6 | for (ah in c("two.sided", "greater", "less")) { 7 | wilcoxon.mm = TestEnrichment(mm, factorial$ORF, factorial$RNA.Seq.logFC, method = "wilcoxon", alternative=ah) 8 | 9 | # check that it is internally consistent 10 | expect_that(NROW(wilcoxon.mm@setData), equals(NCOL(wilcoxon.mm@matrix))) 11 | expect_that(NROW(wilcoxon.mm@geneData), equals(NROW(wilcoxon.mm@matrix))) 12 | 13 | # check the first 6 and the last 6 pvalues 14 | check.vals = as.numeric(c(1:6, tail(seq_along(wilcoxon.mm@setData$pvalue)))) 15 | for (ind in check.vals) { 16 | b = (wilcoxon.mm@matrix[, ind] > 0) 17 | manual.pval = wilcox.test(wilcoxon.mm@geneData$y[b], wilcoxon.mm@geneData$y[!b], alternative=ah)$p.value 18 | expect_that(wilcoxon.mm@setData$pvalue[ind], equals(manual.pval)) 19 | # check that hypothesis test means what we want it to 20 | r = rank(wilcoxon.mm@geneData$y) 21 | members.greater = mean(r[b]) > mean(r[!b]) 22 | if (ah == "greater") { 23 | expect_that(wilcoxon.mm@setData$pvalue[ind] < .5, 24 | equals(members.greater)) 25 | } 26 | if (ah == "less") { 27 | expect_that(wilcoxon.mm@setData$pvalue[ind] < .5, 28 | equals(!members.greater)) 29 | } 30 | } 31 | } 32 | }) 33 | 34 | -------------------------------------------------------------------------------- /man/GOMembershipMatrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/GOMembershipMatrix.R 3 | \name{GOMembershipMatrix} 4 | \alias{GOMembershipMatrix} 5 | \title{Construct a new gene set membership matrix} 6 | \usage{ 7 | GOMembershipMatrix(annotations, ontology = c("BP", "MF", "CC"), 8 | evidence = NULL, min_size = 1, max_size = Inf, ancestors = TRUE, 9 | chosen_genes = NULL) 10 | } 11 | \arguments{ 12 | \item{annotations}{A Go3AnnDbBimap object, typically from a Bioconductor Annotation 13 | package. For example, org.Hs.egGO from org.Hs.eg.db for human genes.} 14 | 15 | \item{ontology}{A vector of ontologies to include; should be a subset 16 | of c("BP", "MF", "CC")} 17 | 18 | \item{evidence}{A vector of GO evidence codes to include} 19 | 20 | \item{min_size}{Minimum size of a gene set to be included} 21 | 22 | \item{max_size}{Maximum size of a gene set to be included} 23 | 24 | \item{ancestors}{Whether a gene included in a gene set should also be included 25 | as being in all ancestors of that gene set (default TRUE)} 26 | 27 | \item{chosen_genes}{If provided, restricts genes to those in chosen_genes 28 | 29 | The GO annotation maps typically come from the Bioconductor AnnotationDB packages. 30 | A couple of notable examples are: 31 | 32 | Homo sapiens: org.Hs.egGO 33 | S. cerevisiae: org.Sc.sgdGO 34 | E coli K12: org.EcK12.egGO 35 | 36 | You can restrict the sets to the BP (Biological Process), MF (Molecular Function), 37 | or CC (Cellular Compartment) ontologies (by default all are included).} 38 | } 39 | \description{ 40 | Construct a gene set membership matrix (one row per gene, one column per 41 | gene set, with 1 indicating membership) from a GO annotation map. 42 | } 43 | \examples{ 44 | 45 | # yeast membership matrix 46 | library(org.Sc.sgd.db) 47 | mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5, max_size = 250) 48 | 49 | # restrict to Biological Process ontology: 50 | mm <- GOMembershipMatrix(org.Sc.sgdGO, ontology = "BP", min_size = 5, max_size = 250) 51 | 52 | # human membership matrix 53 | library(org.Hs.eg.db) 54 | mm <- GOMembershipMatrix(org.Hs.sgdGO, min_size = 5, max_size = 250) 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /R/GeneMatrix-methods.R: -------------------------------------------------------------------------------- 1 | # methods for working with a GeneMatrix 2 | 3 | setMethod("dim", "GeneMatrix", function(x) dim(x@matrix)) 4 | setMethod("nrow", "GeneMatrix", function(x) nrow(x@matrix)) 5 | setMethod("ncol", "GeneMatrix", function(x) ncol(x@matrix)) 6 | setMethod("rownames", "GeneMatrix", function(x) rownames(x@matrix)) 7 | setMethod("colnames", "GeneMatrix", function(x) colnames(x@matrix)) 8 | setMethod("as.matrix", "GeneMatrix", function(x, ...) as.matrix(x@matrix)) 9 | 10 | 11 | setMethod("print", signature(x = "GeneMatrix"), function(x) { 12 | cat(paste("Gene Matrix with", nrow(x), "genes and", ncol(x), "columns")) 13 | }) 14 | 15 | 16 | setMethod("show", signature(object = "GeneMatrix"), function(object) { 17 | print(object) 18 | }) 19 | 20 | 21 | setMethod("[", c("GeneMatrix", "ANY", "ANY", "ANY"), 22 | function(x, i, j, ..., drop = TRUE) 23 | { 24 | if (!is.null(x@fit)) { 25 | warning("Subsetting a model that has already been tested") 26 | } 27 | 28 | # subset the matrix 29 | if (missing(i)) { 30 | i <- seq_len(nrow(x@matrix)) 31 | } 32 | if (missing(j)) { 33 | j <- seq_len(ncol(x@matrix)) 34 | } 35 | 36 | mat <- x@matrix[i, j] 37 | if (length(i) == 1) { 38 | mat <- Matrix(mat, nrow = 1) 39 | } else if (length(j) == 1) { 40 | mat <- Matrix(mat, ncol = 1) 41 | } 42 | x@matrix <- mat 43 | 44 | # subset gene and column data 45 | if (!is.character(i)) { 46 | x@geneData = x@geneData[i, ] 47 | } else { 48 | x@geneData = x@geneData[match(i, x@geneData$ID), ] 49 | } 50 | 51 | if (!is.character(j)) { 52 | x@colData = x@colData[j, ] 53 | } else { 54 | x@colData = x@colData[match(j, x@colData$ID)] 55 | } 56 | 57 | x@geneData$Size = rowSums(x@matrix != 0) 58 | x@colData$Size = colSums(x@matrix != 0) 59 | stopifnot(all(rownames(x@matrix) == x@geneData$ID)) 60 | stopifnot(all(colnames(x@matrix) == x@colData$ID)) 61 | x 62 | }) 63 | -------------------------------------------------------------------------------- /R/utilities.R: -------------------------------------------------------------------------------- 1 | #' Standard-evaluation version of sparse_cast 2 | #' 3 | #' @param data A tbl 4 | #' @param row_col String version of column to use as row names 5 | #' @param column_col String version of column to use as column names 6 | #' @param value_col String version of column to use as sparse matrix values, 7 | #' or a numeric value to use 8 | #' 9 | #' @import Matrix 10 | sparse_cast_ <- function(data, row_col, column_col, value_col = NULL) { 11 | row_names <- data[[row_col]] 12 | col_names <- data[[column_col]] 13 | if (is.numeric(value_col)) { 14 | values <- value_col 15 | } else { 16 | values <- data[[value_col]] 17 | } 18 | 19 | # if it's a factor, preserve ordering 20 | row_u <- if (is.factor(row_names)) levels(row_names) else unique(row_names) 21 | col_u <- if (is.factor(col_names)) levels(col_names) else unique(col_names) 22 | 23 | ret <- Matrix(0, nrow = length(row_u), ncol = length(col_u), 24 | dimnames = list(as.character(row_u), as.character(col_u)), 25 | sparse = TRUE) 26 | 27 | ret[cbind(match(row_names, row_u), match(col_names, col_u))] <- values 28 | 29 | ret 30 | } 31 | 32 | 33 | #' Create a sparse matrix from row names, column names, and values 34 | #' in a table. 35 | #' 36 | #' @param data A tbl 37 | #' @param row A bare column name to use as row names in sparse matrix 38 | #' @param column A bare column name to use as column names in sparse matrix 39 | #' @param value A bare column name to use as sparse matrix values, default 1 40 | #' 41 | #' @return A sparse Matrix object, with one row for each unique value in 42 | #' the \code{row} column, one column for each unique value in the \code{column} 43 | #' column, and with as many non-zero values as there are rows in \code{data}. 44 | #' 45 | #' @examples 46 | #' 47 | #' dat <- data.frame(a = c("row1", "row1", "row2", "row2", "row2"), 48 | #' b = c("col1", "col2", "col1", "col3", "col4"), 49 | #' val = 2) 50 | #' 51 | #' sparse_cast(dat, a, b) 52 | #' 53 | #' sparse_cast(dat, a, b, val) 54 | #' 55 | #' @name sparse_cast 56 | sparse_cast <- function(data, row, column, value) { 57 | if (missing(value)) { 58 | value_col <- 1 59 | } else { 60 | value_col <- as.character(substitute(value)) 61 | if (is.null(data[[value_col]])) { 62 | value_col <- value 63 | } 64 | } 65 | 66 | sparse_cast_(data, as.character(substitute(row)), 67 | as.character(substitute(column)), value_col) 68 | } 69 | 70 | 71 | #' Trim the length of a character vector, adding ellipses to trimmed elements 72 | #' 73 | #' This utility function is used to shorten column names so they can appear 74 | #' on a figure. 75 | #' 76 | #' @param x character vector 77 | #' @param width length of string to trim to 78 | #' 79 | #' @export 80 | trim_ellipses <- function(x, width) { 81 | ret <- stringr::str_sub(x, 1, width) 82 | toolong <- stringr::str_length(x) > width 83 | ret[toolong] <- stringr::str_c(ret[toolong], "...") 84 | ret 85 | } 86 | -------------------------------------------------------------------------------- /tests/testthat/test-GSEAL.R: -------------------------------------------------------------------------------- 1 | ### setup ### 2 | 3 | library(org.Sc.sgd.db) 4 | 5 | # shared matrix for general use 6 | m = GOMembershipMatrix(org.Sc.sgdGO) 7 | 8 | consistency.check = function(m) { 9 | expect_equal(rowSums(m@matrix != 0), m@geneData$Count) 10 | expect_equal(colSums(m@matrix != 0), m@colData$Count) 11 | } 12 | 13 | context("GO membership matrices") 14 | 15 | test_that("We can create a gene set membership matrix", { 16 | mm = GOMembershipMatrix(org.Sc.sgdGO) 17 | consistency.check(mm) 18 | # check that it contains only 0s and ones 19 | expect_equal(unique(c(as.matrix(mm))), c(0, 1)) 20 | }) 21 | 22 | test_that("min.size and max.size arguments of GOMembershipMatrix work", { 23 | mm = GOMembershipMatrix(org.Sc.sgdGO, min.size=5, max.size=200) 24 | consistency.check(mm) 25 | expect_more_than(min(colSums(mm@matrix)), 4) 26 | expect_less_than(max(rowSums(mm@matrix)), 201) 27 | }) 28 | 29 | context("Subsetting a matrix") 30 | 31 | test_that("Subsetting a gene matrix leads to correct dimensions", { 32 | consistency.check(m[2:4, ]) 33 | consistency.check(m[, 50:55]) 34 | consistency.check(m[20:25, 40:50]) 35 | 36 | expect_equal(dim(m[2:4, ]), c(3, ncol(m))) 37 | expect_equal(dim(m[, 6:10]), c(nrow(m), 5)) 38 | expect_equal(dim(m[11:50, 20:21]), c(40, 2)) 39 | 40 | expect_equal(nrow(m[, 5:7]@colData), 3) 41 | expect_equal(nrow(m[, 5:7]@geneData), nrow(m@geneData)) 42 | expect_equal(nrow(m[16:20, ]@geneData), 5) 43 | expect_equal(nrow(m[16:20, ]@colData), nrow(m@colData)) 44 | }) 45 | 46 | test_that("Subsetting a matrix with a numeric vector gets correct rows", { 47 | # todo 48 | }) 49 | 50 | test_that("Subsetting a matrix with a logical vector gets correct rows", { 51 | # todo 52 | }) 53 | 54 | context("Association testing") 55 | 56 | test_that("Wilcoxon rank sum association testing calculates correct p-values", { 57 | # create a random trait to test 58 | trait = rnorm(nrow(m)) 59 | 60 | # test in all three alternative hypotheses 61 | testers = sample(which(colSums(m@matrix) > 10), 6) 62 | for (alternative in c("two.sided", "less", "greater")) { 63 | m = TestAssociation(m, rownames(m), trait, method="wilcoxon", 64 | alternative=alternative) 65 | manual.tests = apply(as.matrix(m@matrix)[, testers], 2, function(col) { 66 | wilcox.test(trait[col != 0], trait[col == 0], alternative=alternative)$p.value 67 | }) 68 | expect_equal(m@colData$pvalue[testers], unname(manual.tests)) 69 | } 70 | }) 71 | 72 | test_that("T-test association testing calculates correct p-values", { 73 | # todo: check that it is close enough, may not be exact 74 | 75 | # create a random trait to test 76 | trait = rnorm(nrow(m)) 77 | 78 | testers = sample(which(colSums(m@matrix) > 10), 3) 79 | # test in all three alternative hypotheses 80 | for (alternative in c("two.sided", "less", "greater")) { 81 | m = TestAssociation(m, rownames(m), trait, method="t.test", 82 | alternative=alternative) 83 | manual.tests = apply(as.matrix(m@matrix)[, testers], 2, function(col) { 84 | t.test(trait[col != 0], trait[col == 0], alternative=alternative)$p.value 85 | }) 86 | expect_equal(m@colData$pvalue[testers], unname(manual.tests)) 87 | } 88 | }) 89 | 90 | test_that("Hypergeometric association testing calculates correct p-values", { 91 | # todo 92 | }) 93 | -------------------------------------------------------------------------------- /R/network.R: -------------------------------------------------------------------------------- 1 | #' Create a data frame of edges from a GO term matrix 2 | #' 3 | #' @param m GeneMatrix object representing GO terms 4 | #' @param sets Gene sets to use in the network. If NULL, use 5 | #' \code{\link{ThresholdSets}} to find them 6 | #' @param ancestors Whether to include all ancestors of significant 7 | #' in the matrix 8 | #' @param ontology Which of BP, MF, and CC to use (default all three) 9 | #' @param ... Extra arguments passed on to threshold_sets 10 | #' 11 | #' @return An igraph 12 | #' 13 | #' @export 14 | GetEdgesTable <- function(m, sets = NULL, ancestors = TRUE, 15 | ontology = c("BP", "MF", "CC"), ...) { 16 | # include ancestors 17 | if (is.null(sets)) { 18 | sets <- ThresholdSets(m, ...) 19 | } 20 | if (length(sets) == 0) { 21 | stop("No thresholded sets to include in network") 22 | } 23 | 24 | if (ancestors) { 25 | sets <- GetGORelatives(sets, m@colData$ID) 26 | } 27 | 28 | go_terms <- m@colData %>% 29 | dplyr::filter(ID %in% sets) %>% 30 | dplyr::filter(Ontology %in% ontology) 31 | 32 | # get direct edges going downward to create the graph, and combine with column data 33 | edges <- get_ancestry_matrix(go_terms$ID, tbl = TRUE, type = "CHILDREN", upward = FALSE) %>% 34 | dplyr::mutate(go_id1 = as.character(go_id1)) %>% 35 | dplyr::mutate(go_id2 = as.character(go_id2)) %>% 36 | dplyr::left_join(dplyr::select(go_terms, ID, Ontology), by = c(go_id1 = "ID")) %>% 37 | dplyr::tbl_df() 38 | 39 | edges 40 | } 41 | 42 | 43 | #' Create an igraph from a GeneMatrix representing GO terms 44 | #' 45 | #' @param m A GeneMatrix object 46 | #' @param edges A table of edges to plot. If not given, compute with 47 | #' \code{\link{GetEdgesTable}} 48 | #' @param ... Extra arguments passed on to GetEdgesTable 49 | #' 50 | #' @return An igraph object 51 | #' 52 | #' @export 53 | GenerateNetwork <- function(m, edges = NULL, ...) { 54 | if (is.null(edges)) { 55 | edges <- GetEdgesTable(m, ...) 56 | } 57 | 58 | g <- igraph::graph.data.frame(edges) 59 | 60 | go_terms <- m@colData %>% 61 | filter(ID %in% c(edges$go_id1, edges$go_id2)) 62 | 63 | go_node_data <- data.frame(ID = names(igraph::V(g)), stringsAsFactors = FALSE) %>% 64 | inner_join(go_terms, by = "ID") 65 | 66 | # transfer column information from nodes and edges 67 | for (col in colnames(go_node_data)) { 68 | # a hack, but necessary since V(g)[[col]] doesn't work 69 | eval(substitute(igraph::V(g)$replace <- go_node_data[[col]], list(replace = col))) 70 | } 71 | 72 | for (col in colnames(edges)) { 73 | eval(substitute(igraph::E(g)$replace <- edges[[col]], list(replace = col))) 74 | } 75 | 76 | g 77 | } 78 | 79 | 80 | #' Plot a network from an igraph representation 81 | #' 82 | #' Return a ggplot2 object representing a network of sets. 83 | #' This is meant to be a sensible default for plotting 84 | #' GO networks, it will not cover all cases. We recommend learning 85 | #' to use ggraph or another igraph plotting functionality. 86 | #' 87 | #' @param g An igraph object, generally computed by \link{\code{GenerateNetwork}} 88 | #' @param algorithm Algorithm to use for layout 89 | #' @param arrow Arrow 90 | #' 91 | #' @import ggplot2 92 | #' @import ggraph 93 | #' 94 | #' @export 95 | PlotNetwork <- function(g, algorithm = 'kk', arrow = grid::arrow(length = grid::unit(.1, "inches"))) { 96 | ggraph(g, 'igraph', algorithm = algorithm) + 97 | geom_edge_link(arrow = arrow) + 98 | geom_node_point(aes(size = Size)) + 99 | geom_node_point(aes(color = MeanDifference, size = Size)) + 100 | geom_node_text(aes(label = Term), check_overlap = TRUE, size = 3) + 101 | ggforce::theme_no_axes() + 102 | scale_colour_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, 103 | space = "Lab", na.value = "grey50", guide = "colourbar") 104 | } -------------------------------------------------------------------------------- /R/vectorized_tests.R: -------------------------------------------------------------------------------- 1 | #' Perform a Student's T-test comparing a metric to each column of a matrix 2 | #' 3 | #' This function is similar to \code{apply(m, 2, function(col) t.test(y ~ col)$p.value)}, 4 | #' but is vectorized to make it much faster 5 | #' 6 | #' @param m binary matrix 7 | #' @param y numeric vector to test each column of m against 8 | #' @param var.equal assume that the variances are equal 9 | #' @param alternative whether to use a one-sided test, and if so which way 10 | #' @param tbl Whether to return as a data frame 11 | #' 12 | #' @export 13 | vectorized_t_test <- function(m, y, var.equal=FALSE, alternative="two.sided", 14 | tbl = FALSE) { 15 | stopifnot(NROW(m) == length(y)) 16 | 17 | in.m <- y * m 18 | out.m <- y * (1 - m) 19 | n.in <- colSums(m) 20 | n.out <- NROW(m) - n.in 21 | 22 | in.mu <- colSums(in.m) / n.in 23 | out.mu <- colSums(out.m) / n.out 24 | 25 | v.in <- colSums((in.m - t(t(m) * in.mu)) ^ 2) / (n.in - 1) 26 | v.out <- colSums((out.m - t(t(m) * out.mu)) ^ 2) / (n.out - 1) 27 | 28 | if (var.equal) { 29 | df <- NROW(m) - 2 30 | v <- (n.in - 1) * v.in + (n.out - 1) * v.out 31 | v <- v / df 32 | stderr <- sqrt(v*(1 / n.in + 1 / n.out)) 33 | } 34 | else { 35 | stderr.in <- sqrt(v.in/n.in) 36 | stderr.out <- sqrt(v.out/n.out) 37 | stderr <- sqrt(stderr.in ^ 2 + stderr.out ^ 2) 38 | df <- stderr ^ 4 / (stderr.in ^ 4 / (n.in - 1) + stderr.out ^ 4 / (n.out - 1)) 39 | } 40 | 41 | tstat <- (in.mu - out.mu) / stderr 42 | 43 | PVAL <- switch(alternative, less = pt(tstat, df), 44 | greater = pt(tstat, df, lower.tail = FALSE), 45 | two.sided = 2 * pt(-abs(tstat), df)) 46 | 47 | if (tbl) { 48 | ret <- data.frame(column = colnames(m), 49 | p.value = PVAL, 50 | estimate = in.mu - out.mu) 51 | } else { 52 | # name them, if the matrix has names 53 | names(PVAL) <- colnames(m) 54 | PVAL 55 | } 56 | } 57 | 58 | 59 | #' Perform a Wilcoxon rank-sum test comparing a metric to each column of a matrix 60 | #' 61 | #' This function is similar to \code{apply(m, 2, function(col) wilcox.test(y ~ col)$p.value)}, 62 | #' but is vectorized to make it much faster 63 | #' 64 | #' @param m binary matrix 65 | #' @param y numeric vector to test each column of m against 66 | #' @param alternative whether to use a one-sided test, and if so which way 67 | #' @param tbl whether to return a data.frame rather than a vector 68 | #' 69 | #' @export 70 | vectorized_wilcoxon_test = function(m, y, alternative="two.sided", tbl = FALSE) { 71 | # given a boolean matrix and a vector y, apply the wilcoxon test to see 72 | # if y depends on each column of the matrix, returning a vector of 73 | # p-values 74 | stopifnot(NROW(m) == length(y)) 75 | 76 | rk = rank(y) 77 | n.x = colSums(m) 78 | n.y = NROW(m) - n.x 79 | 80 | STATISTIC = colSums(rk * m) - n.x * (n.x + 1) / 2 81 | NTIES = table(rk) 82 | 83 | z <- STATISTIC - n.x * n.y / 2 84 | 85 | CORRECTION <- switch(alternative, 86 | "two.sided" = sign(z) * 0.5, 87 | "greater" = 0.5, 88 | "less" = -0.5) 89 | 90 | SIGMA <- sqrt((n.x * n.y / 12) * 91 | ((n.x + n.y + 1) 92 | - sum(NTIES ^ 3 - NTIES) 93 | / ((n.x + n.y) * (n.x + n.y - 1)))) 94 | z <- (z - CORRECTION) / SIGMA 95 | 96 | PVAL <- switch(alternative, 97 | "less" = pnorm(z), 98 | "greater" = pnorm(z, lower.tail = FALSE), 99 | "two.sided" = 2 * pmin(pnorm(z), 100 | pnorm(z, lower.tail = FALSE))) 101 | 102 | if (tbl) { 103 | # return as a tidy table 104 | auc <- STATISTIC / (n.x * n.y) 105 | data.frame(column = colnames(m), p.value = PVAL, auc = auc) 106 | } else { 107 | # name them, if the matrix has names 108 | names(PVAL) <- colnames(m) 109 | PVAL 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /R/CompareY.R: -------------------------------------------------------------------------------- 1 | #' Return all genes, along with the saved metric in each of the given sets 2 | #' 3 | #' @param m a GeneMatrix object 4 | #' @param columns vector of columns (e.g. gene sets) whose genes should be extracted 5 | #' @param char_limit optionally shorten the Term field to the desired length 6 | #' @param add_count whether the count within each set should be added to the 7 | #' Term field (like "Metabolism (360)") 8 | #' 9 | #' @importFrom dplyr inner_join do 10 | #' 11 | #' @export 12 | ExtractGenes <- function(m, columns, char_limit = NULL, add_count = FALSE, 13 | overall = FALSE) { 14 | # repeat gene data, once for each column 15 | dat <- data_frame(Set = columns) %>% 16 | group_by(Set) %>% 17 | do(m@geneData) 18 | 19 | dat$Present <- c(as.matrix(m@matrix[, columns]) > 0) 20 | 21 | dat <- dat %>% 22 | filter(Present == TRUE) %>% 23 | dplyr::select(-Present, -Size) %>% 24 | inner_join(m@colData, by = c(Set = "ID")) %>% 25 | arrange(match(Set, columns)) 26 | 27 | if (!is.null(char_limit)) { 28 | # shorten the Term 29 | dat <- dat %>% 30 | mutate(Term = trim_ellipses(Term, char_limit)) 31 | } 32 | if (add_count) { 33 | # add parenthetical counts to each term 34 | dat <- dat %>% 35 | mutate(Term = paste0(Term, " (", Size, ")")) 36 | } 37 | if (overall) { 38 | dat <- bind_rows(dat, cbind(m@geneData, Term = "Overall")) 39 | } 40 | 41 | # place terms in order of appearance 42 | dat$Term <- factor(dat$Term, levels = dat$Term[!duplicated(dat$Term)]) 43 | dat 44 | } 45 | 46 | 47 | #' Create a violin plot, boxplot or histogram looking at the metric of interest 48 | #' over one or more columns 49 | #' 50 | #' This compares the metric of interest (y) across the given genes, as well as 51 | #' comparing it to the overall distribution. This shows a visual representation 52 | #' of how a column (a gene set, a motif, a transcription factor, etc) is associated 53 | #' with the metric. 54 | #' 55 | #' @param m a GeneMatrix 56 | #' @param columns The ID of one or more columns to plot 57 | #' @param mode Type of plot to create: either "violin", "boxplot" or "histogram" 58 | #' @param char_limit Maximum number of characters to include in column names 59 | #' 60 | #' @import ggplot2 61 | #' 62 | #' @export 63 | CompareY = function(m, columns, mode = "boxplot", char_limit = 35) { 64 | # graph one or more columns 65 | # reverse the list (so that it reads top down instead of bottom up) 66 | dat <- ExtractGenes(m, rev(columns), 67 | char_limit = char_limit, 68 | add_count = TRUE, 69 | overall = TRUE) 70 | 71 | mode <- match.arg(mode, c("boxplot", "violin", "histogram")) 72 | if (mode == "histogram") { 73 | g <- ggplot(dat, aes(y)) + 74 | geom_histogram() + 75 | facet_wrap(~Term, ncol = 1, scale = "free_y") 76 | } 77 | else { 78 | g <- ggplot(dat, aes(x = Term, y = y)) + 79 | coord_flip() + 80 | theme(axis.text.y = element_text(color = "black", size = 15)) 81 | if (mode == "violin") { 82 | g = g + geom_violin() 83 | } 84 | else { 85 | g = g + geom_boxplot() 86 | } 87 | } 88 | g 89 | } 90 | 91 | 92 | #' Create a graph comparing the top n columns (in terms of association with y) 93 | #' 94 | #' Plot a bocplot, violin plot, or faceted histogram with the distributions 95 | #' of y within the most associated gene sets. 96 | #' 97 | #' @param m GeneMatrix object 98 | #' @param n Number of genes to compare 99 | #' @param ... Additional arguments to be passed to CompareY 100 | #' 101 | #' @details Most methods ("t.test", "wilcoxon", "hypergeometric") use the 102 | #' p-value to find the most significant columns. The "lasso" uses the order 103 | #' in which the terms were added to the regression 104 | #' 105 | #' @export 106 | CompareTopColumns = function(m, n = 9, ...) { 107 | if (length(m@rankingMetric) == 0) { 108 | stop("Cannot plot top genes without first running a test") 109 | } 110 | 111 | top_sets <- m@colData %>% 112 | arrange_(m@rankingMetric) %>% 113 | head(n) 114 | 115 | CompareY(m, top_sets$ID, ...) 116 | } 117 | -------------------------------------------------------------------------------- /_vignettes/brauer.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis of Brauer et al 2008" 3 | output: html_document 4 | --- 5 | 6 | ```{r} 7 | library(readr) 8 | library(dplyr) 9 | library(tidyr) 10 | library(ggplot2) 11 | 12 | url <- "http://varianceexplained.org/files/Brauer2008_DataSet1.tds" 13 | nutrient_names <- c(G = "Glucose", L = "Leucine", P = "Phosphate", 14 | S = "Sulfate", N = "Ammonia", U = "Uracil") 15 | 16 | cleaned_data <- read_delim(url, delim = "\t") %>% 17 | separate(NAME, c("name", "BP", "MF", "systematic_name", "number"), sep = "\\|\\|") %>% 18 | mutate_each(funs(trimws), name:systematic_name) %>% 19 | dplyr::select(-number, -GID, -YORF, -GWEIGHT) %>% 20 | gather(sample, expression, G0.05:U0.3) %>% 21 | separate(sample, c("nutrient", "rate"), sep = 1, convert = TRUE) %>% 22 | mutate(nutrient = plyr::revalue(nutrient, nutrient_names)) %>% 23 | filter(!is.na(expression), systematic_name != "") 24 | 25 | exprs <- cleaned_data %>% 26 | reshape2::acast(systematic_name + nutrient ~ rate, value.var = "expression") 27 | 28 | head(exprs) 29 | 30 | rate <- as.numeric(colnames(exprs)) 31 | 32 | library(limma) 33 | fit <- lmFit(exprs, model.matrix(~rate)) 34 | eb <- eBayes(fit) 35 | 36 | library(biobroom) 37 | library(tidyr) 38 | td <- tidy(eb, intercept = TRUE) %>% 39 | separate(gene, c("ID", "nutrient"), sep = "_") %>% 40 | filter(!is.na(estimate)) 41 | 42 | td <- td %>% 43 | group_by(term, ID) %>% 44 | mutate(centered = estimate - (sum(estimate) - estimate) / (n() - 1)) %>% 45 | ungroup() 46 | 47 | combined <- td %>% 48 | group_by(term, ID) %>% 49 | summarise(centered = mean(estimate)) %>% 50 | bind_rows(td) %>% 51 | replace_na(list(nutrient = "Average")) 52 | 53 | P <- td %>% 54 | filter(term == "rate", nutrient == "Phosphate") 55 | ``` 56 | 57 | ```{r} 58 | library(GSEAMA) 59 | library(org.Sc.sgd.db) 60 | mm <- GOMembershipMatrix(org.Sc.sgdGO, ontology = "BP", min_size = 5, max_size = 2000) 61 | 62 | # need to apply BP, MF or CC before here for LASSO 63 | a <- TestAssociation(mm, P$ID, P$centered, method = "lasso") 64 | ``` 65 | 66 | ```{r} 67 | library(igraph) 68 | library(ggraph) 69 | library(ggforce) 70 | 71 | g <- GenerateNetwork(a) 72 | PlotNetwork(g) 73 | ``` 74 | 75 | ```{r} 76 | library(multidplyr) 77 | 78 | # TODO: don't use multidplyr 79 | associations_setup <- combined %>% 80 | dplyr::select(term, nutrient, ID, centered) %>% 81 | nest(ID, centered) 82 | 83 | associations_setup$mm <- lapply(1:nrow(associations_setup), function(x) mm) 84 | 85 | associations <- associations_setup %>% 86 | group_by(term, nutrient) %>% 87 | do(association = GSEAMA::TestAssociation(.$mm[[1]], .$data[[1]]$ID, .$data[[1]]$centered, method = "lasso")) %>% 88 | collect() 89 | 90 | associations$networks <- mclapply(associations$association, GSEAMA::GenerateNetwork) 91 | associations$graphs <- mclapply(associations$networks, GSEAMA::PlotNetwork) 92 | 93 | save(associations, file = "~/Desktop/associations.rda") 94 | ``` 95 | 96 | ```{r} 97 | sets_each <- associations %>% 98 | group_by(term, nutrient) %>% 99 | do(data.frame(set = ThresholdSets(.$association[[1]]))) 100 | 101 | big_network <- GenerateNetwork(associations$association[[1]], sets = unique(sets_each$set)) 102 | 103 | layout <- createLayout(big_network, "igraph", algorithm = "kk") 104 | terms <- trim_ellipses(layout$Term, 30) 105 | 106 | for (i in seq_len(nrow(associations))) { 107 | name <- paste(associations$term[i], associations$nutrient[i]) 108 | adata <- associations$association[[i]]@colData 109 | layout$MeanDifference <- adata$MeanDifference[match(layout$ID, adata$ID)] 110 | layout$MeanDifference <- pmax(pmin(layout$MeanDifference, 1), -1) 111 | layout$Term <- ifelse(abs(layout$MeanDifference) > .5, terms, NA) 112 | 113 | g <- ggraph(data = layout) + 114 | geom_edge_link() + 115 | geom_node_point(aes(color = MeanDifference, size = Size)) + 116 | geom_node_text(aes(label = Term, alpha = abs(MeanDifference) ^ 2), check_overlap = TRUE, size = 3) + 117 | ggforce::theme_no_axes() + 118 | scale_colour_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, space = "Lab", na.value = "grey50", guide = "colourbar") + 119 | ggtitle(name) 120 | 121 | ggsave(filename = paste0(name, ".png"), g) 122 | } 123 | ``` 124 | 125 | ```{r} 126 | #PlotNetwork(big_network) 127 | all_set_data <- associations %>% 128 | group_by(term, nutrient) %>% 129 | do(.$assocation[[1]]@colData) 130 | 131 | 132 | ``` -------------------------------------------------------------------------------- /_vignettes/GSEAMA.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | %\VignetteEngine{knitr::knitr} 4 | %\VignetteIndexEntry{subSeq Example} 5 | 6 | \usepackage{natbib} 7 | \usepackage{graphics} 8 | \usepackage{amsmath} 9 | \usepackage{indentfirst} 10 | \usepackage[utf8]{inputenc} 11 | \usepackage{hyperref} 12 | 13 | \Sexpr{library(knitr); opts_chunk$set(tidy=TRUE, cache=TRUE, warning=FALSE, message=FALSE)} 14 | 15 | <>= 16 | options(keep.source = TRUE, width = 60) 17 | desc <- packageDescription("GSEAMA") 18 | @ 19 | 20 | \title{GSEAMA (Gene Set Enrichment Analysis with LASSO) Package Vignette (Version \Sexpr{desc$Version})} 21 | \author{David G. Robinson} 22 | 23 | \begin{document} 24 | 25 | \maketitle 26 | 27 | \section{Introduction} 28 | 29 | This is a vignette for the \verb@GSEAMA@ package, which performs gene set enrichment analysis using Lasso (L1-constrained) regression. \emph{etc} 30 | 31 | \section{Example} 32 | 33 | \subsection{Data} 34 | 35 | First we load in data from the RNA-Seq/Microarray factorial experiment comparing glucose and ethanol: 36 | 37 | <>= 38 | library(GSEAMA) 39 | data(factorial) 40 | 41 | dim(factorial) 42 | head(factorial) 43 | @ 44 | 45 | The only inputs we need are the systematic name of each gene (\texttt{factorial\$ORF}) and some metric in which we are measuring enrichment. Here we have four choices: the p-values or the log fold-changes from either the RNA-Seq or the microarray parallel experiment. We'll use the RNA-Seq log fold changes for this experiment. 46 | 47 | \subsection{Membership Matrix} 48 | 49 | The central data structure of this package is the GeneMatrix class. You can create one quite easily by providing the GO map for that species as found in the \href{http://www.bioconductor.org/packages/release/data/annotation/}{AnnotationData} Packages. 50 | \begin{center} 51 | \begin{tabular}{| c | c | c |} 52 | \hline 53 | \textbf{Organism} & \textbf{Package} & \textbf{GO map} \\ 54 | \hline 55 | Yeast & \texttt{org.Sc.sgd.db} & \texttt{org.Sc.sgdGO} \\ 56 | \hline 57 | Human & \texttt{org.Hs.eg.db} & \texttt{org.Hs.egGO} \\ 58 | \hline 59 | Mouse & \texttt{org.Mm.eg.db} & \texttt{org.Mm.egGO} \\ 60 | \hline 61 | E. coli K12 & \texttt{org.EcK12.eg.db} & \texttt{org.EcK12.egGO} \\ 62 | \hline 63 | \end{tabular} 64 | \end{center} 65 | 66 | <>= 67 | library(org.Sc.sgd.db) 68 | mm = GOMembershipMatrix(org.Sc.sgdGO, ontology="BP", min.size=5, max.size=250) 69 | @ 70 | 71 | The membership matrix includes useful information about each gene set: 72 | 73 | <>= 74 | mm@colData[398, ] 75 | @ 76 | 77 | As well as a table of information about each gene: 78 | 79 | <>= 80 | mm@geneData 81 | @ 82 | 83 | \clearpage 84 | 85 | \subsection{Wilcoxon Test} 86 | 87 | One simple test you can perform is a Wilcoxon rank sum test, comparing the $y$ within a set to the $y$ outside that set. This can be done with the \texttt{TestAssociation} function: 88 | 89 | <>= 90 | wilcoxon.mm = TestAssociation(mm, factorial$ORF, factorial$RNA.Seq.logFC, method="wilcoxon") 91 | @ 92 | 93 | It returns a MembershipMatrix object as well. The p-values from the Wilcoxon test have been added to the \texttt{colData} table (one p-value for each set). 94 | 95 | <>= 96 | hist(wilcoxon.mm@colData$pvalue) 97 | @ 98 | 99 | The \texttt{CompareTopColumns} function is useful for looking at the actual distribution of $y$ within each of the top significant sets: 100 | 101 | <>= 102 | CompareTopColumns(wilcoxon.mm) 103 | @ 104 | 105 | If you are interested in other specific gene sets, you can use the CompareY function to compare them one at a time, by ID: 106 | 107 | <>= 108 | CompareY(wilcoxon.mm, "GO:0006094") 109 | @ 110 | 111 | Or we can look at multiple sets that we're interested in: 112 | 113 | <<>>= 114 | glucose.genes = wilcoxon.mm@colData[grep("glucose", Definition), ]$ID 115 | glucose.genes 116 | 117 | CompareY(wilcoxon.mm, glucose.genes) 118 | @ 119 | 120 | \subsection{LASSO} 121 | 122 | One flaw with the Wilcoxon test is that it treats every hypothesis as being separate, when in fact they are likely highly correlated. For example, gene sets are highly redundant: all gene sets are contained within "parent" gene sets, and some heavily overlap. 123 | 124 | <>= 125 | lasso.mm = TestAssociation(mm, factorial$ORF, factorial$RNA.Seq.logFC, method="lasso") 126 | @ 127 | 128 | <>= 129 | CompareTopColumns(lasso.mm, n=15) 130 | @ 131 | 132 | \end{document} 133 | -------------------------------------------------------------------------------- /R/GOMembershipMatrix.R: -------------------------------------------------------------------------------- 1 | #' Construct a new gene set membership matrix 2 | #' 3 | #' Construct a gene set membership matrix (one row per gene, one column per 4 | #' gene set, with 1 indicating membership) from a GO annotation map. 5 | #' 6 | #' @param annotations A Go3AnnDbBimap object, typically from a Bioconductor Annotation 7 | #' package. For example, org.Hs.egGO from org.Hs.eg.db for human genes. 8 | #' @param ontology A vector of ontologies to include; should be a subset 9 | #' of c("BP", "MF", "CC") 10 | #' @param evidence A vector of GO evidence codes to include 11 | #' @param min_size Minimum size of a gene set to be included 12 | #' @param max_size Maximum size of a gene set to be included 13 | #' @param ancestors Whether a gene included in a gene set should also be included 14 | #' as being in all ancestors of that gene set (default TRUE) 15 | #' @param chosen_genes If provided, restricts genes to those in chosen_genes 16 | #' 17 | #' The GO annotation maps typically come from the Bioconductor AnnotationDB packages. 18 | #' A couple of notable examples are: 19 | #' 20 | #' Homo sapiens: org.Hs.egGO 21 | #' S. cerevisiae: org.Sc.sgdGO 22 | #' E coli K12: org.EcK12.egGO 23 | #' 24 | #' You can restrict the sets to the BP (Biological Process), MF (Molecular Function), 25 | #' or CC (Cellular Compartment) ontologies (by default all are included). 26 | #' 27 | #' @import Matrix AnnotationDbi 28 | #' @importFrom dplyr %>% data_frame tbl_df filter mutate group_by arrange distinct_ 29 | #' 30 | #' @examples 31 | #' 32 | #' # yeast membership matrix 33 | #' library(org.Sc.sgd.db) 34 | #' mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5, max_size = 250) 35 | #' 36 | #' # restrict to Biological Process ontology: 37 | #' mm <- GOMembershipMatrix(org.Sc.sgdGO, ontology = "BP", min_size = 5, max_size = 250) 38 | #' 39 | #' # human membership matrix 40 | #' library(org.Hs.eg.db) 41 | #' mm <- GOMembershipMatrix(org.Hs.sgdGO, min_size = 5, max_size = 250) 42 | #' 43 | #' @export 44 | GOMembershipMatrix = function(annotations, 45 | ontology = c("BP", "MF", "CC"), 46 | evidence = NULL, 47 | min_size = 1, 48 | max_size = Inf, 49 | ancestors = TRUE, 50 | chosen_genes = NULL) { 51 | # create membership matrix 52 | frame_df <- annotations %>% 53 | AnnotationDbi::toTable() %>% 54 | dplyr::tbl_df() %>% 55 | dplyr::filter(Ontology %in% ontology) 56 | #filter(go_id %in% available_go_ids) 57 | 58 | ontology_tbl <- frame_df %>% 59 | dplyr::select(ID = go_id, Ontology) %>% 60 | dplyr::distinct(ID, .keep_all = TRUE) 61 | 62 | if (!is.null(evidence)) { 63 | frame_df <- frame_df %>% 64 | dplyr::filter(evidence %in% Evidence) 65 | } 66 | 67 | id_field <- colnames(frame_df)[1] 68 | frame_df <- frame_df %>% 69 | dplyr::distinct_(id_field, "go_id", .keep_all = TRUE) 70 | 71 | if (!is.null(chosen_genes)) { 72 | frame_df <- frame_df[frame_df[[id_field]] %in% genes, ] 73 | } 74 | 75 | # turn into a sparse membership matrix 76 | m <- sparse_cast_(frame_df, id_field, "go_id", 1) 77 | 78 | if (ancestors) { 79 | # include relationships recursively: if a gene is included in a set, 80 | # also include it in sets that are ancestors of that set 81 | ancestry_matrix <- get_ancestry_matrix(colnames(m)) 82 | 83 | # combine with descendants, turn back into binary matrix 84 | m <- m + m %*% ancestry_matrix 85 | m = (m > 0) + 0 86 | } 87 | 88 | # eliminate sets with too few genes, and then genes with no sets 89 | set_size = colSums(m) 90 | m <- m[, set_size >= min_size & set_size <= max_size] 91 | m <- m[rowSums(m) > 0, ] 92 | sets <- colnames(m) 93 | genes <- rownames(m) 94 | 95 | # create table of set data, matching the membership matrix 96 | set_data <- data_frame(ID = sets, 97 | Term = Term(sets), 98 | Definition = Definition(sets), 99 | Size = colSums(m)) %>% 100 | dplyr::inner_join(ontology_tbl, by = "ID") 101 | 102 | # just make sure nothing was reordered 103 | stopifnot(!all(sets == set_data)) 104 | 105 | # create table of gene data 106 | gene_data <- data_frame(ID = rownames(m), Size = rowSums(m)) 107 | 108 | ret <- new("GeneMatrix", 109 | matrix = m, 110 | colData = set_data, 111 | geneData = gene_data) 112 | 113 | ret 114 | } 115 | 116 | 117 | #' Build an offspring matrix of GO terms 118 | #' 119 | #' A sparse binary Matrix object with one row and column for each pair 120 | #' of GO terms provided, where each row represents an ancestor and each column 121 | #' represents a descendant, with 1 marking ancestor/descendant pairs. 122 | #' 123 | #' @param terms IDs of GO terms that should be included in the ancestry matrix 124 | #' @param ontology Ontologies to use 125 | #' @param type Either OFFSPRING (default), CHILDREN, ANCESTOR, or PARENT 126 | #' 127 | #' @import GO.db 128 | #' 129 | #' @export 130 | get_ancestry_matrix <- function(terms, ontology = c("BP", "MF", "CC"), 131 | type = "OFFSPRING", upward = TRUE, tbl = FALSE) { 132 | offspring_df <- do.call(rbind, (lapply(ontology, function(o) { 133 | toTable(get(paste0("GO", o, type))) 134 | })))[, 1:2] 135 | 136 | ret <- offspring_df %>% 137 | setNames(c("go_id1", "go_id2")) %>% 138 | filter(go_id1 %in% terms, go_id2 %in% terms) %>% 139 | mutate(go_id1 = factor(go_id1, levels = terms), 140 | go_id2 = factor(go_id2, levels = terms)) %>% 141 | tbl_df() 142 | 143 | if (!upward) { 144 | tmp <- ret$go_id1 145 | ret$go_id1 <- ret$go_id2 146 | ret$go_id2 <- tmp 147 | } 148 | 149 | if (!tbl) { 150 | ret <- sparse_cast(ret, go_id1, go_id2) 151 | } 152 | 153 | ret 154 | } 155 | 156 | 157 | #' Get all GO IDs in all_IDs that are ancestors/descendants of those in IDs 158 | #' 159 | #' @param x A vector of GO IDs 160 | #' @param all_IDs A vector of GO IDs to look for relatives of x 161 | #' @param ancestors If true look among ancestors, if false look for children 162 | #' @param combine Whether to combine it with the sets in the IDs parameter 163 | GetGORelatives <- function(IDs, all_IDs, 164 | ancestors = TRUE, 165 | direct = FALSE, 166 | combine = TRUE) { 167 | type <- if (direct) { "CHILDREN" } else { "OFFSPRING" } 168 | am <- get_ancestry_matrix(all_IDs, type = type, upward = ancestors) 169 | ret <- names(which(colSums(am[IDs, , drop = FALSE]) > 0)) 170 | 171 | if (combine) { 172 | ret <- unique(c(IDs, ret)) 173 | } 174 | ret 175 | } 176 | -------------------------------------------------------------------------------- /R/TestAssociation.R: -------------------------------------------------------------------------------- 1 | #' Test association of a per-gene attribute with each column in a GeneMatrix 2 | #' 3 | #' Test the association of a per-gene attribute y with each column, where a column 4 | #' can represent a gene set, a motif, transcription factor targets, or other genes 5 | #' that are functionally related 6 | #' 7 | #' @param m A GeneMatrix object 8 | #' @param genes A vector of gene names (must match the gene names in the GeneMatrix) 9 | #' @param y A per-gene metric of the same length as the vector of gene names 10 | #' @param method Method used to test association between the per-gene metric and 11 | #' each column - either c("lasso", "t.test", "wilcoxon" or "hypergeometric") 12 | #' @param ... Additional arguments that will be given to the test in question 13 | #' 14 | #' @details A t-test or Wilcoxon test can take an additional argument of \code{alternative}, 15 | #' which allows a one-sided test with an alternative hypothesis of "less" or "greater". 16 | #' A t=test can also take the \code{var.equal} argument, indicating whether to treat the 17 | #' variance in and outside each group as identical. 18 | #' 19 | #' LASSO fits a penalized least squares model: 20 | #' 21 | #' min (y - \beta X)^2 + \lambda \sum|\beta|} 22 | #' 23 | #' @examples 24 | #' 25 | #'library(org.Sc.sgd.db) 26 | #' 27 | #'n_genes <- 200 28 | #'genes <- sample(mappedkeys(org.Sc.sgdGO), n_genes) 29 | #' 30 | #'mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5, 31 | #' max_size = 50, chosen_genes = genes) 32 | #'# some genes are dropped because they do not have GO categories 33 | #' 34 | #'n_genes <- nrow(mm) 35 | #'genes <- mm@geneData$ID 36 | #' 37 | #'beta = c(20:1, rep(0, times = nrow(mm@colData)-20)) 38 | #'y = c(as.matrix(mm) %*% beta + rnorm(n_genes, 0, 2)) 39 | #' 40 | #'results <- TestAssociation(mm, genes, y) 41 | #'View(results@colData) 42 | #' 43 | #' @export 44 | TestAssociation = function(m, genes, y, method = "lasso", ...) { 45 | if (!inherits(m, "GeneMatrix")) { 46 | stop("TestAssociation should be given a GeneMatrix object") 47 | } 48 | 49 | if (!is.null(m@fit)) { 50 | warning(paste0("Performing a test on a model that has already been ", 51 | "tested, previous results may be overwritten")) 52 | } 53 | 54 | # prefiltering of membership matrix 55 | v <- genes %in% m@geneData$ID 56 | y <- y[v] 57 | genes <- genes[v] 58 | m <- m[genes, ] 59 | m <- m[, colSums(m@matrix != 0) > 0] # filter columns *after* rows 60 | m@geneData$y <- y 61 | 62 | # apply desired method 63 | method = match.arg(method, c("lasso", "wilcoxon", "hypergeometric", "t.test")) 64 | if (method == "lasso") { 65 | m@fit <- glmnet::cv.glmnet(m@matrix, y, ...) 66 | beta_ind <- which(m@fit$lambda.1se == m@fit$lambda) 67 | m@colData$beta <- apply(m@fit$glmnet.fit$beta, 1, 68 | function(r) r[r != 0][1]) 69 | m@colData$beta1se <- as.numeric(m@fit$glmnet.fit$beta[, as.numeric(beta_ind)]) 70 | m@colData$step <- apply(m@fit$glmnet.fit$beta, 1, 71 | function(r) which(r != 0)[1]) 72 | m@colData$ranking <- -abs(m@colData$step) 73 | 74 | m@rankingMetric <- "ranking" 75 | m@effectMetric <- "beta" 76 | m@plottingMetric <- "beta1se" 77 | } 78 | else if (method == "t.test") { 79 | tt <- vectorized_t_test(m@matrix > 0, y, tbl = TRUE, ...) 80 | m@colData$estimate <- tt$estimate 81 | m@colData$p.value <- tt$p.value 82 | m@rankingMetric <- "p.value" 83 | m@effectMetric <- "estimate" 84 | m@plottingMetrix <- "estimate" 85 | } 86 | else if (method == "wilcoxon") { 87 | w <- vectorized_wilcoxon_test(m@matrix > 0, y, tbl = TRUE, ...) 88 | m@colData$auc <- w$auc 89 | m@colData$p.value <- w$p.value 90 | m@rankingMetric <- "p.value" 91 | m@effectMetric <- "auc" 92 | m@plottingMetric <- "auc" 93 | } 94 | else if (method == "hypergeometric") { 95 | if (!is.logical(y) & !(is.numeric(y) & all(y == 0 | y == 1))) { 96 | stop("For hypergeometric test, y must be either logical or binary") 97 | } 98 | stop("Not yet implemented") 99 | mat <- m@matrix > 0 100 | 101 | overlaps <- colSums(mat[as.logical(y), ]) 102 | 103 | total_genes <- nrow(m) 104 | total_y <- sum(y) 105 | totals <- colSums(mat) 106 | 107 | #m@colData$phi <- 108 | m@colData$p.value <- phyper(overlaps - 1, # number of white balls in each set 109 | total_y, # white balls in urn 110 | total_genes - total_genes, # black balls in urn 111 | totals, # number drawn in each set 112 | lower.tail = FALSE) 113 | m@rankingMetric <- "p.value" 114 | m@effectMetric <- "phi" 115 | m@plottingMetric <- "phi" 116 | } 117 | 118 | # save the method used 119 | m@assocMethod <- method 120 | 121 | # finally, compute the difference within and between the sets 122 | mat <- m@matrix 123 | not_mat <- 1 - mat 124 | mean_within <- colSums(y * mat) / colSums(mat) 125 | mean_outside <- colSums(y * not_mat) / colSums(not_mat) 126 | m@colData$MeanDifference <- mean_within - mean_outside 127 | 128 | m 129 | } 130 | 131 | #' Compute mean differences between "in a set" and "not in a set" 132 | #' 133 | #' For each set in a GeneMatrix, compute the difference in y between 134 | #' "in the set" 135 | #' and "outside the set". It will be set in the MeanDifference column 136 | #' 137 | #' @param m GeneMatrix object 138 | #' 139 | #' @export 140 | mean_difference <- function(m) { 141 | y <- m@geneData$y 142 | 143 | if (is.null(y)) { 144 | stop("y not yet set, has TestAssociation been performed?") 145 | } 146 | 147 | sub_m <- m@matrix 148 | sub_m_not <- 1 - sub_m 149 | mean_within <- colSums(y * sub_m) / colSums(sub_m) 150 | mean_outside <- colSums(y * sub_m_not) / colSums(sub_m_not) 151 | m@colData$MeanDifference <- mean_within - mean_outside 152 | 153 | m 154 | } 155 | 156 | 157 | #' Return a vector of genes that pass a threshold for includions 158 | #' 159 | #' Return a vector of genes that pass a threshold of "significance" 160 | #' or other kind of inclusion. For tests with a p-value (t-test, 161 | #' Wilcoxon, hypergeometric), this is FDR-controlled p-values. 162 | #' For LASSO, this is all cases where beta1sd != 0. 163 | #' 164 | #' @param m GeneMatrix object 165 | #' @param alpha Threshold for (corrected) p-values, not used in LASSO 166 | #' @param method Method, passed to \code{\link{p.adjust}}, or "qvalue" 167 | #' to use qvalue. Not used in LASSO 168 | #' @param ... Extra arguments to pass on to correction method 169 | #' 170 | #' @seealso \link{p.adjust}, \link{p.adjust.methods} 171 | ThresholdSets <- function(m, alpha = .05, method = "fdr", ...) { 172 | 173 | if(m@assocMethod == "lasso"){ 174 | # filtering of lasso values where beta1sd != 0 175 | m@colData$ID[m@colData$beta1se != 0] 176 | }else{ 177 | # p-value filtering based on provided filter_method 178 | if (method == "qvalue") { 179 | p.adjusted <- qvalue::qvalue(m@colData$p.value, ...)$qvalues 180 | } else { 181 | p.adjusted <- p.adjust(m@colData$p.value, method, ...) 182 | } 183 | m@colData$ID[!is.na(p.adjusted) & p.adjusted < alpha] 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /R/treemap.R: -------------------------------------------------------------------------------- 1 | #' Generate Treemap 2 | #' 3 | #' Visualize genes nested within specific and general GO sets using treemaps 4 | #' 5 | #' Each gene should maximize the absolute sum of relevant node weights but every GO category will 6 | #' not be included since a gene can only be assigned to a single ancestry. A small GO category may 7 | #' fall into multiple ancestral paths some which are parents of other meaningful categories and others 8 | #' that are dead-ends. Should favor attachment to pathways with the most informative children. 9 | #' To allow this to occur, three edge weights are used 10 | #' 1) the path with the highest overall absolute score * number of genes affected is chosen 11 | #' 2) ancestors of LASSO nodes all get a smaller score: [min(abs(beta))]*0.01*N_lasso_children 12 | #' - applied to edge where ancestors are children 13 | #' 3) all remaining edges that are not predictive recieve a tiny weight so that they are not filtered: [min(abs(beta))]*1e-4 14 | #' 15 | #' @inheritParams GetEdgesTable 16 | #' 17 | #' @export 18 | #' @return Generates a treemap hierarchy 19 | GenerateTreemap <- function(m, edges = NULL, ...) { 20 | 21 | if(!requireNamespace("igraph", quietly = T)){ 22 | stop("igraph is required for treemaps") 23 | } 24 | if(!requireNamespace("graph", quietly = T)){ 25 | stop("graph is required for treemaps") 26 | } 27 | if(!requireNamespace("RBGL", quietly = T)){ 28 | stop("RBGL is required for treemaps") 29 | } 30 | 31 | if (is.null(edges)) { 32 | signif_sets <- ThresholdSets(m, ...) 33 | edges <- GetEdgesTable(m, sets = m@colData$ID, ...) 34 | # orphaned parents connect to root 35 | edge_roots <- data_frame(go_id1 = "root", go_id2 = setdiff(edges$go_id1, edges$go_id2)) %>% 36 | dplyr::left_join(edges %>% 37 | dplyr::select(go_id2 = go_id1, Ontology) %>% 38 | dplyr::distinct(), by = "go_id2") 39 | edges <- rbind(edges, edge_roots) 40 | 41 | ancestors <- get_ancestry_matrix(m@colData$ID, tbl = TRUE, type = "ANCESTOR", upward = FALSE) %>% 42 | dplyr::mutate(go_id1 = as.character(go_id1)) %>% 43 | dplyr::mutate(go_id2 = as.character(go_id2)) %>% 44 | dplyr::filter(go_id1 %in% m@colData$ID, go_id2 %in% m@colData$ID) 45 | ancestors <- bind_rows(ancestors, data_frame(go_id1 = "root", go_id2 = m@colData$ID)) 46 | } 47 | 48 | if(length(signif_edges) == 0){ 49 | stop("No thresholded sets to include in treemap") 50 | } 51 | 52 | signif_sets_weight <- m@colData %>% 53 | dplyr::filter(ID %in% signif_sets) %>% 54 | dplyr::select_("ID", "Size", Weight = m@plottingMetric) %>% 55 | dplyr::mutate(Weight = abs(Weight), 56 | nWeight = Size * Weight) %>% 57 | dplyr::select(-Size) 58 | 59 | nodes <- data_frame(ID = unique(c(edges$go_id1, edges$go_id2))) %>% 60 | dplyr::left_join(m@colData %>% dplyr::select(ID, Size), by = "ID") %>% 61 | dplyr::left_join(signif_sets_weight, by = "ID") %>% 62 | dplyr::mutate(Weight = ifelse(is.na(Weight), 0, Weight), 63 | nWeight = ifelse(is.na(nWeight), 0, nWeight)) 64 | 65 | # ancestor weights encourage the same ancestor nodes to be used when significant descendents exist 66 | ancestor_weights <- ancestors %>% 67 | dplyr::left_join(nodes %>% dplyr::select(go_id2 = ID, weight = nWeight), by = "go_id2") %>% 68 | dplyr::mutate(weight = ifelse(is.na(weight), 0, weight)) %>% 69 | dplyr::group_by(go_id1) %>% 70 | dplyr::summarize(weight = sum(weight)*0.01) %>% 71 | dplyr::arrange(desc(weight)) 72 | 73 | nodes <- nodes %>% 74 | dplyr::left_join(ancestor_weights, by = c("ID" = "go_id1")) %>% 75 | dplyr::rowwise() %>% 76 | dplyr::mutate(nWeight = sum(nWeight, weight, na.rm = T)) %>% 77 | dplyr::select(-weight) %>% 78 | dplyr::ungroup() 79 | 80 | # find a unique GO hierarchy that 81 | minimal_edge_set <- find_minimal_rooted_tree(nodes, edges) 82 | 83 | # find the optimal path to assign each gene 84 | gene_paths <- assign_genes_to_paths(m, minimal_edge_set) 85 | 86 | # prune gene sets that are not used 87 | all_used_go_terms <- unique(gene_paths$full_path$ID) 88 | utilized_go_network <- igraph::delete_vertices(minimal_edge_set, setdiff(igraph::V(minimal_edge_set)$name, all_used_go_terms)) 89 | 90 | 91 | #' Treemap GO Plot 92 | #' 93 | #' Generate a treemap plot using the minimal spanning GO tree with genes assigned to individual GO terms. 94 | #' 95 | #' Default color based on GO significance and effect sizes of individual genes 96 | treemap_GO_plot <- function(m, gene_paths, utilized_go_network, treemap_thresh = 4, ...){ 97 | 98 | # gene effects 99 | gene_treemap_data <- gene_paths$assigned_genes %>% 100 | dplyr::left_join(m@geneData %>% dplyr::select(gene = ID, gene_effect = y), by = "gene") %>% 101 | dplyr::mutate(gene_effect = pmax(pmin(gene_effect, treemap_thresh), -1*treemap_thresh)) 102 | 103 | gene_treemap_data <- gene_treemap_data %>% 104 | dplyr::select(ID = gene, gene_effect) %>% 105 | dplyr::mutate(go_name = NA_character_, 106 | go_effect = NA_real_, 107 | data_type = "gene") 108 | 109 | # go effects 110 | go_treemap_data <- m@colData %>% 111 | dplyr::filter(ID %in% intersect(signif_sets, igraph::V(utilized_go_network)$name)) %>% 112 | dplyr::select_("ID", go_name = "Term", go_effect = m@plottingMetric) 113 | 114 | go_treemap_data <- tibble::data_frame(ID = igraph::V(utilized_go_network)$name) %>% 115 | dplyr::left_join(go_treemap_data, by = "ID") %>% 116 | dplyr::mutate(gene_effect = NA_real_, 117 | data_type = "go") 118 | 119 | all_node_data <- dplyr::bind_rows(gene_treemap_data, go_treemap_data) 120 | 121 | # update network 122 | updated_edges <- dplyr::bind_rows( 123 | igraph::get.edgelist(utilized_go_network) %>% as.data.frame(stringsAsFactors = F), 124 | unname(gene_paths$assigned_genes) %>% as.matrix() %>% as.data.frame(stringsAsFactors = F)) 125 | 126 | # use utilized_go_network with genes assigned to categories 127 | full_treemap_data <- igraph::graph_from_data_frame(updated_edges, directed = TRUE, vertices = all_node_data) 128 | 129 | # update network 130 | 131 | #library(ggraph) 132 | #library(ggplot2) 133 | 134 | #ggraph(full_treemap_data, 'igraph', algorithm = 'tree', circular = TRUE) + 135 | # geom_edge_diagonal(aes(alpha = ..index..)) + 136 | # coord_fixed() + 137 | # scale_edge_alpha('Direction', guide = 'edge_direction') + 138 | # geom_node_point(aes(color = gene_effect, filter = igraph::degree(full_treemap_data, mode = 'out') == 0), size = 1) + 139 | # ggforce::theme_no_axes() + 140 | # scale_color_gradient2("MeanDifference", low = "blue", high = "red") 141 | 142 | # tree 143 | 144 | full_treemap_data <- ggraph::treeApply(full_treemap_data, function(node, parent, depth, tree) { 145 | tree <- igraph::set_vertex_attr(tree, 'depth', node, depth) 146 | if (depth == 1) { 147 | tree <- igraph::set_vertex_attr(tree, 'class', node, igraph::V(tree)$shortName[node]) 148 | } else if (depth > 1) { 149 | tree <- igraph::set_vertex_attr(tree, 'class', node, igraph::V(tree)$class[parent]) 150 | } 151 | tree 152 | }) 153 | igraph::V(full_treemap_data)$leaf <- igraph::degree(full_treemap_data, mode = 'out') == 0 154 | 155 | ggraph(full_treemap_data, 'treemap') + 156 | geom_treemap(aes(fill = gene_effect, filter = leaf), colour = NA) + 157 | geom_treemap(aes(size = depth * ifelse(data_type == "gene", 1, 0), alpha = depth, 158 | colour = ifelse(data_type == "gene", "white", ifelse(go_effect != 0, "yellow", "black"))), fill = NA) + 159 | geom_node_text(aes(label = go_name), size = 3, check_overlap = T, repel = T) + 160 | scale_fill_gradient2("MeanDifference", low = "green3", high = "firebrick1") + 161 | scale_color_identity() + 162 | scale_size(range = c(1, 0), guide = 'none') + 163 | scale_alpha(range = c(1, 0.2), guide = 'none') + 164 | ggforce::theme_no_axes() 165 | 166 | } 167 | } 168 | 169 | 170 | get_minimal_gene_path <- function(a_gene, m, nodes, minimal_edge_set){ 171 | 172 | gene_GO <- m@matrix[a_gene, m@matrix[a_gene,] == 1, drop = F] %>% colnames() 173 | # add ancestors that may have been missed for the gene 174 | gene_GO <- union(gene_GO, ancestors$go_id1[ancestors$go_id2 %in% gene_GO]) 175 | gene_GO <- gene_GO[gene_GO %in% nodes$ID] 176 | 177 | gene_subgraph <- igraph::induced_subgraph(minimal_edge_set, gene_GO) 178 | 179 | gene_paths <- igraph::get.shortest.paths(gene_subgraph, from = "root", to = igraph::V(gene_subgraph))$vpath 180 | gene_paths <- lapply(seq_along(gene_paths), function(i){ 181 | tibble::data_frame(ID = gene_paths[[i]]$name) %>% 182 | dplyr::mutate(gene = a_gene, terminus = igraph::V(gene_subgraph)$name[i], step = 1:n()) 183 | }) %>% 184 | dplyr::bind_rows() 185 | 186 | gene_paths 187 | } 188 | 189 | #' Assign Genes to Paths 190 | #' 191 | #' Assign genes to a GO hierarchy which has the greatest signal 192 | #' 193 | #' @inheritParams GenerateTreemap 194 | #' @param minimal_edge_set a network which is a weighted directed minimal spanning tree of gene ontologies 195 | #' 196 | #' @return a data_frame which contains which terminal gene set each gene is assigned to 197 | assign_genes_to_paths <- function(m, minimal_edge_set, ...){ 198 | 199 | # find all paths for each gene 200 | all_gene_paths <- mclapply(m@geneData$ID, function(a_gene){ 201 | print(a_gene) 202 | get_minimal_gene_path(a_gene, m, nodes, minimal_edge_set) 203 | }, mc.cores = parallel::detectCores()) %>% 204 | dplyr::bind_rows() 205 | 206 | # all shortest paths found, now find the path that maximizes the path weight 207 | 208 | assigned_paths <- all_gene_paths %>% 209 | dplyr::left_join(nodes, by = "ID") %>% 210 | dplyr::group_by(gene, terminus) %>% 211 | dplyr::summarize(Weight = sum(Weight), 212 | nWeight = sum(nWeight), 213 | nSteps = n()) %>% 214 | dplyr::group_by(gene) %>% 215 | dplyr::arrange(desc(Weight), desc(nWeight), nSteps) %>% 216 | dplyr::slice(1) 217 | 218 | gene_paths <- list() 219 | gene_paths$full_path <- all_gene_paths %>% 220 | dplyr::semi_join(assigned_paths, by = c("terminus", "gene")) 221 | gene_paths$assigned_genes <- assigned_paths %>% 222 | dplyr::select(terminus, gene) 223 | gene_paths 224 | } 225 | 226 | #' Find Minimal Rooted Tree 227 | #' 228 | #' Removes all loops so that gene sets can be nested within one another: creates an optimal directed acyclic graph 229 | #' pointing from general GO terms down to specific ones. 230 | #' 231 | #' @param nodes data_frame significance of gene sets 232 | #' @param edges data_frame go_id1 are parents of go_id2 233 | #' 234 | #' @return an igraph object of the gene set minimal spanning tree 235 | find_minimal_rooted_tree <- function(nodes, edges){ 236 | 237 | if(clusters(graph_from_data_frame(edges))$no != 1){ 238 | stop("edges must be a single connected network") 239 | } 240 | 241 | edge_weights <- edges %>% 242 | dplyr::left_join(nodes %>% dplyr::select(go_id1 = ID, nWeight_1 = nWeight), by = "go_id1") %>% 243 | dplyr::left_join(nodes %>% dplyr::select(go_id2 = ID, nWeight_2 = nWeight), by = "go_id2") %>% 244 | dplyr::mutate(weight = nWeight_1 * 0.1 + nWeight_2, 245 | weight = ifelse(weight == 0, min(weight[weight != 0])*1e-4, weight)) 246 | 247 | GO_graph_NEL <- new("graphNEL", nodes = nodes$ID, edgemode="directed") 248 | GO_graph_NEL <- graph::addEdge(from = edge_weights$go_id1, to = edge_weights$go_id2, graph = GO_graph_NEL, weights = edge_weights$weight) 249 | 250 | GO_optim_branching <- RBGL::edmondsOptimumBranching(GO_graph_NEL) 251 | 252 | # recreate graphNEL object from edmonds output 253 | edmonds_edgeList <- as.data.frame(t(GO_optim_branching$edgeList)) 254 | minimum_igraph_network <- graph_from_data_frame(edmonds_edgeList) 255 | 256 | # 257 | GO_graph_parsimony <- graph_from_data_frame(edges, vertices = nodes) 258 | pruned_edges <- attr(igraph::E(GO_graph_parsimony), "vnames")[!(attr(igraph::E(GO_graph_parsimony), "vnames") %in% attr(igraph::E(minimum_igraph_network), "vnames"))] 259 | GO_graph_parsimony <- igraph::delete_edges(GO_graph_parsimony, pruned_edges) 260 | 261 | if(clusters(minimum_igraph_network)$no != 1 | clusters(GO_graph_parsimony)$no != 1){ 262 | stop("a single connected network was not found in the minimal network") 263 | } 264 | if(setdiff(igraph::get.edgelist(minimum_igraph_network)[,1],igraph::get.edgelist(minimum_igraph_network)[,2]) != "root"){ 265 | stop("a single root was not found in the minimal network") 266 | } 267 | 268 | GO_graph_parsimony 269 | } 270 | 271 | update_treemap_root <- function(GO_graph_parsimony){ 272 | 273 | GO_graph_clusters <- igraph::clusters(GO_graph_parsimony) 274 | 275 | new_roots <- c() 276 | for(i in seq_along(GO_graph_clusters$csize)){ 277 | if(GO_graph_clusters$membership['root'] == i){next} 278 | 279 | subgraph_nodes <- names(which(GO_graph_clusters$membership == i)) 280 | 281 | subgraph_graph <- igraph::induced_subgraph(GO_graph_parsimony, subgraph_nodes) 282 | 283 | subgraph_root <- setdiff(igraph::get.edgelist(subgraph_graph)[,1], igraph::get.edgelist(subgraph_graph)[,2]) 284 | if(length(subgraph_root) == 0){ 285 | # if there is no parent then either there is one node or the graph is not a DAG 286 | if(length(subgraph_nodes) == 1){ 287 | subgraph_root <- subgraph_nodes 288 | }else{ 289 | # solve the mst problem 290 | mst_graph <- igraph::minimum.spanning.tree(igraph::induced_subgraph(GO_graph_parsimony, subgraph_nodes)) 291 | 292 | # figure out which edges were removed 293 | GO_graph_parsimony <- igraph::delete_vertices(GO_graph_parsimony, E(subgraph_graph)[setdiff(E(subgraph_graph), E(mst_graph))]) 294 | 295 | subgraph_root <- setdiff(igraph::get.edgelist(mst_graph)[,1], igraph::get.edgelist(mst_graph)[,2]) 296 | 297 | if(length(subgraph_root) == 0){ 298 | subgraph_root <- subgraph_nodes 299 | } 300 | } 301 | } 302 | if(length(subgraph_root) == 0){stop("no root found")} 303 | new_roots <- c(new_roots, subgraph_root) 304 | } 305 | 306 | if(length(new_roots) != 0){ 307 | GO_graph_parsimony <- igraph::add_edges(GO_graph_parsimony, c(t(data_frame("root", new_roots)))) 308 | } 309 | 310 | GO_graph_parsimony 311 | } 312 | --------------------------------------------------------------------------------