├── R
    ├── AllGenerics.R
    ├── AllClasses.R
    ├── GeneMatrix-methods.R
    ├── utilities.R
    ├── network.R
    ├── vectorized_tests.R
    ├── CompareY.R
    ├── GOMembershipMatrix.R
    ├── TestAssociation.R
    └── treemap.R
├── .Rbuildignore
├── LICENSE
├── .gitignore
├── tests
    ├── test-all.R
    └── testthat
    │   └── test-GSEAL.R
├── data
    └── factorial.rda
├── _vignettes
    ├── GSEAMA.pdf
    ├── brauer.Rmd
    └── GSEAMA.Rnw
├── GSEAMA.Rproj
├── man
    ├── CompareTopSets.Rd
    ├── trim_ellipses.Rd
    ├── mean_difference.Rd
    ├── GenerateNetwork.Rd
    ├── sparse_cast_.Rd
    ├── find_minimal_rooted_tree.Rd
    ├── assign_genes_to_paths.Rd
    ├── GetGORelatives.Rd
    ├── PlotNetwork.Rd
    ├── ExtractGenes.Rd
    ├── vectorized_wilcoxon_test.Rd
    ├── get_ancestry_matrix.Rd
    ├── CompareTopColumns.Rd
    ├── vectorized_t_test.Rd
    ├── GetEdgesTable.Rd
    ├── CompareY.Rd
    ├── ThresholdSets.Rd
    ├── sparse_cast.Rd
    ├── GenerateTreemap.Rd
    ├── TestAssociation.Rd
    └── GOMembershipMatrix.Rd
├── README.mkd
├── DESCRIPTION
├── NAMESPACE
└── inst
    └── tests
        └── test-wilcoxon.R


/R/AllGenerics.R:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2016
2 | COPYRIGHT HOLDER: David Robinson
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | inst/doc
5 | 


--------------------------------------------------------------------------------
/tests/test-all.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | 
3 | test_check("GSEAMA")
4 | 


--------------------------------------------------------------------------------
/data/factorial.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dgrtwo/GSEAMA/HEAD/data/factorial.rda


--------------------------------------------------------------------------------
/_vignettes/GSEAMA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dgrtwo/GSEAMA/HEAD/_vignettes/GSEAMA.pdf


--------------------------------------------------------------------------------
/GSEAMA.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | PackageRoxygenize: rd,collate,namespace
19 | 


--------------------------------------------------------------------------------
/man/CompareTopSets.Rd:
--------------------------------------------------------------------------------
 1 | \name{CompareTopSets}
 2 | \alias{CompareTopSets}
 3 | \title{Create a graph comparing the top n columns (in terms of association with y)}
 4 | \usage{
 5 | CompareTopSets(m, n = 9, ...)
 6 | }
 7 | \arguments{
 8 |   \item{m}{GeneMatrix object}
 9 | 
10 |   \item{n}{Number of genes to compare}
11 | 
12 |   \item{...}{Additional arguments to be passed to CompareY}
13 | }
14 | \description{
15 | Create a graph comparing the top n columns (in terms of
16 | association with y)
17 | }
18 | 
19 | 


--------------------------------------------------------------------------------
/man/trim_ellipses.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utilities.R
 3 | \name{trim_ellipses}
 4 | \alias{trim_ellipses}
 5 | \title{Trim the length of a character vector, adding ellipses to trimmed elements}
 6 | \usage{
 7 | trim_ellipses(x, width)
 8 | }
 9 | \arguments{
10 | \item{x}{character vector}
11 | 
12 | \item{width}{length of string to trim to}
13 | }
14 | \description{
15 | This utility function is used to shorten column names so they can appear
16 | on a figure.
17 | }
18 | 
19 | 


--------------------------------------------------------------------------------
/man/mean_difference.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TestAssociation.R
 3 | \name{mean_difference}
 4 | \alias{mean_difference}
 5 | \title{Compute mean differences between "in a set" and "not in a set"}
 6 | \usage{
 7 | mean_difference(m)
 8 | }
 9 | \arguments{
10 | \item{m}{GeneMatrix object}
11 | }
12 | \description{
13 | For each set in a GeneMatrix, compute the difference in y between
14 | "in the set"
15 | and "outside the set". It will be set in the MeanDifference column
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/R/AllClasses.R:
--------------------------------------------------------------------------------
 1 | ##################################
 2 | ######## AllClasses.R
 3 | ########
 4 | ########    all classes in GSEAMA
 5 | ##################################
 6 | ####################################################################
 7 | 
 8 | setClass("GeneMatrix",
 9 |     representation(
10 |         matrix = "Matrix",
11 |         colData = "ANY",
12 |         geneData = "ANY",
13 |         fit = "ANY",
14 |         rankingMetric = "character",
15 |         effectMetric = "character",
16 |         plottingMetric = "character",
17 |         assocMethod = "character"
18 |     )
19 | )
20 | 


--------------------------------------------------------------------------------
/README.mkd:
--------------------------------------------------------------------------------
 1 | Gene Set Enrichment Analysis Made Awesome
 2 | =============================
 3 | 
 4 | Installation
 5 | -------------
 6 | 
 7 | First install the Bioconductor dependencies:
 8 | 
 9 |     source("http://bioconductor.org/biocLite.R")
10 |     biocLite(c("GO.db", "AnnotationDbi", "qvalue"))
11 | 
12 | Then install the [devtools](https://github.com/hadley/devtools) package, and use it to install GSEAMA:
13 | 
14 |     install.packages("devtools")
15 |     devtools::install_github("dgrtwo/GSEAMA")
16 | 
17 | The package vignette can be downloaded [here](https://github.com/dgrtwo/GSEAMA/blob/master/_vignettes/GSEAMA.pdf?raw=true).
18 | 


--------------------------------------------------------------------------------
/man/GenerateNetwork.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/network.R
 3 | \name{GenerateNetwork}
 4 | \alias{GenerateNetwork}
 5 | \title{Create an igraph from a GeneMatrix representing GO terms}
 6 | \usage{
 7 | GenerateNetwork(m, edges = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{m}{A GeneMatrix object}
11 | 
12 | \item{edges}{A table of edges to plot. If not given, compute with
13 | \code{\link{GetEdgesTable}}}
14 | 
15 | \item{...}{Extra arguments passed on to GetEdgesTable}
16 | }
17 | \value{
18 | An igraph object
19 | }
20 | \description{
21 | Create an igraph from a GeneMatrix representing GO terms
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/man/sparse_cast_.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utilities.R
 3 | \name{sparse_cast_}
 4 | \alias{sparse_cast_}
 5 | \title{Standard-evaluation version of sparse_cast}
 6 | \usage{
 7 | sparse_cast_(data, row_col, column_col, value_col = NULL)
 8 | }
 9 | \arguments{
10 | \item{data}{A tbl}
11 | 
12 | \item{row_col}{String version of column to use as row names}
13 | 
14 | \item{column_col}{String version of column to use as column names}
15 | 
16 | \item{value_col}{String version of column to use as sparse matrix values,
17 | or a numeric value to use}
18 | }
19 | \description{
20 | Standard-evaluation version of sparse_cast
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/find_minimal_rooted_tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treemap.R
 3 | \name{find_minimal_rooted_tree}
 4 | \alias{find_minimal_rooted_tree}
 5 | \title{Find Minimal Rooted Tree}
 6 | \usage{
 7 | find_minimal_rooted_tree(nodes, edges)
 8 | }
 9 | \arguments{
10 | \item{nodes}{data_frame significance of gene sets}
11 | 
12 | \item{edges}{data_frame go_id1 are parents of go_id2}
13 | }
14 | \value{
15 | an igraph object of the gene set minimal spanning tree
16 | }
17 | \description{
18 | Removes all loops so that gene sets can be nested within one another: creates an optimal directed acyclic graph
19 | pointing from general GO terms down to specific ones.
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/man/assign_genes_to_paths.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treemap.R
 3 | \name{assign_genes_to_paths}
 4 | \alias{assign_genes_to_paths}
 5 | \title{Assign Genes to Paths}
 6 | \usage{
 7 | assign_genes_to_paths(m, minimal_edge_set, ...)
 8 | }
 9 | \arguments{
10 | \item{m}{GeneMatrix object representing GO terms}
11 | 
12 | \item{minimal_edge_set}{a network which is a weighted directed minimal spanning tree of gene ontologies}
13 | 
14 | \item{...}{Extra arguments passed on to threshold_sets}
15 | }
16 | \value{
17 | a data_frame which contains which terminal gene set each gene is assigned to
18 | }
19 | \description{
20 | Assign genes to a GO hierarchy which has the greatest signal
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/GetGORelatives.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GOMembershipMatrix.R
 3 | \name{GetGORelatives}
 4 | \alias{GetGORelatives}
 5 | \title{Get all GO IDs in all_IDs that are ancestors/descendants of those in IDs}
 6 | \usage{
 7 | GetGORelatives(IDs, all_IDs, ancestors = TRUE, direct = FALSE,
 8 |   combine = TRUE)
 9 | }
10 | \arguments{
11 | \item{all_IDs}{A vector of GO IDs to look for relatives of x}
12 | 
13 | \item{ancestors}{If true look among ancestors, if false look for children}
14 | 
15 | \item{combine}{Whether to combine it with the sets in the IDs parameter}
16 | 
17 | \item{x}{A vector of GO IDs}
18 | }
19 | \description{
20 | Get all GO IDs in all_IDs that are ancestors/descendants of those in IDs
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/PlotNetwork.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/network.R
 3 | \name{PlotNetwork}
 4 | \alias{PlotNetwork}
 5 | \title{Plot a network from an igraph representation}
 6 | \usage{
 7 | PlotNetwork(g, algorithm = "kk", arrow = grid::arrow(length =
 8 |   grid::unit(0.1, "inches")))
 9 | }
10 | \arguments{
11 | \item{g}{An igraph object, generally computed by \link{\code{GenerateNetwork}}}
12 | 
13 | \item{algorithm}{Algorithm to use for layout}
14 | 
15 | \item{arrow}{Arrow}
16 | }
17 | \description{
18 | Return a ggplot2 object representing a network of sets.
19 | This is meant to be a sensible default for plotting
20 | GO networks, it will not cover all cases. We recommend learning
21 | to use ggraph or another igraph plotting functionality.
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: GSEAMA
 2 | Type: Package
 3 | Title: Gene set enrichment analysis made awesome
 4 | Version: 0.99.0
 5 | Date: 2016-01-21
 6 | Authors@R: c(
 7 |     person("David", "Robinson", email = "admiral.david@gmail.com", role = c("aut", "cre")),
 8 |     person("Sean", "Hackett", email = "seanmchackett@gmail.com", role = c("aut"))
 9 |     )
10 | Description: Efficient and interpretable gene set enrichment analysis methods
11 |     based on a membership matrix.
12 | VignetteBuilder: knitr
13 | Imports:
14 |     dplyr,
15 |     Matrix,
16 |     ggplot2,
17 |     AnnotationDbi,
18 |     qvalue
19 | Suggests:
20 |     glmnet,
21 |     igraph,
22 |     graph,
23 |     RBGL,
24 |     knitr,
25 |     testthat
26 | URL: http://github.com/dgrtwo/GSEAMA
27 | License: MIT + file LICENSE
28 | RoxygenNote: 5.0.1
29 | 


--------------------------------------------------------------------------------
/man/ExtractGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CompareY.R
 3 | \name{ExtractGenes}
 4 | \alias{ExtractGenes}
 5 | \title{Return all genes, along with the saved metric in each of the given sets}
 6 | \usage{
 7 | ExtractGenes(m, columns, char_limit = NULL, add_count = FALSE,
 8 |   overall = FALSE)
 9 | }
10 | \arguments{
11 | \item{m}{a GeneMatrix object}
12 | 
13 | \item{columns}{vector of columns (e.g. gene sets) whose genes should be extracted}
14 | 
15 | \item{char_limit}{optionally shorten the Term field to the desired length}
16 | 
17 | \item{add_count}{whether the count within each set should be added to the
18 | Term field (like "Metabolism (360)")}
19 | }
20 | \description{
21 | Return all genes, along with the saved metric in each of the given sets
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/man/vectorized_wilcoxon_test.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/vectorized_tests.R
 3 | \name{vectorized_wilcoxon_test}
 4 | \alias{vectorized_wilcoxon_test}
 5 | \title{Perform a Wilcoxon rank-sum test comparing a metric to each column of a matrix}
 6 | \usage{
 7 | vectorized_wilcoxon_test(m, y, alternative = "two.sided", tbl = FALSE)
 8 | }
 9 | \arguments{
10 | \item{m}{binary matrix}
11 | 
12 | \item{y}{numeric vector to test each column of m against}
13 | 
14 | \item{alternative}{whether to use a one-sided test, and if so which way}
15 | 
16 | \item{tbl}{whether to return a data.frame rather than a vector}
17 | }
18 | \description{
19 | This function is similar to \code{apply(m, 2, function(col) wilcox.test(y ~ col)$p.value)},
20 | but is vectorized to make it much faster
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/get_ancestry_matrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GOMembershipMatrix.R
 3 | \name{get_ancestry_matrix}
 4 | \alias{get_ancestry_matrix}
 5 | \title{Build an offspring matrix of GO terms}
 6 | \usage{
 7 | get_ancestry_matrix(terms, ontology = c("BP", "MF", "CC"),
 8 |   type = "OFFSPRING", upward = TRUE, tbl = FALSE)
 9 | }
10 | \arguments{
11 | \item{terms}{IDs of GO terms that should be included in the ancestry matrix}
12 | 
13 | \item{ontology}{Ontologies to use}
14 | 
15 | \item{type}{Either OFFSPRING (default), CHILDREN, ANCESTOR, or PARENT}
16 | }
17 | \description{
18 | A sparse binary Matrix object with one row and column for each pair
19 | of GO terms provided, where each row represents an ancestor and each column
20 | represents a descendant, with 1 marking ancestor/descendant pairs.
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/CompareTopColumns.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CompareY.R
 3 | \name{CompareTopColumns}
 4 | \alias{CompareTopColumns}
 5 | \title{Create a graph comparing the top n columns (in terms of association with y)}
 6 | \usage{
 7 | CompareTopColumns(m, n = 9, ...)
 8 | }
 9 | \arguments{
10 | \item{m}{GeneMatrix object}
11 | 
12 | \item{n}{Number of genes to compare}
13 | 
14 | \item{...}{Additional arguments to be passed to CompareY}
15 | }
16 | \description{
17 | Plot a bocplot, violin plot, or faceted histogram with the distributions
18 | of y within the most associated gene sets.
19 | }
20 | \details{
21 | Most methods ("t.test", "wilcoxon", "hypergeometric") use the
22 | p-value to find the most significant columns. The "lasso" uses the order
23 | in which the terms were added to the regression
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(CompareTopColumns)
 4 | export(CompareY)
 5 | export(ExtractGenes)
 6 | export(GOMembershipMatrix)
 7 | export(GenerateNetwork)
 8 | export(GenerateTreemap)
 9 | export(GetEdgesTable)
10 | export(PlotNetwork)
11 | export(TestAssociation)
12 | export(get_ancestry_matrix)
13 | export(mean_difference)
14 | export(trim_ellipses)
15 | export(vectorized_t_test)
16 | export(vectorized_wilcoxon_test)
17 | import(AnnotationDbi)
18 | import(GO.db)
19 | import(Matrix)
20 | import(ggplot2)
21 | import(ggraph)
22 | importFrom(dplyr,"%>%")
23 | importFrom(dplyr,arrange)
24 | importFrom(dplyr,data_frame)
25 | importFrom(dplyr,distinct_)
26 | importFrom(dplyr,do)
27 | importFrom(dplyr,filter)
28 | importFrom(dplyr,group_by)
29 | importFrom(dplyr,inner_join)
30 | importFrom(dplyr,mutate)
31 | importFrom(dplyr,tbl_df)
32 | 


--------------------------------------------------------------------------------
/man/vectorized_t_test.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/vectorized_tests.R
 3 | \name{vectorized_t_test}
 4 | \alias{vectorized_t_test}
 5 | \title{Perform a Student's T-test comparing a metric to each column of a matrix}
 6 | \usage{
 7 | vectorized_t_test(m, y, var.equal = FALSE, alternative = "two.sided",
 8 |   tbl = FALSE)
 9 | }
10 | \arguments{
11 | \item{m}{binary matrix}
12 | 
13 | \item{y}{numeric vector to test each column of m against}
14 | 
15 | \item{var.equal}{assume that the variances are equal}
16 | 
17 | \item{alternative}{whether to use a one-sided test, and if so which way}
18 | 
19 | \item{tbl}{Whether to return as a data frame}
20 | }
21 | \description{
22 | This function is similar to \code{apply(m, 2, function(col) t.test(y ~ col)$p.value)},
23 | but is vectorized to make it much faster
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/man/GetEdgesTable.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/network.R
 3 | \name{GetEdgesTable}
 4 | \alias{GetEdgesTable}
 5 | \title{Create a data frame of edges from a GO term matrix}
 6 | \usage{
 7 | GetEdgesTable(m, sets = NULL, ancestors = TRUE, ontology = c("BP", "MF",
 8 |   "CC"), ...)
 9 | }
10 | \arguments{
11 | \item{m}{GeneMatrix object representing GO terms}
12 | 
13 | \item{sets}{Gene sets to use in the network. If NULL, use
14 | \code{\link{ThresholdSets}} to find them}
15 | 
16 | \item{ancestors}{Whether to include all ancestors of significant
17 | in the matrix}
18 | 
19 | \item{ontology}{Which of BP, MF, and CC to use (default all three)}
20 | 
21 | \item{...}{Extra arguments passed on to threshold_sets}
22 | }
23 | \value{
24 | An igraph
25 | }
26 | \description{
27 | Create a data frame of edges from a GO term matrix
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/man/CompareY.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CompareY.R
 3 | \name{CompareY}
 4 | \alias{CompareY}
 5 | \title{Create a violin plot, boxplot or histogram looking at the metric of interest
 6 | over one or more columns}
 7 | \usage{
 8 | CompareY(m, columns, mode = "boxplot", char_limit = 35)
 9 | }
10 | \arguments{
11 | \item{m}{a GeneMatrix}
12 | 
13 | \item{columns}{The ID of one or more columns to plot}
14 | 
15 | \item{mode}{Type of plot to create: either "violin", "boxplot" or "histogram"}
16 | 
17 | \item{char_limit}{Maximum number of characters to include in column names}
18 | }
19 | \description{
20 | This compares the metric of interest (y) across the given genes, as well as
21 | comparing it to the overall distribution. This shows a visual representation
22 | of how a column (a gene set, a motif, a transcription factor, etc) is associated
23 | with the metric.
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/man/ThresholdSets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TestAssociation.R
 3 | \name{ThresholdSets}
 4 | \alias{ThresholdSets}
 5 | \title{Return a vector of genes that pass a threshold for includions}
 6 | \usage{
 7 | ThresholdSets(m, alpha = 0.05, method = "fdr", ...)
 8 | }
 9 | \arguments{
10 | \item{m}{GeneMatrix object}
11 | 
12 | \item{alpha}{Threshold for (corrected) p-values, not used in LASSO}
13 | 
14 | \item{method}{Method, passed to \code{\link{p.adjust}}, or "qvalue"
15 | to use qvalue. Not used in LASSO}
16 | 
17 | \item{...}{Extra arguments to pass on to correction method}
18 | }
19 | \description{
20 | Return a vector of genes that pass a threshold of "significance"
21 | or other kind of inclusion. For tests with a p-value (t-test,
22 | Wilcoxon, hypergeometric), this is FDR-controlled p-values.
23 | For LASSO, this is all cases where beta1sd != 0.
24 | }
25 | \seealso{
26 | \link{p.adjust}, \link{p.adjust.methods}
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/man/sparse_cast.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utilities.R
 3 | \name{sparse_cast}
 4 | \alias{sparse_cast}
 5 | \title{Create a sparse matrix from row names, column names, and values
 6 | in a table.}
 7 | \usage{
 8 | sparse_cast(data, row, column, value)
 9 | }
10 | \arguments{
11 | \item{data}{A tbl}
12 | 
13 | \item{row}{A bare column name to use as row names in sparse matrix}
14 | 
15 | \item{column}{A bare column name to use as column names in sparse matrix}
16 | 
17 | \item{value}{A bare column name to use as sparse matrix values, default 1}
18 | }
19 | \value{
20 | A sparse Matrix object, with one row for each unique value in
21 | the \code{row} column, one column for each unique value in the \code{column}
22 | column, and with as many non-zero values as there are rows in \code{data}.
23 | }
24 | \description{
25 | Create a sparse matrix from row names, column names, and values
26 | in a table.
27 | }
28 | \examples{
29 | 
30 | dat <- data.frame(a = c("row1", "row1", "row2", "row2", "row2"),
31 |   b = c("col1", "col2", "col1", "col3", "col4"),
32 |   val = 2)
33 | 
34 | sparse_cast(dat, a, b)
35 | 
36 | sparse_cast(dat, a, b, val)
37 | 
38 | }
39 | 
40 | 


--------------------------------------------------------------------------------
/man/GenerateTreemap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/treemap.R
 3 | \name{GenerateTreemap}
 4 | \alias{GenerateTreemap}
 5 | \title{Generate Treemap}
 6 | \usage{
 7 | GenerateTreemap(m, edges = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{m}{GeneMatrix object representing GO terms}
11 | 
12 | \item{...}{Extra arguments passed on to threshold_sets}
13 | }
14 | \value{
15 | Generates a treemap hierarchy
16 | Treemap GO Plot
17 | 
18 | Generate a treemap plot using the minimal spanning GO tree with genes assigned to individual GO terms.
19 | 
20 | Default color based on GO significance and effect sizes of individual genes
21 | }
22 | \description{
23 | Visualize genes nested within specific and general GO sets using treemaps
24 | }
25 | \details{
26 | Each gene should maximize the absolute sum of relevant node weights but every GO category will
27 | not be included since a gene can only be assigned to a single ancestry. A small GO category may
28 | fall into multiple ancestral paths some which are parents of other meaningful categories and others
29 | that are dead-ends.  Should favor attachment to pathways with the most informative children.
30 | To allow this to occur, three edge weights are used
31 | 1) the path with the highest overall absolute score * number of genes affected is chosen
32 | 2) ancestors of LASSO nodes all get a smaller score: [min(abs(beta))]*0.01*N_lasso_children
33 | - applied to edge where ancestors are children
34 | 3) all remaining edges that are not predictive recieve a tiny weight so that they are not filtered: [min(abs(beta))]*1e-4
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/man/TestAssociation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TestAssociation.R
 3 | \name{TestAssociation}
 4 | \alias{TestAssociation}
 5 | \title{Test association of a per-gene attribute with each column in a GeneMatrix}
 6 | \usage{
 7 | TestAssociation(m, genes, y, method = "lasso", ...)
 8 | }
 9 | \arguments{
10 | \item{m}{A GeneMatrix object}
11 | 
12 | \item{genes}{A vector of gene names (must match the gene names in the GeneMatrix)}
13 | 
14 | \item{y}{A per-gene metric of the same length as the vector of gene names}
15 | 
16 | \item{method}{Method used to test association between the per-gene metric and
17 | each column - either c("lasso", "t.test", "wilcoxon" or "hypergeometric")}
18 | 
19 | \item{...}{Additional arguments that will be given to the test in question}
20 | }
21 | \description{
22 | Test the association of a per-gene attribute y with each column, where a column
23 | can represent a gene set, a motif, transcription factor targets, or other genes
24 | that are functionally related
25 | }
26 | \examples{
27 | 
28 | library(org.Sc.sgd.db)
29 | 
30 | n_genes <- 200
31 | genes <- sample(mappedkeys(org.Sc.sgdGO), n_genes)
32 | 
33 | mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5,
34 |                        max_size = 50, chosen_genes = genes)
35 | # some genes are dropped because they do not have GO categories
36 | 
37 | n_genes <- nrow(mm)
38 | genes <- mm@geneData$ID
39 | 
40 | beta = c(20:1, rep(0, times = nrow(mm@colData)-20))
41 | y = c(as.matrix(mm) \%*\% beta + rnorm(n_genes, 0, 2))
42 | 
43 | results <- TestAssociation(mm, genes, y)
44 | View(results@colData)
45 | 
46 | }
47 | 
48 | 


--------------------------------------------------------------------------------
/inst/tests/test-wilcoxon.R:
--------------------------------------------------------------------------------
 1 | test_that("Wilcoxon rank-sum test generates correct p-values", {
 2 |     data(factorial)
 3 |     mm = MembershipMatrix(organism = "Sc.sgd", ontology = "BP", min.size = 5, max.size = 250)
 4 |     
 5 |     # perform three hypothesis tests, one for each alternative
 6 |     for (ah in c("two.sided", "greater", "less")) {
 7 |         wilcoxon.mm = TestEnrichment(mm, factorial$ORF, factorial$RNA.Seq.logFC, method = "wilcoxon", alternative=ah)
 8 |         
 9 |         # check that it is internally consistent
10 |         expect_that(NROW(wilcoxon.mm@setData), equals(NCOL(wilcoxon.mm@matrix)))
11 |         expect_that(NROW(wilcoxon.mm@geneData), equals(NROW(wilcoxon.mm@matrix)))
12 |         
13 |         # check the first 6 and the last 6 pvalues
14 |         check.vals = as.numeric(c(1:6, tail(seq_along(wilcoxon.mm@setData$pvalue))))
15 |         for (ind in check.vals) {
16 |             b = (wilcoxon.mm@matrix[, ind] > 0)
17 |             manual.pval = wilcox.test(wilcoxon.mm@geneData$y[b], wilcoxon.mm@geneData$y[!b], alternative=ah)$p.value
18 |             expect_that(wilcoxon.mm@setData$pvalue[ind], equals(manual.pval))
19 |             # check that hypothesis test means what we want it to
20 |             r = rank(wilcoxon.mm@geneData$y)
21 |             members.greater = mean(r[b]) > mean(r[!b])
22 |             if (ah == "greater") {
23 |                 expect_that(wilcoxon.mm@setData$pvalue[ind] < .5,
24 |                             equals(members.greater))
25 |             }
26 |             if (ah == "less") {
27 |                 expect_that(wilcoxon.mm@setData$pvalue[ind] < .5,
28 |                             equals(!members.greater))
29 |             }
30 |         }    
31 |     }
32 | })
33 | 
34 | 


--------------------------------------------------------------------------------
/man/GOMembershipMatrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GOMembershipMatrix.R
 3 | \name{GOMembershipMatrix}
 4 | \alias{GOMembershipMatrix}
 5 | \title{Construct a new gene set membership matrix}
 6 | \usage{
 7 | GOMembershipMatrix(annotations, ontology = c("BP", "MF", "CC"),
 8 |   evidence = NULL, min_size = 1, max_size = Inf, ancestors = TRUE,
 9 |   chosen_genes = NULL)
10 | }
11 | \arguments{
12 | \item{annotations}{A Go3AnnDbBimap object, typically from a Bioconductor Annotation
13 | package. For example, org.Hs.egGO from org.Hs.eg.db for human genes.}
14 | 
15 | \item{ontology}{A vector of ontologies to include; should be a subset
16 | of c("BP", "MF", "CC")}
17 | 
18 | \item{evidence}{A vector of GO evidence codes to include}
19 | 
20 | \item{min_size}{Minimum size of a gene set to be included}
21 | 
22 | \item{max_size}{Maximum size of a gene set to be included}
23 | 
24 | \item{ancestors}{Whether a gene included in a gene set should also be included
25 | as being in all ancestors of that gene set (default TRUE)}
26 | 
27 | \item{chosen_genes}{If provided, restricts genes to those in chosen_genes
28 | 
29 | The GO annotation maps typically come from the Bioconductor AnnotationDB packages.
30 | A couple of notable examples are:
31 | 
32 | Homo sapiens: org.Hs.egGO
33 | S. cerevisiae: org.Sc.sgdGO
34 | E coli K12: org.EcK12.egGO
35 | 
36 | You can restrict the sets to the BP (Biological Process), MF (Molecular Function),
37 | or CC (Cellular Compartment) ontologies (by default all are included).}
38 | }
39 | \description{
40 | Construct a gene set membership matrix (one row per gene, one column per
41 | gene set, with 1 indicating membership) from a GO annotation map.
42 | }
43 | \examples{
44 | 
45 | # yeast membership matrix
46 | library(org.Sc.sgd.db)
47 | mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5, max_size = 250)
48 | 
49 | # restrict to Biological Process ontology:
50 | mm <- GOMembershipMatrix(org.Sc.sgdGO, ontology = "BP", min_size = 5, max_size = 250)
51 | 
52 | # human membership matrix
53 | library(org.Hs.eg.db)
54 | mm <- GOMembershipMatrix(org.Hs.sgdGO, min_size = 5, max_size = 250)
55 | 
56 | }
57 | 
58 | 


--------------------------------------------------------------------------------
/R/GeneMatrix-methods.R:
--------------------------------------------------------------------------------
 1 | # methods for working with a GeneMatrix
 2 | 
 3 | setMethod("dim", "GeneMatrix", function(x) dim(x@matrix))
 4 | setMethod("nrow", "GeneMatrix", function(x) nrow(x@matrix))
 5 | setMethod("ncol", "GeneMatrix", function(x) ncol(x@matrix))
 6 | setMethod("rownames", "GeneMatrix", function(x) rownames(x@matrix))
 7 | setMethod("colnames", "GeneMatrix", function(x) colnames(x@matrix))
 8 | setMethod("as.matrix", "GeneMatrix", function(x, ...) as.matrix(x@matrix))
 9 | 
10 | 
11 | setMethod("print", signature(x = "GeneMatrix"), function(x) {
12 |   cat(paste("Gene Matrix with", nrow(x), "genes and", ncol(x), "columns"))
13 | })
14 | 
15 | 
16 | setMethod("show", signature(object = "GeneMatrix"), function(object) {
17 |   print(object)
18 | })
19 | 
20 | 
21 | setMethod("[", c("GeneMatrix", "ANY", "ANY", "ANY"),
22 |           function(x, i, j, ..., drop = TRUE)
23 |           {
24 |             if (!is.null(x@fit)) {
25 |               warning("Subsetting a model that has already been tested")        
26 |             }
27 |             
28 |             # subset the matrix
29 |             if (missing(i)) {
30 |               i <- seq_len(nrow(x@matrix))
31 |             }
32 |             if (missing(j)) {
33 |               j <- seq_len(ncol(x@matrix))
34 |             }
35 |             
36 |             mat <- x@matrix[i, j]
37 |             if (length(i) == 1) {
38 |               mat <- Matrix(mat, nrow = 1)
39 |             } else if (length(j) == 1) {
40 |               mat <- Matrix(mat, ncol = 1)
41 |             }
42 |             x@matrix <- mat
43 |             
44 |             # subset gene and column data
45 |             if (!is.character(i)) {
46 |               x@geneData = x@geneData[i, ]
47 |             } else {
48 |               x@geneData = x@geneData[match(i, x@geneData$ID), ]
49 |             }
50 |             
51 |             if (!is.character(j)) {
52 |               x@colData = x@colData[j, ]
53 |             } else {
54 |               x@colData = x@colData[match(j, x@colData$ID)]
55 |             }    
56 |             
57 |             x@geneData$Size = rowSums(x@matrix != 0)
58 |             x@colData$Size = colSums(x@matrix != 0)
59 |             stopifnot(all(rownames(x@matrix) == x@geneData$ID))
60 |             stopifnot(all(colnames(x@matrix) == x@colData$ID))
61 |             x
62 |           })
63 | 


--------------------------------------------------------------------------------
/R/utilities.R:
--------------------------------------------------------------------------------
 1 | #' Standard-evaluation version of sparse_cast
 2 | #'
 3 | #' @param data A tbl
 4 | #' @param row_col String version of column to use as row names
 5 | #' @param column_col String version of column to use as column names
 6 | #' @param value_col String version of column to use as sparse matrix values,
 7 | #' or a numeric value to use
 8 | #'
 9 | #' @import Matrix
10 | sparse_cast_ <- function(data, row_col, column_col, value_col = NULL) {
11 |   row_names <- data[[row_col]]
12 |   col_names <- data[[column_col]]
13 |   if (is.numeric(value_col)) {
14 |     values <- value_col
15 |   } else {
16 |     values <- data[[value_col]]
17 |   }
18 |   
19 |   # if it's a factor, preserve ordering
20 |   row_u <- if (is.factor(row_names)) levels(row_names) else unique(row_names)
21 |   col_u <- if (is.factor(col_names)) levels(col_names) else unique(col_names)
22 |   
23 |   ret <- Matrix(0, nrow = length(row_u), ncol = length(col_u),
24 |                 dimnames = list(as.character(row_u), as.character(col_u)),
25 |                 sparse = TRUE)
26 |   
27 |   ret[cbind(match(row_names, row_u), match(col_names, col_u))] <- values
28 |   
29 |   ret
30 | }
31 | 
32 | 
33 | #' Create a sparse matrix from row names, column names, and values
34 | #' in a table.
35 | #'
36 | #' @param data A tbl
37 | #' @param row A bare column name to use as row names in sparse matrix
38 | #' @param column A bare column name to use as column names in sparse matrix
39 | #' @param value A bare column name to use as sparse matrix values, default 1
40 | #'
41 | #' @return A sparse Matrix object, with one row for each unique value in
42 | #' the \code{row} column, one column for each unique value in the \code{column}
43 | #' column, and with as many non-zero values as there are rows in \code{data}.
44 | #'
45 | #' @examples
46 | #'
47 | #' dat <- data.frame(a = c("row1", "row1", "row2", "row2", "row2"),
48 | #'   b = c("col1", "col2", "col1", "col3", "col4"),
49 | #'   val = 2)
50 | #'
51 | #' sparse_cast(dat, a, b)
52 | #'
53 | #' sparse_cast(dat, a, b, val)
54 | #'
55 | #' @name sparse_cast
56 | sparse_cast <- function(data, row, column, value) {
57 |   if (missing(value)) {
58 |     value_col <- 1
59 |   } else {
60 |     value_col <- as.character(substitute(value))
61 |     if (is.null(data[[value_col]])) {
62 |       value_col <- value
63 |     }
64 |   }
65 |   
66 |   sparse_cast_(data, as.character(substitute(row)),
67 |                as.character(substitute(column)), value_col)
68 | }
69 | 
70 | 
71 | #' Trim the length of a character vector, adding ellipses to trimmed elements
72 | #' 
73 | #' This utility function is used to shorten column names so they can appear
74 | #' on a figure.
75 | #' 
76 | #' @param x character vector
77 | #' @param width length of string to trim to
78 | #' 
79 | #' @export
80 | trim_ellipses <- function(x, width) {
81 |   ret <- stringr::str_sub(x, 1, width)
82 |   toolong <- stringr::str_length(x) > width
83 |   ret[toolong] <- stringr::str_c(ret[toolong], "...")
84 |   ret
85 | }
86 | 


--------------------------------------------------------------------------------
/tests/testthat/test-GSEAL.R:
--------------------------------------------------------------------------------
 1 | ### setup ###
 2 | 
 3 | library(org.Sc.sgd.db)
 4 | 
 5 | # shared matrix for general use
 6 | m = GOMembershipMatrix(org.Sc.sgdGO)
 7 | 
 8 | consistency.check = function(m) {
 9 |     expect_equal(rowSums(m@matrix != 0), m@geneData$Count)
10 |     expect_equal(colSums(m@matrix != 0), m@colData$Count)
11 | }
12 | 
13 | context("GO membership matrices")
14 | 
15 | test_that("We can create a gene set membership matrix", {
16 |     mm = GOMembershipMatrix(org.Sc.sgdGO)
17 |     consistency.check(mm)
18 |     # check that it contains only 0s and ones
19 |     expect_equal(unique(c(as.matrix(mm))), c(0, 1))
20 | })
21 | 
22 | test_that("min.size and max.size arguments of GOMembershipMatrix work", {
23 |     mm = GOMembershipMatrix(org.Sc.sgdGO, min.size=5, max.size=200)
24 |     consistency.check(mm)
25 |     expect_more_than(min(colSums(mm@matrix)), 4)
26 |     expect_less_than(max(rowSums(mm@matrix)), 201)
27 | })
28 | 
29 | context("Subsetting a matrix")
30 | 
31 | test_that("Subsetting a gene matrix leads to correct dimensions", {
32 |     consistency.check(m[2:4, ])
33 |     consistency.check(m[, 50:55])
34 |     consistency.check(m[20:25, 40:50])
35 |     
36 |     expect_equal(dim(m[2:4, ]), c(3, ncol(m)))
37 |     expect_equal(dim(m[, 6:10]), c(nrow(m), 5))
38 |     expect_equal(dim(m[11:50, 20:21]), c(40, 2))
39 | 
40 |     expect_equal(nrow(m[, 5:7]@colData), 3)
41 |     expect_equal(nrow(m[, 5:7]@geneData), nrow(m@geneData))
42 |     expect_equal(nrow(m[16:20, ]@geneData), 5)
43 |     expect_equal(nrow(m[16:20, ]@colData), nrow(m@colData))
44 | })
45 | 
46 | test_that("Subsetting a matrix with a numeric vector gets correct rows", {
47 |     # todo
48 | })
49 | 
50 | test_that("Subsetting a matrix with a logical vector gets correct rows", {
51 |     # todo
52 | })
53 | 
54 | context("Association testing")
55 | 
56 | test_that("Wilcoxon rank sum association testing calculates correct p-values", {
57 |     # create a random trait to test
58 |     trait = rnorm(nrow(m))
59 | 
60 |     # test in all three alternative hypotheses
61 |     testers = sample(which(colSums(m@matrix) > 10), 6)
62 |     for (alternative in c("two.sided", "less", "greater")) {
63 |         m = TestAssociation(m, rownames(m), trait, method="wilcoxon",
64 |                             alternative=alternative)
65 |         manual.tests = apply(as.matrix(m@matrix)[, testers], 2, function(col) {
66 |             wilcox.test(trait[col != 0], trait[col == 0], alternative=alternative)$p.value
67 |         })
68 |         expect_equal(m@colData$pvalue[testers], unname(manual.tests))
69 |     }
70 | })
71 | 
72 | test_that("T-test association testing calculates correct p-values", {
73 |     # todo: check that it is close enough, may not be exact
74 | 
75 |     # create a random trait to test
76 |     trait = rnorm(nrow(m))
77 |     
78 |     testers = sample(which(colSums(m@matrix) > 10), 3)
79 |     # test in all three alternative hypotheses
80 |     for (alternative in c("two.sided", "less", "greater")) {
81 |         m = TestAssociation(m, rownames(m), trait, method="t.test",
82 |                             alternative=alternative)
83 |         manual.tests = apply(as.matrix(m@matrix)[, testers], 2, function(col) {
84 |             t.test(trait[col != 0], trait[col == 0], alternative=alternative)$p.value
85 |         })
86 |         expect_equal(m@colData$pvalue[testers], unname(manual.tests))
87 |     }
88 | })
89 | 
90 | test_that("Hypergeometric association testing calculates correct p-values", {
91 |     # todo
92 | })
93 | 


--------------------------------------------------------------------------------
/R/network.R:
--------------------------------------------------------------------------------
  1 | #' Create a data frame of edges from a GO term matrix
  2 | #' 
  3 | #' @param m GeneMatrix object representing GO terms
  4 | #' @param sets Gene sets to use in the network. If NULL, use
  5 | #' \code{\link{ThresholdSets}} to find them
  6 | #' @param ancestors Whether to include all ancestors of significant
  7 | #' in the matrix
  8 | #' @param ontology Which of BP, MF, and CC to use (default all three)
  9 | #' @param ... Extra arguments passed on to threshold_sets
 10 | #' 
 11 | #' @return An igraph
 12 | #' 
 13 | #' @export
 14 | GetEdgesTable <- function(m, sets = NULL, ancestors = TRUE,
 15 |                           ontology = c("BP", "MF", "CC"), ...) {
 16 |   # include ancestors
 17 |   if (is.null(sets)) {
 18 |     sets <- ThresholdSets(m, ...)
 19 |   }
 20 |   if (length(sets) == 0) {
 21 |     stop("No thresholded sets to include in network")
 22 |   }
 23 |   
 24 |   if (ancestors) {
 25 |     sets <- GetGORelatives(sets, m@colData$ID) 
 26 |   }
 27 |   
 28 |   go_terms <- m@colData %>%
 29 |     dplyr::filter(ID %in% sets) %>%
 30 |     dplyr::filter(Ontology %in% ontology)
 31 |   
 32 |   # get direct edges going downward to create the graph, and combine with column data
 33 |   edges <- get_ancestry_matrix(go_terms$ID, tbl = TRUE, type = "CHILDREN", upward = FALSE) %>%
 34 |     dplyr::mutate(go_id1 = as.character(go_id1)) %>%
 35 |     dplyr::mutate(go_id2 = as.character(go_id2)) %>%
 36 |     dplyr::left_join(dplyr::select(go_terms, ID, Ontology), by = c(go_id1 = "ID")) %>%
 37 |     dplyr::tbl_df()
 38 |   
 39 |   edges
 40 | }
 41 | 
 42 | 
 43 | #' Create an igraph from a GeneMatrix representing GO terms
 44 | #' 
 45 | #' @param m A GeneMatrix object
 46 | #' @param edges A table of edges to plot. If not given, compute with
 47 | #' \code{\link{GetEdgesTable}}
 48 | #' @param ... Extra arguments passed on to GetEdgesTable
 49 | #' 
 50 | #' @return An igraph object
 51 | #' 
 52 | #' @export
 53 | GenerateNetwork <- function(m, edges = NULL, ...) {
 54 |   if (is.null(edges)) {
 55 |     edges <- GetEdgesTable(m, ...)
 56 |   }
 57 |   
 58 |   g <- igraph::graph.data.frame(edges)
 59 |   
 60 |   go_terms <- m@colData %>%
 61 |     filter(ID %in% c(edges$go_id1, edges$go_id2))
 62 |   
 63 |   go_node_data <- data.frame(ID = names(igraph::V(g)), stringsAsFactors = FALSE) %>%
 64 |     inner_join(go_terms, by = "ID")
 65 |   
 66 |   # transfer column information from nodes and edges
 67 |   for (col in colnames(go_node_data)) {
 68 |     # a hack, but necessary since V(g)[[col]] doesn't work
 69 |     eval(substitute(igraph::V(g)$replace <- go_node_data[[col]], list(replace = col)))
 70 |   }
 71 | 
 72 |   for (col in colnames(edges)) {
 73 |     eval(substitute(igraph::E(g)$replace <- edges[[col]], list(replace = col)))
 74 |   }
 75 |   
 76 |   g
 77 | }
 78 | 
 79 | 
 80 | #' Plot a network from an igraph representation
 81 | #' 
 82 | #' Return a ggplot2 object representing a network of sets.
 83 | #' This is meant to be a sensible default for plotting
 84 | #' GO networks, it will not cover all cases. We recommend learning
 85 | #' to use ggraph or another igraph plotting functionality.
 86 | #' 
 87 | #' @param g An igraph object, generally computed by \link{\code{GenerateNetwork}}
 88 | #' @param algorithm Algorithm to use for layout
 89 | #' @param arrow Arrow 
 90 | #' 
 91 | #' @import ggplot2
 92 | #' @import ggraph
 93 | #' 
 94 | #' @export
 95 | PlotNetwork <- function(g, algorithm = 'kk', arrow = grid::arrow(length = grid::unit(.1, "inches"))) {
 96 |   ggraph(g, 'igraph', algorithm = algorithm) +
 97 |     geom_edge_link(arrow = arrow) +
 98 |     geom_node_point(aes(size = Size)) +
 99 |     geom_node_point(aes(color = MeanDifference, size = Size)) +
100 |     geom_node_text(aes(label = Term), check_overlap = TRUE, size = 3) +
101 |     ggforce::theme_no_axes() +
102 |     scale_colour_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
103 |                            space = "Lab", na.value = "grey50", guide = "colourbar")
104 | }


--------------------------------------------------------------------------------
/R/vectorized_tests.R:
--------------------------------------------------------------------------------
  1 | #' Perform a Student's T-test comparing a metric to each column of a matrix
  2 | #'
  3 | #' This function is similar to \code{apply(m, 2, function(col) t.test(y ~ col)$p.value)},
  4 | #' but is vectorized to make it much faster
  5 | #'
  6 | #' @param m binary matrix
  7 | #' @param y numeric vector to test each column of m against
  8 | #' @param var.equal assume that the variances are equal
  9 | #' @param alternative whether to use a one-sided test, and if so which way
 10 | #' @param tbl Whether to return as a data frame
 11 | #'
 12 | #' @export
 13 | vectorized_t_test <- function(m, y, var.equal=FALSE, alternative="two.sided",
 14 |                               tbl = FALSE) {
 15 |   stopifnot(NROW(m) == length(y))
 16 |   
 17 |   in.m <- y * m
 18 |   out.m <- y * (1 - m)
 19 |   n.in <- colSums(m)
 20 |   n.out <- NROW(m) - n.in
 21 |   
 22 |   in.mu <- colSums(in.m) / n.in
 23 |   out.mu <- colSums(out.m) / n.out
 24 |   
 25 |   v.in <- colSums((in.m - t(t(m) * in.mu)) ^ 2) / (n.in - 1)
 26 |   v.out <- colSums((out.m - t(t(m) * out.mu)) ^ 2) / (n.out - 1)
 27 |   
 28 |   if (var.equal) {
 29 |     df <- NROW(m) - 2
 30 |     v <- (n.in - 1) * v.in + (n.out - 1) * v.out
 31 |     v <- v / df
 32 |     stderr <- sqrt(v*(1 / n.in + 1 / n.out))
 33 |   }
 34 |   else {
 35 |     stderr.in <- sqrt(v.in/n.in)
 36 |     stderr.out <- sqrt(v.out/n.out)
 37 |     stderr <- sqrt(stderr.in ^ 2 + stderr.out ^ 2)
 38 |     df <- stderr ^ 4 / (stderr.in ^ 4 / (n.in - 1) + stderr.out ^ 4 / (n.out - 1))
 39 |   }
 40 |   
 41 |   tstat <- (in.mu - out.mu) / stderr
 42 |   
 43 |   PVAL <- switch(alternative, less = pt(tstat, df),
 44 |                  greater = pt(tstat, df, lower.tail = FALSE),
 45 |                  two.sided = 2 * pt(-abs(tstat), df))
 46 | 
 47 |   if (tbl) {
 48 |     ret <- data.frame(column = colnames(m),
 49 |                       p.value = PVAL,
 50 |                       estimate = in.mu - out.mu)
 51 |   } else {
 52 |     # name them, if the matrix has names
 53 |     names(PVAL) <- colnames(m)
 54 |     PVAL
 55 |   }
 56 | }
 57 | 
 58 | 
 59 | #' Perform a Wilcoxon rank-sum test comparing a metric to each column of a matrix
 60 | #'
 61 | #' This function is similar to \code{apply(m, 2, function(col) wilcox.test(y ~ col)$p.value)},
 62 | #' but is vectorized to make it much faster
 63 | #'
 64 | #' @param m binary matrix
 65 | #' @param y numeric vector to test each column of m against
 66 | #' @param alternative whether to use a one-sided test, and if so which way
 67 | #' @param tbl whether to return a data.frame rather than a vector
 68 | #'
 69 | #' @export
 70 | vectorized_wilcoxon_test = function(m, y, alternative="two.sided", tbl = FALSE) {
 71 |   # given a boolean matrix and a vector y, apply the wilcoxon test to see
 72 |   # if y depends on each column of the matrix, returning a vector of
 73 |   # p-values
 74 |   stopifnot(NROW(m) == length(y))
 75 |   
 76 |   rk = rank(y)
 77 |   n.x = colSums(m)
 78 |   n.y = NROW(m) - n.x
 79 |   
 80 |   STATISTIC = colSums(rk * m) - n.x * (n.x + 1) / 2
 81 |   NTIES = table(rk)
 82 |   
 83 |   z <- STATISTIC - n.x * n.y / 2
 84 |   
 85 |   CORRECTION <- switch(alternative,
 86 |                        "two.sided" = sign(z) * 0.5,
 87 |                        "greater" = 0.5,
 88 |                        "less" = -0.5)
 89 |   
 90 |   SIGMA <- sqrt((n.x * n.y / 12) *
 91 |                   ((n.x + n.y + 1)
 92 |                    - sum(NTIES ^ 3 - NTIES)
 93 |                    / ((n.x + n.y) * (n.x + n.y - 1))))
 94 |   z <- (z - CORRECTION) / SIGMA
 95 |   
 96 |   PVAL <- switch(alternative,
 97 |                  "less" = pnorm(z),
 98 |                  "greater" = pnorm(z, lower.tail = FALSE),
 99 |                  "two.sided" = 2 * pmin(pnorm(z),
100 |                                         pnorm(z, lower.tail = FALSE)))
101 |   
102 |   if (tbl) {
103 |     # return as a tidy table
104 |     auc <- STATISTIC / (n.x * n.y)
105 |     data.frame(column = colnames(m), p.value = PVAL, auc = auc)
106 |   } else {
107 |     # name them, if the matrix has names
108 |     names(PVAL) <- colnames(m)
109 |     PVAL
110 |   }
111 | }
112 | 


--------------------------------------------------------------------------------
/R/CompareY.R:
--------------------------------------------------------------------------------
  1 | #' Return all genes, along with the saved metric in each of the given sets
  2 | #' 
  3 | #' @param m a GeneMatrix object
  4 | #' @param columns vector of columns (e.g. gene sets) whose genes should be extracted
  5 | #' @param char_limit optionally shorten the Term field to the desired length
  6 | #' @param add_count whether the count within each set should be added to the
  7 | #' Term field (like "Metabolism (360)")
  8 | #' 
  9 | #' @importFrom dplyr inner_join do
 10 | #' 
 11 | #' @export
 12 | ExtractGenes <- function(m, columns, char_limit = NULL, add_count = FALSE,
 13 |                          overall = FALSE) {
 14 |     # repeat gene data, once for each column
 15 |     dat <- data_frame(Set = columns) %>%
 16 |       group_by(Set) %>%
 17 |       do(m@geneData)
 18 | 
 19 |     dat$Present <- c(as.matrix(m@matrix[, columns]) > 0)
 20 |     
 21 |     dat <- dat %>%
 22 |       filter(Present == TRUE) %>%
 23 |       dplyr::select(-Present, -Size) %>%
 24 |       inner_join(m@colData, by = c(Set = "ID")) %>%
 25 |       arrange(match(Set, columns))
 26 |     
 27 |     if (!is.null(char_limit)) {
 28 |         # shorten the Term
 29 |         dat <- dat %>%
 30 |           mutate(Term = trim_ellipses(Term, char_limit))
 31 |     }
 32 |     if (add_count) {
 33 |         # add parenthetical counts to each term
 34 |         dat <- dat %>%
 35 |           mutate(Term = paste0(Term, " (", Size, ")"))
 36 |     }
 37 |     if (overall) {
 38 |         dat <- bind_rows(dat, cbind(m@geneData, Term = "Overall"))
 39 |     }
 40 |     
 41 |     # place terms in order of appearance
 42 |     dat$Term <- factor(dat$Term, levels = dat$Term[!duplicated(dat$Term)])
 43 |     dat
 44 | }
 45 | 
 46 | 
 47 | #' Create a violin plot, boxplot or histogram looking at the metric of interest
 48 | #' over one or more columns
 49 | #' 
 50 | #' This compares the metric of interest (y) across the given genes, as well as
 51 | #' comparing it to the overall distribution. This shows a visual representation
 52 | #' of how a column (a gene set, a motif, a transcription factor, etc) is associated
 53 | #' with the metric.
 54 | #' 
 55 | #' @param m a GeneMatrix
 56 | #' @param columns The ID of one or more columns to plot
 57 | #' @param mode Type of plot to create: either "violin", "boxplot" or "histogram"
 58 | #' @param char_limit Maximum number of characters to include in column names
 59 | #' 
 60 | #' @import ggplot2
 61 | #' 
 62 | #' @export
 63 | CompareY = function(m, columns, mode = "boxplot", char_limit = 35) {
 64 |   # graph one or more columns 
 65 |   # reverse the list (so that it reads top down instead of bottom up)
 66 |   dat <- ExtractGenes(m, rev(columns),
 67 |                       char_limit = char_limit,
 68 |                       add_count = TRUE,
 69 |                       overall = TRUE)
 70 |   
 71 |   mode <- match.arg(mode, c("boxplot", "violin", "histogram"))
 72 |   if (mode == "histogram") {
 73 |     g <- ggplot(dat, aes(y)) +
 74 |       geom_histogram() +
 75 |       facet_wrap(~Term, ncol = 1, scale = "free_y")
 76 |   }
 77 |   else {
 78 |     g <- ggplot(dat, aes(x = Term, y = y)) +
 79 |       coord_flip() +
 80 |       theme(axis.text.y = element_text(color = "black", size = 15))
 81 |     if (mode == "violin") {
 82 |       g = g + geom_violin()
 83 |     }
 84 |     else {
 85 |       g = g + geom_boxplot()
 86 |     }
 87 |   }
 88 |   g
 89 | }
 90 | 
 91 | 
 92 | #' Create a graph comparing the top n columns (in terms of association with y)
 93 | #' 
 94 | #' Plot a bocplot, violin plot, or faceted histogram with the distributions
 95 | #' of y within the most associated gene sets.
 96 | #'
 97 | #' @param m GeneMatrix object
 98 | #' @param n Number of genes to compare
 99 | #' @param ... Additional arguments to be passed to CompareY
100 | #' 
101 | #' @details Most methods ("t.test", "wilcoxon", "hypergeometric") use the
102 | #' p-value to find the most significant columns. The "lasso" uses the order
103 | #' in which the terms were added to the regression
104 | #' 
105 | #' @export
106 | CompareTopColumns = function(m, n = 9, ...) {
107 |   if (length(m@rankingMetric) == 0) {
108 |     stop("Cannot plot top genes without first running a test")
109 |   }
110 | 
111 |   top_sets <- m@colData %>%
112 |     arrange_(m@rankingMetric) %>%
113 |     head(n)
114 |     
115 |   CompareY(m, top_sets$ID, ...)
116 | }
117 | 


--------------------------------------------------------------------------------
/_vignettes/brauer.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Analysis of Brauer et al 2008"
  3 | output: html_document
  4 | ---
  5 | 
  6 | ```{r}
  7 | library(readr)
  8 | library(dplyr)
  9 | library(tidyr)
 10 | library(ggplot2)
 11 | 
 12 | url <- "http://varianceexplained.org/files/Brauer2008_DataSet1.tds"
 13 | nutrient_names <- c(G = "Glucose", L = "Leucine", P = "Phosphate",
 14 |                     S = "Sulfate", N = "Ammonia", U = "Uracil")
 15 | 
 16 | cleaned_data <- read_delim(url, delim = "\t") %>%
 17 |   separate(NAME, c("name", "BP", "MF", "systematic_name", "number"), sep = "\\|\\|") %>%
 18 |   mutate_each(funs(trimws), name:systematic_name) %>%
 19 |   dplyr::select(-number, -GID, -YORF, -GWEIGHT) %>%
 20 |   gather(sample, expression, G0.05:U0.3) %>%
 21 |   separate(sample, c("nutrient", "rate"), sep = 1, convert = TRUE) %>%
 22 |   mutate(nutrient = plyr::revalue(nutrient, nutrient_names)) %>%
 23 |   filter(!is.na(expression), systematic_name != "")
 24 | 
 25 | exprs <- cleaned_data %>%
 26 |   reshape2::acast(systematic_name + nutrient ~ rate, value.var = "expression")
 27 | 
 28 | head(exprs)
 29 | 
 30 | rate <- as.numeric(colnames(exprs))
 31 | 
 32 | library(limma)
 33 | fit <- lmFit(exprs, model.matrix(~rate))
 34 | eb <- eBayes(fit)
 35 | 
 36 | library(biobroom)
 37 | library(tidyr)
 38 | td <- tidy(eb, intercept = TRUE) %>%
 39 |   separate(gene, c("ID", "nutrient"), sep = "_") %>%
 40 |   filter(!is.na(estimate))
 41 | 
 42 | td <- td %>%
 43 |   group_by(term, ID) %>%
 44 |   mutate(centered = estimate - (sum(estimate) - estimate) / (n() - 1)) %>%
 45 |   ungroup()
 46 | 
 47 | combined <- td %>%
 48 |   group_by(term, ID) %>%
 49 |   summarise(centered = mean(estimate)) %>%
 50 |   bind_rows(td) %>%
 51 |   replace_na(list(nutrient = "Average"))
 52 | 
 53 | P <- td %>%
 54 |   filter(term == "rate", nutrient == "Phosphate")
 55 | ```
 56 | 
 57 | ```{r}
 58 | library(GSEAMA)
 59 | library(org.Sc.sgd.db)
 60 | mm <- GOMembershipMatrix(org.Sc.sgdGO, ontology = "BP", min_size = 5, max_size = 2000)
 61 | 
 62 | # need to apply BP, MF or CC before here for LASSO
 63 | a <- TestAssociation(mm, P$ID, P$centered, method = "lasso")
 64 | ```
 65 | 
 66 | ```{r}
 67 | library(igraph)
 68 | library(ggraph)
 69 | library(ggforce)
 70 | 
 71 | g <- GenerateNetwork(a)
 72 | PlotNetwork(g)
 73 | ```
 74 | 
 75 | ```{r}
 76 | library(multidplyr)
 77 | 
 78 | # TODO: don't use multidplyr
 79 | associations_setup <- combined %>%
 80 |   dplyr::select(term, nutrient, ID, centered) %>%
 81 |   nest(ID, centered)
 82 | 
 83 | associations_setup$mm <- lapply(1:nrow(associations_setup), function(x) mm)
 84 | 
 85 | associations <- associations_setup %>%
 86 |   group_by(term, nutrient) %>%
 87 |   do(association = GSEAMA::TestAssociation(.$mm[[1]], .$data[[1]]$ID, .$data[[1]]$centered, method = "lasso")) %>%
 88 |   collect()
 89 | 
 90 | associations$networks <- mclapply(associations$association, GSEAMA::GenerateNetwork)
 91 | associations$graphs <- mclapply(associations$networks, GSEAMA::PlotNetwork)
 92 | 
 93 | save(associations, file = "~/Desktop/associations.rda")
 94 | ```
 95 | 
 96 | ```{r}
 97 | sets_each <- associations %>%
 98 |   group_by(term, nutrient) %>%
 99 |   do(data.frame(set = ThresholdSets(.$association[[1]])))
100 | 
101 | big_network <- GenerateNetwork(associations$association[[1]], sets = unique(sets_each$set))
102 | 
103 | layout <- createLayout(big_network, "igraph", algorithm = "kk")
104 | terms <- trim_ellipses(layout$Term, 30)
105 | 
106 | for (i in seq_len(nrow(associations))) {
107 |   name <- paste(associations$term[i], associations$nutrient[i])
108 |   adata <- associations$association[[i]]@colData
109 |   layout$MeanDifference <- adata$MeanDifference[match(layout$ID, adata$ID)]
110 |   layout$MeanDifference <- pmax(pmin(layout$MeanDifference, 1), -1)
111 |   layout$Term <- ifelse(abs(layout$MeanDifference) > .5, terms, NA)
112 |   
113 |   g <- ggraph(data = layout) +
114 |     geom_edge_link() +
115 |     geom_node_point(aes(color = MeanDifference, size = Size)) +
116 |     geom_node_text(aes(label = Term, alpha = abs(MeanDifference) ^ 2), check_overlap = TRUE, size = 3) +
117 |     ggforce::theme_no_axes() +
118 |     scale_colour_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, space = "Lab", na.value = "grey50", guide = "colourbar") +
119 |     ggtitle(name)
120 | 
121 |   ggsave(filename = paste0(name, ".png"), g)
122 | }
123 | ```
124 | 
125 | ```{r}
126 | #PlotNetwork(big_network)
127 | all_set_data <- associations %>%
128 |   group_by(term, nutrient) %>%
129 |   do(.$assocation[[1]]@colData)
130 | 
131 | 
132 | ```


--------------------------------------------------------------------------------
/_vignettes/GSEAMA.Rnw:
--------------------------------------------------------------------------------
  1 | \documentclass{article}
  2 | 
  3 | %\VignetteEngine{knitr::knitr}
  4 | %\VignetteIndexEntry{subSeq Example}
  5 | 
  6 | \usepackage{natbib}
  7 | \usepackage{graphics}
  8 | \usepackage{amsmath}
  9 | \usepackage{indentfirst}
 10 | \usepackage[utf8]{inputenc}
 11 | \usepackage{hyperref}
 12 | 
 13 | \Sexpr{library(knitr); opts_chunk$set(tidy=TRUE, cache=TRUE, warning=FALSE, message=FALSE)}
 14 | 
 15 | <<foo,include=FALSE,echo=FALSE>>=
 16 | options(keep.source = TRUE, width = 60)
 17 | desc <- packageDescription("GSEAMA")
 18 | @
 19 | 
 20 | \title{GSEAMA (Gene Set Enrichment Analysis with LASSO) Package Vignette (Version \Sexpr{desc$Version})}
 21 | \author{David G. Robinson}
 22 | 
 23 | \begin{document}
 24 | 
 25 | \maketitle
 26 | 
 27 | \section{Introduction}
 28 | 
 29 | This is a vignette for the \verb@GSEAMA@ package, which performs gene set enrichment analysis using Lasso (L1-constrained) regression. \emph{etc}
 30 | 
 31 | \section{Example}
 32 | 
 33 | \subsection{Data}
 34 | 
 35 | First we load in data from the RNA-Seq/Microarray factorial experiment comparing glucose and ethanol:
 36 | 
 37 | <<import_data>>=
 38 | library(GSEAMA)
 39 | data(factorial)
 40 | 
 41 | dim(factorial)
 42 | head(factorial)
 43 | @
 44 | 
 45 | The only inputs we need are the systematic name of each gene (\texttt{factorial\$ORF}) and some metric in which we are measuring enrichment. Here we have four choices: the p-values or the log fold-changes from either the RNA-Seq or the microarray parallel experiment. We'll use the RNA-Seq log fold changes for this experiment.
 46 | 
 47 | \subsection{Membership Matrix}
 48 | 
 49 | The central data structure of this package is the GeneMatrix class. You can create one quite easily by providing the GO map for that species as found in the \href{http://www.bioconductor.org/packages/release/data/annotation/}{AnnotationData} Packages.
 50 | \begin{center}
 51 | \begin{tabular}{| c | c | c |}
 52 | \hline
 53 | \textbf{Organism} & \textbf{Package} & \textbf{GO map} \\
 54 | \hline
 55 | Yeast & \texttt{org.Sc.sgd.db} & \texttt{org.Sc.sgdGO} \\
 56 | \hline
 57 | Human & \texttt{org.Hs.eg.db} & \texttt{org.Hs.egGO} \\
 58 | \hline
 59 | Mouse & \texttt{org.Mm.eg.db} & \texttt{org.Mm.egGO} \\
 60 | \hline
 61 | E. coli K12 & \texttt{org.EcK12.eg.db} & \texttt{org.EcK12.egGO} \\
 62 | \hline
 63 | \end{tabular}
 64 | \end{center}
 65 | 
 66 | <<create_mm, dependson="import_data">>=
 67 | library(org.Sc.sgd.db)
 68 | mm = GOMembershipMatrix(org.Sc.sgdGO, ontology="BP", min.size=5, max.size=250)
 69 | @
 70 | 
 71 | The membership matrix includes useful information about each gene set:
 72 | 
 73 | <<show_set_data, dependson="create_mm">>=
 74 | mm@colData[398, ]
 75 | @
 76 | 
 77 | As well as a table of information about each gene:
 78 | 
 79 | <<show_gene_data, dependson="create_mm">>=
 80 | mm@geneData
 81 | @
 82 | 
 83 | \clearpage
 84 | 
 85 | \subsection{Wilcoxon Test}
 86 | 
 87 | One simple test you can perform is a Wilcoxon rank sum test, comparing the $y$ within a set to the $y$ outside that set. This can be done with the \texttt{TestAssociation} function:
 88 | 
 89 | <<test_wilcoxon, dependson="create_mm">>=
 90 | wilcoxon.mm = TestAssociation(mm, factorial$ORF, factorial$RNA.Seq.logFC, method="wilcoxon")
 91 | @
 92 | 
 93 | It returns a MembershipMatrix object as well. The p-values from the Wilcoxon test have been added to the \texttt{colData} table (one p-value for each set).
 94 | 
 95 | <<hist_Wilcoxon_pvalues, dependson="test_wilcoxon">>=
 96 | hist(wilcoxon.mm@colData$pvalue)
 97 | @
 98 | 
 99 | The \texttt{CompareTopColumns} function is useful for looking at the actual distribution of $y$ within each of the top significant sets:
100 | 
101 | <<top_sets, dependson="test_wilcoxon">>=
102 | CompareTopColumns(wilcoxon.mm)
103 | @
104 | 
105 | If you are interested in other specific gene sets, you can use the CompareY function to compare them one at a time, by ID:
106 | 
107 | <<compare_Y, dependson="test_wilcoxon">>=
108 | CompareY(wilcoxon.mm, "GO:0006094")
109 | @
110 | 
111 | Or we can look at multiple sets that we're interested in:
112 | 
113 | <<>>=
114 | glucose.genes = wilcoxon.mm@colData[grep("glucose", Definition), ]$ID
115 | glucose.genes
116 | 
117 | CompareY(wilcoxon.mm, glucose.genes)
118 | @
119 | 
120 | \subsection{LASSO}
121 | 
122 | One flaw with the Wilcoxon test is that it treats every hypothesis as being separate, when in fact they are likely highly correlated. For example, gene sets are highly redundant: all gene sets are contained within "parent" gene sets, and some heavily overlap.
123 | 
124 | <<lasso, dependson="create_mm">>=
125 | lasso.mm = TestAssociation(mm, factorial$ORF, factorial$RNA.Seq.logFC, method="lasso")
126 | @
127 | 
128 | <<plot_top_lasso, dependson="lasso">>=
129 | CompareTopColumns(lasso.mm, n=15)
130 | @
131 | 
132 | \end{document}
133 | 


--------------------------------------------------------------------------------
/R/GOMembershipMatrix.R:
--------------------------------------------------------------------------------
  1 | #' Construct a new gene set membership matrix
  2 | #' 
  3 | #' Construct a gene set membership matrix (one row per gene, one column per
  4 | #' gene set, with 1 indicating membership) from a GO annotation map.
  5 | #' 
  6 | #' @param annotations A Go3AnnDbBimap object, typically from a Bioconductor Annotation
  7 | #' package. For example, org.Hs.egGO from org.Hs.eg.db for human genes.
  8 | #' @param ontology A vector of ontologies to include; should be a subset
  9 | #' of c("BP", "MF", "CC")
 10 | #' @param evidence A vector of GO evidence codes to include
 11 | #' @param min_size Minimum size of a gene set to be included
 12 | #' @param max_size Maximum size of a gene set to be included
 13 | #' @param ancestors Whether a gene included in a gene set should also be included
 14 | #' as being in all ancestors of that gene set (default TRUE)
 15 | #' @param chosen_genes If provided, restricts genes to those in chosen_genes
 16 | #' 
 17 | #' The GO annotation maps typically come from the Bioconductor AnnotationDB packages.
 18 | #' A couple of notable examples are:
 19 | #' 
 20 | #' Homo sapiens: org.Hs.egGO
 21 | #' S. cerevisiae: org.Sc.sgdGO
 22 | #' E coli K12: org.EcK12.egGO
 23 | #' 
 24 | #' You can restrict the sets to the BP (Biological Process), MF (Molecular Function),
 25 | #' or CC (Cellular Compartment) ontologies (by default all are included).
 26 | #' 
 27 | #' @import Matrix AnnotationDbi
 28 | #' @importFrom dplyr %>% data_frame tbl_df filter mutate group_by arrange distinct_
 29 | #' 
 30 | #' @examples
 31 | #' 
 32 | #' # yeast membership matrix
 33 | #' library(org.Sc.sgd.db)
 34 | #' mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5, max_size = 250)
 35 | #' 
 36 | #' # restrict to Biological Process ontology:
 37 | #' mm <- GOMembershipMatrix(org.Sc.sgdGO, ontology = "BP", min_size = 5, max_size = 250)
 38 | #' 
 39 | #' # human membership matrix
 40 | #' library(org.Hs.eg.db)
 41 | #' mm <- GOMembershipMatrix(org.Hs.sgdGO, min_size = 5, max_size = 250)
 42 | #' 
 43 | #' @export
 44 | GOMembershipMatrix = function(annotations,
 45 |                                 ontology = c("BP", "MF", "CC"),
 46 |                                 evidence = NULL,
 47 |                                 min_size = 1,
 48 |                                 max_size = Inf,
 49 |                                 ancestors = TRUE,
 50 |                                 chosen_genes = NULL) {
 51 |   # create membership matrix
 52 |   frame_df <- annotations %>%
 53 |     AnnotationDbi::toTable() %>%
 54 |     dplyr::tbl_df() %>%
 55 |     dplyr::filter(Ontology %in% ontology)
 56 |     #filter(go_id %in% available_go_ids)
 57 |   
 58 |   ontology_tbl <- frame_df %>%
 59 |     dplyr::select(ID = go_id, Ontology) %>%
 60 |     dplyr::distinct(ID, .keep_all = TRUE)
 61 | 
 62 |   if (!is.null(evidence)) {
 63 |     frame_df <- frame_df %>%
 64 |       dplyr::filter(evidence %in% Evidence)
 65 |   }
 66 |   
 67 |   id_field <- colnames(frame_df)[1]
 68 |   frame_df <- frame_df %>%
 69 |     dplyr::distinct_(id_field, "go_id", .keep_all = TRUE)
 70 |   
 71 |   if (!is.null(chosen_genes)) {
 72 |     frame_df <- frame_df[frame_df[[id_field]] %in% genes, ]
 73 |   }
 74 |   
 75 |   # turn into a sparse membership matrix
 76 |   m <- sparse_cast_(frame_df, id_field, "go_id", 1)
 77 | 
 78 |   if (ancestors) {
 79 |     # include relationships recursively: if a gene is included in a set,
 80 |     # also include it in sets that are ancestors of that set
 81 |     ancestry_matrix <- get_ancestry_matrix(colnames(m))
 82 |     
 83 |     # combine with descendants, turn back into binary matrix
 84 |     m <- m + m %*% ancestry_matrix
 85 |     m = (m > 0) + 0
 86 |   }
 87 |   
 88 |   # eliminate sets with too few genes, and then genes with no sets
 89 |   set_size = colSums(m)
 90 |   m <- m[, set_size >= min_size & set_size <= max_size]
 91 |   m <- m[rowSums(m) > 0, ]
 92 |   sets <- colnames(m)
 93 |   genes <- rownames(m)
 94 |   
 95 |   # create table of set data, matching the membership matrix
 96 |   set_data <- data_frame(ID = sets,
 97 |                         Term = Term(sets),
 98 |                         Definition = Definition(sets),
 99 |                         Size = colSums(m)) %>%
100 |     dplyr::inner_join(ontology_tbl, by = "ID")
101 |   
102 |   # just make sure nothing was reordered
103 |   stopifnot(!all(sets == set_data))
104 |   
105 |   # create table of gene data
106 |   gene_data <- data_frame(ID = rownames(m), Size = rowSums(m))
107 |   
108 |   ret <- new("GeneMatrix",
109 |              matrix = m,
110 |              colData = set_data,
111 |              geneData = gene_data)
112 | 
113 |   ret
114 | }
115 | 
116 | 
117 | #' Build an offspring matrix of GO terms
118 | #' 
119 | #' A sparse binary Matrix object with one row and column for each pair
120 | #' of GO terms provided, where each row represents an ancestor and each column
121 | #' represents a descendant, with 1 marking ancestor/descendant pairs.
122 | #' 
123 | #' @param terms IDs of GO terms that should be included in the ancestry matrix
124 | #' @param ontology Ontologies to use
125 | #' @param type Either OFFSPRING (default), CHILDREN, ANCESTOR, or PARENT
126 | #' 
127 | #' @import GO.db
128 | #' 
129 | #' @export
130 | get_ancestry_matrix <- function(terms, ontology = c("BP", "MF", "CC"),
131 |                                 type = "OFFSPRING", upward = TRUE, tbl = FALSE) {
132 |   offspring_df <- do.call(rbind, (lapply(ontology, function(o) {
133 |     toTable(get(paste0("GO", o, type)))
134 |   })))[, 1:2]
135 |   
136 |   ret <- offspring_df %>%
137 |     setNames(c("go_id1", "go_id2")) %>%
138 |     filter(go_id1 %in% terms, go_id2 %in% terms) %>%
139 |     mutate(go_id1 = factor(go_id1, levels = terms),
140 |            go_id2 = factor(go_id2, levels = terms)) %>%
141 |     tbl_df()
142 |   
143 |   if (!upward) {
144 |     tmp <- ret$go_id1
145 |     ret$go_id1 <- ret$go_id2
146 |     ret$go_id2 <- tmp
147 |   }
148 |   
149 |   if (!tbl) {
150 |     ret <- sparse_cast(ret, go_id1, go_id2)
151 |   }
152 |   
153 |   ret
154 | }
155 | 
156 | 
157 | #' Get all GO IDs in all_IDs that are ancestors/descendants of those in IDs
158 | #' 
159 | #' @param x A vector of GO IDs
160 | #' @param all_IDs A vector of GO IDs to look for relatives of x
161 | #' @param ancestors If true look among ancestors, if false look for children
162 | #' @param combine Whether to combine it with the sets in the IDs parameter
163 | GetGORelatives <- function(IDs, all_IDs,
164 |                              ancestors = TRUE,
165 |                              direct = FALSE,
166 |                              combine = TRUE) {
167 |   type <- if (direct) { "CHILDREN" } else { "OFFSPRING" }
168 |   am <- get_ancestry_matrix(all_IDs, type = type, upward = ancestors)
169 |   ret <- names(which(colSums(am[IDs, , drop = FALSE]) > 0))
170 |   
171 |   if (combine) {
172 |     ret <- unique(c(IDs, ret))
173 |   }
174 |   ret
175 | }
176 | 


--------------------------------------------------------------------------------
/R/TestAssociation.R:
--------------------------------------------------------------------------------
  1 | #' Test association of a per-gene attribute with each column in a GeneMatrix
  2 | #' 
  3 | #' Test the association of a per-gene attribute y with each column, where a column
  4 | #' can represent a gene set, a motif, transcription factor targets, or other genes
  5 | #' that are functionally related
  6 | #' 
  7 | #' @param m A GeneMatrix object
  8 | #' @param genes A vector of gene names (must match the gene names in the GeneMatrix)
  9 | #' @param y A per-gene metric of the same length as the vector of gene names
 10 | #' @param method Method used to test association between the per-gene metric and
 11 | #' each column - either c("lasso", "t.test", "wilcoxon" or "hypergeometric")
 12 | #' @param ... Additional arguments that will be given to the test in question
 13 | #' 
 14 | #' @details A t-test or Wilcoxon test can take an additional argument of \code{alternative},
 15 | #' which allows a one-sided test with an alternative hypothesis of "less" or "greater".
 16 | #' A t=test can also take the \code{var.equal} argument, indicating whether to treat the
 17 | #' variance in and outside each group as identical.
 18 | #' 
 19 | #' LASSO fits a penalized least squares model: 
 20 | #' 
 21 | #' min    (y - \beta X)^2 + \lambda \sum|\beta|}
 22 | #' 
 23 | #' @examples
 24 | #' 
 25 | #'library(org.Sc.sgd.db)
 26 | #'
 27 | #'n_genes <- 200
 28 | #'genes <- sample(mappedkeys(org.Sc.sgdGO), n_genes)
 29 | #'
 30 | #'mm <- GOMembershipMatrix(org.Sc.sgdGO, min_size = 5,
 31 | #'                        max_size = 50, chosen_genes = genes)
 32 | #'# some genes are dropped because they do not have GO categories
 33 | #'
 34 | #'n_genes <- nrow(mm)
 35 | #'genes <- mm@geneData$ID
 36 | #'
 37 | #'beta = c(20:1, rep(0, times = nrow(mm@colData)-20))
 38 | #'y = c(as.matrix(mm) %*% beta + rnorm(n_genes, 0, 2))
 39 | #'
 40 | #'results <- TestAssociation(mm, genes, y)
 41 | #'View(results@colData)
 42 | #' 
 43 | #' @export
 44 | TestAssociation = function(m, genes, y, method = "lasso", ...) {
 45 |     if (!inherits(m, "GeneMatrix")) {
 46 |         stop("TestAssociation should be given a GeneMatrix object")
 47 |     }
 48 | 
 49 |     if (!is.null(m@fit)) {
 50 |         warning(paste0("Performing a test on a model that has already been ",
 51 |                        "tested, previous results may be overwritten"))
 52 |     }
 53 |     
 54 |     # prefiltering of membership matrix
 55 |     v <- genes %in% m@geneData$ID
 56 |     y <- y[v]
 57 |     genes <- genes[v]
 58 |     m <- m[genes, ]
 59 |     m <- m[, colSums(m@matrix != 0) > 0]  # filter columns *after* rows
 60 |     m@geneData$y <- y
 61 | 
 62 |     # apply desired method
 63 |     method = match.arg(method, c("lasso", "wilcoxon", "hypergeometric", "t.test"))
 64 |     if (method == "lasso") {
 65 |         m@fit <- glmnet::cv.glmnet(m@matrix, y, ...)
 66 |         beta_ind <- which(m@fit$lambda.1se == m@fit$lambda)
 67 |         m@colData$beta <- apply(m@fit$glmnet.fit$beta, 1,
 68 |                                 function(r) r[r != 0][1])
 69 |         m@colData$beta1se <- as.numeric(m@fit$glmnet.fit$beta[, as.numeric(beta_ind)])
 70 |         m@colData$step <- apply(m@fit$glmnet.fit$beta, 1,
 71 |                                 function(r) which(r != 0)[1])
 72 |         m@colData$ranking <- -abs(m@colData$step)
 73 |         
 74 |         m@rankingMetric <- "ranking"
 75 |         m@effectMetric <- "beta"
 76 |         m@plottingMetric <- "beta1se"
 77 |     }
 78 |     else if (method == "t.test") {
 79 |         tt <- vectorized_t_test(m@matrix > 0, y, tbl = TRUE, ...)
 80 |         m@colData$estimate <- tt$estimate
 81 |         m@colData$p.value <- tt$p.value
 82 |         m@rankingMetric <- "p.value"
 83 |         m@effectMetric <- "estimate"
 84 |         m@plottingMetrix <- "estimate"
 85 |     }
 86 |     else if (method == "wilcoxon") {
 87 |         w <- vectorized_wilcoxon_test(m@matrix > 0, y, tbl = TRUE, ...)
 88 |         m@colData$auc <- w$auc
 89 |         m@colData$p.value <- w$p.value
 90 |         m@rankingMetric <- "p.value"
 91 |         m@effectMetric <- "auc"
 92 |         m@plottingMetric <- "auc"
 93 |     }
 94 |     else if (method == "hypergeometric") {
 95 |         if (!is.logical(y) & !(is.numeric(y) & all(y == 0 | y == 1))) {
 96 |             stop("For hypergeometric test, y must be either logical or binary")
 97 |         }
 98 |         stop("Not yet implemented")
 99 |         mat <- m@matrix > 0
100 |         
101 |         overlaps <- colSums(mat[as.logical(y), ])
102 |         
103 |         total_genes <- nrow(m)
104 |         total_y <- sum(y)
105 |         totals <- colSums(mat)
106 |         
107 |         #m@colData$phi <- 
108 |         m@colData$p.value <- phyper(overlaps - 1,      # number of white balls in each set
109 |                                     total_y,            # white balls in urn
110 |                                     total_genes - total_genes,  # black balls in urn
111 |                                     totals,            # number drawn in each set
112 |                                     lower.tail = FALSE)
113 |         m@rankingMetric <- "p.value"
114 |         m@effectMetric <- "phi"
115 |         m@plottingMetric <- "phi"
116 |     }
117 |     
118 |     # save the method used
119 |     m@assocMethod <- method
120 |     
121 |     # finally, compute the difference within and between the sets
122 |     mat <- m@matrix
123 |     not_mat <- 1 - mat
124 |     mean_within <- colSums(y * mat) / colSums(mat)
125 |     mean_outside <- colSums(y * not_mat) / colSums(not_mat)
126 |     m@colData$MeanDifference <- mean_within - mean_outside
127 | 
128 |     m
129 | }
130 | 
131 | #' Compute mean differences between "in a set" and "not in a set"
132 | #' 
133 | #' For each set in a GeneMatrix, compute the difference in y between
134 | #' "in the set"
135 | #' and "outside the set". It will be set in the MeanDifference column
136 | #' 
137 | #' @param m GeneMatrix object
138 | #' 
139 | #' @export
140 | mean_difference <- function(m) {
141 |   y <- m@geneData$y
142 |   
143 |   if (is.null(y)) {
144 |     stop("y not yet set, has TestAssociation been performed?")
145 |   }
146 |   
147 |   sub_m <- m@matrix
148 |   sub_m_not <- 1 - sub_m
149 |   mean_within <- colSums(y * sub_m) / colSums(sub_m)
150 |   mean_outside <- colSums(y * sub_m_not) / colSums(sub_m_not)
151 |   m@colData$MeanDifference <- mean_within - mean_outside
152 |   
153 |   m
154 | }
155 | 
156 | 
157 | #' Return a vector of genes that pass a threshold for includions
158 | #' 
159 | #' Return a vector of genes that pass a threshold of "significance"
160 | #' or other kind of inclusion. For tests with a p-value (t-test,
161 | #' Wilcoxon, hypergeometric), this is FDR-controlled p-values.
162 | #' For LASSO, this is all cases where beta1sd != 0.
163 | #' 
164 | #' @param m GeneMatrix object
165 | #' @param alpha Threshold for (corrected) p-values, not used in LASSO
166 | #' @param method Method, passed to \code{\link{p.adjust}}, or "qvalue"
167 | #' to use qvalue. Not used in LASSO
168 | #' @param ... Extra arguments to pass on to correction method
169 | #' 
170 | #' @seealso \link{p.adjust}, \link{p.adjust.methods}
171 | ThresholdSets <- function(m, alpha = .05, method = "fdr", ...) {
172 |   
173 |   if(m@assocMethod == "lasso"){
174 |     # filtering of lasso values where beta1sd != 0
175 |     m@colData$ID[m@colData$beta1se != 0]
176 |   }else{
177 |     # p-value filtering based on provided filter_method
178 |     if (method == "qvalue") {
179 |       p.adjusted <- qvalue::qvalue(m@colData$p.value, ...)$qvalues
180 |     } else {
181 |       p.adjusted <- p.adjust(m@colData$p.value, method, ...)
182 |     }
183 |     m@colData$ID[!is.na(p.adjusted) & p.adjusted < alpha]
184 |   }
185 | }
186 | 


--------------------------------------------------------------------------------
/R/treemap.R:
--------------------------------------------------------------------------------
  1 | #' Generate Treemap
  2 | #' 
  3 | #' Visualize genes nested within specific and general GO sets using treemaps
  4 | #' 
  5 | #' Each gene should maximize the absolute sum of relevant node weights but every GO category will
  6 | #' not be included since a gene can only be assigned to a single ancestry. A small GO category may
  7 | #' fall into multiple ancestral paths some which are parents of other meaningful categories and others
  8 | #' that are dead-ends.  Should favor attachment to pathways with the most informative children.
  9 | #' To allow this to occur, three edge weights are used
 10 | #' 1) the path with the highest overall absolute score * number of genes affected is chosen
 11 | #' 2) ancestors of LASSO nodes all get a smaller score: [min(abs(beta))]*0.01*N_lasso_children
 12 | #' - applied to edge where ancestors are children
 13 | #' 3) all remaining edges that are not predictive recieve a tiny weight so that they are not filtered: [min(abs(beta))]*1e-4
 14 | #' 
 15 | #' @inheritParams GetEdgesTable
 16 | #' 
 17 | #' @export
 18 | #' @return Generates a treemap hierarchy
 19 | GenerateTreemap <- function(m, edges = NULL, ...) {
 20 |   
 21 |   if(!requireNamespace("igraph", quietly = T)){
 22 |     stop("igraph is required for treemaps")
 23 |   }
 24 |   if(!requireNamespace("graph", quietly = T)){
 25 |     stop("graph is required for treemaps")
 26 |   }
 27 |   if(!requireNamespace("RBGL", quietly = T)){
 28 |     stop("RBGL is required for treemaps")
 29 |   }
 30 |   
 31 |   if (is.null(edges)) {
 32 |     signif_sets <- ThresholdSets(m, ...)
 33 |     edges <- GetEdgesTable(m, sets = m@colData$ID, ...)
 34 |     # orphaned parents connect to root
 35 |     edge_roots <- data_frame(go_id1 = "root", go_id2 = setdiff(edges$go_id1, edges$go_id2)) %>%
 36 |       dplyr::left_join(edges %>%
 37 |                          dplyr::select(go_id2 = go_id1, Ontology) %>%
 38 |                          dplyr::distinct(), by = "go_id2")
 39 |     edges <- rbind(edges, edge_roots)
 40 |     
 41 |     ancestors <- get_ancestry_matrix(m@colData$ID, tbl = TRUE, type = "ANCESTOR", upward = FALSE) %>%
 42 |       dplyr::mutate(go_id1 = as.character(go_id1)) %>%
 43 |       dplyr::mutate(go_id2 = as.character(go_id2)) %>%
 44 |       dplyr::filter(go_id1 %in% m@colData$ID, go_id2 %in% m@colData$ID)
 45 |     ancestors <- bind_rows(ancestors, data_frame(go_id1 = "root", go_id2 = m@colData$ID))
 46 |   }
 47 |   
 48 |   if(length(signif_edges) == 0){
 49 |     stop("No thresholded sets to include in treemap")
 50 |   }
 51 |   
 52 |   signif_sets_weight <- m@colData %>%
 53 |     dplyr::filter(ID %in% signif_sets) %>%
 54 |     dplyr::select_("ID", "Size", Weight = m@plottingMetric) %>%
 55 |     dplyr::mutate(Weight = abs(Weight),
 56 |                   nWeight = Size * Weight) %>%
 57 |     dplyr::select(-Size)
 58 |   
 59 |   nodes <- data_frame(ID = unique(c(edges$go_id1, edges$go_id2))) %>%
 60 |     dplyr::left_join(m@colData %>% dplyr::select(ID, Size), by = "ID") %>%
 61 |     dplyr::left_join(signif_sets_weight, by = "ID") %>%
 62 |     dplyr::mutate(Weight = ifelse(is.na(Weight), 0, Weight),
 63 |                   nWeight = ifelse(is.na(nWeight), 0, nWeight))
 64 |   
 65 |   # ancestor weights encourage the same ancestor nodes to be used when significant descendents exist
 66 |   ancestor_weights <- ancestors %>%
 67 |     dplyr::left_join(nodes %>% dplyr::select(go_id2 = ID, weight = nWeight), by = "go_id2") %>%
 68 |     dplyr::mutate(weight = ifelse(is.na(weight), 0, weight)) %>%
 69 |     dplyr::group_by(go_id1) %>%
 70 |     dplyr::summarize(weight = sum(weight)*0.01) %>%
 71 |     dplyr::arrange(desc(weight))
 72 |   
 73 |   nodes <- nodes %>%
 74 |     dplyr::left_join(ancestor_weights, by = c("ID" = "go_id1")) %>%
 75 |     dplyr::rowwise() %>%
 76 |     dplyr::mutate(nWeight = sum(nWeight, weight, na.rm = T)) %>%
 77 |     dplyr::select(-weight) %>%
 78 |     dplyr::ungroup()
 79 |   
 80 |   # find a unique GO hierarchy that 
 81 |   minimal_edge_set <- find_minimal_rooted_tree(nodes, edges)
 82 |   
 83 |   # find the optimal path to assign each gene
 84 |   gene_paths <- assign_genes_to_paths(m, minimal_edge_set)
 85 |   
 86 |   # prune gene sets that are not used
 87 |   all_used_go_terms <- unique(gene_paths$full_path$ID)
 88 |   utilized_go_network <- igraph::delete_vertices(minimal_edge_set, setdiff(igraph::V(minimal_edge_set)$name, all_used_go_terms))
 89 |   
 90 |     
 91 |   #' Treemap GO Plot
 92 |   #' 
 93 |   #' Generate a treemap plot using the minimal spanning GO tree with genes assigned to individual GO terms.
 94 |   #' 
 95 |   #' Default color based on GO significance and effect sizes of individual genes
 96 |   treemap_GO_plot <- function(m, gene_paths, utilized_go_network, treemap_thresh = 4, ...){
 97 |     
 98 |     # gene effects
 99 |     gene_treemap_data <- gene_paths$assigned_genes %>%
100 |       dplyr::left_join(m@geneData %>% dplyr::select(gene = ID, gene_effect = y), by = "gene") %>%
101 |       dplyr::mutate(gene_effect = pmax(pmin(gene_effect, treemap_thresh), -1*treemap_thresh))
102 |     
103 |     gene_treemap_data <- gene_treemap_data %>%
104 |       dplyr::select(ID = gene, gene_effect) %>%
105 |       dplyr::mutate(go_name = NA_character_,
106 |                     go_effect = NA_real_,
107 |                     data_type = "gene")
108 |     
109 |     # go effects
110 |     go_treemap_data <- m@colData %>%
111 |       dplyr::filter(ID %in% intersect(signif_sets, igraph::V(utilized_go_network)$name)) %>%
112 |       dplyr::select_("ID", go_name = "Term", go_effect = m@plottingMetric)
113 |     
114 |     go_treemap_data <- tibble::data_frame(ID = igraph::V(utilized_go_network)$name) %>%
115 |       dplyr::left_join(go_treemap_data, by = "ID") %>%
116 |       dplyr::mutate(gene_effect = NA_real_,
117 |                     data_type = "go")
118 |     
119 |     all_node_data <- dplyr::bind_rows(gene_treemap_data, go_treemap_data)
120 |     
121 |     # update network
122 |     updated_edges <- dplyr::bind_rows(
123 |       igraph::get.edgelist(utilized_go_network) %>% as.data.frame(stringsAsFactors = F),
124 |       unname(gene_paths$assigned_genes) %>% as.matrix() %>% as.data.frame(stringsAsFactors = F))
125 |     
126 |     # use utilized_go_network with genes assigned to categories
127 |     full_treemap_data <- igraph::graph_from_data_frame(updated_edges, directed = TRUE, vertices = all_node_data)
128 |     
129 |     # update network 
130 |     
131 |     #library(ggraph)
132 |     #library(ggplot2)
133 |     
134 |     #ggraph(full_treemap_data, 'igraph', algorithm = 'tree', circular = TRUE) + 
135 |     # geom_edge_diagonal(aes(alpha = ..index..)) +
136 |     #  coord_fixed() + 
137 |     #  scale_edge_alpha('Direction', guide = 'edge_direction') +
138 |     #  geom_node_point(aes(color = gene_effect, filter = igraph::degree(full_treemap_data, mode = 'out') == 0), size = 1) +
139 |     #  ggforce::theme_no_axes() +
140 |     #  scale_color_gradient2("MeanDifference", low = "blue", high = "red")
141 |     
142 |     # tree
143 |     
144 |     full_treemap_data <- ggraph::treeApply(full_treemap_data, function(node, parent, depth, tree) {
145 |       tree <- igraph::set_vertex_attr(tree, 'depth', node, depth)
146 |       if (depth == 1) {
147 |         tree <- igraph::set_vertex_attr(tree, 'class', node, igraph::V(tree)$shortName[node])
148 |       } else if (depth > 1) {
149 |         tree <- igraph::set_vertex_attr(tree, 'class', node, igraph::V(tree)$class[parent])
150 |       }
151 |       tree
152 |     })
153 |   igraph::V(full_treemap_data)$leaf <- igraph::degree(full_treemap_data, mode = 'out') == 0
154 | 
155 | ggraph(full_treemap_data, 'treemap') + 
156 |   geom_treemap(aes(fill = gene_effect, filter = leaf), colour = NA) + 
157 |   geom_treemap(aes(size = depth * ifelse(data_type == "gene", 1, 0), alpha = depth,
158 |                    colour = ifelse(data_type == "gene", "white", ifelse(go_effect != 0, "yellow", "black"))), fill = NA) + 
159 |   geom_node_text(aes(label = go_name), size = 3, check_overlap = T, repel = T) +
160 |   scale_fill_gradient2("MeanDifference", low = "green3", high = "firebrick1") +
161 |   scale_color_identity() +
162 |   scale_size(range = c(1, 0), guide = 'none') +
163 |   scale_alpha(range = c(1, 0.2), guide = 'none') +
164 |   ggforce::theme_no_axes()
165 |     
166 |   }
167 | }
168 | 
169 | 
170 | get_minimal_gene_path <- function(a_gene, m, nodes, minimal_edge_set){
171 |   
172 |   gene_GO <- m@matrix[a_gene, m@matrix[a_gene,] == 1, drop = F] %>% colnames()
173 |   # add ancestors that may have been missed for the gene
174 |   gene_GO <- union(gene_GO, ancestors$go_id1[ancestors$go_id2 %in% gene_GO])
175 |   gene_GO <- gene_GO[gene_GO %in% nodes$ID]
176 |   
177 |   gene_subgraph <- igraph::induced_subgraph(minimal_edge_set, gene_GO)
178 |   
179 |   gene_paths <- igraph::get.shortest.paths(gene_subgraph, from = "root", to = igraph::V(gene_subgraph))$vpath
180 |   gene_paths <- lapply(seq_along(gene_paths), function(i){
181 |     tibble::data_frame(ID = gene_paths[[i]]$name) %>%
182 |       dplyr::mutate(gene = a_gene, terminus = igraph::V(gene_subgraph)$name[i], step = 1:n())
183 |   }) %>%
184 |     dplyr::bind_rows()
185 |   
186 |   gene_paths
187 | }
188 | 
189 | #' Assign Genes to Paths
190 | #' 
191 | #' Assign genes to a GO hierarchy which has the greatest signal
192 | #' 
193 | #' @inheritParams GenerateTreemap
194 | #' @param minimal_edge_set a network which is a weighted directed minimal spanning tree of gene ontologies
195 | #' 
196 | #' @return a data_frame which contains which terminal gene set each gene is assigned to
197 | assign_genes_to_paths <- function(m, minimal_edge_set, ...){
198 |   
199 |   # find all paths for each gene
200 |   all_gene_paths <- mclapply(m@geneData$ID, function(a_gene){
201 |     print(a_gene)
202 |     get_minimal_gene_path(a_gene, m, nodes, minimal_edge_set)
203 |   }, mc.cores = parallel::detectCores()) %>%
204 |     dplyr::bind_rows()
205 |   
206 |   # all shortest paths found, now find the path that maximizes the path weight
207 |   
208 |   assigned_paths <- all_gene_paths %>%
209 |     dplyr::left_join(nodes, by = "ID") %>%
210 |     dplyr::group_by(gene, terminus) %>%
211 |     dplyr::summarize(Weight = sum(Weight),
212 |                      nWeight = sum(nWeight),
213 |                      nSteps = n()) %>%
214 |     dplyr::group_by(gene) %>%
215 |     dplyr::arrange(desc(Weight), desc(nWeight), nSteps) %>%
216 |     dplyr::slice(1)
217 |   
218 |   gene_paths <- list()
219 |   gene_paths$full_path <- all_gene_paths %>%
220 |     dplyr::semi_join(assigned_paths, by = c("terminus", "gene"))
221 |   gene_paths$assigned_genes <- assigned_paths %>%
222 |     dplyr::select(terminus, gene)
223 |   gene_paths
224 | }
225 | 
226 | #' Find Minimal Rooted Tree
227 | #' 
228 | #' Removes all loops so that gene sets can be nested within one another: creates an optimal directed acyclic graph
229 | #' pointing from general GO terms down to specific ones.
230 | #' 
231 | #' @param nodes data_frame significance of gene sets
232 | #' @param edges data_frame go_id1 are parents of go_id2
233 | #' 
234 | #' @return an igraph object of the gene set minimal spanning tree
235 | find_minimal_rooted_tree <- function(nodes, edges){
236 |   
237 |   if(clusters(graph_from_data_frame(edges))$no != 1){
238 |     stop("edges must be a single connected network") 
239 |   }
240 |   
241 |   edge_weights <- edges %>%
242 |     dplyr::left_join(nodes %>% dplyr::select(go_id1 = ID, nWeight_1 = nWeight), by = "go_id1") %>%
243 |     dplyr::left_join(nodes %>% dplyr::select(go_id2 = ID, nWeight_2 = nWeight), by = "go_id2") %>%
244 |     dplyr::mutate(weight = nWeight_1 * 0.1 + nWeight_2,
245 |                   weight = ifelse(weight == 0, min(weight[weight != 0])*1e-4, weight))
246 |   
247 |   GO_graph_NEL <- new("graphNEL", nodes = nodes$ID, edgemode="directed")
248 |   GO_graph_NEL <- graph::addEdge(from = edge_weights$go_id1, to = edge_weights$go_id2, graph = GO_graph_NEL, weights = edge_weights$weight)
249 |   
250 |   GO_optim_branching <- RBGL::edmondsOptimumBranching(GO_graph_NEL)
251 |   
252 |   # recreate graphNEL object from edmonds output 
253 |   edmonds_edgeList <- as.data.frame(t(GO_optim_branching$edgeList))
254 |   minimum_igraph_network <- graph_from_data_frame(edmonds_edgeList)
255 |   
256 |   # 
257 |   GO_graph_parsimony <- graph_from_data_frame(edges, vertices = nodes)
258 |   pruned_edges <- attr(igraph::E(GO_graph_parsimony), "vnames")[!(attr(igraph::E(GO_graph_parsimony), "vnames") %in% attr(igraph::E(minimum_igraph_network), "vnames"))]
259 |   GO_graph_parsimony <- igraph::delete_edges(GO_graph_parsimony, pruned_edges)
260 |   
261 |   if(clusters(minimum_igraph_network)$no != 1 | clusters(GO_graph_parsimony)$no != 1){
262 |     stop("a single connected network was not found in the minimal network")
263 |   }
264 |   if(setdiff(igraph::get.edgelist(minimum_igraph_network)[,1],igraph::get.edgelist(minimum_igraph_network)[,2]) != "root"){
265 |     stop("a single root was not found in the minimal network") 
266 |   }
267 |   
268 |   GO_graph_parsimony
269 | }
270 | 
271 | update_treemap_root <- function(GO_graph_parsimony){
272 |   
273 |   GO_graph_clusters <- igraph::clusters(GO_graph_parsimony)
274 |   
275 |   new_roots <- c()
276 |   for(i in seq_along(GO_graph_clusters$csize)){
277 |     if(GO_graph_clusters$membership['root'] == i){next}
278 |     
279 |     subgraph_nodes <- names(which(GO_graph_clusters$membership == i))
280 |     
281 |     subgraph_graph <- igraph::induced_subgraph(GO_graph_parsimony, subgraph_nodes)
282 |     
283 |     subgraph_root <- setdiff(igraph::get.edgelist(subgraph_graph)[,1], igraph::get.edgelist(subgraph_graph)[,2])
284 |     if(length(subgraph_root) == 0){
285 |       # if there is no parent then either there is one node or the graph is not a DAG
286 |       if(length(subgraph_nodes) == 1){
287 |         subgraph_root <- subgraph_nodes 
288 |       }else{
289 |         # solve the mst problem
290 |         mst_graph <- igraph::minimum.spanning.tree(igraph::induced_subgraph(GO_graph_parsimony, subgraph_nodes))
291 |         
292 |         # figure out which edges were removed
293 |         GO_graph_parsimony <- igraph::delete_vertices(GO_graph_parsimony, E(subgraph_graph)[setdiff(E(subgraph_graph), E(mst_graph))])
294 |         
295 |         subgraph_root <- setdiff(igraph::get.edgelist(mst_graph)[,1], igraph::get.edgelist(mst_graph)[,2])
296 |         
297 |         if(length(subgraph_root) == 0){
298 |           subgraph_root <- subgraph_nodes
299 |         }
300 |       }
301 |     }
302 |     if(length(subgraph_root) == 0){stop("no root found")}
303 |     new_roots <- c(new_roots, subgraph_root)
304 |   }
305 |   
306 |   if(length(new_roots) != 0){
307 |   GO_graph_parsimony <- igraph::add_edges(GO_graph_parsimony, c(t(data_frame("root", new_roots))))
308 |   }
309 |   
310 |   GO_graph_parsimony
311 | }
312 | 


--------------------------------------------------------------------------------