├── misc
    ├── .gitignore
    ├── groupList_human_raw.RData
    ├── makeMDF.R
    ├── FigS1_removed.R
    ├── preprocessing_scripts
    │   ├── uploadVSD.R
    │   ├── getNormCounts.R
    │   ├── uploadTPM.R
    │   ├── tissueDictionary.json
    │   ├── calculatCorrelations.R
    │   ├── helpers.R
    │   └── tissueDictionary.R
    ├── normality.R
    ├── FigS6_S7.R
    ├── FigS2.R
    └── FigS8.R
├── .travis.yml
├── .gitignore
├── .Rbuildignore
├── data
    ├── humanGenesVST.rda
    ├── human_coldata.rda
    ├── blackListHuman.rda
    ├── human_grouplist.rda
    ├── pathwayCategories.rda
    ├── MSIGDB_Geneset_Names.rda
    ├── sampleVSTOrderHuman.rda
    ├── MSIGDB_Geneset_Small_Names.rda
    └── hsapiens_corrSmall_geneNames.rda
├── man
    ├── human_grouplist.Rd
    ├── human_coldata.Rd
    ├── humanGenesVST.Rd
    ├── blackListHuman.Rd
    ├── MSIGDB_Geneset_Names.Rd
    ├── hsapiens_corrSmall_geneNames.Rd
    ├── sampleVSTOrderHuman.Rd
    ├── fixStrings.Rd
    ├── pathwayCategories.Rd
    ├── MSIGDB_Geneset_Small_Names.Rd
    ├── getAvailableGenes.Rd
    ├── getTissueTypes.Rd
    ├── getTissueVST.Rd
    ├── getCorrelationData.Rd
    ├── myGSEA.Rd
    ├── generateCorrelations.Rd
    ├── geneVsGeneListAnalyze.Rd
    ├── analyzeGenePairs.Rd
    ├── analyzeSingleGenes.Rd
    ├── analyzeGenesetTopology.Rd
    └── getTERM2GENE.Rd
├── correlationAnalyzeR.Rproj
├── NAMESPACE
├── R
    ├── getAvailableGenes.R
    ├── getTissueTypes.R
    ├── data.R
    ├── generateCorrelations.R
    ├── getTissueVST.R
    ├── getCorrelationData.R
    ├── getTERM2GENE.R
    ├── myGSEA.R
    └── fixStrings.R
├── DESCRIPTION
├── README.md
├── tests
    └── Test.R
└── vignettes
    └── correlationAnalyzeR.Rmd


/misc/.gitignore:
--------------------------------------------------------------------------------
1 | *.rda
2 | *.csv
3 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | r: bioc-release
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | doc
6 | Meta
7 | 
8 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^doc$
4 | ^Meta$
5 | misc
6 | .travis.yml
7 | 


--------------------------------------------------------------------------------
/data/humanGenesVST.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/humanGenesVST.rda


--------------------------------------------------------------------------------
/data/human_coldata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/human_coldata.rda


--------------------------------------------------------------------------------
/data/blackListHuman.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/blackListHuman.rda


--------------------------------------------------------------------------------
/data/human_grouplist.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/human_grouplist.rda


--------------------------------------------------------------------------------
/data/pathwayCategories.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/pathwayCategories.rda


--------------------------------------------------------------------------------
/data/MSIGDB_Geneset_Names.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/MSIGDB_Geneset_Names.rda


--------------------------------------------------------------------------------
/data/sampleVSTOrderHuman.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/sampleVSTOrderHuman.rda


--------------------------------------------------------------------------------
/misc/groupList_human_raw.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/misc/groupList_human_raw.RData


--------------------------------------------------------------------------------
/data/MSIGDB_Geneset_Small_Names.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/MSIGDB_Geneset_Small_Names.rda


--------------------------------------------------------------------------------
/data/hsapiens_corrSmall_geneNames.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bishop-Laboratory/correlationAnalyzeR/HEAD/data/hsapiens_corrSmall_geneNames.rda


--------------------------------------------------------------------------------
/man/human_grouplist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{human_grouplist}
 5 | \alias{human_grouplist}
 6 | \title{A list of sample categorizations (human)}
 7 | \format{
 8 | An object of class \code{list} of length 25.
 9 | }
10 | \usage{
11 | human_grouplist
12 | }
13 | \description{
14 | A list of sample categorizations (human)
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/human_coldata.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{human_coldata}
 5 | \alias{human_coldata}
 6 | \title{A data frame of sample info from GEO (human)}
 7 | \format{
 8 | An object of class \code{data.frame} with 81716 rows and 6 columns.
 9 | }
10 | \usage{
11 | human_coldata
12 | }
13 | \description{
14 | A data frame of sample info from GEO (human)
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/humanGenesVST.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{humanGenesVST}
 5 | \alias{humanGenesVST}
 6 | \title{A vector of valid human genes with VST data available}
 7 | \format{
 8 | An object of class \code{character} of length 28722.
 9 | }
10 | \usage{
11 | humanGenesVST
12 | }
13 | \description{
14 | A vector of valid human genes with VST data available
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/blackListHuman.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{blackListHuman}
 5 | \alias{blackListHuman}
 6 | \title{A vector of blacklisted tissue-disease categories for human samples}
 7 | \format{
 8 | An object of class \code{character} of length 6.
 9 | }
10 | \usage{
11 | blackListHuman
12 | }
13 | \description{
14 | A vector of blacklisted tissue-disease categories for human samples
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/MSIGDB_Geneset_Names.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{MSIGDB_Geneset_Names}
 5 | \alias{MSIGDB_Geneset_Names}
 6 | \title{A vector of valid MSIGDB geneset names}
 7 | \format{
 8 | An object of class \code{character} of length 17810.
 9 | }
10 | \source{
11 | msigdbr()
12 | }
13 | \usage{
14 | MSIGDB_Geneset_Names
15 | }
16 | \description{
17 | A vector of valid MSIGDB geneset names
18 | }
19 | \keyword{data}
20 | 


--------------------------------------------------------------------------------
/correlationAnalyzeR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(analyzeGenePairs)
 4 | export(analyzeGenesetTopology)
 5 | export(analyzeSingleGenes)
 6 | export(fixStrings)
 7 | export(geneVsGeneListAnalyze)
 8 | export(generateCorrelations)
 9 | export(getAvailableGenes)
10 | export(getCorrelationData)
11 | export(getTERM2GENE)
12 | export(getTissueTypes)
13 | export(getTissueVST)
14 | export(myGSEA)
15 | import(clusterProfiler)
16 | import(dplyr)
17 | import(tibble)
18 | import(tidyr)
19 | importFrom(rlang,.data)
20 | 


--------------------------------------------------------------------------------
/man/hsapiens_corrSmall_geneNames.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{hsapiens_corrSmall_geneNames}
 5 | \alias{hsapiens_corrSmall_geneNames}
 6 | \title{A vector of valid human genes to extract correlations from}
 7 | \format{
 8 | An object of class \code{character} of length 28722.
 9 | }
10 | \usage{
11 | hsapiens_corrSmall_geneNames
12 | }
13 | \description{
14 | A vector of valid human genes to extract correlations from
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/sampleVSTOrderHuman.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{sampleVSTOrderHuman}
 5 | \alias{sampleVSTOrderHuman}
 6 | \title{A vector containing the order of human samples in the sample-tissue SQL table}
 7 | \format{
 8 | An object of class \code{character} of length 81716.
 9 | }
10 | \usage{
11 | sampleVSTOrderHuman
12 | }
13 | \description{
14 | A vector containing the order of human samples in the sample-tissue SQL table
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/fixStrings.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fixStrings.R
 3 | \name{fixStrings}
 4 | \alias{fixStrings}
 5 | \title{Fix strings (helper function)}
 6 | \usage{
 7 | fixStrings(StringVec)
 8 | }
 9 | \arguments{
10 | \item{StringVec}{A vector of titles (usually GSEA) to clean for visualizations}
11 | }
12 | \value{
13 | A vector of cleaned string titles in the same order as the input.
14 | }
15 | \description{
16 | Convert vector of GSEA (or other) names to publication-ready titles
17 | }
18 | 


--------------------------------------------------------------------------------
/man/pathwayCategories.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{pathwayCategories}
 5 | \alias{pathwayCategories}
 6 | \title{A vector containing the names of valid TERM2GENE categories for GSEA_Type or pathwayType input.}
 7 | \format{
 8 | An object of class \code{character} of length 19.
 9 | }
10 | \usage{
11 | pathwayCategories
12 | }
13 | \description{
14 | A vector containing the names of valid TERM2GENE categories for GSEA_Type or pathwayType input.
15 | }
16 | \keyword{data}
17 | 


--------------------------------------------------------------------------------
/man/MSIGDB_Geneset_Small_Names.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{MSIGDB_Geneset_Small_Names}
 5 | \alias{MSIGDB_Geneset_Small_Names}
 6 | \title{A vector of valid MSIGDB geneset names with fewer than 500 genes associated with them}
 7 | \format{
 8 | An object of class \code{character} of length 21940.
 9 | }
10 | \source{
11 | msigdbr()
12 | }
13 | \usage{
14 | MSIGDB_Geneset_Small_Names
15 | }
16 | \description{
17 | A vector of valid MSIGDB geneset names with fewer than 500 genes associated with them
18 | }
19 | \keyword{data}
20 | 


--------------------------------------------------------------------------------
/man/getAvailableGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getAvailableGenes.R
 3 | \name{getAvailableGenes}
 4 | \alias{getAvailableGenes}
 5 | \title{Get Available Genes (helper function)}
 6 | \usage{
 7 | getAvailableGenes(pool = NULL)
 8 | }
 9 | \arguments{
10 | \item{pool}{an object created by pool::dbPool for accessing the SQL database.
11 | It will be created if not supplied.}
12 | }
13 | \value{
14 | A vector of genes with associated correlation data
15 | }
16 | \description{
17 | Finds available genes within correlation data
18 | }
19 | \examples{
20 | genes <- correlationAnalyzeR::getAvailableGenes()
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/man/getTissueTypes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getTissueTypes.R
 3 | \name{getTissueTypes}
 4 | \alias{getTissueTypes}
 5 | \title{Get available tissue types}
 6 | \usage{
 7 | getTissueTypes(useBlackList = FALSE, pool = NULL)
 8 | }
 9 | \arguments{
10 | \item{useBlackList}{Should black-listed tissue/disease categories for this species
11 | be removed from the returned list? Improves the quality of analysis by removing
12 | categories with low sample numbers and high observed variance.}
13 | 
14 | \item{pool}{an object created by pool::dbPool for accessing the SQL database.
15 | It will be created if not supplied.}
16 | }
17 | \value{
18 | Vector containing available tissue types.
19 | }
20 | \description{
21 | Finds tissue types with available correlation data for a given species
22 | }
23 | \examples{
24 | tissueTypes <- correlationAnalyzeR::getTissueTypes()
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/misc/makeMDF.R:
--------------------------------------------------------------------------------
 1 | # Build the combined human/mouse MSigDB gene-set table (MDF) and derive the
 2 | # list of "small" gene-set names from it.
 3 | # Requires the msigdbr and dplyr packages.
 4 | library(dplyr)
 5 | 
 6 | # Make new TERM2GENE
 7 | # mdf1 uses the msigdbr() default species; mdf2 is the mouse table, which also
 8 | # carries the mouse symbol in `gene_symbol`.
 9 | mdf1 <- msigdbr::msigdbr()
10 | mdf2 <- msigdbr::msigdbr(species = "Mus musculus")
11 | mdf1 <- mdf1 %>% select(gs_name, gs_id, gs_cat, gs_subcat, human_gene_symbol)
12 | mdf2 <- mdf2 %>% select(gs_name, gs_id, gs_cat, gs_subcat, human_gene_symbol, gene_symbol)
13 | # Join on every column of mdf1, so `gene_symbol` is the only column added.
14 | mdf <- full_join(mdf1, mdf2, by = colnames(mdf1))
15 | # Interactive sanity check: rows that have no mouse symbol after the join.
16 | which(is.na(mdf$gene_symbol))
17 | mdf <- mdf %>% select(-gs_id)
18 | # With gs_id dropped, columns 4:5 are human_gene_symbol and gene_symbol.
19 | colnames(mdf)[c(4:5)] <- c("human_gene_symbol", "mouse_gene_symbol")
20 | # TERM2GENEHuman <- mdf %>%
21 | #   filter(gs_cat %in% c("H")) %>%
22 | #   select(gs_name, human_gene_symbol) %>%
23 | #   distinct()
24 | 
25 | table(mdf$gs_subcat)
26 | # Label gene sets that have no sub-category. The pattern must be anchored:
27 | # an empty pattern matches at every position and would splice "None" between
28 | # all characters of the non-empty sub-categories as well.
29 | mdf$gs_subcat <- gsub(mdf$gs_subcat, pattern = "^$", replacement = "None")
30 | # Fold the sub-category into the category label, e.g. "<cat>_<subcat>".
31 | mdf$gs_cat <- paste0(mdf$gs_cat, "_", mdf$gs_subcat)
32 | mdf <- mdf %>% select(-gs_subcat)
33 | MDF <- mdf %>% distinct()
34 | # save(MDF, file = "misc/MDF.rda", compression_level = 9)
35 | 
36 | # Count rows per gene set and keep the names of sets with at most 500 rows.
37 | # NOTE(review): `GlobalData` is not created in this script, so this line
38 | # presumably relies on an object already present in the session -- confirm.
39 | # NOTE(review): the dataset docs say "fewer than 500 genes" while this keeps
40 | # sets with up to 500 rows (`< 501`) -- confirm which cut-off is intended.
41 | term2geneCount <- as.data.frame(table(MDF$gs_name), stringsAsFactors = FALSE)
42 | GlobalData$MSIGDB_Geneset_Small_Names <- term2geneCount$Var1[which(term2geneCount$Freq < 501)]
43 | 


--------------------------------------------------------------------------------
/man/getTissueVST.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getTissueVST.R
 3 | \name{getTissueVST}
 4 | \alias{getTissueVST}
 5 | \title{Get VST values for tissues and gene of interest}
 6 | \usage{
 7 | getTissueVST(
 8 |   genesOfInterest,
 9 |   Tissues = "all",
10 |   Sample_Type = c("all", "normal", "cancer"),
11 |   useBlackList = TRUE,
12 |   pool = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{genesOfInterest}{A length-two vector with genes to compare.}
17 | 
18 | \item{Tissues}{Which tissue type should VST be collected for? See available options
19 | with getTissueTypes().}
20 | 
21 | \item{Sample_Type}{Type of RNA Seq samples to obtain VST for? See available options
22 | with getTissueTypes().}
23 | 
24 | \item{useBlackList}{Should black-listed tissue/disease categories for this species
25 | be removed from the returned list? Improves the quality of analysis by removing
26 | categories with low sample numbers and high observed variance.}
27 | 
28 | \item{pool}{an object created by pool::dbPool for accessing the SQL database.
29 | It will be created if not supplied.}
30 | }
31 | \value{
32 | List of VST matrices for each selected tissue-disease combination.
33 | }
34 | \description{
35 | Downloads VST values for tissues of interest
36 | }
37 | \examples{
38 | VSTdata <- getTissueVST(genesOfInterest = c("BRCA1", "ATM"),
39 |                     Tissues = c("brain", "respiratory"),
40 |                     Sample_Type = "all",
41 |                     useBlackList = TRUE)
42 | }
43 | 


--------------------------------------------------------------------------------
/man/getCorrelationData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getCorrelationData.R
 3 | \name{getCorrelationData}
 4 | \alias{getCorrelationData}
 5 | \title{Get Gene Correlation Data}
 6 | \usage{
 7 | getCorrelationData(Sample_Type, Tissue, geneList, corrMat = NULL, pool = NULL)
 8 | }
 9 | \arguments{
10 | \item{Sample_Type}{Type of RNA Seq samples used to create correlation data.
11 | Either "all", "normal", or "cancer". Can be a single value for all genes,
12 | or a vector corresponding to geneList. Not used if corrMat is set.}
13 | 
14 | \item{Tissue}{Which tissue type should gene correlations be derived from?
15 | Default = "all". Can be a single value for all genes,
16 | or a vector corresponding to geneList. Not used if corrMat is set.
17 | Run getTissueTypes() to see available tissue list.}
18 | 
19 | \item{geneList}{Vector of genes for which data will be extracted.}
20 | 
21 | \item{corrMat}{A custom correlation matrix generated by generateCorrelations()
22 | to use instead of pre-supplied databases.}
23 | 
24 | \item{pool}{an object created by pool::dbPool for accessing the SQL database.
25 | It will be created if not supplied. Not used if corrMat is set.}
26 | }
27 | \value{
28 | A correlation data frame object
29 | }
30 | \description{
31 | Obtain correlation data by querying MySQL database
32 | }
33 | \examples{
34 | corrData <- correlationAnalyzeR::getCorrelationData(Sample_Type = "normal",
35 |                                        Tissue = "kidney",
36 |                                        geneList = c("ATM", "BRCA1"))
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/R/getAvailableGenes.R:
--------------------------------------------------------------------------------
 1 | #' Get Available Genes (helper function)
 2 | #'
 3 | #' Finds available genes within correlation data
 4 | #'
 5 | #' @param pool an object created by pool::dbPool for accessing the SQL database.
 6 | #' It will be created if not supplied.
 7 | #' @return A vector of genes with associated correlation data
 8 | #'
 9 | #' @examples
10 | #' genes <- correlationAnalyzeR::getAvailableGenes()
11 | #'
12 | #' @export
13 | getAvailableGenes <- function(pool = NULL) {
14 | 
15 |   # Species is fixed to human here; it is kept as a local so the species
16 |   # dispatch below still works if it is turned back into an argument.
17 |   Species <- "hsapiens"
18 | 
19 |   # A pool that is no longer valid is discarded rather than passed on.
20 |   if (! is.null(pool) && ! pool$valid) {
21 |     pool <- NULL
22 |   }
23 | 
24 |   # One gene symbol per species, used as the probe for the query below.
25 |   probeGene <- switch(
26 |     Species[1],
27 |     hsapiens = "A1BG",
28 |     mmusculus = "A1bg",
29 |     stop("\ncorrelationAnalyzeR currently supports only Human and Mouse data.
30 |          Please select either 'hsapiens' or 'mmusculus' for Species parameter.
31 |          \n")
32 |   )
33 | 
34 |   # Fetch correlations for the probe gene in normal brain samples; the row
35 |   # names of the returned data frame are reported as the available genes.
36 |   probeCorrelations <- correlationAnalyzeR::getCorrelationData(
37 |     Sample_Type = "normal",
38 |     Tissue = "brain",
39 |     geneList = probeGene,
40 |     pool = pool
41 |   )
42 | 
43 |   rownames(probeCorrelations)
44 | }
45 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: correlationAnalyzeR
 2 | Type: Package
 3 | Title: Generate Novel Insights from Gene Correlation Data
 4 | Date: 2021-03-29
 5 | Version: 1.0.0
 6 | Authors@R: person("Henry", "Miller", email = "millerh1@livemail.uthscsa.edu",
 7 |     role = c("aut", "cre"))
 8 | Description: Correlation AnalyzeR is a powerful tool for understanding how genes
 9 |     relate to each other based on the world's largest repository of standardized
10 |     RNA-Seq data, ARCHS4 [https://amp.pharm.mssm.edu/archs4/index.html]. In 
11 |     'single-gene' mode, correlation data for genes of interest are used to
12 |     return correlated pathways by GSEA. In 'Gene vs gene' mode, the differences 
13 |     between two genes of interest are elucidated based on their 
14 |     co-expression correlations. In 'Gene vs gene list' mode, a user can
15 |     examine how a primary gene of interest correlates with a defined list of
16 |     other genes. Finally, 'Topology' mode examines correlation data for a
17 |     defined gene group to generate a robust gene-level topology.
18 | Depends: R (>= 3.6)
19 | License: MIT
20 | Encoding: UTF-8
21 | LazyData: true
22 | VignetteBuilder: knitr
23 | Suggests: knitr, rmarkdown, EnsDb.Hsapiens.v86, airway, tidyverse
24 | RoxygenNote: 7.1.1
25 | Imports: 
26 |     shiny,
27 |     clusterProfiler,
28 |     ggplot2,
29 |     ggpubr,
30 |     data.table,
31 |     gridExtra,
32 |     grid,
33 |     dplyr,
34 |     tidyr,
35 |     tibble,
36 |     gplots,
37 |     RColorBrewer,
38 |     fgsea,
39 |     parallel,
40 |     SummarizedExperiment,
41 |     WGCNA,
42 |     ggplotify,
43 |     pool,
44 |     DESeq2,
45 |     ggrepel,
46 |     stringr,
47 |     matrixStats,
48 |     metaMA,
49 |     msigdbr (>= 7.2.1),
50 |     pheatmap,
51 |     preprocessCore,
52 |     enrichplot,
53 |     boot,
54 |     SuperExactTest,
55 |     Rtsne,
56 |     RMySQL,
57 |     DBI,
58 |     rlang,
59 |     grDevices,
60 |     stats,
61 |     utils
62 | 


--------------------------------------------------------------------------------
/man/myGSEA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/myGSEA.R
 3 | \name{myGSEA}
 4 | \alias{myGSEA}
 5 | \title{Wrapper for clusterProfiler's GSEA()}
 6 | \usage{
 7 | myGSEA(
 8 |   ranks,
 9 |   TERM2GENE,
10 |   padjustedCutoff = 0.05,
11 |   returnDataOnly = TRUE,
12 |   nperm = 2000,
13 |   topPlots = FALSE,
14 |   outDir,
15 |   Condition = "GSEA Results",
16 |   plotFile = "GSEA_results"
17 | )
18 | }
19 | \arguments{
20 | \item{ranks}{Numeric of gene 'scores' ordered by decreasing value and
21 | named with gene symbols.}
22 | 
23 | \item{TERM2GENE}{Data frame with two columns: gene set identifiers and
24 | gene symbols. Can be generated using correlationAnalyzeR::getTERM2GENE()}
25 | 
26 | \item{padjustedCutoff}{Value to use as a cutoff for returned gene sets.}
27 | 
28 | \item{returnDataOnly}{Should GSEA data/plots be saved to file? Default: TRUE}
29 | 
30 | \item{nperm}{Number of permutations to run. Default is 2000}
31 | 
32 | \item{topPlots}{Should top GSEA pathways be plotted? Default: FALSE}
33 | 
34 | \item{outDir}{output directory.}
35 | 
36 | \item{Condition}{Name to use for titles of plots. Default = "GSEA Results".}
37 | 
38 | \item{plotFile}{prefix to use for naming output files.}
39 | }
40 | \value{
41 | Named list containing GSEA() output, GSEA data frame, and visualizations.
42 | }
43 | \description{
44 | Runs GSEA() from clusterProfiler and creates useful
45 | visualizations.
46 | }
47 | \examples{
48 | corrDF <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = c("BRCA1"),
49 |                                                   returnDataOnly = TRUE,
50 |                                                   runGSEA = FALSE,
51 |                                                   Sample_Type = "normal")
52 | ranks <- corrDF$correlations[,1]
53 | names(ranks) <- rownames(corrDF$correlations)
54 | TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple",
55 |                                                Species = "hsapiens")
56 | res <- correlationAnalyzeR::myGSEA(ranks = ranks,
57 |                                    TERM2GENE = TERM2GENE,
58 |                                    plotFile = "GSEA_out", outDir = getwd(),
59 |                                    topPlots = FALSE, returnDataOnly=TRUE,
60 |                                     Condition = "GSEA Results")
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/man/generateCorrelations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/generateCorrelations.R
 3 | \name{generateCorrelations}
 4 | \alias{generateCorrelations}
 5 | \title{Generate a correlation matrix from user-supplied data}
 6 | \usage{
 7 | generateCorrelations(cts, transformed = FALSE, cores = 1)
 8 | }
 9 | \arguments{
10 | \item{cts}{a gene count matrix where rownames are genes and colnames are sample IDs.}
11 | 
12 | \item{transformed}{Boolean. Indicates whether data is already transformed using
13 | VST or a similar approach. If TRUE, VST transformation with DESeq2 will not be
14 | performed. Default: FALSE}
15 | 
16 | \item{cores}{Numeric. Number of cores to use for calculating size factors.
17 | NOTE: cores > 1 does not work on Windows. Default: 1.}
18 | }
19 | \value{
20 | Matrix with gene co-expression correlations.
21 | }
22 | \description{
23 | Generate a correlation matrix from user-supplied data
24 | }
25 | \details{
26 | This function performs the same normalization and transformation steps on
27 | a user-supplied dataset that were originally used to generate the data provided
28 | in the pre-calculated databases used by this package. The resulting correlation
29 | matrix can be supplied to analyzeSingleGenes() as an input.
30 | NOTE: the resulting matrix is very large and will take up ~8 GB of memory.
31 | }
32 | \examples{
33 | 
34 | if (! 'airway' \%in\% rownames(installed.packages())) {
35 |     if (!requireNamespace("BiocManager", quietly = TRUE))
36 |         install.packages("BiocManager")
37 |     BiocManager::install("airway")
38 | }
39 | 
40 | if (! 'EnsDb.Hsapiens.v86' \%in\% rownames(installed.packages())) {
41 |     if (!requireNamespace("BiocManager", quietly = TRUE))
42 |         install.packages("BiocManager")
43 |     BiocManager::install("EnsDb.Hsapiens.v86")
44 | }
45 | 
46 | if (! 'dplyr' \%in\% rownames(installed.packages())) {
47 |     install.packages("dplyr")
48 | }
49 | 
50 | data(airway)
51 | cts <- assay(airway)
52 | ens2gene <- ensembldb::select(EnsDb.Hsapiens.v86, keys = rownames(cts),
53 |                               columns = c("SYMBOL"), keytype = "GENEID") \%>\%
54 |   dplyr::distinct(SYMBOL, .keep_all = TRUE) \%>\%
55 |   dplyr::inner_join(y = data.frame("GENEID" = rownames(cts)))
56 | 
57 | cts <- cts[ens2gene$GENEID,]
58 | rownames(cts) <- ens2gene$SYMBOL
59 | 
60 | corrMat <- generateCorrelations(cts)
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/R/getTissueTypes.R:
--------------------------------------------------------------------------------
 1 | #' Get available tissue types
 2 | #'
 3 | #' Finds tissue types with available correlation data for a given species
 4 | #'
 5 | #' @param useBlackList Should black-listed tissue/disease categories for this species
 6 | #' be removed from the returned list? Improves the quality of analysis by removing
 7 | #' categories with low sample numbers and high observed variance.
 8 | #' @param pool an object created by pool::dbPool for accessing the SQL database.
 9 | #' It will be created if not supplied.
10 | #' @return Vector containing available tissue types.
11 | #'
12 | #' @examples
13 | #' tissueTypes <- correlationAnalyzeR::getTissueTypes()
14 | #'
15 | #' @export
16 | getTissueTypes <- function(useBlackList = FALSE, pool = NULL) {
17 | 
18 |   # Only the human tables are queried for now.
19 |   Species <- "hsapiens"
20 | 
21 |   # Total number of direct connection attempts before giving up.
22 |   maxAttempts <- 3
23 | 
24 |   # A pool that is no longer valid is handled like a missing pool.
25 |   if (! is.null(pool) && ! pool$valid) {
26 |     pool <- NULL
27 |   }
28 | 
29 |   if (! is.null(pool)) {
30 |     # Borrow a connection from the caller's pool and hand it back on exit.
31 |     conn <- pool::poolCheckout(pool)
32 |     on.exit(pool::poolReturn(conn), add = TRUE)
33 |   } else {
34 |     # No usable pool: open a direct connection, retrying on failure.
35 |     conn <- NULL
36 |     attempt <- 1
37 |     while (is.null(conn)) {
38 |       conn <- try(
39 |         DBI::dbConnect(
40 |           drv = RMySQL::MySQL(),
41 |           user = "public-rds-user@m2600az-db01p.mysql.database.azure.com", port = 3306,
42 |           dbname="correlation_analyzer",
43 |           password='public-user-password',
44 |           host="m2600az-db01p.mysql.database.azure.com"
45 |         ),
46 |         silent = TRUE
47 |       )
48 |       if (inherits(conn, "try-error")) {
49 |         if (attempt == maxAttempts) {
50 |           stop("Unable to connect to database. Check internet connection and please contact",
51 |                " package maintainer if you believe this is an error.")
52 |         }
53 |         # Report the number of attempts that actually remain.
54 |         warning(paste0("Failed to establish connection to database ... retrying now ... ",
55 |                        (maxAttempts - attempt), " attempts left."),
56 |                 immediate. = TRUE)
57 |         conn <- NULL
58 |         attempt <- attempt + 1
59 |         Sys.sleep(1)
60 |       }
61 |     }
62 | 
63 |     on.exit(DBI::dbDisconnect(conn), add = TRUE)
64 |   }
65 | 
66 |   # Keep the correlation tables for this species and strip the common prefix;
67 |   # what remains is parsed below as <type>_<tissue>.
68 |   tabs <- DBI::dbListTables(conn)
69 |   tabs <- tabs[grep(tabs, pattern = paste0("correlations_", Species))]
70 |   tabs <- gsub(tabs, pattern = paste0("correlations_", Species, "_"),
71 |                replacement = "")
72 |   if (useBlackList) {
73 |     blackList <- correlationAnalyzeR::blackListHuman
74 |     # An empty blacklist would collapse to the pattern "", which matches every
75 |     # table and would drop them all, so only filter when there are entries.
76 |     if (length(blackList) > 0) {
77 |       tabs <- tabs[grep(x = tabs, pattern = paste(blackList, collapse = "|"), invert = TRUE)]
78 |     }
79 |   }
80 | 
81 |   # Without this guard paste0() below would turn "no tables" into the single
82 |   # bogus entry " - ".
83 |   if (length(tabs) == 0) {
84 |     return(character(0))
85 |   }
86 | 
87 |   # First underscore-separated token is the sample type, the rest is the tissue.
88 |   tissues <- gsub(tabs, pattern = "^([a-z]+)_([a-z_]+)$", replacement = "\\2")
89 |   types <- gsub(tabs, pattern = "^([a-z]+)_([a-z_]+)$", replacement = "\\1")
90 |   result <- paste0(tissues, " - ", types)
91 |   result <- result[order(result)]
92 |   return(result)
93 | }
94 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | # # Get datasets
 2 | # load("data/MSIGDB_Geneset_Names.rda")
 3 | # usethis::use_data(MSIGDB_Geneset_Names, internal = F, overwrite = T)
 4 | #
 5 | # load("data/blackListHuman.rda")
 6 | # usethis::use_data(blackListHuman, internal = F, overwrite = T)
 7 | #
 8 | # load("data/blackListMouse.rda")
 9 | # usethis::use_data(blackListMouse, internal = F, overwrite = T)
10 | #
11 | #
12 | # load("data/hsapiens_corrSmall_geneNames.rda")
13 | # usethis::use_data(hsapiens_corrSmall_geneNames, internal = F, overwrite = T)
14 | #
15 | # load("data/mmusculus_corrSmall_geneNames.rda")
16 | # usethis::use_data(mmusculus_corrSmall_geneNames, internal = F, overwrite = T)
17 | #
18 | # load("data/humanGenesVST.rda")
19 | # usethis::use_data(humanGenesVST, internal = F, overwrite = T)
20 | #
21 | # load("data/mouseGenesVST.rda")
22 | # usethis::use_data(mouseGenesVST, internal = F, overwrite = T)
23 | #
24 | # load("data/sampleVSTOrderHuman.rda")
25 | # usethis::use_data(sampleVSTOrderHuman, internal = F, overwrite = T)
26 | #
27 | # load("data/sampleVSTOrderMouse.rda")
28 | # usethis::use_data(sampleVSTOrderMouse, internal = F, overwrite = T)
29 | #
30 | # load("data/MSIGDB_Geneset_Small_Names.rda")
31 | # usethis::use_data(MSIGDB_Geneset_Small_Names, internal = F, overwrite = T)
32 | 
33 | # load("misc/groupList_human_raw.RData")
34 | # human_coldata <- colData_human_raw
35 | # usethis::use_data(human_coldata)
36 | # human_grouplist <- groupList_human_raw
37 | # usethis::use_data(human_grouplist)
38 | 
39 | # tabDF <- as.data.frame(table( paste0(human_coldata$Tissue, " - ", human_coldata$disease)), stringsAsFactors = F)
40 | # keepSamps <- unique(tabDF$Var1[tabDF$Freq > 30])
41 | # keepSamps <- tolower(keepSamps)
42 | # newList <- unlist(correlationAnalyzeR::human_grouplist, recursive = F)
43 | # groupNow <- gsub(names(newList), pattern = "\\.", replacement = " - ")
44 | # blackListHuman <- groupNow[! groupNow %in% keepSamps]
45 | # blackListHuman <- blackListHuman[grep(blackListHuman, pattern = " all", invert = T)]
46 | # usethis::use_data(blackListHuman, overwrite = T)
47 | 
48 | 
49 | #' A data frame of sample info from GEO (human)
50 | #'@docType data
51 | #'@keywords data
52 | "human_coldata"
53 | 
54 | #' A list of sample categorizations (human)
55 | #'@docType data
56 | #'@keywords data
57 | "human_grouplist"
58 | 
59 | #' A vector of valid MSIGDB geneset names
60 | #'@source msigdbr()
61 | #'@docType data
62 | #'@keywords data
63 | "MSIGDB_Geneset_Names"
64 | 
65 | #' A vector of valid MSIGDB geneset names with fewer than 500 genes associated with them
66 | #'@source msigdbr()
67 | #'@docType data
68 | #'@keywords data
69 | "MSIGDB_Geneset_Small_Names"
70 | 
71 | #' A vector of blacklisted tissue-disease categories for human samples
72 | #' @docType data
73 | #' @keywords data
74 | "blackListHuman"
75 | 
76 | #' A vector of valid human genes to extract correlations from
77 | #' @docType data
78 | #' @keywords data
79 | "hsapiens_corrSmall_geneNames"
80 | 
81 | 
82 | #' A vector of valid human genes with VST data available
83 | #' @docType data
84 | #' @keywords data
85 | "humanGenesVST"
86 | 
87 | 
88 | #' A vector containing the order of human samples in the sample-tissue SQL table
89 | #' @docType data
90 | #' @keywords data
91 | "sampleVSTOrderHuman"
92 | 
93 | 
94 | #' A vector containing the names of valid TERM2GENE categories for GSEA_Type or pathwayType input.
95 | #' @docType data
96 | #' @keywords data
97 | "pathwayCategories"
98 | 


--------------------------------------------------------------------------------
/misc/FigS1_removed.R:
--------------------------------------------------------------------------------
  1 | # Refseq summary
  2 | library(biomaRt)
  3 | library(openxlsx)
  4 | 
  5 | ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
  6 | martRes <- getBM(attributes = c("external_gene_name",
  7 |                                 "refseq_mrna"),
  8 |                  mart = ensembl)
  9 | geneInfo <- read.xlsx("../summaryRefseq.xlsx")
 10 | colnames(geneInfo) <- c("refseq_mrna", "something", "Function")
 11 | geneInfoFinal <- merge(x = martRes, y = geneInfo, by = "refseq_mrna")
 12 | 
 13 | 
 14 | geneInfoFinal <- geneInfoFinal[,c(2, 4)]
 15 | geneInfoFinal <- unique(geneInfoFinal)
 16 | geneWithRef <- geneInfoFinal$external_gene_name[which(geneInfoFinal$Function != "")]
 17 | 
 18 | 
 19 | geneInfo$refSeq <- FALSE
 20 | geneInfo$refSeq[grep(geneInfo$Function, pattern = "RefSeq")] <- TRUE
 21 | toPlot <- as.data.frame(table(geneInfo$refSeq))
 22 | 
 23 | # Fig 1A
 24 | pdf("fig1Pie.pdf", height = 6, width = 8)
 25 | pie(toPlot$Freq, col = c("azure", "coral1"),
 26 |     labels = c("No annotation (59.6%)", "RefSeq (40.4%)"),
 27 |     init.angle = 90)
 28 | dev.off()
 29 | 
 30 | 
 31 | 
 32 | # Get the survival tables from GEPIA and determine # with refseq acc
 33 | 
 34 | names <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "DLBC", "ESCA",
 35 |            "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC",
 36 |            "LUAD", "LUSC", "MESO", "OV")
 37 | files <- list.files("data/survivalTables/")
 38 | orderNum <- gsub(files, pattern = ".*\\((.+)\\).*", replacement = "\\1")
 39 | orderNum[20] <- 0
 40 | orderNum <- as.numeric(orderNum)
 41 | files <- files[order(orderNum)]
 42 | names(files) <- names
 43 | 
 44 | nIn <- 100
 45 | inVec <- c()
 46 | outVec <- c()
 47 | for (i in 1:length(files)) {
 48 |   file <- files[i]
 49 |   group <- names(file)
 50 |   print(group)
 51 |   df <- read.table(file.path("data/survivalTables", file), stringsAsFactors = FALSE,
 52 |                    sep = "\t", header = TRUE)
 53 |   df <- unique(df[c(1:nIn),c(1, 3)])
 54 |   outVec <- c(outVec, length(df$Gene.Symbol[which(! df$Gene.Symbol %in% geneWithRef)]))
 55 |   inVec <- c(inVec, length(df$Gene.Symbol[which(df$Gene.Symbol %in% geneWithRef)]))
 56 | }
 57 | 
 58 | library(ggpubr)
 59 | plotDF <- data.frame(group = rep(names(files), 2), Annotation = c(rep("RefSeq", length(files)),
 60 |                                                                   rep("No Annotation", length(files))),
 61 |                      number = c(inVec, outVec), stringsAsFactors = FALSE)
 62 | plotDF <- plotDF[order(plotDF$Annotation, plotDF$number),]
 63 | 
 64 | # Fig 1B
 65 | ggbarplot(data = plotDF, x = "group", y = "number", legend = "right",
 66 |           font.tickslab = c(15), font.legend = c(15),
 67 |           palette = c("azure", "coral1"),
 68 |           fill = "Annotation") +
 69 |   rotate() + rremove("xlab") +
 70 |   rremove("ylab") +
 71 |   rremove("legend.title")
 72 | 
 73 | 
 74 | 
 75 | 
 76 | 
 77 | 
 78 | # # Make public user
 79 | # library(DBI)
 80 | # credFile <- "misc/credFile.txt"
 81 | # credentials <- suppressWarnings(read.delim(credFile, sep = ";",
 82 | #                                            header = FALSE, stringsAsFactors = F))
 83 | # uName <- credentials$V1
 84 | # pWord <- credentials$V2
 85 | # conn <- dbConnect(drv = RMySQL::MySQL(), user = uName,
 86 | #                   port = 3306, dbname="correlation_analyzer",
 87 | #                   password=pWord,
 88 | #                   host="m2600az-db01p.mysql.database.azure.com")
 89 | # sql <- "CREATE USER 'public-rds-user'@'%' IDENTIFIED BY 'public-user-password';"
 90 | # dbExecute(conn, sql)
 91 | # dbDisconnect(conn)
 92 | # sql <- "GRANT SELECT ON * TO 'public-rds-user'@'%'"
 93 | # dbExecute(conn, sql)
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/uploadVSD.R:
--------------------------------------------------------------------------------
  1 | load("Data/vsd_for_corr.rda")
  2 | 
  3 | library(foreach)
  4 | doParallel::registerDoParallel()
  5 | credFile <- "Data/credFile.txt"
  6 | source("Scripts/helpers.R")
  7 | 
  8 | # Process for upload
  9 | if (! file.exists("Data/geneVSD_forUpload_geneList.RData")) {
 10 |   # Get tx2gene objects
 11 |   # Aggregate transcript TPM to gene level
 12 |   cores <- 120
 13 |   genes <- rownames(vsd)
 14 |   nr <- length(genes)
 15 |   n <- ceiling(nr/cores)
 16 |   dfVsd <- as.data.frame(assay(vsd))
 17 |   forUploadList <- split(dfVsd, rep(1:ceiling(nr/n), each=n, length.out=nr))
 18 |   samplesVSD <- colnames(dfVsd)    
 19 |   save(samplesVSD, file = "Data/samplesVSD.rda")
 20 |   k <- length(forUploadList)
 21 |   print("running foreach")
 22 |   start <- proc.time()
 23 |   resList <- mclapply(forUploadList, FUN = makeUploadVSD, mc.cores = 10)
 24 |   
 25 |   finalDF <- data.table::rbindlist(resList)
 26 |   ans <- proc.time() - start
 27 |   print(ans)
 28 |   print("Saving")
 29 |   save(finalDF, file = file.path("Data/geneVSD_forUpload.RData"))
 30 |   humanGenesTPM <- unique(as.character(finalDF$geneName))
 31 |   save(humanGenesTPM,file = "Data/geneVSD_forUpload_geneList.RData")
 32 | }
 33 | 
 34 | # Upload
 35 | finalDF2 <- finalDF[,c(-1)]
 36 | rownames(finalDF2) <- finalDF$geneName
 37 | checkBool <- uploadToAzureGeneKey(finalDF2, credentials = credFile,
 38 |                                   tableName = "vsd_hsapiens", check = T)
 39 | if (checkBool & ! doKey) {
 40 |   # Upload to AWS
 41 |   print("uploading to Azure ... ")
 42 |   uploadToAzureGeneKey(finalDF2, credentials = credFile,
 43 |                 tableName = "vsd_hsapiens", check = F)
 44 |   # sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
 45 |   # dbExecute(conn = conn, statement = sql)
 46 |   # tryCatch(expr = {uploadToAzure(finalDF2, tableName)},
 47 |   #          error = function(e) {cat("AWS error -- probably not enough space")})
 48 |   
 49 | } else {
 50 |   print("Table already uploaded ... ")
 51 | }
 52 | if (doKey) {
 53 |   credentials <- "Data/credFile.txt"
 54 |   credentials <- suppressWarnings(read.delim(credentials, sep = ";",
 55 |                                              header = FALSE, stringsAsFactors = F))
 56 |   uName <- credentials$V1
 57 |   pWord <- credentials$V2
 58 |   tableName = "vsd_hsapiens"
 59 |   conn <- dbConnect(drv = RMySQL::MySQL(), user = uName, 
 60 |                     port = 3306, dbname="correlation_analyzer",
 61 |                     password=pWord,
 62 |                     host="m2600az-db01p.mysql.database.azure.com")
 63 |   sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
 64 |   done <- 0
 65 |   while (done  == 0) {
 66 |     done <- tryCatch(expr = {
 67 |       dbExecute(conn = conn, statement = sql)
 68 |       dbDisconnect(conn)
 69 |       rm(conn)
 70 |       gc()
 71 |       1
 72 |     },
 73 |     error = function(e) {
 74 |       print(e$message)
 75 |       retCall <- "could not run statement: Multiple primary key defined"
 76 |       if (e$message == retCall) {
 77 |         cat("Already defined key")
 78 |         dbDisconnect(conn)
 79 |         rm(conn)
 80 |         gc()
 81 |         1
 82 |       } else {
 83 |         cat("Fail -- retry with new connection")
 84 |         rm(conn)
 85 |         gc()
 86 |         conn <- dbConnect(drv = RMySQL::MySQL(), user = uName,
 87 |                           port = 3306, dbname="correlation_analyzer",
 88 |                           password=pWord,
 89 |                           host="m2600az-db01p.mysql.database.azure.com")
 90 |         0
 91 |       }
 92 |     })
 93 |     print(done)
 94 |   }
 95 | }
 96 | lapply(dbListConnections(drv = RMySQL::MySQL()), dbDisconnect)
 97 | Sys.sleep(3)
 98 | cat("\nNext sample ... \n")
 99 | rm(finalDF2)
100 | gc()
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 


--------------------------------------------------------------------------------
/misc/normality.R:
--------------------------------------------------------------------------------
  1 | # ## Part #1: Add in correlation of two genes with different normalization ##
  2 | # set.seed(42)
  3 | #
  4 | # gene1 <- "BRCA1"
  5 | # gene2 <- "NQO1"
  6 | # Sample_Type <- "cancer"
  7 | # Tissue <- "bone"
  8 | #
  9 | # vstB <- getTissueVST(genesOfInterest = c(gene1, gene2),
 10 | #                      Sample_Type = Sample_Type,
 11 | #                      Tissues = Tissue) %>%
 12 | #   bind_rows() %>%
 13 | #   inner_join(correlationAnalyzeR::human_coldata, by = "samples")
 14 | # B1_corr <- getCorrelationData(geneList = gene1,
 15 | #                               Sample_Type = Sample_Type, Tissue = Tissue)
 16 | # pDF <- apply(B1_corr, MARGIN = 1:2, n = length(B1_corr[,1]), FUN = function(x, n) {
 17 | #   stats::dt(abs(x)/sqrt((1-x^2)/(n-2)), df = 2)
 18 | # })
 19 | # padj <- p.adjust(pDF[,1], method = "BH")
 20 | # B1B2r <- B1_corr[gene2,]
 21 | # B1B2p <- pDF[gene2,]
 22 | # B1B2padj <- padj[gene2]
 23 | # to_sample <- ifelse(length(vstB$samples) > 5000, 5000, length(vstB$samples))
 24 | # plt <- vstB %>%
 25 | #   filter(samples %in% sample(samples, to_sample)) %>%
 26 | #   ggplot(aes(x = !!sym(gene1), y = !!sym(gene2), color = disease,
 27 | #              text = paste0("samples", "\n", "disease", "\n", "Tissue"))) +
 28 | #   geom_point(alpha = .5) +
 29 | #   labs(title = paste0(gene1, " vs ", gene2),
 30 | #        subtitle = paste0("Pearson's R = ", round(B1B2r, 3),
 31 | #                          " (padj = ", signif(B1B2padj, 3), ")")) +
 32 | #   ggplot2::theme_bw(base_size = 16) +
 33 | #   scale_color_manual(name = "Disease", values = c("firebrick", "forestgreen")) +
 34 | #   xlab(paste0(gene1, " Expression (VST)")) +
 35 | #   ylab(paste0(gene2, " Expression (VST)"))
 36 | # plt
 37 | 
 38 | ## Part 2: Compare different correlation types ##
 39 | # genes_to_test <- sample(correlationAnalyzeR::humanGenesVST, 1000)
 40 | # my_corr <- correlationAnalyzeR::getTissueVST(genesOfInterest = genes_to_test,
 41 | #                                              Sample_Type = "all")
 42 | # my_corr2 <- bind_rows(my_corr) %>%
 43 | #   inner_join(correlationAnalyzeR::human_coldata, by = "samples") %>%
 44 | #   distinct(samples, .keep_all = TRUE)
 45 | # save(genes_to_test, my_corr2, file = "misc/my_corr2.rda")
 46 | 
 47 | 
 48 | load("misc/my_corr2.rda")
 49 | 
 50 | res <- lapply(seq(genes_to_test), FUN = function(i) {
 51 |   print(i)
 52 |   gene_now <- genes_to_test[i]
 53 |   resS <- lapply(seq(1000), function(j) {
 54 |     my_corr_tmp <- my_corr2  %>%
 55 |       filter(samples %in% sample(my_corr2$samples, 25))
 56 |     df <- tryCatch({
 57 |       ddb1 <- shapiro.test(my_corr_tmp[,gene_now])
 58 |       pval <- ddb1$p.value
 59 |       w <- ddb1$statistic
 60 |       data.frame(pval, w)
 61 |     },
 62 |     error=function(cond){
 63 |       return("STOP")
 64 |     })
 65 |     return(df)
 66 |   })
 67 | 
 68 |   if (any(unlist(resS) == "STOP")) {
 69 |     return(NULL)
 70 |   }
 71 |   dd <- bind_rows(resS)
 72 |   dd$padj <- p.adjust(dd$pval)
 73 |   dd$gene <- gene_now
 74 |   return(dd)
 75 | })
 76 | 
 77 | boot_shapiro <- bind_rows(res)
 78 | readr::write_csv(boot_shapiro, "misc/boot_shapiro.csv")
 79 | # boot_shapiro <- read_csv("misc/boot_shapiro.csv")
 80 | #
 81 | # hist(-log10(boot_shapiro$padj), breaks = 100)
 82 | #
 83 | # quantile(-log10(boot_shapiro$padj))
 84 | #
 85 | #
 86 | # # boot_shapiro %>%
 87 | # #   group_by(gene) %>%
 88 | # #   summarise(avg = median(padj)) -> dd
 89 | # # hist(dd$avg)
 90 | #
 91 | # #
 92 | # #
 93 | #
 94 | # x <- my_corr2$AC012354.8
 95 | # hist(x)
 96 | # qqnorm(x); qqline(x)
 97 | #
 98 | #
 99 | 
# Compare Kendall, Pearson and Spearman correlation for BRCA1 vs BRCA2 using
# the VST data returned by analyzeGenePairs().
res <- analyzeGenePairs(genesOfInterest = c("BRCA1", "BRCA2"),
                        runGSEA = FALSE)

corrPlots <- res$compared$VST_corrPlot
corrPlots$corrPlot_all + labs(subtitle = NULL)

uuu <- corrPlots$corrPlot_VST_data
brca1 <- uuu$BRCA1
brca2 <- uuu$BRCA2
cor(x = brca1, y = brca2, method = "kendall")
cor(x = brca1, y = brca2, method = "pearson")
cor(x = brca1, y = brca2, method = "spearman")
111 | 


--------------------------------------------------------------------------------
/man/geneVsGeneListAnalyze.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/analyzeGeneVsGeneList.R
 3 | \name{geneVsGeneListAnalyze}
 4 | \alias{geneVsGeneListAnalyze}
 5 | \title{Analyze gene vs gene list relationship}
 6 | \usage{
 7 | geneVsGeneListAnalyze(
 8 |   pairedGenesList,
 9 |   Sample_Type = c("normal", "cancer"),
10 |   Tissue = "all",
11 |   outputPrefix = "CorrelationAnalyzeR_Output",
12 |   plotLabels = TRUE,
13 |   sigTest = TRUE,
14 |   nPerm = 2000,
15 |   plotMaxMinCorr = TRUE,
16 |   onlyTop = FALSE,
17 |   topCutoff = 0.5,
18 |   autoRug = TRUE,
19 |   plotTitle = TRUE,
20 |   returnDataOnly = TRUE,
21 |   pool = NULL,
22 |   makePool = FALSE
23 | )
24 | }
25 | \arguments{
26 | \item{pairedGenesList}{A list, named with primary genes of interest with
27 | vectors of secondary genes to test against OR a string containing the
28 | official MSIGDB name for a gene set of interest. See examples.}
29 | 
30 | \item{Sample_Type}{Type of RNA Seq samples used to create correlation data.
31 | Either "all", "normal", or "cancer". Can be a single value for all genes,
32 | or a vector corresponding to the entries in pairedGenesList. Default: "normal".}
33 | 
34 | \item{Tissue}{Which tissue type should gene correlations be derived from?
35 | Can be a single value for all genes, or a vector corresponding to the entries in pairedGenesList.
36 | Run getTissueTypes() to see available tissues. Default: "all"}
37 | 
38 | \item{outputPrefix}{Prefix for saved files. Should include directory info.}
39 | 
40 | \item{plotLabels}{If TRUE, correlation histograms will contain labeled lines showing
41 | secondary genes and their correlation values.
42 | If list of secondary genes is large, set this to FALSE or onlyTop to TRUE
43 | to avoid cluttering the plot. Default: TRUE.}
44 | 
45 | \item{sigTest}{Should the results be compared against random? Default: TRUE.}
46 | 
47 | \item{nPerm}{Number of bootstrap sampling events to run during sigTest. Default: 2000.}
48 | 
49 | \item{plotMaxMinCorr}{If TRUE, the top correlated and anti-correlated genes
50 | will be plotted alongside the selected secondary genes. Default: TRUE.}
51 | 
52 | \item{onlyTop}{For larger secondary gene lists -- This will filter the
53 | number of secondary genes which are plotted to avoid clutter if plotLabels = TRUE.
54 | Default: FALSE.}
55 | 
56 | \item{topCutoff}{The value used for filtering if 'onlyTop' is 'TRUE'. Default: .5}
57 | 
58 | \item{autoRug}{If the size of a secondary gene list > 50, plot lines will be replaced
59 | by an auto-generated rug. Default: TRUE.}
60 | 
61 | \item{plotTitle}{If TRUE, plot title will be added to visualizations. Default: TRUE.}
62 | 
63 | \item{returnDataOnly}{if TRUE will only return a list containing correlations
64 | and significance testing results if applicable. Default: TRUE.}
65 | 
66 | \item{pool}{an object created by pool::dbPool for accessing the SQL database.
67 | It will be created if not supplied.}
68 | 
69 | \item{makePool}{Logical. Should a pool be created if one is not supplied? Default: FALSE.}
70 | }
71 | \value{
72 | A list containing correlation values and significance testing results
73 | }
74 | \description{
75 | Explores how a list of secondary genes relates to a primary gene of interest
76 | }
77 | \details{
78 | Gene vs Gene List mode allows users to determine how similarly correlated a gene
79 | is with a list of secondary genes. Please view the vignette for more information and
80 | examples.
81 | }
82 | \examples{
83 | pairedGenesList <- list("TP53" = c("BRCA1", "CDK12", "PARP1"),
84 |                         "SON" = c("AURKB", "SFPQ", "DHX9"))
85 | 
86 | res <- correlationAnalyzeR::geneVsGeneListAnalyze(pairedGenesList = pairedGenesList,
87 |                                                   returnDataOnly = TRUE,
88 |                                                   Sample_Type = "normal",
89 |                                                   Tissue = "brain")
90 | 
91 | geneset <- list("BRCA1" = "KEGG_CELL_CYCLE")
92 | res <- correlationAnalyzeR::geneVsGeneListAnalyze(pairedGenesList = geneset)
93 | 
94 | }
95 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/getNormCounts.R:
--------------------------------------------------------------------------------
  1 | library(rhdf5)
  2 | library(DESeq2)
  3 | library(uwot)
  4 | library(Seurat)
  5 | library(jsonlite)
  6 | 
  7 | source("Scripts/helpers.R")
  8 | 
  9 | # Make directory tree
 10 | dir.create("Data", showWarnings = F)
 11 | dir.create("Data/ARCHS4_Download", showWarnings = F)
 12 | 
 13 | # Download files from ARCHS4
 14 | humanExpURL <- "https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5"
 15 | mouseExpURL <- "https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5"
 16 | downList <- c(humanExpURL, mouseExpURL)
 17 | for (i in 1:length(downList)) {
 18 |   file <- downList[i]
 19 |   outFile <- file.path("Data/ARCHS4_Download", gsub(file, pattern = ".+/([a-zA-Z0-9_]+\\.h5)$",
 20 |                                                     replacement = "\\1"))
 21 |   if (! file.exists(outFile)) {
 22 |     download.file(file, destfile = outFile)
 23 |   }
 24 | }
 25 | 
 26 | fileList <- paste0("Data/ARCHS4_Download/",
 27 |                    gsub(downList[c(1)],
 28 |                         pattern = "https://s3.amazonaws.com/mssm-seq-matrix/",
 29 |                         replacement = ""))
 30 | 
 31 | cat("\n", timestamp2(), " Loading & filtering expression data...\n", sep = "")
 32 | if (! file.exists("Data/fullRawCountsFiltered.rda")) {
 33 |   # Load and filter expression data
 34 |   expression <- h5read(dataFile, "data/expression")
 35 |   samples <- h5read(dataFile, "meta/Sample_geo_accession")
 36 |   genes <- h5read(dataFile, name = "meta/genes")
 37 |   rownames(expression) <- genes
 38 |   colnames(expression) <- samples
 39 |   colDataFinal <- colDataFinal[colDataFinal$samples %in% colnames(expression),]
 40 |   expression <- expression[, which(colnames(expression) %in% colDataFinal$samples)]
 41 |   colDataFinal <- colDataFinal[order(match(colDataFinal$samples, colnames(expression))),]
 42 |   if (! all(colDataFinal$samples == colnames(expression))) {
 43 |     stop("ColData samples are not identical to colnames of expression data...",
 44 |          " Please email Code/generateFigures_logFile.txt to author if you find this error and/or submit issue on github.")
 45 |   }
 46 |   # # Keep genes expressed in 10%+ of samples
 47 |   # nonZeroCount <- apply(expression, 1, nonZeroSamps)
 48 |   # expression <- expression[which(nonZeroCount > (length(colnames(expression)) * .01)),]
 49 |   timestamp()
 50 |   cat("\nDone. Saving expression data...\n")
 51 |   save(expression, file = "Data/fullRawCountsFiltered.rda")
 52 | }
 53 | 
 54 | 
 55 | if (! file.exists("Data/vsd_for_corr.rda")) {
 56 |   load("Data/fullRawCountsFiltered.rda")
 57 |   colDataFinal <- colDataFinal[colDataFinal$samples %in% colnames(expression),]
 58 |   colDataFinal <- colDataFinal[order(match(colDataFinal$samples, colnames(expression))),]
 59 |   all(colnames(expression) == colDataFinal$samples)
 60 |   rownames(colDataFinal) <- colDataFinal$samples
 61 |   # Keep genes expressed in 10%+ of samples
 62 |   nonZeroCount <- apply(expression, 1, nonZeroSamps)
 63 |   keepInd <- which(nonZeroCount > (length(colnames(expression)) * .1))
 64 |   expression <- expression[keepInd,]
 65 | 
 66 |   # Transform and normalize
 67 |   dds <- DESeqDataSetFromMatrix(expression, colData = colDataFinal,
 68 |                                 design = ~1)
 69 |   # from https://support.bioconductor.org/p/62246/#62250
 70 |   inds <- rownames(expression)
 71 |   geoMeansList <- mclapply(inds, FUN = function(ind) {
 72 |     row <- expression[ind,]
 73 |     if (all(row == 0)) {
 74 |       0
 75 |     } else {
 76 |       exp(sum(log(row[row != 0]))/length(row))
 77 |     }
 78 |   }, mc.cores = 25)
 79 |   geoMeans <- unlist(geoMeansList)
 80 |   dds <- estimateSizeFactors(dds, geoMeans=geoMeans)
 81 |   vsd <- vst(dds)
 82 |   timestamp()
 83 |   cat("\nDone. Saving vst data...\n")
 84 |   save(vsd, file = "Data/vsd_for_corr.rda")
 85 | } else {
 86 |   cat("\nVSD found -- loading...\n")
 87 |   load("Data/vsd_for_corr.rda")
 88 | }
 89 | 
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 


--------------------------------------------------------------------------------
/R/generateCorrelations.R:
--------------------------------------------------------------------------------
  1 | #' Generate a correlation matrix from user-supplied data
  2 | #'
  3 | #' @param cts a gene count matrix where rownames are genes and colnames are sample IDs.
  4 | #'
  5 | #' @param transformed Boolean. Indicates whether data is already transformed using
  6 | #' VST or a similar approach. If TRUE, VST transformation with DESeq2 will not be
  7 | #' performed. Default: FALSE
  8 | #'
  9 | #' @param cores Numeric. Number of cores to use for calculating size factors.
 10 | #' NOTE: cores > 1 does not work on Windows. Detault: 1.
 11 | #'
 12 | #' @return Matrix with gene co-expression correlations.
 13 | #'
 14 | #' @details This function performs the same normalization and transformation steps on
 15 | #' a user-supplied dataset that were originally used to generate the data provided
 16 | #' in the pre-calculated databases used by this package. The resulting correlation
 17 | #' matrix can be supplied to analyzeSingleGenes() as an input.
 18 | #' NOTE: the resulting matrix is very large and will take up ~8 GB of memory.
 19 | #'
 20 | #' @examples
 21 | #'
 22 | #' if (! 'airway' in rownames(install.packages())) {
 23 | #'     if (!requireNamespace("BiocManager", quietly = TRUE))
 24 | #'         install.packages("BiocManager")
 25 | #'     BiocManager::install("airway")
 26 | #' }
 27 | #'
 28 | #' if (! 'EnsDb.Hsapiens.v86' in rownames(install.packages())) {
 29 | #'     if (!requireNamespace("BiocManager", quietly = TRUE))
 30 | #'         install.packages("BiocManager")
 31 | #'     BiocManager::install("EnsDb.Hsapiens.v86")
 32 | #' }
 33 | #'
 34 | #' if (! 'dplyr' in rownames(install.packages())) {
 35 | #'     install.packages("dplyr")
 36 | #' }
 37 | #'
 38 | #' data(airway)
 39 | #' cts <- assay(airway)
 40 | #' ens2gene <- ensembldb::select(EnsDb.Hsapiens.v86, keys = rownames(cts),
 41 | #'                               columns = c("SYMBOL"), keytype = "GENEID") %>%
 42 | #'   dplyr::distinct(SYMBOL, .keep_all = TRUE) %>%
 43 | #'   dplyr::inner_join(y = data.frame("GENEID" = rownames(cts)))
 44 | #'
 45 | #' cts <- cts[ens2gene$GENEID,]
 46 | #' rownames(cts) <- ens2gene$SYMBOL
 47 | #'
 48 | #' corrMat <- generateCorrelations(cts)
 49 | #'
 50 | #' @export
 51 | generateCorrelations <- function(cts,
 52 |                                  transformed=FALSE,
 53 |                                  cores=1) {
 54 | 
 55 |   # library(airway)
 56 |   # library(EnsDb.Hsapiens.v86)
 57 |   # library(dplyr)
 58 |   #
 59 |   # data(airway)
 60 |   # cts <- assay(airway)
 61 |   # ens2gene <- ensembldb::select(EnsDb.Hsapiens.v86, keys = rownames(cts),
 62 |   #                               columns = c("SYMBOL"), keytype = "GENEID") %>%
 63 |   #   dplyr::distinct(SYMBOL, .keep_all = TRUE) %>%
 64 |   #   dplyr::inner_join(y = data.frame("GENEID" = rownames(cts)))
 65 |   # cts <- cts[ens2gene$GENEID,]
 66 |   # rownames(cts) <- ens2gene$SYMBOL
 67 |   #
 68 |   # label <- 'my_data'
 69 |   # transformed <- FALSE
 70 |   # cores <- 1
 71 | 
 72 |   if (! transformed) {
 73 |     # Keep genes expressed in 10%+ of samples
 74 |     nonZeroCount <- apply(cts, 1, function(row) {
 75 |       return(sum(row > 0))
 76 |     })
 77 |     keepInd <- which(nonZeroCount > (length(colnames(cts)) * .1))
 78 |     cts <- cts[keepInd,]
 79 | 
 80 |     # Transform and normalize
 81 |     dds <- DESeq2::DESeqDataSetFromMatrix(cts, colData = data.frame(sampleID=colnames(cts)),
 82 |                                           design = ~1)
 83 |     # from https://support.bioconductor.org/p/62246/#62250
 84 |     inds <- rownames(cts)
 85 |     geoMeansList <- parallel::mclapply(inds, FUN = function(ind) {
 86 |       row <- cts[ind,]
 87 |       if (all(row == 0)) {
 88 |         0
 89 |       } else {
 90 |         exp(sum(log(row[row != 0]))/length(row))
 91 |       }
 92 |     }, mc.cores = cores)
 93 |     geoMeans <- unlist(geoMeansList)
 94 |     dds <- DESeq2::estimateSizeFactors(dds, geoMeans=geoMeans)
 95 |     vsd <- DESeq2::vst(dds)
 96 |     cts <- SummarizedExperiment::assay(vsd)
 97 |   }
 98 | 
 99 |   corrMat <- WGCNA::cor(x = t(cts), verbose = 1, nThreads = cores)
100 | 
101 |   return(corrMat)
102 | }
103 | 
104 | 


--------------------------------------------------------------------------------
/misc/FigS6_S7.R:
--------------------------------------------------------------------------------
  1 | ### Additional Figure S5 and S6 based on requests from reviewers ###
  2 | library(tidyverse)
  3 | library(ggpubr)
  4 | library(correlationAnalyzeR)
  5 | 
  6 | # Hallmark geneset
  7 | t2g <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "Hallmark")
  8 | 
  9 | # Get VST for 500 genes in the Hallmark gene set and wrangle
 10 | vstNow <- correlationAnalyzeR::getTissueVST(genesOfInterest = sample(unique(t2g$gene_symbol),
 11 |                                                                      500))
 12 | vstNow2 <- bind_rows(vstNow) %>%
 13 |   inner_join(correlationAnalyzeR::human_coldata, by = "samples") %>%
 14 |   distinct(samples, .keep_all = TRUE)
 15 | save(vstNow2, file = "misc/vstNow2.rda")
 16 | 
 17 | ## Bootstrap Pearson and Spearman ##
 18 | load("misc/vstNow2.rda")
 19 | genes_to_testNow <- colnames(vstNow2)
 20 | genes_to_testNow <- genes_to_testNow[which(genes_to_testNow %in% t2g$gene_symbol)]  # keep only columns that are Hallmark genes
 21 | 
 22 | # Bootstrap 5000 simulations: each draw picks one gene set, then two of its genes present in vstNow2
 23 | res <- lapply(seq(5000), function(x) {
 24 |   gs <- sample(unique(t2g$gs_name), 1)
 25 |   genes <- sample(t2g$gene_symbol[t2g$gs_name == gs &
 26 |                                     t2g$gene_symbol %in% genes_to_testNow], 2)
 27 |   # cor(x = vstNow2[,genes[1]], y = vstNow2[,genes[2]], method = "kendall")
 28 |   pr <- cor(x = vstNow2[,genes[1]], y = vstNow2[,genes[2]], method = "pearson")
 29 |   sr <- cor(x = vstNow2[,genes[1]], y = vstNow2[,genes[2]], method = "spearman")
 30 |   return(data.frame(pr, sr, gs, gene1=genes[1], gene2=genes[2]))  # one result row per draw
 31 | })
 32 | # NOTE(review): sample(..., 2) errors when a gene set has fewer than two genes available in vstNow2.
 33 | # Find difference between Pearson and Spearman AND remove duplicate gene pairs
 34 | dd <- bind_rows(res) %>%
 35 |   mutate(diff = pr - sr) %>%
 36 |   distinct(diff, .keep_all = TRUE)  # repeated draws are dropped by matching on identical `diff` values
 37 | save(dd, file = "misc/simulated_spearman_vs_pearson.rda")
 38 | load("misc/simulated_spearman_vs_pearson.rda")
 39 | 
 40 | ## Generate Figures ##
 41 | 
 42 | # Fig S5 -- scatter of Pearson vs Spearman per simulated pair, with the identity line
 43 | g1 <- ggplot(transmute(dd, Pearson = pr, Spearman = sr),
 44 |              aes(x = Pearson, y = Spearman)) +
 45 |   geom_point() +
 46 |   geom_abline(slope = 1, intercept = 0,
 47 |               color = "firebrick", size = 1.5) +
 48 |   xlim(-.75, 1) +
 49 |   ylim(-.75, 1) +
 50 |   theme_bw(base_size = 16) +
 51 |   labs(subtitle = "2,832 Simulations via bootstrapping",
 52 |        title = "Correlation Methods in Hallmark Collection")
 53 | # Boxplot of both coefficients, annotated with a one-sided ("greater") t-test
 54 | g2 <- dd %>%
 55 |   pivot_longer(cols = c(pr, sr)) %>%
 56 |   mutate(name = if_else(name == "pr", "Pearson", "Spearman")) %>%
 57 |   ggplot(aes(x = name, y = value, fill = name)) +
 58 |   geom_boxplot() +
 59 |   ggpubr::stat_compare_means(comparisons = list(c("Pearson", "Spearman")),
 60 |                              method = "t.test",
 61 |                              method.args = list("alternative" = "greater"),
 62 |                              label = "p.signif") +
 63 |   theme_bw(base_size = 16) +
 64 |   ggpubr::rremove("legend") +
 65 |   labs(title = "Correlation Methods in Hallmark Collection",
 66 |        subtitle = "2,832 Simulations via bootstrapping",
 67 |        y = "Correlation Coefficient",
 68 |        x = NULL)
 69 | 
 70 | 
 71 | # Combine both panels under a shared title and write PDF + PNG copies. ggsave() is not a
 72 | # plot layer: `plot + ggsave(...)` errors in ggplot2 >= 3.3.4, so the plot is passed explicitly.
 73 | ggg <- ggarrange(g1, g2, align = "hv")
 74 | ggg <- annotate_figure(ggg, top = text_grob("Correlation comparison using 'Hallmark' collection\n(2,832 Simulations)",
 75 |                                             face = "bold", size = 18))
 76 | ggsave(filename = "../Manuscript/Assets/Correlation_Methods_Compare_comb.pdf",
 77 |        plot = ggg, height = 5, width = 10)
 78 | ggsave(filename = "../Manuscript/Assets/Correlation_Methods_Compare_comb.png",
 79 |        plot = ggg, height = 5, width = 10)
 80 | # Fig S6
 81 | # Two example gene pairs, each shown with its `corrPlot_tissue` plot. The figure is saved with
 82 | # an explicit `plot =` because `plot + ggsave(...)` errors in ggplot2 >= 3.3.4.
 83 | ex1 <- analyzeGenePairs(
 84 |   genesOfInterest = c("MYL3", "TCAP"), runGSEA = FALSE
 85 | )
 86 | g11 <- ex1$compared$VST_corrPlot$corrPlot_tissue +
 87 |   labs(subtitle = NULL, title = "Top Pearson-Specific Correlation")
 88 | ex2 <- analyzeGenePairs(
 89 |   genesOfInterest = c("TFF2", "IL12B"), runGSEA = FALSE
 90 | )
 91 | g22 <- ex2$compared$VST_corrPlot$corrPlot_tissue +
 92 |   labs(subtitle = NULL, title = "Top Spearman-Specific Correlation")
 93 | ggg2 <- ggarrange(g11, g22, align = "hv", common.legend = TRUE, legend = "bottom")
 94 | # As in the original, the PDF is written without width/height; only the PNG sets them.
 95 | ggsave(filename = "../Manuscript/Assets/Correlation_Methods_Compare_example_VST.pdf",
 96 |        plot = ggg2)
 97 | ggsave(filename = "../Manuscript/Assets/Correlation_Methods_Compare_example_VST.png",
 98 |        plot = ggg2, height = 7, width = 12)
 99 | 
100 | 


--------------------------------------------------------------------------------
/man/analyzeGenePairs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/analyzeGenePairs.R
 3 | \name{analyzeGenePairs}
 4 | \alias{analyzeGenePairs}
 5 | \title{Analyze Gene Pairs}
 6 | \usage{
 7 | analyzeGenePairs(
 8 |   genesOfInterest,
 9 |   Sample_Type = c("normal", "normal"),
10 |   Tissue = c("all", "all"),
11 |   GSEA_Type = c("simple"),
12 |   outputPrefix = "CorrelationAnalyzeR_Output_Paired",
13 |   crossCompareMode = FALSE,
14 |   runGSEA = TRUE,
15 |   TERM2GENE = NULL,
16 |   nperm = 2000,
17 |   sampler = FALSE,
18 |   topPlots = FALSE,
19 |   returnDataOnly = TRUE,
20 |   pool = NULL,
21 |   makePool = FALSE
22 | )
23 | }
24 | \arguments{
25 | \item{genesOfInterest}{A length-two vector with genes to compare.}
26 | 
27 | \item{Sample_Type}{A length-two vector of sample types corresponding to genesOfInterest.
28 | Choose "all", "normal", or "cancer". Default: c("normal", "normal")}
29 | 
30 | \item{Tissue}{A length-two vector of tissue types corresponding to genesOfInterest.
31 | Run getTissueTypes() to see available list. Default: c("all", "all")}
32 | 
33 | \item{GSEA_Type}{Character vector listing the gene set databases to use.
34 | Options are listed in correlationAnalyzeR::pathwayCategories --
35 | See details of ?getTERM2GENE for more info.}
36 | 
37 | \item{outputPrefix}{Prefix for saved files -- the directory name to store output files in.
38 | This is ignored unless returnDataOnly is FALSE.}
39 | 
40 | \item{crossCompareMode}{Use this mode to generate comparisons
41 | across all tissue and disease types. If both genes for genesOfInterest are the
42 | same -- will compare normal vs cancer for that gene in each available tissue. Else, will
43 | perform comparison of two different genes in all tissue-disease groups.
44 | Will only consider user input for returnDataOnly, outputPrefix, and genesOfInterest.}
45 | 
46 | \item{runGSEA}{If TRUE will run GSEA using gene correlation values. Default: TRUE.}
47 | 
48 | \item{TERM2GENE}{Mapping of geneset IDs to gene names. If not supplied, it will be
49 | generated automatically. Only applicable if GSEA is to be run. TERM2GENE objects
50 | can be generated manually using the getTERM2GENE() function.}
51 | 
52 | \item{nperm}{Number of permutations to run in GSEA. Default is 2000}
53 | 
54 | \item{sampler}{Logical. If TRUE, will only return 100,000 random genesets from either
55 | simple or complex TERM2GENEs. Useful for reducing GSEA computational burden.}
56 | 
57 | \item{topPlots}{Logical. If TRUE, myGSEA() will build gsea plots for top correlated genesets.
58 | Default: FALSE.}
59 | 
60 | \item{returnDataOnly}{if TRUE will return result list object
61 | and will not generate any folders or files. Default: TRUE.}
62 | 
63 | \item{pool}{An object created by pool::dbPool for accessing the SQL database.
64 | It will be created if not supplied as long as makePool is TRUE.}
65 | 
66 | \item{makePool}{Logical. Should a pool be created if one is not supplied? Default: FALSE.}
67 | }
68 | \value{
69 | A named list containing visualizations and correlation data from paired analysis.
70 | }
71 | \description{
72 | Compares correlations and corGSEA results for two genes of interest.
73 | }
74 | \details{
75 | Comparison of two genes of interest using correlation values.
76 | This can be 2 different genes in the same tissue or sample type or
77 | the same gene across two sample or tissue types. Alternatively, specify 'crossCompareMode'
78 | to view compared correlations across all available tissue types.
79 | Please view the vignette for more detail about this function,
80 | including the structure of the output data list.
81 | }
82 | \examples{
83 | genesOfInterest <- c("ATM", "SLC7A11")
84 | res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
85 |                               GSEA_Type = "simple", returnDataOnly = TRUE,
86 |                               Sample_Type = c("normal", "normal"),
87 |                               Tissue = c("brain", "brain"))
88 | genesOfInterest <- c("BRCA1", "BRCA1")
89 | res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
90 |                               GSEA_Type = "simple", returnDataOnly = TRUE,
91 |                               Sample_Type = c("normal", "cancer"),
92 |                               Tissue = c("respiratory", "respiratory"))
93 | genesOfInterest <- c("NFKB1", "SOX10")
94 | res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
95 |                               returnDataOnly = TRUE,
96 |                               crossCompareMode = TRUE)
97 | }
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## correlationAnalyzeR
 2 | [![releaseVersion](https://img.shields.io/badge/Version-1.0.0-blue.svg)](https://github.com/millerh1/correlationAnalyzeR)
 3 | <!---
 4 | [![Build Status](https://travis-ci.com/Bishop-Laboratory/correlationAnalyzeR.svg?branch=master)](https://travis-ci.com/Bishop-Laboratory/correlationAnalyzeR)
 5 | -->
 6 | 
 7 | The correlationAnalyzeR package uses correlation data derived from the [ARCHS4](https://amp.pharm.mssm.edu/archs4/index.html) RNA Seq repository to generate biological insights about a gene or genes of interest. The web implementation can be found [here](http://gccri.bishop-lab.uthscsa.edu/correlation-analyzer).
 8 | 
 9 | ## Quickstart
10 | To get started with `correlationAnalyzeR`, please view the vignette for this package [here](https://misc-items-to-share.s3-us-west-2.amazonaws.com/correlationAnalyzeR.html).
11 | 
12 | ### Installation
13 | Install correlationAnalyzeR using `install_github()` from the `devtools` package.
14 | 
15 | ``` r
16 | ## install.packages("devtools")
17 | devtools::install_github("Bishop-Laboratory/correlationAnalyzeR")
18 | ```
19 | 
20 | ## Motivation
21 | This project is motivated by a recurring issue that arises during exploratory bioinformatics: *Sometimes little to no information exists about a gene or gene(s) of interest.* 
22 | One way to address this problem is to compute gene expression correlations. These values indicate how genes vary in relation to each other.
23 | 
24 | With gene correlation data, it is possible to implement three levels of analysis:
25 | - *Single gene*: Analyses such as [GSEA](http://software.broadinstitute.org/gsea/index.jsp) (Gene Set Enrichment Analysis) which predict
26 | the biological pathways correlated with a gene of interest.
27 | - *Paired gene set*: Statistical approaches to determine if a gene set is significantly correlated with a gene of interest.
28 | - *Gene set topology*: Methods to uncover the topology of a gene set by using gene correlation data as the input for tools such as UpSet 
29 | and hierarchical clustering. 
30 | 
31 | ## Contributing [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/millerh1/correlationAnalyzeR/issues)
32 | Please report issues and feel free to contribute! I hope this project can some day incorporate other sources of data such as ppi networks
33 | and continue to develop as a useful tool for anyone who wants to understand more about their HTS data.
34 | 
35 | ## Software/tools cited
36 | 
37 | - Bairoch,A. (2018) The cellosaurus, a cell-line knowledge resource. J. Biomol. Tech.
38 | - Bengtsson,H. (2019) matrixStats: Functions that Apply to Rows and Columns of Matrices (and to Vectors).
39 | - Canty,A. and Ripley,B. (2019) boot: Bootstrap Functions (Originally by Angelo Canty for S).
40 | - Chang,W. et al. (2019) shiny: Web Application Framework for R.
41 | - Dolgalev,I. (2018) msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format.
42 | - Galili et al. (2017) heatmaply: an R package for creating interactive cluster heatmaps for online publishing. Bioinformatics.
43 | - Kassambara,A. (2019) ggpubr: ‘ggplot2’ Based Publication Ready Plots.
44 | - Kolde,R. (2019) pheatmap: Pretty Heatmaps.
45 | - Krijthe,J. and Maaten,L. van der (2018) Rtsne: T-Distributed Stochastic Neighbor Embedding using a Barnes-Hut Implementation.
46 | - Lachmann,A. et al. (2018) Massive mining of publicly available RNA-seq data from human and mouse. Nat. Commun., 9.
47 | - Langfelder,P. and Horvath,S. (2008) WGCNA: An R package for weighted correlation network analysis. BMC Bioinformatics.
48 | - Liberzon,A. et al. (2011) Molecular signatures database (MSigDB) 3.0. Bioinformatics.
49 | - Liberzon,A. et al. (2015) The Molecular Signatures Database Hallmark Gene Set Collection. Cell Syst.
50 | - Love,M.I. et al. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol.
51 | - R Core Team (2019) R: A Language and Environment for Statistical Computing.
52 | - Sergushichev,A.A. (2016) An algorithm for fast preranked gene set enrichment analysis using cumulative statistic calculation. bioRxiv.
53 | - Sievert,C. (2018) plotly for R.
54 | - Subramanian,A. et al. (2005) Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles. Proc. Natl. Acad. Sci. U. S. A.
55 | - Wickham,H. (2016) ggplot2: Elegant Graphics for Data Analysis Springer-Verlag New York.
56 | - Xie,Y. et al. (2019) DT: A Wrapper of the JavaScript Library ‘DataTables’.
57 | - Yu,G. et al. (2012) ClusterProfiler: An R package for comparing biological themes among gene clusters. Omi. A J. Integr. Biol.
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/misc/FigS2.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(ggpubr)
  3 | library(correlationAnalyzeR)
  4 | res <- correlationAnalyzeR::analyzeGenePairs(  # Panel 1: BRCA1 vs BRCA2 across all samples, GSEA skipped
  5 |   genesOfInterest = c("BRCA1", "BRCA2"),
  6 |   Sample_Type = "all",  # NOTE(review): the Rd describes Sample_Type as a length-two vector -- confirm a single value is recycled
  7 |   runGSEA = FALSE
  8 | )
  9 | g1 <- res$compared$VST_corrPlot$corrPlot_disease +
 10 |   labs(title = "All samples")
 11 | # Panel 2: same gene pair, normal samples only
 12 | resNorm <- correlationAnalyzeR::analyzeGenePairs(
 13 |   genesOfInterest = c("BRCA1", "BRCA2"),
 14 |   Sample_Type = "normal",
 15 |   runGSEA = FALSE
 16 | )
 17 | g2 <- resNorm$compared$VST_corrPlot$corrPlot_disease +
 18 |   labs(title = "Normal samples")
 19 | # Panel 3: same gene pair, cancer samples only
 20 | resCancer <- correlationAnalyzeR::analyzeGenePairs(
 21 |   genesOfInterest = c("BRCA1", "BRCA2"),
 22 |   Sample_Type = "cancer",
 23 |   runGSEA = FALSE
 24 | )
 25 | g3 <- resCancer$compared$VST_corrPlot$corrPlot_disease +
 26 |   labs(title = "Cancer samples")
 27 | 
 28 | 
 29 | # Save the three panels side by side. ggsave() is not a plot layer and `plot + ggsave(...)`
 30 | # errors in ggplot2 >= 3.3.4, so the arranged figure is passed through `plot =` instead.
 31 | ggsave(filename = "../Manuscript/FinalAssets/FigureS2_raw.png",
 32 |        plot = ggarrange(g1, g2, g3, nrow = 1, align = "hv"),
 33 |        height = 5, width = 18)
 34 | cd <- correlationAnalyzeR::human_coldata  # package dataset, exported to CSV below
 35 | write_csv(cd, file = "misc/colData.csv")
 36 | ### IL1B; IL1RN
 37 | # Paired analysis without GSEA; the last statement prints the `corrPlot_tissue` plot.
 38 | resRev <- analyzeGenePairs(genesOfInterest = c("IL1B", "IL1RN"),
 39 |                            runGSEA = FALSE)  # FALSE rather than F: `F` is an ordinary, reassignable variable
 40 | resRev$compared$VST_corrPlot$corrPlot_tissue
 41 | 
 42 | 
 43 | 
 44 | g1 <- res$compared$VST_corrPlot$corrPlot_disease +  # NOTE(review): `res` is still the BRCA1/BRCA2 result assigned earlier -- confirm intended
 45 |   labs(title = "All samples")
 46 | # BRCA1 vs NQO1 in normal samples (overwrites the earlier resNorm and g2)
 47 | resNorm <- correlationAnalyzeR::analyzeGenePairs(
 48 |   genesOfInterest = c("BRCA1", "NQO1"),
 49 |   Sample_Type = "normal",
 50 |   runGSEA = FALSE
 51 | )
 52 | g2 <- resNorm$compared$VST_corrPlot$corrPlot_disease +
 53 |   labs(title = "Normal samples")
 54 | # BRCA1 vs NQO1 in cancer samples (overwrites the earlier resCancer and g3)
 55 | resCancer <- correlationAnalyzeR::analyzeGenePairs(
 56 |   genesOfInterest = c("BRCA1", "NQO1"),
 57 |   Sample_Type = "cancer",
 58 |   runGSEA = FALSE
 59 | )
 60 | g3 <- resCancer$compared$VST_corrPlot$corrPlot_disease +
 61 |   labs(title = "Cancer samples")
 62 | 
 63 | 
 64 | # Inputs for the highlighted BRCA1 vs NQO1 scatter built from `resNorm`.
 65 | geneOne <- "BRCA1"
 66 | geneTwo <- "NQO1"
 67 | titleStr <- "Normal samples"
 68 | # NOTE(review): Rval/Padj are read from `res` (not `resNorm`) and are not used below -- confirm.
 69 | Rval <- res$compared$VST_corrPlot$Rval
 70 | Padj <- res$compared$VST_corrPlot$Padj
 71 | 
 72 | # Tag every Group as "Correlated", "Anticorrelated" or "Other", then sort so that "Other"
 73 | # rows come first (presumably so the highlighted points are drawn last, on top).
 74 | # Groups the original kept commented out of the "Correlated" set (all "<name> - Normal"):
 75 | # Mammary, Respiratory, Thyroid, Esophagus, Female Reproductive, Kidney,
 76 | # Cartilage, Adispoe, Muscle, Brain, Retina, Endothelial.
 77 | resNorm$compared$VST_corrPlot$corrPlot_VST_data %>%
 78 |   mutate(
 79 |     Condition = case_when(
 80 |       Group %in% c("Pancreas - Normal", "Prostate - Normal",
 81 |                    "Skin - Normal", "Liver - Normal") ~ "Correlated",
 82 |       Group %in% c("Prenatal - Normal", "Male Reproductive - Normal",
 83 |                    "Stem Like - Normal", "Bone - Normal") ~ "Anticorrelated",
 84 |       TRUE ~ "Other"
 85 |     ),
 86 |     Condition = factor(Condition,
 87 |                        levels = c("Correlated", "Anticorrelated", "Other"))
 88 |   ) %>%
 89 |   arrange(desc(Condition)) %>%
 90 |   ggplot2::ggplot(ggplot2::aes_string(x = geneOne, y = geneTwo,
 91 |                                       group = "Group", text = "samples",
 92 |                                       color = "Condition")) +
 93 |   ggplot2::geom_point(alpha = .8) +
 94 |   ggplot2::theme_bw(base_size = 16) +
 95 |   ggplot2::labs(title = titleStr,
 96 |                 x = paste0(geneOne, " Expression (VST)"),
 97 |                 y = paste0(geneTwo, " Expression (VST)"))
 98 | 


--------------------------------------------------------------------------------
/man/analyzeSingleGenes.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/analyzeSingleGenes.R
  3 | \name{analyzeSingleGenes}
  4 | \alias{analyzeSingleGenes}
  5 | \title{Analyze Single Genes}
  6 | \usage{
  7 | analyzeSingleGenes(
  8 |   genesOfInterest,
  9 |   GSEA_Type = c("simple"),
 10 |   Sample_Type = "normal",
 11 |   Tissue = "all",
 12 |   crossCompareMode = FALSE,
 13 |   nperm = 2000,
 14 |   TERM2GENE = NULL,
 15 |   whichCompareGroups = c("all", "normal", "cancer"),
 16 |   outputPrefix = "CorrelationAnalyzeR_Output",
 17 |   sampler = FALSE,
 18 |   runGSEA = TRUE,
 19 |   topPlots = TRUE,
 20 |   returnDataOnly = TRUE,
 21 |   pool = NULL,
 22 |   corrMat = NULL,
 23 |   corrMat_label = "User-Supplied",
 24 |   makePool = FALSE
 25 | )
 26 | }
 27 | \arguments{
 28 | \item{genesOfInterest}{A vector of genes to analyze.}
 29 | 
 30 | \item{GSEA_Type}{Character vector listing the gene set databases to use.
 31 | Options are listed in correlationAnalyzeR::pathwayCategories --
 32 | See details of ?getTERM2GENE for more info.}
 33 | 
 34 | \item{Sample_Type}{Type of RNA Seq samples used to create correlation data.
 35 | Either "all", "normal", or "cancer". Can be a single value for all genes,
 36 | or a vector corresponding to genesOfInterest. Default: "normal"}
 37 | 
 38 | \item{Tissue}{Which tissue type should gene correlations be derived from?
 39 | Can be a single value for all genes, or a vector corresponding to genesOfInterest.
 40 | Run getTissueTypes() to see available tissues. Default: "all"}
 41 | 
 42 | \item{crossCompareMode}{Instead of normal single gene analysis, analyzeSingleGenes() will generate
 43 | correlations for a single gene across all tissue-disease groups. GSEA will not be run.
 44 | analyzeSingleGenes() will only consider user input for returnDataOnly, whichCompareGroups,
 45 | outputPrefix, and genesOfInterest.}
 46 | 
 47 | \item{nperm}{Number of permutations to run in GSEA. Default: 2000}
 48 | 
 49 | \item{TERM2GENE}{Mapping of geneset IDs to gene names. If not supplied, it will be
 50 | generated automatically. Only applicable if GSEA is to be run. TERM2GENE objects
 51 | can be generated manually using the getTERM2GENE() function.}
 52 | 
 53 | \item{whichCompareGroups}{For crossCompareMode, select "all", "normal", or "cancer"
 54 | to analyze correlations from the corresponding groups. Default: "all".}
 55 | 
 56 | \item{outputPrefix}{Prefix for saved files -- the directory name to store output files in.
 57 | This is ignored unless returnDataOnly is FALSE. Default: "CorrelationAnalyzeR_Output"}
 58 | 
 59 | \item{sampler}{Logical. If TRUE, will only return 100,000 random genesets from either
 60 | simple or complex TERM2GENEs. Useful for reducing GSEA computational burden.}
 61 | 
 62 | \item{runGSEA}{If TRUE will run GSEA using gene correlation values. Default: TRUE.}
 63 | 
 64 | \item{topPlots}{Logical. If TRUE, myGSEA() will build gsea plots for top correlated genesets.
 65 | Default: TRUE.}
 66 | 
 67 | \item{returnDataOnly}{if TRUE will return result list object
 68 | and will not generate any folders or files. Default: TRUE.}
 69 | 
 70 | \item{pool}{An object created by pool::dbPool for accessing the SQL database.
 71 | It will be created if not supplied as long as makePool is TRUE.}
 72 | 
 73 | \item{corrMat}{A custom correlation matrix generated by generateCorrelations()
 74 | to use instead of pre-supplied databases. If supplied, "Tissue" and "Sample_Type" are ignored.}
 75 | 
 76 | \item{corrMat_label}{If corrMat is provided, this label will be used for plotting. Default: "User-Supplied".}
 77 | 
 78 | \item{makePool}{Logical. Should a database pool be created if one is not supplied? Default: FALSE.}
 79 | }
 80 | \value{
 81 | A named list of correlation values, corGSEA results,
 82 | and visualizations for each gene of interest.
 83 | }
 84 | \description{
 85 | Obtains correlations and corGSEA results for each gene of interest.
 86 | }
 87 | \details{
 88 | analyzeSingleGenes() performs most of the core tasks for analyzing gene function
 89 | via co-expression correlations. Please view the vignette for more detail about this function,
 90 | including the structure of the output data list.
 91 | }
 92 | \examples{
 93 | genesOfInterest <- c("ATM", "SLC7A11")
 94 | res <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = genesOfInterest,
 95 |                               returnDataOnly = TRUE,
 96 |                               GSEA_Type = "simple",
 97 |                               Sample_Type = c("normal", "cancer"),
 98 |                               Tissue = c("respiratory", "pancreas"))
 99 | 
100 | genesOfInterest <- c("BRCA1")
101 | res <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = genesOfInterest,
102 |                               GSEA_Type = "simple", returnDataOnly = TRUE,
103 |                               crossCompareMode = TRUE,
104 |                               whichCompareGroups = "normal")
105 | 
106 | }
107 | 


--------------------------------------------------------------------------------
/tests/Test.R:
--------------------------------------------------------------------------------
  1 | # library(correlationAnalyzeR)
  2 | # res <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = "SRY",
  3 | #                                                whichCompareGroups = "normal",
  4 | #                                          crossCompareMode = TRUE)
  5 | 
  6 | 
  7 | #
  8 | # res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = c("BRCA1", "BRCA1"), crossCompareMode = TRUE)
  9 | #
 10 | # res <- correlationAnalyzeR::geneVsGeneListAnalyze(pairedGenesList = list("BRCA1" = c("ATM", "EZH2", "STAG2")))
 11 | #
 12 | # res <- correlationAnalyzeR::analyzeGenesetTopology(genesOfInterest = c("ATM", "SETX", "CDKN1A", "CDKN2A", "FUS"))
 13 | #
 14 | #
 15 | #
 16 | # correlationAnalyzeR::getAvailableGenes()
 17 | #
 18 | # genesOfInterest <- c("CDK12", "AURKB", "SFPQ", "NFKB1", "BRCC3", "BRCA2", "PARP1",
 19 | #                      "DHX9", "SON", "AURKA", "SETX", "BRCA1", "ATMIN")
 20 | # res <- correlationAnalyzeR::analyzeGenesetTopology(genesOfInterest = genesOfInterest,
 21 | #                                             Sample_Type = "cancer", returnDataOnly = TRUE,
 22 | #                                             Tissue = "brain")
 23 | #
 24 | # res <- correlationAnalyzeR::getTissueVST(genesOfInterest = c("BRCA1", "ATM"),
 25 | #                     Tissues = c("brain", "respiratory"),
 26 | #                     Sample_Type = "all",
 27 | #                     useBlackList = TRUE)
 28 | #
 29 | # genesOfInterest <- c("ATM", "SLC7A11")
 30 | # res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
 31 | #                                       GSEA_Type = "simple", returnDataOnly = TRUE,
 32 | #                                       Sample_Type = c("normal", "normal"),
 33 | #                                       Tissue = c("brain", "brain"))
 34 | # genesOfInterest <- c("BRCA1", "BRCA1")
 35 | # res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
 36 | #                                       GSEA_Type = "simple", returnDataOnly = TRUE,
 37 | #                                       Sample_Type = c("normal", "cancer"),
 38 | #                                       Tissue = c("respiratory", "respiratory"))
 39 | # genesOfInterest <- c("NFKB1", "SOX10")
 40 | # res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
 41 | #                                       returnDataOnly = TRUE,
 42 | #                                       crossCompareMode = TRUE)
 43 | # genesOfInterest <- c("SOX10", "SOX10")
 44 | # res <- correlationAnalyzeR::analyzeGenePairs(genesOfInterest = genesOfInterest,
 45 | #                                              returnDataOnly = TRUE,
 46 | #                                              crossCompareMode = TRUE)
 47 | #
 48 | #
 49 | #
 50 | # pairedGenesList <- list("TP53" = c("BRCA1", "CDK12", "PARP1"),
 51 | #                         "SON" = c("AURKB", "SFPQ", "DHX9"))
 52 | #
 53 | # res <- correlationAnalyzeR::geneVsGeneListAnalyze(pairedGenesList = pairedGenesList,
 54 | #                                            returnDataOnly = TRUE,
 55 | #                                            Sample_Type = "normal",
 56 | #                                            Tissue = "brain")
 57 | #
 58 | #
 59 | # genesOfInterest <- c("ATM", "SLC7A11")
 60 | # res <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = genesOfInterest,
 61 | #                                         returnDataOnly = TRUE,
 62 | #                                         GSEA_Type = "simple",
 63 | #                                         Sample_Type = c("normal", "cancer"),
 64 | #                                         Tissue = c("respiratory", "pancreas"))
 65 | #
 66 | # genesOfInterest <- c("BRCA1")
 67 | # res <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = genesOfInterest,
 68 | #                                         GSEA_Type = "simple", returnDataOnly = TRUE,
 69 | #                                         crossCompareMode = TRUE,
 70 | #                                         whichCompareGroups = "normal")
 71 | #
 72 | #
 73 | # res <- correlationAnalyzeR::getCorrelationData(Sample_Type = "normal",
 74 | #                                         Tissue = "kidney",
 75 | #                                         geneList = c("ATM", "BRCA1"))
 76 | #
 77 | #
 78 | #
 79 | #
 80 | # res <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple")
 81 | # res <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = c("Hallmark", "KEGG"))
 82 | #
 83 | #
 84 | #
 85 | # correlationAnalyzeR::getTissueTypes()
 86 | #
 87 | #
 88 | #
 89 | # corrDF <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = c("BRCA1"),
 90 | #                                                   returnDataOnly = TRUE, runGSEA = FALSE, Sample_Type = "normal")
 91 | # ranks <- corrDF$correlations[,1]
 92 | # names(ranks) <- rownames(corrDF$correlations)
 93 | # TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple",
 94 | #                                                Species = "hsapiens")
 95 | # res <- correlationAnalyzeR::myGSEA(ranks = ranks,
 96 | #                             TERM2GENE = TERM2GENE,
 97 | #                             plotFile = "GSEA_out", outDir = getwd(),
 98 | #                             topPlots = FALSE, returnDataOnly=TRUE, Condition = "GSEA Results")
 99 | #
100 | #
101 | #
102 | #
103 | #
104 | #
105 | 


--------------------------------------------------------------------------------
/man/analyzeGenesetTopology.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/analyzeGeneListTopology.R
  3 | \name{analyzeGenesetTopology}
  4 | \alias{analyzeGenesetTopology}
  5 | \title{Analyze Gene List Topology}
  6 | \usage{
  7 | analyzeGenesetTopology(
  8 |   genesOfInterest,
  9 |   Sample_Type = "normal",
 10 |   Tissue = "all",
 11 |   crossComparisonType = c("PCA", "variantGenes", "coCorrelativeGenes", "pathwayEnrich"),
 12 |   pathwayType = c("simple"),
 13 |   setComparisonCutoff = "Auto",
 14 |   pathwayEnrichment = FALSE,
 15 |   pValueCutoff = 0.05,
 16 |   numTopGenesToPlot = "Auto",
 17 |   alternativeTSNE = TRUE,
 18 |   numClusters = "Auto",
 19 |   outputPrefix = "CorrelationAnalyzeR_Output",
 20 |   returnDataOnly = TRUE,
 21 |   pool = NULL,
 22 |   makePool = FALSE
 23 | )
 24 | }
 25 | \arguments{
 26 | \item{genesOfInterest}{A vector of genes to analyze or the name of an official MSIGDB term.}
 27 | 
 28 | \item{Sample_Type}{Type of RNA Seq samples used to create correlation data.
 29 | Either "all", "normal", or "cancer". Can be a single value for all genes,
 30 | or a vector corresponding to genesOfInterest. Default: "normal"}
 31 | 
 32 | \item{Tissue}{Which tissue type should gene correlations be derived from?
 33 | Can be a single value for all genes, or a vector corresponding to genesOfInterest.
 34 | Run getTissueTypes() to see available tissues. Default: "all"}
 35 | 
 36 | \item{crossComparisonType}{The type of topology tests to run. (see details).
 37 | Default: c("PCA", "variantGenes", "coCorrelativeGenes", "pathwayEnrich")}
 38 | 
 39 | \item{pathwayType}{Which pathway annotations should be considered? Options are listed in
 40 | correlationAnalyzeR::MSIGDB_Geneset_Names -- see details of ?getTERM2GENE for more info.
 41 | Default: "simple".}
 42 | 
 43 | \item{setComparisonCutoff}{Only relevant for co-correlation analysis -- the number of genes which
 44 | must agree for a gene to be considered co-correlative within the input gene list.
 45 | Default: "Auto"}
 46 | 
 47 | \item{pathwayEnrichment}{Logical. If TRUE, pathway enrichment will be performed on variant genes --
 48 | if 'variantGenes' selected -- and/or on co-correlative genes -- if "coCorrelativeGenes" selected.
 49 | Default: FALSE.}
 50 | 
 51 | \item{pValueCutoff}{Numeric. The p value cutoff applied when running all pathway enrichment tests.
 52 | Default: .05.}
 53 | 
 54 | \item{numTopGenesToPlot}{When creating a heatmap of the top co-correlative or top variant genes,
 55 | how many genes should be plotted on the y axis? Default: "Auto"}
 56 | 
 57 | \item{alternativeTSNE}{Logical. If TRUE, then a TSNE will be run as an alternative to PCA for visualizing
 58 | large input gene lists. This is highly recommended as 100+ member gene lists cannot be visualized otherwise.
 59 | Default: TRUE.}
 60 | 
 61 | \item{numClusters}{The number of clusters to create with hclust or TSNE analysis.}
 62 | 
 63 | \item{outputPrefix}{Prefix for saved files. Should include directory info.
 64 | Ignored if returnDataOnly = TRUE. Default: "CorrelationAnalyzeR_Output"}
 65 | 
 66 | \item{returnDataOnly}{if TRUE will return only a list of analysis results. Default: TRUE}
 67 | 
 68 | \item{pool}{an object created by pool::dbPool for accessing the SQL database.
 69 | It will be created if not supplied.}
 70 | 
 71 | \item{makePool}{Logical. Should a pool be created if one is not supplied? Default: FALSE.}
 72 | }
 73 | \value{
 74 | A list of correlations for input genes, and the results of chosen analysis + visualizations.
 75 | }
 76 | \description{
 77 | Analyzes the topology of a gene list using gene correlation data and dimension-reduction techniques.
 78 | }
 79 | \details{
 80 | analyzeGenesetTopology() uses the matrix of co-expression correlations to perform
 81 | dimensionality reduction, clustering, and it also performs pathway enrichment. See the
 82 | vignette for usage examples and information about the output format.
 83 | 
 84 | Cross Comparison Types:
 85 | - variantGenes: These are the genes which best explain variation between genes within the input list.
 86 |                 These genes can divide a list into functional groups.
 87 | - coCorrelativeGenes: These are the genes which best explain similarities between all genes in the input list.
 88 |                       These genes can explain what biological processes unify the input genes.
 89 | - PCA: This is a dimensionality reduction technique for exploring the topology of a gene list.
 90 |        The PCA analysis here employs hclust to divide the gene list into functional clusters.
 91 |        If the input list is > 100 genes, RTsne will be used for visualization.
 92 | - pathwayEnrich: Cluster profiler's enricher function will be run on the input gene list.
 93 | }
 94 | \examples{
 95 | genesOfInterest <- c("CDK12", "AURKB", "SFPQ", "NFKB1", "BRCC3", "BRCA2", "PARP1",
 96 |                      "DHX9", "SON", "AURKA", "SETX", "BRCA1", "ATMIN")
 97 | res <- correlationAnalyzeR::analyzeGenesetTopology(genesOfInterest = genesOfInterest,
 98 |                                  Sample_Type = "cancer", returnDataOnly = TRUE,
 99 |                                  Tissue = "brain",
100 |                                  crossComparisonType = c("variantGenes", "PCA"))
101 | 
102 | 
103 | res <- correlationAnalyzeR::analyzeGenesetTopology(genesOfInterest = "HALLMARK_ADIPOGENESIS")
104 | 
105 | }
106 | 


--------------------------------------------------------------------------------
/man/getTERM2GENE.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/getTERM2GENE.R
  3 | \name{getTERM2GENE}
  4 | \alias{getTERM2GENE}
  5 | \title{Obtains TERM2GENE object for corGSEA}
  6 | \usage{
  7 | getTERM2GENE(
  8 |   GSEA_Type = c("simple"),
  9 |   Species = c("hsapiens", "mmusculus"),
 10 |   sampler = FALSE,
 11 |   listReturn = FALSE
 12 | )
 13 | }
 14 | \arguments{
 15 | \item{GSEA_Type}{Which pathway annotations should be considered? Options listed in
 16 | correlationAnalyzeR::pathwayCategories -- See details below for more info.}
 17 | 
 18 | \item{Species}{Species to obtain gene names for.
 19 | Either 'hsapiens' or 'mmusculus'}
 20 | 
 21 | \item{sampler}{If TRUE, will only return 100,000 random genesets from either
 22 | simple or complex TERM2GENEs. Useful for reducing GSEA computational burden.}
 23 | 
 24 | \item{listReturn}{If TRUE, will return annotations as a list object.}
 25 | }
 26 | \value{
 27 | A tbl object with columns "gs_name" and "gene_symbol"
 28 | }
 29 | \description{
 30 | Wrapper for the msigdbr::msigdbr() function
 31 | }
 32 | \details{
 33 | GSEA_Type category names and their MSIGDB description:
 34 | 
 35 | \strong{Hallmark} (a.k.a "H" in MSIGDB): "Hallmark gene sets summarize and represent
 36 | specific well-defined biological states or processes and display coherent expression.
 37 | These gene sets were generated by a computational methodology based on identifying
 38 | overlaps between gene sets in other MSigDB collections and retaining genes that display
 39 | coordinate expression."
 40 | 
 41 | \strong{Cytogenic bands} (a.k.a "C1" in MSIGDB): "Gene sets corresponding to each human
 42 | chromosome and each cytogenetic band that has at least one gene."
 43 | 
 44 | \strong{Perturbations} (a.k.a. "C2:CGP" in MSIGDB): "Gene sets represent
 45 | expression signatures of genetic and chemical perturbations. A number of these gene
 46 | sets come in pairs: xxx_UP (and xxx_DN) gene set representing genes induced
 47 | (and repressed) by the perturbation."
 48 | 
 49 | \strong{Canonical pathways} (a.k.a. "C2:CP" in MSIGDB): "Gene sets from pathway databases.
 50 | Usually, these gene sets are canonical representations of a biological process
 51 |  compiled by domain experts."
 52 | 
 53 | \strong{BioCarta} (a.k.a. "C2:CP:BIOCARTA" in MSIGDB): "Gene sets derived from the
 54 | BioCarta pathway database."
 55 | 
 56 | \strong{KEGG} (a.k.a. "C2:CP:KEGG" in MSIGDB): "Gene sets derived from the
 57 | KEGG pathway database."
 58 | 
 59 | \strong{PID} (a.k.a. "C2:CP:PID" in MSIGDB): "Gene sets derived from the
 60 | PID pathway database."
 61 | 
 62 | \strong{Reactome} (a.k.a. "C2:CP:REACTOME" in MSIGDB): "Gene sets derived from the
 63 | Reactome pathway database."
 64 | 
 65 | \strong{miRNA targets} (a.k.a. "C3:MIR" in MSIGDB): "Gene sets that contain genes
 66 | sharing putative target sites (seed matches) of human mature miRNA in their 3'-UTRs."
 67 | 
 68 | \strong{TF targets} (a.k.a. "C3:TFT" in MSIGDB): "Gene sets that share upstream
 69 |  cis-regulatory motifs which can function as potential transcription factor
 70 |   binding sites. Based on work by Xie et al. 2005"
 71 | 
 72 | \strong{Cancer gene neighborhoods} (a.k.a. "C4:CGN" in MSIGDB): "Gene sets defined
 73 |  by expression neighborhoods centered on 380 cancer-associated genes. This collection
 74 |  is described in Subramanian, Tamayo et al. 2005"
 75 | 
 76 | \strong{Cancer modules} (a.k.a. "C4:CM" in MSIGDB): "Gene sets defined by Segal
 77 |  et al. 2004. Briefly, the authors compiled gene sets ('modules') from a variety of
 78 |  resources such as KEGG, GO, and others. By mining a large compendium of cancer-related
 79 |   microarray data, they identified 456 such modules as significantly changed in a variety
 80 |    of cancer conditions."
 81 | 
 82 | \strong{GO:BP} (a.k.a. "C5:BP" in MSIGDB): "Gene sets derived from the GO Biological Process Ontology."
 83 | 
 84 | \strong{GO:CC} (a.k.a. "C5:CC" in MSIGDB): "Gene sets derived from the GO Cellular Component Ontology."
 85 | 
 86 | \strong{GO:MF} (a.k.a. "C5:MF" in MSIGDB): "Gene sets derived from the GO Molecular Function Ontology."
 87 | 
 88 | \strong{Oncogenic signatures} (a.k.a. "C6" in MSIGDB): "Gene sets that represent signatures of cellular
 89 |  pathways which are often dis-regulated in cancer. The majority of signatures were generated directly
 90 |  from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation
 91 |  of known cancer genes."
 92 | 
 93 | \strong{Immunological signatures} (a.k.a. "C7" in MSIGDB): "Gene sets that represent cell states and
 94 | perturbations within the immune system. The signatures were generated by manual curation of
 95 | published studies in human and mouse immunology."
 96 | 
 97 | \strong{Cell Type signatures} (a.k.a. "C8" in MSIGDB): "Gene sets that contain curated cluster
 98 |  markers for cell types identified in single-cell sequencing studies of human tissue."
 99 | 
100 | \strong{simple}: This is the combination of "Hallmark", "Perturbations",
101 | "BioCarta", "GO:BP", "GO:CC", "GO:MF", "KEGG", "Canonical pathways", "PID", and "Reactome"
102 | 
103 | \strong{complex}: This includes all possible gene sets.
104 | }
105 | \examples{
106 | TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple")
107 | TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = c("Hallmark", "KEGG"))
108 | 
109 | }
110 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/uploadTPM.R:
--------------------------------------------------------------------------------
  1 | library(foreach)
  2 | doParallel::registerDoParallel()
  3 | credFile <- "Data/credFile.txt"
  4 | source("Scripts/helpers.R")
  5 | 
  6 | # Get TPM data
  7 | if (! file.exists("Data/ARCHS4_Download/human_tpm_v8.h5")){
  8 |   # Download TPM files
  9 |   humanTxURL <- "https://s3.amazonaws.com/mssm-seq-matrix/human_tpm_v8.h5"
 10 |   download.file(humanTxURL, destfile = "Data/ARCHS4_Download/human_tpm_v8.h5")
 11 | }
 12 | 
 13 | # Process for upload
 14 | if (! file.exists("Data/geneTPM_forUpload_geneList.RData")) {
 15 |   # Get tx2gene objects
 16 |   library(biomaRt)
 17 |   print("Getting biomaRt")
 18 |   # Get v87 for human
 19 |   ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl",
 20 |                      host = "http://dec2016.archive.ensembl.org")
 21 |   tx2geneHuman <- getBM(attributes = c("ensembl_transcript_id", 
 22 |                                        "external_gene_name"),
 23 |                         mart = ensembl)
 24 |   # # Get v88 for mouse
 25 |   # ensembl <- useMart("ensembl", dataset="mmusculus_gene_ensembl",
 26 |   #                    host = "http://mar2017.archive.ensembl.org")
 27 |   # tx2geneMouse <- getBM(attributes = c("ensembl_transcript_id", 
 28 |   #                                      "external_gene_name"),
 29 |   #                       mart = ensembl)
 30 |   
 31 |   ## Human
 32 |   # Aggregate transcript TPM to gene level
 33 |   library(tximport)
 34 |   library(rhdf5)
 35 |   speciesScien <- "hsapiens"
 36 |   cat("Loading data ... \n")
 37 |   dataFile <- "Data/ARCHS4_Download/human_tpm_v8.h5"
 38 |   load("Data/groupList_human_raw.RData")
 39 |   samples <- h5read(dataFile, "meta/Sample_geo_accession")
 40 |   tx2gene <- tx2geneHuman
 41 |   colnames(tx2gene) <- c("tx", "gene")
 42 |   load("Data/hsapiens_corrSmall_geneNames.rda")
 43 |   transcripts <- h5read(dataFile, "meta/transcripts")
 44 |   transcripts <- gsub(transcripts, pattern = "\\..*", replacement = "")
 45 |   samplesInd <- which(samples %in% colData_human_raw$samples)
 46 |   names(samplesInd) <- samples[samplesInd]
 47 |   sampleTPMOrderHuman <- names(samplesInd)
 48 |   save(sampleTPMOrderHuman, file = file.path("Data/sampleTPMOrderHuman.RData"))
 49 |   
 50 |   geneNames <- hsapiens_corrSmall_geneNames
 51 |   
 52 |   # Set up foreach
 53 |   cores <- 120
 54 |   nr <- length(samplesInd)
 55 |   n <- ceiling(nr/cores)
 56 |   forUploadList <- split(samplesInd, rep(1:ceiling(nr/n), each=n, length.out=nr))
 57 |   k <- length(forUploadList)
 58 |   print("running foreach")
 59 |   start <- proc.time()
 60 |   resList <- foreach(j = 1:k, 
 61 |                      .verbose = T,
 62 |                      .export = c("k", "forUploadList")) %dopar%
 63 |     makeUploadTable(tx2gene = tx2gene, samplesInd = forUploadList[[j]], 
 64 |                     transcripts = transcripts, dataFile = dataFile)
 65 |   
 66 |   finalDF <- data.table::rbindlist(resList)
 67 |   ans <- proc.time() - start
 68 |   print(ans)
 69 |   print("Saving")
 70 |   save(finalDF, file = file.path("Data/geneTPM_forUpload.RData"))
 71 |   humanSamplesTPM <- unique(as.character(finalDF$sampleName))
 72 |   save(humanSamplesTPM,file = "Data/geneTPM_forUpload_sampleList.RData")
 73 | }
 74 | 
 75 | # Upload
 76 | finalDF2 <- finalDF[,c(-1)]
 77 | rownames(finalDF2) <- finalDF$sampleName
 78 | checkBool <- uploadToAzure(finalDF2, credentials = credFile,
 79 |                             tableName = "tpm_hsapiens", check = T)
 80 | if (checkBool & ! doKey) {
 81 |   # Upload to AWS
 82 |   print("uploading to Azure ... ")
 83 |   uploadToAzure(finalDF2, credentials = credFile,
 84 |                 tableName = "tpm_hsapiens", check = F)
 85 |   # sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
 86 |   # dbExecute(conn = conn, statement = sql)
 87 |   # tryCatch(expr = {uploadToAzure(finalDF2, tableName)},
 88 |   #          error = function(e) {cat("AWS error -- probably not enough space")})
 89 |   
 90 | } else {
 91 |   print("Table already uploaded ... ")
 92 | }
 93 | if (doKey) {
 94 |   credentials <- "Data/credFile.txt"
 95 |   credentials <- suppressWarnings(read.delim(credentials, sep = ";",
 96 |                                              header = FALSE, stringsAsFactors = F))
 97 |   uName <- credentials$V1
 98 |   pWord <- credentials$V2
 99 |   tableName = "tpm_hsapiens"
100 |   conn <- dbConnect(drv = RMySQL::MySQL(), user = uName, 
101 |                     port = 3306, dbname="correlation_analyzer",
102 |                     password=pWord,
103 |                     host="m2600az-db01p.mysql.database.azure.com")
104 |   sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
105 |   done <- 0
106 |   while (done  == 0) {
107 |     done <- tryCatch(expr = {
108 |       dbExecute(conn = conn, statement = sql)
109 |       dbDisconnect(conn)
110 |       rm(conn)
111 |       gc()
112 |       1
113 |     },
114 |     error = function(e) {
115 |       print(e$message)
116 |       retCall <- "could not run statement: Multiple primary key defined"
117 |       if (e$message == retCall) {
118 |         cat("Already defined key")
119 |         dbDisconnect(conn)
120 |         rm(conn)
121 |         gc()
122 |         1
123 |       } else {
124 |         cat("Fail -- retry with new connection")
125 |         rm(conn)
126 |         gc()
127 |         conn <- dbConnect(drv = RMySQL::MySQL(), user = uName,
128 |                           port = 3306, dbname="correlation_analyzer",
129 |                           password=pWord,
130 |                           host="m2600az-db01p.mysql.database.azure.com")
131 |         0
132 |       }
133 |     })
134 |     print(done)
135 |   }
136 | }
137 | lapply(dbListConnections(drv = RMySQL::MySQL()), dbDisconnect)
138 | Sys.sleep(3)
139 | cat("\nNext sample ... \n")
140 | rm(finalDF2)
141 | gc()
142 | 
143 | 
144 | 
145 | 
146 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/tissueDictionary.json:
--------------------------------------------------------------------------------
1 | {"brain":{"yes":["cortex","brain","lobe","hippoc","^pfc$","\\bpfc\\b","\\bmge\\b","glioblastoma","a172","^vc$","^cbc$","gyrus","stroke","sciencell","gbm","purkinje","pyramidal","u87","\\bglioma\\b","alzheim","frontal","dentate","white matter","brian","cranial","^glioma","[ -]+glioma","optic chiasm","grey matter","gray matter","striatum","pericyte","nerv","gangli","bipol","medull","putamen","hippocamp","neur","glia","amygdala","oligodendro","spine","spinal","astrocyt","cereb","pineal"],"no":["liver","kidney","microgli","aneurysm","vessel","precursor","progenitor","stem cell","Neuroectodermal","NPC","NSC"]},"thyroid":{"yes":["thyroid","thyrocyt"],"no":["whole blood"]},"respiratory":["lung","airway","mesothel","beas-2b","nasal","hsaec","nsclc","trach","pleura","alveol","bronch"],"skin":{"yes":["skin","keratin","dermis","\\bNHEM\\b","NHEM_M2","wm989","squamous cell","NHEM-M2","hfdpc","arn8","cutaneous","wm983b","^dk$","epidermis","melano","psoriasis"],"no":["fibroblas","subcutaneous"]},"pancreas":{"yes":["pancreas","pancrea","pdac","yapc","panc","islet","\\balpha","\\bbeta","\\bdelta","\\bepsilon"],"no":["falpha","fbeta","progen","a673"]},"kidney":{"yes":["kidney","nephr","glomerul","reninoma","proximal tubular epithelial cell","\\bhkc\\b","renal","\\bclear cell","\\bptec\\b","ccrcc"],"no":["airway","hek","293"]},"cartilage":["cartilag","chondr","joint"],"mammary":{"yes":["mammary","breast","mcf[ -_]*[0-9]","hmler","ductal","imec","hmec","\\bmda[ -]*231","sum159","hme1","skbr3","bt474","\\bmda[ -_]*mb[ -_]*[0-9]+\\b","hmepc","t47d","reduction mammoplasty no known cancer"],"no":["airway","pancrea","PDAC"]},"stomach":["stomach","gastric"],"esophagus":["esophag","escc"],"intestines":{"yes":["intestine","intestinal","caco-2et","colon","duoden","colorect","\\bcrc\\b","ileum","ileal","rectal","\\bec cells","gut","bowel","jejun","sigmoid","recto","ileocolic","tubular adenoma","hct116"],"no":["colonization","ncm356d"]},"muscle":{"yes":["muscle","myo","^smc"," 
smc","lateralis","gastrocnemius","skmc","hpasmc","skeletal","brach","satellite","ceps"],"no":["cardiac","heart","ffpe bladder","endothel","vessel","cardiomyopathy"]},"liver":{"yes":["liver","hepat","hepg2","kupffer","phh","HCC"],"no":["deliver"]},"adipose":{"yes":["adipose","fat","hwp","adipo","^wat$","^bat$","liposarcoma"],"no":["mesenchymal","milk","adipocyterna","hdfatprx1","gluteal subcutaneous"]},"prenatal":{"yes":["placent","fetal","decidua","germ","fetus","embry","hpiepc","parthenogenic","blastoycst","trophoblast","zygote","endoderm","morula","oocyte"," ICM$"," te$","oocy","oophorus","293","hpc-pl","primitive streak","fetal","Germinal","Embryo","Fetus","mesoderm","ectoderm"],"no":["stem","endothel","ewing","tc32","sknmc"]},"stem-like":{"yes":["stem","progen","prog$","hesc","hues64","\\b[h]*msc","mesenchymal","mesenchymal","hff1","HFF-1","nhdf","^derma$","fibroblas","IMR[- ]*90","HFF","fbs","tra-1-60","tra1-60","^bms$","detroit 551","cord","cd34","\\bips\\b","\\bbj\\b","ht1080","\\bhpc\\b","NPC","NSC","npsc","pMN progenitor","ncc","ncsc","pscs","cpcs","\\bhes[;]*\\b","hvmf","fibrosarc","haoaf","hesc","\\besc[s]*\\b","\\besc[s]*[ -_]","^h9$"," h9 "," h9$","^h9 ","h1esc","pluripoten","neural crest","hNCCs","NCCs","pES[0-9]+","HUES[0-9]+","embyonic","human es","^h1","embryonic stem","iPSC","ips cell","^ips$","hiPS","human ips","hpsc","pluripotent stem","Induced pluripotent","dental pulp","periodontal ligament stem","HSC","hematopoeitic stem","cd34","precursor","hnspc","Neuron Precursor","neural precursor","ncsc","ncc","hnspc","^kp$","ECFC","npc","hpsc","hfl1","HSPC","dental pulp cells","cpcs","fbs","[a-zA-Z]genic","poetic"],"no":["h1-neurons","\\bb cell","hepatic","shScramble","hnscc","ccrf-cem","escc","nasopharyngeal","\\ball\\b","all-sil","[ _-]+all","tissue: blood","fetal lung fibroblasts","leukemi","glioblastoma","\\baml\\b","npc 
tumor","glioma","gbm","UMSCC","pMSCV"]},"cardiac":{"yes":["cardiac","heart","atria","atrium","HCASMC","coron","aort","ventric"],"no":["airway","fibroblast","progenitor"]},"endothelial":{"yes":["endoth","huvec","hdmec","ECFC","vascul","vessel","f[0-9]ecs","haoec","hsavec","\\blsec\\b","rac-vec","hpmec","ecctr","ectnf","ecil"],"no":["bladder","whole blood"]},"spleen":["spleen","splen"],"bladder":{"yes":["bladder","urin","urothe"],"no":["during","gall","mn cell line","h3 es cells"]},"retina":["retina","macular","\\brpe[ -_]*[1]*","retin","photo"],"thymus":["thymus","thymic"],"male reproductive":{"yes":["testis","testes","testic","leydig","peritubular","sertoli","cauda","corpus","caput","sperm","epidid","gonad"],"no":["prostate","prostatic"]},"prostate":{"yes":["prostate","rwpe1","arcapm","lncap","lucap","prostatic","\\bprec\\b"],"no":["lymph"]},"female reproductive":{"yes":["ovar","amniotic","endomet","uter","placent","fallopian","deciduo","cervix","amnion","\\bhela\\b","hela_","chorionic villus","endometrial stroma","endometri","oviductal","cervic","vagi","granulosa"],"no":["ddasdasdasdasd"]},"immune":{"yes":["immune","macroph","leuk","mdsc","killer","lymph","gm18507","\\bt-all\\b","cd[0-9]+"," AML ","^AML "," AML$","hl60","NKT","microgli","gm12878","\\btcell","\\bbcell","\\bb[ -]cell","\\bt[ -]cell","k562","ccrf-cem","white blood cell","\\bth1\\b","sk[-]*no[-]*1","cd4","cd8","nk cell","marrow","akata","hmnc-pb","\\bbjab\\b","cuttl1","mn cell line","th2","j-lat","dlbcl","hut78","monocyt","dendrit","granulocyt","lympho","mononucle","pbmc","neutro","treg"],"no":["ctc","osteo","vein","stroma","msc","hematopoetic","stem cell","precursor","mesenchymal","vessel","cord","cd34"]},"bone":{"yes":["femur","osteo","u2os","hfob","ewing","\\bhob\\b","a673","mandible","bone","joint"],"no":["MSC","marrow","stroma"]},"tumors":{"yes":["cancer","hela","metast","t47d","sk[-]*no[-]*1","mdamd231","\\bmda[ -]*231","^rt[0-9]+","g401","Soft Tissue, Mesenchymal","tumor","hec1b","^omental 
tissue$","HNSCC","HCC","hela","k562","reh","jurkat","leukemi","bewo","kras","lncap","bjab","gbm"," aml","t-all","\\bpanc\\b","skbr3","u87","wm983b","a673","hepg2","dlbcl","caco-2et","ccrcc","\\bcrc\\b","rko","ramos","mel888","aml ","nsclc","mda_mb_231","vcap","saos2","vapc","nalm6","set2","tov21","cancer","carcin","sarcom","metasta","tumor","[a-zA-Z]oma\\b","NCCIT","a172","yapc","u2os","wm989","hct116","ht1080","arn8","lncap","squamous cell carcinoma","mcf_7","tumour","gbm","lucap","mn cell line","sum159","ccrf-cem","panc1","mcf7","mcf-7","\\bmda[ -_]*mb[ -_]*[0-9]+\\b","pc3","hl60","bt474","escc"],"no":["healthy","normal","Uninvolved","293","stroma","woman","Mycosis"]},"SingleCell":["single cell RNA","single-cell","smart seq","in-drop","cel-seq","10X genomics","scRNA seq","smartseq","CELseq","smart-seq","indrop","drop-seq","drop seq","single nucleus","single-nucleus","snRNA-Seq","snRNASeq","fluidigm","scRNASeq","scRNA-Seq","chromium"]}
2 | 


--------------------------------------------------------------------------------
/R/getTissueVST.R:
--------------------------------------------------------------------------------
  1 | #' Get VST values for tissues and gene of interest
  2 | #'
  3 | #' Downloads VST values for tissues of interest
  4 | #'
  5 | #' @param genesOfInterest A length-two vector with genes to compare.
  6 | #'
  7 | #' @param Tissues Which tissue type should VST be collected for? See available options
  8 | #' with getTissueTypes().
  9 | #'
 10 | #' @param Sample_Type Type of RNA Seq samples to obtain VST for? See available options
 11 | #' with getTissueTypes().
 12 | #'
 13 | #' @param useBlackList Should black-listed tissue/disease categories for this species
 14 | #' be removed from the returned list? Improves the quality of analysis by removing
 15 | #' categories with low sample numbers and high observed variance.
 16 | #' @param pool an object created by pool::dbPool to accessing SQL database.
 17 | #' It will be created if not supplied.
 18 | #' @return List of VST matrices for each selected tissue-disease combination.
 19 | #'
 20 | #' @examples
 21 | #' VSTdata <- getTissueVST(genesOfInterest = c("BRCA1", "ATM"),
 22 | #'                     Tissues = c("brain", "respiratory"),
 23 | #'                     Sample_Type = "all",
 24 | #'                     useBlackList = TRUE)
 25 | #' @export
 26 | getTissueVST <- function(genesOfInterest,
 27 |                          # Species = c("hsapiens", "mmusculus"),
 28 |                          Tissues = "all",
 29 |                          Sample_Type = c("all", "normal", "cancer"),
 30 |                          useBlackList = TRUE, pool = NULL) {
 31 | 
 32 |   # genesOfInterest = c("BRCA1")
 33 |   # Species = "hsapiens"
 34 |   # Tissues = "all"
 35 |   # Sample_Type = c("normal")
 36 |   # useBlackList = TRUE
 37 |   # pool = NULL
 38 | 
 39 |   Species = "hsapiens"
 40 | 
 41 |   if (! is.null(pool)) {
 42 |     if (! pool$valid) {
 43 |       pool <- NULL
 44 |     } else {
 45 |       conn <- pool::poolCheckout(pool)
 46 |       doPool <- TRUE
 47 |       on.exit(pool::poolReturn(conn))
 48 |     }
 49 |   }
 50 | 
 51 |   if (is.null(pool)) {
 52 |     doPool <- FALSE
 53 |     conn <- NULL
 54 |     retryCounter <- 1
 55 |     # cat("\nEstablishing connection to database ... \n")
 56 |     while(is.null(conn)) {
 57 |       conn <- try(silent = T, eval({
 58 |         DBI::dbConnect(
 59 |           drv = RMySQL::MySQL(),
 60 |           user = "public-rds-user@m2600az-db01p.mysql.database.azure.com", port = 3306,
 61 |           dbname="correlation_analyzer",
 62 |           password='public-user-password',
 63 |           host="m2600az-db01p.mysql.database.azure.com"
 64 |         )
 65 |       }))
 66 |       if ("try-error" %in% class(conn)) {
 67 |         if (retryCounter == 3) {
 68 |           stop("Unable to connect to database. Check internet connection and please contanct",
 69 |                " package maintainer if you believe this is an error.")
 70 |         }
 71 |         warning(paste0("Failed to establish connection to database ... retrying now ... ",
 72 |                        (4-retryCounter), " attempts left."),
 73 |                 immediate. = T)
 74 |         conn <- NULL
 75 |         retryCounter <- retryCounter + 1
 76 |         Sys.sleep(1)
 77 |       }
 78 |     }
 79 | 
 80 |     on.exit(DBI::dbDisconnect(conn))
 81 |   }
 82 | 
 83 |   samples <- correlationAnalyzeR::sampleVSTOrderHuman
 84 |   possibleGenes <- correlationAnalyzeR::humanGenesVST
 85 | 
 86 |   # Get samples for each tissue group
 87 |   possibleTissues <- correlationAnalyzeR::getTissueTypes(#Species = Species,
 88 |                                                          useBlackList = useBlackList,
 89 |                                                          pool = pool)
 90 |   possibleTissues1 <- gsub(possibleTissues, pattern = " - .*", replacement = "")
 91 |   possibleTissues2 <- gsub(possibleTissues, pattern = ".* - ", replacement = "")
 92 |   possibleRetrieval <- paste0(possibleTissues2, "_", possibleTissues1)
 93 |   if (any(Tissues == "all")) {
 94 |     ofInterest <- possibleRetrieval
 95 |   } else {
 96 |     ofInterest <- possibleRetrieval[grep(x = possibleRetrieval,
 97 |                                          pattern = paste0("_", Tissues, collapse = "|"))]
 98 |     if (! length(ofInterest)) {
 99 |       stop("No valid tissue types returned. Please check that your tissue types are ",
100 |            "correct by running getTissueTypes()")
101 |     }
102 |   }
103 |   if (all(Sample_Type != "all")) {
104 |     ofInterest <- ofInterest[grep(x = ofInterest,
105 |                                   pattern = paste0(Sample_Type, collapse = "|"))]
106 |   }
107 |   genesOfInterest <- unique(genesOfInterest)
108 |   genesOfInterestBad <- genesOfInterest[which(! genesOfInterest %in% possibleGenes)]
109 |   genesOfInterestFinal <- genesOfInterest[which(genesOfInterest %in% possibleGenes)]
110 |   if (! length(genesOfInterestFinal)) {
111 |     stop(paste0(genesOfInterestBad, collapse = ", "), " not found in VST data.",
112 |          " view data(humanVSTGenes) or data(mouseVSTGenes) to see available gene list")
113 |   } else if (length(genesOfInterestBad)) {
114 |     warning(paste0(genesOfInterestBad, collapse = ", "), " not found in VST data.",
115 |             " view data(humanVSTGenes) or data(mouseVSTGenes) to see available gene list")
116 |   }
117 | 
118 | 
119 | 
120 |   # Gather VST across samples for genes of interest
121 |   sql <- paste0("SELECT * FROM VSD_",
122 |                 Species,
123 |                 " WHERE row_names IN ('",
124 |                 paste(genesOfInterestFinal, collapse = "','"), "')")
125 | 
126 |   resdf <- try(silent = T, eval({
127 |     DBI::dbGetQuery(conn, sql)
128 |   }))
129 |   if ("try-error" %in% class(resdf)) {
130 |     stop("ERROR IN DB CONNECTION: ", resdf)
131 |   }
132 |   # Parse VST frame
133 |   resdf2 <- stringr::str_split_fixed(resdf$values, stringr::fixed(","), n = Inf)
134 |   resdf2 <- apply(t(resdf2), 1:2, as.numeric)
135 |   resdf2 <- as.data.frame(resdf2)
136 |   colnames(resdf2) <- resdf$row_names
137 |   resdf2 <- cbind(samples, resdf2)
138 |   rownames(resdf2) <- NULL
139 |   # Return VST frame for each specified group
140 |   resFrameList <- list()
141 |   newList <- unlist(correlationAnalyzeR::human_grouplist, recursive = F)
142 |   groups <- gsub(names(newList), pattern = "\\.", replacement = " - ")
143 |   newList <- newList[! groups %in% correlationAnalyzeR::blackListHuman]
144 |   groupNow2 <- c()
145 |   for (i in 1:length(newList)) {
146 |     samplesNow <- newList[[i]]
147 |     groupNow <- gsub(names(newList)[i], pattern = "\\.", replacement = " - ")
148 |     name2 <- gsub(names(newList)[i], pattern = "(.+)\\.(.+)", replacement = "\\2_\\1")
149 |     groupNow2 <- c(groupNow2, gsub(name2, pattern = "\\.| ", replacement = "_"))
150 |     frameNow <- resdf2[which(resdf2$samples %in% samplesNow),, drop = FALSE]
151 |     resFrameList[[i]] <- frameNow
152 |     names(resFrameList)[i] <- groupNow
153 |   }
154 |   groupNow2 <- gsub(groupNow2, pattern = "-", replacement = "_")
155 |   keepInd <- which(groupNow2 %in% ofInterest)
156 |   resFrameList <- resFrameList[keepInd]
157 |   return(resFrameList)
158 | }
159 | 
160 | 


--------------------------------------------------------------------------------
/R/getCorrelationData.R:
--------------------------------------------------------------------------------
  1 | #' Get Gene Correlation Data
  2 | #'
  3 | #' Obtain correlation data by querying MySQL database
  4 | #'
  5 | #' @param Sample_Type Type of RNA Seq samples used to create correlation data.
  6 | #' Either "all", "normal", or "cancer". Can be a single value for all genes,
  7 | #' or a vector corresponding to geneList. Not used if corrMat is set.
  8 | #' @param Tissue Which tissue type should gene correlations be derived from?
  9 | #' Default = "all". Can be a single value for all genes,
 10 | #' or a vector corresponding to geneList. Not used if corrMat is set.
 11 | #' Run getTissueTypes() to see available tissue list.
 12 | #' @param geneList Vector of genes for which data will be extracted.
 13 | #' @param corrMat A custom correlation matrix generated by generateCorrelations()
 14 | #' to use instead of pre-supplied databases.
 15 | #' @param pool an object created by pool::dbPool to accessing SQL database.
 16 | #' It will be created if not supplied. Not used if corrMat is set.
 17 | #' @return A correlation data frame object
 18 | #'
 19 | #' @examples
 20 | #' corrData <- correlationAnalyzeR::getCorrelationData(Sample_Type = "normal",
 21 | #'                                        Tissue = "kidney",
 22 | #'                                        geneList = c("ATM", "BRCA1"))
 23 | #'
 24 | #' @export
 25 | getCorrelationData <- function(Sample_Type,
 26 |                                Tissue, geneList,
 27 |                                corrMat = NULL,
 28 |                                pool = NULL) {
 29 | 
 30 |   # Species = "hsapiens"
 31 |   # Sample_Type = "all"
 32 |   # Tissue = c("female_reproductive")
 33 |   # geneList = c("BRCA1")
 34 |   # pool = NULL
 35 | 
 36 |   # Use the user-supplied corrMat if provided
 37 |   if (! is.null(corrMat)) {
 38 |     resdf <- as.data.frame(corrMat[, geneList])
 39 |     colnames(resdf) <- geneList
 40 |     return(resdf)
 41 |   }
 42 | 
 43 |   Species <- "hsapiens"
 44 | 
 45 |   if (! is.null(pool)) {
 46 |     if (! pool$valid) {
 47 |       pool <- NULL
 48 |     } else {
 49 |       conn <- pool::poolCheckout(pool)
 50 |       doPool <- TRUE
 51 |       on.exit(pool::poolReturn(conn))
 52 |     }
 53 |   }
 54 | 
 55 |   if (is.null(pool)) {
 56 |     doPool <- FALSE
 57 |     conn <- NULL
 58 |     retryCounter <- 1
 59 |     # cat("\nEstablishing connection to database ... \n")
 60 |     while(is.null(conn)) {
 61 |       conn <- try(silent = T, eval({
 62 |         DBI::dbConnect(
 63 |           drv = RMySQL::MySQL(),
 64 |           user = "public-rds-user@m2600az-db01p.mysql.database.azure.com", port = 3306,
 65 |           dbname="correlation_analyzer",
 66 |           password='public-user-password',
 67 |           host="m2600az-db01p.mysql.database.azure.com"
 68 |         )
 69 |       }))
 70 |       if ("try-error" %in% class(conn)) {
 71 |         if (retryCounter == 3) {
 72 |           stop("Unable to connect to database. Check internet connection and please contanct",
 73 |                " package maintainer if you believe this is an error.")
 74 |         }
 75 |         warning(paste0("Failed to establish connection to database ... retrying now ... ",
 76 |                        (4-retryCounter), " attempts left."),
 77 |                 immediate. = T)
 78 |         conn <- NULL
 79 |         retryCounter <- retryCounter + 1
 80 |         Sys.sleep(1)
 81 |       }
 82 |     }
 83 |     on.exit(DBI::dbDisconnect(conn))
 84 |   }
 85 | 
 86 |   # Make sure all tissue entries are appropriate
 87 |   goodConditions <- getTissueTypes(#Species = Species,
 88 |                                    pool = pool)
 89 | 
 90 |   goodTissues <- unique(gsub(goodConditions,
 91 |                              pattern = "(.*) - (.*)",
 92 |                              replacement = "\\1"))
 93 |   goodSamples <- unique(gsub(goodConditions,
 94 |                              pattern = "(.*) - (.*)",
 95 |                              replacement = "\\2"))
 96 | 
 97 |   if (! all(Sample_Type %in% goodSamples)) {
 98 |     stop("Sample type must be either 'normal' or 'cancer'")
 99 |   } else if (! all(Tissue %in% goodTissues)) {
100 |     stop("Tissue types must be selected from available options.",
101 |          " Run correlationAnalyzeR::getTissueTypes() to see available tissue - sample groups.")
102 |   }
103 | 
104 |   geneNames <- correlationAnalyzeR::hsapiens_corrSmall_geneNames
105 |   # Queries from multiple db at once
106 |   if (length(Tissue) == 1) {
107 |     Tissue <- rep(Tissue, length(geneList))
108 |   } else if (length(Tissue) > 1) {
109 |     if (length(Tissue) != length(geneList)) {
110 |       warning("Number of valid genes not equal
111 |            to length of supplied Tissue vector. Using only ", Tissue[1])
112 |       Tissue <- rep(Tissue[1], length(geneList))
113 |     }
114 |   }
115 |   if (length(Sample_Type) == 1) {
116 |     Sample_Type <- rep(Sample_Type, length(geneList))
117 |   } else if (length(Sample_Type) > 1) {
118 |     if (length(Sample_Type) != length(geneList)) {
119 |       warning("Number of valid genes not equal
120 |            to length of supplied Sample_Type vector. Using only ", Sample_Type[1])
121 |       Sample_Type <- rep(Sample_Type[1], length(geneList))
122 |     }
123 |   }
124 | 
125 |   if (length(unique(Tissue)) == 1 & length(unique(Sample_Type)) == 1) {
126 |     sql <- paste0("SELECT * FROM correlations_",
127 |                   Species, "_",
128 |                   tolower(unique(Sample_Type)), "_",
129 |                   tolower(unique(Tissue)),
130 |                   " WHERE row_names IN ('",
131 |                   paste(geneList, collapse = "','"), "')")
132 |     resdf <- try(silent = T, eval({
133 |       DBI::dbGetQuery(conn, sql)
134 |     }))
135 |     if ("try-error" %in% class(resdf)) {
136 |       stop("ERROR IN DB CONNECTION: ", resdf)
137 |     }
138 |     resdf2 <- stringr::str_split_fixed(resdf$values, stringr::fixed(","), n = Inf)
139 |     resdf2 <- apply(t(resdf2), 1:2, as.numeric)
140 |     resdf2 <- as.data.frame(resdf2)
141 |     colnames(resdf2) <- resdf$row_names
142 |     rownames(resdf2) <- geneNames
143 |     if (length(geneList) > 1) {
144 |       resdf2 <- resdf2[,order(match(colnames(resdf2), geneList))]
145 |     }
146 |   } else {
147 |     resDfList <- list()
148 |     for ( i in 1:length(geneList) ) {
149 |       geneName <- geneList[i]
150 |       TissueNow <- Tissue[i]
151 |       Sample_TypeNow <- Sample_Type[i]
152 |       sql <- paste0("SELECT * FROM correlations_",
153 |                     Species, "_",
154 |                     tolower(Sample_TypeNow), "_", tolower(TissueNow),
155 |                     " WHERE row_names IN ('",
156 |                     geneName, "')")
157 |       resdf <- try(silent = T, eval({
158 |         DBI::dbGetQuery(conn, sql)
159 |       }))
160 |       if ("try-error" %in% class(resdf)) {
161 |         stop("ERROR IN DB CONNECTION: ", resdf)
162 |       }
163 |       resdf2 <- stringr::str_split_fixed(resdf$values, stringr::fixed(","), n = Inf)
164 |       resdf2 <- apply(t(resdf2), 1:2, as.numeric)
165 |       resdf2 <- as.data.frame(resdf2)
166 |       colnames(resdf2) <- resdf$row_names
167 |       rownames(resdf2) <- geneNames
168 |       resDfList[[i]] <- resdf2
169 |     }
170 |     resdf2 <- dplyr::bind_cols(resDfList)
171 |     rownames(resdf2) <- geneNames
172 |   }
173 |   return(resdf2)
174 | }
175 | 
176 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/calculatCorrelations.R:
--------------------------------------------------------------------------------
  1 | library(sva)
  2 | library(BiocParallel)
  3 | library(foreach)
  4 | library(data.table)
  5 | library(DBI)
  6 | #library(future)
  7 | library(DESeq2)
  8 | #options(future.globals.maxSize= 5e10)
  9 | #plan(multicore(workers = 10))
 10 | library(WGCNA)
 11 | WGCNA::enableWGCNAThreads()
 12 | doParallel::registerDoParallel()
 13 | 
 14 | source("Scripts/helpers.R")
 15 | 
 16 | set.seed(42)
 17 | 
 18 | # Should we just use the primary key logic?
 19 | doKey <- TRUE
 20 | 
 21 | species <- "Human"
 22 | cat("Species is ", species)
 23 | speciesScien <- ifelse(test = species == "Human", yes = "hsapiens", no = "mmusculus")
 24 | cat("Loading data ... \n")
 25 | dataDir <- "Data/"
 26 | load("Data/groupList_human_raw.RData")
 27 | credFile <- "Data/credFile.txt"
 28 | colData <- colData_human_raw
 29 | groupList <- groupList_human_raw
 30 | 
 31 | # Add the all category
 32 | cancerList <- c()
 33 | normalList <- c()
 34 | for (i in 1:length(groupList)) {
 35 |   if (length(groupList[[i]]$cancer)) {
 36 |     cancerList <- c(cancerList, sample(x = groupList[[i]]$cancer, 100, replace = T))
 37 |   }
 38 |   if (length(groupList[[i]]$normal)) {
 39 |     normalList <- c(normalList, sample(x = groupList[[i]]$normal, 100, replace = T))
 40 |   }
 41 | }
 42 | 
 43 | groupList[["all"]] <- list("all" = unique(c(cancerList, normalList)),
 44 |                            "cancer" = unique(cancerList),
 45 |                            "normal" = unique(normalList))
 46 | 
 47 | dir.create("Data/corr_mats")
 48 | 
 49 | cat("Starting analysis ... \n")
 50 | for (i in 1:length(groupList)) {
 51 |   groups <- groupList[[i]]
 52 |   name <- names(groupList)[i]
 53 |   name <- gsub(name, pattern = " ", replacement = "_")
 54 |   dir.create(file.path("Data/corr_mats", name))
 55 |   cat("Current group: ", name, "\n")
 56 |   for (type in c("all", "cancer", "normal")) {
 57 |     #  samples
 58 |     Samps <- groups[[type]]
 59 |     n <- length(Samps)
 60 |     cat(paste0("\n", type, " - ", n), "\n")
 61 |     if (n < 30) {
 62 |       warning("Not enough samples")
 63 |     } else {
 64 |       Path <- file.path("Data/corr_mats", name, type)
 65 |       dir.create(path = Path, showWarnings = F)
 66 |       if (! file.exists(file.path(Path, "corMatForUpload.RData"))) {
 67 |         load("Data/vsd_for_corr.rda")
 68 |         expNow <- assay(vsd)[,colnames(assay(vsd)) %in% Samps]
 69 |         rm(vsd)
 70 |         gc()
 71 |         colDataNow <- colData[which(colData$samples %in% colnames(expNow)),]
 72 |         colDataNow <- colDataNow[order(match(colDataNow$samples, colnames(expNow))),]
 73 |         all(colDataNow$samples %in% colnames(expNow))
 74 |         Samps <- colDataNow$samples
 75 |         corMat <- WGCNA::cor(x = t(expNow), verbose = 1, nThreads = 100)
 76 |         forUpload <- cbind(as.data.frame(rownames(corMat)), corMat)
 77 |         cores <- 20
 78 |         colnames(forUpload)[1] <- "geneName"
 79 |         nr <- nrow(forUpload)
 80 |         n <- ceiling(nr/cores)
 81 |         forUploadList <- split(forUpload, rep(1:ceiling(nr/n), each=n, length.out=nr))
 82 |         k <- length(forUploadList)
 83 |         print("running foreach")
 84 |         start <- proc.time()
 85 |         resList <- mclapply(forUploadList, convertForUpload, mc.cores = cores)
 86 |         finalDF <- data.table::rbindlist(resList)
 87 |         ans <- proc.time() - start
 88 |         print(ans)
 89 |         print("Saving")
 90 |         finalDF2 <- finalDF[,c(-1), drop = F]
 91 |         rownames(finalDF2) <- finalDF$geneName
 92 |         save(finalDF2, file = file.path(Path, "corMatForUpload.RData"))
 93 |       } else {
 94 |         # Check whether upload necessary before loading
 95 |         tableName <- paste0("correlations_", speciesScien, "_", type, "_", name)
 96 |         tableName <- gsub(tableName, pattern = "-", replacement = "_")
 97 |         checkBool <- uploadToAzure(tableName = tableName, check = T,
 98 |                                    credentials = credFile)
 99 |         if (checkBool & ! doKey) {
100 |           print("Loading matrix for upload ... ")
101 |           load(file.path(Path, "corMatForUpload.RData"))
102 |           
103 |         }
104 |       }
105 |       tableName <- paste0("correlations_", speciesScien, "_", type, "_", name)
106 |       tableName <- gsub(tableName, pattern = "-", replacement = "_")
107 |       checkBool <- uploadToAzure(tableName = tableName, credentials = credFile,
108 |                                  check = T)
109 |       if (checkBool & ! doKey) {
110 |         # Upload to AWS
111 |         print("uploading to Azure ... ")
112 |         uploadToAzure(finalDF2, credentials = credFile,
113 |                       tableName, check = F)
114 |         # sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
115 |         # dbExecute(conn = conn, statement = sql)
116 |         # tryCatch(expr = {uploadToAzure(finalDF2, tableName)},
117 |         #          error = function(e) {cat("AWS error -- probably not enough space")})
118 |         
119 |       } else {
120 |         print("Table already uploaded ... ")
121 |       }
122 |       if (doKey) {
123 |         credentials <- "Data/credFile.txt"
124 |         credentials <- suppressWarnings(read.delim(credentials, sep = ";",
125 |                                                    header = FALSE, stringsAsFactors = F))
126 |         uName <- credentials$V1
127 |         pWord <- credentials$V2
128 |         conn <- dbConnect(drv = RMySQL::MySQL(), user = uName, 
129 |                           port = 3306, dbname="correlation_analyzer",
130 |                           password=pWord,
131 |                           host="m2600az-db01p.mysql.database.azure.com")
132 |         sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
133 |         done <- 0
134 |         while (done  == 0) {
135 |           done <- tryCatch(expr = {
136 |             dbExecute(conn = conn, statement = sql)
137 |             dbDisconnect(conn)
138 |             rm(conn)
139 |             gc()
140 |             1
141 |           },
142 |           error = function(e) {
143 |             print(e$message)
144 |             retCall <- "could not run statement: Multiple primary key defined"
145 |             if (e$message == retCall) {
146 |               cat("Already defined key")
147 |               dbDisconnect(conn)
148 |               rm(conn)
149 |               gc()
150 |               1
151 |             } else {
152 |               cat("Fail -- retry with new connection")
153 |               rm(conn)
154 |               gc()
155 |               conn <- dbConnect(drv = RMySQL::MySQL(), user = uName,
156 |                                 port = 3306, dbname="correlation_analyzer",
157 |                                 password=pWord,
158 |                                 host="m2600az-db01p.mysql.database.azure.com")
159 |               0
160 |             }
161 |           })
162 |           print(done)
163 |         }
164 |       }
165 |       lapply(dbListConnections(drv = RMySQL::MySQL()), dbDisconnect)
166 |       Sys.sleep(3)
167 |       cat("\nNext sample ... \n")
168 |       rm(finalDF2)
169 |       gc()
170 |     } 
171 |   }
172 | }
173 | 
174 | # 
175 | # hsapiens_corrSmall_geneNames <- rownames(finalDF2)
176 | # save(hsapiens_corrSmall_geneNames, file = "Data/hsapiens_corrSmall_geneNames.rda")
177 | # 
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 


--------------------------------------------------------------------------------
/R/getTERM2GENE.R:
--------------------------------------------------------------------------------
  1 | #' Obtains TERM2GENE object for corGSEA
  2 | #'
  3 | #' Wrapper for msgidb::msigdbr() function
  4 | #'
  5 | #' @param Species Species to obtain gene names for.
  6 | #' Either 'hsapiens' or 'mmusculus'
  7 | #'
  8 | #' @param GSEA_Type Which pathway annotations should be considered? Options listed in
  9 | #' correlationAnalyzeR::pathwayCategories -- See details below for more info.
 10 | #'
 11 | #' @param sampler If TRUE, will only return 100,000 random genesets from either
 12 | #' simple or complex TERM2GENEs. Useful for reducing GSEA computational burden.
 13 | #'
 14 | #' @param listReturn If TRUE, will return annotations as a list object.
 15 | #'
 16 | #' @return A tbl object with columns "gs_name" and "gene_symbol"
 17 | #'
 18 | #' @details GSEA_Type category names and their MSIGDB description:
 19 | #'
 20 | #' \strong{Hallmark} (a.k.a "H" in MSIGDB): "Hallmark gene sets summarize and represent
 21 | #' specific well-defined biological states or processes and display coherent expression.
 22 | #' These gene sets were generated by a computational methodology based on identifying
 23 | #' overlaps between gene sets in other MSigDB collections and retaining genes that display
 24 | #' coordinate expression."
 25 | #'
 26 | #' \strong{Cytogenic bands} (a.k.a "C1" in MSIGDB): "Gene sets corresponding to each human
 27 | #' chromosome and each cytogenetic band that has at least one gene."
 28 | #'
 29 | #' \strong{Perturbations} (a.k.a. "C2:CGP" in MSIGDB): "Gene sets represent
 30 | #' expression signatures of genetic and chemical perturbations. A number of these gene
 31 | #' sets come in pairs: xxx_UP (and xxx_DN) gene set representing genes induced
 32 | #' (and repressed) by the perturbation."
 33 | #'
 34 | #' \strong{Canonical pathways} (a.k.a. "C2:CP" in MSIGDB): "Gene sets from pathway databases.
 35 | #' Usually, these gene sets are canonical representations of a biological process
 36 | #'  compiled by domain experts."
 37 | #'
 38 | #' \strong{BioCarta} (a.k.a. "C2:CP:BIOCARTA" in MSIGDB): "Gene sets derived from the
 39 | #' BioCarta pathway database."
 40 | #'
 41 | #' \strong{KEGG} (a.k.a. "C2:CP:KEGG" in MSIGDB): "Gene sets derived from the
 42 | #' KEGG pathway database."
 43 | #'
 44 | #' \strong{PID} (a.k.a. "C2:CP:PID" in MSIGDB): "Gene sets derived from the
 45 | #' PID pathway database."
 46 | #'
 47 | #' \strong{Reactome} (a.k.a. "C2:CP:REACTOME" in MSIGDB): "Gene sets derived from the
 48 | #' Reactome pathway database."
 49 | #'
 50 | #' \strong{miRNA targets} (a.k.a. "C3:MIR" in MSIGDB): "Gene sets that contain genes
 51 | #' sharing putative target sites (seed matches) of human mature miRNA in their 3'-UTRs."
 52 | #'
 53 | #' \strong{TF targets} (a.k.a. "C3:TFT" in MSIGDB): "Gene sets that share upstream
 54 | #'  cis-regulatory motifs which can function as potential transcription factor
 55 | #'   binding sites. Based on work by Xie et al. 2005"
 56 | #'
 57 | #' \strong{Cancer gene neighborhoods} (a.k.a. "C4:CGN" in MSIGDB): "Gene sets defined
 58 | #'  by expression neighborhoods centered on 380 cancer-associated genes. This collection
 59 | #'  is described in Subramanian, Tamayo et al. 2005"
 60 | #'
 61 | #' \strong{Cancer modules} (a.k.a. "C4:CGN" in MSIGDB): "Gene sets defined by Segal
 62 | #'  et al. 2004. Briefly, the authors compiled gene sets ('modules') from a variety of
 63 | #'  resources such as KEGG, GO, and others. By mining a large compendium of cancer-related
 64 | #'   microarray data, they identified 456 such modules as significantly changed in a variety
 65 | #'    of cancer conditions."
 66 | #'
 67 | #' \strong{GO:BP} (a.k.a. "C5:BP" in MSIGDB): "Gene sets derived from the GO Biological Process Ontology."
 68 | #'
 69 | #' \strong{GO:CC} (a.k.a. "C5:CC" in MSIGDB): "Gene sets derived from the GO Cellular Component Ontology."
 70 | #'
 71 | #' \strong{GO:MF} (a.k.a. "C5:MF" in MSIGDB): "Gene sets derived from the GO Molecular Function Ontology."
 72 | #'
 73 | #' \strong{Oncogenic signatures} (a.k.a. "C6" in MSIGDB): "Gene sets that represent signatures of cellular
 74 | #'  pathways which are often dis-regulated in cancer. The majority of signatures were generated directly
 75 | #'  from microarray data from NCBI GEO or from internal unpublished profiling experiments involving perturbation
 76 | #'  of known cancer genes."
 77 | #'
 78 | #' \strong{Immunological signatures} (a.k.a. "C7" in MSIGDB): "Gene sets that represent cell states and
 79 | #' perturbations within the immune system. The signatures were generated by manual curation of
 80 | #' published studies in human and mouse immunology."
 81 | #'
 82 | #' \strong{Cell Type signatures} (a.k.a. "C8" in MSIGDB): "Gene sets that contain curated cluster
 83 | #'  markers for cell types identified in single-cell sequencing studies of human tissue."
 84 | #'
 85 | #' \strong{simple}: This is the combination of "Hallmark", "Perturbations",
 86 | #' "BioCarta", "GO:BP", "GO:CC", "GO:MF", "KEGG", "Canonical pathways", "PID", and "Reactome"
 87 | #'
 88 | #' \strong{complex}: This includes all possible gene sets.
 89 | #'
 90 | #' @examples
 91 | #' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple")
 92 | #' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = c("Hallmark", "KEGG"))
 93 | #'
 94 | #' @importFrom rlang .data
 95 | #' @import dplyr
 96 | #' @import tibble
 97 | #' @import tidyr
 98 | #'
 99 | #' @export
100 | getTERM2GENE <- function(GSEA_Type = c("simple"),
101 |                          Species = c("hsapiens", "mmusculus"),
102 |                          sampler = FALSE, listReturn = FALSE) {
103 | 
104 |   # Species = "hsapiens"
105 |   # GSEA_Type = "simple"
106 |   # sampler = FALSE
107 | 
108 |   if (Species[1] == "hsapiens") {
109 |     msigSpec <- "Homo sapiens"
110 |   } else {
111 |     msigSpec <- "Mus musculus"
112 |   }
113 | 
114 |   # Get data object
115 |   MDFraw <- msigdbr::msigdbr(species = msigSpec)
116 |   if (listReturn) {
117 |     MDFThin <- MDF[,c(1, 8)]
118 |     cats <- paste0(MDF$gs_cat, ":", MDF$gs_subcat)
119 |     MDFL <- split(MDFThin, f = cats)
120 |     names(MDFL) <- gsub(names(MDFL), pattern = "(.+):$", replacement = "\\1")
121 |     return(MDFL)
122 |   }
123 | 
124 |   MDF <- MDFraw
125 |   MDF$gs_subcat <- gsub(MDF$gs_subcat, pattern = "CP:", replacement = "", perl = TRUE)
126 |   MDF$gs_cat <- paste0(MDF$gs_cat, ":", MDF$gs_subcat)
127 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = ":$", replacement = "", perl = TRUE)
128 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C1", replacement = "Cytogenic bands", perl = TRUE)
129 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C6", replacement = "Oncogenic signatures", perl = TRUE)
130 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C7", replacement = "Immunological signatures", perl = TRUE)
131 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C8", replacement = "Cell Type signatures", perl = TRUE)
132 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C2:", replacement = "", perl = TRUE)
133 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C5:", replacement = "", perl = TRUE)
134 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "H$", replacement = "Hallmark", perl = TRUE)
135 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "CP", replacement = "Canonical pathways", perl = TRUE)
136 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "CGP", replacement = "Perturbations", perl = TRUE)
137 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C4:CGN", replacement = "Cancer gene neighborhoods", perl = TRUE)
138 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C4:CM", replacement = "Cancer modules", perl = TRUE)
139 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C3:MIR:MIRDB", replacement = "miRNA targets", perl = TRUE)
140 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "C3:TFT:GTRD", replacement = "TF targets", perl = TRUE)
141 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "BIOCARTA", replacement = "BioCarta", perl = TRUE)
142 |   MDF$gs_cat <- gsub(MDF$gs_cat, pattern = "REACTOME", replacement = "Reactome", perl = TRUE)
143 | 
144 |   # Filter for pathways of interest
145 |   optionsNow <- c("simple", "complex", unique(MDF$gs_cat))
146 |   if (! all(GSEA_Type %in% optionsNow)) {
147 |     stop("\nPlease enter a valid GSEA_Type. Use ?getTERM2GENE to see available options.\n")
148 |   }
149 | 
150 |   categories <- c()
151 |   if ("simple" %in% GSEA_Type) {
152 |     categories <- c(categories, "Hallmark", "Perturbations", "BioCarta",
153 |                     "GO:BP", "KEGG", "Canonical pathways", "Reactome", "GO:MF", "GO:CC", "PID")
154 |   }
155 | 
156 |   if ("complex" %in% GSEA_Type) {
157 |     categories <- c(categories, optionsNow)
158 |   }
159 | 
160 |   categories <- unique(c(categories, GSEA_Type))
161 |   TERM2GENE <- MDF %>%
162 |     filter(.data$gs_cat %in% categories) %>%
163 |     select(.data$gs_name, .data$gene_symbol)
164 | 
165 |   if (sampler) {
166 |     print("Using sampler!")
167 |     set.seed(1)
168 |     TERM2GENE <- TERM2GENE[sample(nrow(TERM2GENE), size = 100000),]
169 |   }
170 | 
171 |   return(TERM2GENE)
172 | }
173 | 
174 | 
175 | 


--------------------------------------------------------------------------------
/misc/FigS8.R:
--------------------------------------------------------------------------------
  1 | # Script used to generate Figure S7
  2 | # It assumes you've downloaded the correlation matrix from ARCHS4: https://s3.amazonaws.com/mssm-seq-matrix/human_correlation.rda
  3 | # It also assumes you have downloaded and unzipped the correlations from COXPRESSdb: https://coxpresdb.jp/download/Hsa-u.c2-0/coex/
  4 | # It also assumes you have downloaded and decompressed the correlations from GeneFriends: "Pearson correlation genes" in https://genefriends.org/RNAseq/about/
  5 | # It also assumes you have used BioGRiD on BRCA1, AURKB, and HSP90AA1 and downloaded the interactors as a CSV.
  6 | 
  7 | # Get libraries / annotations
  8 | library(PerformanceAnalytics)
  9 | library(org.Hs.eg.db)
 10 | library(tidyverse)
 11 | library(VennDiagram)
 12 | library(biomaRt)
 13 | aa <- listAttributes(ensembl)
 14 | entrez2gene <- getBM()
 15 | source("https://raw.githubusercontent.com/Bishop-Laboratory/Ewing-sarcoma-paper-Miller-2020/master/helpers_v2.R")
 16 | 
 17 | ## BRCA1 cor compare ##
 18 | # Get the ARCHS4 correlation data
 19 | load("Data/human_correlation.rda")
 20 | BRCA1_ARCHS4 <- cc["BRCA1",]
 21 | BRCA1_ARCHS4 <- data.frame(geneName = names(BRCA1_ARCHS4),
 22 |                            ARCHS4 = BRCA1_ARCHS4)
 23 | 
 24 | # Get the COXPRESdb data
 25 | file <- list.files("Data/Hsa-u.v18-12.G26050-S164823.combat_pca_subagging.mrgeo.d/",
 26 |                    pattern = "^672$", full.names = T)
 27 | BRCA1_COXPR <- read.table(file)
 28 | info4gene = select(org.Hs.eg.db, keys = as.character(BRCA1_COXPR$V1),
 29 |                    columns =  c("SYMBOL"))
 30 | BRCA1_COXPR <- data.frame(geneName = info4gene$SYMBOL, COXPRESdb = rev(BRCA1_COXPR$V2))
 31 | 
 32 | # Get the GeneFriends data
 33 | file <- list.files("Data/home/nis/priyanka/work/GF_2019_Uploads/Human/HumanGenes_Correlation/",
 34 |                    pattern = "ENSG00000012048", full.names = T)
 35 | BRCA1_GF <- read.table(file)
 36 | BRCA1_GF$ENSEMBL <- rownames(BRCA1_GF)
 37 | info4gene = select(org.Hs.eg.db, keys = as.character(rownames(BRCA1_GF)),
 38 |                    columns =  c("SYMBOL"), keytype = "ENSEMBL")
 39 | BRCA1_GF <- merge(x = info4gene, y = BRCA1_GF, by = "ENSEMBL")
 40 | BRCA1_GF <- unique(BRCA1_GF[which(! is.na(BRCA1_GF$SYMBOL)),c(-1)])
 41 | BRCA1_GF <- BRCA1_GF[order(BRCA1_GF[,2], decreasing = TRUE),]
 42 | colnames(BRCA1_GF) <- c("geneName", "GeneFriends")
 43 | 
 44 | corCompare <- merge(x = BRCA1_ARCHS4, y = BRCA1_COXPR, by = "geneName")
 45 | corCompare <- merge(x = corCompare, y = BRCA1_GF, by = "geneName")
 46 | 
 47 | BRCA1_me <- correlationAnalyzeR::getCorrelationData(Species = "hsapiens", Sample_Type = "normal",
 48 |                                                     Tissue = "all", geneList = "BRCA1")
 49 | BRCA1_me$geneName <- rownames(BRCA1_me)
 50 | colnames(BRCA1_me)[1] <- "correlationAnalyzeR"
 51 | corCompare <- merge(x = BRCA1_me, y = corCompare, by = "geneName")
 52 | 
 53 | ## Compare BRCA1 top 100 to biogrid
 54 | biogrid <- read_tsv("misc/BIOGRID-GENE-107140-4.1.190.tab3.txt")
 55 | 
 56 | top50 <- BRCA1_me %>%
 57 |   top_n(wt = BRCA1, n = 500)
 58 | 
 59 | interacts <- biogrid %>%
 60 |   select(`Official Symbol Interactor B`) %>%
 61 |   distinct()
 62 | 
 63 | vl <- list(
 64 |   "Co-expression" = top50$geneName,
 65 |   "Interaction" = interacts$`Official Symbol Interactor B`
 66 | )
 67 | ol <- calculate.overlap(vl)
 68 | calculate.overlap.and.pvalue(list1 = vl$`Co-expression`, list2 = vl$Interaction,
 69 |                              total.size = unique(length(BRCA1_me$geneName)),
 70 |                              lower.tail = FALSE)
 71 | vd <- venn.diagram(vl, filename = NULL, fill = c("forestgreen", "firebrick"),
 72 |                    margin = .05)
 73 | dev.off()
 74 | grid.draw(vd)
 75 | 
 76 | 
 77 | ## AURKB cor compare ##
 78 | 
 79 | # ARCHS4
 80 | AURKB_ARCHS4 <- cc["AURKB",]
 81 | AURKB_ARCHS4 <- data.frame(geneName = names(AURKB_ARCHS4),
 82 |                           ARCHS4 = AURKB_ARCHS4)
 83 | # COXPRESSdb
 84 | file <- list.files("Data/Hsa-u.v18-12.G26050-S164823.combat_pca_subagging.mrgeo.d/",
 85 |                    pattern = "^9212$", full.names = T)
 86 | AURKB_COXPR <- read.table(file)
 87 | info4gene = select(org.Hs.eg.db, keys = as.character(AURKB_COXPR$V1),
 88 |                    columns =  c("SYMBOL"))
 89 | AURKB_COXPR <- data.frame(geneName = info4gene$SYMBOL, COXPRESdb = rev(AURKB_COXPR$V2))
 90 | 
 91 | # GeneFriends
 92 | file <- list.files("Data/home/nis/priyanka/work/GF_2019_Uploads/Human/HumanGenes_Correlation/",
 93 |                    pattern = "ENSG00000178999", full.names = T)
 94 | AURKB_GF <- read.table(file)
 95 | AURKB_GF$ENSEMBL <- rownames(AURKB_GF)
 96 | info4gene = select(org.Hs.eg.db, keys = as.character(rownames(AURKB_GF)),
 97 |                    columns =  c("SYMBOL"), keytype = "ENSEMBL")
 98 | AURKB_GF <- merge(x = info4gene, y = AURKB_GF, by = "ENSEMBL")
 99 | AURKB_GF <- unique(AURKB_GF[which(! is.na(AURKB_GF$SYMBOL)),c(-1)])
100 | AURKB_GF <- AURKB_GF[order(AURKB_GF[,2], decreasing = TRUE),]
101 | colnames(AURKB_GF) <- c("geneName", "GeneFriends")
102 | 
103 | corCompare <- merge(x = AURKB_ARCHS4, y = AURKB_COXPR, by = "geneName")
104 | corCompare <- merge(x = corCompare, y = AURKB_GF, by = "geneName")
105 | 
106 | # correlationAnalyzeR
107 | AURKB_me <- correlationAnalyzeR::getCorrelationData(Species = "hsapiens", Sample_Type = "normal",
108 |                                                    Tissue = "all", geneList = "AURKB")
109 | AURKB_me$geneName <- rownames(AURKB_me)
110 | colnames(AURKB_me)[1] <- "correlationAnalyzeR"
111 | corCompare <- merge(x = AURKB_me, y = corCompare, by = "geneName")
112 | 
113 | # Plot
114 | chart.Correlation(corCompare[,c(-1)],   method = "spearman")
115 | 
116 | ## Compare AURKB top 100 to biogrid
117 | biogrid <- read_tsv("misc/BIOGRID-GENE-114646-4.1.190.tab3.txt")
118 | 
119 | top50 <- AURKB_me %>%
120 |   top_n(wt = AURKB, n = 500)
121 | 
122 | interacts <- biogrid %>%
123 |   select(`Official Symbol Interactor B`) %>%
124 |   distinct()
125 | vl <- list(
126 |   "Co-expression" = top50$geneName,
127 |   "Interaction" = interacts$`Official Symbol Interactor B`
128 | )
129 | ol <- calculate.overlap(vl)
130 | calculate.overlap.and.pvalue(list1 = vl$`Co-expression`, list2 = vl$Interaction,
131 |                              total.size = unique(length(BRCA1_me$geneName)),
132 |                              lower.tail = FALSE)
133 | vd <- venn.diagram(vl, filename = NULL, fill = c("forestgreen", "firebrick"),
134 |                    margin = .05)
135 | 
136 | dev.off()
137 | grid.draw(vd)
138 | 
139 | 
140 | ## HSP90AA1 ##
141 | # ARCHS4
142 | HSP90AA1_ARCHS4 <- cc["HSP90AA1",]
143 | HSP90AA1_ARCHS4 <- data.frame(geneName = names(HSP90AA1_ARCHS4),
144 |                               ARCHS4 = HSP90AA1_ARCHS4)
145 | # COXPRESDB
146 | file <- list.files("Data/Hsa-u.v18-12.G26050-S164823.combat_pca_subagging.mrgeo.d/",
147 |                    pattern = "^3320$", full.names = T)
148 | HSP90AA1_COXPR <- read.table(file)
149 | info4gene = select(org.Hs.eg.db, keys = as.character(HSP90AA1_COXPR$V1),
150 |                    columns =  c("SYMBOL"))
151 | HSP90AA1_COXPR <- data.frame(geneName = info4gene$SYMBOL, COXPRESdb = rev(HSP90AA1_COXPR$V2))
152 | 
153 | # Get the GeneFriends data
154 | file <- list.files("Data/home/nis/priyanka/work/GF_2019_Uploads/Human/HumanGenes_Correlation/",
155 |                    pattern = "ENSG00000080824", full.names = T)
156 | HSP90AA1_GF <- read.table(file)
157 | HSP90AA1_GF$ENSEMBL <- rownames(HSP90AA1_GF)
158 | info4gene = select(org.Hs.eg.db, keys = as.character(rownames(HSP90AA1_GF)),
159 |                    columns =  c("SYMBOL"), keytype = "ENSEMBL")
160 | HSP90AA1_GF <- merge(x = info4gene, y = HSP90AA1_GF, by = "ENSEMBL")
161 | HSP90AA1_GF <- unique(HSP90AA1_GF[which(! is.na(HSP90AA1_GF$SYMBOL)),c(-1)])
162 | HSP90AA1_GF <- HSP90AA1_GF[order(HSP90AA1_GF[,2], decreasing = TRUE),]
163 | colnames(HSP90AA1_GF) <- c("geneName", "GeneFriends")
164 | 
165 | corCompare <- merge(x = HSP90AA1_ARCHS4, y = HSP90AA1_COXPR, by = "geneName")
166 | corCompare <- merge(x = corCompare, y = HSP90AA1_GF, by = "geneName")
167 | 
168 | # correlationAnalyzeR
169 | HSP90AA1_me <- correlationAnalyzeR::getCorrelationData(Species = "hsapiens", Sample_Type = "normal",
170 |                                                        Tissue = "all", geneList = "HSP90AA1")
171 | HSP90AA1_me$geneName <- rownames(HSP90AA1_me)
172 | colnames(HSP90AA1_me)[1] <- "correlationAnalyzeR"
173 | corCompare <- merge(x = HSP90AA1_me, y = corCompare, by = "geneName")
174 | 
175 | # Plot
176 | chart.Correlation(corCompare[,c(-1)],   method = "spearman")
177 | 
178 | 
179 | ## Compare HSP90AA1 top 100 to biogrid
180 | biogrid <- read_tsv("misc/BIOGRID-GENE-109552-4.1.190.tab3.txt")
181 | 
182 | top50 <- HSP90AA1_me %>%
183 |   top_n(wt = HSP90AA1, n = 500)
184 | 
185 | interacts <- biogrid %>%
186 |   select(`Official Symbol Interactor B`) %>%
187 |   distinct()
188 | vl <- list(
189 |   "Co-expression" = top50$geneName,
190 |   "Interaction" = interacts$`Official Symbol Interactor B`
191 | )
192 | ol <- calculate.overlap(vl)
193 | calculate.overlap.and.pvalue(list1 = vl$`Co-expression`, list2 = vl$Interaction,
194 |                              total.size = unique(length(BRCA1_me$geneName)),
195 |                              lower.tail = FALSE)
196 | vd <- venn.diagram(vl, filename = NULL, fill = c("forestgreen", "firebrick"),
197 |                    margin = .05)
198 | 
199 | dev.off()
200 | grid.draw(vd)
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/helpers.R:
--------------------------------------------------------------------------------
  1 | # Categorizes Bulk RNA-Seq metadata using custom regex dictionary
  2 | categorizeMetaData <- function(metadata, cols, dictionary) {
  3 |   for (i in 1:length(names(dictionary))) {
  4 |     termNow <- names(dictionary)[i]
  5 |     dictNow <- dictionary[[i]]
  6 |     posInd <- c()
  7 |     negInd <- c()
  8 |     metadata$newColNow <- 0
  9 |     if ("yes" %in% names(dictNow)) {
 10 |       for (j in 1:length(cols)) {
 11 |         colNow <- cols[j]
 12 |         colIndNow <- which(colnames(metadata) == colNow)
 13 |         posInd <- c(posInd, grep(x = metadata[,colIndNow], 
 14 |                                  pattern = paste0(dictNow$yes, collapse = "|"), 
 15 |                                  perl = T, ignore.case = T))
 16 |         if (is.null(dictNow$no)) {
 17 |           negInd <- c(negInd)
 18 |         } else {
 19 |           negInd <- c(negInd, grep(x = metadata[,colIndNow], pattern = paste0(dictNow$no, collapse = "|"), perl = T, ignore.case = T))
 20 |         }
 21 |       }
 22 |       posInd <- unique(posInd[! posInd %in% negInd])
 23 |     } else {
 24 |       for (j in 1:length(cols)) {
 25 |         colNow <- cols[j]
 26 |         colIndNow <- which(colnames(metadata) == colNow)
 27 |         posInd <- c(posInd, grep(x = metadata[,colIndNow], pattern = paste0(dictNow, collapse = "|"), perl = T, ignore.case = T))
 28 |       }
 29 |     }
 30 |     metadata$newColNow[posInd] <- 1
 31 |     colnames(metadata)[which(colnames(metadata) == "newColNow")] <-  termNow
 32 |   }
 33 |   return(metadata)
 34 | }
 35 | 
 36 | # Finds number of samples gene is expressed in
 37 | nonZeroSamps <- function(row) {
 38 |   return(sum(row > 0))
 39 | }
 40 | 
 41 | # Pretty timestamp
 42 | timestamp2 <- function() {
 43 |   # Partially from https://stackoverflow.com/questions/1962278/dealing-with-timestamps-in-r
 44 |   now <- Sys.time()
 45 |   timeList <- unclass(as.POSIXlt(now))
 46 |   secNow <- ifelse(round(timeList$sec) < 10, paste0(0, round(timeList$sec)), round(timeList$sec))
 47 |   minNow <- ifelse(round(timeList$min) < 10, paste0(0, round(timeList$min)), round(timeList$min))
 48 |   paste0("[", timeList$hour,":",minNow,":",secNow,
 49 |          " ", (timeList$mon+1), "/", timeList$mday, "/", (timeList$year + 1900), "]", sep = "")
 50 | }
 51 | 
 52 | 
 53 | 
 54 | 
 55 | convertForUpload <- function(forUploadListPart) {
 56 |   listVec <- list()
 57 |   for (i in 1:length(forUploadListPart[,1])) {
 58 |     corr_frame_row <- forUploadListPart[i,]
 59 |     geneName <- as.character(corr_frame_row[1,1])
 60 |     corr_frame_row <- corr_frame_row[1,-1]
 61 |     vec <- signif(as.numeric(corr_frame_row),digits = 3)
 62 |     vecStr <- paste(vec, collapse = ",")
 63 |     listVec[[i]] <- data.frame(geneName = geneName, values = vecStr)
 64 |   }
 65 |   resDF <- rbindlist(listVec)
 66 |   return(resDF)
 67 | }
 68 | 
 69 | uploadToAzureSampleKey <- function(finalDF2= NULL, tableName,
 70 |                           credentials,
 71 |                           check = F) {
 72 |   
 73 |   # # bug testing
 74 |   # check = F
 75 |   # finalDF2 <- finalDF
 76 |   # tableName = "tpm_hsapiens"
 77 |   # credentials <- "Data/credFile.txt"
 78 |   
 79 |   
 80 |   require(DBI)
 81 |   
 82 |   # credentials <- "Data/credFile.txt"
 83 |   
 84 |   credentials <- suppressWarnings(read.delim(credentials, sep = ";",
 85 |                                              header = FALSE, stringsAsFactors = F))
 86 |   uName <- credentials$V1
 87 |   pWord <- credentials$V2
 88 |   conn <- dbConnect(drv = RMySQL::MySQL(), user = uName, 
 89 |                     port = 3306, dbname="correlation_analyzer",
 90 |                     password=pWord,
 91 |                     host="m2600az-db01p.mysql.database.azure.com")
 92 |   
 93 |   sql <- paste0("SELECT COUNT(*) FROM ", tableName)
 94 |   tabs <- dbListTables(conn)
 95 |   if(! tableName %in% tabs & check) {
 96 |     return(TRUE)
 97 |   } else if (! tableName %in% tabs) {
 98 |     countVal <- 0
 99 |   } else {
100 |     res <- dbSendQuery(conn, sql)
101 |     res <- dbFetch(res)
102 |     countVal <- res[1,1]
103 |   }
104 |   
105 |   if (! is.null(finalDF2)) {
106 |     n <- nrow(finalDF2)
107 |   } else {
108 |     countVal <- 1.34
109 |   }
110 |   
111 |   if (countVal != n) {
112 |     if (check) {
113 |       return(TRUE)
114 |     }
115 |     print("Making table ... ")
116 |     if (tableName %in% tabs) {
117 |       dbRemoveTable(conn, tableName)
118 |     }
119 |     seqN <- seq(from = 1, to = n, by = 100)
120 |     for ( j in 1:length(seqN)) {
121 |       msg <- paste0("\n", j , " out of ", length(seqN))
122 |       cat(msg)
123 |       start <- seqN[j]
124 |       if (j == length(seqN)) {
125 |         end <- n
126 |       } else {
127 |         end <- seqN[j+1]-1
128 |       }
129 |       uploadDF <- finalDF2[c(start:end), , drop=F]
130 |       rownames(uploadDF) <- rownames(finalDF2)[c(start:end)]
131 |       
132 |       # tryCatch({dbExecute(conn, sql)}, function(e) "hey")
133 |       # testQuery <- dbExecute(conn, sql)
134 |       done <- 0
135 |       while (done  == 0) {
136 |         done <- tryCatch(expr = {
137 |           dbWriteTable(conn = conn, tableName, uploadDF, row.names = T,
138 |                        field.types = c("row_names" = "VARCHAR(255)", "values" = "mediumtext"), 
139 |                        append = T)
140 |           1
141 |         },
142 |         error = function(e) {
143 |           cat("Fail -- retry with new connection")
144 |           rm(conn)
145 |           gc()
146 |           conn <- dbConnect(drv = RMySQL::MySQL(), user = uName, 
147 |                             port = 3306, dbname="correlation_analyzer",
148 |                             password=pWord,
149 |                             host="m2600az-db01p.mysql.database.azure.com")
150 |           0
151 |         })
152 |         print(done)
153 |       }
154 |     }
155 |     Sys.sleep(4)
156 |     # sql <- paste0("ALTER TABLE ", tableName, " ADD PRIMARY KEY (row_names);")
157 |     # dbExecute(conn = conn, statement = sql)
158 |   } else {
159 |     if (check) {
160 |       return(FALSE)
161 |     }
162 |     print("Table already filled ... continuing")
163 |   }
164 |   print("DONE")
165 |   if (length(dbListConnections(drv = RMySQL::MySQL())) != 0) {
166 |     lapply(dbListConnections(drv = RMySQL::MySQL()), dbDisconnect)
167 |   }
168 |     
169 | }
170 | 
# Same but with genes as the key: uploads a table to the Azure MySQL database
# "correlation_analyzer" in chunks of 100 rows. The row names of `finalDF2`
# are written to the `row_names` column of the destination table.
#
# Args:
#   finalDF2:    table to upload. May be NULL when only checking.
#   tableName:   name of the destination table.
#   credentials: path to a ';'-separated file whose first field is the user
#                name and whose second field is the password.
#   check:       if TRUE nothing is written; the function only reports whether
#                an upload is needed.
#   maxRetries:  attempts per chunk before the upload is aborted with an error.
#
# Returns, with check = TRUE, TRUE when the table is missing, its row count
# differs from nrow(finalDF2), or finalDF2 is NULL, and FALSE otherwise.
# With check = FALSE the table is rebuilt when the row counts differ and
# NULL is returned invisibly.
uploadToAzureGeneKey <- function(finalDF2= NULL, tableName,
                                   credentials,
                                   check = FALSE,
                                   maxRetries = 10) {

  # library() instead of require(): fail loudly when DBI is missing
  library(DBI)

  credentials <- suppressWarnings(read.delim(credentials, sep = ";",
                                             header = FALSE,
                                             stringsAsFactors = FALSE))
  uName <- credentials$V1
  pWord <- credentials$V2
  openConnection <- function() {
    dbConnect(drv = RMySQL::MySQL(), user = uName,
              port = 3306, dbname = "correlation_analyzer",
              password = pWord,
              host = "m2600az-db01p.mysql.database.azure.com")
  }
  conn <- openConnection()
  # As before, every open MySQL connection is closed when the function is
  # done; on.exit() extends that to early returns and errors.
  on.exit(lapply(dbListConnections(drv = RMySQL::MySQL()), dbDisconnect),
          add = TRUE)

  tabs <- dbListTables(conn)
  tableExists <- tableName %in% tabs
  if (! tableExists && check) {
    return(TRUE)
  }
  if (tableExists) {
    sql <- paste0("SELECT COUNT(*) FROM ", dbQuoteIdentifier(conn, tableName))
    # dbGetQuery() fetches and clears the result set in one step
    countVal <- as.numeric(dbGetQuery(conn, sql)[1, 1])
  } else {
    countVal <- 0
  }

  if (is.null(finalDF2)) {
    # Nothing to compare against: report "upload needed" or refuse to upload
    if (check) {
      return(TRUE)
    }
    stop("'finalDF2' must be supplied when check = FALSE.")
  }
  n <- nrow(finalDF2)

  if (countVal == n) {
    if (check) {
      return(FALSE)
    }
    print("Table already filled ... continuing")
    print("DONE")
    return(invisible(NULL))
  }
  if (check) {
    return(TRUE)
  }

  print("Making table ... ")
  if (tableExists) {
    dbRemoveTable(conn, tableName)
  }
  # seq() with by = 100 errors for n == 0, so guard the empty table
  chunkStarts <- if (n > 0) seq(from = 1, to = n, by = 100) else integer(0)
  for (j in seq_along(chunkStarts)) {
    cat(paste0("\n", j , " out of ", length(chunkStarts)))
    rowsNow <- chunkStarts[j]:min(chunkStarts[j] + 99, n)
    uploadDF <- finalDF2[rowsNow, , drop = FALSE]
    rownames(uploadDF) <- rownames(finalDF2)[rowsNow]

    attempt <- 0
    repeat {
      attempt <- attempt + 1
      failure <- tryCatch({
        dbWriteTable(conn = conn, name = tableName, value = uploadDF,
                     row.names = TRUE,
                     field.types = c("row_names" = "VARCHAR(255)",
                                     "values" = "mediumtext"),
                     append = TRUE)
        NULL
      }, error = function(e) e)
      if (is.null(failure)) {
        break
      }
      if (attempt >= maxRetries) {
        stop("Upload of chunk ", j, " to '", tableName, "' failed after ",
             attempt, " attempts: ", conditionMessage(failure))
      }
      cat("Fail -- retry with new connection")
      # Replace the connection in THIS frame so the retry really uses it.
      # NOTE(review): a chunk that failed half-way may be partly written;
      # confirm whether duplicates are possible with this driver.
      try(dbDisconnect(conn), silent = TRUE)
      conn <- openConnection()
    }
  }
  Sys.sleep(4)
  print("DONE")
  invisible(NULL)
}
273 | 
# TPM upload table.
#
# For every sample, transcript-level expression is read from the HDF5 dataset
# "data/expression", summed per gene via `tx2gene`, rounded to 4 significant
# digits and collapsed into one comma-separated string (genes in the order
# produced by aggregate()).
#
# Args:
#   tx2gene:     data frame with the columns `tx` and `gene`.
#   transcripts: transcript IDs; their positions are used as indices into the
#                first dimension of "data/expression".
#   dataFile:    path to the HDF5 file.
#   samplesInd:  named vector of sample indices into the second dimension of
#                "data/expression"; the names are used as sample names.
#
# Returns a data.table with the columns `sampleName` and `values`, one row
# per sample (empty when no transcript has a gene mapping).
makeUploadTable <- function(tx2gene, transcripts, 
                            dataFile, samplesInd) {
  # The set of mapped transcripts does not depend on the sample, so it is
  # determined once instead of inside the loop.
  transcriptsInd <- which(transcripts %in% tx2gene$tx)
  if (length(transcriptsInd) == 0) {
    return(data.table::rbindlist(list()))
  }
  mappedTranscripts <- transcripts[transcriptsInd]

  resList <- vector("list", length(samplesInd))
  for (j in seq_along(samplesInd)) {
    sample <- names(samplesInd)[j]
    sampleInd <- samplesInd[j]
    print(j)
    print(sample)
    H5close()
    expression <- as.data.frame(h5read(dataFile, "data/expression", 
                         index = list(transcriptsInd, sampleInd)))
    H5close()
    expression$tx <- mappedTranscripts
    expressionMerge <- merge(x = expression, y = tx2gene, by = "tx")
    geneExp <- aggregate(
      expressionMerge$V1, by = list(expressionMerge$gene), FUN = sum
    )
    vec <- signif(as.numeric(geneExp$x), digits = 4)
    vecStr <- paste(vec, collapse = ",")
    resList[[j]] <- data.frame(sampleName = sample, values = vecStr)
  }
  data.table::rbindlist(resList)
}
309 | 
310 | 
# VSD and TPM upload table.
#
# Turns an expression table (one row per gene) into a two-column data frame:
# `geneName` holds the row names of `upNow`, `values` holds each row's
# entries rounded to 4 significant digits and joined with commas.
makeUploadTableExp <- function(upNow) {
  collapseRow <- function(expressionRow) {
    rounded <- signif(as.numeric(expressionRow), digits = 4)
    paste(rounded, collapse = ",")
  }
  valueStrings <- apply(upNow, MARGIN = 1, FUN = collapseRow)
  data.frame(geneName = rownames(upNow), values = valueStrings)
}
322 | 
323 | 


--------------------------------------------------------------------------------
/R/myGSEA.R:
--------------------------------------------------------------------------------
  1 | #' Wrapper for clusterProlifer's GSEA()
  2 | #'
  3 | #' Runs GSEA() from clusterProfiler and creates useful
  4 | #' visualizations.
  5 | #'
  6 | #' @param ranks Numeric of gene 'scores' ordered by decreasing value and
  7 | #'     named with gene symbols.
  8 | #' @param TERM2GENE Data frame with two columns: gene set identifiers and
  9 | #' gene symbols. Can be generated using correlationAnalyzeR::getTERM2GENE()
 10 | #' @param plotFile prefix to use for naming output files.
 11 | #' @param outDir output directory.
 12 | #' @param Condition Name to use for titles of plots. Default = "GSEA Results".
 13 | #' @param nperm Number of permutations to run. Default is 2000
 14 | #' @param padjustedCutoff Value to use as a cutoff for returned gene sets.
 15 | #' @param returnDataOnly Should GSEA data/plots be saved to file? Default: TRUE
 16 | #' @param topPlots Should top GSEA pathways be plotted? Default: FALSE
 17 | #'
 18 | #' @return Named list containing GSEA() output, GSEA data frame, and visualizations.
 19 | #'
 20 | #' @examples
 21 | #' corrDF <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = c("BRCA1"),
 22 | #'                                                   returnDataOnly = TRUE,
 23 | #'                                                   runGSEA = FALSE,
 24 | #'                                                   Sample_Type = "normal")
 25 | #' ranks <- corrDF$correlations[,1]
 26 | #' names(ranks) <- rownames(corrDF$correlations)
 27 | #' TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple",
 28 | #'                                                Species = "hsapiens")
 29 | #' res <- correlationAnalyzeR::myGSEA(ranks = ranks,
 30 | #'                                    TERM2GENE = TERM2GENE,
 31 | #'                                    plotFile = "GSEA_out", outDir = getwd(),
 32 | #'                                    topPlots = FALSE, returnDataOnly=TRUE,
 33 | #'                                     Condition = "GSEA Results")
 34 | #'
 35 | #' @import dplyr
 36 | #' @import clusterProfiler
 37 | #'
 38 | #' @export
 39 | myGSEA <- function(ranks,
 40 |                    TERM2GENE,
 41 |                    padjustedCutoff = .05,
 42 |                    returnDataOnly = TRUE,
 43 |                    nperm = 2000,
 44 |                    topPlots = FALSE,
 45 |                    outDir,
 46 |                    Condition = "GSEA Results",
 47 |                    plotFile = "GSEA_results") {
 48 | 
 49 | 
 50 |   # # Bug testing
 51 |   # padjustedCutoff = .05
 52 |   # topPlots = FALSE
 53 |   # nperm = 2000
 54 |   # corrDF <- correlationAnalyzeR::analyzeSingleGenes(genesOfInterest = c("ATM"),
 55 |   #                                                   returnDataOnly = TRUE,
 56 |   #                                                   runGSEA = FALSE,
 57 |   #                                                   Sample_Type = "normal")
 58 |   # ranks <- corrDF$correlations[,1]
 59 |   # names(ranks) <- rownames(corrDF$correlations)
 60 |   # TERM2GENE <- correlationAnalyzeR::getTERM2GENE(GSEA_Type = "simple",
 61 |   #                                                Species = "hsapiens")
 62 | 
 63 |   resList <- list()
 64 |   ranks <- ranks[which(! duplicated(names(ranks)))]
 65 |   ranks <- ranks[which(! is.na(ranks))]
 66 |   ranks <- ranks[order(ranks, decreasing = TRUE)]
 67 | 
 68 |   EGMT <- GSEA2(TERM2GENE = TERM2GENE, ranks = ranks, nproc = 1,
 69 |                 nperm = nperm, pvalueCutoff = padjustedCutoff)
 70 | 
 71 |   resGSEA <- as.data.frame(EGMT)
 72 | 
 73 |   resList[["EGMT"]] <- EGMT
 74 | 
 75 |   if (length(resGSEA$ID) < 10){
 76 |     warning(paste0("GSEA Failed -- No significant pathways at designated pValue: ",
 77 |                    padjustedCutoff, ". Rerunning with higher pValue."))
 78 |     EGMT <- GSEA2(TERM2GENE = TERM2GENE, ranks = ranks, nproc = 1,
 79 |                   nperm = nperm, pvalueCutoff = padjustedCutoff + .15)
 80 |     resGSEA <- as.data.frame(EGMT)
 81 |     resList[["EGMT"]] <- EGMT
 82 |   }
 83 |   if (length(resGSEA$ID) < 10){
 84 |     warning(paste0("GSEA Failed -- No significant pathways at designated pValue: ",
 85 |                    padjustedCutoff, ". Rerunning with higher pValue."))
 86 |     EGMT <- GSEA2(TERM2GENE = TERM2GENE, ranks = ranks, nproc = 1,
 87 |                   nperm = nperm, pvalueCutoff = padjustedCutoff + .45)
 88 |     resGSEA <- as.data.frame(EGMT)
 89 |     resList[["EGMT"]] <- EGMT
 90 |   }
 91 |   if (length(resGSEA$ID) < 10){
 92 |     stop(paste0("GSEA Failed -- No significant pathways at designated pValue: ",
 93 |                 padjustedCutoff, ". Please check your data. If you believe this ",
 94 |                 "behavior is a bug, please contact the package maintainer."))
 95 |   }
 96 |   if (topPlots) {
 97 |     resGSEA <- resGSEA[order(resGSEA$NES, decreasing = TRUE),]
 98 |     topUP <- resGSEA$ID[1:10]
 99 |     resGSEA <- resGSEA[order(resGSEA$NES, decreasing = FALSE),]
100 |     topDOWN <- resGSEA$ID[1:10]
101 |     resGSEA <- resGSEA[order(resGSEA$pvalue),]
102 |     plUP <- list()
103 |     plDOWN <- list()
104 |     for ( i in 1:6 ) {
105 |       pathway <- topUP[i]
106 |       if (nchar(pathway) > 35) {
107 |         pathTitle <- paste0(substr(pathway, 1, 30), "...")
108 |       } else {
109 |         pathTitle <- pathway
110 |       }
111 |       gp <- clusterProfiler::gseaplot(EGMT, pathway, title = NULL)
112 |       gp <- gp + ggplot2::labs(title = pathTitle,
113 |                                subtitle = paste0("Enrichment score: ",
114 |                                                  round(resGSEA$NES[which(
115 |                                                    resGSEA$ID == pathway
116 |                                                  )], 3)))
117 |       gg <- ggplot2::theme_classic()
118 |       gp <-  gp + ggplot2::theme(plot.title = gg[["plot.title"]],
119 |                                  plot.subtitle = gg[["plot.subtitle"]],
120 |                                  plot.margin = gg[["plot.margin"]])
121 |       if ( i == 1) {
122 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(45, 20, 20, 45))
123 |       } else if (i == 2) {
124 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(45, 20, 20, 20))
125 |       } else if (i == 3) {
126 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(45, 45, 20, 20))
127 |       } else if (i == 4) {
128 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(20, 20, 45, 45))
129 |       } else if (i == 5) {
130 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(20, 20, 45, 20))
131 |       } else if (i == 6) {
132 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(20, 45, 45, 20))
133 |       }
134 |       plUP[[i]] <- gp
135 | 
136 |       pathway <- topDOWN[i]
137 |       if (nchar(pathway) > 35) {
138 |         pathTitle <- paste0(substr(pathway, 1, 30), "...")
139 |       } else {
140 |         pathTitle <- pathway
141 |       }
142 |       gp <- clusterProfiler::gseaplot(EGMT, pathway, title = NULL)
143 |       gp <- gp + ggplot2::labs(title = pathTitle,
144 |                                subtitle = paste0("Enrichment score: ",
145 |                                                  round(resGSEA$NES[which(
146 |                                                    resGSEA$ID == pathway
147 |                                                  )], 3)))
148 |       gg <- ggplot2::theme_classic()
149 |       gp <-  gp + ggplot2::theme(plot.title = gg[["plot.title"]],
150 |                                  plot.subtitle = gg[["plot.subtitle"]],
151 |                                  plot.margin = gg[["plot.margin"]])
152 |       if (i == 1) {
153 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(45, 20, 20, 45))
154 |       } else if (i == 2) {
155 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(45, 20, 20, 20))
156 |       } else if (i == 3) {
157 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(45, 45, 20, 20))
158 |       } else if (i == 4) {
159 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(20, 20, 45, 45))
160 |       } else if (i == 5) {
161 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(20, 20, 45, 20))
162 |       } else if (i == 6) {
163 |         gp <- gp + ggplot2::theme(plot.margin = ggplot2::margin(20, 45, 45, 20))
164 |       }
165 |       plDOWN[[i]] <- gp
166 | 
167 |     }
168 | 
169 |     gaUP <- ggpubr::ggarrange(plotlist = plUP, nrow = 2, ncol = 3)
170 |     gaUP <- ggpubr::annotate_figure(gaUP,
171 |                                     top = ggpubr::text_grob(paste0(
172 |                                       "Top Over-Expressed Pathways in ", Condition
173 |                                     ),
174 |                                     size = 35)
175 |     )
176 |     resList[["GSEA_up"]] <- gaUP
177 |     if (! returnDataOnly) {
178 |       ggplot2::ggsave(plot = gaUP,
179 |                       filename = file.path(outDir, paste0(plotFile, "_topPathwaysUP.png")),
180 |                       height = 14, width = 20)
181 |     }
182 | 
183 | 
184 |     gaDOWN <- ggpubr::ggarrange(plotlist = plDOWN, nrow = 2, ncol = 3)
185 |     gaDOWN <- ggpubr::annotate_figure(gaDOWN,
186 |                                       top = ggpubr::text_grob(paste0(
187 |                                         "Top Under-Expressed Pathways in ", Condition
188 |                                       ),
189 |                                       size = 35)
190 |     )
191 | 
192 |     if (! returnDataOnly) {
193 |       ggplot2::ggsave(plot = gaDOWN,
194 |                       filename = file.path(outDir, paste0(plotFile, "_topPathwaysDOWN.png")),
195 |                       height = 14, width = 20)
196 |     }
197 |     resList[["GSEA_down"]] <- gaDOWN
198 | 
199 |   }
200 | 
201 |   if (! returnDataOnly) {
202 |     data.table::fwrite(x = resGSEA, file = file.path(outDir,
203 |                                                      paste0(plotFile,
204 |                                                             "_GSEA.csv")))
205 |   }
206 | 
207 |   resList[["eres"]] <- resGSEA
208 | 
209 |   cat("\nReturning ... ", names(resList), "\n")
210 | 
211 |   return(resList)
212 | }
213 | 
214 | 
215 | 
# Modified GSEA from clusterProfiler. Reduces computational time.
#
# Runs fgsea::fgsea() on the ranked gene list (gene sets of 15 to 500 genes),
# keeps the gene sets whose p-value and adjusted p-value are both at or below
# `pvalueCutoff`, and wraps them in a "gseaResult" object.
#
# Args:
#   TERM2GENE:    data frame with the columns `gs_name` and `gene_symbol`.
#   ranks:        named numeric vector of gene scores.
#   nperm:        number of permutations passed to fgsea.
#   nproc:        number of processes, or "auto" to use all detected cores.
#   pvalueCutoff: cutoff applied to both `pval` and `padj`.
#
# Returns a "gseaResult" object with the results ordered by p-value.
GSEA2 <- function(TERM2GENE, ranks,
                  nperm = 2000, nproc = "auto",
                  pvalueCutoff = .05) {
  if (identical(nproc, "auto")) {
    nproc <- parallel::detectCores()
    # detectCores() may return NA when the count cannot be determined
    if (is.na(nproc)) {
      nproc <- 1
    }
  }
  # Called directly rather than through a pipe: piping TERM2GENE in while
  # naming `x` and `f` would bind the whole data frame to split()'s `drop`.
  TERMList <- split(x = TERM2GENE$gene_symbol, f = TERM2GENE$gs_name,
                    drop = TRUE)
  EGMT <- suppressWarnings(fgsea::fgsea(pathways = TERMList, nproc = nproc,
                       maxSize = 500,
                       minSize = 15,
                       stats = ranks, nperm = nperm))
  res <- data.frame(
    ID = as.character(EGMT$pathway),
    Description = as.character(EGMT$pathway),
    setSize = EGMT$size,
    enrichmentScore = EGMT$ES,
    NES = EGMT$NES,
    pvalue = EGMT$pval,
    p.adjust = EGMT$padj,
    # Leading-edge genes joined into one "/"-separated string per gene set
    core_enrichment = vapply(EGMT$leadingEdge, paste0, character(1),
                             collapse = "/"),
    stringsAsFactors = FALSE
  )
  res <- res[!is.na(res$pvalue),]
  res <- res[ res$pvalue <= pvalueCutoff, ]
  res <- res[ res$p.adjust <= pvalueCutoff, ]
  res <- res[order(res$pvalue, decreasing = FALSE), ]
  params <- list(pvalueCutoff = pvalueCutoff,
                 nPerm = nperm,
                 pAdjustMethod = "BH",
                 exponent = 1,
                 minGSSize = 15,
                 maxGSSize = 500
  )
  row.names(res) <- res$ID
  EGMT <- methods::new("gseaResult",
                       result     = res,
                       geneSets   = TERMList,
                       geneList   = ranks,
                       params     = params,
                       readable   = FALSE
  )
  EGMT@organism <- "UNKNOWN"
  EGMT@setType <- "UNKNOWN"
  EGMT@keytype <- "UNKNOWN"
  return(EGMT)
}
265 | 


--------------------------------------------------------------------------------
/R/fixStrings.R:
--------------------------------------------------------------------------------
  1 | #' Fix strings (helper function)
  2 | #'
  3 | #' Convert vector of GSEA (or other) names to publication-ready titles
  4 | #'
  5 | #' @param StringVec A vector of titles (usually GSEA) to clean for visualizations
  6 | #' @return A vector of cleaned string titles in the same order as the input.
  7 | #' @export
  8 | 
  9 | 
 10 | fixStrings <- function(StringVec) {
 11 | 
 12 |   # # Bug testing
 13 |   # StringVec <- c("HALLMARK_APOPTOSIS", "GO_MIR21_TARGETS",
 14 |   #                "GSE121239_THING_HAPPENED", "CTTGAT_MIR381",
 15 |   #                "GTGCAGAG_EZH2", "GSE12309_WHATEVER",
 16 |   #                "MEISSNER_NPC_HCP_WITH_H3K4ME2",
 17 |   #                "BASSO_CD40_SIGNALING_DN",
 18 |   #                "GSE36078_UNTREATED_VS_AD5_INF_IL1R_KO_MOUSE_LUNG_DC_DN",
 19 |   #                "KEGG_ACUTE_MYELOID_LEUKEMIA",
 20 |   #                "GSE31082_CD4_VS_CD8_SP_THYMOCYTE_UP",
 21 |   #                "GSE3920_IFNA_VS_IFNG_TREATED_ENDOTHELIAL_CELL_UP",
 22 |   #                "GSE37605_FOXP3_FUSION_GFP_VS_IRES_GFP_TREG_C57BL6_UP",
 23 |   #                "GSE41176_UNSTIM_VS_ANTI_IGM_STIM_BCELL_24H_DN",
 24 |   #                "MORI_LARGE_PRE_BII_LYMPHOCYTE_UP",
 25 |   #                "RYAAAKNNNNNNTTGW_UNKNOWN",
 26 |   #                "GGGTGGRR_PAX4_03",
 27 |   #                "GGGNNTTTCC_NFKB_Q6_01",
 28 |   #                "AAAYWAACM_HFH4_01",
 29 |   #                "KANG_DOXORUBICIN_RESISTANCE_UP")
 30 | 
 31 |   StringVec <- gsub(StringVec, pattern = "_", replacement = " ")
 32 |   StringVec <- tolower(StringVec)
 33 |   StringVec <- stringr::str_to_title(StringVec)
 34 |   StringVec <- gsub(StringVec, pattern = "Iii", replacement = "III", ignore.case = FALSE)
 35 |   StringVec <- gsub(StringVec, pattern = "Ii", replacement = "II", ignore.case = FALSE)
 36 |   StringVec <- gsub(StringVec, pattern = " Of ", replacement = " of ", ignore.case = FALSE)
 37 |   StringVec <- gsub(StringVec, pattern = " To ", replacement = " to ", ignore.case = FALSE)
 38 |   StringVec <- gsub(StringVec, pattern = " In ", replacement = " in ", ignore.case = FALSE)
 39 |   StringVec <- gsub(StringVec, pattern = " With ", replacement = " with ", ignore.case = FALSE)
 40 |   StringVec <- gsub(StringVec, pattern = " Without ", replacement = " without ", ignore.case = FALSE)
 41 |   StringVec <- gsub(StringVec, pattern = " Upon ", replacement = " upon ", ignore.case = FALSE)
 42 |   StringVec <- gsub(StringVec, pattern = " An ", replacement = " an ", ignore.case = FALSE)
 43 |   StringVec <- gsub(StringVec, pattern = " By ", replacement = " by ", ignore.case = FALSE)
 44 |   StringVec <- gsub(StringVec, pattern = " For ", replacement = " for ", ignore.case = FALSE)
 45 |   StringVec <- gsub(StringVec, pattern = " Via ", replacement = " via ", ignore.case = FALSE)
 46 |   StringVec <- gsub(StringVec, pattern = " Lof ", replacement = " LOF ", ignore.case = FALSE)
 47 |   StringVec <- gsub(StringVec, pattern = " Lof$", replacement = " LOF", ignore.case = FALSE)
 48 |   StringVec <- gsub(StringVec, pattern = " Loh ", replacement = " LOH ", ignore.case = FALSE)
 49 |   StringVec <- gsub(StringVec, pattern = " Loh$", replacement = " LOH", ignore.case = FALSE)
 50 |   StringVec <- gsub(StringVec, pattern = " Arms ", replacement = " ARMS ", ignore.case = FALSE)
 51 |   StringVec <- gsub(StringVec, pattern = " Erms ", replacement = " ERMS ", ignore.case = FALSE)
 52 |   StringVec <- gsub(StringVec, pattern = " Nadh ", replacement = " NADH ", ignore.case = FALSE)
 53 |   StringVec <- gsub(StringVec, pattern = " Nadph ", replacement = " NADPH ", ignore.case = FALSE)
 54 |   StringVec <- gsub(StringVec, pattern = " Cll ", replacement = " CLL ", ignore.case = FALSE)
 55 |   StringVec <- gsub(StringVec, pattern = " Cml ", replacement = " CML ", ignore.case = FALSE)
 56 |   StringVec <- gsub(StringVec, pattern = " Aml ", replacement = " AML ", ignore.case = FALSE)
 57 |   StringVec <- gsub(StringVec, pattern = " All ", replacement = " ALL ", ignore.case = FALSE)
 58 |   StringVec <- gsub(StringVec, pattern = "Kim ALL", replacement = "Kim all", ignore.case = FALSE)
 59 |   StringVec <- gsub(StringVec, pattern = "CMV ALL", replacement = "CMV all", ignore.case = FALSE)
 60 |   StringVec <- gsub(StringVec, pattern = " Nhek ", replacement = " NHEK ", ignore.case = FALSE)
 61 |   StringVec <- gsub(StringVec, pattern = " Ner ", replacement = " NER ", ignore.case = FALSE)
 62 |   StringVec <- gsub(StringVec, pattern = " Nmda ", replacement = " NMDA ", ignore.case = FALSE)
 63 |   StringVec <- gsub(StringVec, pattern = " Dc ", replacement = " DC ", ignore.case = FALSE)
 64 |   StringVec <- gsub(StringVec, pattern = " Cd4 ", replacement = " CD4 ", ignore.case = FALSE)
 65 |   StringVec <- gsub(StringVec, pattern = " Cd8 ", replacement = " CD8 ", ignore.case = FALSE)
 66 |   StringVec <- gsub(StringVec, pattern = " Gc ", replacement = " GC ", ignore.case = FALSE)
 67 |   StringVec <- gsub(StringVec, pattern = " Hdl ", replacement = " HDL ", ignore.case = FALSE)
 68 |   StringVec <- gsub(StringVec, pattern = " Dn$", replacement = " Down", ignore.case = FALSE)
 69 |   StringVec <- gsub(StringVec, pattern = " Ldl ", replacement = " LDL ", ignore.case = FALSE)
 70 |   StringVec <- gsub(StringVec, pattern = " Tcr ", replacement = " TCR ", ignore.case = FALSE)
 71 |   StringVec <- gsub(StringVec, pattern = " Mdc ", replacement = " MDC ", ignore.case = FALSE)
 72 |   StringVec <- gsub(StringVec, pattern = " Bcr ", replacement = " BCR ", ignore.case = FALSE)
 73 |   StringVec <- gsub(StringVec, pattern = " Icp ", replacement = " ICP ", ignore.case = FALSE)
 74 |   StringVec <- gsub(StringVec, pattern = " Hbv ", replacement = " HBV ", ignore.case = FALSE)
 75 |   StringVec <- gsub(StringVec, pattern = "Dlbcl", replacement = "DLBCL", ignore.case = FALSE)
 76 |   StringVec <- gsub(StringVec, pattern = " Gist ", replacement = " GIST ", ignore.case = FALSE)
 77 |   StringVec <- gsub(StringVec, pattern = " Gist$", replacement = " GIST", ignore.case = FALSE)
 78 |   StringVec <- gsub(StringVec, pattern = " Dp ", replacement = " DP ", ignore.case = FALSE)
 79 |   StringVec <- gsub(StringVec, pattern = " Dn ", replacement = " DN ", ignore.case = FALSE)
 80 |   StringVec <- gsub(StringVec, pattern = " H2o2 ", replacement = " H2O2 ", ignore.case = FALSE)
 81 |   StringVec <- gsub(StringVec, pattern = " With ", replacement = " with ", ignore.case = FALSE)
 82 |   StringVec <- gsub(StringVec, pattern = "Ntreg", replacement = "nTreg", ignore.case = FALSE)
 83 |   StringVec <- gsub(StringVec, pattern = " Mlr ", replacement = " MLR ", ignore.case = FALSE)
 84 |   StringVec <- gsub(StringVec, pattern = "Gfp", replacement = "GFP", ignore.case = FALSE)
 85 |   StringVec <- gsub(StringVec, pattern = " Vs ", replacement = " vs ", ignore.case = FALSE)
 86 |   StringVec <- gsub(StringVec, pattern = " And ", replacement = " and ", ignore.case = FALSE)
 87 |   StringVec <- gsub(StringVec, pattern = " Wt ", replacement = " WT ", ignore.case = FALSE)
 88 |   StringVec <- gsub(StringVec, pattern = " Ros$", replacement = " ROS", ignore.case = FALSE)
 89 |   StringVec <- gsub(StringVec, pattern = " Ros ", replacement = " ROS ", ignore.case = FALSE)
 90 |   StringVec <- gsub(StringVec, pattern = " Ko ", replacement = " KO ", ignore.case = FALSE)
 91 |   StringVec <- gsub(StringVec, pattern = " Pdc ", replacement = " PDC ", ignore.case = FALSE)
 92 |   StringVec <- gsub(StringVec, pattern = "Pdgf", replacement = "PDGF", ignore.case = FALSE)
 93 |   StringVec <- gsub(StringVec, pattern = " Rna ", replacement = " RNA ", ignore.case = FALSE)
 94 |   StringVec <- gsub(StringVec, pattern = "Mrna", replacement = "mRNA", ignore.case = FALSE)
 95 |   StringVec <- gsub(StringVec, pattern = "Mirna", replacement = "miRNA", ignore.case = FALSE)
 96 |   StringVec <- gsub(StringVec, pattern = "Sirna", replacement = "siRNA", ignore.case = FALSE)
 97 |   StringVec <- gsub(StringVec, pattern = "Trna", replacement = "tRNA", ignore.case = FALSE)
 98 |   StringVec <- gsub(StringVec, pattern = "Ncrna", replacement = "ncRNA", ignore.case = FALSE)
 99 |   StringVec <- gsub(StringVec, pattern = "Snrna", replacement = "snRNA", ignore.case = FALSE)
100 |   StringVec <- gsub(StringVec, pattern = "Rrna", replacement = "rRNA", ignore.case = FALSE)
101 |   StringVec <- gsub(StringVec, pattern = "rna$", replacement = "RNA", ignore.case = FALSE)
102 |   StringVec <- gsub(StringVec, pattern = "rna ", replacement = "RNA ", ignore.case = FALSE)
103 |   StringVec <- gsub(StringVec, pattern = "Flii", replacement = "Fli1", ignore.case = FALSE)
104 |   StringVec <- gsub(StringVec, pattern = " Hcp ", replacement = " HCP ", ignore.case = FALSE)
105 |   StringVec <- gsub(StringVec, pattern = " Tnf ", replacement = " TNF ", ignore.case = FALSE)
106 |   StringVec <- gsub(StringVec, pattern = " Srp ", replacement = " SRP ", ignore.case = FALSE)
107 |   StringVec <- gsub(StringVec, pattern = " Utr ", replacement = " UTR ", ignore.case = FALSE)
108 |   StringVec <- gsub(StringVec, pattern = " Dna ", replacement = " DNA ", ignore.case = FALSE)
109 |   StringVec <- gsub(StringVec, pattern = "Rdna", replacement = "rDNA", ignore.case = FALSE)
110 |   StringVec <- gsub(StringVec, pattern = " Hiv ", replacement = " HIV ", ignore.case = FALSE)
111 |   StringVec <- gsub(StringVec, pattern = " Hiv1 ", replacement = " HIV1 ", ignore.case = FALSE)
112 |   StringVec <- gsub(StringVec, pattern = "dna", replacement = "DNA", ignore.case = FALSE)
113 |   StringVec <- gsub(StringVec, pattern = " Lps ", replacement = " LPS ", ignore.case = FALSE)
114 |   StringVec <- gsub(StringVec, pattern = " Gmcsf ", replacement = " GMCSF ", ignore.case = FALSE)
115 |   StringVec <- gsub(StringVec, pattern = " Gm Csf ", replacement = " GMCSF ", ignore.case = FALSE)
116 |   StringVec <- gsub(StringVec, pattern = "Balbc", replacement = "BALBc", ignore.case = FALSE)
117 |   StringVec <- gsub(StringVec, pattern = "Lcmv", replacement = "LCMV", ignore.case = FALSE)
118 |   StringVec <- gsub(StringVec, pattern = "Mcmv", replacement = "MCMV", ignore.case = FALSE)
119 |   StringVec <- gsub(StringVec, pattern = " Pcc ", replacement = " PCC ", ignore.case = FALSE)
120 |   StringVec <- gsub(StringVec, pattern = "Ecm", replacement = "ECM", ignore.case = FALSE)
121 |   StringVec <- gsub(StringVec, pattern = "G1s", replacement = "G1S", ignore.case = FALSE)
122 |   StringVec <- gsub(StringVec, pattern = " G1 S ", replacement = " G1S ", ignore.case = FALSE)
123 |   StringVec <- gsub(StringVec, pattern = " G1 S", replacement = " G1S", ignore.case = FALSE)
124 |   StringVec <- gsub(StringVec, pattern = "G2m", replacement = "G2M", ignore.case = FALSE)
125 |   StringVec <- gsub(StringVec, pattern = " G2 M ", replacement = " G2M ", ignore.case = FALSE)
126 |   StringVec <- gsub(StringVec, pattern = " G2 M", replacement = " G2M", ignore.case = FALSE)
127 |   StringVec <- gsub(StringVec, pattern = "Hcmv", replacement = "HCMV", ignore.case = FALSE)
128 |   StringVec <- gsub(StringVec, pattern = "Pbmc", replacement = "PBMC", ignore.case = FALSE)
129 |   StringVec <- gsub(StringVec, pattern = " Atp ", replacement = " ATP ", ignore.case = FALSE)
130 |   StringVec <- gsub(StringVec, pattern = " Atp$", replacement = " ATP", ignore.case = FALSE)
131 |   StringVec <- gsub(StringVec, pattern = "Gtp", replacement = "GTP", ignore.case = FALSE)
132 |   StringVec <- gsub(StringVec, pattern = " Mut ", replacement = " MUT ", ignore.case = FALSE)
133 |   StringVec <- gsub(StringVec, pattern = "Et Al", replacement = "et al", ignore.case = FALSE)
134 |   StringVec <- gsub(StringVec, pattern = "Cpg", replacement = "CPG", ignore.case = FALSE)
135 |   StringVec <- gsub(StringVec, pattern = " Nkt ", replacement = " NKT ", ignore.case = FALSE)
136 |   StringVec <- gsub(StringVec, pattern = " Hsc ", replacement = " HSC ", ignore.case = FALSE)
137 |   StringVec <- gsub(StringVec, pattern = " Ln ", replacement = " LN ", ignore.case = FALSE)
138 |   StringVec <- gsub(StringVec, pattern = "Cmv", replacement = "CMV", ignore.case = FALSE)
139 |   StringVec <- gsub(StringVec, pattern = " Bm ", replacement = " BM ", ignore.case = FALSE)
140 |   StringVec <- gsub(StringVec, pattern = " Bmdc ", replacement = " BMDC ", ignore.case = FALSE)
141 |   StringVec <- gsub(StringVec, pattern = "Esc ", replacement = "ESC ", ignore.case = FALSE)
142 |   StringVec <- gsub(StringVec, pattern = " Esc ", replacement = " ESC ", ignore.case = FALSE)
143 |   StringVec <- gsub(StringVec, pattern = "Mcf10a", replacement = "MCF10A", ignore.case = FALSE)
144 |   StringVec <- gsub(StringVec, pattern = "Tca", replacement = "TCA", ignore.case = FALSE)
145 |   StringVec <- gsub(StringVec, pattern = "Nkcell", replacement = "NK-Cell", ignore.case = FALSE)
146 |   StringVec <- gsub(StringVec, pattern = "Tcell", replacement = "T-Cell", ignore.case = FALSE)
147 |   StringVec <- gsub(StringVec, pattern = "T Cell", replacement = "T-Cell", ignore.case = FALSE)
148 |   StringVec <- gsub(StringVec, pattern = "B Cell", replacement = "B-Cell", ignore.case = FALSE)
149 |   StringVec <- gsub(StringVec, pattern = "Hela", replacement = "HeLa", ignore.case = FALSE)
150 |   StringVec <- gsub(StringVec, pattern = "Treg", replacement = "T-Reg", ignore.case = FALSE)
151 |   StringVec <- gsub(StringVec, pattern = "Tconv", replacement = "T-Conv", ignore.case = FALSE)
152 |   StringVec <- gsub(StringVec, pattern = "Bcell", replacement = "B-Cell", ignore.case = FALSE)
153 |   StringVec <- gsub(StringVec, pattern = " Uv ", replacement = " UV ", ignore.case = FALSE)
154 |   StringVec <- gsub(StringVec, pattern = " Uv$", replacement = " UV", ignore.case = FALSE)
155 |   StringVec <- gsub(StringVec, pattern = "^Gse", replacement = "GSE", ignore.case = FALSE)
156 |   StringVec <- gsub(StringVec, pattern = "^Gnf2", replacement = "GNF2", ignore.case = FALSE)
157 |   StringVec <- gsub(StringVec, pattern = "^Gcm", replacement = "GCM", ignore.case = FALSE)
158 |   StringVec <- gsub(StringVec, pattern = "^Morf", replacement = "MORF", ignore.case = FALSE)
159 |   StringVec <- gsub(StringVec, pattern = "^Kegg", replacement = "KEGG", ignore.case = FALSE)
160 |   StringVec <- gsub(StringVec, pattern = "^Pid", replacement = "PID", ignore.case = FALSE)
161 |   StringVec <- gsub(StringVec, pattern = "^Go ", replacement = "GO ", ignore.case = FALSE)
162 |   StringVec <- gsub(StringVec, pattern = "^([GCATSMWNRYK][gcatsmwrnyk]+)( [A-Za-z0-9]+ Q[0-9]$)",
163 |                     replacement = "\\U\\1\\E\\2", perl = TRUE, ignore.case = FALSE)
164 |   StringVec <- gsub(StringVec, pattern = "^([GCATSMWNRYK][gcatsmwrnyk]+)( [A-Za-z0-9]+ Q[0-9] [0-9]+$)",
165 |                     replacement = "\\U\\1\\E\\2", perl = TRUE, ignore.case = FALSE)
166 |   StringVec <- gsub(StringVec, pattern = "^([GCATSMWNRYK][gcatsmwrnyk]+)( [A-Za-z0-9]+ [0-9]+$)",
167 |                     replacement = "\\U\\1\\E\\2", perl = TRUE, ignore.case = FALSE)
168 |   StringVec <- gsub(StringVec, pattern = "^([GCATSMWNRYK][gcatsmwrnyk]+)( Unknown$)",
169 |                     replacement = "\\U\\1\\E\\2", perl = TRUE, ignore.case = FALSE)
170 |   StringVec <- gsub(StringVec, pattern = "Mir([0-9]*.*)", replacement = "miR\\1",
171 |                     perl = TRUE, ignore.case = FALSE)
172 |   StringVec <- gsub(StringVec, pattern = "Ifn([a-z])", perl = TRUE,
173 |                     replacement = "IFN\\1", ignore.case = FALSE)
174 |   StringVec <- gsub(StringVec, pattern = "Il([0-9]+)", perl = TRUE,
175 |                     replacement = "IL\\1", ignore.case = FALSE)
176 |   StringVec <- gsub(StringVec, pattern = "(IL[0-9]+)(r)", perl = TRUE,
177 |                     replacement = "\\1\\U\\2", ignore.case = FALSE)
178 |   StringVec <- gsub(StringVec, pattern = "Cd([0-9])", perl = TRUE,
179 |                     replacement = "CD\\1", ignore.case = FALSE)
180 |   StringVec <- gsub(StringVec, pattern = "(Ig)([a-z])", perl = TRUE,
181 |                     replacement = "\\1\\U\\2", ignore.case = FALSE)
182 |   StringVec <- gsub(StringVec, pattern = "(H[0-9])(k[0-9]+)", perl = TRUE,
183 |                     replacement = "\\1\\U\\2", ignore.case = FALSE)
184 |   StringVec <- gsub(StringVec, pattern = "(B)(i+) ", perl = TRUE,
185 |                     replacement = "\\1\\U\\2 ", ignore.case = FALSE)
186 | 
187 |   return(StringVec)
188 | }
189 | 


--------------------------------------------------------------------------------
/misc/preprocessing_scripts/tissueDictionary.R:
--------------------------------------------------------------------------------
  1 | # Tissue dict: named list mapping each tissue/category label to the regex patterns used to recognise it. An entry is either a plain character vector of patterns or a list with "yes" and "no" pattern vectors (presumably include/exclude patterns -- NOTE(review): confirm against the consumer of tissueDictionary.json). The finished list is serialised to JSON by the last statement of this script.
  2 | tissueDict <- list("brain" = list("yes" = c("cortex", "brain", "lobe", "hippoc", "^pfc$", 
  3 |                                             "\\bpfc\\b", "\\bmge\\b", "glioblastoma", "a172",
  4 |                                             "^vc$", "^cbc$", "gyrus", "stroke", "sciencell",
  5 |                                             "gbm", "purkinje", "pyramidal", "u87", "\\bglioma\\b",
  6 |                                             "alzheim", "frontal", "dentate", "white matter", "brian",  # NOTE(review): "brian" looks like a deliberate catch for a misspelling of "brain" -- confirm
  7 |                                             "cranial", "^glioma", "[ -]+glioma", "optic chiasm",
  8 |                                             "grey matter", "gray matter", "striatum", "pericyte",
  9 |                                             "nerv", "gangli", "bipol", "medull", "putamen",
 10 |                                             "hippocamp", "neur", "glia", "amygdala", "oligodendro",
 11 |                                             "spine", "spinal", "astrocyt", "cereb",
 12 |                                             "pineal"),
 13 |                                   "no" = c("liver", "kidney", "microgli", "aneurysm", "vessel",
 14 |                                            "precursor", 'progenitor', "stem cell", "Neuroectodermal",
 15 |                                            "NPC", "NSC")),
 16 |                    "thyroid" = list("yes" = c("thyroid", "thyrocyt"),
 17 |                                     "no" = c("whole blood")),
 18 |                    "respiratory" = c("lung", "airway", "mesothel", "beas-2b", "nasal", "hsaec", "nsclc",
 19 |                                      "trach", "pleura", "alveol", "bronch"),
 20 |                    "skin" = list("yes" = c("skin", "keratin", "dermis",
 21 |                               "\\bNHEM\\b", "NHEM_M2", "wm989", "squamous cell",
 22 |                               "NHEM-M2", "hfdpc", "arn8", "cutaneous", "wm983b",
 23 |                               "^dk$", "epidermis", "melano", "psoriasis"),
 24 |                               "no" = c("fibroblas", "subcutaneous")),
 25 |                    "pancreas" = list("yes" = c("pancreas", "pancrea", "pdac", 'yapc', "panc",
 26 |                                                "islet", "\\balpha", "\\bbeta", "\\bdelta", "\\bepsilon"),
 27 |                                      "no" = c("falpha", "fbeta", "progen", "a673")),
 28 |                    "kidney" = list("yes" = c("kidney", "nephr", "glomerul", "reninoma",
 29 |                                              "proximal tubular epithelial cell", "\\bhkc\\b",
 30 |                                              "renal", "\\bclear cell", "\\bptec\\b", "ccrcc"),
 31 |                                    "no" = c("airway", "hek", "293")),
 32 |                    "cartilage" = c("cartilag", "chondr", "joint"),
 33 |                    "mammary" = list("yes" = c("mammary", "breast", "mcf[ _-]*[0-9]", "hmler",  # "[ _-]" = literal space/underscore/hyphen; the former "[ -_]" was a range from space to underscore
 34 |                                               "ductal", "imec", "hmec", "\\bmda[ -]*231",
 35 |                                               "sum159", "hme1", "skbr3", "bt474",
 36 |                                               "\\bmda[ _-]*mb[ _-]*[0-9]+\\b", "hmepc", "t47d",  # hyphen placed last so it is a literal, not a range operator
 37 |                                               "reduction mammoplasty no known cancer"),
 38 |                                     "no" = c("airway", "pancrea", "PDAC")),
 39 |                    "stomach"= c("stomach", "gastric"),
 40 |                    "esophagus" = c("esophag", "escc"),
 41 |                    "intestines" = list("yes" = c("intestine", "intestinal", "caco-2et",
 42 |                                                  "colon", "duoden", "colorect", "\\bcrc\\b",
 43 |                                                  "ileum", "ileal", "rectal", "\\bec cells", "gut", "bowel", "jejun", "sigmoid",
 44 |                                                  "recto", "ileocolic", "tubular adenoma", "hct116"),
 45 |                                        "no" = c("colonization",
 46 |                                                 "ncm356d")),
 47 |                    "muscle" = list("yes" = c("muscle", "myo",  "^smc", " smc",
 48 |                                              "lateralis", "gastrocnemius", "skmc",
 49 |                                              "hpasmc",
 50 |                                              "skeletal", "brach", "satellite", "ceps"), 
 51 |                                    "no" = c("cardiac", 'heart', "ffpe bladder",
 52 |                                             "endothel", "vessel", "cardiomyopathy")),
 53 |                    "liver" = list("yes" = c("liver", "hepat", "hepg2",
 54 |                                             "kupffer", "phh", "HCC"),
 55 |                                   "no" = c("deliver")),
 56 |                    "adipose" = list(
 57 |                      "yes" = c("adipose", "fat", "hwp", "adipo", "^wat$", "^bat$", "liposarcoma"),
 58 |                      "no" = c("mesenchymal", "milk", "adipocyterna", "hdfatprx1",
 59 |                               "gluteal subcutaneous")
 60 |                    ),
 61 |                    # "pluripotent" = list("yes" = c("hesc", "^esc$", "^h9$", " h9 ", 
 62 |                    #                                " h9$", "^h9 ", "h1esc", "pluripoten",
 63 |                    #                                "pES[0-9]+","HUES[0-9]+", "embyonic", 
 64 |                    #                                "human es", "^h1", "embryonic stem",
 65 |                    #                                "iPSC", "ips cell", "^ips$","hiPS",
 66 |                    #                                "human ips", "hpsc", "pluripotent stem", "Induced pluripotent"),
 67 |                    #                      "no" = c("derived", "NPC", "MSC")),
 68 |                    "prenatal" = list("yes" = c("placent", "fetal", "decidua", "germ",
 69 |                                   "fetus", "embry", "hpiepc",
 70 |                                   "parthenogenic", "blastocyst", "blastoycst", "trophoblast",  # correct spelling added; the misspelled variant is kept so existing matches are unchanged
 71 |                                   "zygote", "endoderm", "morula", "oocyte", 
 72 |                                   " ICM$", " te$", "oocy", "oophorus", 
 73 |                                   "293", "hpc-pl",
 74 |                                   "primitive streak", "fetal", "Germinal",
 75 |                                   "Embryo", "Fetus","mesoderm","ectoderm"),
 76 |                                   "no" = c("stem", "endothel", "ewing", "tc32", "sknmc")),
 77 |                    # "Mesenchymal stroma" = list("yes" = c("MSC", "mesenchymal", "mesenchymal",
 78 |                    #                              "hff1", "HFF-1", "nhdf", "^derma$",
 79 |                    #                              "fibroblas", "IMR90", "HFF", "fbs"),
 80 |                    #                  "no" = c("UMSCC", "pMSCV")),
 81 |                    "stem-like" = list("yes" = c("stem", "progen", "prog$", 
 82 |                                                 "hesc", "hues64", "\\b[h]*msc",
 83 |                                                 "mesenchymal", "mesenchymal",
 84 |                                                 "hff1", "HFF-1", "nhdf", "^derma$",
 85 |                                                 "fibroblas", "IMR[- ]*90", 
 86 |                                                 "HFF", "fbs", "tra-1-60", "tra1-60",
 87 |                                                 "^bms$","detroit 551",
 88 |                                                 "cord", "cd34", "\\bips\\b",
 89 |                                                 "\\bbj\\b",
 90 |                                                 "ht1080", "\\bhpc\\b",
 91 |                                                 "NPC", "NSC", "npsc", "pMN progenitor",
 92 |                                                 "ncc", "ncsc", "pscs", "cpcs", "\\bhes[;]*\\b",
 93 |                                                 "hvmf", "fibrosarc", "haoaf",
 94 |                                                 "hesc", "\\besc[s]*\\b", 
 95 |                                                 "\\besc[s]*[ _-]", "^h9$", " h9 ",  # "[ _-]" = literal separators; the former "[ -_]" also matched digits, uppercase letters and punctuation
 96 |                                                 " h9$", "^h9 ", "h1esc", "pluripoten",
 97 |                                                 "neural crest", "hNCCs", "NCCs",
 98 |                                                 "pES[0-9]+","HUES[0-9]+", "embyonic", 
 99 |                                                 "human es", "^h1", "embryonic stem",
100 |                                                 "iPSC", "ips cell", "^ips$","hiPS",
101 |                                                 "human ips", "hpsc", "pluripotent stem", "Induced pluripotent",
102 |                                                 "dental pulp", "periodontal ligament stem",
103 |                                                 "HSC", "hematopoeitic stem", "cd34", "precursor",
104 |                                                 "hnspc", "Neuron Precursor", "neural precursor",
105 |                                                 "ncsc", "ncc", "hnspc", "^kp$", "ECFC",
106 |                                                 "npc", "hpsc", "hfl1", "HSPC",
107 |                                                 "dental pulp cells", "cpcs", "fbs",
108 |                                                 "[a-zA-Z]genic", "poetic"),
109 |                                       "no" = c( "h1-neurons", "\\bb cell",
110 |                                                'hepatic', "shScramble", "hnscc", "ccrf-cem",
111 |                                                # "esc[ -]derived", 'ipsc[ -]derived',
112 |                                                # 'ips[ -]derived', 'es[ -]derived',
113 |                                                # "msc[ -]derived",
114 |                                                "escc", "nasopharyngeal",
115 |                                                "\\ball\\b", "all-sil",
116 |                                                "[ _-]+all", "tissue: blood",
117 |                                                "fetal lung fibroblasts", "leukemi",
118 |                                                "glioblastoma", "\\baml\\b",
119 |                                                "npc tumor", "glioma", "gbm",
120 |                                                "UMSCC", "pMSCV")),
121 |                    "cardiac" = list("yes" = c("cardiac", "heart", 
122 |                                               "atria", "atrium", "HCASMC",
123 |                                               "coron", "aort", "ventric"),
124 |                                     "no" = c("airway", 'fibroblast', "progenitor")),
125 |                    "endothelial" = list("yes" = c("endoth", "huvec", "hdmec", "ECFC",
126 |                                                   "vascul", "vessel","f[0-9]ecs",
127 |                                                   "haoec", "hsavec",
128 |                                                   "\\blsec\\b", "rac-vec",
129 |                                                   "hpmec", "ecctr", "ectnf", "ecil"),
130 |                                         "no" = c("bladder", "whole blood")),
131 |                    "spleen" = c("spleen", "splen"),
132 |                    "bladder" = list("yes" = c("bladder", "urin", "urothe"),
133 |                                     "no" = c("during", "gall", "mn cell line", "h3 es cells")),
134 |                    "retina" = c("retina", "macular", "\\brpe[ _-]*[1]*", "retin", "photo"),  # "[ _-]" = literal separators (was the range "[ -_]")
135 |                    "thymus" = c("thymus", "thymic"),
136 |                    "male reproductive" = list("yes" = c("testis", "testes", "testic",
137 |                                                         "leydig", "peritubular", "sertoli",
138 |                                                         "cauda", "corpus", "caput",
139 |                                                         "sperm",
140 |                                                         "epidid", "gonad"),
141 |                                               "no" = c("prostate", "prostatic")),
142 |                    "prostate" = list("yes" = c("prostate", "rwpe1", "arcapm", "lncap",
143 |                                                "lucap",
144 |                                                "prostatic", "\\bprec\\b"),
145 |                                      "no" = c("lymph")),
146 |                    "female reproductive" = list("yes" = c("ovar",  "amniotic", "endomet", 
147 |                                                           "uter", "placent", "fallopian",
148 |                                                           "deciduo", "cervix", "amnion",
149 |                                                           "\\bhela\\b", "hela_",
150 |                                                           "chorionic villus", "endometrial stroma",
151 |                                                           "endometri", "oviductal",
152 |                                                           "cervic", "vagi", "granulosa"),
153 |                                                 "no" = c("ddasdasdasdasd")),  # NOTE(review): looks like a never-matching placeholder so that a "no" entry exists -- confirm
154 |                    "immune" = list("yes" = c("immune", "macroph", "leuk", "mdsc",
155 |                                              "killer", "lymph", "gm18507", "\\bt-all\\b",
156 |                                              "cd[0-9]+", " AML ", "^AML ", " AML$","hl60",
157 |                                              "NKT", "microgli", "gm12878", "\\btcell", "\\bbcell",
158 |                                              "\\bb[ -]cell", "\\bt[ -]cell", "k562", "ccrf-cem",
159 |                                              "white blood cell", "\\bth1\\b", "sk[-]*no[-]*1",
160 |                                              "cd4", "cd8", "nk cell", "marrow", "akata",
161 |                                              "hmnc-pb", "\\bbjab\\b", "cuttl1",
162 |                                              "mn cell line",
163 |                                              "th2", "j-lat", "dlbcl", "hut78",
164 |                                              "monocyt", "dendrit", "granulocyt",
165 |                                              "lympho", "mononucle", "pbmc", "neutro", "treg"),
166 |                                    "no" = c("ctc", "osteo", "vein", "stroma", "msc",
167 |                                             "hematopoetic", "stem cell", "precursor",
168 |                                             "mesenchymal",
169 |                                             "vessel", "cord", "cd34")),
170 |                    "bone" = list("yes" = c("femur", "osteo", "u2os", "hfob", "ewing", "\\bhob\\b",
171 |                                            "a673", 
172 |                                            "mandible", "bone", "joint"),
173 |                                  "no" = c("MSC", "marrow", "stroma")),
174 |                    "tumors" = list("yes" = c("cancer", "hela", "metast", "t47d","sk[-]*no[-]*1",
175 |                                              "mdamd231", "\\bmda[ -]*231", "^rt[0-9]+", "g401", "Soft Tissue, Mesenchymal",
176 |                                              "tumor", "hec1b", "^omental tissue$", "HNSCC", "HCC",
177 |                                              "hela", "k562", "reh", "jurkat", "leukemi", "bewo",
178 |                                              "kras", "lncap", "bjab", "gbm", " aml", "t-all",
179 |                                              "\\bpanc\\b", "skbr3", "u87", 
180 |                                              "wm983b", "a673",
181 |                                              "hepg2", "dlbcl", "caco-2et",
182 |                                              "ccrcc", "\\bcrc\\b",
183 |                                              "rko", "ramos", "mel888", 
184 |                                              "aml ", "nsclc", "mda_mb_231",
185 |                                              "vcap", "saos2", "vapc", "nalm6", "set2", "tov21",
186 |                                              "cancer", "carcin", "sarcom", "metasta", "tumor",
187 |                                              "[a-zA-Z]oma\\b", "NCCIT", "a172", "yapc", "u2os",
188 |                                              "wm989", "hct116", "ht1080", "arn8", "lncap",
189 |                                              "squamous cell carcinoma", "mcf_7", "tumour",
190 |                                              "gbm", "lucap", "mn cell line", "sum159", "ccrf-cem",
191 |                                              "panc1", "mcf7", "mcf-7", "\\bmda[ _-]*mb[ _-]*[0-9]+\\b",  # same separator-class fix as in the "mammary" entry
192 |                                              "pc3", "hl60", "bt474", "escc"),
193 |                                    "no" = c("healthy", "normal", "Uninvolved", "293",
194 |                                             "stroma", "woman", "Mycosis")),
195 |                    "SingleCell" = c("single cell RNA", "single-cell", "smart seq", "in-drop",
196 |                                     "cel-seq", "10X genomics", "scRNA seq", "smartseq", "CELseq",
197 |                                     "smart-seq", "indrop", "drop-seq", "drop seq", "single nucleus",
198 |                                     "single-nucleus", "snRNA-Seq", "snRNASeq",
199 |                                     "fluidigm", "scRNASeq", "scRNA-Seq", "chromium"))
200 | 
201 | jsonlite::write_json(x = tissueDict, path = "Scripts/tissueDictionary.json")  # writes relative to the current working directory; NOTE(review): the repo keeps tissueDictionary.json next to this script -- confirm the intended output location
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 


--------------------------------------------------------------------------------
/vignettes/correlationAnalyzeR.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Using correlationAnalyzeR"
  3 | author: "Henry Miller"
  4 | date: "`r Sys.Date()`"
  5 | output: 
  6 |   rmarkdown::html_document:
  7 |     highlight: pygments
  8 |     toc: true
  9 |     fig_width: 6
 10 | vignette: >
 11 |   %\VignetteIndexEntry{correlationAnalyzeR Quickstart}
 12 |   %\VignetteEngine{knitr::rmarkdown}
 13 |   %\VignetteEncoding{UTF-8}
 14 | ---
 15 | 
 16 | ```{r setup, include = FALSE}
 17 | knitr::opts_chunk$set(
 18 |   tidy = FALSE,
 19 |   cache = FALSE,
 20 |   dpi = 72,
 21 |   dev = "png",
 22 |   message = FALSE, error = FALSE, warning = TRUE
 23 | )
 24 | ```
 25 | 
 26 | `correlationAnalyzeR` is the R interface to the Correlation AnalyzeR database
 27 | and web application. The web version can be accessed [here](http://gccri.bishop-lab.uthscsa.edu/correlation-analyzer).
 28 | 
 29 | This package is designed to allow greater customization and control over 
 30 | the functions in the web interface. This vignette will demonstrate each 
 31 | function using an example. Additional info can be found in the reference manual.
 32 | 
 33 | ## Analyze Single Genes
 34 | 
 35 | To speed up the analysis, it is useful to generate a `TERM2GENE` object ahead of time. 
 36 | The `GSEA_Type` argument specifies which gene set databases to pull annotations from. 
 37 | See `?getTERM2GENE` for the available options.
 38 | ```{r}
 39 | library(correlationAnalyzeR)
 40 | TERM2GENE <- getTERM2GENE(GSEA_Type = c("GO:BP"))  # GO Biological Process
 41 | ```
 42 | 
 43 | ### Basic Analysis
 44 | 
 45 | `correlationAnalyzeR` can be used to predict gene function using `analyzeSingleGenes()` (the equivalent of Single Gene Mode in the web application). In this example, `Tissue` and `Sample_Type` arguments were set in order to limit the analysis to co-expression correlations in normal brain samples. 
 46 | 
 47 | ```{r}
 48 | res <- analyzeSingleGenes(genesOfInterest = c("BRCA1"), 
 49 |                           Tissue = "brain", Sample_Type = "normal",
 50 |                           TERM2GENE = TERM2GENE)
 51 | ```
 52 | This runs most of the core tasks for predicting gene functionality using this analysis mode. This includes running "corGSEA", an implementation of GSEA developed in this package for use on genome-wide co-expression correlations. 
 53 | 
 54 | The results are a list containing several items:
 55 | 
 56 | #### Tables
 57 | 
 58 | 1. The genome-wide correlations (Pearson's R) for BRCA1:
 59 | 
 60 | ```{r}
 61 | head(res$correlations)
 62 | ```
 63 | 
 64 | 2. The associated correlation P values:
 65 | 
 66 | ```{r}
 67 | head(res$`P values`)
 68 | ```
 69 | 
 70 | 3. The table of corGSEA results:
 71 | 
 72 | ```{r}
 73 | head(res$`BRCA1, Brain - Normal`$GSEA$eres)
 74 | ```
 75 | 
 76 | #### Figures
 77 | 
 78 | 1. A histogram showing the genome-wide correlation value (R) distribution:
 79 | 
 80 | ```{r, fig.height=4, fig.width=7.5}
 81 | res$`BRCA1, Brain - Normal`$corrHist
 82 | ```
 83 | 
 84 | 2. The top increasing corGSEA hits:
 85 | 
 86 | ```{r, fig.height=14, fig.width=22}
 87 | res$`BRCA1, Brain - Normal`$GSEA$GSEA_up
 88 | ```
 89 | 
 90 | 3. The top decreasing corGSEA hits:
 91 | 
 92 | ```{r, fig.height=14, fig.width=22}
 93 | res$`BRCA1, Brain - Normal`$GSEA$GSEA_down
 94 | ```
 95 | 
 96 | ### Supplying a custom dataset
 97 | 
 98 | `correlationAnalyzeR` relies on pre-calculated datasets which are stored in a cloud database. However, it is also possible for users to generate predictions from their own datasets. To generate a correlation matrix, you can supply a read count matrix to `generateCorrelations()`.
 99 | 
100 | Here is an example using the `airway` dataset. We first wrangle the dataset into a raw read count matrix:
101 | 
102 | ```{r get_airway}
103 | library(airway)
104 | library(EnsDb.Hsapiens.v86)
105 | library(dplyr)
106 | 
107 | data(airway)
108 | cts <- assay(airway)
109 | ens2gene <- ensembldb::select(EnsDb.Hsapiens.v86, keys = rownames(cts),
110 |                               columns = c("SYMBOL"), keytype = "GENEID") %>%
111 |   dplyr::distinct(SYMBOL, .keep_all = TRUE) %>%
112 |   dplyr::inner_join(y = data.frame("GENEID" = rownames(cts)))
113 | cts <- cts[ens2gene$GENEID,]
114 | rownames(cts) <- ens2gene$SYMBOL
115 | ```
116 | 
117 | We then generate the correlation matrix with the `generateCorrelations()` function:
118 | ```{r generate_correlations, cache=FALSE}
119 | corrMat <- generateCorrelations(cts)
120 | ```
121 | 
122 | Once the correlation matrix is generated, it can be used as the input to `analyzeSingleGenes()` via the `corrMat` argument with a `corrMat_label` set (this is the custom label used during plotting functions).
123 | 
124 | ```{r analyzeSingleGenes_custom}
125 | res <- analyzeSingleGenes(genesOfInterest = c("BRCA1"), corrMat = corrMat,
126 |                           corrMat_label = "User-supplied DataSet",
127 |                           TERM2GENE = TERM2GENE)
128 | ```
129 | 
130 | Here is the correlation histogram produced with the custom dataset:
131 | 
132 | ```{r, fig.height=4, fig.width=7.5}
133 | res$`BRCA1, User-supplied DataSet`$corrHist
134 | ```
135 | 
136 | It is important to note that user-supplied datasets should provide enough samples to ensure robust co-expression calculations. In the above example, it is clear that there are not enough samples within the `airway` dataset to support this calculation. In our experience, it is necessary to have at least 30 samples in most cases.
137 | 
138 | ### Cross-compare mode
139 | 
140 | `crossCompareMode` allows a user to examine the correlations across multiple tissue and disease conditions. For example, to analyze the correlations of BRCA1 across all tissues, we could do the following:
141 | 
142 | ```{r}
143 | res <- analyzeSingleGenes(genesOfInterest = c("BRCA1"), crossCompareMode = TRUE)
144 | ```
145 | 
146 | The output is a list containing several tables:
147 | 
148 | 1. The co-expression correlations for BRCA1 across all tissues:
149 | 
150 | ```{r}
151 | head(res$BRCA1$correlations, n = 3)
152 | ```
153 | 
154 | 2. All the VST-transformed counts for BRCA1 across all samples:
155 | 
156 | ```{r}
157 | head(res$BRCA1$VST_DF)
158 | ```
159 | 
160 | The output list also contains several plots:
161 | 
162 | 1. A box plot comparing cancer and normal samples by VST
163 | 
164 | ```{r, fig.height=5, fig.width=8}
165 | res$BRCA1$VST_boxPlot
166 | ```
167 | 
168 | 2. A heatmap of the top 30 co-correlated genes with BRCA1 (genes which show similar
169 | co-expression correlations to BRCA1). The accompanying values for this plot are in 
170 | `res$BRCA1$heatmapSmallDataCo`.
171 | 
172 | ```{r, fig.height=6, fig.width=10}
173 | res$BRCA1$heatmapSmallCo
174 | ```
175 | 
176 | 3. A heatmap of the top 200 co-correlated genes with BRCA1. The accompanying values for this plot are in `res$BRCA1$heatmapBigDataCo`.
177 | 
178 | ```{r, fig.height=6, fig.width=10}
179 | res$BRCA1$heatmapBigCo
180 | ```
181 | 
182 | 4. A heatmap of the top 30 variably-correlated genes with BRCA1 (genes which show divergent
183 | co-expression correlations compared to BRCA1). The accompanying values for this plot are in 
184 | `res$BRCA1$heatmapSmallDataVar`.
185 | 
186 | ```{r, fig.height=6, fig.width=10}
187 | res$BRCA1$heatmapSmallVar
188 | ```
189 | 
190 | 5. A heatmap of the top 200 variably-correlated genes with BRCA1. The accompanying values for this plot are in `res$BRCA1$heatmapBigDataVar`.
191 | 
192 | ```{r, fig.height=6, fig.width=10}
193 | res$BRCA1$heatmapBigVar
194 | ```
195 | 
196 | ## Analyze Gene Pairs 
197 | 
198 | ### Basic Analysis
199 | 
200 | `correlationAnalyzeR` can be used to analyze differences between two genes using `analyzeGenePairs()` (the equivalent of Gene vs Gene Mode in the web application).
201 | 
202 | 
203 | ```{r}
204 | res <- analyzeGenePairs(genesOfInterest = c("BRCA1", "BRCA2"),
205 |                         Tissue = "all", Sample_Type = "all",
206 |                         TERM2GENE = TERM2GENE)
207 | ```
208 | 
209 | The `analyzeGenePairs()` function performs `analyzeSingleGenes()` on both of the supplied genes and then compares the results, generating several tables and figures:
210 | 
211 | 1. The correlation between the two genes is visualized using a scatter plot of 
212 | their VST-transformed expression values. 
213 | 
214 | - By Disease:
215 | ```{r, fig.height=5, fig.width=8}
216 | res$compared$VST_corrPlot$corrPlot_disease
217 | ```
218 | 
219 | - By Tissue:
220 | ```{r, fig.height=5, fig.width=10}
221 | res$compared$VST_corrPlot$corrPlot_tissue
222 | ```
223 | 
224 | 2. The gene co-expression correlations with the average Pearson R and variance included. Note that, with only two data points, variance is just 2x the squared deviation from the mean.  
225 | 
226 | ```{r}
227 | res$compared$correlations %>%
228 |        arrange(desc(average)) %>%
229 |   head()
230 | ```
231 | 
232 | 3. The p values of the correlation calculation.
233 | ```{r}
234 | head(res$compared$`P values`)
235 | ```
236 | 
237 | 4. The combined results of corGSEA for BRCA1 and BRCA2
238 | ```{r}
239 | head(res$compared$correlatedPathwaysDataFrame)
240 | ```
241 | 
242 | 5. The VST-transformed counts for BRCA1 and BRCA2 across all tissues
243 | ```{r}
244 | head(res$compared$VST_Data)
245 | ```
246 | 
247 | 6. A scatter plot comparing the genome-wide co-expression correlations for BRCA1 and BRCA2.
248 | ```{r, fig.height=6, fig.width=7.5}
249 | res$compared$correlationPlot
250 | ```
251 | 
252 | 7. The same plot, binned to reduce the computational requirements for plotting:
253 | ```{r, fig.height=6, fig.width=7.5}
254 | res$compared$correlationPlotBin
255 | ```
256 | 
257 | 8. A heatmap showing the genes with the top variance between BRCA1 and BRCA2 by co-expression correlation. This is simply a measure of the absolute difference between them. 
258 | ```{r, fig.height=6, fig.width=10}
259 | res$compared$correlationVarianceHeatmap
260 | ```
261 | 
262 | 9. A heatmap showing the genes with the top similarity in co-expression correlation between BRCA1 and BRCA2.
263 | 
264 | ```{r, fig.height=6, fig.width=10}
265 | res$compared$correlationSimilarityHeatmap
266 | ```
267 | 
268 | 10. A heatmap showing the pathways with the top variance between BRCA1 and BRCA2 by corGSEA score.
269 | 
270 | ```{r, fig.height=6, fig.width=10}
271 | res$compared$pathwayVarianceHeatmap
272 | ```
273 | 
274 | 11. A heatmap showing the pathways with the top similarity in corGSEA score between BRCA1 and BRCA2.
275 | 
276 | ```{r, fig.height=6, fig.width=10}
277 | res$compared$pathwaySimilarityHeatmap
278 | ```
279 | 
280 | ### Cross-compare mode
281 | 
282 | In `analyzeGenePairs()`, cross-compare mode allows the user to analyze the co-expression of two genes across all tissue-disease conditions (`geneVsGene`) or one gene in cancer vs normal (`normalVsCancer`). 
283 | 
284 | #### Gene vs Gene
285 | 
286 | When `genesOfInterest` is supplied with two different genes and `crossCompareMode=TRUE`, then `geneVsGene` mode is executed.
287 | 
288 | ```{r}
289 | res <- analyzeGenePairs(genesOfInterest = c("BRCA1", "BRCA2"), 
290 |                         crossCompareMode = TRUE)
291 | ```
292 | 
293 | This analysis produces a list containing several figures and tables:
294 | 
295 | 1. The co-expression correlation results within each tissue-disease condition, along with the average co-expression values and variance for each gene. 
296 | 
297 | ```{r}
298 | res$Correlations %>%
299 |        arrange(desc(average)) %>%
300 |   head(n=3)
301 | ```
302 | 
303 | 2. The correlation p values for each condition.
304 | 
305 | ```{r}
306 | head(res$`P values`, n=3)
307 | ```
308 | 
309 | 3. The VST box plots for each gene across conditions:
310 | ```{r, fig.height=8, fig.width=12}
311 | ggpubr::ggarrange(res$crossCompareVST$VST_boxPlotOne, res$crossCompareVST$VST_boxPlotTwo,
312 |                   nrow = 2)
313 | ```
314 | 
315 | 4. The data which the expression boxplots are based upon:
316 | ```{r}
317 | head(res$crossCompareVST$VST_DF)
318 | ```
319 | 
320 | 5. For each tissue-disease condition, a scatter plot of the genome-wide co-expression correlations between BRCA1 and BRCA2. 
321 | 
322 | ```{r, fig.height=6, fig.width=7.5}
323 | res$pairResList$`Bone Cancer`$scatterPlot
324 | ```
325 | 
326 | 6. For each tissue-disease condition, a heatmap showing the top variable genes between BRCA1 and BRCA2. 
327 | 
328 | ```{r, fig.height=6, fig.width=7.5}
329 | res$pairResList$`Bone Cancer`$heatMap
330 | ```
331 | 
332 | #### Normal vs Cancer
333 | 
334 | When `genesOfInterest` is supplied with only one gene (given twice, as in the example below), `Tissue` includes `Cancer` and `Normal`, and `crossCompareMode=TRUE`, then `normalVsCancer` mode is executed.
335 | 
336 | ```{r}
337 | res <- analyzeGenePairs(genesOfInterest = c("BRCA1", "BRCA1"), 
338 |                         Tissue = c("Cancer", "Normal"),
339 |                         crossCompareMode = TRUE)
340 | ```
341 | 
342 | The primary difference between `normalVsCancer` and `geneVsGene` is that `analyzeGenePairs` will output:
343 | 
344 | 1. A comparative boxplot showing the difference between cancer and normal conditions across tissues with respect to BRCA1 expression:
345 | 
346 | ```{r, fig.height=5, fig.width=10}
347 | res$crossCompareVST$VST_boxPlot
348 | ```
349 | 
350 | 2. A list of scatter plots in which the genome-wide co-expression correlations for BRCA1 are compared between cancer and normal conditions:
351 | 
352 | ```{r, fig.height=6, fig.width=7.5}
353 | res$pairResList$`Immune - Normal`$scatterPlot
354 | ```
355 | 
356 | 3. And the list also contains heatmaps showing the top variable genes with respect to BRCA1 gene co-expression between cancer and normal:
357 | 
358 | ```{r, fig.height=6, fig.width=7.5}
359 | res$pairResList$`Immune - Normal`$heatMap
360 | ```
361 | 
362 | ## Gene vs Gene List Analysis
363 | 
364 | This mode provides an empirical approach for determining whether a gene is significantly correlated with a list of genes. This is an alternative to the typical Pearson correlation p value which can only determine whether any two genes are significantly co-expressed. To run this mode, use the `geneVsGeneListAnalyze()` function:
365 | 
366 | ```{r}
367 | res <- geneVsGeneListAnalyze(pairedGenesList = list("BRCA1" = c("BRCA2", "EZH2", "CCND1",
368 |                                                          "SLC7A11", "GCLC", "CDKN1A")),
369 |                               Sample_Type = "cancer",
370 |                               Tissue = "bone")
371 | ```
372 | This returns several plots and tables:
373 | 
374 | 1. It returns the genome-wide correlations for BRCA1
375 | 
376 | ```{r}
377 | head(res$correlations)
378 | ```
379 | 
380 | 2. Along with the p values corresponding to these Pearson correlations.
381 | 
382 | ```{r}
383 | head(res$`P values`)
384 | ```
385 | 
386 | 3. It returns a histogram showing the genome-wide co-expression correlations for BRCA1 along with the secondary gene list annotated on top:
387 | 
388 | ```{r, fig.height=6, fig.width=7.5}
389 | res$BRCA1$Correlation_histogram
390 | ```
391 | 
392 | 4. A plot showing the distribution of p values from bootstrapping with the observed p values for the specified gene list. NOTE that the summit represents the point with the highest density from the empirical distribution, not necessarily a specific observation. 
393 | 
394 | ```{r, fig.height=5, fig.width=7.5}
395 | res$BRCA1$sigTest$tTest_pvalsPlot
396 | ```
397 | 
398 | 5. A plot showing the empirical distribution of bootstrapped mean and median correlation values with the observed correlation for the specified gene list shown, along with a p value that indicates significance. NOTE that this uses a simplistic approach to finding significance (anything > .95 is significant) and the method used above is preferred.
399 | 
400 | ```{r, fig.height=5, fig.width=7.5}
401 | res$BRCA1$sigTest$meansPlot
402 | res$BRCA1$sigTest$mediansPlot
403 | ```
404 | 
405 | 6. The data accompanying each plot:
406 | ```{r, fig.height=5, fig.width=7.5}
407 | data.frame(means = res$BRCA1$sigTest$means, 
408 |            medians = res$BRCA1$sigTest$medians,
409 |            pvals = res$BRCA1$sigTest$tTest_pvals) %>% head()
410 | ```
411 | 
412 | ## Gene List Topology Analysis
413 | 
414 | Many methods for dimensionality reduction exist, but most are focused on sample-level comparisons and few methods for analyzing feature-space topology exist. In the final analysis mode, `correlationAnalyzeR` uses gene co-expression correlation values as a metric for dimensionality reduction via `PCA` and `tSNE` with agglomerative clustering to determine the topology of a list of genes. 
415 | 
416 | The analysis can be accessed using the `analyzeGenesetTopology()` function from this package:
417 | 
418 | ```{r}
419 | genesOfInterest <- c("CDK12", "AURKB", "SFPQ", "NFKB1", "BRCC3", "BRCA2", "PARP1",
420 |                      "EZH2", "CCND1", "SLC7A11", "GCLC", "CDKN1A", "MTAP",
421 |                      "DHX9", "SON", "AURKA", "SETX", "BRCA1", "ATMIN")
422 | res <- analyzeGenesetTopology(genesOfInterest = genesOfInterest,
423 |                               Sample_Type = "cancer", Tissue = "bone")
424 | ```
425 | This produces several tables and figures:
426 | 
427 | 1. The co-expression correlations for each gene in the supplied gene list:
428 | 
429 | ```{r}
430 | head(res$Correlation_Data)
431 | ```
432 | 
433 | 2. The p values corresponding to these co-expression correlations:
434 | 
435 | ```{r}
436 | head(res$`P values`)
437 | ```
438 | 
439 | 3. A heatmap of the top 50 variant genes across the gene list by co-expression values:
440 | 
441 | ```{r, fig.height=6, fig.width=10}
442 | res$variantGenesHeatmap_Top
443 | ```
444 | 
445 | 4. The data matrix accompanying this heatmap:
446 | 
447 | ```{r}
448 | head(res$variantGenesHeatmap_Top_MAT)
449 | ```
450 | 
451 | 5. A heatmap of the top 500 variant genes across the gene list by co-expression values:
452 | 
453 | ```{r, fig.height=6, fig.width=10}
454 | res$variantGenesHeatmap
455 | ```
456 | 
457 | 6. The data matrix accompanying this heatmap:
458 | 
459 | ```{r}
460 | head(res$variantGenesHeatmap_MAT)
461 | ```
462 | 
463 | 7. The same as (5), but with co-correlative genes (genes which are similarly co-correlated across each gene in the supplied list) instead of variant genes. 
464 | 
465 | ```{r, fig.height=6, fig.width=10}
466 | res$cocorrelativeGenesHeatmap
467 | ```
468 | 
469 | 8. And the corresponding data matrix:
470 | 
471 | ```{r}
472 | head(res$cocorrelativeGenesHeatmap_MAT)
473 | ```
474 | 
475 | 9. The PCA plot showing the gene list members projected in PC1 and PC2, labeled, and colored by hierarchical cluster membership:
476 | 
477 | ```{r}
478 | res$PCA_plot
479 | ```
480 | 
481 | 10. The data frame corresponding to (9):
482 | 
483 | ```{r}
484 | res$PCA_data
485 | ```
486 | 
487 | 11. The pathway enrichment of the input gene list displayed as a dotplot:
488 | 
489 | ```{r, fig.height=5, fig.width=10}
490 | res$inputGenes_pathwayEnrich_dotplot
491 | ```
492 | 
493 | 12. The pathway enrichment results in a data frame:
494 | 
495 | ```{r}
496 | head(res$inputGenes_pathwayEnrich_data)
497 | ```
498 | 
499 | 13. The object generated by `clusterProfiler` during pathway enrichment (this is compatible with the other functions in the `clusterProfiler` package):
500 | 
501 | ```{r}
502 | res$inputGenes_pathwayEnrich
503 | ```
504 | 
505 | ### Enriching with large gene lists
506 | 
507 | Unlike the web application version of Correlation AnalyzeR, the R package is capable of handling arbitrarily-large gene lists for `analyzeGenesetTopology()`. One instance where one might wish to perform an analysis like this could be in parsing an existing gene set from curated sources like Gene Ontology. 
508 | 
509 | To obtain the list of genes for this analysis, it is convenient to use the `msigdbr` package in the following manner:
510 | 
511 | ```{r}
512 | library(tidyverse)
513 | MDF <- msigdbr::msigdbr(category = "C2", subcategory = "CGP")
514 | geneList <- MDF %>%
515 |   filter(gs_name == "RIGGI_EWING_SARCOMA_PROGENITOR_UP") %>%
516 |   pull(gene_symbol)
517 | ```
518 | 
519 | We have now obtained a vector with the 434 genes in the "RIGGI_EWING_SARCOMA_PROGENITOR_UP" gene set from the Chemical and Genetic Perturbations (CGP) database in the "C2" collection of MSigDB. A link to the info page for this gene set can be found [here](http://www.gsea-msigdb.org/gsea/msigdb/cards/RIGGI_EWING_SARCOMA_PROGENITOR_UP.html). Now, we can use this list as the input for `analyzeGenesetTopology()`. NOTE: when a gene in our gene list is not found in the correlation data, it will automatically be skipped. 
520 | 
521 | ```{r}
522 | res <- analyzeGenesetTopology(genesOfInterest = geneList, 
523 |                               Sample_Type = "cancer",
524 |                               Tissue = "bone")
525 | ```
526 | Because of the large number of genes supplied, a tSNE was calculated instead of PCA. This behavior can be prevented by setting the `alternativeTSNE` parameter to `FALSE`. The visualization is designed to allow easier cluster interpretation and does not include gene labels:
527 | 
528 | ```{r}
529 | res$TSNE_plot
530 | ```
531 | 
532 | However, the underlying plot data is supplied as well:
533 | 
534 | ```{r}
535 | head(res$TSNE_data)
536 | ```
537 | Which means that, using `plotly`, it is straightforward to create an interactive visualization that includes gene name information:
538 | 
539 | ```{r}
540 | plt <- (res$TSNE_data %>%
541 |   ggplot(aes(x = tsne1, y = tsne2, color = hclust, label = geneNames)) +
542 |   geom_point()) %>%
543 |   plotly::ggplotly()
544 | ```
545 | 
546 | # Session info
547 | 
548 | ```{r sessionInfo}
549 | sessionInfo()
550 | ```
551 | 
552 | # Questions
553 | 
554 | Feel free to email Henry Miller (millerh1@uthscsa.edu) any time with questions, bug reports, or if you want to contribute!
555 | 
556 | 


--------------------------------------------------------------------------------