├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── annotatr_data_doc.R
    ├── annotatr_package_doc.R
    ├── build_annotations.R
    ├── intersect.R
    ├── randomize.R
    ├── read.R
    ├── summarize.R
    ├── utils.R
    └── visualize.R
├── README.md
├── data-raw
    └── create_example_annotations.R
├── data
    └── annotations.rda
├── inst
    ├── CITATION
    ├── NEWS
    └── extdata
    │   ├── Gm12878_Ezh2_peak_annotations.txt.gz
    │   ├── Gm12878_Ezh2_sorted_scores.narrowPeak.gz
    │   ├── Gm12878_Stat3_chr2.bed.gz
    │   ├── IDH2mut_v_NBM_multi_data_chr9.txt.gz
    │   ├── K562_Cjun_peak_annotations.txt.gz
    │   ├── test_BED3.bed
    │   ├── test_BED4.bed
    │   ├── test_BED5.bed
    │   ├── test_BED6.bed
    │   ├── test_annotation_nooverlap.bed
    │   ├── test_annotations_3.bed
    │   ├── test_annotations_4.bed
    │   ├── test_annotations_5.bed
    │   ├── test_annotations_6.bed
    │   ├── test_annotations_6_gene.bed
    │   ├── test_annotations_6_symbol.bed
    │   ├── test_annotations_6_tx_gene_symbol.bed
    │   ├── test_annotations_minoverlap.bed
    │   ├── test_bedGraph.bedGraph
    │   ├── test_intersect.bed
    │   └── test_read_multiple_data_nohead.bed
├── man
    ├── annotate_regions.Rd
    ├── annotations.Rd
    ├── annotatr.Rd
    ├── annotatr_cache.Rd
    ├── build_ah_annots.Rd
    ├── build_annotations.Rd
    ├── build_cpg_annots.Rd
    ├── build_enhancer_annots.Rd
    ├── build_gene_annots.Rd
    ├── build_hmm_annots.Rd
    ├── build_lncrna_annots.Rd
    ├── builtin_annotations.Rd
    ├── builtin_genomes.Rd
    ├── check_annotations.Rd
    ├── expand_annotations.Rd
    ├── get_cellline_from_code.Rd
    ├── get_cellline_from_shortcut.Rd
    ├── get_orgdb_name.Rd
    ├── get_txdb_name.Rd
    ├── plot_annotation.Rd
    ├── plot_categorical.Rd
    ├── plot_coannotations.Rd
    ├── plot_numerical.Rd
    ├── plot_numerical_coannotations.Rd
    ├── randomize_regions.Rd
    ├── read_annotations.Rd
    ├── read_regions.Rd
    ├── reformat_hmm_codes.Rd
    ├── subset_order_tbl.Rd
    ├── summarize_annotations.Rd
    ├── summarize_categorical.Rd
    ├── summarize_numerical.Rd
    └── tidy_annotations.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_1_utils.R
    │   ├── test_2_read.R
    │   ├── test_3_build_annotations.R
    │   ├── test_4_intersect.R
    │   ├── test_5_randomize.R
    │   ├── test_6_summarize.R
    │   └── test_7_visualize.R
└── vignettes
    ├── annotatr-vignette.Rmd
    ├── annotatr_cpgs.jpeg
    └── annotatr_genes.jpeg


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.git$
2 | ^data-raw$
3 | ^meta$
4 | ^\.travis\.yml$
5 | README\.md
6 | ^\.Rprofile$
7 | ^doc$
8 | ^Meta$
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | meta/
6 | inst/doc
7 | /doc/
8 | /Meta/
9 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Sample .travis.yml for R projects
 2 | 
 3 | language: r
 4 | r:
 5 |   - devel
 6 | sudo: false
 7 | cache: packages
 8 | bioc_required: true
 9 | 
10 | warnings_are_errors: false
11 | 
12 | r_github_packages:
13 |   - jimhester/covr
14 | 
15 | notifications:
16 |   slack: sartorlab:OpT7L6aC9upo7d3PzW2yzMsh
17 |   email:
18 |     on_success: change
19 |     on_failure: change
20 | 
21 | after_success:
22 |   - Rscript -e 'covr::coveralls()'
23 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: annotatr
 2 | Title: Annotation of Genomic Regions to Genomic Annotations
 3 | Version: 1.31.0
 4 | Date: 2021-11-20
 5 | Authors@R: c(
 6 |     person("Raymond G.", "Cavalcante", email = "rcavalca@umich.edu", role = c("aut", "cre")),
 7 |     person(c("Maureen A."), "Sartor", email = "sartorma@med.umich.edu", role = c("ths")))
 8 | Description: Given a set of genomic sites/regions (e.g. ChIP-seq peaks, CpGs, differentially methylated CpGs or regions, SNPs, etc.) it is often of interest to investigate the intersecting genomic annotations. Such annotations include those relating to gene models (promoters, 5'UTRs, exons, introns, and 3'UTRs), CpGs (CpG islands, CpG shores, CpG shelves), or regulatory sequences such as enhancers. The annotatr package provides an easy way to summarize and visualize the intersection of genomic sites/regions with genomic annotations.
 9 | Depends:
10 |     R (>= 3.4.0)
11 | Imports:
12 |     AnnotationDbi,
13 |     AnnotationHub,
14 |     dplyr,
15 |     GenomicFeatures,
16 |     GenomicRanges,
17 |     GenomeInfoDb (>= 1.10.3),
18 |     ggplot2,
19 |     IRanges,
20 |     methods,
21 |     readr,
22 |     regioneR,
23 |     reshape2,
24 |     rtracklayer,
25 |     S4Vectors (>= 0.23.10),
26 |     stats,
27 |     utils
28 | Suggests:
29 |     BiocStyle,
30 |     devtools,
31 |     knitr,
32 |     org.Dm.eg.db,
33 |     org.Gg.eg.db,
34 |     org.Hs.eg.db,
35 |     org.Mm.eg.db,
36 |     org.Rn.eg.db,
37 |     rmarkdown,
38 |     roxygen2,
39 |     testthat,
40 |     TxDb.Dmelanogaster.UCSC.dm3.ensGene,
41 |     TxDb.Dmelanogaster.UCSC.dm6.ensGene,
42 |     TxDb.Ggallus.UCSC.galGal5.refGene,
43 |     TxDb.Hsapiens.UCSC.hg19.knownGene,
44 |     TxDb.Hsapiens.UCSC.hg38.knownGene,
45 |     TxDb.Mmusculus.UCSC.mm9.knownGene,
46 |     TxDb.Mmusculus.UCSC.mm10.knownGene,
47 |     TxDb.Rnorvegicus.UCSC.rn4.ensGene,
48 |     TxDb.Rnorvegicus.UCSC.rn5.refGene,
49 |     TxDb.Rnorvegicus.UCSC.rn6.refGene
50 | VignetteBuilder: knitr
51 | BugReports: https://www.github.com/rcavalcante/annotatr/issues
52 | License: GPL-3
53 | LazyData: true
54 | RoxygenNote: 7.1.2
55 | biocViews: Software, Annotation, GenomeAnnotation, FunctionalGenomics, Visualization
56 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(annotate_regions)
 4 | export(annotatr_cache)
 5 | export(build_ah_annots)
 6 | export(build_annotations)
 7 | export(builtin_annotations)
 8 | export(builtin_genomes)
 9 | export(expand_annotations)
10 | export(plot_annotation)
11 | export(plot_categorical)
12 | export(plot_coannotations)
13 | export(plot_numerical)
14 | export(plot_numerical_coannotations)
15 | export(randomize_regions)
16 | export(read_annotations)
17 | export(read_regions)
18 | export(subset_order_tbl)
19 | export(summarize_annotations)
20 | export(summarize_categorical)
21 | export(summarize_numerical)
22 | export(tidy_annotations)
23 | import(AnnotationDbi, except='select')
24 | import(AnnotationHub, except='query')
25 | import(GenomicFeatures)
26 | import(GenomicRanges, except=c('union','setdiff','intersect','union'))
27 | import(dplyr)
28 | import(ggplot2)
29 | import(methods)
30 | importClassesFrom(GenomeInfoDb,Seqinfo)
31 | importClassesFrom(S4Vectors,Hits)
32 | importClassesFrom(S4Vectors,Rle)
33 | importFrom(GenomeInfoDb,seqlengths)
34 | importFrom(GenomeInfoDb,seqnames)
35 | importFrom(IRanges,IRanges)
36 | importFrom(S4Vectors,endoapply)
37 | importFrom(S4Vectors,splitAsList)
38 | importFrom(readr,read_tsv)
39 | importFrom(regioneR,randomizeRegions)
40 | importFrom(reshape2,melt)
41 | importFrom(rtracklayer,import)
42 | importFrom(rtracklayer,import.bed)
43 | importFrom(stats,as.formula)
44 | importFrom(utils,combn)
45 | importFrom(utils,data)
46 | 


--------------------------------------------------------------------------------
/R/annotatr_data_doc.R:
--------------------------------------------------------------------------------
 1 | #' example_annotations data
 2 | #'
 3 | #' A \code{GRanges} of precomputed annotations for CpG features. Created by doing
 4 | #' \code{build_annotations(genome='hg19', annotations = 'hg19_cpgs')}.
 5 | #'
 6 | #' @format A \code{GRanges} object with the CpG feature annotations for hg19
 7 | #' and containing \code{mcols}:
 8 | #' \describe{
 9 | #'     \item{id}{The internal ID for the annotation}
10 | #'     \item{tx_id}{All NA, since CpG features are not associated with transcript IDs}
11 | #'     \item{gene_id}{All NA, since CpG features are not associated with Entrez IDs}
12 | #'     \item{symbol}{All NA, since CpG features are not associated with gene symbols}
13 | #'     \item{type}{A character indicating the type of annotation. Including:
14 | #' 'hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves', and 'hg19_cpg_inter'.}
15 | #' }
16 | #' @source The AnnotationHub resource for hg19 CpG features.
17 | "annotations"
18 | 


--------------------------------------------------------------------------------
/R/annotatr_package_doc.R:
--------------------------------------------------------------------------------
 1 | #' annotatr: Annotation of Genomic Regions to Functional Annotations
 2 | #'
 3 | #' Given a set of genomic sites/regions (e.g. ChIP-seq peaks, CpGs, differentially methylated CpGs or regions, SNPs, etc.) it is often of interest to investigate the intersecting functional annotations. Such annotations include those relating to gene models (promoters, 5'UTRs, exons, introns, and 3'UTRs), CpGs (CpG islands, CpG shores, CpG shelves), the non-coding genome, and enhancers. The annotatr package provides an easy way to summarize and visualize the intersection of genomic sites/regions with the above functional annotations.
 4 | #'
 5 | #' @docType package
 6 | #' @name annotatr
 7 | #'
 8 | #' @rawNamespace import(AnnotationDbi, except='select')
 9 | #' @rawNamespace import(AnnotationHub, except='query')
10 | #' @import dplyr
11 | #' @import ggplot2
12 | #' @import GenomicFeatures
13 | #' @rawNamespace import(GenomicRanges, except=c('union','setdiff','intersect','union'))
14 | #' @importClassesFrom GenomeInfoDb Seqinfo
15 | #' @importFrom GenomeInfoDb seqnames seqlengths
16 | #' @importFrom IRanges IRanges
17 | #' @importFrom S4Vectors endoapply
18 | #' @importFrom S4Vectors splitAsList
19 | #' @import methods
20 | #' @importFrom readr read_tsv
21 | #' @importFrom reshape2 melt
22 | #' @importFrom regioneR randomizeRegions
23 | #' @importFrom rtracklayer import import.bed
24 | #' @importClassesFrom S4Vectors Hits Rle
25 | #' @importFrom stats as.formula
26 | #' @importFrom utils combn data
27 | NULL
28 | 


--------------------------------------------------------------------------------
/R/intersect.R:
--------------------------------------------------------------------------------
 1 | #' A function to intersect user region data with annotation data
 2 | #'
 3 | #' Annotate genomic regions to selected genomic annotations while preserving the data associated with the genomic regions. Each overlapping region/annotation pair produces one element of the result, so a region that overlaps several annotations is repeated.
 4 | #'
 5 | #' @param regions A \code{GRanges} object (or an object extending \code{GRanges}), typically the one returned by \code{read_regions()}.
 6 | #' @param annotations A \code{GRanges} object (or an object extending \code{GRanges}) of annotations, typically the one returned by \code{build_annotations()}. The annotation codes that \code{build_annotations()} accepts are listed with \code{builtin_annotations()}; shortcuts such as "basicgenes" and "cpgs" need to be prefixed by the genome, e.g. \code{hg19_basicgenes}.
 7 | #' Custom annotations read with \code{read_annotations()} are stored under the name \code{[genome]_custom_[name]}; include that name in the call to \code{build_annotations()} so that they are part of the object passed here.
 8 | #' @param minoverlap A scalar, positive integer, indicating the minimum required overlap of regions with annotations. Passed to \code{GenomicRanges::findOverlaps()}.
 9 | #' @param ignore.strand Logical passed to \code{GenomicRanges::findOverlaps()}: if \code{TRUE} strand is ignored when finding overlaps, if \code{FALSE} it is taken into account. Default TRUE.
10 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
11 | #'
12 | #' @return A \code{GRanges} where the \code{granges} are from the regions, and the \code{mcols} include the \code{mcols} from the regions and a column \code{annot} with the annotation \code{GRanges}. An error is raised if no annotation intersects the regions.
13 | #'
14 | #' @examples
15 | #'    r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
16 | #'    extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
17 | #'    r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage')
18 | #'
19 | #'    # Get premade CpG annotations
20 | #'    data('annotations', package = 'annotatr')
21 | #'
22 | #'    a = annotate_regions(
23 | #'        regions = r,
24 | #'        annotations = annotations,
25 | #'        ignore.strand = TRUE)
26 | #'
27 | #' @export
28 | annotate_regions = function(regions, annotations, minoverlap = 1L, ignore.strand = TRUE, quiet = FALSE) {
29 |     # Checks before moving forward. methods::is() is used instead of comparing
30 |     # class(x)[1] so that objects extending GRanges are accepted as well.
31 |     if(!methods::is(regions, "GRanges")) {
32 |         stop('Error in annotate_regions(...): regions object is not GRanges.')
33 |     }
34 |     if(!methods::is(annotations, "GRanges")) {
35 |         stop('Error in annotate_regions(...): annotations object is not GRanges. Use build_annotations(...) to construct the annotations before calling annotate_regions(...).')
36 |     }
37 | 
38 |     # Perform the intersections
39 |     if(!quiet) {
40 |         message('Annotating...')
41 |     }
42 | 
43 |     intersections = GenomicRanges::findOverlaps(regions, annotations, minoverlap = minoverlap, ignore.strand = ignore.strand)
44 | 
45 |     # An empty set of overlaps is reported as an error rather than returned
46 |     if(length(intersections) == 0) {
47 |         stop('No annotations intersect the regions.')
48 |     }
49 |     gr = regions[S4Vectors::queryHits(intersections)]
50 |     GenomicRanges::mcols(gr)$annot = annotations[S4Vectors::subjectHits(intersections)]
51 |     return(gr)
52 | }
53 | 


--------------------------------------------------------------------------------
/R/randomize.R:
--------------------------------------------------------------------------------
 1 | #' Randomize Regions
 2 | #'
 3 | #' \code{randomize_regions} is a wrapper function for \code{regioneR::randomizeRegions()} that simplifies the creation of randomized regions for an input set of regions read with \code{read_regions()}. It uses the genome recorded on \code{regions} to look up the chromosome lengths with \code{GenomeInfoDb::Seqinfo(genome = ...)} in order to build the appropriate \code{genome} object for \code{regioneR::randomizeRegions()}.
 4 | #'
 5 | #' NOTE: The data associated with the input \code{regions} are not passed on to the random regions.
 6 | #'
 7 | #' @param regions A \code{GRanges} object (or an object extending \code{GRanges}) from \code{read_regions}. It must carry exactly one, non-NA genome.
 8 | #' @param allow.overlaps A logical stating whether random regions can overlap input regions (TRUE) or not (FALSE). Default TRUE.
 9 | #' @param per.chromosome A logical stating whether the random regions should remain on the same chromosome (TRUE) or not (FALSE). Default TRUE.
10 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
11 | #'
12 | #' @return A sorted \code{GRanges} object of randomized regions based on \code{regions} from \code{read_regions()}. NOTE: Data associated with the original regions is not attached to the randomized regions.
13 | #'
14 | #' @examples
15 | #'    # Create random region set based on ENCODE ChIP-seq data
16 | #'    file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
17 | #'    r = read_regions(con = file, genome = 'hg19')
18 | #'
19 | #'    random_r = randomize_regions(regions = r)
20 | #'
21 | #' @export
22 | randomize_regions = function(regions, allow.overlaps = TRUE, per.chromosome = TRUE, quiet = FALSE) {
23 | 
24 |     ########################################################################
25 |     # Argument parsing and error handling
26 |     if(!methods::is(regions, "GRanges")) {
27 |         stop('Error: regions must have class GRanges. The best way to ensure this is to pass the result of read_regions() into this function.')
28 |     }
29 | 
30 |     # Get the genome from the regions. unique() may yield zero values or several
31 |     # (mixed genomes), so require exactly one non-NA value before using it in if().
32 |     genome = unique(GenomeInfoDb::genome(regions))
33 |     if(length(genome) != 1 || is.na(genome)) {
34 |         stop('Error: regions GRanges object must have a valid genome to randomize its regions.')
35 |     }
36 | 
37 |     # Look up the chromosome lengths for that genome and describe each
38 |     # chromosome as one interval from 1 to its length
39 |     chr_lengths = GenomeInfoDb::seqlengths(GenomeInfoDb::Seqinfo(genome = genome))
40 |     df_genome = data.frame(
41 |         'chr' = names(chr_lengths),
42 |         'start' = rep.int(1, length(chr_lengths)),
43 |         'end' = as.numeric(chr_lengths),
44 |         stringsAsFactors = FALSE)
45 | 
46 |     if(!quiet) {
47 |         message('Randomizing regions...')
48 |     }
49 | 
50 |     # Randomize the regions
51 |     randomized = regioneR::randomizeRegions(A = regions, genome = df_genome,
52 |         per.chromosome = per.chromosome, allow.overlaps = allow.overlaps)
53 | 
54 |     # Sort the randomized
55 |     randomized = sort(randomized)
56 | 
57 |     return(randomized)
58 | }
59 | 


--------------------------------------------------------------------------------
/R/read.R:
--------------------------------------------------------------------------------
  1 | #' Read genomic regions in BEDX+Y format
  2 | #'
  3 | #' \code{read_regions()} imports genomic regions through \code{rtracklayer::import()}. BEDX files from BED3 to BED6 are handled automatically; for BED6+Y files, describe the additional columns with the \code{extraCols} argument so that they are interpreted correctly.
  4 | #'
  5 | #' NOTE: The 4th and 5th BED columns are imported under the names \code{name} and \code{score}. If these columns have a particular meaning for your data, rename them with the \code{rename_name} and/or \code{rename_score} parameters.
  6 | #'
  7 | #' @param con A path, URL, connection or BEDFile object. See the \code{rtracklayer::import()} documentation.
  8 | #' @param genome Passed to \code{rtracklayer::import()}: the identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available.
  9 | #' @param format Passed to \code{rtracklayer::import()} only when supplied: the file format, one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'.
 10 | #' @param extraCols Passed to \code{rtracklayer::import()}: a character vector in the same form as 'colClasses' from 'read.table', giving the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats.
 11 | #' @param rename_name A string replacing the column name \code{name}, e.g. when that column actually holds a categorical variable. A warning is issued if there is no \code{name} column.
 12 | #' @param rename_score A string replacing the column name \code{score}, e.g. when that column holds a quantity other than the score of the BED specification. A warning is issued if there is no \code{score} column.
 13 | #' @param ... Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.
 14 | #'
 15 | #' @return A \code{GRanges} object.
 16 | #'
 17 | #' @examples
 18 | #'    # Example of reading a BED6+3 file where the last 3 columns are non-standard
 19 | #'    file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
 20 | #'    extraCols = c(diff_meth = 'numeric', mu0 = 'numeric', mu1 = 'numeric')
 21 | #'    gr = read_regions(con = file, genome = 'hg19', extraCols = extraCols, format = 'bed',
 22 | #'        rename_name = 'DM_status', rename_score = 'pval')
 23 | #'
 24 | #' @export
 25 | read_regions = function(con, genome = NA, format, extraCols = character(), rename_name, rename_score, ...) {
 26 |     # Forward format only when the caller supplied it
 27 |     gr = if(missing(format)) {
 28 |         rtracklayer::import(con = con, genome = genome, extraCols = extraCols, ...)
 29 |     } else {
 30 |         rtracklayer::import(con = con, genome = genome, format = format, extraCols = extraCols, ...)
 31 |     }
 32 | 
 33 |     # Relabel the name and/or score metadata columns when the user asks for it
 34 |     if(!missing(rename_name)) {
 35 |         is_name = colnames(GenomicRanges::mcols(gr)) == 'name'
 36 |         if(!any(is_name)) {
 37 |             warning('Ignoring rename_name parameter because con has no name column.')
 38 |         } else {
 39 |             colnames(GenomicRanges::mcols(gr))[is_name] = rename_name
 40 |         }
 41 |     }
 42 |     if(!missing(rename_score)) {
 43 |         is_score = colnames(GenomicRanges::mcols(gr)) == 'score'
 44 |         if(!any(is_score)) {
 45 |             warning('Ignoring rename_score parameter because con has no score column.')
 46 |         } else {
 47 |             colnames(GenomicRanges::mcols(gr))[is_score] = rename_score
 48 |         }
 49 |     }
 50 |     return(gr)
 51 | }
 52 | 
 53 | #' Read custom annotations
 54 | #'
 55 | #' \code{read_annotations()} is a wrapper for the \code{rtracklayer::import()} function that creates a \code{GRanges} object matching the structure of annotations built with \code{build_annotations()}. The structure is defined by \code{GRanges}, with the \code{mcols()} with names \code{c('id','tx_id','gene_id','symbol','type')}.
 56 | #'
 57 | #' @param con A path, URL, connection or BEDFile object. See \code{rtracklayer::import.bed()} documentation.
 58 | #' @param name A string for the name of the annotations to be used in the name of the object, [genome]_custom_[name]. If missing, \code{'annotations'} is used.
 59 | #' @param genome From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available. When NA, the literal \code{'genome'} takes the place of [genome] in the object name.
 60 | #' @param format From \code{rtracklayer::import()}: The format of the file being read. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'.
 61 | #' @param extraCols From \code{rtracklayer::import.bed()}: A character vector in the same form as 'colClasses' from 'read.table'.  It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats. Only extra columns named \code{gene_id}, \code{symbol} or \code{tx_id} are kept in the result; any of these that is not given is filled with NA.
 62 | #' @param ... Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.
 63 | #'
 64 | #' @return Called for its side effect: a \code{GRanges} object is stored in \code{annotatr_cache} under the name \code{'[genome]_custom_[name]'}. To view a custom annotation, do \code{annotatr_cache$get('[genome]_custom_[name]')}. To add a custom annotation to the set of annotations, include \code{'[genome]_custom_[name]'} in the call to \code{build_annotations()}. See example below.
 65 | #'
 66 | #' @examples
 67 | #'
 68 | #'  # Read in a BED3 file as a custom annotation
 69 | #'  file = system.file('extdata', 'test_annotations_3.bed', package='annotatr')
 70 | #'  read_annotations(con = file, name = 'test', genome = 'hg19')
 71 | #'  build_annotations(genome = 'hg19', annotations = 'hg19_custom_test')
 72 | #'
 73 | #'  print(annotatr_cache$get('hg19_custom_test'))
 74 | #'
 75 | #' @export
 76 | read_annotations = function(con, name, genome = NA, format, extraCols = character(), ...) {
 77 | 
 78 |     # Fall back to a generic name when none is supplied
 79 |     if(missing(name)) {
 80 |         name = 'annotations'
 81 |     }
 82 |     # 'genome' is the placeholder used in the annotation type when genome is NA
 83 |     if(is.na(genome)) {
 84 |         genome_name = 'genome'
 85 |     } else {
 86 |         genome_name = genome
 87 |     }
 88 |     annot_type = sprintf('%s_custom_%s', genome_name, name)
 89 | 
 90 |     # Forward format only when the caller supplied it
 91 |     if(!missing(format)) {
 92 |         gr = rtracklayer::import(con = con, genome = genome, format = format, extraCols = extraCols, ...)
 93 |     } else {
 94 |         gr = rtracklayer::import(con = con, genome = genome, extraCols = extraCols, ...)
 95 |     }
 96 | 
 97 |     # Add an all-NA column for every protected column (gene_id, symbol, tx_id)
 98 |     # that was not read through extraCols. Columns are built with an explicit
 99 |     # length so that nothing relies on recycling a length-one value.
100 |     protected_extraCols = c('gene_id','symbol','tx_id')
101 |     for(col in base::setdiff(protected_extraCols, names(extraCols))) {
102 |         GenomicRanges::mcols(gr)[[col]] = rep(NA, length(gr))
103 |     }
104 | 
105 |     # id is '[name]:[index]' and type is '[genome]_custom_[name]'. sprintf() and
106 |     # rep() return zero-length vectors for an import with no ranges, and '%d'
107 |     # always writes the index in plain digits.
108 |     GenomicRanges::mcols(gr)$id = sprintf('%s:%d', name, seq_along(gr))
109 |     GenomicRanges::mcols(gr)$type = rep(annot_type, length(gr))
110 | 
111 |     # Make sure only the desired mcols make it out
112 |     GenomicRanges::mcols(gr) = GenomicRanges::mcols(gr)[,c('id','tx_id','gene_id','symbol','type')]
113 | 
114 |     ########################################################
115 |     # Write the object named [genome]_custom_[name] to the annotatr_cache
116 |     annotatr_cache$set(annot_type, gr)
117 | }
118 | 


--------------------------------------------------------------------------------
/R/summarize.R:
--------------------------------------------------------------------------------
  1 | #' Summarize annotation counts
  2 | #'
  3 | #' Given a \code{GRanges} of annotated regions, count the number of regions in each annotation type. If \code{annotated_random} is supplied and not \code{NULL}, then the same is computed for the random regions.
  4 | #'
  5 | #' If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both.
  6 | #'
  7 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
  8 | #' @param annotated_random Optional. The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. If omitted or \code{NULL}, only \code{annotated_regions} is summarized.
  9 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
 10 | #'
 11 | #' @return A \code{tbl_df} of the number of regions (column \code{n}) per annotation type. When random regions are given, counts are reported per \code{data_type} ('Data' or 'Random Regions') and annotation type.
 12 | #'
 13 | #' @examples
 14 | #'    ### An example of ChIP-seq peaks with signalValue
 15 | #'
 16 | #'    # Get premade CpG annotations
 17 | #'    data('annotations', package = 'annotatr')
 18 | #'
 19 | #'    file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
 20 | #'    r = read_regions(con = file, genome = 'hg19')
 21 | #'
 22 | #'    a = annotate_regions(
 23 | #'        regions = r,
 24 | #'        annotations = annotations,
 25 | #'        ignore.strand = TRUE,
 26 | #'        quiet = FALSE)
 27 | #'
 28 | #'    rnd = randomize_regions(regions = r)
 29 | #'
 30 | #'    rnd_annots = annotate_regions(
 31 | #'        regions = rnd,
 32 | #'        annotations = annotations,
 33 | #'        ignore.strand = TRUE,
 34 | #'        quiet = FALSE)
 35 | #'
 36 | #'    # Summarize the annotated regions without randomized regions
 37 | #'    s = summarize_annotations(annotated_regions = a)
 38 | #'
 39 | #'    # Summarize the annotated regions with randomized regions
 40 | #'    s_rnd = summarize_annotations(
 41 | #'        annotated_regions = a,
 42 | #'        annotated_random = rnd_annots)
 43 | #'
 44 | #' @export
 45 | summarize_annotations = function(annotated_regions, annotated_random, quiet = FALSE) {
 46 |     # Tidy a GRanges into a data.frame for use with dplyr functions and, if a
 47 |     # region has multiple annotations of the same annot.type, keep only one row
 48 |     # per region and annotation type
 49 |     tidy_distinct = function(gr) {
 50 |         dplyr::distinct(
 51 |             dplyr::ungroup(as.data.frame(gr, row.names = NULL)),
 52 |             dplyr::across(c('seqnames', 'start', 'end', 'annot.type')), .keep_all=TRUE)
 53 |     }
 54 | 
 55 |     # The documentation allows annotated_random to be NULL, so an explicit NULL
 56 |     # is handled the same way as an omitted argument
 57 |     has_random = !missing(annotated_random) && !is.null(annotated_random)
 58 | 
 59 |     annotated_regions = tidy_distinct(annotated_regions)
 60 | 
 61 |     # Tally over data and random regions when random regions were supplied,
 62 |     # otherwise tally over data only
 63 |     if(has_random) {
 64 |         annotated_random = tidy_distinct(annotated_random)
 65 | 
 66 |         if(!quiet) {
 67 |             message('Counting annotation types in data and random regions')
 68 |         }
 69 | 
 70 |         # Stack both sets of rows; the data_type column records which set a
 71 |         # row came from
 72 |         combined_annots = dplyr::bind_rows('Data' = annotated_regions, 'Random Regions' = annotated_random, .id = 'data_type')
 73 | 
 74 |         agg = dplyr::tally(
 75 |             dplyr::group_by(combined_annots, dplyr::across(c('data_type', 'annot.type')))
 76 |         )
 77 |     } else {
 78 |         if(!quiet) {
 79 |             message('Counting annotation types')
 80 |         }
 81 | 
 82 |         # Tally over the normal data
 83 |         agg = dplyr::tally(
 84 |             dplyr::group_by(annotated_regions, dplyr::across(c('annot.type')))
 85 |         )
 86 |     }
 87 | 
 88 |     return(agg)
 89 | }
 90 | 
 91 | #' Summarize numerical data over groupings of annotated regions
 92 | #'
 93 | #' Given a \code{GRanges} of annotated regions, summarize numerical data columns based on a grouping.
 94 | #'
 95 | #' NOTE: We do not take the distinct values of \code{seqnames}, \code{start}, \code{end}, \code{annot.type} as in the other \code{summarize_*()} functions because in the case of a region that intersected two distinct exons, using \code{distinct()} would destroy the information of the mean of the numerical column over one of the exons, which is not desirable.
 96 | #'
 97 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
 98 | #' @param by A character vector of the columns of \code{as.data.frame(annotated_regions)} to group over. Default is \code{c(annot.type, annot.id)}.
 99 | #' @param over A character vector of the numerical columns in \code{as.data.frame(annotated_regions)} to \code{count}, take the \code{mean}, and take the \code{sd} over after grouping according to the \code{by} column. NOTE: If more than one value is used, the naming scheme for the resuling \code{dplyr::tbl} summary columns are \code{COLNAME_n}, \code{COLNAME_mean}, \code{COLNAME_sd}. If \code{over} has length one, then the column names are \code{n}, \code{mean}, \code{sd}.
100 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
101 | #'
102 | #' @return A grouped \code{dplyr::tbl_df}, and the \code{count}, \code{mean}, and \code{sd} of the \code{cols} \code{by} the groupings.
103 | #'
104 | #' @examples
105 | #' ### Test on a very simple bed file to demonstrate different options
106 | #'
107 | #' # Get premade CpG annotations
108 | #' data('annotations', package = 'annotatr')
109 | #'
110 | #' r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
111 | #' extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
112 | #' r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage')
113 | #'
114 | #' a = annotate_regions(
115 | #'        regions = r,
116 | #'        annotations = annotations,
117 | #'        ignore.strand = TRUE)
118 | #'
119 | #' # Testing over normal by
120 | #' sn1 = summarize_numerical(
121 | #'        annotated_regions = a,
122 | #'        by = c('annot.type', 'annot.id'),
123 | #'        over = c('coverage', 'mu1', 'mu0'),
124 | #'        quiet = FALSE)
125 | #'
126 | #' # Testing over a different by
127 | #' sn2 = summarize_numerical(
128 | #'        annotated_regions = a,
129 | #'        by = c('diff_exp'),
130 | #'        over = c('coverage', 'mu1', 'mu0'))
131 | #'
132 | #' @export
133 | summarize_numerical = function(annotated_regions, by = c('annot.type', 'annot.id'), over, quiet = FALSE) {
134 |     # Fail fast, before the GRanges -> data.frame coercion is paid for
135 |     if(missing(over)) {
136 |         stop("Error: over cannot be missing.")
137 |     }
138 |     annotated_regions = as.data.frame(annotated_regions, row.names = NULL)
139 | 
140 |     if(!quiet) {
141 |         message(sprintf('Grouping regions by %s, and summarizing numerical data over %s',
142 |             paste(by, collapse=' & '), paste(over, collapse=' & ')))
143 |     }
144 | 
145 |     # A named list replaces the deprecated dplyr::funs() and yields the same
146 |     # n / mean / sd (or COLNAME_n, ...) columns; all_of() selects `by` by name
147 |     return(dplyr::summarize_at(
148 |         dplyr::group_by(annotated_regions, dplyr::across(dplyr::all_of(by))),
149 |         over,
150 |         list(n = ~ dplyr::n(), mean = mean, sd = stats::sd)))
151 | }
152 | 
153 | #' Summarize categorical data over groupings of annotated regions
154 | #'
155 | #' Given a \code{GRanges} of annotated regions, count the number of regions when the annotations are grouped \code{by} categorical columns.
156 | #'
157 | #' If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both.
158 | #'
159 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
160 | #' @param by A character vector to group the data in \code{as.data.frame(annotated_regions)} by and tally over. Default is \code{c('annot.type', 'annot.id')}.
161 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
162 | #'
163 | #' @return A grouped \code{dplyr::tbl_df} of the counts of groupings according to the \code{by} vector.
164 | #'
165 | #' @examples
166 | #'
167 | #'    # Get premade CpG annotations
168 | #'    data('annotations', package = 'annotatr')
169 | #'
170 | #'    r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
171 | #'    extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
172 | #'    r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage')
173 | #'
174 | #'    a = annotate_regions(
175 | #'        regions = r,
176 | #'        annotations = annotations,
177 | #'        ignore.strand = TRUE)
178 | #'
179 | #'    sc = summarize_categorical(
180 | #'        annotated_regions = a,
181 | #'        by = c('annot.type', 'name'),
182 | #'        quiet = FALSE)
183 | #'
184 | #' @export
185 | summarize_categorical = function(annotated_regions, by = c('annot.type', 'annot.id'), quiet = FALSE) {
186 |     # Tidy the GRanges into a data.frame for use with dplyr functions
187 |     annotated_regions = as.data.frame(annotated_regions, row.names = NULL)
188 | 
189 |     # A region hitting the same grouping several times must be tallied once,
190 |     # so keep one row per (region coordinates, grouping) combination.
191 |     # all_of() makes the by-name column selection explicit and strict.
192 |     region_keys = c('seqnames', 'start', 'end', by)
193 |     annotated_regions = dplyr::distinct(
194 |         dplyr::ungroup(annotated_regions),
195 |         dplyr::across(dplyr::all_of(region_keys)), .keep_all=TRUE)
196 | 
197 |     if(!quiet) {
198 |         message(sprintf('Grouping regions by %s, and tallying',
199 |             paste(by, collapse=' & ')))
200 |     }
201 | 
202 |     agg = dplyr::tally(
203 |         dplyr::group_by(annotated_regions, dplyr::across(dplyr::all_of(by))))
204 |     return(agg)
205 | }
206 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | ### Constants
  2 | # TxDb.* family of packages; get_txdb_name() returns the entries that contain a given genome string
  3 | TXDBS = c(
  4 |     'TxDb.Dmelanogaster.UCSC.dm3.ensGene',
  5 |     'TxDb.Dmelanogaster.UCSC.dm6.ensGene',
  6 |     'TxDb.Ggallus.UCSC.galGal5.refGene',
  7 |     'TxDb.Hsapiens.UCSC.hg19.knownGene',
  8 |     'TxDb.Hsapiens.UCSC.hg38.knownGene',
  9 |     'TxDb.Mmusculus.UCSC.mm9.knownGene',
 10 |     'TxDb.Mmusculus.UCSC.mm10.knownGene',
 11 |     'TxDb.Rnorvegicus.UCSC.rn4.ensGene',
 12 |     'TxDb.Rnorvegicus.UCSC.rn5.refGene',
 13 |     'TxDb.Rnorvegicus.UCSC.rn6.refGene')
 14 | 
 15 | # org.* family of packages: genome assembly -> organism code (e.g. hg19 -> Hs); builtin_genomes() returns the genome column
 16 | ORGDBS = data.frame(
 17 |     genome = c('dm3','dm6','galGal5','hg19','hg38','mm9','mm10','rn4','rn5','rn6'),
 18 |     org = c('Dm','Dm','Gg','Hs','Hs','Mm','Mm','Rn','Rn','Rn'),
 19 |     stringsAsFactors = FALSE)
 20 | # Cell lines that builtin_annotations() combines with the chromatin state names
 21 | HMMCELLLINES = c('Gm12878','H1hesc','Hepg2','Hmec','Hsmm','Huvec','K562','Nhek','Nhlf')
 22 | # chromHMM state codes; reformat_hmm_codes() strips the leading number and the underscores
 23 | HMMCODES = c('1_Active_Promoter', '2_Weak_Promoter' ,'3_Poised_Promoter' ,'4_Strong_Enhancer', '5_Strong_Enhancer', '6_Weak_Enhancer', '7_Weak_Enhancer', '8_Insulator', '9_Txn_Transition', '10_Txn_Elongation', '11_Weak_Txn', '12_Repressed', '13_Heterochrom/lo', '14_Repetitive/CNV')
 24 | 
 25 | #' Function to recode classes from chromHMM type column
 26 | #'
 27 | #' @param hmm_codes A character vector of chromHMM codes in the original form from UCSC Genome Browser track, e.g. \code{'1_Active_Promoter'}.
 28 | #'
 29 | #' @return A character vector, as long as \code{hmm_codes}, of chromHMM classes with the leading number and the underscores removed.
 30 | reformat_hmm_codes = function(hmm_codes) {
 31 |     # vapply(), unlike sapply(), returns character(0) rather than list() for empty input
 32 |     return(vapply(hmm_codes,
 33 |             function(hmm){paste(unlist(strsplit(hmm,'_'))[-1],collapse='')},
 34 |             character(1), USE.NAMES=FALSE))
 35 | }
 36 | 
 37 | #' Extract the cell line from a chromatin annotation shortcut
 38 | #'
 39 | #' @param shortcut The annotation shortcut (e.g. \code{'hg19_Gm12878-chromatin'}), used in \code{build_annotations()}.
 40 | #' @return A string of the cell line used in a chromatin annotation shortcut
 41 | get_cellline_from_shortcut = function(shortcut) {
 42 |     second_field = unlist(strsplit(shortcut, '_'))[2]
 43 |     return(unlist(strsplit(second_field, '-'))[1])
 44 | }
 45 | 
 46 | #' Extract the cell line from a chromatin annotation code
 47 | #'
 48 | #' @param code The annotation code (e.g. \code{'hg19_chromatin_Gm12878-ActivePromoter'}), used in \code{build_annotations()}.
 49 | #' @return A string of the cell line used in a chromatin annotation code
 50 | get_cellline_from_code = function(code) {
 51 |     third_field = unlist(strsplit(code, '_'))[3]
 52 |     return(unlist(strsplit(third_field, '-'))[1])
 53 | }
 54 | 
 55 | #' Function listing which annotations are available.
 56 | #'
 57 | #' The listing covers the full annotation codes as well as the shortcut codes;
 58 | #' the \code{expand_annotations()} function helps handle the shortcuts.
 59 | #'
 60 | #' @return A character vector of available annotations.
 61 | #'
 62 | #' @examples
 63 | #' builtin_annotations()
 64 | #'
 65 | #' @export
 66 | builtin_annotations = function() {
 67 |     # Paste together every combination of the given parts. Combinations are
 68 |     # enumerated as in expand.grid(): the first part varies fastest.
 69 |     combine = function(..., sep) {
 70 |         grid = expand.grid(..., stringsAsFactors = FALSE)
 71 |         apply(grid, 1, paste, collapse = sep)
 72 |     }
 73 | 
 74 |     genomes = annotatr::builtin_genomes()
 75 | 
 76 |     # Gene annotations: available for every built-in genome
 77 |     gene_types = c(
 78 |         '1to5kb', 'promoters', 'cds', '5UTRs', 'exons', 'firstexons',
 79 |         'introns', 'intronexonboundaries', 'exonintronboundaries',
 80 |         '3UTRs', 'intergenic')
 81 |     gene_codes = combine(genomes, 'genes', gene_types, sep = '_')
 82 |     gene_shortcut_codes = combine(genomes, 'basicgenes', sep = '_')
 83 | 
 84 |     # CpG annotations: every built-in genome except dm3 and dm6
 85 |     cpg_genomes = base::setdiff(genomes, c('dm3','dm6'))
 86 |     cpg_types = c('islands', 'shores', 'shelves', 'inter')
 87 |     cpg_codes = combine(cpg_genomes, 'cpg', cpg_types, sep = '_')
 88 |     cpg_shortcut_codes = combine(cpg_genomes, 'cpgs', sep = '_')
 89 | 
 90 |     # Chromatin state annotations: hg19 only, one code per cell line and
 91 |     # state. State names are the HMM codes with numbers and underscores
 92 |     # removed, de-duplicated.
 93 |     chromatin_states = unique(reformat_hmm_codes(HMMCODES))
 94 |     chromatin_ends = combine(HMMCELLLINES, chromatin_states, sep = '-')
 95 |     chromatin_codes = combine('hg19', 'chromatin', chromatin_ends, sep = '_')
 96 |     chromatin_shortcut_ends = combine(HMMCELLLINES, 'chromatin', sep = '-')
 97 |     chromatin_shortcut_codes = paste('hg19', chromatin_shortcut_ends, sep='_')
 98 | 
 99 |     # Enhancer and lncRNA annotations are listed explicitly
100 |     enhancer_codes = c(
101 |         'hg19_enhancers_fantom',
102 |         'hg38_enhancers_fantom',
103 |         'mm9_enhancers_fantom',
104 |         'mm10_enhancers_fantom')
105 |     lncrna_codes = c(
106 |         'hg19_lncrna_gencode',
107 |         'hg38_lncrna_gencode',
108 |         'mm10_lncrna_gencode')
109 | 
110 |     # Create the big vector of supported annotations: the full codes come
111 |     # first, the shortcut codes last
112 |     annots = c(
113 |         gene_codes, cpg_codes, chromatin_codes, enhancer_codes, lncrna_codes,
114 |         gene_shortcut_codes, cpg_shortcut_codes, chromatin_shortcut_codes)
115 | 
116 |     return(annots)
117 | }
118 | 
119 | #' Function returning supported TxDb.* genomes
120 | #'
121 | #' @return A character vector of the supported genomes, i.e. the \code{genome} column of the org.* lookup table.
122 | #'
123 | #' @examples
124 | #' builtin_genomes()
125 | #'
126 | #' @export
127 | builtin_genomes = function() {
128 |     ORGDBS[['genome']]
129 | }
130 | 
131 | #' Function to get correct TxDb.* package name based on genome
132 | #'
133 | #' @param genome A string giving the genome assembly; one of \code{builtin_genomes()}.
134 | #'
135 | #' @return A string giving the name of the correct TxDb.* package name based on \code{genome}.
136 | get_txdb_name = function(genome = annotatr::builtin_genomes()) {
137 |     # match.arg() validates genome against the choices in the default value
138 |     genome = match.arg(genome)
139 | 
140 |     # Keep the package names in which the genome string occurs
141 |     is_hit = grepl(genome, TXDBS)
142 |     return(TXDBS[is_hit])
143 | }
144 | 
145 | #' Function to get correct org.* package name based on genome
146 | #'
147 | #' @param genome A string giving the genome assembly; one of \code{builtin_genomes()}.
148 | #'
149 | #' @return A string giving the correct org for org.db packages. e.g. hg19 -> Hs.
150 | get_orgdb_name = function(genome = annotatr::builtin_genomes()) {
151 |     # match.arg() validates genome against the choices in the default value
152 |     genome = match.arg(genome)
153 | 
154 |     # Read the organism code off the lookup table row(s) for this genome
155 |     genome_rows = ORGDBS$genome == genome
156 |     return(ORGDBS$org[genome_rows])
157 | }
158 | 
159 | #' Function to tidy up annotation accessors for visualization
160 | #'
161 | #' @param annotations A character vector of annotations, in the order they are to appear in the visualization.
162 | #'
163 | #' @return A named list mapping names ready for visualization (the list names) to the original annotation names (the list values), i.e. the form accepted by \code{levels<-}.
164 | #' @export
165 | tidy_annotations = function(annotations) {
166 |     # Map one annotation code (genome_type_subtype) to its display label
167 |     tidy_one = function(a) {
168 |         tokens = unlist(strsplit(a,'_'))
169 |         if(tokens[2] == 'cpg') {
170 |             if(tokens[3] == 'inter') {
171 |                 return('interCGI')
172 |             } else {
173 |                 return(paste('CpG', tokens[3]))
174 |             }
175 |         } else if (tokens[2] == 'genes') {
176 |             if(tokens[3] == 'firstexons') {
177 |                 return('first exons')
178 |             } else if (tokens[3] == 'intronexonboundaries') {
179 |                 return('intron/exon boundaries')
180 |             } else if (tokens[3] == 'exonintronboundaries') {
181 |                 return('exon/intron boundaries')
182 |             } else {
183 |                 return(tokens[3])
184 |             }
185 |         } else if (tokens[2] == 'enhancers') {
186 |             return('enhancers')
187 |         } else if (tokens[2] %in% c('chromatin', 'custom')) {
188 |             return(tokens[3])
189 |         } else if (tokens[2] == 'lncrna') {
190 |             return('GENCODE lncRNA')
191 |         } else {
192 |             return(sprintf('%s %s', tokens[2], tokens[3]))
193 |         }
194 |     }
195 | 
196 |     # vapply() keeps the result a character vector even for empty input, and
197 |     # USE.NAMES = FALSE stops pre-existing names on `annotations` leaking through
198 |     tidy = vapply(annotations, tidy_one, character(1), USE.NAMES = FALSE)
199 | 
200 |     return(as.list(stats::setNames(as.character(annotations), tidy)))
201 | }
202 | 
203 | #' Function to check for valid annotations
204 | #'
205 | #' Gives errors if any annotations are not in builtin_annotations() (and they are not custom annotations), or the genome prefixes are not the same for all annotations.
206 | #'
207 | #' @param annotations A character vector of annotations possibly using the shortcuts
208 | #' @return If all the checks on the annotations pass, returns NULL to allow code to move forward.
209 | check_annotations = function(annotations) {
210 |     # Pull out any custom annotations (anything mentioning 'custom'): they
211 |     # are exempt from the check against the built-in annotations
212 |     custom_annotations = grep('custom', annotations, value = TRUE)
213 |     builtin_candidates = base::setdiff(annotations, custom_annotations)
214 | 
215 |     # Check that the annotations are supported, tell the user which are unsupported
216 |     unsupported = base::setdiff(builtin_candidates, annotatr::builtin_annotations())
217 |     if( length(unsupported) > 0 ) {
218 |         stop(sprintf('Error: "%s" is(are) not supported. See builtin_annotations().',
219 |             paste(unsupported, collapse=', ')))
220 |     }
221 | 
222 |     # Custom annotations take part in the genome prefix check; leaving them
223 |     # out would fail the check when there are only custom annotations
224 |     all_annotations = c(custom_annotations, builtin_candidates)
225 |     genomes = sapply(all_annotations, function(a){
226 |         unlist(strsplit(a, '_'))[1]
227 |     }, USE.NAMES = FALSE)
228 | 
229 |     # Check for same genome on all annotations
230 |     n_genomes = length(unique(genomes))
231 |     if( n_genomes != 1 ){
232 |         stop('Error: genome prefix on all annotations must be the same.')
233 |     }
234 | 
235 |     return(NULL)
236 | }
237 | 
238 | #' Function to expand annotation shortcuts
239 | #'
240 | #' @param annotations A character vector of annotations, possibly using the shortcut accessors (codes ending in \code{_basicgenes}, \code{_cpgs}, or \code{-chromatin}).
241 | #'
242 | #' @return A vector of data accession-ized names that are ordered from upstream to downstream in the case of knownGenes and islands to interCGI in the case of cpgs.
243 | #' @export
244 | expand_annotations = function(annotations) {
245 |     # Shortcuts are recognised by their suffix. Anchoring the patterns keeps
246 |     # annotations that merely contain these words (e.g. a custom annotation
247 |     # named 'hg19_custom_cpgsites') from being expanded and dropped.
248 |     is_basicgenes = grepl('_basicgenes$', annotations)
249 |     is_cpgs = grepl('_cpgs$', annotations)
250 |     is_hmms = grepl('-chromatin$', annotations)
251 |     is_shortcut = is_basicgenes | is_cpgs | is_hmms
252 | 
253 |     # Nothing to expand
254 |     if(!any(is_shortcut)) {
255 |         return(annotations)
256 |     }
257 | 
258 |     # expand_annotations() is meant to be run after check_annotations(), so
259 |     # the genome prefix is expected to be the same for all annotations.
260 |     genome = unique( sapply(annotations, function(a){ unlist(strsplit(a, '_'))[1] }, USE.NAMES = FALSE) )
261 | 
262 |     # Expanded codes: CpG annotations first, then genes, then chromatin states
263 |     new_annotations = c()
264 |     if(any(is_cpgs)) {
265 |         new_annotations = paste(genome, 'cpg', c('islands','shores','shelves','inter'), sep='_')
266 |     }
267 |     if(any(is_basicgenes)) {
268 |         new_annotations = c(new_annotations, paste(genome, 'genes', c('1to5kb','promoters','5UTRs','exons','introns','3UTRs'), sep='_'))
269 |     }
270 |     if(any(is_hmms)) {
271 |         # Could conceivably use shortcuts for multiple cell lines
272 |         cell_lines = sapply(annotations[is_hmms], get_cellline_from_shortcut, USE.NAMES = FALSE)
273 | 
274 |         new_hmm_codes = apply(
275 |             expand.grid(cell_lines, unique(reformat_hmm_codes(HMMCODES)), stringsAsFactors = FALSE),
276 |             1, paste, collapse='-')
277 | 
278 |         new_annotations = c(new_annotations, paste(genome, 'chromatin', new_hmm_codes, sep='_'))
279 |     }
280 | 
281 |     # Drop the shortcuts themselves and append what they expand to
282 |     return(base::setdiff(c(annotations, new_annotations), annotations[is_shortcut]))
283 | }
284 | 
285 | #' Function to subset a tbl_df or grouped_df by a column
286 | #'
287 | #' @param tbl A \code{tbl_df} or \code{grouped_df}.
288 | #' @param col A string indicating which column of \code{tbl} to subset and order. If \code{NULL}, \code{tbl} is returned unchanged.
289 | #' @param col_order A character vector indicating the order of \code{col}. Rows whose value is not listed are dropped. If \code{NULL}, the order in \code{tbl} is used.
290 | #'
291 | #' @return A modified version of \code{tbl} with \code{col} subsetted by \code{col_order} and converted to a factor with levels in that order.
292 | #' @export
293 | subset_order_tbl = function(tbl, col, col_order) {
294 |     if(is.null(col)) {
295 |         return(tbl)
296 |     }
297 | 
298 |     # Collect all types in the column
299 |     all_col_names = unique(tbl[[col]])
300 | 
301 |     # Inherit col_order from the order in tbl
302 |     if(is.null(col_order)) {
303 |         col_order = all_col_names
304 |     }
305 | 
306 |     if( !all(col_order %in% all_col_names) ) {
307 |         # Drop requested values absent from the column (possible 0 tallies)
308 |         col_order = intersect(col_order, all_col_names)
309 |         warning('There are elements in col_order that are not present in the corresponding column. Check for typos, or this could be a result of 0 tallies.')
310 |     }
311 |     # Keep only the rows listed in col_order, so that no value turns into an NA
312 |     # level. Direct indexing cannot be misled by columns named 'tbl' or 'col'.
313 |     tbl = tbl[tbl[[col]] %in% col_order, , drop = FALSE]
314 | 
315 |     # Convert the column to a factor with levels in the requested order,
316 |     # and to tidy level names if the column holds annotation types
317 |     tbl[[col]] = factor(tbl[[col]], levels = col_order)
318 |     if(col == 'annot.type') {
319 |         levels(tbl[[col]]) = tidy_annotations(col_order)
320 |     }
321 |     return(tbl)
322 | }
323 | 


--------------------------------------------------------------------------------
/R/visualize.R:
--------------------------------------------------------------------------------
  1 | #' Plot the number of regions per annotation
  2 | #'
  3 | #' Given a \code{GRanges} of annotated regions, plot the number of regions with the corresponding genomic annotations used in \code{annotation_order}. If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once in the corresponding bar plot. For example, if a region were annotated to multiple exons, it would only count once toward the exon bar in the plot, but if it were annotated to an exon and an intron, it would count towards both.
  4 | #'
  5 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
  6 | #' @param annotated_random The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}.
  7 | #' @param annotation_order A character vector which doubles as the subset of annotations desired for the plot as well as the ordering. If \code{NULL}, all annotations are displayed.
  8 | #' @param plot_title A string used for the title of the plot. If missing, no title is displayed.
  9 | #' @param x_label A string used for the x-axis label. If missing, no x-axis label is displayed.
 10 | #' @param y_label A string used for the y-axis label. If missing, no y-axis label is displayed.
 11 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
 12 | #'
 13 | #' @return A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited.
 14 | #'
 15 | #' @examples
 16 | #'    ########################################################################
 17 | #'    # An example of ChIP-seq peaks with signalValue used for score
 18 | #'
 19 | #'    # Get premade CpG annotations
 20 | #'    data('annotations', package = 'annotatr')
 21 | #'
 22 | #'    chip_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
 23 | #'    chip_regions = read_regions(con = chip_bed, genome = 'hg19')
 24 | #'
 25 | #'    chip_rnd = randomize_regions(regions = chip_regions)
 26 | #'
 27 | #'    chip_annots = annotate_regions(
 28 | #'        regions = chip_regions,
 29 | #'        annotations = annotations,
 30 | #'        ignore.strand = TRUE)
 31 | #'
 32 | #'    chip_rnd_annots = annotate_regions(
 33 | #'        regions = chip_rnd,
 34 | #'        annotations = annotations,
 35 | #'        ignore.strand = TRUE)
 36 | #'
 37 | #'    annots_order = c(
 38 | #'        'hg19_cpg_islands',
 39 | #'        'hg19_cpg_shores')
 40 | #'
 41 | #'    p_annots = plot_annotation(annotated_regions = chip_annots,
 42 | #'        annotation_order = annots_order)
 43 | #'    p_annots_rnd = plot_annotation(annotated_regions = chip_annots,
 44 | #'        annotated_random = chip_rnd_annots, annotation_order = annots_order)
 45 | #'
 46 | #' @export
 47 | plot_annotation = function(annotated_regions, annotated_random, annotation_order = NULL,
 48 |     plot_title, x_label, y_label, quiet = FALSE) {
 49 | 
 50 |     # NOTE: quiet is currently unused here; this function emits no progress
 51 |     # messages of its own
 52 | 
 53 |     # missing() has to be asked of the formal argument itself, so record the
 54 |     # answer once for use throughout
 55 |     has_random = !missing(annotated_random)
 56 | 
 57 |     ########################################################################
 58 |     # Prepare the regions
 59 | 
 60 |     # The data and the random regions get identical treatment:
 61 |     #  1. Tidy the GRanges into a data.frame for use with dplyr functions
 62 |     #  2. Order and subset the annotations
 63 |     #  3. If a region has multiple annotations of the same type, count only
 64 |     #     one from each type of annotation
 65 |     prepare_regions = function(regions) {
 66 |         regions = as.data.frame(regions, row.names = NULL)
 67 |         regions = subset_order_tbl(tbl = regions, col='annot.type', col_order=annotation_order)
 68 |         region_keys = c('seqnames', 'start', 'end', 'annot.type')
 69 |         return(dplyr::distinct(
 70 |             dplyr::ungroup(regions),
 71 |             dplyr::across(dplyr::all_of(region_keys)), .keep_all=TRUE))
 72 |     }
 73 | 
 74 |     annotated_regions = prepare_regions(annotated_regions)
 75 | 
 76 |     if(has_random) {
 77 |         # Combine the data.frames in preparation for visualization; the
 78 |         # data_type column records which of the two a row came from
 79 |         annotated_regions = dplyr::bind_rows(
 80 |             "Data" = annotated_regions,
 81 |             "Random Regions" = prepare_regions(annotated_random),
 82 |             .id = 'data_type')
 83 |     }
 84 | 
 85 |     ########################################################################
 86 |     # Construct the plot
 87 | 
 88 |     # Plain bars for the data alone; with random regions, the bars are
 89 |     # filled by data_type and dodged so the two can be compared
 90 |     if(has_random) {
 91 |         bars = geom_bar(aes_string(fill = 'data_type'), position='dodge')
 92 |     } else {
 93 |         bars = geom_bar()
 94 |     }
 95 | 
 96 |     plot = ggplot(annotated_regions, aes_string(x='annot.type')) + bars + theme_bw()
 97 |     if(has_random) {
 98 |         plot = plot + scale_fill_grey()
 99 |     }
100 |     plot = plot +
101 |         theme(axis.text.x = element_text(angle = 30, hjust = 1),
102 |             legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = 'white'))
103 | 
104 |     # Add any user defined labels to the plot if they were supplied;
105 |     # otherwise ggplot() will use defaults
106 |     if(!missing(plot_title)) {
107 |         plot = plot + ggtitle(plot_title)
108 |     }
109 |     if(!missing(x_label)) {
110 |         plot = plot + xlab(x_label)
111 |     }
112 |     if(!missing(y_label)) {
113 |         plot = plot + ylab(y_label)
114 |     }
115 | 
116 |     return(plot)
117 | }
118 | 
119 | #' Plot pair-wise annotations across regions
120 | #'
121 | #' All co-occurring annotations associated with a region are computed and displayed as a heatmap.
122 | #'
123 | #' As with \code{plot_annotation()}, the number in each cell is the number of unique regions annotated to the pair of annotations.
124 | #'
125 | #' For example, if a region is annotated to both a CpG shore and to two different exons simultaneously, the region will only be counted once in the CpG shore / exon cell. NOTE, this same region will count once in both the CpG shore and exon cells on the diagonal.
126 | #'
127 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
128 | #' @param annotation_order A character vector which doubles as the subset of annotations desired for plot as well as the ordering. If \code{NULL}, all annotations are displayed.
129 | #' @param plot_title A string used for the title of the plot. If missing, no plot title label is displayed.
130 | #' @param axes_label A string used for the axis labels. If missing, corresponding variable name used.
131 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
132 | #'
133 | #' @return A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited.
134 | #'
135 | #' @examples
136 | #'    # Get premade CpG annotations
137 | #'    data('annotations', package = 'annotatr')
138 | #'
139 | #'    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
140 | #'    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
141 | #'    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
142 | #'        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
143 | #'    dm_regions = dm_regions[1:1000]
144 | #'
145 | #'    dm_annots = annotate_regions(
146 | #'        regions = dm_regions,
147 | #'        annotations = annotations,
148 | #'        ignore.strand = TRUE)
149 | #'
150 | #'    all_order = c(
151 | #'        'hg19_cpg_islands',
152 | #'        'hg19_cpg_shores',
153 | #'        'hg19_cpg_shelves',
154 | #'        'hg19_cpg_inter')
155 | #'
156 | #'    dm_vs_ca = plot_coannotations(
157 | #'        annotated_regions = dm_annots,
158 | #'        annotation_order = all_order,
159 | #'        axes_label = 'Annotations',
160 | #'        plot_title = 'Co-occurrence of Annotations')
161 | #'
162 | #' @export
163 | plot_coannotations = function(annotated_regions, annotation_order = NULL,
164 |     plot_title, axes_label, quiet = FALSE) {
165 | 
166 |     # Tidy the GRanges into a data.frame for use with dplyr functions
167 |     annotated_regions = as.data.frame(annotated_regions, row.names = NULL)
168 | 
169 |     # Order and subset the annotations
170 |     annotated_regions = subset_order_tbl(tbl = annotated_regions, col='annot.type', col_order=annotation_order)
171 | 
172 |     ########################################################################
173 |     # Find the co-annotations
174 | 
175 |     # Per region, every ordered pair (Var1, Var2) of its annotation types
176 |     region_keys = c('seqnames', 'start', 'end')
177 |     annotation_pairs_by_region = dplyr::do(
178 |         dplyr::group_by(annotated_regions, dplyr::across(dplyr::all_of(region_keys))),
179 |         expand.grid(.$annot.type, .$annot.type, stringsAsFactors = FALSE))
180 | 
181 |     # A region counts once per pair, however many annotations of each type it has
182 |     annotation_pairs_by_region = dplyr::distinct(dplyr::ungroup(annotation_pairs_by_region),
183 |         dplyr::across(dplyr::all_of(c(region_keys, 'Var1', 'Var2'))), .keep_all=TRUE)
184 | 
185 |     # Cross-tabulate the pairs and melt to long form (Var1, Var2, Counts)
186 |     pairwise_annotation_counts = table(annotation_pairs_by_region[['Var1']], annotation_pairs_by_region[['Var2']])
187 |     pac_m = reshape2::melt(pairwise_annotation_counts, value.name = 'Counts')
188 | 
189 |     ########################################################################
190 |     # Construct the plot
191 | 
192 |     # Heatmap of the counts, with each count printed in its cell
193 |     plot = ggplot(pac_m, aes_string('Var1', 'Var2')) +
194 |         geom_raster(aes_string(fill = 'Counts')) +
195 |         geom_text(aes_string(label = 'Counts')) +
196 |         scale_fill_gradient(low = "white", high = "steelblue") +
197 |         theme(axis.text.x = element_text(angle = 30, hjust = 1), axis.text.y = element_text(angle = 30, hjust = 1))
198 | 
199 |     # Add any user defined labels to the plot if they were supplied;
200 |     # otherwise ggplot() will use defaults
201 |     if(!missing(plot_title)) {
202 |         plot = plot + ggtitle(plot_title)
203 |     }
204 |     if(!missing(axes_label)) {
205 |         plot = plot + xlab(axes_label)
206 |         plot = plot + ylab(axes_label)
207 |     }
208 |     return(plot)
209 | }
210 | 
211 | #' Plot numerical data over regions or regions summarized over annotations
212 | #'
213 | #' This function produces either histograms over \code{facet}, or x-y scatterplots over \code{facet}. In the case of histograms over facets, the All distribution (hollow histogram with red outline) is the distribution of \code{x} over all the regions in the data. The facet specific distributions (solid gray) are the distribution of \code{x} over the regions in each facet. For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution, but will count once each in the CpG island and promoter facet distributions.
214 | #'
215 | #' @param annotated_regions A \code{GRanges} returned from \code{annotate_regions()}. If the data is not summarized, the data is at the region level. If it is summarized, it represents the average or standard deviation of the regions by the character vector used for \code{by} in \code{summarize_numerical()}.
216 | #' @param x A string indicating the column of the \code{GRanges} to use for the x-axis.
217 | #' @param y A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.
218 | #' @param facet A string, or character vector of two strings, indicating which categorical variable(s) in the \code{GRanges} to make \code{ggplot2} facets over. When two facets are given, the first entry is the vertical facet and the second entry is the horizontal facet. Default is \code{annot.type}.
219 | #' @param facet_order A character vector, or list of character vectors if \code{facet} has length 2, which gives the order of the facets, and can be used to subset the column in the \code{GRanges} used for the \code{facet}. For example, if \code{facet = 'annot.type'}, then the annotations maybe subsetted to just CpG annotations. Default is \code{NULL}, meaning all annotations in their default order are used.
220 | #' @param bin_width An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is missing.
221 | #' @param plot_title A string used for the title of the plot. If missing, no title is displayed.
222 | #' @param x_label A string used for the x-axis label. If missing, no x-axis label is displayed.
223 | #' @param y_label A string used for the y-axis label. If missing, no y-axis label is displayed.
224 | #' @param legend_facet_label A string used to label the gray bar portion of the legend. Defaults to "x in facet".
225 | #' @param legend_cum_label A string used to label the red outline portion of the legend. Defaults to "All x".
226 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
227 | #'
228 | #' @return A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
229 | #'
230 | #' @examples
231 | #'    # An example with multi-columned data
232 | #'
233 | #'    # Get premade CpG annotations
234 | #'    data('annotations', package = 'annotatr')
235 | #'
236 | #'    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
237 | #'    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
238 | #'    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
239 | #'        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
240 | #'    dm_regions = dm_regions[1:1000]
241 | #'
242 | #'    # Annotate the regions
243 | #'    dm_annots = annotate_regions(
244 | #'        regions = dm_regions,
245 | #'        annotations = annotations,
246 | #'        ignore.strand = TRUE)
247 | #'
248 | #'    # Plot histogram of group 1 methylation rates across the CpG annotations.
249 | #'    # NOTE: Overall distribution (everything in \code{facet_order})
250 | #'    # is plotted in each facet for comparison.
251 | #'    dm_vs_regions_mu1 = plot_numerical(
252 | #'        annotated_regions = dm_annots,
253 | #'        x = 'mu1',
254 | #'        facet = 'annot.type',
255 | #'        facet_order = c('hg19_cpg_islands','hg19_cpg_shores',
256 | #'            'hg19_cpg_shelves','hg19_cpg_inter'),
257 | #'        bin_width = 5,
258 | #'        plot_title = 'Group 1 Methylation over CpG Annotations',
259 | #'        x_label = 'Group 1 Methylation')
260 | #'
261 | #'    # Plot histogram of methylation differences across the annotations
262 | #'    # crossed with DM_status
263 | #'    dm_vs_regions_diffmeth = plot_numerical(
264 | #'        annotated_regions = dm_annots,
265 | #'        x = 'diff_meth',
266 | #'        facet = c('annot.type','DM_status'),
267 | #'        facet_order = list(
268 | #'            c('hg19_genes_promoters','hg19_genes_5UTRs','hg19_cpg_islands'),
269 | #'            c('hyper','hypo','none')),
270 | #'        bin_width = 5,
271 | #'        plot_title = 'Group 0 Region Methylation In Genes',
272 | #'        x_label = 'Methylation Difference')
273 | #'
274 | #'    # Can also use the result of annotate_regions() to plot two numerical
275 | #'    # data columns against each other for each region, and facet by annotations.
276 | #'    dm_vs_regions_annot = plot_numerical(
277 | #'        annotated_regions = dm_annots,
278 | #'        x = 'mu0',
279 | #'        y = 'mu1',
280 | #'        facet = 'annot.type',
281 | #'        facet_order = c('hg19_cpg_islands','hg19_cpg_shores',
282 | #'            'hg19_cpg_shelves','hg19_cpg_inter'),
283 | #'        plot_title = 'Region Methylation: Group 0 vs Group 1',
284 | #'        x_label = 'Group 0',
285 | #'        y_label = 'Group 1')
286 | #'
287 | #'    # Another example, but using differential methylation status as the facets.
288 | #'    dm_vs_regions_name = plot_numerical(
289 | #'        annotated_regions = dm_annots,
290 | #'        x = 'mu0',
291 | #'        y = 'mu1',
292 | #'        facet = 'DM_status',
293 | #'        facet_order = c('hyper','hypo','none'),
294 | #'        plot_title = 'Region Methylation: Group 0 vs Group 1',
295 | #'        x_label = 'Group 0',
296 | #'        y_label = 'Group 1')
297 | #'
298 | #' @export
plot_numerical = function(annotated_regions, x, y, facet = 'annot.type', facet_order = NULL, bin_width=10,
    plot_title, x_label, y_label, legend_facet_label, legend_cum_label, quiet = FALSE) {

    # Plot the numerical column x (histogram) or x against y (scatterplot),
    # faceted over one or two categorical columns of annotated_regions.
    # Returns a ggplot object. `quiet` is accepted for interface compatibility
    # but is not used in this function.

    ########################################################################
    # Check for facet / facet_order mismatches
    if(!(length(facet) %in% c(1, 2))) {
        stop('facet must be a string, or a character vector of two strings.')
    }
    two_facets = length(facet) == 2

    if(two_facets) {
        if(is.null(facet_order)) {
            # No ordering requested for either facet variable.
            # NOTE(review): assumes subset_order_tbl() treats a NULL col_order as
            # "keep everything in existing order", as the facet_order docs state.
            facet_order = list(NULL, NULL)
        } else if(!is.list(facet_order) || length(facet_order) != 2) {
            stop('When facet is of length two, facet_order must be a list giving the order for each facet variable.')
        }
        facet_orders = facet_order
        facet_formula = paste(facet[1], "~", facet[2])
    } else {
        # Wrap so the single- and two-facet cases can share the loop below
        facet_orders = list(facet_order)
        facet_formula = paste("~", facet)
    }

    # Tidy the GRanges into a data.frame for use with dplyr functions
    tbl = as.data.frame(annotated_regions, row.names = NULL)

    ########################################################################
    # Order and subset the data on each facet variable in turn
    sub_tbl = tbl
    for(i in seq_along(facet)) {
        sub_tbl = subset_order_tbl(tbl = sub_tbl, col = facet[i], col_order = facet_orders[[i]])
    }

    ########################################################################
    # Create data objects for plots
    region_cols = c('seqnames', 'start', 'end')

    # A region counts once per facet panel it belongs to, so de-duplicate on the
    # region coordinates plus the facet column(s) actually used. Keying on
    # annot.type regardless of facet would count a region once per annotation
    # type inside, e.g., a DM_status panel.
    facet_cols = unique(c(region_cols, facet))
    facet_data = dplyr::distinct(
        dplyr::ungroup(sub_tbl),
        dplyr::across(dplyr::all_of(facet_cols)),
        .keep_all = TRUE)

    # The overall distribution counts every region once. The facet columns are
    # dropped by exact name so this layer is drawn in every panel; a regex
    # match (matches()) could also drop unrelated columns such as 'seqnames'
    # when facet = 'name'.
    all_data = dplyr::distinct(
        dplyr::select(dplyr::ungroup(tbl), -dplyr::any_of(facet)),
        dplyr::across(dplyr::all_of(region_cols)),
        .keep_all = TRUE)

    ########################################################################
    # Construct the plot
    # Note, data must be dplyr::ungroup()-ed before hand for the proper
    # display of the overall distribution.

    if(missing(y)) {
        if(missing(legend_facet_label)) {
            if(two_facets) {
                legend_facet_label = sprintf('%s in %s x %s', x, facet[1], facet[2])
            } else {
                legend_facet_label = sprintf('%s in %s', x, facet)
            }
        }
        if(missing(legend_cum_label)) {
            legend_cum_label = sprintf('All %s', x)
        }
        # Hollow fill for the overall histogram, gray for the facet histograms
        fill_man = c(NA, 'gray')
        names(fill_man) = c(legend_cum_label, legend_facet_label)

        # Make the base histogram ggplot
        plot =
            # Facet hists are plotted with distinct (seqnames, start, end, facet) combinations
            ggplot(
                data = facet_data,
                aes_string(x=x, y='..density..')) +
            geom_histogram(binwidth=bin_width, aes(fill = legend_facet_label)) +
            facet_wrap( stats::as.formula(facet_formula) ) + # Over the facets
            # All hist is plotted with distinct (seqnames, start, end) combinations
            geom_histogram(
                data = all_data,
                binwidth=bin_width, aes(fill = legend_cum_label, color = 'red')) + # All the data
            theme_bw() +
            scale_fill_manual(values = fill_man) +
            guides(color = 'none') +
            theme(legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = c('red','white')))
    } else {
        # Make the base scatter ggplot
        plot = ggplot(facet_data, aes_string(x=x, y=y)) +
            geom_point(alpha = 1/8, size = 1) +
            facet_wrap( stats::as.formula(facet_formula) ) +
            theme_bw()
    }

    # Add any user defined labels to the plot if they were supplied;
    # otherwise ggplot() will use its defaults
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}
399 | 
400 | #' Plot numerical data occurring in pairs of annotations
401 | #'
402 | #' Plot numerical data associated with regions occurring in \code{annot1}, \code{annot2} and in both. As with \code{plot_numerical()}, the result is a plot of histograms or x-y scatterplots.
403 | #'
404 | #' For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution and once in the CpG island / promoter facet distribution. However, a CpG associated only with a promoter will count once in the All distribution and once in the promoter / promoter distribution.
405 | #'
406 | #' @param annotated_regions A \code{GRanges} returned from \code{annotate_regions()}.
407 | #' @param x A string indicating the column of the \code{GRanges} to use for the x-axis.
408 | #' @param y A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.
409 | #' @param annot1 A string indicating the first annotation type.
410 | #' @param annot2 A string indicating the second annotation type.
411 | #' @param bin_width An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is missing.
412 | #' @param plot_title A string used for the title of the plot. If missing, no title is displayed.
413 | #' @param x_label A string used for the x-axis label. If missing, no x-axis label is displayed.
414 | #' @param y_label A string used for the y-axis label. If missing, no y-axis label is displayed.
415 | #' @param legend_facet_label A string used to label the gray bar portion of the legend. Defaults to "x in annot pair".
416 | #' @param legend_cum_label A string used to label the red outline portion of the legend. Defaults to "All x".
417 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
418 | #'
419 | #' @return A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
420 | #'
421 | #' @examples
422 | #'    # Get premade CpG annotations
423 | #'    data('annotations', package = 'annotatr')
424 | #'
425 | #'    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
426 | #'    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
427 | #'    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
428 | #'        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
429 | #'    dm_regions = dm_regions[1:1000]
430 | #'
431 | #'    dm_annots = annotate_regions(
432 | #'        regions = dm_regions,
433 | #'        annotations = annotations,
434 | #'        ignore.strand = TRUE)
435 | #'
436 | #'    dm_vs_num_co = plot_numerical_coannotations(
437 | #'        annotated_regions = dm_annots,
438 | #'        x = 'mu0',
439 | #'        annot1 = 'hg19_cpg_islands',
440 | #'        annot2 = 'hg19_cpg_shelves',
441 | #'        bin_width = 5,
442 | #'        plot_title = 'Group 0 Perc. Meth. in CpG Islands and Shelves',
443 | #'        x_label = 'Percent Methylation')
444 | #'
445 | #' @export
plot_numerical_coannotations = function(annotated_regions, x, y, annot1, annot2, bin_width=10,
    plot_title, x_label, y_label, legend_facet_label, legend_cum_label, quiet = FALSE) {

    # Plot the numerical column x (histogram) or x against y (scatterplot) for
    # regions annotated to annot1 only, annot2 only, or both, one facet per
    # annotation pair. Returns a ggplot object. `quiet` is accepted for
    # interface compatibility but is not used in this function.

    # Tidy the GRanges into a data.frame for use with dplyr functions
    tbl = as.data.frame(annotated_regions, row.names = NULL)

    ########################################################################
    # Order and subset the annotations
    annotation_order = c(annot1,annot2)
    sub_tbl = subset_order_tbl(tbl = tbl, col='annot.type', col_order=annotation_order)

    ########################################################################
    # Find the co-annotations
    region_cols = c('seqnames', 'start', 'end')

    # Use combn instead of expand.grid because we do not want regions annotated to
    # a CpG island and a promoter having their data value count in the island / island
    # facet as well as the promoter / promoter facet. We want it *ONLY* in the
    # island / promoter facet. Sorting ensures island / promoter and promoter / island
    # are aggregated. Annotation types are made unique first so that a region
    # overlapping several annotations of the same type (e.g. two islands and a
    # promoter) does not leak into the island / island facet.
    annot_pairs = function(annot_types) {
        annot_types = sort(unique(as.character(annot_types)))
        if(length(annot_types) == 1) {
            # A region in a single annotation type is paired with itself
            annot_types = rep.int(annot_types, 2)
        }
        # One row per pair, in columns V1 and V2
        as.data.frame(t(utils::combn(annot_types, 2)), stringsAsFactors = FALSE)
    }

    pairs_by_region = dplyr::do(
        dplyr::group_by(sub_tbl, dplyr::across(dplyr::all_of(region_cols))),
        annot_pairs(.$annot.type)
    )

    # Join on the data chromosome locations
    pairs_by_region = dplyr::inner_join(x = pairs_by_region, y = sub_tbl, by = c('seqnames','start','end'))

    ########################################################################
    # Create data objects for plots
    # The join repeats each pair once per annotation row of the region, so keep
    # one row per (region, pair) for the facets and one row per region overall.
    facet_data = dplyr::distinct(
        dplyr::ungroup(pairs_by_region),
        dplyr::across(dplyr::all_of(c(region_cols, 'V1', 'V2'))),
        .keep_all = TRUE)
    all_data = dplyr::distinct(
        dplyr::ungroup(tbl),
        dplyr::across(dplyr::all_of(region_cols)),
        .keep_all = TRUE)

    ########################################################################
    # Construct the plot
    # Note, data must be dplyr::ungroup()-ed before hand for the proper
    # display of the overall distribution.

    if(missing(y)) {
        if(missing(legend_facet_label)) {
            legend_facet_label = sprintf('%s in %s', x, 'annot pair')
        }
        if(missing(legend_cum_label)) {
            legend_cum_label = sprintf('All %s', x)
        }
        # Hollow fill for the overall histogram, gray for the facet histograms
        fill_man = c(NA, 'gray')
        names(fill_man) = c(legend_cum_label, legend_facet_label)

        # Make the base histogram ggplot
        plot =
            # Facet hists are plotted with distinct (seqnames, start, end, annot1, annot2) combinations
            ggplot(
                data = facet_data,
                aes_string(x=x, y='..density..')) +
            geom_histogram(binwidth=bin_width, aes(fill = legend_facet_label)) +
            facet_wrap( V1 ~ V2 ) + # Over the facets
            # All hist is plotted with distinct (seqnames, start, end) combinations
            geom_histogram(
                data = all_data,
                binwidth=bin_width, aes(fill = legend_cum_label, color = 'red')) + # All the data
            theme_bw() +
            scale_fill_manual(values = fill_man) +
            guides(color = 'none') +
            theme(legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = c('red','white')))
    } else {
        # Make the base scatter ggplot from the de-duplicated data so each
        # region is drawn once per annotation pair (as in plot_numerical())
        plot = ggplot(facet_data, aes_string(x=x, y=y)) +
            geom_point(alpha = 1/8, size = 1) +
            facet_wrap( V1 ~ V2 ) +
            theme_bw()
    }

    # Add any user defined labels to the plot if they were supplied;
    # otherwise ggplot() will use its defaults
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}
546 | 
547 | #' Plot a categorical data variable over another
548 | #'
549 | #' Given a \code{GRanges} of annotated regions from \code{annotate_regions()}, visualize the distribution of categorical data \code{fill} in categorical data \code{x}. A bar representing the distribution of all \code{fill} in \code{x} will be added according to the contents of \code{fill}. This is the distribution over all values of \code{x}. Additionally, when \code{annotated_random} is not missing, a "Random Regions" bar shows the distribution of random regions over \code{fill}.
550 | #'
551 | #' For example, suppose a differentially methylated region has the categorical label hyper, and is annotated to a promoter, a 5UTR, two exons, and an intron. Each annotation will appear in the All bar once. Likewise for the hyper bar if the differential methylation status is chosen as \code{x} with \code{annot.type} chosen as \code{fill}.
552 | #'
553 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
554 | #' @param annotated_random The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. Random regions can only be used with \code{fill == 'annot.type'}.
555 | #' @param x One of 'annot.type' or a categorical data column, indicating whether annotation classes or data classes will appear on the x-axis.
556 | #' @param fill One of 'annot.type', a categorical data column, or \code{NULL}, indicating whether annotation classes or data classes will fill the bars. If \code{NULL} then the bars will be the total counts of the x classes.
557 | #' @param x_order A character vector that subsets and orders the x classes. Default \code{NULL}, uses existing values.
558 | #' @param fill_order A character vector that subsets and orders the fill classes. Default \code{NULL}, uses existing values.
559 | #' @param position A string which has the same possible values as in \code{ggplot2::geom_bar(..., position)}, i.e., 'stack', 'fill', 'dodge', etc.
560 | #' @param plot_title A string used for the title of the plot. If missing, no title is displayed.
561 | #' @param legend_title A string used for the legend title to describe fills (if fill is not \code{NULL}). Default displays corresponding variable name.
562 | #' @param x_label A string used for the x-axis label. If missing, corresponding variable name used.
563 | #' @param y_label A string used for the y-axis label. If missing, corresponding variable name used.
564 | #' @param quiet Print progress messages (FALSE) or not (TRUE).
565 | #'
566 | #' @return A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
567 | #'
568 | #' @examples
569 | #'    # Get premade CpG annotations
570 | #'    data('annotations', package = 'annotatr')
571 | #'
572 | #'    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
573 | #'    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
574 | #'    dm_regions = read_regions(con = dm_file, extraCols = extraCols, genome = 'hg19',
575 | #'        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
576 | #'    dm_regions = dm_regions[1:1000]
577 | #'
578 | #'    dm_annots = annotate_regions(
579 | #'        regions = dm_regions,
580 | #'        annotations = annotations,
581 | #'        ignore.strand = TRUE)
582 | #'
583 | #'    dm_order = c(
584 | #'        'hyper',
585 | #'        'hypo')
586 | #'    cpg_order = c(
587 | #'        'hg19_cpg_islands',
588 | #'        'hg19_cpg_shores',
589 | #'        'hg19_cpg_shelves',
590 | #'        'hg19_cpg_inter')
591 | #'
592 | #'    dm_vn = plot_categorical(
593 | #'        annotated_regions = dm_annots,
594 | #'        x = 'DM_status',
595 | #'        fill = 'annot.type',
596 | #'        x_order = dm_order,
597 | #'        fill_order = cpg_order,
598 | #'        position = 'fill',
599 | #'        legend_title = 'knownGene Annotations',
600 | #'        x_label = 'DM status',
601 | #'        y_label = 'Proportion')
602 | #'
603 | #'    # Create randomized regions
604 | #'    dm_rnd_regions = randomize_regions(regions = dm_regions)
605 | #'    dm_rnd_annots = annotate_regions(
606 | #'        regions = dm_rnd_regions,
607 | #'        annotations = annotations,
608 | #'        ignore.strand = TRUE)
609 | #'
610 | #'    dm_vn_rnd = plot_categorical(
611 | #'        annotated_regions = dm_annots,
612 | #'        annotated_random = dm_rnd_annots,
613 | #'        x = 'DM_status',
614 | #'        fill = 'annot.type',
615 | #'        x_order = dm_order,
616 | #'        fill_order = cpg_order,
617 | #'        position = 'fill',
618 | #'        legend_title = 'knownGene Annotations',
619 | #'        x_label = 'DM status',
620 | #'        y_label = 'Proportion')
621 | #'
622 | #' @export
plot_categorical = function(annotated_regions, annotated_random, x, fill=NULL, x_order=NULL, fill_order=NULL,
    position = 'stack', plot_title, legend_title, x_label, y_label, quiet = FALSE) {

    # Bar plot of the categorical column x, optionally filled by the categorical
    # column fill, with an extra "All" bar and (when annotated_random is given)
    # a "Random Regions" bar. Returns a ggplot object. `quiet` is accepted for
    # interface compatibility but is not used in this function.

    ########################################################################
    # Argument parsing and error handling

    # Record this before annotated_random is reassigned below
    has_random = !missing(annotated_random)

    # Tidy the GRanges into a data.frame for use with dplyr functions
    annotated_regions = as.data.frame(annotated_regions, row.names = NULL)

    # Ensure the value of x is a column name in annotated_regions
    if( !(x %in% colnames(annotated_regions)) ) {
        stop('The column name used for x does not exist in annotated_regions.')
    }

    # Ensure the value of fill is a column name in annotated_regions if it isn't NULL
    # Also ensure fill != x
    if( !is.null(fill) ) {
        if( !(fill %in% colnames(annotated_regions)) ) {
            stop('The column name used for fill does not exist in annotated_regions.')
        }
        if( x == fill ) {
            stop('Error: x cannot equal fill')
        }
    }

    # Random regions can only be used with fill == 'annot.type', because the data
    # from the original regions is not transferred to the random ones. The NULL
    # check comes first: `NULL != 'annot.type'` is logical(0), which cannot be
    # used in `&&` / `if` and would hide the intended message.
    if(has_random && (is.null(fill) || fill != 'annot.type')) {
        stop('Error: Random regions can only be used in plot_categorical() when fill == "annot.type" since data from the original regions are not transferred to the random regions.')
    }

    # Check valid position argument
    if(!(position %in% c('stack', 'fill', 'dodge'))) {
        stop('Error: position must be one of "stack", "fill", or "dodge"')
    }

    ########################################################################
    # Order and subset based on fill_order
    annotated_regions = subset_order_tbl(tbl = annotated_regions, col = fill, col_order = fill_order)

    # Take the distinct x / fill combinations per unique data region.
    # The key columns are built as a plain character vector (a NULL fill simply
    # drops out) and selected with all_of(), so the values of x and fill are
    # used rather than columns that happen to be named "x" or "fill".
    region_cols = c('seqnames', 'start', 'end')
    key_cols = unique(c(region_cols, x, fill))
    annotated_regions = dplyr::distinct(
        dplyr::ungroup(annotated_regions),
        dplyr::across(dplyr::all_of(key_cols)),
        .keep_all = TRUE)

    ########################################################################
    # Order and subset based on x_order
    if(is.null(x_order)) {
        x_order = unique(annotated_regions[[x]])
    }
    sub_annot_regions = subset_order_tbl(tbl = annotated_regions, col = x, col_order = x_order)

    # Do particular things if annotated_random was supplied
    if(has_random) {
        # Tidy the GRanges into a data.frame for use with dplyr functions
        annotated_random = as.data.frame(annotated_random, row.names = NULL)

        # Order and subset the randomized annotations
        annotated_random = subset_order_tbl(tbl = annotated_random, col=fill, col_order=fill_order)

        # Take the distinct annotation types per unique random data region
        annotated_random = dplyr::distinct(
            dplyr::ungroup(annotated_random),
            dplyr::across(dplyr::all_of(c(region_cols, 'annot.type'))),
            .keep_all = TRUE)

        # Combine the data in preparation for visualization; data_type labels
        # the rows as "All" or "Random Regions" and becomes the x position
        annotated_regions = dplyr::bind_rows("All" = annotated_regions, "Random Regions" = annotated_random, .id = 'data_type')
    }

    ########################################################################
    # Construct the plot

    # The first layer draws the All bar (plus the Random Regions bar when
    # data_type is present); the second layer draws one bar per class of x
    if(has_random) {
        base_aes = aes_string(x='data_type')
    } else {
        base_aes = aes(x='All')
    }
    plot =
        ggplot(annotated_regions, base_aes) +
        geom_bar(aes_string(fill=fill), position=position, width=0.5) + # The All bar
        geom_bar(data = sub_annot_regions, aes_string(x=x, fill=fill), position=position, width=0.5) + # The subsets bars
        theme(axis.text.x = element_text(angle = 30, hjust = 1))

    # Change the fill scale and name if legend_title was supplied
    if(!missing(legend_title)) {
        plot = plot + scale_fill_hue(name=legend_title)
    } else {
        plot = plot + scale_fill_hue()
    }

    # Deal with the x-axis labels to make sure the order is correct
    if(has_random) {
        x_limits = c('All', x_order, 'Random Regions')
    } else if(x == 'annot.type') {
        # Axis limits use names(tidy_annotations(x_order)) rather than the raw codes
        x_limits = c('All', names(tidy_annotations(x_order)))
    } else {
        x_limits = c('All', x_order)
    }
    plot = plot + scale_x_discrete(limits = x_limits)

    # Add any user defined labels to the plot if they were supplied;
    # otherwise ggplot() will use its defaults
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}
739 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Travis-CI Build Status](https://travis-ci.org/rcavalcante/annotatr.svg?branch=master)](https://travis-ci.org/rcavalcante/annotatr) [![Coverage Status](https://coveralls.io/repos/rcavalcante/annotatr/badge.svg?branch=master&service=github)](https://coveralls.io/github/rcavalcante/annotatr?branch=master)
2 | 
3 | See the package vignette for a fully worked through use case.
4 | 


--------------------------------------------------------------------------------
/data-raw/create_example_annotations.R:
--------------------------------------------------------------------------------
# Build the example annotation object shipped with the package and store it as
# data/annotations.rda. Run from the package root with annotatr loaded
# (e.g. after devtools::load_all()), since build_annotations() is a package function.
annotations = build_annotations(genome = 'hg19', annotations = 'hg19_cpgs')

# devtools::use_data() is no longer provided by devtools; base save() writes the
# same file that use_data(internal = FALSE) produced, with the same compression.
save(annotations, file = file.path('data', 'annotations.rda'), compress = 'xz')
3 | 


--------------------------------------------------------------------------------
/data/annotations.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/data/annotations.rda


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite the R package 'annotatr' in publications use:")
 2 | 
 3 | citEntry(
 4 |   entry  = "article",
 5 |   title  = "annotatr: genomic regions in context.",
 6 |   author = personList(
 7 | 	  as.person("Raymond G Cavalcante"),
 8 | 	  as.person("Maureen A Sartor")
 9 |   ),
10 |   year = 2017,
11 |   journal = "Bioinformatics",
12 |   note = paste("R package version", meta$Version),
13 |   textVersion = "Cavalcante RG, Sartor MA. annotatr: genomic regions in context. Bioinformatics. (2017) 33(15):2381-2383. doi:10.1093/bioinformatics/btx183"
14 | )
15 | 
16 | citFooter("This free open-source software implements academic research by the authors and co-workers. If you use it, please support the project by citing the appropriate journal articles.")
17 | 


--------------------------------------------------------------------------------
/inst/NEWS:
--------------------------------------------------------------------------------
  1 | CHANGES IN VERSION 1.16.0
  2 | -------------------------
  3 | 
  4 | USER-FACING CHANGES
  5 | 
  6 |   o Export expand_annotations(), tidy_annotations(), and subset_order_tbl().
  7 | 
  8 | BUGFIXES
  9 | 
 10 |   o Fix incorrect shortcut search for HMMs.
 11 | 
 12 | CHANGES IN VERSION 1.6.0
 13 | ------------------------
 14 | 
 15 | NEW FEATURES
 16 | 
 17 |   o Add support for chicken (galGal5).
 18 | 
 19 | USER-FACING CHANGES
 20 | 
 21 |   o Add the ability to facet over two variables in plot_numerical().
 22 |   o Add the ability to keep duplicate regions in summarize_categorical() and
 23 |     plot_categorical(). This is accomplished with the 'by' parameter in the
 24 |     former and by the 'x' and 'fill' parameters in the latter, and passing
 25 |     their contents into the '.dots' parameter of dplyr::distinct_().
 26 |   o Make TxDb and OrgDb packages Suggests instead of Imports. NOTE: This saves
 27 |     space, but also requires downloading the appropriate packages as needed.
 28 |   o Add list_env() function to the annotatr_cache environment to see what
 29 |     custom annotations have been read in and added to the cache.
 30 | 
 31 | BUGFIXES
 32 | 
 33 |   o Replace dplyr::summarize_each_() with dplyr::summarize_at() in line with
 34 |     deprecation in the dplyr package.
 35 |   o Prefix builtin_ functions with annotatr:: so that packages that Import
 36 |     annotatr don't encounter errors.
 37 | 
 38 | CHANGES IN VERSION 1.2.0
 39 | ------------------------
 40 | 
 41 | NEW FEATURES
 42 | 
 43 |   o Add support for CpG annotations for hg38, mm10, and rn6 via the UCSC goldenpath URLs.
 44 |   o Add a function to build annotations from AnnotationHub resources, build_ah_annots().
 45 |   o Add support for chromHMM tracks (chromatin state) from the UCSC Genome Browser.
 46 |     o Users may annotate to chromatin states in multiple cell lines, if desired.
 47 |   o Use rtracklayer::liftOver to lift hg19 and mm9 enhancers into hg38 and mm10.
 48 | 
 49 | USER-FACING CHANGES
 50 | 
 51 |   o Add minoverlap parameter to annotate_regions() that is passed to
 52 |     GenomicRanges::findOverlaps().
 53 |   o Change supported_annotations() and supported_genomes() into builtin_annotations()
 54 |     and builtin_genomes(). This enables more flexibility required for AnnotationHub
 55 |     annotations.
 56 |   o Added documentation for coercing result of annotate_regions() to data.frame
 57 |      and subsetting based on gene symbol to the vignette.
 58 | 
 59 | BUGFIXES
 60 | 
 61 |   o Fixed a bug in coercion of GRanges to data.frame where row.names could be
 62 |     duplicated. Thanks to @kdkorthauer.
 63 |   o Require GenomeInfoDb >= 1.10.3 because of changes to NCBI servers.
 64 |   o Change scale_fill_brewer() to scale_fill_hue() in plot_categorical() to enable
 65 |     more categories and avoid plotting abnormalities.
 66 |   o Fixed bug that mistakenly displayed some supported annotations.
 67 |   o Fixed a bug in lncRNA annotation building caused by incomplete reference.
 68 | 
 69 | CHANGES IN VERSION 0.99.13
 70 | --------------------------
 71 | 
 72 | PKG FEATURES
 73 | 
 74 |   o annotatr is a package to quickly and flexibly annotate genomic regions to
 75 |     genomic annotations.
 76 | 
 77 |     o Genomic annotations include CpG features (island, shore, shelves, and
 78 | 	  open sea), genic features (1-5kb upstream of TSS, promoters,
 79 | 	  5'UTRs, exons, introns, CDS, 3'UTRs, intron/exon boundaries, and exon/
 80 | 	  intron boundaries), as well as enhancers from the FANTOM5 consortium for
 81 | 	  hg19 and mm9.
 82 | 
 83 | 	  o Annotations are built at runtime using the TxDb.*, AnnotationHub, and
 84 | 	    rtracklayer packages. Users can select annotations a la carte, or via
 85 | 		shortcuts, such as hg19_basicgenes.
 86 | 
 87 | 	  o Annotations are currently available for hg19, mm9, mm10, dm3, dm6, rn4,
 88 | 	    rn5, and rn6. Any species is supported through custom annotations.
 89 | 
 90 |   o Genomic regions are read in using the rtracklayer::import() function, and
 91 |     the extraCols argument enables users to include an arbitrary number of
 92 | 	categorical or numerical data with the genomic regions.
 93 | 
 94 |   o Annotations are determined via GenomicRanges::findOverlaps(), and all
 95 |     annotations are returned, rather than imposing a prioritization.
 96 | 
 97 |   o annotatr provides several helpful summarization (using dplyr) and plot
 98 |     functions (using ggplot2) to investigate trends in data associated with the
 99 | 	genomic regions over annotations.
100 | 


--------------------------------------------------------------------------------
/inst/extdata/Gm12878_Ezh2_peak_annotations.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/Gm12878_Ezh2_peak_annotations.txt.gz


--------------------------------------------------------------------------------
/inst/extdata/Gm12878_Ezh2_sorted_scores.narrowPeak.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/Gm12878_Ezh2_sorted_scores.narrowPeak.gz


--------------------------------------------------------------------------------
/inst/extdata/Gm12878_Stat3_chr2.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/Gm12878_Stat3_chr2.bed.gz


--------------------------------------------------------------------------------
/inst/extdata/IDH2mut_v_NBM_multi_data_chr9.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/IDH2mut_v_NBM_multi_data_chr9.txt.gz


--------------------------------------------------------------------------------
/inst/extdata/K562_Cjun_peak_annotations.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/K562_Cjun_peak_annotations.txt.gz


--------------------------------------------------------------------------------
/inst/extdata/test_BED3.bed:
--------------------------------------------------------------------------------
1 | chr1	10790	10805
2 | chr1	26800	28000
3 | chr1	28800	29000
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_BED4.bed:
--------------------------------------------------------------------------------
1 | chr1	10900	11000	test1
2 | chr1	26800	28000	test2
3 | chr1	28800	29000	test3
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_BED5.bed:
--------------------------------------------------------------------------------
1 | chr1	10900	11000	test1	32
2 | chr1	26800	28000	test2	46
3 | chr1	28800	29000	test3	36
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_BED6.bed:
--------------------------------------------------------------------------------
1 | chr1	10900	11000	test1	1000	+
2 | chr1	26800	28000	test2	1000	-
3 | chr1	28800	29000	test3	1000	-
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotation_nooverlap.bed:
--------------------------------------------------------------------------------
1 | chr1	8000	9000
2 | chr1	20000	24000
3 | chr1	28100	28200
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_3.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500
2 | chr1	26500	28200
3 | chr1	28600	29200
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_4.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500	test1
2 | chr1	26500	28200	test2
3 | chr1	28600	29200	test3
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_5.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500	region1	.
2 | chr1	26500	28200	region2	.
3 | chr1	28600	29200	region3	.
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_6.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500	region1	.	+
2 | chr1	26500	28200	region2	.	-
3 | chr1	28600	29200	region3	.	+
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_6_gene.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500	region1	.	.	324
2 | chr1	26500	28200	region2	.	.	4624
3 | chr1	28600	29200	region3	.	.	3447
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_6_symbol.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500	region1	.	.	BRCA
2 | chr1	26500	28200	region2	.	.	TP53
3 | chr1	28600	29200	region3	.	.	HOX1A
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_6_tx_gene_symbol.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500	region1	.	.	351236	BRCA	ENST00000473358.1
2 | chr1	26500	28200	region2	.	.	4624	TP53	ENST00000607096.1
3 | chr1	28600	29200	region3	.	.	3447	HOX1A	ENST00000496488.1
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_annotations_minoverlap.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	11500
2 | chr1	26500	26801
3 | chr1	28600	29200
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_bedGraph.bedGraph:
--------------------------------------------------------------------------------
1 | chr1	10900	11000	31
2 | chr1	26800	28000	36
3 | chr1	28800	29000	83
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_intersect.bed:
--------------------------------------------------------------------------------
1 | chr1	10900	11000	test1	1000	*
2 | chr1	26800	28000	test2	1000	*
3 | chr1	28800	29000	test3	1000	*
4 | 


--------------------------------------------------------------------------------
/inst/extdata/test_read_multiple_data_nohead.bed:
--------------------------------------------------------------------------------
1 | chr1	10800	10900	A	87	+	10e-4	100	13	Y
2 | chr1	11000	11100	A	45	-	1e-6	100	55	N
3 | chr1	27800	28800	A	34	-	0.04	41	7	Y
4 | chr1	29000	29300	B	62	+	0.001	95	33	Y
5 | 


--------------------------------------------------------------------------------
/man/annotate_regions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/intersect.R
 3 | \name{annotate_regions}
 4 | \alias{annotate_regions}
 5 | \title{A function to intersect user region data with annotation data}
 6 | \usage{
 7 | annotate_regions(
 8 |   regions,
 9 |   annotations,
10 |   minoverlap = 1L,
11 |   ignore.strand = TRUE,
12 |   quiet = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{regions}{The GRanges object returned by \code{read_regions()}.}
17 | 
18 | \item{annotations}{A character vector of annotations to build. Valid annotation codes are listed with \code{builtin_annotations()}. The "basicgenes" shortcut builds the following regions: 1-5Kb upstream of TSSs, promoters, 5UTRs, exons, introns, and 3UTRs. The "cpgs" shortcut builds the following regions: CpG islands, shores, shelves, and interCGI regions. NOTE: Shortcuts need to be prefixed by the genome, e.g. \code{hg19_basicgenes}.
19 | Custom annotations whose names are of the form \code{[genome]_custom_[name]} should also be included. Custom annotations should be read in and converted to \code{GRanges} with \code{read_annotations()}. They can be for a genome listed by \code{builtin_genomes()}, or for an unsupported genome.}
20 | 
21 | \item{minoverlap}{A scalar, positive integer, indicating the minimum required overlap of regions with annotations.}
22 | 
23 | \item{ignore.strand}{Logical indicating whether strandedness should be ignored in findOverlaps(). Default TRUE.}
24 | 
25 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
26 | }
27 | \value{
28 | A \code{GRanges} where the \code{granges} are from the regions, and the \code{mcols} include the \code{mcols} from the regions and a column with the annotation \code{GRanges}.
29 | }
30 | \description{
31 | Annotate genomic regions to selected genomic annotations while preserving the data associated with the genomic regions.
32 | }
33 | \examples{
34 |    r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
35 |    extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
36 |    r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage')
37 | 
38 |    # Get premade CpG annotations
39 |    data('annotations', package = 'annotatr')
40 | 
41 |    a = annotate_regions(
42 |        regions = r,
43 |        annotations = annotations,
44 |        ignore.strand = TRUE)
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/man/annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/annotatr_data_doc.R
 3 | \docType{data}
 4 | \name{annotations}
 5 | \alias{annotations}
 6 | \title{example_annotations data}
 7 | \format{
 8 | A \code{GRanges} object with the CpG feature annotations for hg19
 9 | and containing \code{mcols}:
10 | \describe{
11 |     \item{id}{The internal ID for the annotation}
12 |     \item{tx_id}{All NA, since these are not associated with tx_ids}
13 |     \item{gene_id}{All NA, since there are not associated Entrez IDs}
14 |     \item{symbols}{All NA, since there are not associated gene symbols}
15 |     \item{type}{A character indicating the type of annotation. Including:
16 | 'hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves', and 'hg19_cpg_inter'.}
17 | }
18 | }
19 | \source{
20 | The AnnotationHub resource for hg19 CpG features.
21 | }
22 | \usage{
23 | annotations
24 | }
25 | \description{
26 | A \code{GRanges} of precomputed annotations for CpG features. Created by doing
27 | \code{build_annotations(genome='hg19', annotations = 'hg19_cpgs')}.
28 | }
29 | \keyword{datasets}
30 | 


--------------------------------------------------------------------------------
/man/annotatr.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/annotatr_package_doc.R
 3 | \docType{package}
 4 | \name{annotatr}
 5 | \alias{annotatr}
 6 | \title{annotatr: Annotation of Genomic Regions to Functional Annotations}
 7 | \description{
 8 | Given a set of genomic sites/regions (e.g. ChIP-seq peaks, CpGs, differentially methylated CpGs or regions, SNPs, etc.) it is often of interest to investigate the intersecting functional annotations. Such annotations include those relating to gene models (promoters, 5'UTRs, exons, introns, and 3'UTRs), CpGs (CpG islands, CpG shores, CpG shelves), the non-coding genome, and enhancers. The annotatr package provides an easy way to summarize and visualize the intersection of genomic sites/regions with the above functional annotations.
 9 | }
10 | 


--------------------------------------------------------------------------------
/man/annotatr_cache.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \docType{data}
 4 | \name{annotatr_cache}
 5 | \alias{annotatr_cache}
 6 | \title{A global-variable to hold custom annotations loaded in an R session}
 7 | \format{
 8 | An object of class \code{list} of length 3.
 9 | }
10 | \usage{
11 | annotatr_cache
12 | }
13 | \value{
14 | An environment to contain custom annotations from \code{read_annotations}.
15 | }
16 | \description{
17 | Code thanks to Martin Morgan. This is a global variable that will store custom
18 | annotations that a user reads in during a session in which annotatr is loaded.
19 | }
20 | \examples{
21 |  # Example usage
22 |  annotatr_cache$set("foo", 1:10)
23 |  annotatr_cache$get("foo")
24 | 
25 |  # Read in a BED3 file as a custom annotation
26 |  file = system.file('extdata', 'test_annotations_3.bed', package='annotatr')
27 |  # The custom annotation is added to the annotatr_cache environment in this function
28 |  read_annotations(con = file, name = 'test', genome = 'hg19')
29 |  # The result of read_annotations() is not visible in .GlobalEnv, instead
30 |  # need to use the get method
31 |  print(annotatr_cache$get('hg19_custom_test'))
32 |  # See what is in the annotatr_cache
33 |  annotatr_cache$list_env()
34 | }
35 | \keyword{datasets}
36 | 


--------------------------------------------------------------------------------
/man/build_ah_annots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_ah_annots}
 4 | \alias{build_ah_annots}
 5 | \title{A helper function to build arbitrary annotations from AnnotationHub}
 6 | \usage{
 7 | build_ah_annots(genome, ah_codes, annotation_class)
 8 | }
 9 | \arguments{
10 | \item{genome}{The genome assembly.}
11 | 
12 | \item{ah_codes}{A named character vector giving the AnnotationHub accession number (e.g. AH23256), and whose name describes what the annotation is (e.g. Gm12878_H3K4me3).}
13 | 
14 | \item{annotation_class}{A string to name the group of annotations in \code{ah_codes}}
15 | }
16 | \value{
17 | A \code{GRanges} object stored in \code{annotatr_cache}. To view an annotation built with this function, do \code{annotatr_cache$get(name)}. To add these annotations to a set of annotations, include \code{'[genome]_[annotation_class]_[name]'} in the call to \code{build_annotations()}. See example below.
18 | }
19 | \description{
20 | A helper function to build arbitrary annotations from AnnotationHub
21 | }
22 | \examples{
23 | 
24 | # Create a named vector for the AnnotationHub accession codes with desired names
25 | h3k4me3_code = c('Gm12878' = 'AH23256')
26 | # Fetch ah_codes from AnnotationHub and create annotations annotatr understands
27 | build_ah_annots(genome = 'hg19', ah_codes = h3k4me3_code, annotation_class = 'H3K4me3')
28 | # The annotations as they appear in annotatr_cache
29 | annot_name = c('hg19_H3K4me3_Gm12878')
30 | # Build the annotations right before annotating any regions
31 | annotations = build_annotations(genome = 'hg19', annotations = annot_name)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/build_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_annotations}
 4 | \alias{build_annotations}
 5 | \title{A function to build annotations from TxDb.* and AnnotationHub resources}
 6 | \usage{
 7 | build_annotations(genome, annotations)
 8 | }
 9 | \arguments{
10 | \item{genome}{The genome assembly.}
11 | 
12 | \item{annotations}{A character vector of annotations to build. Valid annotation codes are listed with \code{builtin_annotations()}. The "basicgenes" shortcut builds the following regions: 1-5Kb upstream of TSSs, promoters, 5UTRs, exons, introns, and 3UTRs. The "cpgs" shortcut builds the following regions: CpG islands, shores, shelves, and interCGI regions. NOTE: Shortcuts need to be prefixed by the genome, e.g. \code{hg19_basicgenes}.
13 | Custom annotations whose names are of the form \code{[genome]_custom_[name]} should also be included. Custom annotations should be read in and converted to \code{GRanges} with \code{read_annotations()}. They can be for a genome listed by \code{builtin_genomes()}, or for an unsupported genome.}
14 | }
15 | \value{
16 | A \code{GRanges} object of all the \code{annotations} combined. The \code{mcols} are \code{id, tx_id, gene_id, symbol, type}. The \code{id} column is a unique name, the \code{tx_id} column is either a UCSC knownGene transcript ID (genic annotations) or an Ensembl transcript ID (lncRNA annotations), the \code{gene_id} is the Entrez ID, the \code{symbol} is the gene symbol from the \code{org.*.eg.db} mapping from the Entrez ID, and the \code{type} is of the form \code{[genome]_[type]_[name]}.
17 | }
18 | \description{
19 | Create a \code{GRanges} object consisting of all the desired \code{annotations}. Supported annotation codes are listed by \code{builtin_annotations()}. The basis for enhancer annotations are FANTOM5 data, the basis for CpG related annotations are CpG island tracks from \code{AnnotationHub}, and the basis for genic annotations are from the \code{TxDb.*} and \code{org.db} group of packages.
20 | }
21 | \examples{
22 | # Example with hg19 gene promoters
23 | annots = c('hg19_genes_promoters')
24 | annots_gr = build_annotations(genome = 'hg19', annotations = annots)
25 | 
26 | # See vignette for an example with custom annotation
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/man/build_cpg_annots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_cpg_annots}
 4 | \alias{build_cpg_annots}
 5 | \title{A helper function to build CpG related annotations.}
 6 | \usage{
 7 | build_cpg_annots(
 8 |   genome = annotatr::builtin_genomes(),
 9 |   annotations = annotatr::builtin_annotations()
10 | )
11 | }
12 | \arguments{
13 | \item{genome}{The genome assembly.}
14 | 
15 | \item{annotations}{A character vector with entries of the form \code{[genome]_cpg_{islands,shores,shelves,inter}}.}
16 | }
17 | \value{
18 | A list of \code{GRanges} objects.
19 | }
20 | \description{
21 | Using the \code{AnnotationHub} package, extract CpG island track for the appropriate \code{genome} and construct the shores, shelves, and interCGI annotations as desired.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/build_enhancer_annots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_enhancer_annots}
 4 | \alias{build_enhancer_annots}
 5 | \title{A helper function to build enhancer annotations for hg19, hg38, mm9, and mm10 from FANTOM5.}
 6 | \usage{
 7 | build_enhancer_annots(genome = c("hg19", "hg38", "mm9", "mm10"))
 8 | }
 9 | \arguments{
10 | \item{genome}{The genome assembly.}
11 | }
12 | \value{
13 | A \code{GRanges} object.
14 | }
15 | \description{
16 | A helper function to build enhancer annotations for hg19, hg38, mm9, and mm10 from FANTOM5.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/build_gene_annots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_gene_annots}
 4 | \alias{build_gene_annots}
 5 | \title{A helper function to build genic annotations.}
 6 | \usage{
 7 | build_gene_annots(
 8 |   genome = annotatr::builtin_genomes(),
 9 |   annotations = annotatr::builtin_annotations()
10 | )
11 | }
12 | \arguments{
13 | \item{genome}{The genome assembly.}
14 | 
15 | \item{annotations}{A character vector with entries of the form \code{[genome]_genes_{1to5kb,promoters,5UTRs,cds,exons,firstexons,introns,intronexonboundaries,exonintronboundaries,3UTRs,intergenic}}.}
16 | }
17 | \value{
18 | A list of \code{GRanges} objects with unique \code{id} of the form \code{[type]:i}, \code{tx_id} being the UCSC knownGene transcript name, \code{gene_id} being the Entrez Gene ID, \code{symbol} being the gene symbol from the Entrez ID to symbol mapping in \code{org.db} for that species, and \code{type} being the annotation type.
19 | }
20 | \description{
21 | Using the \code{TxDb.*} group of packages, construct genic annotations consisting of any combination of 1-5kb upstream of a TSS, promoters (< 1kb from TSS), 5UTRs, CDS, exons, first exons, introns, intron/exon and exon/intron boundaries, 3UTRs, and intergenic.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/build_hmm_annots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_hmm_annots}
 4 | \alias{build_hmm_annots}
 5 | \title{A helper function to build chromHMM annotations for hg19 from UCSC Genome Browser.}
 6 | \usage{
 7 | build_hmm_annots(
 8 |   genome = c("hg19"),
 9 |   annotations = annotatr::builtin_annotations()
10 | )
11 | }
12 | \arguments{
13 | \item{genome}{The genome assembly.}
14 | 
15 | \item{annotations}{A character vector of valid chromatin state annotation codes.}
16 | }
17 | \value{
18 | A \code{GRanges} object.
19 | }
20 | \description{
21 | A helper function to build chromHMM annotations for hg19 from UCSC Genome Browser.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/build_lncrna_annots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/build_annotations.R
 3 | \name{build_lncrna_annots}
 4 | \alias{build_lncrna_annots}
 5 | \title{A helper function to build lncRNA annotations.}
 6 | \usage{
 7 | build_lncrna_annots(genome = c("hg19", "hg38", "mm10"))
 8 | }
 9 | \arguments{
10 | \item{genome}{The genome assembly.}
11 | }
12 | \value{
13 | A \code{GRanges} object with \code{id} giving the \code{transcript_type} from the GENCODE file, \code{tx_id} being the Ensembl transcript ID, \code{gene_id} being the Entrez ID coming from a mapping of gene symbol to Entrez ID, \code{symbol} being the gene_name from the GENCODE file, and the \code{type} being \code{[genome]_lncrna_gencode}.
14 | }
15 | \description{
16 | Using the \code{AnnotationHub} package, retrieve transcript level lncRNA annotations for either human (GRCh38) or mouse (GRCm38). If the genome is 'hg19', use the permalink from GENCODE and \code{rtracklayer::import()} to download and process.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/builtin_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{builtin_annotations}
 4 | \alias{builtin_annotations}
 5 | \title{Function listing which annotations are available.}
 6 | \usage{
 7 | builtin_annotations()
 8 | }
 9 | \value{
10 | A character vector of available annotations.
11 | }
12 | \description{
13 | This includes the shortcuts. The \code{expand_annotations()} function helps
14 | handle the shortcuts.
15 | }
16 | \examples{
17 | builtin_annotations()
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/man/builtin_genomes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{builtin_genomes}
 4 | \alias{builtin_genomes}
 5 | \title{Function returning supported TxDb.* genomes}
 6 | \usage{
 7 | builtin_genomes()
 8 | }
 9 | \value{
10 | A character vector of genomes for supported TxDb.* packages
11 | }
12 | \description{
13 | Function returning supported TxDb.* genomes
14 | }
15 | \examples{
16 | builtin_genomes()
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/man/check_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{check_annotations}
 4 | \alias{check_annotations}
 5 | \title{Function to check for valid annotations}
 6 | \usage{
 7 | check_annotations(annotations)
 8 | }
 9 | \arguments{
10 | \item{annotations}{A character vector of annotations possibly using the shortcuts}
11 | }
12 | \value{
13 | If all the checks on the annotations pass, returns NULL to allow code to move forward.
14 | }
15 | \description{
16 | Gives errors if any annotations are not in builtin_annotations() (and they are not in the required custom format), basicgenes are used, or the genome prefixes are not the same for all annotations.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/expand_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{expand_annotations}
 4 | \alias{expand_annotations}
 5 | \title{Function to expand annotation shortcuts}
 6 | \usage{
 7 | expand_annotations(annotations)
 8 | }
 9 | \arguments{
10 | \item{annotations}{A character vector of annotations, possibly using the shortcut accessors}
11 | }
12 | \value{
13 | A vector of data accession-ized names that are ordered from upstream to downstream in the case of knownGenes and islands to interCGI in the case of cpgs.
14 | }
15 | \description{
16 | Function to expand annotation shortcuts
17 | }
18 | 


--------------------------------------------------------------------------------
/man/get_cellline_from_code.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_cellline_from_code}
 4 | \alias{get_cellline_from_code}
 5 | \title{Function to return cell line from chromatin annotation code}
 6 | \usage{
 7 | get_cellline_from_code(code)
 8 | }
 9 | \arguments{
10 | \item{code}{The annotation code, used in \code{build_annotations()}.}
11 | }
12 | \value{
13 | A string of the cell line used in a chromatin annotation code
14 | }
15 | \description{
16 | Function to return cell line from chromatin annotation code
17 | }
18 | 


--------------------------------------------------------------------------------
/man/get_cellline_from_shortcut.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_cellline_from_shortcut}
 4 | \alias{get_cellline_from_shortcut}
 5 | \title{Function to return cell line from chromatin annotation shortcut}
 6 | \usage{
 7 | get_cellline_from_shortcut(shortcut)
 8 | }
 9 | \arguments{
10 | \item{shortcut}{The annotation shortcut, used in \code{build_annotations()}.}
11 | }
12 | \value{
13 | A string of the cell line used in a chromatin annotation shortcut
14 | }
15 | \description{
16 | Function to return cell line from chromatin annotation shortcut
17 | }
18 | 


--------------------------------------------------------------------------------
/man/get_orgdb_name.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_orgdb_name}
 4 | \alias{get_orgdb_name}
 5 | \title{Function to get correct org.* package name based on genome}
 6 | \usage{
 7 | get_orgdb_name(genome = annotatr::builtin_genomes())
 8 | }
 9 | \arguments{
10 | \item{genome}{A string giving the genome assembly.}
11 | }
12 | \value{
13 | A string giving the correct org for org.db packages. e.g. hg19 -> Hs.
14 | }
15 | \description{
16 | Function to get correct org.* package name based on genome
17 | }
18 | 


--------------------------------------------------------------------------------
/man/get_txdb_name.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_txdb_name}
 4 | \alias{get_txdb_name}
 5 | \title{Function to get correct TxDb.* package name based on genome}
 6 | \usage{
 7 | get_txdb_name(genome = annotatr::builtin_genomes())
 8 | }
 9 | \arguments{
10 | \item{genome}{A string giving the genome assembly.}
11 | }
12 | \value{
13 | A string giving the name of the correct TxDb.* package name based on \code{genome}.
14 | }
15 | \description{
16 | Function to get correct TxDb.* package name based on genome
17 | }
18 | 


--------------------------------------------------------------------------------
/man/plot_annotation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualize.R
 3 | \name{plot_annotation}
 4 | \alias{plot_annotation}
 5 | \title{Plot the number of regions per annotation}
 6 | \usage{
 7 | plot_annotation(
 8 |   annotated_regions,
 9 |   annotated_random,
10 |   annotation_order = NULL,
11 |   plot_title,
12 |   x_label,
13 |   y_label,
14 |   quiet = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.}
19 | 
20 | \item{annotated_random}{The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}.}
21 | 
22 | \item{annotation_order}{A character vector which doubles as the subset of annotations desired for the plot as well as the ordering. If \code{NULL}, all annotations are displayed.}
23 | 
24 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.}
25 | 
26 | \item{x_label}{A string used for the x-axis label. If missing, no x-axis label is displayed.}
27 | 
28 | \item{y_label}{A string used for the y-axis label. If missing, no y-axis label is displayed.}
29 | 
30 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
31 | }
32 | \value{
33 | A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited.
34 | }
35 | \description{
36 | Given a \code{GRanges} of annotated regions, plot the number of regions with the corresponding genomic annotations used in \code{annotation_order}. If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once in the corresponding bar plot. For example, if a region were annotated to multiple exons, it would only count once toward the exon bar in the plot, but if it were annotated to an exon and an intron, it would count towards both.
37 | }
38 | \examples{
39 |    ########################################################################
40 |    # An example of ChIP-seq peaks with signalValue used for score
41 | 
42 |    # Get premade CpG annotations
43 |    data('annotations', package = 'annotatr')
44 | 
45 |    chip_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
46 |    chip_regions = read_regions(con = chip_bed, genome = 'hg19')
47 | 
48 |    chip_rnd = randomize_regions(regions = chip_regions)
49 | 
50 |    chip_annots = annotate_regions(
51 |        regions = chip_regions,
52 |        annotations = annotations,
53 |        ignore.strand = TRUE)
54 | 
55 |    chip_rnd_annots = annotate_regions(
56 |        regions = chip_rnd,
57 |        annotations = annotations,
58 |        ignore.strand = TRUE)
59 | 
60 |    annots_order = c(
61 |        'hg19_cpg_islands',
62 |        'hg19_cpg_shores')
63 | 
64 |    p_annots = plot_annotation(annotated_regions = chip_annots,
65 |        annotation_order = annots_order)
66 |    p_annots_rnd = plot_annotation(annotated_regions = chip_annots,
67 |        annotated_random = chip_rnd_annots, annotation_order = annots_order)
68 | 
69 | }
70 | 


--------------------------------------------------------------------------------
/man/plot_categorical.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/visualize.R
  3 | \name{plot_categorical}
  4 | \alias{plot_categorical}
  5 | \title{Plot a categorical data variable over another}
  6 | \usage{
  7 | plot_categorical(
  8 |   annotated_regions,
  9 |   annotated_random,
 10 |   x,
 11 |   fill = NULL,
 12 |   x_order = NULL,
 13 |   fill_order = NULL,
 14 |   position = "stack",
 15 |   plot_title,
 16 |   legend_title,
 17 |   x_label,
 18 |   y_label,
 19 |   quiet = FALSE
 20 | )
 21 | }
 22 | \arguments{
 23 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.}
 24 | 
 25 | \item{annotated_random}{The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. Random regions can only be used with \code{fill == 'annot.type'}.}
 26 | 
 27 | \item{x}{One of 'annot.type' or a categorical data column, indicating whether annotation classes or data classes will appear on the x-axis.}
 28 | 
 29 | \item{fill}{One of 'annot.type', a categorical data column, or \code{NULL}, indicating whether annotation classes or data classes will fill the bars. If \code{NULL} then the bars will be the total counts of the x classes.}
 30 | 
 31 | \item{x_order}{A character vector that subsets and orders the x classes. Default \code{NULL}, uses existing values.}
 32 | 
 33 | \item{fill_order}{A character vector that subsets and orders the fill classes. Default \code{NULL}, uses existing values.}
 34 | 
 35 | \item{position}{A string which has the same possible values as in \code{ggplot2::geom_bar(..., position)}, i.e., 'stack', 'fill', 'dodge', etc.}
 36 | 
 37 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.}
 38 | 
 39 | \item{legend_title}{A string used for the legend title to describe fills (if fill is not \code{NULL}). Default displays corresponding variable name.}
 40 | 
 41 | \item{x_label}{A string used for the x-axis label. If missing, corresponding variable name used.}
 42 | 
 43 | \item{y_label}{A string used for the y-axis label. If missing, corresponding variable name used.}
 44 | 
 45 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
 46 | }
 47 | \value{
 48 | A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
 49 | }
 50 | \description{
 51 | Given a \code{GRanges} of annotated regions from \code{annotate_regions()}, visualize the distribution of categorical data \code{fill} in categorical data \code{x}. A bar representing the distribution of all \code{fill} in \code{x} will be added according to the contents of \code{fill}. This is the distribution over all values of \code{x}. Additionally, when \code{annotated_random} is not missing, a "Random Regions" bar shows the distribution of random regions over \code{fill}.
 52 | }
 53 | \details{
 54 | For example, if a differentially methylated region has the categorical label hyper, and is annotated to a promoter, a 5UTR, two exons, and an intron, each annotation will appear in the All bar once. Likewise for the hyper bar if the differential methylation status is chosen as \code{x} with \code{annot.type} chosen as \code{fill}.
 55 | }
 56 | \examples{
 57 |    # Get premade CpG annotations
 58 |    data('annotations', package = 'annotatr')
 59 | 
 60 |    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
 61 |    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
 62 |    dm_regions = read_regions(con = dm_file, extraCols = extraCols, genome = 'hg19',
 63 |        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
 64 |    dm_regions = dm_regions[1:1000]
 65 | 
 66 |    dm_annots = annotate_regions(
 67 |        regions = dm_regions,
 68 |        annotations = annotations,
 69 |        ignore.strand = TRUE)
 70 | 
 71 |    dm_order = c(
 72 |        'hyper',
 73 |        'hypo')
 74 |    cpg_order = c(
 75 |        'hg19_cpg_islands',
 76 |        'hg19_cpg_shores',
 77 |        'hg19_cpg_shelves',
 78 |        'hg19_cpg_inter')
 79 | 
 80 |    dm_vn = plot_categorical(
 81 |        annotated_regions = dm_annots,
 82 |        x = 'DM_status',
 83 |        fill = 'annot.type',
 84 |        x_order = dm_order,
 85 |        fill_order = cpg_order,
 86 |        position = 'fill',
 87 |        legend_title = 'knownGene Annotations',
 88 |        x_label = 'DM status',
 89 |        y_label = 'Proportion')
 90 | 
 91 |    # Create randomized regions
 92 |    dm_rnd_regions = randomize_regions(regions = dm_regions)
 93 |    dm_rnd_annots = annotate_regions(
 94 |        regions = dm_rnd_regions,
 95 |        annotations = annotations,
 96 |        ignore.strand = TRUE)
 97 | 
 98 |    dm_vn_rnd = plot_categorical(
 99 |        annotated_regions = dm_annots,
100 |        annotated_random = dm_rnd_annots,
101 |        x = 'DM_status',
102 |        fill = 'annot.type',
103 |        x_order = dm_order,
104 |        fill_order = cpg_order,
105 |        position = 'fill',
106 |        legend_title = 'knownGene Annotations',
107 |        x_label = 'DM status',
108 |        y_label = 'Proportion')
109 | 
110 | }
111 | 


--------------------------------------------------------------------------------
/man/plot_coannotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualize.R
 3 | \name{plot_coannotations}
 4 | \alias{plot_coannotations}
 5 | \title{Plot pair-wise annotations across regions}
 6 | \usage{
 7 | plot_coannotations(
 8 |   annotated_regions,
 9 |   annotation_order = NULL,
10 |   plot_title,
11 |   axes_label,
12 |   quiet = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.}
17 | 
18 | \item{annotation_order}{A character vector which doubles as the subset of annotations desired for plot as well as the ordering. If \code{NULL}, all annotations are displayed.}
19 | 
20 | \item{plot_title}{A string used for the title of the plot. If missing, no plot title label is displayed.}
21 | 
22 | \item{axes_label}{A string used for the axis labels. If missing, corresponding variable name used.}
23 | 
24 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
25 | }
26 | \value{
27 | A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited.
28 | }
29 | \description{
30 | All co-occurring annotations associated with a region are computed and displayed as a heatmap.
31 | }
32 | \details{
33 | As with \code{plot_annotation()}, the number in each cell is the number of unique regions annotated to the pair of annotations.
34 | 
35 | For example, if a region is annotated to both a CpG shore and to two different exons simultaneously, the region will only be counted once in the CpG shore / exon cell. NOTE, this same region will count once in both the CpG shore and exon cells on the diagonal.
36 | }
37 | \examples{
38 |    # Get premade CpG annotations
39 |    data('annotations', package = 'annotatr')
40 | 
41 |    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
42 |    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
43 |    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
44 |        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
45 |    dm_regions = dm_regions[1:1000]
46 | 
47 |    dm_annots = annotate_regions(
48 |        regions = dm_regions,
49 |        annotations = annotations,
50 |        ignore.strand = TRUE)
51 | 
52 |    all_order = c(
53 |        'hg19_cpg_islands',
54 |        'hg19_cpg_shores',
55 |        'hg19_cpg_shelves',
56 |        'hg19_cpg_inter')
57 | 
58 |    dm_vs_ca = plot_coannotations(
59 |        annotated_regions = dm_annots,
60 |        annotation_order = all_order,
61 |        axes_label = 'Annotations',
62 |        plot_title = 'Co-occurrence of Annotations')
63 | 
64 | }
65 | 


--------------------------------------------------------------------------------
/man/plot_numerical.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/visualize.R
  3 | \name{plot_numerical}
  4 | \alias{plot_numerical}
  5 | \title{Plot numerical data over regions or regions summarized over annotations}
  6 | \usage{
  7 | plot_numerical(
  8 |   annotated_regions,
  9 |   x,
 10 |   y,
 11 |   facet,
 12 |   facet_order,
 13 |   bin_width = 10,
 14 |   plot_title,
 15 |   x_label,
 16 |   y_label,
 17 |   legend_facet_label,
 18 |   legend_cum_label,
 19 |   quiet = FALSE
 20 | )
 21 | }
 22 | \arguments{
 23 | \item{annotated_regions}{A \code{GRanges} returned from \code{annotate_regions()}. If the data is not summarized, the data is at the region level. If it is summarized, it represents the average or standard deviation of the regions by the character vector used for \code{by} in \code{summarize_numerical()}.}
 24 | 
 25 | \item{x}{A string indicating the column of the \code{GRanges} to use for the x-axis.}
 26 | 
 27 | \item{y}{A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.}
 28 | 
 29 | \item{facet}{A string, or character vector of two strings, indicating which categorical variable(s) in the \code{GRanges} to make \code{ggplot2} facets over. When two facets are given, the first entry is the vertical facet and the second entry is the horizontal facet. Default is \code{annot.type}.}
 30 | 
 31 | \item{facet_order}{A character vector, or list of character vectors if \code{facet} has length 2, which gives the order of the facets, and can be used to subset the column in the \code{GRanges} used for the \code{facet}. For example, if \code{facet = 'annot.type'}, then the annotations may be subsetted to just CpG annotations. Default is \code{NULL}, meaning all annotations in their default order are used.}
 32 | 
 33 | \item{bin_width}{An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is \code{NULL}.}
 34 | 
 35 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.}
 36 | 
 37 | \item{x_label}{A string used for the x-axis label. If missing, no x-axis label is displayed.}
 38 | 
 39 | \item{y_label}{A string used for the y-axis label. If missing, no y-axis label is displayed.}
 40 | 
 41 | \item{legend_facet_label}{A string used to label the gray bar portion of the legend. Defaults to "x in facet".}
 42 | 
 43 | \item{legend_cum_label}{A string used to label the red outline portion of the legend. Defaults to "All in x".}
 44 | 
 45 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
 46 | }
 47 | \value{
 48 | A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
 49 | }
 50 | \description{
 51 | This function produces either histograms over \code{facet}, or x-y scatterplots over \code{facet}. In the case of histograms over facets, the All distribution (hollow histogram with red outline) is the distribution of \code{x} over all the regions in the data. The facet specific distributions (solid gray) are the distribution of \code{x} over the regions in each facet. For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution, but will count once each in the CpG island and promoter facet distributions.
 52 | }
 53 | \examples{
 54 |    # An example with multi-columned data
 55 | 
 56 |    # Get premade CpG annotations
 57 |    data('annotations', package = 'annotatr')
 58 | 
 59 |    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
 60 |    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
 61 |    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
 62 |        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
 63 |    dm_regions = dm_regions[1:1000]
 64 | 
 65 |    # Annotate the regions
 66 |    dm_annots = annotate_regions(
 67 |        regions = dm_regions,
 68 |        annotations = annotations,
 69 |        ignore.strand = TRUE)
 70 | 
 71 |    # Plot histogram of group 1 methylation rates across the CpG annotations.
 72 |    # NOTE: Overall distribution (everything in \code{facet_order})
 73 |    # is plotted in each facet for comparison.
 74 |    dm_vs_regions_mu1 = plot_numerical(
 75 |        annotated_regions = dm_annots,
 76 |        x = 'mu1',
 77 |        facet = 'annot.type',
 78 |        facet_order = c('hg19_cpg_islands','hg19_cpg_shores',
 79 |            'hg19_cpg_shelves','hg19_cpg_inter'),
 80 |        bin_width = 5,
 81 |        plot_title = 'Group 1 Methylation over CpG Annotations',
 82 |        x_label = 'Group 1 Methylation')
 83 | 
 84 |    # Plot histogram of group 1 methylation rates across the CpG annotations
 85 |    # crossed with DM_status
 86 |    dm_vs_regions_diffmeth = plot_numerical(
 87 |        annotated_regions = dm_annots,
 88 |        x = 'diff_meth',
 89 |        facet = c('annot.type','DM_status'),
 90 |        facet_order = list(
 91 |            c('hg19_genes_promoters','hg19_genes_5UTRs','hg19_cpg_islands'),
 92 |            c('hyper','hypo','none')),
 93 |        bin_width = 5,
 94 |        plot_title = 'Group 0 Region Methylation In Genes',
 95 |        x_label = 'Methylation Difference')
 96 | 
 97 |    # Can also use the result of annotate_regions() to plot two numerical
 98 |    # data columns against each other for each region, and facet by annotations.
 99 |    dm_vs_regions_annot = plot_numerical(
100 |        annotated_regions = dm_annots,
101 |        x = 'mu0',
102 |        y = 'mu1',
103 |        facet = 'annot.type',
104 |        facet_order = c('hg19_cpg_islands','hg19_cpg_shores',
105 |            'hg19_cpg_shelves','hg19_cpg_inter'),
106 |        plot_title = 'Region Methylation: Group 0 vs Group 1',
107 |        x_label = 'Group 0',
108 |        y_label = 'Group 1')
109 | 
110 |    # Another example, but using differential methylation status as the facets.
111 |    dm_vs_regions_name = plot_numerical(
112 |        annotated_regions = dm_annots,
113 |        x = 'mu0',
114 |        y = 'mu1',
115 |        facet = 'DM_status',
116 |        facet_order = c('hyper','hypo','none'),
117 |        plot_title = 'Region Methylation: Group 0 vs Group 1',
118 |        x_label = 'Group 0',
119 |        y_label = 'Group 1')
120 | 
121 | }
122 | 


--------------------------------------------------------------------------------
/man/plot_numerical_coannotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/visualize.R
 3 | \name{plot_numerical_coannotations}
 4 | \alias{plot_numerical_coannotations}
 5 | \title{Plot numerical data occurring in pairs of annotations}
 6 | \usage{
 7 | plot_numerical_coannotations(
 8 |   annotated_regions,
 9 |   x,
10 |   y,
11 |   annot1,
12 |   annot2,
13 |   bin_width = 10,
14 |   plot_title,
15 |   x_label,
16 |   y_label,
17 |   legend_facet_label,
18 |   legend_cum_label,
19 |   quiet = FALSE
20 | )
21 | }
22 | \arguments{
23 | \item{annotated_regions}{A \code{GRanges} returned from \code{annotate_regions()}.}
24 | 
25 | \item{x}{A string indicating the column of the \code{GRanges} to use for the x-axis.}
26 | 
27 | \item{y}{A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.}
28 | 
29 | \item{annot1}{A string indicating the first annotation type.}
30 | 
31 | \item{annot2}{A string indicating the second annotation type.}
32 | 
33 | \item{bin_width}{An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is \code{NULL}.}
34 | 
35 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.}
36 | 
37 | \item{x_label}{A string used for the x-axis label. If missing, no x-axis label is displayed.}
38 | 
39 | \item{y_label}{A string used for the y-axis label. If missing, no y-axis label is displayed.}
40 | 
41 | \item{legend_facet_label}{A string used to label the gray bar portion of the legend. Defaults to "x in annot pair".}
42 | 
43 | \item{legend_cum_label}{A string used to label the red outline portion of the legend. Defaults to "All x".}
44 | 
45 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
46 | }
47 | \value{
48 | A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
49 | }
50 | \description{
51 | Plot numerical data associated with regions occurring in \code{annot1}, \code{annot2} and in both. As with \code{plot_numerical()}, the result is a plot of histograms or x-y scatterplots.
52 | }
53 | \details{
54 | For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution and once in the CpG island / promoter facet distribution. However, a CpG associated only with a promoter will count once in the All distribution and once in the promoter / promoter distribution.
55 | }
56 | \examples{
57 |    # Get premade CpG annotations
58 |    data('annotations', package = 'annotatr')
59 | 
60 |    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
61 |    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
62 |    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
63 |        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
64 |    dm_regions = dm_regions[1:1000]
65 | 
66 |    dm_annots = annotate_regions(
67 |        regions = dm_regions,
68 |        annotations = annotations,
69 |        ignore.strand = TRUE)
70 | 
71 |    dm_vs_num_co = plot_numerical_coannotations(
72 |        annotated_regions = dm_annots,
73 |        x = 'mu0',
74 |        annot1 = 'hg19_cpg_islands',
75 |        annot2 = 'hg19_cpg_shelves',
76 |        bin_width = 5,
77 |        plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters',
78 |        x_label = 'Percent Methylation')
79 | 
80 | }
81 | 


--------------------------------------------------------------------------------
/man/randomize_regions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/randomize.R
 3 | \name{randomize_regions}
 4 | \alias{randomize_regions}
 5 | \title{Randomize Regions}
 6 | \usage{
 7 | randomize_regions(
 8 |   regions,
 9 |   allow.overlaps = TRUE,
10 |   per.chromosome = TRUE,
11 |   quiet = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{regions}{A \code{GRanges} object from \code{read_regions}.}
16 | 
17 | \item{allow.overlaps}{A logical stating whether random regions can overlap input regions (TRUE) or not (FALSE). Default TRUE.}
18 | 
19 | \item{per.chromosome}{A logical stating whether the random regions should remain on the same chromosome (TRUE) or not (FALSE). Default TRUE.}
20 | 
21 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
22 | }
23 | \value{
24 | A \code{GRanges} object of randomized regions based on \code{regions} from \code{read_regions()}. NOTE: Data associated with the original regions is not attached to the randomized regions.
25 | }
26 | \description{
27 | \code{randomize_regions} is a wrapper function for \code{regioneR::randomizeRegions()} that simplifies the creation of randomized regions for an input set of regions read with \code{read_regions()}. It relies on the \code{seqlengths} of \code{regions} in order to build the appropriate \code{genome} object for \code{regioneR::randomizeRegions()}.
28 | }
29 | \details{
30 | NOTE: The data associated with the input \code{regions} are not passed on to the random regions.
31 | }
32 | \examples{
33 |    # Create random region set based on ENCODE ChIP-seq data
34 |    file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
35 |    r = read_regions(con = file, genome = 'hg19')
36 | 
37 |    random_r = randomize_regions(regions = r)
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/man/read_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/read.R
 3 | \name{read_annotations}
 4 | \alias{read_annotations}
 5 | \title{Read custom annotations}
 6 | \usage{
 7 | read_annotations(con, name, genome = NA, format, extraCols = character(), ...)
 8 | }
 9 | \arguments{
10 | \item{con}{A path, URL, connection or BEDFile object. See \code{rtracklayer::import.bed()} documentation.}
11 | 
12 | \item{name}{A string for the name of the annotations to be used in the name of the object, [genome]_custom_[name]}
13 | 
14 | \item{genome}{From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available.}
15 | 
16 | \item{format}{From \code{rtracklayer::import()}: The format of the output. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'.}
17 | 
18 | \item{extraCols}{From \code{rtracklayer::import.bed()}: A character vector in the same form as 'colClasses' from 'read.table'.  It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats.}
19 | 
20 | \item{...}{Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.}
21 | }
22 | \value{
23 | A \code{GRanges} object stored in \code{annotatr_cache}. To view a custom annotation, do \code{annotatr_cache$get(name)}. To add a custom annotation to the set of annotations, include \code{'[genome]_custom_[name]'} in the call to \code{build_annotations()}. See example below.
24 | }
25 | \description{
26 | \code{read_annotations()} is a wrapper for the \code{rtracklayer::import()} function that creates a \code{GRanges} object matching the structure of annotations built with \code{build_annotations()}. The structure is a \code{GRanges} whose \code{mcols()} have the names \code{c('id','gene_id','symbol','type')}.
27 | }
28 | \examples{
29 | 
30 |  # Read in a BED3 file as a custom annotation
31 |  file = system.file('extdata', 'test_annotations_3.bed', package='annotatr')
32 |  read_annotations(con = file, name = 'test', genome = 'hg19')
33 |  build_annotations(genome = 'hg19', annotations = 'hg19_custom_test')
34 | 
35 |  print(annotatr_cache$get('hg19_custom_test'))
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/man/read_regions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/read.R
 3 | \name{read_regions}
 4 | \alias{read_regions}
 5 | \title{Read genomic regions in BEDX+Y format}
 6 | \usage{
 7 | read_regions(
 8 |   con,
 9 |   genome = NA,
10 |   format,
11 |   extraCols = character(),
12 |   rename_name,
13 |   rename_score,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{con}{A path, URL, connection or BEDFile object. See \code{rtracklayer::import()} documentation.}
19 | 
20 | \item{genome}{From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available.}
21 | 
22 | \item{format}{From \code{rtracklayer::import()}: The format of the output. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'.}
23 | 
24 | \item{extraCols}{From \code{rtracklayer::import()}: A character vector in the same form as 'colClasses' from 'read.table'.  It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats.}
25 | 
26 | \item{rename_name}{A string to rename the name column of the BED file. For example, if the name column actually contains a categorical variable.}
27 | 
28 | \item{rename_score}{A string to rename the score column of the BED file. For example, if the score column represents a quantity about the data besides the score in the BED specification.}
29 | 
30 | \item{...}{Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.}
31 | }
32 | \value{
33 | A \code{GRanges} object.
34 | }
35 | \description{
36 | \code{read_regions()} reads genomic regions by calling the \code{rtracklayer::import()} function. This function can automatically deal with BEDX files from BED3 to BED6. For BED6+Y, the \code{extraCols} argument should be used to correctly interpret the extra columns.
37 | }
38 | \details{
39 | NOTE: The \code{name} (4th) and \code{score} (5th) columns are so named. If these columns have a particular meaning for your data, they should be renamed with the \code{rename_name} and/or \code{rename_score} parameters.
40 | }
41 | \examples{
42 | 
43 |    # Example of reading a BED6+3 file where the last 3 columns are non-standard
44 |    file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
45 |    extraCols = c(diff_meth = 'numeric', mu0 = 'numeric', mu1 = 'numeric')
46 |    gr = read_regions(con = file, genome = 'hg19', extraCols = extraCols, format = 'bed',
47 |        rename_name = 'DM_status', rename_score = 'pval')
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/man/reformat_hmm_codes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{reformat_hmm_codes}
 4 | \alias{reformat_hmm_codes}
 5 | \title{Function to recode classes from chromHMM type column}
 6 | \usage{
 7 | reformat_hmm_codes(hmm_codes)
 8 | }
 9 | \arguments{
10 | \item{hmm_codes}{The chromHMM codes in the original form from the UCSC Genome Browser track.}
11 | }
12 | \value{
13 | A character vector of chromHMM classes with numbers and underscores removed.
14 | }
15 | \description{
16 | Function to recode classes from chromHMM type column
17 | }
18 | 


--------------------------------------------------------------------------------
/man/subset_order_tbl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{subset_order_tbl}
 4 | \alias{subset_order_tbl}
 5 | \title{Function to subset a tbl_df or grouped_df by a column}
 6 | \usage{
 7 | subset_order_tbl(tbl, col, col_order)
 8 | }
 9 | \arguments{
10 | \item{tbl}{A \code{tbl_df} or \code{grouped_df}.}
11 | 
12 | \item{col}{A string indicating which column of \code{tbl} to subset and order.}
13 | 
14 | \item{col_order}{A character vector indicating the order of \code{col}.}
15 | }
16 | \value{
17 | A modified version of \code{tbl} with \code{col} subsetted by \code{col_order}.
18 | }
19 | \description{
20 | Function to subset a tbl_df or grouped_df by a column
21 | }
22 | 


--------------------------------------------------------------------------------
/man/summarize_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarize.R
 3 | \name{summarize_annotations}
 4 | \alias{summarize_annotations}
 5 | \title{Summarize annotation counts}
 6 | \usage{
 7 | summarize_annotations(annotated_regions, annotated_random, quiet = FALSE)
 8 | }
 9 | \arguments{
10 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.}
11 | 
12 | \item{annotated_random}{The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}.}
13 | 
14 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
15 | }
16 | \value{
17 | A \code{tbl_df} of the number of regions per annotation type.
18 | }
19 | \description{
20 | Given a \code{GRanges} of annotated regions, count the number of regions in each annotation type. If \code{annotated_random} is not \code{NULL}, then the same is computed for the random regions.
21 | }
22 | \details{
23 | If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both.
24 | }
25 | \examples{
26 |    ### An example of ChIP-seq peaks with signalValue
27 | 
28 |    # Get premade CpG annotations
29 |    data('annotations', package = 'annotatr')
30 | 
31 |    file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
32 |    r = read_regions(con = file, genome = 'hg19')
33 | 
34 |    a = annotate_regions(
35 |        regions = r,
36 |        annotations = annotations,
37 |        ignore.strand = TRUE,
38 |        quiet = FALSE)
39 | 
40 |    rnd = randomize_regions(regions = r)
41 | 
42 |    rnd_annots = annotate_regions(
43 |        regions = rnd,
44 |        annotations = annotations,
45 |        ignore.strand = TRUE,
46 |        quiet = FALSE)
47 | 
48 |    # Summarize the annotated regions without randomized regions
49 |    s = summarize_annotations(annotated_regions = a)
50 | 
51 |    # Summarize the annotated regions with randomized regions
52 |    s_rnd = summarize_annotations(
53 |        annotated_regions = a,
54 |        annotated_random = rnd_annots)
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/man/summarize_categorical.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarize.R
 3 | \name{summarize_categorical}
 4 | \alias{summarize_categorical}
 5 | \title{Summarize categorical data over groupings of annotated regions}
 6 | \usage{
 7 | summarize_categorical(
 8 |   annotated_regions,
 9 |   by = c("annot.type", "annot.id"),
10 |   quiet = FALSE
11 | )
12 | }
13 | \arguments{
14 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.}
15 | 
16 | \item{by}{A character vector to group the data in \code{as.data.frame(annotated_regions)} by and tally over. Default is \code{c('annot.type', 'annot.id')}.}
17 | 
18 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
19 | }
20 | \value{
21 | A grouped \code{dplyr::tbl_df} of the counts of groupings according to the \code{by} vector.
22 | }
23 | \description{
24 | Given a \code{GRanges} of annotated regions, count the number of regions when the annotations are grouped \code{by} categorical columns.
25 | }
26 | \details{
27 | If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both.
28 | }
29 | \examples{
30 | 
31 |    # Get premade CpG annotations
32 |    data('annotations', package = 'annotatr')
33 | 
34 |    r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
35 |    extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
36 |    r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage')
37 | 
38 |    a = annotate_regions(
39 |        regions = r,
40 |        annotations = annotations,
41 |        ignore.strand = TRUE)
42 | 
43 |    sc = summarize_categorical(
44 |        annotated_regions = a,
45 |        by = c('annot.type', 'name'),
46 |        quiet = FALSE)
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/man/summarize_numerical.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarize.R
 3 | \name{summarize_numerical}
 4 | \alias{summarize_numerical}
 5 | \title{Summarize numerical data over groupings of annotated regions}
 6 | \usage{
 7 | summarize_numerical(
 8 |   annotated_regions,
 9 |   by = c("annot.type", "annot.id"),
10 |   over,
11 |   quiet = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.}
16 | 
17 | \item{by}{A character vector of the columns of \code{as.data.frame(annotated_regions)} to group over. Default is \code{c('annot.type', 'annot.id')}.}
18 | 
19 | \item{over}{A character vector of the numerical columns in \code{as.data.frame(annotated_regions)} to \code{count}, take the \code{mean}, and take the \code{sd} over after grouping according to the \code{by} column. NOTE: If more than one value is used, the naming scheme for the resulting \code{dplyr::tbl} summary columns is \code{COLNAME_n}, \code{COLNAME_mean}, \code{COLNAME_sd}. If \code{over} has length one, then the column names are \code{n}, \code{mean}, \code{sd}.}
20 | 
21 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).}
22 | }
23 | \value{
24 | A grouped \code{dplyr::tbl_df}, and the \code{count}, \code{mean}, and \code{sd} of the \code{over} columns \code{by} the groupings.
25 | }
26 | \description{
27 | Given a \code{GRanges} of annotated regions, summarize numerical data columns based on a grouping.
28 | }
29 | \details{
30 | NOTE: We do not take the distinct values of \code{seqnames}, \code{start}, \code{end}, \code{annot.type} as in the other \code{summarize_*()} functions because in the case of a region that intersected two distinct exons, using \code{distinct()} would destroy the information of the mean of the numerical column over one of the exons, which is not desirable.
31 | }
32 | \examples{
33 | ### Test on a very simple bed file to demonstrate different options
34 | 
35 | # Get premade CpG annotations
36 | data('annotations', package = 'annotatr')
37 | 
38 | r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
39 | extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
40 | r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage')
41 | 
42 | a = annotate_regions(
43 |        regions = r,
44 |        annotations = annotations,
45 |        ignore.strand = TRUE)
46 | 
47 | # Testing over normal by
48 | sn1 = summarize_numerical(
49 |        annotated_regions = a,
50 |        by = c('annot.type', 'annot.id'),
51 |        over = c('coverage', 'mu1', 'mu0'),
52 |        quiet = FALSE)
53 | 
54 | # Testing over a different by
55 | sn2 = summarize_numerical(
56 |        annotated_regions = a,
57 |        by = c('diff_exp'),
58 |        over = c('coverage', 'mu1', 'mu0'))
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/man/tidy_annotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{tidy_annotations}
 4 | \alias{tidy_annotations}
 5 | \title{Function to tidy up annotation accessors for visualization}
 6 | \usage{
 7 | tidy_annotations(annotations)
 8 | }
 9 | \arguments{
10 | \item{annotations}{A character vector of annotations, in the order they are to appear in the visualization.}
11 | }
12 | \value{
13 | A list of mappings from original annotation names to names ready for visualization.
14 | }
15 | \description{
16 | Function to tidy up annotation accessors for visualization
17 | }
18 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)  # test framework that provides test_check()
2 | library(annotatr)  # package under test
3 | 
4 | test_check("annotatr")  # run the test suite for the annotatr package
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_1_utils.R:
--------------------------------------------------------------------------------
 1 | context('Test utility functions')
 2 | 
 3 | test_that('Test get_*_name() functions', {
 4 |     # Both name lookups are expected to reject 'hg18' with a 'should be one of' error
 5 |     rejected_genome = 'hg18'
 6 |     rejection_msg = 'should be one of'
 7 | 
 8 |     expect_error(get_txdb_name(genome = rejected_genome), regexp = rejection_msg)
 9 | 
10 |     expect_error(get_orgdb_name(genome = rejected_genome), regexp = rejection_msg)
11 | })
12 | 
13 | test_that('Test tidy_annotations()', {
14 |     hg19_annots = c('hg19_cpg_islands', 'hg19_cpg_inter', 'hg19_genes_firstexons', 'hg19_genes_intronexonboundaries', 'hg19_genes_exonintronboundaries', 'hg19_lncrna_gencode', 'hg19_chromatin_Gm12878-ActivePromoter')
15 |     mm9_annots = c('mm9_cpg_islands','mm9_genes_exonsCDSs','mm9_cpg_inter')
16 |     rn4_custom_annots = c('rn4_custom_cpgislands','rn4_custom_TFBS')
17 | 
18 |     hg19_tidy_annots = tidy_annotations(hg19_annots)
19 |     mm9_tidy_annots = tidy_annotations(mm9_annots)
20 |     rn4_tidy_annots = tidy_annotations(rn4_custom_annots)
21 | 
22 |     # Compare the name vectors directly: all(x == y) passes vacuously when names() is NULL and hides which element differs.
23 |     expect_equal( names(hg19_tidy_annots), expected = c('CpG islands', 'interCGI', 'first exons', 'intron/exon boundaries', 'exon/intron boundaries', 'GENCODE lncRNA', 'Gm12878-ActivePromoter'))
24 |     expect_equal( names(mm9_tidy_annots), expected = c('CpG islands', 'exonsCDSs', 'interCGI'))
25 |     expect_equal( names(rn4_tidy_annots), expected = c('cpgislands', 'TFBS'))
26 | })
27 | 
28 | test_that('Test check_annotations()', {
29 |     with_hg17 = c('hg17_genes_promoters','hg19_cpgs')
30 |     with_hello = c('hello','hg19_genes_promoters','hg19_cpgs')
31 |     mixed_genomes = c('hg19_genes_promoters', 'mm9_cpg_islands')
32 |     unsupported_msg = 'not supported. See builtin_annotations()'
33 |     expect_error( check_annotations(with_hg17), regexp = unsupported_msg )
34 |     expect_error( check_annotations(with_hello), regexp = unsupported_msg )
35 |     expect_error( check_annotations(mixed_genomes), regexp = 'genome prefix on all annotations must be the same' )
36 | })
37 | 
38 | test_that('Test expand_annotations()', {
39 |     plain_codes = c('hg19_genes_promoters', 'hg19_genes_exons')
40 | 
41 |     two_shortcuts = c('mm9_basicgenes', 'mm9_cpgs')
42 |     two_shortcuts_expanded = c('mm9_cpg_islands', 'mm9_cpg_shores', 'mm9_cpg_shelves', 'mm9_cpg_inter', 'mm9_genes_1to5kb', 'mm9_genes_promoters', 'mm9_genes_5UTRs', 'mm9_genes_exons', 'mm9_genes_introns', 'mm9_genes_3UTRs')
43 | 
44 |     code_plus_shortcut = c('hg19_cpg_shores', 'hg19_cpgs')
45 |     code_plus_shortcut_expanded = c('hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves','hg19_cpg_inter')
46 | 
47 |     chromatin_shortcut = c('hg19_Hepg2-chromatin')
48 |     chromatin_expanded = c('hg19_chromatin_Hepg2-ActivePromoter','hg19_chromatin_Hepg2-WeakPromoter','hg19_chromatin_Hepg2-PoisedPromoter','hg19_chromatin_Hepg2-StrongEnhancer','hg19_chromatin_Hepg2-WeakEnhancer','hg19_chromatin_Hepg2-Insulator','hg19_chromatin_Hepg2-TxnTransition','hg19_chromatin_Hepg2-TxnElongation','hg19_chromatin_Hepg2-WeakTxn','hg19_chromatin_Hepg2-Repressed','hg19_chromatin_Hepg2-Heterochrom/lo','hg19_chromatin_Hepg2-Repetitive/CNV')
49 | 
50 |     # Element order is not checked: each expansion is compared to its expectation as a set
51 |     expect_true( dplyr::setequal(expand_annotations(plain_codes), plain_codes) )
52 |     expect_true( dplyr::setequal(expand_annotations(two_shortcuts), two_shortcuts_expanded) )
53 |     expect_true( dplyr::setequal(expand_annotations(code_plus_shortcut), code_plus_shortcut_expanded) )
54 |     expect_true( dplyr::setequal(expand_annotations(chromatin_shortcut), chromatin_expanded) )
55 | })
56 | 


--------------------------------------------------------------------------------
/tests/testthat/test_2_read.R:
--------------------------------------------------------------------------------
  1 | context('Test read module')
  2 | 
  3 | ################################################################################
  4 | # Test warnings in read_regions()
  5 | 
  6 | test_that('Test rename_* warnings', {
  7 |     stat3_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
  8 | 
  9 |     # Each rename_* argument is expected to raise an 'Ignoring ...' warning for this file
 10 |     expect_warning(
 11 |         read_regions(con = stat3_bed, format = 'bed', rename_name = 'hello'),
 12 |         regexp = 'Ignoring rename_name parameter because')
 13 |     expect_warning(
 14 |         read_regions(con = stat3_bed, format = 'bed', rename_score = 'score'),
 15 |         regexp = 'Ignoring rename_score parameter because')
 16 | })
 17 | 
 18 | ################################################################################
 19 | # Test BED3-6+ and bedGraph
 20 | 
 21 | test_that('Test BED3', {
 22 |     bed3_path = system.file('extdata', 'test_BED3.bed', package = 'annotatr')
 23 |     regions = read_regions(con = bed3_path, format = 'bed')
 24 |     # The BED3 test file should be read into a GRanges
 25 |     expect_true(is(regions, 'GRanges'))
 26 | })
 27 | 
 28 | test_that('Test BED4', {
 29 |     bed4_path = system.file('extdata', 'test_BED4.bed', package = 'annotatr')
 30 |     regions = read_regions(con = bed4_path, format = 'bed')
 31 |     # The BED4 test file should be read into a GRanges
 32 |     expect_true(is(regions, 'GRanges'))
 33 | })
 34 | 
 35 | test_that('Test BED5', {
 36 |     bed5_path = system.file('extdata', 'test_BED5.bed', package = 'annotatr')
 37 |     regions = read_regions(con = bed5_path, format = 'bed')
 38 |     # The BED5 test file should be read into a GRanges
 39 |     expect_true(is(regions, 'GRanges'))
 40 | })
 41 | 
 42 | test_that('Test BED6', {
 43 |     bed6_path = system.file('extdata', 'test_BED6.bed', package = 'annotatr')
 44 |     regions = read_regions(con = bed6_path, format = 'bed')
 45 |     # The BED6 test file should be read into a GRanges
 46 |     expect_true(is(regions, 'GRanges'))
 47 | })
 48 | 
 49 | test_that('Test BED6+ with renaming', {
 50 |     dm_path = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
 51 |     extra_col_types = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
 52 |     regions = read_regions(con = dm_path, format = 'bed', extraCols = extra_col_types,
 53 |         rename_name = 'DM_status', rename_score = 'pval')
 54 |     expect_true(is(regions, 'GRanges'))
 55 | })
 56 | 
 57 | test_that('Test bedGraph', {
 58 |     bedgraph_path = system.file('extdata', 'test_bedGraph.bedGraph', package = 'annotatr')
 59 |     regions = read_regions(con = bedgraph_path, format = 'bedGraph')
 60 |     # The bedGraph test file should be read into a GRanges
 61 |     expect_true(is(regions, 'GRanges'))
 62 | })
 63 | 
 64 | ################################################################################
 65 | # Test read_annotations() with custom BED annotation files
 66 | 
 67 | test_that('Test custom BED3 with no genome and a name', {
 68 |     annot_path = system.file('extdata', 'test_annotations_3.bed', package = 'annotatr')
 69 |     read_annotations(con = annot_path, name = 'test', format = 'bed')
 70 |     # Inspect the metadata columns of the entry stored in the cache
 71 |     cached_cols = colnames(mcols(annotatr_cache$get('genome_custom_test')))
 72 |     expect_true( all(cached_cols == c('id','tx_id','gene_id','symbol','type')) )
 73 | })
 73 | 
 74 | test_that('Test custom BED3 with no name and a genome', {
 75 |     annot_path = system.file('extdata', 'test_annotations_3.bed', package = 'annotatr')
 76 |     read_annotations(con = annot_path, genome = 'hg19', format = 'bed')
 77 |     # Inspect the metadata columns of the entry stored in the cache
 78 |     cached_cols = colnames(mcols(annotatr_cache$get('hg19_custom_annotations')))
 79 |     expect_true( all(cached_cols == c('id','tx_id','gene_id','symbol','type')) )
 80 | })
 80 | 
 81 | test_that('Test custom BED3 with no name or genome', {
 82 |     annot_path = system.file('extdata', 'test_annotations_3.bed', package = 'annotatr')
 83 |     read_annotations(format = 'bed', con = annot_path)
 84 |     expect_true( all(c('id','tx_id','gene_id','symbol','type') == colnames(mcols(annotatr_cache$get('genome_custom_annotations')))) )
 85 | })
 86 | 
 87 | test_that('Test custom BED4', {
 88 |     file = system.file('extdata', 'test_annotations_4.bed', package = 'annotatr')
 89 |     read_annotations(con = file, name = 'bed4', format = 'bed')
 90 |     # An explicit name gives a dedicated cache key, so the check covers the BED4 annotations just read rather than an entry cached by an earlier test.
 91 |     expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_bed4'))) == c('id','tx_id','gene_id','symbol','type')) )
 92 | })
 93 | 
 94 | test_that('Test custom BED5', {
 95 |     file = system.file('extdata', 'test_annotations_5.bed', package = 'annotatr')
 96 |     read_annotations(con = file, name = 'bed5', format = 'bed')
 97 |     # An explicit name gives a dedicated cache key, so the check covers the BED5 annotations just read rather than an entry cached by an earlier test.
 98 |     expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_bed5'))) == c('id','tx_id','gene_id','symbol','type')) )
 99 | })
100 | 
101 | test_that('Test custom BED6', {
102 |     annot_path = system.file('extdata', 'test_annotations_6.bed', package = 'annotatr')
103 |     read_annotations(con = annot_path, name = 'six', format = 'bed')
104 |     # Inspect the metadata columns of the entry stored in the cache
105 |     cached_cols = colnames(mcols(annotatr_cache$get('genome_custom_six')))
106 |     expect_true( all(cached_cols == c('id','tx_id','gene_id','symbol','type')) )
107 | })
107 | 
108 | test_that('Test custom BED6 with gene_id', {
109 |     annot_path = system.file('extdata', 'test_annotations_6_gene.bed', package = 'annotatr')
110 |     read_annotations(con = annot_path, name = 'geneid', format = 'bed', extraCols = c(gene_id = 'character'))
111 | 
112 |     cached_cols = colnames(mcols(annotatr_cache$get('genome_custom_geneid')))
113 |     expect_true( all(cached_cols == c('id','tx_id','gene_id','symbol','type')) )
114 | })
115 | 
116 | test_that('Test custom BED6 with symbol', {
117 |     annot_path = system.file('extdata', 'test_annotations_6_symbol.bed', package = 'annotatr')
118 |     read_annotations(con = annot_path, name = 'symbol', format = 'bed', extraCols = c(symbol = 'character'))
119 | 
120 |     cached_cols = colnames(mcols(annotatr_cache$get('genome_custom_symbol')))
121 |     expect_true( all(cached_cols == c('id','tx_id','gene_id','symbol','type')) )
122 | })
123 | 
124 | test_that('Test custom BED6 with tx_id, gene_id, and symbol', {
125 |     file = system.file('extdata', 'test_annotations_6_tx_gene_symbol.bed', package = 'annotatr')
126 |     extraCols = c(gene_id = 'character', symbol = 'character', tx_id = 'character')
127 |     read_annotations(con = file, name = 'txgenesymbol', format = 'bed', extraCols = extraCols)
128 |     # All three extra columns are supplied; the cached entry should expose the same five metadata columns as the other custom tests
129 |     expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_txgenesymbol'))) == c('id','tx_id','gene_id','symbol','type')) )
130 | })
131 | 


--------------------------------------------------------------------------------
/tests/testthat/test_3_build_annotations.R:
--------------------------------------------------------------------------------
 1 | context('Test build annotations module')
 2 | 
 3 | ################################################################################
 4 | # Test errors
 5 | 
 6 | test_that('Test error for non-existent custom annotations', {
 7 |     # Building this custom code is expected to fail with a 'not in annotatr_cache' error
 8 |     missing_code = 'hg19_custom_ezh2'
 9 |     expect_error(build_annotations(genome = 'hg19', annotations = missing_code),
10 |         regexp = 'not in annotatr_cache')
11 | })
12 | 
13 | ################################################################################
14 | # Test annotations that aren't otherwise tested
15 | # intergenic, cds, firstexons, and both boundaries
16 | 
17 | # test_that('Test all annotations', {
18 | #     annots = c('hg19_basicgenes', 'hg19_cpgs', 'hg19_genes_intergenic', 'hg19_genes_cds', 'hg19_genes_firstexons', 'hg19_genes_intronexonboundaries', 'hg19_genes_exonintronboundaries', 'hg19_lncrna_gencode', 'hg19_Gm12878-chromatin', 'hg19_H1hesc-chromatin', 'hg19_Hepg2-chromatin', 'hg19_Hmec-chromatin', 'hg19_Hsmm-chromatin', 'hg19_Huvec-chromatin', 'hg19_K562-chromatin', 'hg19_Nhek-chromatin', 'hg19_Nhlf-chromatin')
19 | #     annotations = build_annotations(genome = 'hg19', annotations = annots)
20 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
21 | #
22 | #     annots = c('hg38_basicgenes', 'hg38_cpgs', 'hg38_genes_intergenic', 'hg38_genes_cds', 'hg38_genes_firstexons', 'hg38_genes_intronexonboundaries', 'hg38_genes_exonintronboundaries', 'hg38_lncrna_gencode', 'hg38_enhancers_fantom')
23 | #     annotations = build_annotations(genome = 'hg38', annotations = annots)
24 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
25 | #
26 | #     annots = c('mm10_basicgenes', 'mm10_cpgs', 'mm10_genes_intergenic', 'mm10_genes_cds', 'mm10_genes_firstexons', 'mm10_genes_intronexonboundaries', 'mm10_genes_exonintronboundaries', 'mm10_lncrna_gencode', 'mm10_enhancers_fantom')
27 | #     annotations = build_annotations(genome = 'mm10', annotations = annots)
28 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
29 | #
30 | #     annots = c('mm9_basicgenes', 'mm9_cpgs', 'mm9_genes_intergenic', 'mm9_genes_cds', 'mm9_genes_firstexons', 'mm9_genes_intronexonboundaries', 'mm9_genes_exonintronboundaries', 'mm9_enhancers_fantom')
31 | #     annotations = build_annotations(genome = 'mm9', annotations = annots)
32 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
33 | #
34 | #     annots = c('rn4_basicgenes', 'rn4_cpgs', 'rn4_genes_intergenic', 'rn4_genes_cds', 'rn4_genes_firstexons', 'rn4_genes_intronexonboundaries', 'rn4_genes_exonintronboundaries')
35 | #     annotations = build_annotations(genome = 'rn4', annotations = annots)
36 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
37 | #
38 | #     annots = c('rn5_basicgenes', 'rn5_cpgs', 'rn5_genes_intergenic', 'rn5_genes_cds', 'rn5_genes_firstexons', 'rn5_genes_intronexonboundaries', 'rn5_genes_exonintronboundaries')
39 | #     annotations = build_annotations(genome = 'rn5', annotations = annots)
40 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
41 | #
42 | #     annots = c('rn6_basicgenes', 'rn6_cpgs', 'rn6_genes_intergenic', 'rn6_genes_cds', 'rn6_genes_firstexons', 'rn6_genes_intronexonboundaries', 'rn6_genes_exonintronboundaries')
43 | #     annotations = build_annotations(genome = 'rn6', annotations = annots)
44 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
45 | #
46 | #     annots = c('dm3_basicgenes', 'dm3_genes_intergenic', 'dm3_genes_cds', 'dm3_genes_firstexons', 'dm3_genes_intronexonboundaries', 'dm3_genes_exonintronboundaries')
47 | #     annotations = build_annotations(genome = 'dm3', annotations = annots)
48 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
49 | #
50 | #     annots = c('dm6_basicgenes', 'dm6_genes_intergenic', 'dm6_genes_cds', 'dm6_genes_firstexons', 'dm6_genes_intronexonboundaries', 'dm6_genes_exonintronboundaries')
51 | #     annotations = build_annotations(genome = 'dm6', annotations = annots)
52 | #     expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) )
53 | # })
54 | 


--------------------------------------------------------------------------------
/tests/testthat/test_4_intersect.R:
--------------------------------------------------------------------------------
  1 | context('Test intersect/annotate module')
  2 | 
  3 | ################################################################################
  4 | # Test errors
  5 | 
  6 | test_that('Test error thrown for non-GRanges regions object in annotate_regions()',{
  7 |     cpg_codes = c('hg19_cpg_islands')
  8 | 
  9 |     bed_path = system.file('extdata', 'test_intersect.bed', package = 'annotatr')
 10 |     regions = suppressMessages(read_regions(con = bed_path, format = 'bed'))
 11 | 
 12 |     # A file path is passed where the regions GRanges is expected
 13 |     expect_error(
 14 |         annotate_regions(
 15 |             regions = bed_path,
 16 |             annotations = cpg_codes,
 17 |             ignore.strand = TRUE,
 18 |             quiet = TRUE),
 19 |         regexp = "regions object is not GRanges")
 20 | 
 21 |     # A file path is passed where the annotations GRanges is expected
 22 |     expect_error(
 23 |         annotate_regions(
 24 |             regions = regions,
 25 |             annotations = bed_path,
 26 |             ignore.strand = TRUE,
 27 |             quiet = TRUE),
 28 |         regexp = "annotations object is not GRanges")
 29 | 
 30 |     # Custom annotations from the no-overlap test file should produce the no-intersection error
 31 |     nooverlap_path = system.file('extdata', 'test_annotation_nooverlap.bed', package = 'annotatr')
 32 |     read_annotations(con = nooverlap_path, name = 'test')
 33 |     nooverlap_annots = build_annotations(genome = 'hg19', annotations = 'genome_custom_test')
 34 |     expect_error(
 35 |         annotate_regions(regions = regions, annotations = nooverlap_annots,
 36 |             ignore.strand = TRUE, quiet = TRUE),
 37 |         regexp = "No annotations intersect the regions")
 38 | })
 39 | 
 40 | ################################################################################
 41 | # Test annotate_regions()
 42 | 
 43 | test_that('Test a la carte annotations in annotate_regions()',{
 44 |     # Load the premade annotations from the annotatr package data
 45 |     data('annotations', package = 'annotatr')
 46 |     cpg_codes = expand_annotations('hg19_cpgs')
 47 | 
 48 |     bed_path = system.file('extdata', 'test_intersect.bed', package = 'annotatr')
 49 |     regions = read_regions(con = bed_path, format = 'bed')
 50 | 
 51 |     annotated = annotate_regions(regions = regions, annotations = annotations,
 52 |         ignore.strand = TRUE, quiet = TRUE)
 53 | 
 54 |     # Every annotation type that was hit must be among the expanded CpG codes
 55 |     observed_types = unique(annotated$annot$type)
 56 |     allowed_types = expand_annotations(cpg_codes)
 57 |     expect_true( all(observed_types %in% allowed_types) )
 58 | })
 59 | 
 60 | test_that('Test a la carte and shortcut annotations in annotate_regions()',{
 61 |     data('annotations', package = 'annotatr')
 62 | 
 63 |     stat3_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
 64 |     regions = read_regions(con = stat3_bed, format = 'bed')
 65 | 
 66 |     annotated = annotate_regions(regions = regions, annotations = annotations,
 67 |         ignore.strand = TRUE, quiet = TRUE)
 68 | 
 69 |     # Every annotation type that was hit must be one of the four hg19 CpG codes
 70 |     cpg_types = c('hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves', 'hg19_cpg_inter')
 71 |     observed_types = unique(annotated$annot$type)
 72 |     expect_true( all(observed_types %in% cpg_types) )
 73 | })
 74 | 
 75 | test_that('Custom annotations work in annotate_regions()', {
 76 |     r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
 77 |     extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
 78 |     r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage')
 79 |     # Read the custom TFBS annotations under hg19 so they can be built together with the CpG shortcut
 80 |     a_file = system.file('extdata', 'test_annotations_3.bed', package='annotatr')
 81 |     read_annotations(con = a_file, name = 'TFBS', genome = 'hg19')
 82 | 
 83 |     annots = c('hg19_custom_TFBS', 'hg19_cpgs')
 84 |     annotations = build_annotations(genome = 'hg19', annotations = annots)
 85 | 
 86 |     a = annotate_regions(
 87 |         regions = r,
 88 |         annotations = annotations,
 89 |         ignore.strand = TRUE,
 90 |         quiet = TRUE)
 91 |     # Compare the length itself (not a pre-computed logical) so a failure reports the actual count
 92 |     expect_equal(length(a), expected = 10)
 93 | })
 94 | 
 95 | test_that('annotate_regions() works with only custom annotations', {
 96 |     r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
 97 |     extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
 98 |     r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage')
 99 |     # No genome is given to read_annotations(), so the annotations are requested below as 'genome_custom_TFBS'
100 |     a_file = system.file('extdata', 'test_annotations_3.bed', package='annotatr')
101 |     read_annotations(con = a_file, name = 'TFBS')
102 |     annotations = build_annotations(genome = 'hg19', annotations = 'genome_custom_TFBS')
103 | 
104 |     a = annotate_regions(
105 |         regions = r,
106 |         annotations = annotations,
107 |         ignore.strand = TRUE,
108 |         quiet = FALSE)
109 |     # Compare the length itself (not a pre-computed logical) so a failure reports the actual count
110 |     expect_equal(length(a), expected = 5)
111 | })
112 | 
113 | test_that('annotate_regions() uses minoverlap correctly', {
114 |     file = system.file('extdata', 'test_BED3.bed', package = 'annotatr')
115 |     r = read_regions(con = file, format = 'bed')
116 | 
117 |     a_file = system.file('extdata', 'test_annotations_minoverlap.bed', package='annotatr')
118 |     read_annotations(con = a_file, name = 'TFBS')
119 |     annotations = build_annotations(genome = 'hg19', annotations = 'genome_custom_TFBS')
120 | 
121 |     a = annotate_regions(
122 |         regions = r,
123 |         annotations = annotations,
124 |         minoverlap = 5)
125 |     # Compare values directly so a failure shows the actual length/starts and no vector recycling can mask a mismatch
126 |     expect_equal(length(a), expected = 2)
127 |     expect_equal(GenomicRanges::start(a), expected = c(10791,28801))
128 | })
129 | 


--------------------------------------------------------------------------------
/tests/testthat/test_5_randomize.R:
--------------------------------------------------------------------------------
 1 | context('Test randomize module')
 2 | 
 3 | ################################################################################
 4 | # Setup objects for randomize_regions()
 5 | 
 6 |     stat3_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
 7 |     regions_nogenome = read_regions(con = stat3_bed, format = 'bed')  # same file, read without a genome
 8 |     regions_genome = read_regions(con = stat3_bed, format = 'bed', genome = 'hg19')  # same file, read with genome = 'hg19'
 9 | 
10 | ################################################################################
11 | # Test errors
12 | 
13 |     test_that('Test errors', {
14 |         # A character string is passed instead of a GRanges
15 |         expect_error(
16 |             randomize_regions(regions = 'hello', allow.overlaps = TRUE, per.chromosome = TRUE),
17 |             regexp = 'regions must have class GRanges')
18 |         # Regions that were read without a genome argument
19 |         expect_error(randomize_regions(regions = regions_nogenome),
20 |             regexp = 'GRanges object must have a valid genome')
21 |     })
22 | 
23 | ################################################################################
24 | # Test randomize_regions()
25 | 
26 |     test_that('Test randomized regions', {
27 |         shuffled = randomize_regions(regions = regions_genome,
28 |             allow.overlaps = TRUE, per.chromosome = TRUE)
29 | 
30 |         # The result should be a GRanges with as many regions as the input
31 |         shuffled_class = class(shuffled)[1]
32 |         expect_equal(shuffled_class, expected = 'GRanges')
33 |         expect_equal(length(shuffled), expected = length(regions_genome))
34 |     })
35 | 


--------------------------------------------------------------------------------
/tests/testthat/test_6_summarize.R:
--------------------------------------------------------------------------------
  1 | context('Test summarize module')
  2 | 
  3 | data('annotations', package = 'annotatr')
  4 | # Fixtures built once at file level: regions, annotated regions, and annotated randomized regions
  5 | bed = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
  6 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
  7 | r = suppressMessages(read_regions(con = bed, genome = 'hg19', extraCols = extraCols, rename_score = 'pval', rename_name = 'DM_status', format = 'bed'))
  8 | r = r[1:1000]  # keep only the first 1000 regions
  9 | r$cancer_status = 'Cancer'
 10 | r2 = r
 11 | r2$cancer_status = 'NoCancer'
 12 | # Concatenate the 'Cancer' and 'NoCancer' copies, so each region appears twice
 13 | r_dup = c(r,r2)
 14 | # Annotate the 1000-region subset
 15 | a = suppressMessages(annotate_regions(
 16 |     regions = r,
 17 |     annotations = annotations,
 18 |     ignore.strand = TRUE,
 19 |     quiet = TRUE))
 20 | # Annotate the duplicated set
 21 | a_dup = suppressMessages(annotate_regions(
 22 |     regions = r_dup,
 23 |     annotations = annotations,
 24 |     ignore.strand = TRUE,
 25 |     quiet = TRUE))
 26 | # Randomized regions derived from r
 27 | rnd = suppressMessages(randomize_regions(regions = r))
 28 | # Annotate the randomized regions; passed as annotated_random in the summarize_annotations() test
 29 | rnd_annot = suppressMessages(annotate_regions(
 30 |     regions = rnd,
 31 |     annotations = annotations,
 32 |     ignore.strand = TRUE,
 33 |     quiet = TRUE))
 34 | 
 35 | ################################################################################
 36 | # Test errors
 37 | 
 38 | test_that('Test for error with over=NULL in summarize_numerical()',{
 39 |     # The `over` argument is left out of the call on purpose
 40 |     expect_error(summarize_numerical(annotated_regions = a), regexp = 'over cannot be missing')
 41 | })
 42 | 
 43 | ################################################################################
 44 | # Test summarize functions
 45 | 
 46 | test_that('Test summarize_annotations()', {
 47 |     # Counts for the data regions alone, then together with the random regions.
 48 |     counts_data = summarize_annotations(annotated_regions = a, quiet = FALSE)
 49 |     counts_with_random = summarize_annotations(
 50 |         annotated_regions = a,
 51 |         annotated_random = rnd_annot,
 52 |         quiet = FALSE)
 53 | 
 54 |     # NOTE: With small data the random regions may not intersect every CpG
 55 |     # type, in which case the row-count check below can fail. For the same
 56 |     # reason, fold changes computed from such a summary may lack the
 57 |     # corresponding random rows.
 58 |     expect_equal(sum(counts_data[['n']]), expected = 1064)
 59 |     expect_equal(nrow(counts_with_random), expected = 8)
 60 | })
 61 | 
 62 | test_that('Test summarize_numerical()', {
 63 |     # Summarize diff_meth within each (annot.type, annot.id) group.
 64 |     per_annot = summarize_numerical(
 65 |         annotated_regions = a, by = c('annot.type', 'annot.id'), over = 'diff_meth', quiet = TRUE)
 66 |     grand_mean = mean(per_annot[['mean']])
 67 | 
 68 |     # Mean of the per-group means, compared within the stated tolerance.
 69 |     expect_equal(grand_mean, expected = 2.424537, tolerance = 0.01)
 70 | })
 71 | 
 72 | test_that('Test summarize_numerical() and summarize_categorical() over small data', {
 73 |     # Numerical summaries: one column by annotation, several columns by DM status.
 74 |     num_by_annot = summarize_numerical(
 75 |         annotated_regions = a,
 76 |         by = c('annot.type', 'annot.id'),
 77 |         over = 'diff_meth',
 78 |         quiet = FALSE)
 79 |     num_by_status = summarize_numerical(
 80 |         annotated_regions = a,
 81 |         by = c('DM_status'),
 82 |         over = c('diff_meth', 'mu1', 'mu0'),
 83 |         quiet = TRUE)
 84 | 
 85 |     # Categorical summaries; the second uses the duplicated regions to check
 86 |     # the maintenance of duplicate regions carrying different categories.
 87 |     cat_by_status = summarize_categorical(
 88 |         annotated_regions = a, by = c('annot.type', 'DM_status'), quiet = FALSE)
 89 |     cat_by_cancer = summarize_categorical(
 90 |         annotated_regions = a_dup, by = c('annot.type', 'cancer_status'), quiet = FALSE)
 91 | 
 92 |     inter_row = which(num_by_annot[['annot.id']] == 'inter:8599')
 93 |     hyper_row = which(num_by_status[['DM_status']] == 'hyper')
 94 |     expect_equal(num_by_annot[['mean']][inter_row], expected = -1.0066888, tolerance = 0.01)
 95 |     expect_equal(num_by_status[['mu0_mean']][hyper_row], expected = 16.34614, tolerance = 0.01)
 96 | 
 97 |     inter_hyper = which(cat_by_status[['annot.type']] == 'hg19_cpg_inter' & cat_by_status[,'DM_status'] == 'hyper')
 98 |     expect_equal(cat_by_status[['n']][inter_hyper], expected = 19)
 99 |     # Both halves of the duplicated input must give equal inter-CGI counts.
100 |     is_inter = cat_by_cancer[['annot.type']] == 'hg19_cpg_inter'
101 |     n_inter = function(status) cat_by_cancer[['n']][which(is_inter & cat_by_cancer[,'cancer_status'] == status)]
102 |     expect_true(n_inter('Cancer') == n_inter('NoCancer'))
103 | })
104 | 


--------------------------------------------------------------------------------
/tests/testthat/test_7_visualize.R:
--------------------------------------------------------------------------------
  1 | context('Test plot module')
  2 | 
  3 | ################################################################################
  4 | # Setup annotation objects
  5 | data('annotations', package = 'annotatr')
  6 | 
  7 | ################################################################################
  8 | # Setup region objects shared by the plot tests
  9 | 
 10 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
 11 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
 12 | dm_regions = suppressMessages(read_regions(
 13 |     con = dm_file,
 14 |     genome = 'hg19',
 15 |     format = 'bed',
 16 |     extraCols = extraCols,
 17 |     rename_name = 'DM_status',
 18 |     rename_score = 'pval'))[1:1000]
 19 | dm_regions$cancer_status = 'Cancer'
 20 | 
 21 | # Same ranges under a second category, stacked to give duplicated regions.
 22 | dm_regions2 = dm_regions
 23 | dm_regions2$cancer_status = 'NoCancer'
 24 | duplicate_regions = c(dm_regions, dm_regions2)
 25 | 
 26 | # Randomized counterpart of the data regions.
 27 | dm_random_regions = suppressMessages(randomize_regions(regions = dm_regions))
 28 | 
 29 | # Every region set is annotated against the same example annotations.
 30 | annotate_quietly = function(x) {
 31 |     suppressMessages(annotate_regions(
 32 |         regions = x,
 33 |         annotations = annotations,
 34 |         ignore.strand = TRUE,
 35 |         quiet = TRUE))
 36 | }
 37 | 
 38 | dm_annots = annotate_quietly(dm_regions)
 39 | dm_dup_annots = annotate_quietly(duplicate_regions)
 40 | dm_random_annots = annotate_quietly(dm_random_regions)
 41 | 
 42 | ################################################################################
 43 | # Setup order vectors used by the plots below
 44 | 
 45 | # Display orders for the DM status levels and the CpG annotation types.
 46 | dm_order = c('hyper', 'hypo', 'none')
 47 | cpgs_order = c(
 48 |     'hg19_cpg_islands',
 49 |     'hg19_cpg_shores',
 50 |     'hg19_cpg_shelves',
 51 |     'hg19_cpg_inter')
 52 | 
 53 | ################################################################################
 54 | # Test plot_annotation()
 55 | 
 56 | test_that('Test plot_annotation() errors', {
 57 |     # An order vector of unmatched names is expected to raise the warning below.
 58 |     bogus_order = c('hypor','hype','')
 59 |     expect_warning(plot_annotation(annotated_regions = dm_annots, annotation_order = bogus_order), regexp = 'elements in col_order that are not present')
 60 | })
 61 | 
 62 | test_that('Test plot_annotation() success', {
 63 |     # Three call shapes: defaults only, custom order and labels, and with
 64 |     # annotated random regions supplied.
 65 |     plots = list(
 66 |         minimal = plot_annotation(annotated_regions = dm_annots),
 67 |         labelled = plot_annotation(
 68 |             annotated_regions = dm_annots,
 69 |             annotation_order = cpgs_order,
 70 |             plot_title = 'Testing plot title',
 71 |             x_label = 'Test x-label',
 72 |             y_label = 'Test y-label'),
 73 |         with_random = plot_annotation(
 74 |             annotated_regions = dm_annots,
 75 |             annotated_random = dm_random_annots,
 76 |             annotation_order = NULL,
 77 |             plot_title = 'Testing dodged bars',
 78 |             x_label = 'Annotation Type',
 79 |             y_label = 'Count'))
 80 | 
 81 |     # Each call must hand back an object whose class set is exactly gg/ggplot.
 82 |     for (p in plots) expect_equal(dplyr::setequal(class(p), c('gg','ggplot')), expected = TRUE)
 83 | })
 84 | 
 85 | ################################################################################
 86 | # Test plot_coannotations()
 87 | 
 88 | test_that('Test plot_coannotations() success', {
 89 |     # Build the co-annotation plot with an explicit CpG ordering and labels.
 90 |     coannot_plot = plot_coannotations(
 91 |         annotated_regions = dm_annots,
 92 |         annotation_order = cpgs_order,
 93 |         axes_label = 'Annotations',
 94 |         plot_title = 'Co-occurrence of Annotations')
 95 |     plot_classes = class(coannot_plot)
 96 |     expect_equal(dplyr::setequal(plot_classes, c('gg','ggplot')), expected = TRUE)
 97 | })
 98 | 
 99 | ################################################################################
100 | # Test plot_numerical()
101 | 
102 | test_that('Test plot_numerical() success', {
103 |     # One numerical column, faceted over the CpG annotation types.
104 |     dm_vs_regions_mu1 = plot_numerical(
105 |         annotated_regions = dm_annots,
106 |         x = 'mu1',
107 |         facet = 'annot.type',
108 |         facet_order = c('hg19_cpg_islands','hg19_cpg_shores','hg19_cpg_shelves','hg19_cpg_inter'),
109 |         bin_width = 5,
110 |         plot_title = 'Group 1 Methylation over CpG Annotations',
111 |         x_label = 'Group 1 Methylation',
112 |         legend_facet_label = 'Group 1 Methylation Rate in Annotation',
113 |         legend_cum_label = 'Overall Group 1 Methylation Rate')
114 |     # Two numerical columns, faceted over the CpG annotation types.
115 |     dm_vs_regions_annot = plot_numerical(
116 |         annotated_regions = dm_annots,
117 |         x = 'mu0',
118 |         y = 'mu1',
119 |         facet = 'annot.type',
120 |         facet_order = c('hg19_cpg_islands','hg19_cpg_shores','hg19_cpg_shelves','hg19_cpg_inter'),
121 |         plot_title = 'Region Methylation: Group 0 vs Group 1',
122 |         x_label = 'Group 0',
123 |         y_label = 'Group 1')
124 |     # Two numerical columns, faceted over DM status.
125 |     dm_vs_regions_name = plot_numerical(
126 |         annotated_regions = dm_annots,
127 |         x = 'mu0',
128 |         y = 'mu1',
129 |         facet = 'DM_status',
130 |         facet_order = c('hyper','hypo','none'),
131 |         plot_title = 'Region Methylation: Group 0 vs Group 1',
132 |         x_label = 'Group 0',
133 |         y_label = 'Group 1')
134 |     # One numerical column, two facet columns with an order given for each.
135 |     dm_vs_regions_mu12 = plot_numerical(
136 |         annotated_regions = dm_annots,
137 |         x = 'mu1',
138 |         facet = c('annot.type', 'DM_status'),
139 |         facet_order = list(c('hg19_cpg_islands','hg19_cpg_shores'), c('hyper','hypo','none')),
140 |         plot_title = 'Region Methylation: Group 0 vs Group 1',
141 |         x_label = 'Group 0',
142 |         y_label = 'Group 1')
143 |     # Two numerical columns, two facet columns; the first facet's order is NULL.
144 |     dm_vs_regions_name2 = plot_numerical(
145 |         annotated_regions = dm_annots,
146 |         x = 'mu0',
147 |         y = 'mu1',
148 |         facet = c('annot.type', 'DM_status'),
149 |         facet_order = list(NULL, c('hyper','hypo','none')),
150 |         plot_title = 'Region Methylation: Group 0 vs Group 1',
151 |         x_label = 'Group 0',
152 |         y_label = 'Group 1')
153 | 
154 |     # Check every plot built above, including the two-facet variants.
155 |     all_plots = list(dm_vs_regions_mu1, dm_vs_regions_annot, dm_vs_regions_name, dm_vs_regions_mu12, dm_vs_regions_name2)
156 |     for (p in all_plots) expect_equal( dplyr::setequal(class(p), c('gg','ggplot')), expected = TRUE)
157 | })
158 | 
159 | ################################################################################
160 | # Test plot_numerical_coannotations()
161 | 
162 | test_that('Test plot_numerical_coannotations()', {
163 |     # One numerical column over the islands/shores annotation pair.
164 |     single_var_plot = plot_numerical_coannotations(
165 |         annotated_regions = dm_annots,
166 |         x = 'mu0',
167 |         annot1 = 'hg19_cpg_islands',
168 |         annot2 = 'hg19_cpg_shores',
169 |         bin_width = 5,
170 |         plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters',
171 |         x_label = 'Percent Methylation',
172 |         legend_facet_label = 'Perc. Methylation in annotation pair',
173 |         legend_cum_label = 'Overall Perc. Methylation')
174 |     # Two numerical columns over the same annotation pair.
175 |     two_var_plot = plot_numerical_coannotations(
176 |         annotated_regions = dm_annots,
177 |         x = 'mu0',
178 |         y = 'mu1',
179 |         annot1 = 'hg19_cpg_islands',
180 |         annot2 = 'hg19_cpg_shores',
181 |         bin_width = 5,
182 |         plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters',
183 |         x_label = 'Percent Methylation',
184 |         y_label = 'Percent Methylation')
185 | 
186 |     for (p in list(single_var_plot, two_var_plot)) expect_equal(dplyr::setequal(class(p), c('gg','ggplot')), expected = TRUE)
187 | })
188 | 
189 | ################################################################################
190 | # Test plot_categorical()
191 | 
192 |   test_that('Test plot_categorical() errors', {
193 |     # x is not supplied.
194 |     expect_error(
195 |         plot_categorical(annotated_regions = dm_annots),
196 |         regexp = 'argument "x" is missing')
197 | 
198 |     # x names a column that does not exist.
199 |     expect_error(
200 |         plot_categorical(annotated_regions = dm_annots, x = 'testing'),
201 |         regexp = 'column name used for x does not exist in annotated_regions')
202 | 
203 |     # fill names a column that does not exist.
204 |     expect_error(
205 |         plot_categorical(annotated_regions = dm_annots, x = 'DM_status', fill = 'testing'),
206 |         regexp = 'column name used for fill does not exist in annotated_regions')
207 | 
208 |     # x and fill name the same column.
209 |     expect_error(
210 |         plot_categorical(annotated_regions = dm_annots, x = 'DM_status', fill = 'DM_status'),
211 |         regexp = 'x cannot equal fill')
212 | 
213 |     # position is given a value outside the accepted set.
214 |     expect_error(
215 |         plot_categorical(annotated_regions = dm_annots, x = 'DM_status', fill = 'annot.type', position = 'no'),
216 |         regexp = 'position must be one of "stack", "fill"')
217 | 
218 |     # x_order is given the CpG type order although x is DM_status.
219 |     expect_warning(
220 |         plot_categorical(annotated_regions = dm_annots, x = 'DM_status', fill = 'annot.type', x_order = cpgs_order),
221 |         regexp = 'elements in col_order that are not present')
222 | 
223 |     # fill_order is given the DM status order although fill is annot.type.
224 |     expect_warning(
225 |         plot_categorical(annotated_regions = dm_annots, x = 'DM_status', fill = 'annot.type', fill_order = dm_order),
226 |         regexp = 'elements in col_order that are not present')
227 |   })
242 | 
243 | test_that('Test plot_categorical() error for random regions and non annot fill', {
244 |     # Random annotations are supplied while fill is a data column (DM_status);
245 |     # the call is expected to stop with the message matched below.
246 |     expect_error(
247 |         plot_categorical(
248 |             annotated_regions = dm_annots, annotated_random = dm_random_annots,
249 |             x = 'annot.type', fill = 'DM_status',
250 |             x_order = cpgs_order, fill_order = dm_order,
251 |             position = 'fill',
252 |             legend_title = 'Annotations',
253 |             plot_title = 'DM status by CpG Annotation Proportions',
254 |             x_label = 'DM status', y_label = 'Proportion'),
255 |         regexp = 'since data from the original regions are not transferred to the random regions')
256 | })
259 | 
260 | test_that('Test plot_categorical() success', {
261 |     # Three call shapes: x only, x with fill and labels, and with random regions.
262 |     plots = list(
263 |         minimal = plot_categorical(
264 |             annotated_regions = dm_annots,
265 |             x = 'annot.type'),
266 |         filled = plot_categorical(
267 |             annotated_regions = dm_annots,
268 |             x = 'DM_status',
269 |             fill = 'annot.type',
270 |             x_order = dm_order,
271 |             fill_order = cpgs_order,
272 |             position = 'fill',
273 |             legend_title = 'knownGene Annotations',
274 |             plot_title = 'DM status in knownGene Annots.',
275 |             x_label = 'DM status',
276 |             y_label = 'Proportion'),
277 |         with_random = plot_categorical(
278 |             annotated_regions = dm_annots,
279 |             annotated_random = dm_random_annots,
280 |             x = 'DM_status',
281 |             fill = 'annot.type',
282 |             x_order = dm_order,
283 |             fill_order = cpgs_order,
284 |             position = 'fill',
285 |             legend_title = 'Annotations',
286 |             plot_title = 'DM status by CpG Annotation Proportions',
287 |             x_label = 'DM status',
288 |             y_label = 'Proportion'))
289 | 
290 |     # Each result's class set must be exactly gg/ggplot.
291 |     for (p in plots) expect_equal(dplyr::setequal(class(p), c('gg','ggplot')), expected = TRUE)
292 | })
294 | 


--------------------------------------------------------------------------------
/vignettes/annotatr-vignette.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "`annotatr`: Making sense of genomic regions"
  3 | author: "Raymond G. Cavalcante"
  4 | date: "`r Sys.Date()`"
  5 | output:
  6 |         BiocStyle::html_document
  7 | vignette: >
  8 |         %\VignetteIndexEntry{annotatr}
  9 |         %\VignetteEngine{knitr::rmarkdown}
 10 |         %\VignetteEncoding{UTF-8}
 11 | ---
 12 | 
 13 | # Introduction
 14 | 
 15 | Genomic regions resulting from next-generation sequencing experiments and bioinformatics pipelines are more meaningful when annotated to genomic features. A SNP occurring in an exon, or an enhancer, is likely of greater interest than one occurring in an inter-genic region. It may be of interest to find that a particular transcription factor overwhelmingly binds in promoters, while another binds mostly in 3’UTRs. Hyper-methylation at promoters containing a CpG island may indicate different regulatory regimes in one condition compared to another.
 16 | 
 17 | `annotatr` provides genomic annotations and a set of functions to read, intersect, summarize, and visualize genomic regions in the context of genomic annotations.
 18 | 
 19 | # Installation
 20 | 
 21 | The release version of `annotatr` is available via [Bioconductor](http://bioconductor.org/packages/annotatr/), and can be installed as follows:
 22 | 
 23 | ```{r, eval=FALSE}
 24 | if (!requireNamespace("BiocManager", quietly=TRUE))  # install BiocManager only when it is absent
 25 |     install.packages("BiocManager")
 26 | BiocManager::install("annotatr")  # release version from Bioconductor
 27 | ```
 28 | 
 29 | The development version of `annotatr` can be obtained via the [GitHub repository](https://github.com/rcavalcante/annotatr) or [Bioconductor](https://bioconductor.org/packages/devel/bioc/html/annotatr.html). It is easiest to install development versions with the [`devtools`](https://cran.r-project.org/web/packages/devtools/index.html) package as follows:
 30 | 
 31 | ```{r, eval=FALSE}
 32 | devtools::install_github('rcavalcante/annotatr')  # development version from the GitHub repository
 33 | ```
 34 | 
 35 | Changelogs for development releases will be detailed on [GitHub releases](https://github.com/rcavalcante/annotatr/releases).
 36 | 
 37 | # Annotations
 38 | 
 39 | There are three types of annotations available to annotatr:
 40 | 
 41 | 1. Built-in annotations including CpG annotations, genic annotations, enhancers, GENCODE lncRNAs, and chromatin states from chromHMM. Base data for each of these annotations is retrieved and processed in some way. See each below for details on data source and processing.
 42 | 2. AnnotationHub annotations include any GRanges resource within the Bioconductor AnnotationHub web resource.
 43 | 3. Custom annotations provided by the user.
 44 | 
 45 | ## CpG Annotations
 46 | 
 47 | The CpG islands are the basis for all CpG annotations, and are given by the `AnnotationHub` package for the given organism. CpG shores are defined as 2Kb upstream/downstream from the ends of the CpG islands, less the CpG islands. CpG shelves are defined as another 2Kb upstream/downstream of the farthest upstream/downstream limits of the CpG shores, less the CpG islands and CpG shores. The remaining genomic regions make up the inter-CGI annotation.
 48 | 
 49 | CpG annotations are available for hg19, hg38, mm9, mm10, rn4, rn5, rn6.
 50 | 
 51 | ![Schematic of CpG annotations.](annotatr_cpgs.jpeg)
 52 | 
 53 | ## Genic Annotations
 54 | 
 55 | The genic annotations are determined by functions from `GenomicFeatures` and data from the `TxDb.*` and `org.*.eg.db` packages. Genic annotations include 1-5Kb upstream of the TSS, the promoter (< 1Kb upstream of the TSS), 5'UTR, first exons, exons, introns, CDS, 3'UTR, and intergenic regions (the intergenic regions exclude the previous list of annotations). The schematic below illustrates the relationship between the different annotations as extracted from the `TxDb.*` packages via `GenomicFeatures` functions.
 56 | 
 57 | ![Schematic of knownGene annotations.](annotatr_genes.jpeg)
 58 | 
 59 | Also included in genic annotations are intron-exon and exon-intron boundaries. These annotations cover 200bp upstream and downstream of any boundary between an exon and an intron. Note that the boundaries are defined with respect to the strand of the gene.
 60 | 
 61 | Non-intergenic gene annotations include Entrez ID and gene symbol information where it exists. The `org.*.eg.db` packages for the appropriate organisms are used to provide gene IDs and gene symbols.
 62 | 
 63 | The genic annotations have populated `tx_id`, `gene_id`, and `symbol` columns. These are, respectively, the knownGene transcript name, the Entrez Gene ID, and the gene symbol.
 64 | 
 65 | Genic annotations are available for hg19, hg38, mm9, mm10, rn4, rn5, rn6, dm3, and dm6.
 66 | 
 67 | ## FANTOM5 Permissive Enhancers
 68 | 
 69 | FANTOM5 permissive enhancers were determined from bi-directional CAGE transcription as in [Andersson et al. (2014)](http://www.nature.com/nature/journal/v507/n7493/full/nature12787.html), and are downloaded and processed for hg19 and mm9 from the [FANTOM5](http://fantom.gsc.riken.jp/5/datafiles/phase2.0/extra/Enhancers/) resource. Using the `rtracklayer::liftOver()` function, enhancers from hg19 are lifted to hg38, and mm9 to mm10.
 70 | 
 71 | ## GENCODE lncRNA transcripts
 72 | 
 73 | The long non-coding RNA (lncRNA) annotations are from [GENCODE](https://www.gencodegenes.org) for hg19, hg38, and mm10. Currently the lncRNA transcripts are used; we plan to include the lncRNA introns/exons at a later date. The lncRNA annotations have populated `tx_id`, `gene_id`, and `symbol` columns. These are, respectively, the Ensembl transcript name, the Entrez Gene ID, and the gene symbol. As per the `transcript_type` field in the GENCODE annotations, the [biotypes](https://www.gencodegenes.org/gencode_biotypes.html) are given in the `id` column.
 74 | 
 75 | ## Chromatin states from ChromHMM
 76 | 
 77 | Chromatin states determined by chromHMM ([Ernst and Kellis (2012)](http://www.nature.com/nmeth/journal/v9/n3/full/nmeth.1906.html)) in hg19 are available for nine cell lines (Gm12878, H1hesc, Hepg2, Hmec, Hsmm, Huvec, K562, Nhek, and Nhlf) via the UCSC Genome Browser tracks. Annotations for all states can be built using a shortcut like `hg19_Gm12878-chromatin`, or specific chromatin states can be accessed via codes like `hg19_chromatin_Gm12878-StrongEnhancer` or `hg19_chromatin_Gm12878-Repressed`.
 78 | 
 79 | ## `AnnotationHub` Annotations
 80 | 
 81 | The `AnnotationHub` Bioconductor package is a client for the AnnotationHub web resource. From the package description:
 82 | 
 83 | > The AnnotationHub web resource provides a central location where genomic files (e.g., VCF, bed, wig) and other resources from standard locations (e.g., UCSC, Ensembl) can be discovered. The resource includes metadata about each resource, e.g., a textual description, tags, and date of modification. The client creates and manages a local cache of files retrieved by the user, helping with quick and reproducible access.
 84 | 
 85 | Using the `build_ah_annots()` function, users can turn any resource of class `GRanges` into an annotation for use in `annotatr`. As an example, we create an annotation for H3K4me3 ChIP-seq peaks in Gm12878 cells.
 86 | 
 87 | ```{r, echo=FALSE}
 88 | suppressWarnings(suppressMessages(suppressPackageStartupMessages(library(annotatr))))  # attach annotatr with warnings, messages and startup messages suppressed
 89 | ```
 90 | 
 91 | ```{r, warning = FALSE, message = FALSE}
 92 | # Map the desired annotation name to its AnnotationHub accession code
 93 | h3k4me3_codes = c('Gm12878' = 'AH23256')
 94 | # Fetch the codes from AnnotationHub and create annotations annotatr understands
 95 | build_ah_annots(
 96 |     genome = 'hg19', ah_codes = h3k4me3_codes, annotation_class = 'H3K4me3')
 97 | # Name under which the annotation appears in annotatr_cache
 98 | ah_names = 'hg19_H3K4me3_Gm12878'
 99 | print(annotatr_cache$get(ah_names))
100 | ```
101 | 
102 | ## Custom Annotations
103 | 
104 | Users may load their own annotations from BED files using the `read_annotations()` function, which uses the `rtracklayer::import()` function. The output is a `GRanges` with `mcols()` for `id`, `tx_id`, `gene_id`, `symbol`, and `type`. If a user wants to include `tx_id`, `gene_id`, and/or `symbol` in their custom annotations they can be included as extra columns on a BED6 input file.
105 | 
106 | ```{r, warning = FALSE, message = FALSE}
107 | ## ENCODE ChIP-seq peaks for EZH2 in GM12878, shipped in inst/extdata.
108 | ## The file contains chr, start, and end columns.
109 | ezh2_file = system.file('extdata', 'Gm12878_Ezh2_peak_annotations.txt.gz', package = 'annotatr')
110 | 
111 | ## Custom annotation objects are given names of the form genome_custom_name,
112 | ## so this one is retrieved below as hg19_custom_ezh2.
113 | read_annotations(con = ezh2_file, name = 'ezh2', genome = 'hg19', format = 'bed')
114 | print(annotatr_cache$get('hg19_custom_ezh2'))
115 | ```
116 | 
117 | To see what is in the `annotatr_cache` environment, do the following:
118 | 
119 | ```{r, warning = FALSE, message = FALSE}
120 | print(annotatr_cache$list_env())  # show what is currently stored in annotatr_cache
121 | ```
122 | 
123 | # Usage
124 | 
125 | The following example is based on the results of testing for differential methylation of genomic regions between two conditions using [methylSig](https://github.com/sartorlab/methylSig). The file (`inst/extdata/IDH2mut_v_NBM_multi_data_chr9.txt.gz`) contains chromosome locations, as well as categorical and numerical data columns, and provides a good example of the flexibility of `annotatr`.
126 | 
127 | ## Reading Genomic Regions
128 | 
129 | `read_regions()` uses the `rtracklayer::import()` function to read in BED files and convert them to `GRanges` objects. The `name` and `score` columns in a normal BED file can be used for categorical and numeric data, respectively. Additionally, an arbitrary number of categorical and numeric data columns can be appended to a BED6 file. The `extraCols` parameter is used for this purpose, and the `rename_name` and `rename_score` parameters allow users to give more descriptive names to the `name` and `score` columns.
130 | 
131 | ```{r, warning = FALSE, message = FALSE}
132 | # This file in inst/extdata holds regions tested for differential
133 | # methylation between two conditions, with columns for the p-value of the
134 | # test, the methylation difference, and the two group methylation rates.
135 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
136 | # Name the extra columns in the order the package tests use (mu1 before mu0).
137 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
138 | dm_regions = read_regions(con = dm_file, genome = 'hg19', extraCols = extraCols, format = 'bed',
139 |     rename_name = 'DM_status', rename_score = 'pval')
140 | # Use fewer regions to speed things up
141 | dm_regions = dm_regions[1:2000]
142 | print(dm_regions)
143 | ```
144 | 
145 | ## Annotating Regions
146 | 
147 | Users may select annotations a la carte via the accessors listed with `builtin_annotations()`, shortcuts, or use custom annotations as described above. The `hg19_cpgs` shortcut annotates regions to CpG islands, CpG shores, CpG shelves, and inter-CGI. The `hg19_basicgenes` shortcut annotates regions to 1-5Kb, promoters, 5'UTRs, exons, introns, and 3'UTRs. Shortcuts for other `builtin_genomes()` are accessed in a similar way.
148 | 
149 | `annotate_regions()` requires a `GRanges` object (either the result of `read_regions()` or an existing object), a `GRanges` object of the `annotations`, and a logical value indicating whether to `ignore.strand` when calling `GenomicRanges::findOverlaps()`. The positive integer `minoverlap` is also passed to `GenomicRanges::findOverlaps()` and specifies the minimum overlap required for a region to be assigned to an annotation.
150 | 
151 | Before regions can be annotated, the annotations must be built with `build_annotations()`, which requires a character vector of the desired annotation codes.
152 | 
153 | ```{r, warning = FALSE, message = FALSE}
154 | # Annotation codes to intersect with the regions: shortcuts, individual
155 | # built-in codes, and the custom and AnnotationHub annotations created above
156 | annots = c(
157 |     'hg19_cpgs', 'hg19_basicgenes',
158 |     'hg19_genes_intergenic', 'hg19_genes_intronexonboundaries',
159 |     'hg19_custom_ezh2', 'hg19_H3K4me3_Gm12878')
160 | 
161 | # Build all requested annotations into a single GRanges object
162 | annotations = build_annotations(genome = 'hg19', annotations = annots)
163 | 
164 | # Intersect the regions read in earlier with the annotations;
165 | # the result is itself a GRanges object
166 | dm_annotated = annotate_regions(
167 |     regions = dm_regions, annotations = annotations,
168 |     ignore.strand = TRUE, quiet = FALSE)
169 | print(dm_annotated)
171 | ```
172 | 
173 | The `annotate_regions()` function returns a `GRanges`, but it may be more convenient to manipulate a coerced `data.frame`. For example,
174 | 
175 | ```{r, warning = FALSE, message = FALSE}
176 | # Coerce the annotated GRanges to a data.frame
177 | df_dm_annotated = data.frame(dm_annotated)
178 | 
179 | # See the GRanges column of dm_annotated expanded
180 | print(head(df_dm_annotated))
181 | 
182 | # Subset the rows by gene symbol (the annot.symbol column), in this case NOTCH1
183 | notch1_subset = subset(df_dm_annotated, annot.symbol == 'NOTCH1')
184 | print(head(notch1_subset))
185 | ```
186 | 
187 | ## Randomizing Regions
188 | 
189 | Given a set of annotated regions, it is important to know how the annotations compare to those of a randomized set of regions. The `randomize_regions()` function is a wrapper of `regioneR::randomizeRegions()` from the [`regioneR`](http://bioconductor.org/packages/release/bioc/html/regioneR.html) package that creates a set of random regions given a `GRanges` object. After the random regions are created, they must be annotated with `annotate_regions()` for later use. Only `builtin_genomes()` can be used in our wrapper function. Downstream functions that support using random region annotations are `summarize_annotations()`, `plot_annotation()`, and `plot_categorical()`.
190 | 
191 | It is important to note that if the regions to be randomized have a particular property, for example they are CpGs, the `randomize_regions()` wrapper will not preserve that property! Instead, we recommend using `regioneR::resampleRegions()` with `universe` being the superset of the data regions you want to sample from.
192 | 
193 | ```{r, warning = FALSE, message = FALSE}
194 | # Randomize the input regions (allow.overlaps and per.chromosome both on)
195 | dm_random_regions = randomize_regions(regions = dm_regions, allow.overlaps = TRUE, per.chromosome = TRUE)
196 | 
197 | # Annotate the random regions against the same annotations as the data
198 | # regions; the result is used by later functions
199 | dm_random_annotated = annotate_regions(
200 |     regions = dm_random_regions, annotations = annotations,
201 |     ignore.strand = TRUE, quiet = TRUE)
207 | ```
208 | 
209 | ## Summarizing Over Annotations
210 | 
211 | When there is no categorical or numerical information associated with the regions, `summarize_annotations()` is the only possible summarization function to use. It gives the counts of regions in each annotation type (see example below). If there is categorical and/or numerical information, then `summarize_numerical()` and/or `summarize_categorical()` may be used. Using random region annotations is only available for `summarize_annotations()`.
212 | 
213 | ```{r, warning = FALSE, message = FALSE}
214 | # Count the regions falling in each annotation type
215 | dm_annsum = summarize_annotations(annotated_regions = dm_annotated, quiet = TRUE)
216 | print(dm_annsum)
217 | 
218 | # The same counts, alongside the counts for the annotated random regions
219 | dm_annsum_rnd = summarize_annotations(
220 |     annotated_regions = dm_annotated, annotated_random = dm_random_annotated,
221 |     quiet = TRUE)
222 | print(dm_annsum_rnd)
223 | 
224 | # Summarize the diff_meth column over the regions grouped by
225 | # annotation type and annotation id
226 | dm_numsum = summarize_numerical(
227 |     annotated_regions = dm_annotated,
228 |     by = c('annot.type', 'annot.id'), over = 'diff_meth',
229 |     quiet = TRUE)
230 | print(dm_numsum)
231 | 
232 | # Count how often each DM_status classification occurs
233 | # within each annotation type
234 | dm_catsum = summarize_categorical(
235 |     annotated_regions = dm_annotated,
236 |     by = c('annot.type', 'DM_status'),
237 |     quiet = TRUE)
238 | print(dm_catsum)
244 | ```
245 | 
246 | ## Plotting
247 | 
248 | The 5 plot functions described below are to be used on the object returned by `annotate_regions()`. The plot functions return an object of type `ggplot` that can be viewed (`print`), saved (`ggsave`), or modified with additional `ggplot2` code.
249 | 
250 | ### Plotting Regions per Annotation
251 | 
252 | ```{r, fig.align='center', fig.cap='Number of DM regions per annotation.', fig.height=6, fig.width=6, fig.show = 'hold', warning = FALSE, message = FALSE}
253 | # Plot how many regions fall in each annotation type.
254 | # Useful when the regions carry no classification or
255 | # data columns.
256 | annots_order <- c(
257 |     'hg19_custom_ezh2', 'hg19_H3K4me3_Gm12878',
258 |     'hg19_genes_1to5kb', 'hg19_genes_promoters',
259 |     'hg19_genes_5UTRs', 'hg19_genes_exons',
260 |     'hg19_genes_intronexonboundaries',
261 |     'hg19_genes_introns', 'hg19_genes_3UTRs',
262 |     'hg19_genes_intergenic'
263 | )
264 | 
265 | # annotation_order receives the vector defined above.
266 | dm_vs_kg_annotations <- plot_annotation(
267 |     annotated_regions = dm_annotated,
268 |     annotation_order = annots_order,
269 |     plot_title = '# of Sites Tested for DM annotated on chr9',
270 |     x_label = 'knownGene Annotations',
271 |     y_label = 'Count'
272 | )
273 | print(dm_vs_kg_annotations)
274 | ```
275 | 
276 | The `plot_annotation()` function can also take the annotated random regions in the `annotated_random` argument to plot the number of random regions per annotation type next to the number of input data regions.
277 | 
278 | ```{r, fig.align='center', fig.cap='Number of DM regions per annotation with randomized regions.', fig.height=6, fig.width=6, fig.show = 'hold', warning = FALSE, message = FALSE}
279 | # Plot the number of regions per annotation type, and pass the
280 | # annotated random regions through annotated_random as well.
281 | annots_order <- c(
282 |     'hg19_custom_ezh2', 'hg19_H3K4me3_Gm12878',
283 |     'hg19_genes_1to5kb', 'hg19_genes_promoters',
284 |     'hg19_genes_5UTRs', 'hg19_genes_exons',
285 |     'hg19_genes_intronexonboundaries',
286 |     'hg19_genes_introns', 'hg19_genes_3UTRs',
287 |     'hg19_genes_intergenic'
288 | )
289 | 
290 | dm_vs_kg_annotations_wrandom <- plot_annotation(
291 |     annotated_regions = dm_annotated,
292 |     annotated_random = dm_random_annotated,
293 |     annotation_order = annots_order,
294 |     plot_title = 'Dist. of Sites Tested for DM (with rndm.)',
295 |     x_label = 'Annotations',
296 |     y_label = 'Count'
297 | )
298 | 
299 | print(dm_vs_kg_annotations_wrandom)
300 | ```
301 | 
302 | ### Plotting Regions Occurring in Pairs of Annotations
303 | 
304 | ```{r, fig.align='center', fig.cap='Number of DM regions per pair of annotations.', fig.height=8, fig.width=8, fig.show = 'hold', warning = FALSE, message = FALSE}
305 | # Heatmap of the regions that occur in pairs of annotation types.
306 | annots_order <- c(
307 |     'hg19_custom_ezh2', 'hg19_H3K4me3_Gm12878',
308 |     'hg19_genes_promoters', 'hg19_genes_5UTRs',
309 |     'hg19_genes_exons', 'hg19_genes_introns',
310 |     'hg19_genes_3UTRs', 'hg19_genes_intergenic'
311 | )
312 | 
313 | dm_vs_coannotations <- plot_coannotations(
314 |     annotated_regions = dm_annotated,
315 |     annotation_order = annots_order,
316 |     axes_label = 'Annotations',
317 |     plot_title = 'Regions in Pairs of Annotations'
318 | )
319 | 
320 | print(dm_vs_coannotations)
321 | ```
322 | 
323 | ### Plotting Numerical Data Over Regions
324 | 
325 | With numerical data, the `plot_numerical()` function plots a single variable (histogram) or two variables (scatterplot) at the region level, faceting over the categorical variable of choice. It is possible to include two categorical variables to facet over (see below). Note, when the plot is a histogram, the distribution over all regions is plotted within each facet.
326 | 
327 | ```{r, fig.align='center', fig.cap='Methylation Rates in Group 0 for Regions Over DM Status.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
328 | # Single numerical variable (mu0), faceted by annotation type.
329 | dm_vs_regions_annot <- plot_numerical(
330 |     annotated_regions = dm_annotated, x = 'mu0', facet = 'annot.type',
331 |     facet_order = c(
332 |         'hg19_genes_1to5kb', 'hg19_genes_promoters', 'hg19_genes_5UTRs',
333 |         'hg19_genes_3UTRs', 'hg19_custom_ezh2', 'hg19_genes_intergenic',
334 |         'hg19_cpg_islands'),
335 |     bin_width = 5,
336 |     plot_title = 'Group 0 Region Methylation In Genes',
337 |     x_label = 'Group 0')
338 | print(dm_vs_regions_annot)
339 | ```
340 | 
341 | ```{r, fig.align='center', fig.cap='Methylation Differences for Regions Over DM Status and Annotation Type.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
342 | # Plot diff_meth faceted over two variables: annotation type and
343 | # DM_status. facet_order is a list with one vector per facet.
344 | dm_vs_regions_annot2 = plot_numerical(
345 |     annotated_regions = dm_annotated, x = 'diff_meth',
346 |     facet = c('annot.type','DM_status'),
347 |     facet_order = list(c('hg19_genes_promoters','hg19_genes_5UTRs','hg19_cpg_islands'), c('hyper','hypo','none')),
348 |     bin_width = 5, plot_title = 'Region Methylation Differences In Genes',
349 |     x_label = 'Methylation Difference')
350 | print(dm_vs_regions_annot2)
351 | ```
352 | 
353 | ```{r, fig.align='center', fig.cap='Methylation Rates in Regions Over DM Status in Group 0 vs Group 1.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
354 | # Two numerical variables (mu0 vs mu1), faceted by annotation type.
355 | dm_vs_regions_name <- plot_numerical(
356 |     annotated_regions = dm_annotated,
357 |     x = 'mu0', y = 'mu1', facet = 'annot.type',
358 |     facet_order = c(
359 |         'hg19_genes_1to5kb', 'hg19_genes_promoters', 'hg19_genes_5UTRs',
360 |         'hg19_genes_3UTRs', 'hg19_custom_ezh2', 'hg19_genes_intergenic',
361 |         'hg19_cpg_islands', 'hg19_cpg_shores'),
362 |     plot_title = 'Region Methylation: Group 0 vs Group 1',
363 |     x_label = 'Group 0',
364 |     y_label = 'Group 1')
365 | print(dm_vs_regions_name)
366 | ```
367 | 
368 | The `plot_numerical_coannotations()` function shows the distribution of numerical data for regions occurring in any two annotations, as well as in one or the other annotation. For example, the code below shows CpG methylation rates for CpGs occurring in just promoters, just CpG islands, and both promoters and CpG islands.
369 | 
370 | ```{r, fig.align='center', fig.cap='Group 0 methylation Rates in Regions in promoters, CpG islands, and both.', fig.height=5, fig.width=12, fig.show='hold', warning = FALSE, message = FALSE}
371 | # Distribution of mu0 for regions in annot1, annot2, or both.
372 | dm_vs_num_co <- plot_numerical_coannotations(
373 |     annotated_regions = dm_annotated, x = 'mu0',
374 |     annot1 = 'hg19_cpg_islands',
375 |     annot2 = 'hg19_genes_promoters',
376 |     bin_width = 5,
377 |     plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters',
378 |     x_label = 'Percent Methylation')
379 | print(dm_vs_num_co)
380 | ```
381 | 
382 | ### Plotting Categorical Data
383 | 
384 | ```{r, fig.align='center', fig.cap='Differential methylation classification with counts of CpG annotations.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
385 | # Counts of CpG annotations within each DM status class.
386 | 
387 | # Order of the x-axis categories. Listing only 'hyper' and 'hypo'
388 | # also subsets the available labels (hyper, hypo, none).
389 | x_order <- c('hyper', 'hypo')
390 | 
391 | # Order of the fill categories; this vector can likewise be used
392 | # to subset the annotation types used for the fill.
393 | fill_order <- c(
394 |     'hg19_cpg_islands', 'hg19_cpg_shores',
395 |     'hg19_cpg_shelves', 'hg19_cpg_inter')
396 | 
397 | # Stacked barplot of the data classes, where each bar is
398 | # composed of the counts of CpG annotations.
399 | dm_vs_cpg_cat1 <- plot_categorical(
400 |     annotated_regions = dm_annotated,
401 |     x = 'DM_status', fill = 'annot.type',
402 |     x_order = x_order, fill_order = fill_order,
403 |     position = 'stack',
404 |     plot_title = 'DM Status by CpG Annotation Counts',
405 |     legend_title = 'Annotations',
406 |     x_label = 'DM status',
407 |     y_label = 'Count')
408 | print(dm_vs_cpg_cat1)
409 | ```
410 | 
411 | ```{r, fig.align='center', fig.cap='Differential methylation classification with proportion of CpG annotations.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
412 | # Reuses x_order and fill_order from the previous code block.
413 | # With position = 'fill', each bar of the data class is composed
414 | # of the *proportion* of CpG annotations instead of the counts.
415 | dm_vs_cpg_cat2 <- plot_categorical(
416 |     annotated_regions = dm_annotated,
417 |     x = 'DM_status', fill = 'annot.type',
418 |     x_order = x_order, fill_order = fill_order,
419 |     position = 'fill',
420 |     plot_title = 'DM Status by CpG Annotation Proportions',
421 |     legend_title = 'Annotations',
422 |     x_label = 'DM status',
423 |     y_label = 'Proportion')
424 | print(dm_vs_cpg_cat2)
425 | ```
426 | 
427 | As with `plot_annotation()`, one may pass annotations for random regions to the `annotated_random` parameter of `plot_categorical()`. The result is a Random Regions bar representing the distribution of random regions for the categorical variable used for `fill`. NOTE: Random regions can only be added when `fill = 'annot.type'`.
428 | 
429 | ```{r, fig.align='center', fig.cap='Differential methylation classification with proportion of CpG annotations and random regions.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
430 | # Supply the randomized annotations to get a "Random Regions" bar.
431 | 
432 | # Barplot of the data class where each bar is composed of the
433 | # *proportion* of CpG annotations. It includes "All" regions tested
434 | # for DM and "Random Regions" consisting of randomized regions.
435 | dm_vs_cpg_cat_random <- plot_categorical(
436 |     annotated_regions = dm_annotated,
437 |     annotated_random = dm_random_annotated,
438 |     x = 'DM_status', fill = 'annot.type',
439 |     x_order = x_order, fill_order = fill_order, position = 'fill',
440 |     plot_title = 'DM Status by CpG Annotation Proportions',
441 |     legend_title = 'Annotations',
442 |     x_label = 'DM status',
443 |     y_label = 'Proportion')
444 | print(dm_vs_cpg_cat_random)
445 | ```
446 | 
447 | ```{r, fig.align='center', fig.cap='Basic gene annotations with proportions of DM classification.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE}
448 | # Proportions of the data classes within knownGene annotations.
449 | 
450 | # Order of the annotation types along the x-axis.
451 | x_order <- c(
452 |     'hg19_custom_ezh2', 'hg19_genes_1to5kb',
453 |     'hg19_genes_promoters', 'hg19_genes_5UTRs',
454 |     'hg19_genes_exons', 'hg19_genes_introns',
455 |     'hg19_genes_3UTRs', 'hg19_genes_intergenic')
456 | 
457 | # Order of the DM status classes used for the fill.
458 | fill_order <- c('hyper', 'hypo', 'none')
459 | 
460 | # x is the annotation type; the fill is the DM status.
461 | dm_vs_kg_cat <- plot_categorical(
462 |     annotated_regions = dm_annotated,
463 |     x = 'annot.type', fill = 'DM_status',
464 |     x_order = x_order,
465 |     fill_order = fill_order,
466 |     position = 'fill',
467 |     legend_title = 'DM Status',
468 |     x_label = 'knownGene Annotations',
469 |     y_label = 'Proportion')
470 | 
471 | print(dm_vs_kg_cat)
472 | ```
473 | 


--------------------------------------------------------------------------------
/vignettes/annotatr_cpgs.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/vignettes/annotatr_cpgs.jpeg


--------------------------------------------------------------------------------
/vignettes/annotatr_genes.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/vignettes/annotatr_genes.jpeg


--------------------------------------------------------------------------------