├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── R ├── annotatr_data_doc.R ├── annotatr_package_doc.R ├── build_annotations.R ├── intersect.R ├── randomize.R ├── read.R ├── summarize.R ├── utils.R └── visualize.R ├── README.md ├── data-raw └── create_example_annotations.R ├── data └── annotations.rda ├── inst ├── CITATION ├── NEWS └── extdata │ ├── Gm12878_Ezh2_peak_annotations.txt.gz │ ├── Gm12878_Ezh2_sorted_scores.narrowPeak.gz │ ├── Gm12878_Stat3_chr2.bed.gz │ ├── IDH2mut_v_NBM_multi_data_chr9.txt.gz │ ├── K562_Cjun_peak_annotations.txt.gz │ ├── test_BED3.bed │ ├── test_BED4.bed │ ├── test_BED5.bed │ ├── test_BED6.bed │ ├── test_annotation_nooverlap.bed │ ├── test_annotations_3.bed │ ├── test_annotations_4.bed │ ├── test_annotations_5.bed │ ├── test_annotations_6.bed │ ├── test_annotations_6_gene.bed │ ├── test_annotations_6_symbol.bed │ ├── test_annotations_6_tx_gene_symbol.bed │ ├── test_annotations_minoverlap.bed │ ├── test_bedGraph.bedGraph │ ├── test_intersect.bed │ └── test_read_multiple_data_nohead.bed ├── man ├── annotate_regions.Rd ├── annotations.Rd ├── annotatr.Rd ├── annotatr_cache.Rd ├── build_ah_annots.Rd ├── build_annotations.Rd ├── build_cpg_annots.Rd ├── build_enhancer_annots.Rd ├── build_gene_annots.Rd ├── build_hmm_annots.Rd ├── build_lncrna_annots.Rd ├── builtin_annotations.Rd ├── builtin_genomes.Rd ├── check_annotations.Rd ├── expand_annotations.Rd ├── get_cellline_from_code.Rd ├── get_cellline_from_shortcut.Rd ├── get_orgdb_name.Rd ├── get_txdb_name.Rd ├── plot_annotation.Rd ├── plot_categorical.Rd ├── plot_coannotations.Rd ├── plot_numerical.Rd ├── plot_numerical_coannotations.Rd ├── randomize_regions.Rd ├── read_annotations.Rd ├── read_regions.Rd ├── reformat_hmm_codes.Rd ├── subset_order_tbl.Rd ├── summarize_annotations.Rd ├── summarize_categorical.Rd ├── summarize_numerical.Rd └── tidy_annotations.Rd ├── tests ├── testthat.R └── testthat │ ├── test_1_utils.R │ ├── test_2_read.R │ ├── 
test_3_build_annotations.R │ ├── test_4_intersect.R │ ├── test_5_randomize.R │ ├── test_6_summarize.R │ └── test_7_visualize.R └── vignettes ├── annotatr-vignette.Rmd ├── annotatr_cpgs.jpeg └── annotatr_genes.jpeg /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.git$ 2 | ^data-raw$ 3 | ^meta$ 4 | ^\.travis\.yml$ 5 | README\.md 6 | ^\.Rprofile$ 7 | ^doc$ 8 | ^Meta$ 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | meta/ 6 | inst/doc 7 | /doc/ 8 | /Meta/ 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Sample .travis.yml for R projects 2 | 3 | language: r 4 | r: 5 | - devel 6 | sudo: false 7 | cache: packages 8 | bioc_required: true 9 | 10 | warnings_are_errors: false 11 | 12 | r_github_packages: 13 | - jimhester/covr 14 | 15 | notifications: 16 | slack: sartorlab:OpT7L6aC9upo7d3PzW2yzMsh 17 | email: 18 | on_success: change 19 | on_failure: change 20 | 21 | after_success: 22 | - Rscript -e 'covr::coveralls()' 23 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: annotatr 2 | Title: Annotation of Genomic Regions to Genomic Annotations 3 | Version: 1.31.0 4 | Date: 2021-11-20 5 | Authors@R: c( 6 | person("Raymond G.", "Cavalcante", email = "rcavalca@umich.edu", role = c("aut", "cre")), 7 | person(c("Maureen A."), "Sartor", email = "sartorma@med.umich.edu", role = c("ths"))) 8 | Description: Given a set of genomic sites/regions (e.g. ChIP-seq peaks, CpGs, differentially methylated CpGs or regions, SNPs, etc.) 
it is often of interest to investigate the intersecting genomic annotations. Such annotations include those relating to gene models (promoters, 5'UTRs, exons, introns, and 3'UTRs), CpGs (CpG islands, CpG shores, CpG shelves), or regulatory sequences such as enhancers. The annotatr package provides an easy way to summarize and visualize the intersection of genomic sites/regions with genomic annotations. 9 | Depends: 10 | R (>= 3.4.0) 11 | Imports: 12 | AnnotationDbi, 13 | AnnotationHub, 14 | dplyr, 15 | GenomicFeatures, 16 | GenomicRanges, 17 | GenomeInfoDb (>= 1.10.3), 18 | ggplot2, 19 | IRanges, 20 | methods, 21 | readr, 22 | regioneR, 23 | reshape2, 24 | rtracklayer, 25 | S4Vectors (>= 0.23.10), 26 | stats, 27 | utils 28 | Suggests: 29 | BiocStyle, 30 | devtools, 31 | knitr, 32 | org.Dm.eg.db, 33 | org.Gg.eg.db, 34 | org.Hs.eg.db, 35 | org.Mm.eg.db, 36 | org.Rn.eg.db, 37 | rmarkdown, 38 | roxygen2, 39 | testthat, 40 | TxDb.Dmelanogaster.UCSC.dm3.ensGene, 41 | TxDb.Dmelanogaster.UCSC.dm6.ensGene, 42 | TxDb.Ggallus.UCSC.galGal5.refGene, 43 | TxDb.Hsapiens.UCSC.hg19.knownGene, 44 | TxDb.Hsapiens.UCSC.hg38.knownGene, 45 | TxDb.Mmusculus.UCSC.mm9.knownGene, 46 | TxDb.Mmusculus.UCSC.mm10.knownGene, 47 | TxDb.Rnorvegicus.UCSC.rn4.ensGene, 48 | TxDb.Rnorvegicus.UCSC.rn5.refGene, 49 | TxDb.Rnorvegicus.UCSC.rn6.refGene 50 | VignetteBuilder: knitr 51 | BugReports: https://www.github.com/rcavalcante/annotatr/issues 52 | License: GPL-3 53 | LazyData: true 54 | RoxygenNote: 7.1.2 55 | biocViews: Software, Annotation, GenomeAnnotation, FunctionalGenomics, Visualization 56 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(annotate_regions) 4 | export(annotatr_cache) 5 | export(build_ah_annots) 6 | export(build_annotations) 7 | export(builtin_annotations) 8 | export(builtin_genomes) 9 | 
export(expand_annotations) 10 | export(plot_annotation) 11 | export(plot_categorical) 12 | export(plot_coannotations) 13 | export(plot_numerical) 14 | export(plot_numerical_coannotations) 15 | export(randomize_regions) 16 | export(read_annotations) 17 | export(read_regions) 18 | export(subset_order_tbl) 19 | export(summarize_annotations) 20 | export(summarize_categorical) 21 | export(summarize_numerical) 22 | export(tidy_annotations) 23 | import(AnnotationDbi, except='select') 24 | import(AnnotationHub, except='query') 25 | import(GenomicFeatures) 26 | import(GenomicRanges, except=c('union','setdiff','intersect','union')) 27 | import(dplyr) 28 | import(ggplot2) 29 | import(methods) 30 | importClassesFrom(GenomeInfoDb,Seqinfo) 31 | importClassesFrom(S4Vectors,Hits) 32 | importClassesFrom(S4Vectors,Rle) 33 | importFrom(GenomeInfoDb,seqlengths) 34 | importFrom(GenomeInfoDb,seqnames) 35 | importFrom(IRanges,IRanges) 36 | importFrom(S4Vectors,endoapply) 37 | importFrom(S4Vectors,splitAsList) 38 | importFrom(readr,read_tsv) 39 | importFrom(regioneR,randomizeRegions) 40 | importFrom(reshape2,melt) 41 | importFrom(rtracklayer,import) 42 | importFrom(rtracklayer,import.bed) 43 | importFrom(stats,as.formula) 44 | importFrom(utils,combn) 45 | importFrom(utils,data) 46 | -------------------------------------------------------------------------------- /R/annotatr_data_doc.R: -------------------------------------------------------------------------------- 1 | #' example_annotations data 2 | #' 3 | #' A \code{GRanges} of precomputed annotations for CpG features. Created by doing 4 | #' \code{build_annotations(genome='hg19', annotations = 'hg19_cpgs')}. 
5 | #' 6 | #' @format A \code{GRanges} object with the CpG feature annotations for hg19 7 | #' and containing \code{mcols}: 8 | #' \describe{ 9 | #' \item{id}{The internal ID for the annotation} 10 | #' \item{tx_id}{All NA, since these are not associated with tx_ids} 11 | #' \item{gene_id}{All NA, since there are not associated Entrez IDs} 12 | #' \item{symbols}{All NA, since there are not associated gene symbols} 13 | #' \item{type}{A character indicating the type of annotation. Including: 14 | #' 'hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves', and 'hg19_cpg_inter'.} 15 | #' } 16 | #' @source The AnnotationHub resource for hg19 CpG features. 17 | "annotations" 18 | -------------------------------------------------------------------------------- /R/annotatr_package_doc.R: -------------------------------------------------------------------------------- 1 | #' annotatr: Annotation of Genomic Regions to Functional Annotations 2 | #' 3 | #' Given a set of genomic sites/regions (e.g. ChIP-seq peaks, CpGs, differentially methylated CpGs or regions, SNPs, etc.) it is often of interest to investigate the intersecting functional annotations. Such annotations include those relating to gene models (promoters, 5'UTRs, exons, introns, and 3'UTRs), CpGs (CpG islands, CpG shores, CpG shelves), the non-coding genome, and enhancers. The annotatr package provides an easy way to summarize and visualize the intersection of genomic sites/regions with the above functional annotations. 
4 | #' 5 | #' @docType package 6 | #' @name annotatr 7 | #' 8 | #' @rawNamespace import(AnnotationDbi, except='select') 9 | #' @rawNamespace import(AnnotationHub, except='query') 10 | #' @import dplyr 11 | #' @import ggplot2 12 | #' @import GenomicFeatures 13 | #' @rawNamespace import(GenomicRanges, except=c('union','setdiff','intersect','union')) 14 | #' @importClassesFrom GenomeInfoDb Seqinfo 15 | #' @importFrom GenomeInfoDb seqnames seqlengths 16 | #' @importFrom IRanges IRanges 17 | #' @importFrom S4Vectors endoapply 18 | #' @importFrom S4Vectors splitAsList 19 | #' @import methods 20 | #' @importFrom readr read_tsv 21 | #' @importFrom reshape2 melt 22 | #' @importFrom regioneR randomizeRegions 23 | #' @importFrom rtracklayer import import.bed 24 | #' @importClassesFrom S4Vectors Hits Rle 25 | #' @importFrom stats as.formula 26 | #' @importFrom utils combn data 27 | NULL 28 | -------------------------------------------------------------------------------- /R/intersect.R: -------------------------------------------------------------------------------- 1 | #' A function to intersect user region data with annotation data 2 | #' 3 | #' Annotate genomic regions to selected genomic annotations while preserving the data associated with the genomic regions. 4 | #' 5 | #' @param regions The GRanges object returned by \code{read_regions()}. 6 | #' @param annotations A character vector of annotations to build. Valid annotation codes are listed with \code{builtin_annotations()}. The "basicgenes" shortcut builds the following regions: 1-5Kb upstream of TSSs, promoters, 5UTRs, exons, introns, and 3UTRs. The "cpgs" shortcut builds the following regions: CpG islands, shores, shelves, and interCGI regions. NOTE: Shortcuts need to be appended by the genome, e.g. \code{hg19_basicgenes}. 7 | #' Custom annotations whose names are of the form \code{[genome]_custom_[name]} should also be included. 
#' A function to intersect user region data with annotation data
#'
#' Annotate genomic regions to selected genomic annotations while preserving the data associated with the genomic regions.
#'
#' @param regions The \code{GRanges} object returned by \code{read_regions()}.
#' @param annotations A \code{GRanges} of annotations, e.g. the result of \code{build_annotations()}. Custom annotations should be read in and converted to \code{GRanges} with \code{read_annotations()}.
#' @param minoverlap A scalar, positive integer, indicating the minimum required overlap of regions with annotations.
#' @param ignore.strand Logical passed to \code{findOverlaps()}: \code{TRUE} ignores strand when computing overlaps. Default TRUE.
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A \code{GRanges} with one element per region/annotation overlap. The ranges and \code{mcols} come from \code{regions}, and an additional \code{annot} column holds the overlapping annotation \code{GRanges}. An error is raised when no annotation intersects the regions.
#'
#' @examples
#'    r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
#'    extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
#'    r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage')
#'
#'    # Get premade CpG annotations
#'    data('annotations', package = 'annotatr')
#'
#'    a = annotate_regions(
#'        regions = r,
#'        annotations = annotations,
#'        ignore.strand = TRUE)
#'
#' @export
annotate_regions = function(regions, annotations, minoverlap = 1L, ignore.strand = TRUE, quiet = FALSE) {
    # Validate inputs. is() is used instead of comparing class(x)[1] so that
    # objects whose class extends GRanges are accepted as well.
    if(!methods::is(regions, 'GRanges')) {
        stop('Error in annotate_regions(...): regions object is not GRanges.')
    }

    if(!methods::is(annotations, 'GRanges')) {
        stop('Error in annotate_regions(...): annotations object is not GRanges. Use build_annotations(...) to construct the annotations before calling annotate_regions(...).')
    }

    # Perform the intersections
    if(!quiet) {
        message('Annotating...')
    }

    intersections = GenomicRanges::findOverlaps(regions, annotations, minoverlap = minoverlap, ignore.strand = ignore.strand)

    # Nothing to annotate: fail loudly rather than return an empty result
    if(length(intersections) == 0) {
        stop('No annotations intersect the regions.')
    }

    # One output row per hit: the region, with its matching annotation attached
    gr = regions[S4Vectors::queryHits(intersections)]
    GenomicRanges::mcols(gr)$annot = annotations[S4Vectors::subjectHits(intersections)]

    return(gr)
}
#' Randomize Regions
#'
#' \code{randomize_regions} is a wrapper function for \code{regioneR::randomizeRegions()} that simplifies the creation of randomized regions for an input set of regions read with \code{read_regions()}. It uses the \code{genome} of \code{regions} to look up chromosome lengths with \code{GenomeInfoDb::Seqinfo()} and builds the \code{genome} table passed to \code{regioneR::randomizeRegions()} from them.
#'
#' NOTE: The data associated with the input \code{regions} are not passed on to the random regions.
#'
#' @param regions A \code{GRanges} object from \code{read_regions}. It must carry exactly one non-NA genome.
#' @param allow.overlaps A logical stating whether random regions can overlap input regions (TRUE) or not (FALSE). Default TRUE.
#' @param per.chromosome A logical stating whether the random regions should remain on the same chromosome (TRUE) or not (FALSE). Default TRUE.
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A sorted \code{GRanges} object of randomized regions based on \code{regions} from \code{read_regions()}. NOTE: Data associated with the original regions is not attached to the randomized regions.
#'
#' @examples
#'    # Create random region set based on ENCODE ChIP-seq data
#'    file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
#'    r = read_regions(con = file, genome = 'hg19')
#'
#'    random_r = randomize_regions(regions = r)
#'
#' @export
randomize_regions = function(regions, allow.overlaps = TRUE, per.chromosome = TRUE, quiet = FALSE) {

    ########################################################################
    # Argument parsing and error handling
    # is() also accepts objects whose class extends GRanges
    if(!methods::is(regions, 'GRanges')) {
        stop('Error: regions must have class GRanges. The best way to ensure this is to pass the result of read_regions() into this function.')
    }

    # Get the genome from the regions
    genome = unique(GenomeInfoDb::genome(regions))

    # unique() can yield zero values (no seqlevels) or several (mixed or
    # partially missing genomes). Guard the length before is.na() so that if()
    # always receives a single logical.
    if(length(genome) != 1 || is.na(genome)) {
        stop('Error: regions GRanges object must have a valid genome to randomize its regions.')
    }

    # NOTE(review): Seqinfo(genome = ...) presumably needs network access to
    # resolve the chromosome lengths -- confirm.
    chr_lengths = GenomeInfoDb::seqlengths(GenomeInfoDb::Seqinfo(genome = genome))

    # Whole-chromosome table (chr, start, end) handed to regioneR as the genome
    df_genome = data.frame(
        'chr' = names(chr_lengths),
        'start' = rep.int(1, length(chr_lengths)),
        'end' = as.numeric(chr_lengths),
        stringsAsFactors = FALSE)

    if(!quiet) {
        message('Randomizing regions...')
    }

    # Randomize the regions
    randomized = regioneR::randomizeRegions(A = regions, genome = df_genome,
        per.chromosome = per.chromosome, allow.overlaps = allow.overlaps)

    # Sort the randomized
    randomized = sort(randomized)

    return(randomized)
}
This function can automatically deal with BEDX files from BED3 to BED6. For BED6+Y, the \code{extraCols} argument should be used to correctly interpret the extra columns. 4 | #' 5 | #' NOTE: The \code{name} (4th) and \code{score} (5th) columns are so named. If these columns have a particular meaning for your data, they should be renamed with the \code{rename_name} and/or \code{rename_score} parameters. 6 | #' 7 | #' @param con A path, URL, connection or BEDFile object. See \code{rtracklayer::import()} documentation. 8 | #' @param genome From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available. 9 | #' @param format From \code{rtracklayer::import()}: The format of the output. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'. 10 | #' @param extraCols From \code{rtracklayer::import()}: A character vector in the same form as 'colClasses' from 'read.table'. It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats. 11 | #' @param rename_name A string to rename the name column of the BED file. For example, if the name column actually contains a categorical variable. 12 | #' @param rename_score A string to rename the score column of the BED file. For example, if the score column represents a quantity about the data besides the score in the BED specification. 13 | #' @param ... Parameters to pass onto the format-specific method of \code{rtracklayer::import()}. 
#' Read genomic regions in BEDX+Y format
#'
#' \code{read_regions()} reads genomic regions by calling the \code{rtracklayer::import()} function. This function can automatically deal with BEDX files from BED3 to BED6. For BED6+Y, the \code{extraCols} argument should be used to correctly interpret the extra columns.
#'
#' NOTE: The \code{name} (4th) and \code{score} (5th) columns are so named. If these columns have a particular meaning for your data, they should be renamed with the \code{rename_name} and/or \code{rename_score} parameters.
#'
#' @param con A path, URL, connection or BEDFile object. See \code{rtracklayer::import()} documentation.
#' @param genome From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'.
#' @param format From \code{rtracklayer::import()}: The format of the file. Only forwarded to \code{rtracklayer::import()} when supplied; otherwise the importer derives it itself.
#' @param extraCols From \code{rtracklayer::import()}: A character vector in the same form as 'colClasses' from 'read.table', giving the name and class of each extra column to read from the BED file.
#' @param rename_name A string to rename the name column of the BED file. For example, if the name column actually contains a categorical variable.
#' @param rename_score A string to rename the score column of the BED file. For example, if the score column represents a quantity about the data besides the score in the BED specification.
#' @param ... Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.
#'
#' @return A \code{GRanges} object. A warning is issued when a rename is requested for a column that is not present.
#'
#' @examples
#'
#'    # Example of reading a BED6+3 file where the last 3 columns are non-standard
#'    file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
#'    extraCols = c(diff_meth = 'numeric', mu0 = 'numeric', mu1 = 'numeric')
#'    gr = read_regions(con = file, genome = 'hg19', extraCols = extraCols, format = 'bed',
#'        rename_name = 'DM_status', rename_score = 'pval')
#'
#' @export
read_regions = function(con, genome = NA, format, extraCols = character(), rename_name, rename_score, ...) {

    # Forward format only when the caller actually supplied it
    gr = if(missing(format)) {
        rtracklayer::import(con = con, genome = genome, extraCols = extraCols, ...)
    } else {
        rtracklayer::import(con = con, genome = genome, format = format, extraCols = extraCols, ...)
    }

    # Optionally relabel the BED 'name' column
    if(!missing(rename_name)) {
        col_labels = colnames(GenomicRanges::mcols(gr))
        is_name = col_labels == 'name'
        if(any(is_name)) {
            col_labels[is_name] = rename_name
            colnames(GenomicRanges::mcols(gr)) = col_labels
        } else {
            warning('Ignoring rename_name parameter because con has no name column.')
        }
    }

    # Optionally relabel the BED 'score' column (checked after the name rename)
    if(!missing(rename_score)) {
        col_labels = colnames(GenomicRanges::mcols(gr))
        is_score = col_labels == 'score'
        if(any(is_score)) {
            col_labels[is_score] = rename_score
            colnames(GenomicRanges::mcols(gr)) = col_labels
        } else {
            warning('Ignoring rename_score parameter because con has no score column.')
        }
    }

    return(gr)
}
56 | #' 57 | #' @param con A path, URL, connection or BEDFile object. See \code{rtracklayer::import.bed()} documentation. 58 | #' @param name A string for the name of the annotations to be used in the name of the object, [genome]_custom_[name] 59 | #' @param genome From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available. 60 | #' @param format From \code{rtracklayer::import()}: The format of the output. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'. 61 | #' @param extraCols From \code{rtracklayer::import.bed()}: A character vector in the same form as 'colClasses' from 'read.table'. It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats. 62 | #' @param ... Parameters to pass onto the format-specific method of \code{rtracklayer::import()}. 63 | #' 64 | #' @return A \code{GRanges} object stored in \code{annotatr_cache}. To view a custom annotation, do \code{annotatr_cache$get(name)}. To add a custom annotation to the set of annotations, include \code{'[genome]_custom_[name]'} in the call to \code{build_annotations()}. See example below. 
#' Read custom annotations
#'
#' \code{read_annotations()} is a wrapper for the \code{rtracklayer::import()} function that creates a \code{GRanges} object matching the structure of annotations built with \code{build_annotations()}. The structure is defined by \code{GRanges}, with the \code{mcols()} with names \code{c('id','tx_id','gene_id','symbol','type')}.
#'
#' @param con A path, URL, connection or BEDFile object. See \code{rtracklayer::import.bed()} documentation.
#' @param name A string for the name of the annotations to be used in the name of the object, [genome]_custom_[name]. Defaults to \code{'annotations'} when missing.
#' @param genome From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. When NA, the literal \code{'genome'} is used in the object name.
#' @param format From \code{rtracklayer::import()}: The format of the file. Only forwarded to \code{rtracklayer::import()} when supplied.
#' @param extraCols From \code{rtracklayer::import.bed()}: A character vector in the same form as 'colClasses' from 'read.table'. Columns named \code{gene_id}, \code{symbol} or \code{tx_id} are kept; any of the three that is not named here is filled with \code{NA}.
#' @param ... Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.
#'
#' @return Called for its side effect: the \code{GRanges} object is stored in \code{annotatr_cache} under \code{'[genome]_custom_[name]'}. To view a custom annotation, do \code{annotatr_cache$get('[genome]_custom_[name]')}. To add a custom annotation to the set of annotations, include \code{'[genome]_custom_[name]'} in the call to \code{build_annotations()}. See example below.
#'
#' @examples
#'
#'    # Read in a BED3 file as a custom annotation
#'    file = system.file('extdata', 'test_annotations_3.bed', package='annotatr')
#'    read_annotations(con = file, name = 'test', genome = 'hg19')
#'    build_annotations(genome = 'hg19', annotations = 'hg19_custom_test')
#'
#'    print(annotatr_cache$get('hg19_custom_test'))
#'
#' @export
read_annotations = function(con, name, genome = NA, format, extraCols = character(), ...) {

    # Fall back to generic labels when the caller gives no name / genome
    if(missing(name)) {
        name = 'annotations'
    }
    genome_name = if(is.na(genome)) 'genome' else genome

    # Forward format only when the caller actually supplied it
    gr = if(missing(format)) {
        rtracklayer::import(con = con, genome = genome, extraCols = extraCols, ...)
    } else {
        rtracklayer::import(con = con, genome = genome, format = format, extraCols = extraCols, ...)
    }

    # Any protected column the user did not declare in extraCols is filled with NA
    absent_cols = base::setdiff(c('gene_id','symbol','tx_id'), names(extraCols))
    for(absent in absent_cols) {
        GenomicRanges::mcols(gr)[[absent]] = NA
    }

    # The object name doubles as the annotation type
    annot_type = sprintf('%s_custom_%s', genome_name, name)

    GenomicRanges::mcols(gr)$id = paste0(name,':',seq_along(gr))
    GenomicRanges::mcols(gr)$type = annot_type

    # Make sure only the desired mcols make it out
    keep_cols = c('id','tx_id','gene_id','symbol','type')
    GenomicRanges::mcols(gr) = GenomicRanges::mcols(gr)[,keep_cols]

    ########################################################
    # Write the object named [genome]_custom_[name] to the annotatr_cache
    annotatr_cache$set(annot_type, gr)
}
-------------------------------------------------------------------------------- /R/summarize.R: -------------------------------------------------------------------------------- 1 | #' Summarize annotation counts 2 | #' 3 | #' Given a \code{GRanges} of annotated regions, count the number of regions in each annotation type. If \code{annotated_random} is not \code{NULL}, then the same is computed for the random regions. 4 | #' 5 | #' If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both. 6 | #' 7 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}. 8 | #' @param annotated_random The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. 9 | #' @param quiet Print progress messages (FALSE) or not (TRUE). 10 | #' 11 | #' @return A \code{tbl_df} of the number of regions per annotation type. 
#' Summarize annotation counts
#'
#' Given a \code{GRanges} of annotated regions, count the number of regions in each annotation type. If \code{annotated_random} is supplied, then the same is computed for the random regions.
#'
#' If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both.
#'
#' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
#' @param annotated_random The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. Optional.
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A \code{tbl_df} of the number of regions per annotation type. When \code{annotated_random} is given, counts are additionally split by a \code{data_type} column with values \code{'Data'} and \code{'Random Regions'}.
#'
#' @examples
#'    ### An example of ChIP-seq peaks with signalValue
#'
#'    # Get premade CpG annotations
#'    data('annotations', package = 'annotatr')
#'
#'    file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
#'    r = read_regions(con = file, genome = 'hg19')
#'
#'    a = annotate_regions(
#'        regions = r,
#'        annotations = annotations,
#'        ignore.strand = TRUE,
#'        quiet = FALSE)
#'
#'    rnd = randomize_regions(regions = r)
#'
#'    rnd_annots = annotate_regions(
#'        regions = rnd,
#'        annotations = annotations,
#'        ignore.strand = TRUE,
#'        quiet = FALSE)
#'
#'    # Summarize the annotated regions without randomized regions
#'    s = summarize_annotations(annotated_regions = a)
#'
#'    # Summarize the annotated regions with randomized regions
#'    s_rnd = summarize_annotations(
#'        annotated_regions = a,
#'        annotated_random = rnd_annots)
#'
#' @export
summarize_annotations = function(annotated_regions, annotated_random, quiet = FALSE) {
    # Flatten a GRanges to a data.frame and keep one row per
    # (region, annotation type), so repeated hits of one type count once.
    tidy_unique = function(gr) {
        flat = as.data.frame(gr, row.names = NULL)
        dplyr::distinct(
            dplyr::ungroup(flat),
            across(c('seqnames', 'start', 'end', 'annot.type')), .keep_all=TRUE)
    }

    data_tbl = tidy_unique(annotated_regions)

    # Data only: tally per annotation type and finish
    if(missing(annotated_random)) {
        if(!quiet) {
            message('Counting annotation types')
        }

        return(dplyr::tally(
            dplyr::group_by(data_tbl, across(c('annot.type')))
        ))
    }

    # Data plus random regions: stack both and tally per (data_type, type)
    random_tbl = tidy_unique(annotated_random)

    if(!quiet) {
        message('Counting annotation types in data and random regions')
    }

    stacked = dplyr::bind_rows('Data' = data_tbl, 'Random Regions' = random_tbl, .id = 'data_type')

    return(dplyr::tally(
        dplyr::group_by(stacked, across(c('data_type', 'annot.type')))
    ))
}
#' Summarize numerical data over groupings of annotated regions
#'
#' Given a \code{GRanges} of annotated regions, summarize numerical data columns based on a grouping.
#'
#' NOTE: We do not take the distinct values of \code{seqnames}, \code{start}, \code{end}, \code{annot.type} as in the other \code{summarize_*()} functions because in the case of a region that intersected two distinct exons, using \code{distinct()} would destroy the information of the mean of the numerical column over one of the exons, which is not desirable.
#'
#' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
#' @param by A character vector of the columns of \code{as.data.frame(annotated_regions)} to group over. Default is \code{c('annot.type', 'annot.id')}.
#' @param over A character vector of the numerical columns in \code{as.data.frame(annotated_regions)} to \code{count}, take the \code{mean}, and take the \code{sd} over after grouping according to the \code{by} column. Required. NOTE: If more than one value is used, the naming scheme for the resulting \code{dplyr::tbl} summary columns are \code{COLNAME_n}, \code{COLNAME_mean}, \code{COLNAME_sd}. If \code{over} has length one, then the column names are \code{n}, \code{mean}, \code{sd}.
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A grouped \code{dplyr::tbl_df}, and the \code{count}, \code{mean}, and \code{sd} of the \code{over} columns \code{by} the groupings.
#'
#' @examples
#'    ### Test on a very simple bed file to demonstrate different options
#'
#'    # Get premade CpG annotations
#'    data('annotations', package = 'annotatr')
#'
#'    r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
#'    extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
#'    r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage')
#'
#'    a = annotate_regions(
#'        regions = r,
#'        annotations = annotations,
#'        ignore.strand = TRUE)
#'
#'    # Testing over normal by
#'    sn1 = summarize_numerical(
#'        annotated_regions = a,
#'        by = c('annot.type', 'annot.id'),
#'        over = c('coverage', 'mu1', 'mu0'),
#'        quiet = FALSE)
#'
#'    # Testing over a different by
#'    sn2 = summarize_numerical(
#'        annotated_regions = a,
#'        by = c('diff_exp'),
#'        over = c('coverage', 'mu1', 'mu0'))
#'
#' @export
summarize_numerical = function(annotated_regions, by = c('annot.type', 'annot.id'), over, quiet = FALSE) {
    # Fail fast, before doing any conversion work
    if(missing(over)) {
        stop("Error: over cannot be missing.")
    }

    # Tidy the GRanges into a data.frame for use with dplyr functions
    annotated_regions = as.data.frame(annotated_regions, row.names = NULL)

    if(!quiet) {
        message(sprintf('Grouping regions by %s, and summarizing numerical data over %s',
            paste(by, collapse=' & '), paste(over, collapse=' & ')))
    }

    # all_of() marks `by` as an external character vector, so a data column
    # that happens to be called "by" can never be selected by mistake.
    grouped = dplyr::group_by(annotated_regions, dplyr::across(dplyr::all_of(by)))

    # A named list replaces the deprecated dplyr::funs(); the names n/mean/sd
    # keep the output columns identical (n, mean, sd or COLNAME_n, ...).
    agg = dplyr::summarize_at(
        grouped,
        over,
        list(n = ~ dplyr::n(), mean = mean, sd = stats::sd))

    return(agg)
}
#'
#' @examples
#'
#' # Get premade CpG annotations
#' data('annotations', package = 'annotatr')
#'
#' r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr')
#' extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character')
#' r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage')
#'
#' a = annotate_regions(
#'     regions = r,
#'     annotations = annotations,
#'     ignore.strand = TRUE)
#'
#' sc = summarize_categorical(
#'     annotated_regions = a,
#'     by = c('annot.type', 'name'),
#'     quiet = FALSE)
#'
#' @export
summarize_categorical = function(annotated_regions, by = c('annot.type', 'annot.id'), quiet = FALSE) {
    # Tidy the GRanges into a data.frame for use with dplyr functions
    annotated_regions = as.data.frame(annotated_regions, row.names = NULL)

    ########################################################################
    # If a region has multiple annotation types that are the same, count only one
    # from each type of annotation.
    # all_of() makes the selection treat `by` strictly as column names, so a data
    # column literally named "by" cannot be picked up in its place.
    annotated_regions = dplyr::distinct(
        dplyr::ungroup(annotated_regions),
        dplyr::across(dplyr::all_of(c('seqnames', 'start', 'end', by))), .keep_all=TRUE)

    if(!quiet) {
        message(sprintf('Grouping regions by %s, and tallying',
            paste(by, collapse=' & ')))
    }

    # One row per combination of the `by` columns, with the count in column n
    agg = dplyr::tally(
        dplyr::group_by(annotated_regions, dplyr::across(dplyr::all_of(by))))

    return(agg)
}

--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
### Constants
# TxDb.* family of packages
TXDBS = c(
    'TxDb.Dmelanogaster.UCSC.dm3.ensGene',
    'TxDb.Dmelanogaster.UCSC.dm6.ensGene',
    'TxDb.Ggallus.UCSC.galGal5.refGene',
    'TxDb.Hsapiens.UCSC.hg19.knownGene',
    'TxDb.Hsapiens.UCSC.hg38.knownGene',
    'TxDb.Mmusculus.UCSC.mm9.knownGene',
    'TxDb.Mmusculus.UCSC.mm10.knownGene',
    'TxDb.Rnorvegicus.UCSC.rn4.ensGene',
    'TxDb.Rnorvegicus.UCSC.rn5.refGene',
    'TxDb.Rnorvegicus.UCSC.rn6.refGene')

# org.* family of packages: maps each supported genome to its organism code
ORGDBS = data.frame(
    genome = c('dm3','dm6','galGal5','hg19','hg38','mm9','mm10','rn4','rn5','rn6'),
    org = c('Dm','Dm','Gg','Hs','Hs','Mm','Mm','Rn','Rn','Rn'),
    stringsAsFactors = FALSE)

# Cell lines with chromHMM annotations
HMMCELLLINES = c('Gm12878','H1hesc','Hepg2','Hmec','Hsmm','Huvec','K562','Nhek','Nhlf')

# chromHMM state codes, in the form <number>_<Name>[_<Name>]
HMMCODES = c('1_Active_Promoter', '2_Weak_Promoter', '3_Poised_Promoter', '4_Strong_Enhancer', '5_Strong_Enhancer', '6_Weak_Enhancer', '7_Weak_Enhancer', '8_Insulator', '9_Txn_Transition', '10_Txn_Elongation', '11_Weak_Txn', '12_Repressed', '13_Heterochrom/lo', '14_Repetitive/CNV')

#' Function to recode classes from chromHMM type column
#'
#' @param hmm_codes in the original form from UCSC Genome Browser track.
#'
#' @return A character vector of chromHMM classes with numbers and underscores removed.
reformat_hmm_codes = function(hmm_codes) {
    # vapply() guarantees a character vector even for zero-length input, where
    # sapply() would hand back an empty list
    new_codes = vapply(hmm_codes,
        function(hmm){
            # Drop the leading number token and glue the remaining tokens together
            paste(unlist(strsplit(hmm,'_'))[-1],collapse='')
        },
        character(1),
        USE.NAMES=FALSE)
    return(new_codes)
}

#' Function to return cell line from chromatin annotation shortcut
#'
#' @param shortcut The annotation shortcut, used in \code{build_annotations()}.
#'
#' @return A string of the cell line used in a chromatin annotation shortcut
get_cellline_from_shortcut = function(shortcut) {
    # The second '_'-delimited token is split on '-' and its first piece returned
    return(unlist(strsplit(unlist(strsplit(shortcut,'_'))[2], '-'))[1])
}

#' Function to return cell line from chromatin annotation code
#'
#' @param code The annotation code, used in \code{build_annotations()}.
#'
#' @return A string of the cell line used in a chromatin annotation code
get_cellline_from_code = function(code) {
    # The third '_'-delimited token is split on '-' and its first piece returned
    return(unlist(strsplit(unlist(strsplit(code,'_'))[3], '-'))[1])
}

#' Function listing which annotations are available.
#'
#' This includes the shortcuts. The \code{expand_annotations()} function helps
#' handle the shortcuts.
#'
#' @return A character vector of available annotations.
#'
#' @examples
#' builtin_annotations()
#'
#' @export
builtin_annotations = function() {
    # Paste together every row of the cartesian product of the inputs
    # (the first input varies fastest, as with expand.grid())
    cross_paste = function(..., sep) {
        apply(expand.grid(..., stringsAsFactors = FALSE), 1, paste, collapse = sep)
    }

    # Gene codes
    gene_genomes = annotatr::builtin_genomes()
    gene_ends = c('1to5kb', 'promoters', 'cds', '5UTRs', 'exons', 'firstexons', 'introns', 'intronexonboundaries', 'exonintronboundaries', '3UTRs', 'intergenic')

    # CpG codes: every supported genome except the dm3/dm6 assemblies
    cpg_genomes = base::setdiff(gene_genomes, c('dm3','dm6'))
    cpg_ends = c('islands', 'shores', 'shelves', 'inter')

    # Chromatin state codes
    # Remove numbers, and underscores, and take unique
    chromatin_recode = unique(reformat_hmm_codes(HMMCODES))
    chromatin_ends = cross_paste(HMMCELLLINES, chromatin_recode, sep = '-')
    chromatin_shortcut_ends = cross_paste(HMMCELLLINES, 'chromatin', sep = '-')

    # Create full annotation codes
    gene_codes = cross_paste(gene_genomes, 'genes', gene_ends, sep = '_')
    cpg_codes = cross_paste(cpg_genomes, 'cpg', cpg_ends, sep = '_')
    chromatin_codes = cross_paste('hg19', 'chromatin', chromatin_ends, sep = '_')

    enhancer_codes = c('hg19_enhancers_fantom','hg38_enhancers_fantom','mm9_enhancers_fantom','mm10_enhancers_fantom')
    lncrna_codes = c('hg19_lncrna_gencode','hg38_lncrna_gencode','mm10_lncrna_gencode')

    # Shortcut codes
    gene_shortcut_codes = cross_paste(gene_genomes, 'basicgenes', sep = '_')
    cpg_shortcut_codes = cross_paste(cpg_genomes, 'cpgs', sep = '_')
    chromatin_shortcut_codes = paste('hg19', chromatin_shortcut_ends, sep='_')

    # Create the big vector of supported annotations
    annots = c(gene_codes, cpg_codes, chromatin_codes, enhancer_codes, lncrna_codes,
        gene_shortcut_codes, cpg_shortcut_codes, chromatin_shortcut_codes)

    return(annots)
}

#' Function returning supported TxDb.* genomes
#'
#' @return A character vector of genomes for supported TxDb.* packages
#'
#' @examples
#' builtin_genomes()
#'
#' @export
builtin_genomes = function() {
    return(ORGDBS$genome)
}

#' Function to get correct TxDb.* package name based on genome
#'
#' @param genome A string giving the genome assembly.
#'
#' @return A string giving the name of the correct TxDb.* package name based on \code{genome}.
get_txdb_name = function(genome = annotatr::builtin_genomes()) {
    # Ensure valid arguments
    genome = match.arg(genome)

    # Match the genome as a whole dot-delimited token of the package name.
    # A fixed (non-regex) match keeps one assembly name from matching inside
    # another and stops regex metacharacters from being interpreted.
    db = grep(paste0('.', genome, '.'), TXDBS, value = TRUE, fixed = TRUE)

    return(db)
}

#' Function to get correct org.* package name based on genome
#'
#' @param genome A string giving the genome assembly.
#'
#' @return A string giving the correct org for org.db packages. e.g. hg19 -> Hs.
get_orgdb_name = function(genome = annotatr::builtin_genomes()) {
    # Restrict genome to one of the supported assemblies
    genome = match.arg(genome)

    # Pick the organism code recorded for that assembly
    is_genome = ORGDBS$genome == genome
    return(ORGDBS[is_genome, 'org'])
}

#' Function to tidy up annotation accessors for visualization
#'
#' @param annotations A character vector of annotation codes, in the order they should appear in the visualization.
#'
#' @return A list whose names are the display-ready labels and whose values are the original annotation codes.
#' @export
tidy_annotations = function(annotations) {
    # Translate a single <genome>_<class>_<type> code into its display label
    tidy_one = function(annot) {
        parts = unlist(strsplit(annot,'_'))
        class = parts[2]
        type = parts[3]

        if(class == 'cpg') {
            if(type == 'inter') {
                return('interCGI')
            }
            return(paste('CpG', type))
        }

        if(class == 'genes') {
            if(type == 'firstexons') {
                return('first exons')
            }
            if(type == 'intronexonboundaries') {
                return('intron/exon boundaries')
            }
            if(type == 'exonintronboundaries') {
                return('exon/intron boundaries')
            }
            return(type)
        }

        if(class == 'enhancers') {
            return('enhancers')
        }
        # Chromatin and custom codes are shown by their type token alone
        if(class == 'chromatin' || class == 'custom') {
            return(type)
        }
        if(class == 'lncrna') {
            return('GENCODE lncRNA')
        }

        # Anything unrecognized: show class and type together
        return(sprintf('%s %s', class, type))
    }

    labels = sapply(annotations, tidy_one)

    # Flip the mapping: the label becomes the name, the code the value
    codes_by_label = names(labels)
    names(codes_by_label) = labels

    return(as.list(codes_by_label))
}

#' Function to check for valid annotations
#'
#' Gives errors if any annotations are not in builtin_annotations() (and they are not in the required custom format), basicgenes are used, or the genome prefixes are not the same for all annotations.
#'
#' @param annotations A character vector of annotations possibly using the shortcuts
#' @return If all the checks on the annotations pass, returns NULL to allow code to move forward.
check_annotations = function(annotations) {
    # Custom annotations are not part of the builtin list, so set them aside
    custom = grep('custom', annotations, value = TRUE)
    builtin = base::setdiff(annotations, custom)

    # Anything left over must be a builtin annotation; report all offenders at once
    unsupported = base::setdiff(builtin, annotatr::builtin_annotations())
    if( length(unsupported) > 0 ) {
        stop(sprintf('Error: "%s" is(are) not supported. See builtin_annotations().',
            paste(unsupported, collapse=', ')))
    }

    # The genome check runs over custom and builtin annotations together, so it
    # still works when only custom annotations were given
    genomes = sapply(c(custom, builtin), function(a){
        unlist(strsplit(a, '_'))[1]
    }, USE.NAMES = FALSE)

    # Every annotation must carry the same genome prefix
    if( length(unique(genomes)) != 1 ){
        stop('Error: genome prefix on all annotations must be the same.')
    }

    return(NULL)
}

#' Function to expand annotation shortcuts
#'
#' @param annotations A character vector of annotations, possibly using the shortcut accessors
#'
#' @return A vector of data accession-ized names that are ordered from upstream to downstream in the case of knownGenes and islands to interCGI in the case of cpgs.
#' @export
expand_annotations = function(annotations) {
    # Flag which entries are shortcut accessors
    is_basicgenes = grepl('basicgenes', annotations)
    is_cpgs = grepl('cpgs', annotations)
    is_hmms = grepl('-chromatin', annotations)
    is_shortcut = is_basicgenes | is_cpgs | is_hmms

    # Nothing to expand: hand the input back untouched
    if(!any(is_shortcut)) {
        return(annotations)
    }

    # expand_annotations() is expected to run after check_annotations(), so the
    # genome prefix should be the same for all annotations.
    genome = unique( sapply(annotations, function(a){ unlist(strsplit(a, '_'))[1] }, USE.NAMES = FALSE) )

    # Build the concrete annotation codes each shortcut stands for, based on the
    # genome. Order: cpgs, then basicgenes, then chromatin states.
    new_annotations = c()
    if(any(is_cpgs)) {
        new_annotations = paste(genome, 'cpg', c('islands','shores','shelves','inter'), sep='_')
    }
    if(any(is_basicgenes)) {
        new_annotations = c(new_annotations, paste(genome, 'genes', c('1to5kb','promoters','5UTRs','exons','introns','3UTRs'), sep='_'))
    }
    if(any(is_hmms)) {
        # Could conceivably use shortcuts for multiple cell lines
        cell_lines = sapply(annotations[is_hmms], get_cellline_from_shortcut, USE.NAMES = FALSE)

        new_hmm_codes = apply(
            expand.grid(cell_lines, unique(reformat_hmm_codes(HMMCODES)), stringsAsFactors = FALSE),
            1, paste, collapse='-')

        new_annotations = c(new_annotations,
            paste(genome, 'chromatin', new_hmm_codes, sep='_'))
    }

    # Keep the non-shortcut inputs, append the expansions, and drop the shortcuts
    # themselves (setdiff() also removes duplicates)
    annotations = base::setdiff(c(annotations, new_annotations), annotations[is_shortcut])

    return(annotations)
}

#' Function to subset a tbl_df or grouped_df by a column
#'
#' @param tbl A \code{tbl_df} or \code{grouped_df}.
#' @param col A string indicating which column of \code{tbl} to subset and order. If \code{NULL}, \code{tbl} is returned unchanged.
#' @param col_order A character vector indicating the order of \code{col}. If \code{NULL}, the order of first appearance in \code{tbl} is used.
#'
#' @return A modified version of \code{tbl}, with rows restricted to the values of \code{col} in \code{col_order}, and \code{col} converted to a factor whose levels follow \code{col_order}.
#' @export
subset_order_tbl = function(tbl, col, col_order) {
    if(!is.null(col)) {
        # Collect all types in the column
        all_col_names = unique(tbl[[col]])

        # Inherit col_order from the order in tbl
        if(is.null(col_order)) {
            col_order = all_col_names
        }

        # Check set equality of col in tbl and the col_order
        # (base setequal() is what dplyr::setequal() falls back to for vectors)
        if( !setequal(all_col_names, col_order) ) {
            if( !all(col_order %in% all_col_names) ) {
                # Intersect col_order with unique(tbl[[col]]) to deal with possible 0 tallies
                col_order = intersect(col_order, all_col_names)
                warning('There are elements in col_order that are not present in the corresponding column. Check for typos, or this could be a result of 0 tallies.')
            }

            # Keep only the requested values. This now also runs after the
            # intersect above: otherwise rows holding values outside col_order
            # would survive and turn into NA factor levels below.
            # Plain indexing replaces subset(), whose non-standard evaluation
            # would pick up data columns named 'tbl', 'col' or 'col_order'.
            tbl = tbl[tbl[[col]] %in% col_order, , drop = FALSE]
        }

        # Convert col to factor with levels in the correct order
        tbl[[col]] = factor(tbl[[col]], levels = col_order)
        # Also convert the levels to tidy names if col is the annotation type
        if(col == 'annot.type') {
            levels(tbl[[col]]) = tidy_annotations(col_order)
        }
    }
    return(tbl)
}

--------------------------------------------------------------------------------
/R/visualize.R:
--------------------------------------------------------------------------------
#' Plot the number of regions per annotation
#'
#' Given a \code{GRanges} of annotated regions, plot the number of regions with the corresponding genomic annotations used in \code{annotation_order}.
If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once in the corresponding bar plot. For example, if a region were annotated to multiple exons, it would only count once toward the exon bar in the plot, but if it were annotated to an exon and an intron, it would count towards both. 4 | #' 5 | #' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}. 6 | #' @param annotated_random The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. 7 | #' @param annotation_order A character vector which doubles as the subset of annotations desired for the plot as well as the ordering. If \code{NULL}, all annotations are displayed. 8 | #' @param plot_title A string used for the title of the plot. If missing, no title is displayed. 9 | #' @param x_label A string used for the x-axis label. If missing, no x-axis label is displayed. 10 | #' @param y_label A string used for the y-axis label. If missing, no y-axis label is displayed. 11 | #' @param quiet Print progress messages (FALSE) or not (TRUE). 12 | #' 13 | #' @return A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited. 
#'
#' @examples
#' ########################################################################
#' # An example of ChIP-seq peaks with signalValue used for score
#'
#' # Get premade CpG annotations
#' data('annotations', package = 'annotatr')
#'
#' chip_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr')
#' chip_regions = read_regions(con = chip_bed, genome = 'hg19')
#'
#' chip_rnd = randomize_regions(regions = chip_regions)
#'
#' chip_annots = annotate_regions(
#'     regions = chip_regions,
#'     annotations = annotations,
#'     ignore.strand = TRUE)
#'
#' chip_rnd_annots = annotate_regions(
#'     regions = chip_rnd,
#'     annotations = annotations,
#'     ignore.strand = TRUE)
#'
#' annots_order = c(
#'     'hg19_cpg_islands',
#'     'hg19_cpg_shores')
#'
#' p_annots = plot_annotation(annotated_regions = chip_annots,
#'     annotation_order = annots_order)
#' p_annots_rnd = plot_annotation(annotated_regions = chip_annots,
#'     annotated_random = chip_rnd_annots, annotation_order = annots_order)
#'
#' @export
plot_annotation = function(annotated_regions, annotated_random, annotation_order = NULL,
    plot_title, x_label, y_label, quiet = FALSE) {

    # Preparation shared by the observed and the randomized regions:
    # GRanges -> data.frame, order/subset the annotation types, then keep a
    # single row per (region, annot.type) so that a region annotated to several
    # annotations of one type is counted once for that type.
    prepare_regions = function(gr) {
        df = as.data.frame(gr, row.names = NULL)
        df = subset_order_tbl(tbl = df, col='annot.type', col_order=annotation_order)
        dplyr::distinct(
            dplyr::ungroup(df),
            across(c('seqnames', 'start', 'end', 'annot.type')), .keep_all=TRUE)
    }

    with_random = !missing(annotated_random)

    annotated_regions = prepare_regions(annotated_regions)

    if(with_random) {
        annotated_random = prepare_regions(annotated_random)

        # Stack both tables, recording the origin of each row in data_type
        annotated_regions = dplyr::bind_rows("Data" = annotated_regions, "Random Regions" = annotated_random, .id = 'data_type')
    }

    ########################################################################
    # Construct the plot

    base_plot = ggplot(annotated_regions, aes_string(x='annot.type'))

    if(with_random) {
        # Side-by-side bars for the data and the random regions, in grey scale
        plot = base_plot +
            geom_bar(aes_string(fill = 'data_type'), position='dodge') +
            theme_bw() +
            scale_fill_grey() +
            theme(axis.text.x = element_text(angle = 30, hjust = 1),
                legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = 'white'))
    } else {
        # A single bar per annotation type
        plot = base_plot +
            geom_bar() +
            theme_bw() +
            theme(axis.text.x = element_text(angle = 30, hjust = 1),
                legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = 'white'))
    }

    # Apply only the labels the caller supplied; otherwise the ggplot defaults stay
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}

#' Plot pair-wise annotations across regions
#'
#' All co-occurring annotations associated with a region are computed and displayed as a heatmap.
#'
#' As with \code{plot_annotation()}, the number in each cell is the number of unique regions annotated to the pair of annotations.
#'
#' For example, if a region is annotated to both a CpG shore and to two different exons simultaneously, the region will only be counted once in the CpG shore / exon cell. NOTE, this same region will count once in both the CpG shore and exon cells on the diagonal.
#'
#' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
#' @param annotation_order A character vector which doubles as the subset of annotations desired for plot as well as the ordering. If \code{NULL}, all annotations are displayed.
#' @param plot_title A string used for the title of the plot. If missing, no plot title label is displayed.
#' @param axes_label A string used for the axis labels. If missing, corresponding variable name used.
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited.
#'
#' @examples
#' # Get premade CpG annotations
#' data('annotations', package = 'annotatr')
#'
#' dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
#' extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
#' dm_regions = read_regions(con = dm_file, extraCols = extraCols,
#'     rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
#' dm_regions = dm_regions[1:1000]
#'
#' dm_annots = annotate_regions(
#'     regions = dm_regions,
#'     annotations = annotations,
#'     ignore.strand = TRUE)
#'
#' all_order = c(
#'     'hg19_cpg_islands',
#'     'hg19_cpg_shores',
#'     'hg19_cpg_shelves',
#'     'hg19_cpg_inter')
#'
#' dm_vs_ca = plot_coannotations(
#'     annotated_regions = dm_annots,
#'     annotation_order = all_order,
#'     axes_label = 'Annotations',
#'     plot_title = 'Co-occurrence of Annotations')
#'
#' @export
plot_coannotations = function(annotated_regions, annotation_order = NULL,
    plot_title, axes_label, quiet = FALSE) {

    # GRanges -> data.frame, then order and subset the annotation types
    regions_df = as.data.frame(annotated_regions, row.names = NULL)
    regions_df = subset_order_tbl(tbl = regions_df, col='annot.type', col_order=annotation_order)

    ########################################################################
    # Find the co-annotations

    # For every region, enumerate all ordered pairs (Var1, Var2) of the
    # annotation types it carries, including each type paired with itself
    by_region = dplyr::group_by(regions_df, across(c('seqnames', 'start', 'end')))
    pairs_by_region = dplyr::do(
        by_region,
        expand.grid(.$annot.type, .$annot.type, stringsAsFactors = FALSE))

    # A region contributes at most once to any given pair
    pairs_by_region = dplyr::distinct(
        dplyr::ungroup(pairs_by_region),
        across(c('seqnames', 'start', 'end', 'Var1', 'Var2')), .keep_all=TRUE)

    # Cross-tabulate the pairs and reshape to long form (Var1, Var2, Counts)
    pair_counts = table(pairs_by_region[['Var1']], pairs_by_region[['Var2']])
    pair_counts_long = reshape2::melt(pair_counts, value.name = 'Counts')

    ########################################################################
    # Construct the plot: a heatmap with the count printed in each cell
    plot = ggplot(pair_counts_long, aes_string('Var1', 'Var2')) +
        geom_raster(aes_string(fill = 'Counts')) +
        geom_text(aes_string(label = 'Counts')) +
        scale_fill_gradient(low = "white", high = "steelblue") +
        theme(axis.text.x = element_text(angle = 30, hjust = 1), axis.text.y = element_text(angle = 30, hjust = 1))

    # Apply only the labels the caller supplied; otherwise the ggplot defaults stay
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(axes_label)) {
        # The same label is used for both axes
        plot = plot + xlab(axes_label)
        plot = plot + ylab(axes_label)
    }

    return(plot)
}

#' Plot numerical data over regions or regions summarized over annotations
#'
#' This function produces either histograms over \code{facet}, or x-y scatterplots over \code{facet}. In the case of histograms over facets, the All distribution (hollow histogram with red outline) is the distribution of \code{x} over all the regions in the data. The facet specific distributions (solid gray) are the distribution of \code{x} over the regions in each facet. For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution, but will count once each in the CpG island and promoter facet distributions.
#'
#' @param annotated_regions A \code{GRanges} returned from \code{annotate_regions()}.
If the data is not summarized, the data is at the region level. If it is summarized, it represents the average or standard deviation of the regions by the character vector used for \code{by} in \code{summarize_numerical()}. 216 | #' @param x A string indicating the column of the \code{GRanges} to use for the x-axis. 217 | #' @param y A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted. 218 | #' @param facet A string, or character vector of two strings, indicating which categorical variable(s) in the \code{GRanges} to make \code{ggplot2} facets over. When two facets are given, the first entry is the vertical facet and the second entry is the horizontal facet. Default is \code{annot.type}. 219 | #' @param facet_order A character vector, or list of character vectors if \code{facet} has length 2, which gives the order of the facets, and can be used to subset the column in the \code{GRanges} used for the \code{facet}. For example, if \code{facet = 'annot.type'}, then the annotations may be subsetted to just CpG annotations. Default is \code{NULL}, meaning all annotations in their default order are used. 220 | #' @param bin_width An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is missing. 221 | #' @param plot_title A string used for the title of the plot. If missing, no title is displayed. 222 | #' @param x_label A string used for the x-axis label. If missing, no x-axis label is displayed. 223 | #' @param y_label A string used for the y-axis label. If missing, no y-axis label is displayed. 224 | #' @param legend_facet_label A string used to label the gray bar portion of the legend. Defaults to "x in facet". 225 | #' @param legend_cum_label A string used to label the red outline portion of the legend. Defaults to "All x".
226 | #' @param quiet Print progress messages (FALSE) or not (TRUE). 227 | #' 228 | #' @return A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}. 229 | #' 230 | #' @examples 231 | #' # An example with multi-columned data 232 | #' 233 | #' # Get premade CpG annotations 234 | #' data('annotations', package = 'annotatr') 235 | #' 236 | #' dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 237 | #' extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 238 | #' dm_regions = read_regions(con = dm_file, extraCols = extraCols, 239 | #' rename_score = 'pval', rename_name = 'DM_status', format = 'bed') 240 | #' dm_regions = dm_regions[1:1000] 241 | #' 242 | #' # Annotate the regions 243 | #' dm_annots = annotate_regions( 244 | #' regions = dm_regions, 245 | #' annotations = annotations, 246 | #' ignore.strand = TRUE) 247 | #' 248 | #' # Plot histogram of group 1 methylation rates across the CpG annotations. 249 | #' # NOTE: Overall distribution (everything in \code{facet_order}) 250 | #' # is plotted in each facet for comparison. 
251 | #' dm_vs_regions_mu1 = plot_numerical( 252 | #' annotated_regions = dm_annots, 253 | #' x = 'mu1', 254 | #' facet = 'annot.type', 255 | #' facet_order = c('hg19_cpg_islands','hg19_cpg_shores', 256 | #' 'hg19_cpg_shelves','hg19_cpg_inter'), 257 | #' bin_width = 5, 258 | #' plot_title = 'Group 1 Methylation over CpG Annotations', 259 | #' x_label = 'Group 1 Methylation') 260 | #' 261 | #' # Plot histogram of group 1 methylation rates across the CpG annotations 262 | #' # crossed with DM_status 263 | #' dm_vs_regions_diffmeth = plot_numerical( 264 | #' annotated_regions = dm_annots, 265 | #' x = 'diff_meth', 266 | #' facet = c('annot.type','DM_status'), 267 | #' facet_order = list( 268 | #' c('hg19_genes_promoters','hg19_genes_5UTRs','hg19_cpg_islands'), 269 | #' c('hyper','hypo','none')), 270 | #' bin_width = 5, 271 | #' plot_title = 'Group 0 Region Methylation In Genes', 272 | #' x_label = 'Methylation Difference') 273 | #' 274 | #' # Can also use the result of annotate_regions() to plot two numerical 275 | #' # data columns against each other for each region, and facet by annotations. 276 | #' dm_vs_regions_annot = plot_numerical( 277 | #' annotated_regions = dm_annots, 278 | #' x = 'mu0', 279 | #' y = 'mu1', 280 | #' facet = 'annot.type', 281 | #' facet_order = c('hg19_cpg_islands','hg19_cpg_shores', 282 | #' 'hg19_cpg_shelves','hg19_cpg_inter'), 283 | #' plot_title = 'Region Methylation: Group 0 vs Group 1', 284 | #' x_label = 'Group 0', 285 | #' y_label = 'Group 1') 286 | #' 287 | #' # Another example, but using differential methylation status as the facets. 
#' dm_vs_regions_name = plot_numerical(
#'     annotated_regions = dm_annots,
#'     x = 'mu0',
#'     y = 'mu1',
#'     facet = 'DM_status',
#'     facet_order = c('hyper','hypo','none'),
#'     plot_title = 'Region Methylation: Group 0 vs Group 1',
#'     x_label = 'Group 0',
#'     y_label = 'Group 1')
#'
#' @export
plot_numerical = function(annotated_regions, x, y, facet, facet_order, bin_width=10,
    plot_title, x_label, y_label, legend_facet_label, legend_cum_label, quiet = FALSE) {

    # NOTE: `quiet` is part of the public signature but is not consulted
    # anywhere in this function; it is kept so existing callers keep working.

    ########################################################################
    # Argument checks

    # facet names either one column (facet_wrap(~ a)) or two (facet_wrap(a ~ b)).
    # Anything else would build a malformed formula further down.
    if(!(length(facet) %in% c(1, 2))) {
        stop('facet must be a character vector of length one or two.')
    }
    two_facets = length(facet) == 2

    # With two facet variables, facet_order must carry one ordering per
    # variable, i.e. be a list with exactly two elements.
    if(two_facets && (!is(facet_order, 'list') || length(facet_order) != 2)) {
        stop('When facet is of length two, facet_order must be a list giving the order for each facet variable.')
    }

    # Deal with facet formula
    if(two_facets) {
        facet_formula = paste(facet[1], "~", facet[2])
    } else {
        facet_formula = paste("~", facet)
    }

    # Tidy the GRanges into a data.frame for use with dplyr functions
    tbl = as.data.frame(annotated_regions, row.names = NULL)

    ########################################################################
    # Order and subset the annotations
    if(two_facets) {
        sub_tbl = subset_order_tbl(tbl = tbl, col = facet[1], col_order = facet_order[[1]])
        sub_tbl = subset_order_tbl(tbl = sub_tbl, col = facet[2], col_order = facet_order[[2]])
    } else {
        sub_tbl = subset_order_tbl(tbl = tbl, col = facet, col_order = facet_order)
    }

    ########################################################################
    # Create data objects for plots

    # Per-facet layer: one row per (seqnames, start, end, annot.type).
    # NOTE(review): the key is annot.type even when `facet` names a different
    # column; confirm that is the intended de-duplication for such facets.
    facet_data = dplyr::distinct(dplyr::ungroup(sub_tbl), across(c('seqnames', 'start', 'end', 'annot.type')), .keep_all=TRUE)

    # Overall layer: one row per region, with the facet column(s) removed so
    # that facet_wrap() repeats this layer in every panel.
    # all_of() drops exactly the named columns. The previous matches() call
    # interpreted the facet name as a case-insensitive regular expression and
    # could therefore also drop unrelated columns (possibly `x` itself).
    all_data = dplyr::distinct(
        dplyr::select(dplyr::ungroup(tbl), -dplyr::all_of(facet)),
        across(c('seqnames', 'start', 'end')), .keep_all=TRUE)

    ########################################################################
    # Construct the plot
    # Note, data must be dplyr::ungroup()-ed before hand for the proper
    # display of the overall distribution.

    if(missing(y)) {
        # Histogram mode: build the legend labels unless the caller gave them
        if(missing(legend_facet_label)) {
            if(two_facets) {
                legend_facet_label = sprintf('%s in %s x %s', x, facet[1], facet[2])
            } else {
                legend_facet_label = sprintf('%s in %s', x, facet)
            }
        }
        if(missing(legend_cum_label)) {
            legend_cum_label = sprintf('All %s', x)
        }
        # Overall histogram is unfilled (NA) with a red outline; facet
        # histogram is filled gray.
        fill_man = c(NA, 'gray')
        names(fill_man) = c(legend_cum_label, legend_facet_label)

        # Make the base histogram ggplot
        plot =
            # Facet hists are plotted with distinct (seqnames, start, end, annot.type) combinations
            ggplot(
                data = facet_data,
                aes_string(x=x, y='..density..')) +
            geom_histogram(binwidth=bin_width, aes(fill = legend_facet_label)) +
            facet_wrap( stats::as.formula(facet_formula) ) + # Over the facets
            # All hist is plotted with distinct (seqnames, start, end) combinations
            geom_histogram(
                data = all_data,
                binwidth=bin_width, aes(fill = legend_cum_label, color = 'red')) + # All the data
            theme_bw() +
            scale_fill_manual(values = fill_man) +
            guides(color = 'none') +
            theme(legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = c('red','white')))
    } else {
        # Scatter mode: x against y for each region, split over the facets
        plot = ggplot(facet_data, aes_string(x=x, y=y)) +
            geom_point(alpha = 1/8, size = 1) +
            facet_wrap( stats::as.formula(facet_formula) ) +
            theme_bw()
    }

    # Add any user defined labels to the plot if they were supplied;
    # otherwise ggplot() will use its defaults
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}

#' Plot numerical data occurring in pairs of annotations
#'
#' Plot numerical data associated with regions occurring in \code{annot1}, \code{annot2} and in both. As with \code{plot_numerical()}, the result is a plot of histograms or x-y scatterplots.
#'
#' For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution and once in the CpG island / promoter facet distribution. However, a CpG associated only with a promoter will count once in the All distribution and once in the promoter / promoter distribution.
#'
#' @param annotated_regions A \code{GRanges} returned from \code{annotate_regions()}.
#' @param x A string indicating the column of the \code{GRanges} to use for the x-axis.
#' @param y A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.
#' @param annot1 A string indicating the first annotation type.
#' @param annot2 A string indicating the second annotation type.
#' @param bin_width An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is missing.
#' @param plot_title A string used for the title of the plot. If missing, no title is displayed.
#' @param x_label A string used for the x-axis label.
#'   If missing, the default ggplot2 axis label is used.
#' @param y_label A string used for the y-axis label. If missing, the default ggplot2 axis label is used.
#' @param legend_facet_label A string used to label the gray bar portion of the legend. Defaults to "x in annot pair".
#' @param legend_cum_label A string used to label the red outline portion of the legend. Defaults to "All x".
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
#'
#' @examples
#'    # Get premade CpG annotations
#'    data('annotations', package = 'annotatr')
#'
#'    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
#'    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
#'    dm_regions = read_regions(con = dm_file, extraCols = extraCols,
#'        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
#'    dm_regions = dm_regions[1:1000]
#'
#'    dm_annots = annotate_regions(
#'        regions = dm_regions,
#'        annotations = annotations,
#'        ignore.strand = TRUE)
#'
#'    dm_vs_num_co = plot_numerical_coannotations(
#'        annotated_regions = dm_annots,
#'        x = 'mu0',
#'        annot1 = 'hg19_cpg_islands',
#'        annot2 = 'hg19_cpg_shelves',
#'        bin_width = 5,
#'        plot_title = 'Group 0 Perc. Meth. in CpG Islands and Shelves',
#'        x_label = 'Percent Methylation')
#'
#' @export
plot_numerical_coannotations = function(annotated_regions, x, y, annot1, annot2, bin_width=10,
    plot_title, x_label, y_label, legend_facet_label, legend_cum_label, quiet = FALSE) {

    # NOTE: `quiet` is part of the public signature but is not consulted
    # anywhere in this function; it is kept so existing callers keep working.

    # Tidy the GRanges into a data.frame for use with dplyr functions
    tbl = as.data.frame(annotated_regions, row.names = NULL)

    ########################################################################
    # Order and subset the annotations
    annotation_order = c(annot1,annot2)
    sub_tbl = subset_order_tbl(tbl = tbl, col='annot.type', col_order=annotation_order)

    ########################################################################
    # Find the co-annotations

    # Use combn instead of expand.grid because we do not want regions annotated to
    # a CpG island and a promoter having their data value count in the island / island
    # facet as well as the promoter / promoter facet. We want it *ONLY* in the
    # island / promoter facet. Note, sorting ensures island / promoter and promoter / island
    # are aggregated
    pairs_by_region = dplyr::do(
        dplyr::group_by(sub_tbl, across(c('seqnames', 'start', 'end'))),
        if(nrow(.) == 1) {
            # A region with a single annotation row is paired with itself,
            # producing the (type, type) facet.
            as.data.frame(
                t(
                    utils::combn(
                        rep.int(as.character(.$annot.type), 2)
                    , 2))
            , stringsAsFactors = FALSE)
        } else {
            # Otherwise take every unordered pair of the region's annotation
            # types; the result has columns V1 and V2.
            as.data.frame(
                t(
                    utils::combn(
                        sort(as.character(.$annot.type))
                    , 2))
            , stringsAsFactors = FALSE)
        }
    )

    # Join on the data chromosome locations. Each pair row is matched to every
    # sub_tbl row of the same region, so the join result contains repeats.
    pairs_by_region = dplyr::inner_join(x = pairs_by_region, y = sub_tbl, by = c('seqnames','start','end'))

    ########################################################################
    # Create data objects for plots

    # Per-facet layer: one row per (region, V1, V2) after removing the
    # repeats introduced by the join above.
    facet_data = dplyr::distinct(dplyr::ungroup(pairs_by_region),
        across(c('seqnames', 'start', 'end', 'V1', 'V2')), .keep_all=TRUE)
    # Overall layer: one row per region. It has no V1 / V2 columns, so
    # facet_wrap() repeats it in every panel.
    all_data = dplyr::distinct(dplyr::ungroup(tbl), across(c('seqnames', 'start', 'end')), .keep_all=TRUE)

    ########################################################################
    # Construct the plot
    # Note, data must be dplyr::ungroup()-ed before hand for the proper
    # display of the overall distribution.

    if(missing(y)) {
        # Histogram mode: build the legend labels unless the caller gave them
        if(missing(legend_facet_label)) {
            legend_facet_label = sprintf('%s in %s', x, 'annot pair')
        }
        if(missing(legend_cum_label)) {
            legend_cum_label = sprintf('All %s', x)
        }
        # Overall histogram is unfilled (NA) with a red outline; facet
        # histogram is filled gray.
        fill_man = c(NA, 'gray')
        names(fill_man) = c(legend_cum_label, legend_facet_label)

        # Make the base histogram ggplot
        plot =
            # Facet hists are plotted with distinct (seqnames, start, end, annot1, annot2) combinations
            ggplot(
                data = facet_data,
                aes_string(x=x, y='..density..')) +
            geom_histogram(binwidth=bin_width, aes(fill = legend_facet_label)) +
            facet_wrap( V1 ~ V2 ) + # Over the facets
            # All hist is plotted with distinct (seqnames, start, end) combinations
            geom_histogram(
                data = all_data,
                binwidth=bin_width, aes(fill = legend_cum_label, color = 'red')) + # All the data
            theme_bw() +
            scale_fill_manual(values = fill_man) +
            guides(color = 'none') +
            theme(legend.title=element_blank(), legend.position="bottom", legend.key = element_rect(color = c('red','white')))
    } else {
        # Scatter mode. Plot the de-duplicated, ungrouped facet_data (as
        # plot_numerical() does) rather than the raw join result, so each
        # region is drawn once per annotation pair instead of being
        # over-plotted at alpha = 1/8.
        plot = ggplot(facet_data, aes_string(x=x, y=y)) +
            geom_point(alpha = 1/8, size = 1) +
            facet_wrap( V1 ~ V2 ) +
            theme_bw()
    }

    # Add any user defined labels to the plot if they were supplied;
    # otherwise ggplot() will use its defaults
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}

#' Plot a categorical data variable over another
#'
#' Given a \code{GRanges} of annotated regions from \code{annotate_regions()}, visualize the distribution of categorical data \code{fill} in categorical data \code{x}.
#' A bar representing the distribution of all \code{fill} in \code{x} will be added according to the contents of \code{fill}. This is the distribution over all values of \code{x}. Additionally, when \code{annotated_random} is not missing, a "Random Regions" bar shows the distribution of random regions over \code{fill}.
#'
#' For example, if a differentially methylated region has the categorical label hyper, and is annotated to a promoter, a 5UTR, two exons, and an intron, then each annotation will appear in the All bar once. Likewise for the hyper bar if the differential methylation status is chosen as \code{x} with \code{annot.type} chosen as \code{fill}.
#'
#' @param annotated_regions The \code{GRanges} result of \code{annotate_regions()}.
#' @param annotated_random The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. Random regions can only be used with \code{fill == 'annot.type'}.
#' @param x One of 'annot.type' or a categorical data column, indicating whether annotation classes or data classes will appear on the x-axis.
#' @param fill One of 'annot.type', a categorical data column, or \code{NULL}, indicating whether annotation classes or data classes will fill the bars. If \code{NULL} then the bars will be the total counts of the x classes.
#' @param x_order A character vector that subsets and orders the x classes. Default \code{NULL}, uses existing values.
#' @param fill_order A character vector that subsets and orders the fill classes. Default \code{NULL}, uses existing values.
#' @param position A string passed to \code{ggplot2::geom_bar(..., position)}. Must be one of 'stack', 'fill', or 'dodge'. Default 'stack'.
#' @param plot_title A string used for the title of the plot. If missing, no title is displayed.
#' @param legend_title A string used for the legend title to describe fills (if fill is not \code{NULL}). Default displays corresponding variable name.
#' @param x_label A string used for the x-axis label. If missing, corresponding variable name used.
#' @param y_label A string used for the y-axis label. If missing, corresponding variable name used.
#' @param quiet Print progress messages (FALSE) or not (TRUE).
#'
#' @return A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}.
#'
#' @examples
#'    # Get premade CpG annotations
#'    data('annotations', package = 'annotatr')
#'
#'    dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
#'    extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
#'    dm_regions = read_regions(con = dm_file, extraCols = extraCols, genome = 'hg19',
#'        rename_score = 'pval', rename_name = 'DM_status', format = 'bed')
#'    dm_regions = dm_regions[1:1000]
#'
#'    dm_annots = annotate_regions(
#'        regions = dm_regions,
#'        annotations = annotations,
#'        ignore.strand = TRUE)
#'
#'    dm_order = c(
#'        'hyper',
#'        'hypo')
#'    cpg_order = c(
#'        'hg19_cpg_islands',
#'        'hg19_cpg_shores',
#'        'hg19_cpg_shelves',
#'        'hg19_cpg_inter')
#'
#'    dm_vn = plot_categorical(
#'        annotated_regions = dm_annots,
#'        x = 'DM_status',
#'        fill = 'annot.type',
#'        x_order = dm_order,
#'        fill_order = cpg_order,
#'        position = 'fill',
#'        legend_title = 'CpG Annotations',
#'        x_label = 'DM status',
#'        y_label = 'Proportion')
#'
#'    # Create randomized regions
#'    dm_rnd_regions = randomize_regions(regions = dm_regions)
#'    dm_rnd_annots = annotate_regions(
#'        regions = dm_rnd_regions,
#'        annotations = annotations,
#'        ignore.strand = TRUE)
#'
#'    dm_vn_rnd = plot_categorical(
#'        annotated_regions = dm_annots,
#'        annotated_random = dm_rnd_annots,
#'        x = 'DM_status',
#'        fill = 'annot.type',
#'        x_order = dm_order,
#'        fill_order = cpg_order,
#'        position = 'fill',
#'        legend_title = 'CpG Annotations',
#'        x_label = 'DM status',
#'        y_label = 'Proportion')
#'
#' @export
plot_categorical = function(annotated_regions, annotated_random, x, fill=NULL, x_order=NULL, fill_order=NULL,
    position = 'stack', plot_title, legend_title, x_label, y_label, quiet = FALSE) {

    # NOTE: `quiet` is part of the public signature but is not consulted
    # anywhere in this function; it is kept so existing callers keep working.

    ########################################################################
    # Argument parsing and error handling

    # Record this once, before annotated_random is reassigned further down.
    has_random = !missing(annotated_random)

    # Tidy the GRanges into a data.frame for use with dplyr functions
    annotated_regions = as.data.frame(annotated_regions, row.names = NULL)

    # Ensure the value of x is a column name in annotated_regions
    if( !(x %in% colnames(annotated_regions)) ) {
        stop('The column name used for x does not exist in annotated_regions.')
    }

    # Ensure the value of fill is a column name in annotated_regions if it
    # isn't NULL. Also ensure fill != x
    if( !is.null(fill) ) {
        if( !(fill %in% colnames(annotated_regions)) ) {
            stop('The column name used for fill does not exist in annotated_regions.')
        }
        if( x == fill ) {
            stop('Error: x cannot equal fill')
        }
    }

    # Random regions can only be used when fill == 'annot.type', because the
    # data from the original regions is not transferred to the random ones.
    # fill defaults to NULL, and NULL != 'annot.type' is a zero-length logical
    # that cannot be used as a condition, so NULL is tested explicitly to make
    # sure the intended message is raised.
    if(has_random && (is.null(fill) || fill != 'annot.type')) {
        stop('Error: Random regions can only be used in plot_categorical() when fill == "annot.type" since data from the original regions are not transferred to the random regions.')
    }

    # Check valid position argument
    if(!(position %in% c('stack', 'fill', 'dodge'))) {
        stop('Error: position must be one of "stack", "fill", or "dodge"')
    }

    ########################################################################
    # Order and subset based on fill_order
    annotated_regions = subset_order_tbl(tbl = annotated_regions, col = fill, col_order = fill_order)

    # Take the distinct (x, fill) combinations per unique data region.
    # The key columns are passed through all_of() so the *values* of x and
    # fill are always used, even if the data has columns literally named
    # "x" or "fill". A NULL fill simply drops out of c().
    region_keys = c('seqnames', 'start', 'end', x, fill)
    annotated_regions = dplyr::distinct(dplyr::ungroup(annotated_regions),
        across(dplyr::all_of(region_keys)), .keep_all=TRUE)

    ########################################################################
    # Order and subset based on x_order
    if(is.null(x_order)) {
        x_order = unique(annotated_regions[[x]])
    }
    # Data for the per-class bars; taken before the random regions are bound
    # on so that those only contribute to their own bar.
    sub_annot_regions = subset_order_tbl(tbl = annotated_regions, col = x, col_order = x_order)

    # Do particular things if annotated_random was supplied
    if(has_random) {
        # Tidy the GRanges into a data.frame for use with dplyr functions
        annotated_random = as.data.frame(annotated_random, row.names = NULL)

        # Order and subset the randomized annotations
        annotated_random = subset_order_tbl(tbl = annotated_random, col=fill, col_order=fill_order)

        # Take the distinct annotation types per unique random data region
        annotated_random = dplyr::distinct(dplyr::ungroup(annotated_random), across(c('seqnames', 'start', 'end', 'annot.type')), .keep_all=TRUE)

        # Combine the data.frames in preparation for visualization; the new
        # data_type column holds "All" or "Random Regions" for each row.
        annotated_regions = dplyr::bind_rows("All" = annotated_regions, "Random Regions" = annotated_random, .id = 'data_type')
    }

    ########################################################################
    # Construct the plot

    # Make base ggplot
    if(has_random) {
        # data_type places the rows on the "All" and "Random Regions" bars
        plot =
            ggplot(annotated_regions, aes_string(x='data_type')) +
            geom_bar(aes_string(fill=fill), position=position, width=0.5) + # The All bar
            geom_bar(data = sub_annot_regions, aes_string(x=x, fill=fill), position=position, width=0.5) + # The subsets bars
            theme(axis.text.x = element_text(angle = 30, hjust = 1))
    } else {
        # The constant 'All' puts every row into a single "All" bar
        plot =
            ggplot(annotated_regions, aes(x='All')) +
            geom_bar(aes_string(fill=fill), position=position, width=0.5) + # The All bar
            geom_bar(data = sub_annot_regions, aes_string(x=x, fill=fill), position=position, width=0.5) + # The subsets bars
            theme(axis.text.x = element_text(angle = 30, hjust = 1))
    }

    # Change the fill scale and name if legend_title was supplied
    if(!missing(legend_title)) {
        plot = plot + scale_fill_hue(name=legend_title)
    } else {
        plot = plot + scale_fill_hue()
    }

    # Deal with the x-axis labels to make sure the order is correct
    if(has_random) {
        plot = plot + scale_x_discrete(limits = c('All', x_order, 'Random Regions'))
    } else {
        if(x == 'annot.type') {
            plot = plot + scale_x_discrete(limits = c('All', names(tidy_annotations(x_order))))
        } else {
            plot = plot + scale_x_discrete(limits = c('All', x_order))
        }
    }

    # Add any user defined labels to the plot if they were supplied;
    # otherwise ggplot() will use its defaults
    if(!missing(plot_title)) {
        plot = plot + ggtitle(plot_title)
    }
    if(!missing(x_label)) {
        plot = plot + xlab(x_label)
    }
    if(!missing(y_label)) {
        plot = plot + ylab(y_label)
    }

    return(plot)
}

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Travis-CI Build Status](https://travis-ci.org/rcavalcante/annotatr.svg?branch=master)](https://travis-ci.org/rcavalcante/annotatr) [![Coverage Status](https://coveralls.io/repos/rcavalcante/annotatr/badge.svg?branch=master&service=github)](https://coveralls.io/github/rcavalcante/annotatr?branch=master) 2 | 3 | See the package vignette for a fully worked through use case. 4 | -------------------------------------------------------------------------------- /data-raw/create_example_annotations.R: -------------------------------------------------------------------------------- 1 | annotations = build_annotations(genome = 'hg19', annotations = 'hg19_cpgs') 2 | devtools::use_data(annotations, internal = FALSE, compress = 'xz') 3 | -------------------------------------------------------------------------------- /data/annotations.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/data/annotations.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite the R package 'annotatr' in publications use:") 2 | 3 | citEntry( 4 | entry = "article", 5 | title = "annotatr: genomic regions in context.", 6 | author = personList( 7 | as.person("Raymond G Cavalcante"), 8 | as.person("Maureen A Sartor") 9 | ), 10 | year = 2017, 11 | journal = "Bioinformatics", 12 | note = paste("R package version", meta$Version), 13 | textVersion = "Cavalcante RG, Sartor MA. annotatr: genomic regions in context. Bioinformatics. (2017) 33(15):2381-2383. 
doi:10.1093/bioinformatics/btx183" 14 | ) 15 | 16 | citFooter("This free open-source software implements academic research by the authors and co-workers. If you use it, please support the project by citing the appropriate journal articles.") 17 | -------------------------------------------------------------------------------- /inst/NEWS: -------------------------------------------------------------------------------- 1 | CHANGES IN VERSION 1.16.0 2 | ------------------------- 3 | 4 | USER-FACING CHANGES 5 | 6 | o Export expand_annotations(), tidy_annotations(), and subset_order_tbl(). 7 | 8 | BUGFIXES 9 | 10 | o Fix incorrect shortcut search for HMMs. 11 | 12 | CHANGES IN VERSION 1.6.0 13 | ------------------------ 14 | 15 | NEW FEATURES 16 | 17 | o Add support for chicken (galGal5). 18 | 19 | USER-FACING CHANGES 20 | 21 | o Add the ability to facet over two variables in plot_numerical(). 22 | o Add the ability to keep duplicate regions in summarize_categorical() and 23 | plot_categorical(). This is accomplished with the 'by' parameter in the 24 | former and by the 'x' and 'fill' parameters in the latter, and passing 25 | their contents into the '.dots' parameter of dplyr::distinct_(). 26 | o Make TxDb and OrgDb packages Suggests instead of Imports. NOTE: This saves 27 | space, but also requires downloading the appropriate packages as needed. 28 | o Add list_env() function to the annotatr_cache environment to see what 29 | custom annotations have been read in and added to the cache. 30 | 31 | BUGFIXES 32 | 33 | o Replace dplyr::summarize_each_() with dplyr::summarize_at() in line with 34 | deprecation in the dplyr package. 35 | o Prefix builtin_ functions with annotatr:: so that packages that Import 36 | annotatr don't encounter errors. 37 | 38 | CHANGES IN VERSION 1.2.0 39 | ------------------------ 40 | 41 | NEW FEATURES 42 | 43 | o Add support for CpG annotations for hg38, mm10, and rn6 via the UCSC goldenpath URLs. 
44 | o Add a function to build annotations from AnnotationHub resources, build_ah_annots(). 45 | o Add support for chromHMM tracks (chromatin state) from the UCSC Genome Browser. 46 | o Users may annotate to chromatin states in multiple cell lines, if desired. 47 | o Use rtracklayer::liftOver to lift hg19 and mm9 enhancers into hg38 and mm10. 48 | 49 | USER-FACING CHANGES 50 | 51 | o Add minoverlaps parameter to annotate_regions() that is passed to 52 | GenomicRanges::findOverlaps(). 53 | o Change supported_annotations() and supported_genomes() into builtin_annotations() 54 | and builtin_genomes(). This enables more flexibility required for AnnotationHub 55 | annotations. 56 | o Added documentation for coercing result of annotate_regions() to data.frame 57 | and subsetting based on gene symbol to the vignette. 58 | 59 | BUGFIXES 60 | 61 | o Fixed a bug in coercion of GRanges to data.frame where row.names could be 62 | duplicated. Thanks to @kdkorthauer. 63 | o Require GenomeInfoDb >= 1.10.3 because of changes to NCBI servers. 64 | o Change scale_fill_brewer() to scale_fill_hue() in plot_categorical() to enable 65 | more categories and avoid plotting abnormalities. 66 | o Fixed bug that mistakenly displayed some supported annotations. 67 | o Fixed a bug in lncRNA annotation building caused by incomplete reference. 68 | 69 | CHANGES IN VERSION 0.99.13 70 | -------------------------- 71 | 72 | PKG FEATURES 73 | 74 | o annotatr is a package to quickly and flexibly annotate genomic regions to 75 | genomic annotations. 76 | 77 | o Genomic annotations include CpG features (island, shore, shelves, and 78 | open sea), genic features (1-5kb upstream of TSS, promoters, 79 | 5'UTRs, exons, introns, CDS, 3'UTRs, intron/exon boundaries, and exon/ 80 | intron boundaries), as well as enhancers from the FANTOM5 consortium for 81 | hg19 and mm9. 82 | 83 | o Annotations are built at runtime using the TxDb.*, AnnotationHub, and 84 | rtracklayer packages. 
Users can select annotations a la carte, or via 85 | shortcuts, such as hg19_basicgenes. 86 | 87 | o Annotations are currently available for hg19, mm9, mm10, dm3, dm6, rn4, 88 | rn5, and rn6. Any species is supported through custom annotations. 89 | 90 | o Genomic regions are read in using the rtracklayer::import() function, and 91 | the extraCols argument enables users to include an arbitrary number of 92 | categorical or numerical data with the genomic regions. 93 | 94 | o Annotations are determined via GenomicRanges::findOverlaps(), and all 95 | annotations are returned, rather than imposing a prioritization. 96 | 97 | o annotatr provides several helpful summarization (using dplyr) and plot 98 | functions (using ggplot2) to investigate trends in data associated with the 99 | genomic regions over annotations. 100 | -------------------------------------------------------------------------------- /inst/extdata/Gm12878_Ezh2_peak_annotations.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/Gm12878_Ezh2_peak_annotations.txt.gz -------------------------------------------------------------------------------- /inst/extdata/Gm12878_Ezh2_sorted_scores.narrowPeak.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/Gm12878_Ezh2_sorted_scores.narrowPeak.gz -------------------------------------------------------------------------------- /inst/extdata/Gm12878_Stat3_chr2.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/Gm12878_Stat3_chr2.bed.gz -------------------------------------------------------------------------------- 
/inst/extdata/IDH2mut_v_NBM_multi_data_chr9.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/IDH2mut_v_NBM_multi_data_chr9.txt.gz -------------------------------------------------------------------------------- /inst/extdata/K562_Cjun_peak_annotations.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/inst/extdata/K562_Cjun_peak_annotations.txt.gz -------------------------------------------------------------------------------- /inst/extdata/test_BED3.bed: -------------------------------------------------------------------------------- 1 | chr1 10790 10805 2 | chr1 26800 28000 3 | chr1 28800 29000 4 | -------------------------------------------------------------------------------- /inst/extdata/test_BED4.bed: -------------------------------------------------------------------------------- 1 | chr1 10900 11000 test1 2 | chr1 26800 28000 test2 3 | chr1 28800 29000 test3 4 | -------------------------------------------------------------------------------- /inst/extdata/test_BED5.bed: -------------------------------------------------------------------------------- 1 | chr1 10900 11000 test1 32 2 | chr1 26800 28000 test2 46 3 | chr1 28800 29000 test3 36 4 | -------------------------------------------------------------------------------- /inst/extdata/test_BED6.bed: -------------------------------------------------------------------------------- 1 | chr1 10900 11000 test1 1000 + 2 | chr1 26800 28000 test2 1000 - 3 | chr1 28800 29000 test3 1000 - 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotation_nooverlap.bed: -------------------------------------------------------------------------------- 1 | chr1 8000 9000 2 | chr1 20000 24000 3 | 
chr1 28100 28200 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_3.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 2 | chr1 26500 28200 3 | chr1 28600 29200 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_4.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 test1 2 | chr1 26500 28200 test2 3 | chr1 28600 29200 test3 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_5.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 region1 . 2 | chr1 26500 28200 region2 . 3 | chr1 28600 29200 region3 . 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_6.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 region1 . + 2 | chr1 26500 28200 region2 . - 3 | chr1 28600 29200 region3 . + 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_6_gene.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 region1 . . 324 2 | chr1 26500 28200 region2 . . 4624 3 | chr1 28600 29200 region3 . . 3447 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_6_symbol.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 region1 . . BRCA 2 | chr1 26500 28200 region2 . . TP53 3 | chr1 28600 29200 region3 . . 
HOX1A 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_6_tx_gene_symbol.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 region1 . . 351236 BRCA ENST00000473358.1 2 | chr1 26500 28200 region2 . . 4624 TP53 ENST00000607096.1 3 | chr1 28600 29200 region3 . . 3447 HOX1A ENST00000496488.1 4 | -------------------------------------------------------------------------------- /inst/extdata/test_annotations_minoverlap.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 11500 2 | chr1 26500 26801 3 | chr1 28600 29200 4 | -------------------------------------------------------------------------------- /inst/extdata/test_bedGraph.bedGraph: -------------------------------------------------------------------------------- 1 | chr1 10900 11000 31 2 | chr1 26800 28000 36 3 | chr1 28800 29000 83 4 | -------------------------------------------------------------------------------- /inst/extdata/test_intersect.bed: -------------------------------------------------------------------------------- 1 | chr1 10900 11000 test1 1000 * 2 | chr1 26800 28000 test2 1000 * 3 | chr1 28800 29000 test3 1000 * 4 | -------------------------------------------------------------------------------- /inst/extdata/test_read_multiple_data_nohead.bed: -------------------------------------------------------------------------------- 1 | chr1 10800 10900 A 87 + 10e-4 100 13 Y 2 | chr1 11000 11100 A 45 - 1e-6 100 55 N 3 | chr1 27800 28800 A 34 - 0.04 41 7 Y 4 | chr1 29000 29300 B 62 + 0.001 95 33 Y 5 | -------------------------------------------------------------------------------- /man/annotate_regions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/intersect.R 3 | \name{annotate_regions} 4 | \alias{annotate_regions} 5 | 
\title{A function to intersect user region data with annotation data} 6 | \usage{ 7 | annotate_regions( 8 | regions, 9 | annotations, 10 | minoverlap = 1L, 11 | ignore.strand = TRUE, 12 | quiet = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{regions}{The GRanges object returned by \code{read_regions()}.} 17 | 18 | \item{annotations}{A character vector of annotations to build. Valid annotation codes are listed with \code{builtin_annotations()}. The "basicgenes" shortcut builds the following regions: 1-5Kb upstream of TSSs, promoters, 5UTRs, exons, introns, and 3UTRs. The "cpgs" shortcut builds the following regions: CpG islands, shores, shelves, and interCGI regions. NOTE: Shortcuts need to be prefixed with the genome, e.g. \code{hg19_basicgenes}. 19 | Custom annotations whose names are of the form \code{[genome]_custom_[name]} should also be included. Custom annotations should be read in and converted to \code{GRanges} with \code{read_annotations()}. They can be for a \code{supported_genome()}, or for an unsupported genome.} 20 | 21 | \item{minoverlap}{A scalar, positive integer, indicating the minimum required overlap of regions with annotations.} 22 | 23 | \item{ignore.strand}{Logical indicating whether strandedness should be ignored in findOverlaps(). Default TRUE.} 24 | 25 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 26 | } 27 | \value{ 28 | A \code{GRanges} where the \code{granges} are from the regions, and the \code{mcols} include the \code{mcols} from the regions and a column with the annotation \code{GRanges}. 29 | } 30 | \description{ 31 | Annotate genomic regions to selected genomic annotations while preserving the data associated with the genomic regions.
32 | } 33 | \examples{ 34 | r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr') 35 | extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character') 36 | r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage') 37 | 38 | # Get premade CpG annotations 39 | data('annotations', package = 'annotatr') 40 | 41 | a = annotate_regions( 42 | regions = r, 43 | annotations = annotations, 44 | ignore.strand = TRUE) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /man/annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotatr_data_doc.R 3 | \docType{data} 4 | \name{annotations} 5 | \alias{annotations} 6 | \title{example_annotations data} 7 | \format{ 8 | A \code{GRanges} object with the CpG feature annotations for hg19 9 | and containing \code{mcols}: 10 | \describe{ 11 | \item{id}{The internal ID for the annotation} 12 | \item{tx_id}{All NA, since these are not associated with tx_ids} 13 | \item{gene_id}{All NA, since there are not associated Entrez IDs} 14 | \item{symbols}{All NA, since there are not associated gene symbols} 15 | \item{type}{A character indicating the type of annotation. Including: 16 | 'hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves', and 'hg19_cpg_inter'.} 17 | } 18 | } 19 | \source{ 20 | The AnnotationHub resource for hg19 CpG features. 21 | } 22 | \usage{ 23 | annotations 24 | } 25 | \description{ 26 | A \code{GRanges} of precomputed annotations for CpG features. Created by doing 27 | \code{build_annotations(genome='hg19', annotations = 'hg19_cpgs')}. 
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/annotatr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/annotatr_package_doc.R 3 | \docType{package} 4 | \name{annotatr} 5 | \alias{annotatr} 6 | \title{annotatr: Annotation of Genomic Regions to Functional Annotations} 7 | \description{ 8 | Given a set of genomic sites/regions (e.g. ChIP-seq peaks, CpGs, differentially methylated CpGs or regions, SNPs, etc.) it is often of interest to investigate the intersecting functional annotations. Such annotations include those relating to gene models (promoters, 5'UTRs, exons, introns, and 3'UTRs), CpGs (CpG islands, CpG shores, CpG shelves), the non-coding genome, and enhancers. The annotatr package provides an easy way to summarize and visualize the intersection of genomic sites/regions with the above functional annotations. 9 | } 10 | -------------------------------------------------------------------------------- /man/annotatr_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \docType{data} 4 | \name{annotatr_cache} 5 | \alias{annotatr_cache} 6 | \title{A global-variable to hold custom annotations loaded in an R session} 7 | \format{ 8 | An object of class \code{list} of length 3. 9 | } 10 | \usage{ 11 | annotatr_cache 12 | } 13 | \value{ 14 | An environment to contain custom annotations from \code{read_annotations}. 15 | } 16 | \description{ 17 | Code thanks to Martin Morgan. This is a global variable that will store custom 18 | annotations that a user reads in during a session in which annotatr is loaded. 
19 | } 20 | \examples{ 21 | # Example usage 22 | annotatr_cache$set("foo", 1:10) 23 | annotatr_cache$get("foo") 24 | 25 | # Read in a BED3 file as a custom annotation 26 | file = system.file('extdata', 'test_annotations_3.bed', package='annotatr') 27 | # The custom annotation is added to the annotatr_cache environment in this function 28 | read_annotations(con = file, name = 'test', genome = 'hg19') 29 | # The result of read_annotations() is not visible in .GlobalEnv, instead 30 | # need to use the get method 31 | print(annotatr_cache$get('hg19_custom_test')) 32 | # See what is in the annotatr_cache 33 | annotatr_cache$list_env() 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /man/build_ah_annots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_ah_annots} 4 | \alias{build_ah_annots} 5 | \title{A helper function to build arbitrary annotations from AnnotationHub} 6 | \usage{ 7 | build_ah_annots(genome, ah_codes, annotation_class) 8 | } 9 | \arguments{ 10 | \item{genome}{The genome assembly.} 11 | 12 | \item{ah_codes}{A named character vector giving the AnnotationHub accession number (e.g. AH23256), and whose name describes what the annotation is (e.g. Gm12878_H3K4me3).} 13 | 14 | \item{annotation_class}{A string to name the group of annotations in \code{ah_codes}} 15 | } 16 | \value{ 17 | A \code{GRanges} object stored in \code{annotatr_cache}. To view an annotation built with this function, do \code{annotatr_cache$get(name)}. To add these annotations to a set of annotations, include \code{'[genome]_[annotation_class]_[name]'} in the call to \code{build_annotations()}. See example below.
18 | } 19 | \description{ 20 | A helper function to build arbitrary annotations from AnnotationHub 21 | } 22 | \examples{ 23 | 24 | # Create a named vector for the AnnotationHub accession codes with desired names 25 | h3k4me3_code = c('Gm12878' = 'AH23256') 26 | # Fetch ah_codes from AnnotationHub and create annotations annotatr understands 27 | build_ah_annots(genome = 'hg19', ah_codes = h3k4me3_code, annotation_class = 'H3K4me3') 28 | # The annotations as they appear in annotatr_cache 29 | annot_name = c('hg19_H3K4me3_Gm12878') 30 | # Build the annotations right before annotating any regions 31 | annotations = build_annotations(genome = 'hg19', annotations = annot_name) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/build_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_annotations} 4 | \alias{build_annotations} 5 | \title{A function to build annotations from TxDb.* and AnnotationHub resources} 6 | \usage{ 7 | build_annotations(genome, annotations) 8 | } 9 | \arguments{ 10 | \item{genome}{The genome assembly.} 11 | 12 | \item{annotations}{A character vector of annotations to build. Valid annotation codes are listed with \code{builtin_annotations()}. The "basicgenes" shortcut builds the following regions: 1-5Kb upstream of TSSs, promoters, 5UTRs, exons, introns, and 3UTRs. The "cpgs" shortcut builds the following regions: CpG islands, shores, shelves, and interCGI regions. NOTE: Shortcuts need to be prefixed with the genome, e.g. \code{hg19_basicgenes}. 13 | Custom annotations whose names are of the form \code{[genome]_custom_[name]} should also be included. Custom annotations should be read in and converted to \code{GRanges} with \code{read_annotations()}.
They can be for a \code{supported_genome()}, or for an unsupported genome.} 14 | } 15 | \value{ 16 | A \code{GRanges} object of all the \code{annotations} combined. The \code{mcols} are \code{id, tx_id, gene_id, symbol, type}. The \code{id} column is a unique name, the \code{tx_id} column is either a UCSC knownGene transcript ID (genic annotations) or an Ensembl transcript ID (lncRNA annotations), the \code{gene_id} is the Entrez ID, the \code{symbol} is the gene symbol from the \code{org.*.eg.db} mapping from the Entrez ID, and the \code{type} is of the form \code{[genome]_[type]_[name]}. 17 | } 18 | \description{ 19 | Create a \code{GRanges} object consisting of all the desired \code{annotations}. Supported annotation codes are listed by \code{builtin_annotations()}. The basis for enhancer annotations is FANTOM5 data, the basis for CpG related annotations is CpG island tracks from \code{AnnotationHub}, and the basis for genic annotations is the \code{TxDb.*} and \code{org.db} group of packages.
20 | } 21 | \examples{ 22 | # Example with hg19 gene promoters 23 | annots = c('hg19_genes_promoters') 24 | annots_gr = build_annotations(genome = 'hg19', annotations = annots) 25 | 26 | # See vignette for an example with custom annotation 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/build_cpg_annots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_cpg_annots} 4 | \alias{build_cpg_annots} 5 | \title{A helper function to build CpG related annotations.} 6 | \usage{ 7 | build_cpg_annots( 8 | genome = annotatr::builtin_genomes(), 9 | annotations = annotatr::builtin_annotations() 10 | ) 11 | } 12 | \arguments{ 13 | \item{genome}{The genome assembly.} 14 | 15 | \item{annotations}{A character vector with entries of the form \code{[genome]_cpg_{islands,shores,shelves,inter}}.} 16 | } 17 | \value{ 18 | A list of \code{GRanges} objects. 19 | } 20 | \description{ 21 | Using the \code{AnnotationHub} package, extract CpG island track for the appropriate \code{genome} and construct the shores, shelves, and interCGI annotations as desired. 22 | } 23 | -------------------------------------------------------------------------------- /man/build_enhancer_annots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_enhancer_annots} 4 | \alias{build_enhancer_annots} 5 | \title{A helper function to build enhancer annotations for hg19 and mm10 from FANTOM5.} 6 | \usage{ 7 | build_enhancer_annots(genome = c("hg19", "hg38", "mm9", "mm10")) 8 | } 9 | \arguments{ 10 | \item{genome}{The genome assembly.} 11 | } 12 | \value{ 13 | A \code{GRanges} object. 
14 | } 15 | \description{ 16 | A helper function to build enhancer annotations for hg19 and mm10 from FANTOM5. 17 | } 18 | -------------------------------------------------------------------------------- /man/build_gene_annots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_gene_annots} 4 | \alias{build_gene_annots} 5 | \title{A helper function to build genic annotations.} 6 | \usage{ 7 | build_gene_annots( 8 | genome = annotatr::builtin_genomes(), 9 | annotations = annotatr::builtin_annotations() 10 | ) 11 | } 12 | \arguments{ 13 | \item{genome}{The genome assembly.} 14 | 15 | \item{annotations}{A character vector with entries of the form \code{[genome]_genes_{1to5kb,promoters,5UTRs,cds,exons,firstexons,introns,intronexonboundaries,exonintronboundaries,3UTRs,intergenic}}.} 16 | } 17 | \value{ 18 | A list of \code{GRanges} objects with unique \code{id} of the form \code{[type]:i}, \code{tx_id} being the UCSC knownGene transcript name, \code{gene_id} being the Entrez Gene ID, \code{symbol} being the gene symbol from the Entrez ID to symbol mapping in \code{org.db} for that species, and \code{type} being the annotation type. 19 | } 20 | \description{ 21 | Using the \code{TxDb.*} group of packages, construct genic annotations consisting of any combination of 1-5kb upstream of a TSS, promoters (< 1kb from TSS), 5UTRs, CDS, exons, first exons, introns, intron/exon and exon/intron boundaries, 3UTRs, and intergenic. 
22 | } 23 | -------------------------------------------------------------------------------- /man/build_hmm_annots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_hmm_annots} 4 | \alias{build_hmm_annots} 5 | \title{A helper function to build chromHMM annotations for hg19 from UCSC Genome Browser.} 6 | \usage{ 7 | build_hmm_annots( 8 | genome = c("hg19"), 9 | annotations = annotatr::builtin_annotations() 10 | ) 11 | } 12 | \arguments{ 13 | \item{genome}{The genome assembly.} 14 | 15 | \item{annotations}{A character vector of valid chromatin state annotation codes.} 16 | } 17 | \value{ 18 | A \code{GRanges} object. 19 | } 20 | \description{ 21 | A helper function to build chromHMM annotations for hg19 from UCSC Genome Browser. 22 | } 23 | -------------------------------------------------------------------------------- /man/build_lncrna_annots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/build_annotations.R 3 | \name{build_lncrna_annots} 4 | \alias{build_lncrna_annots} 5 | \title{A helper function to build lncRNA annotations.} 6 | \usage{ 7 | build_lncrna_annots(genome = c("hg19", "hg38", "mm10")) 8 | } 9 | \arguments{ 10 | \item{genome}{The genome assembly.} 11 | } 12 | \value{ 13 | A \code{GRanges} object with \code{id} giving the \code{transcript_type} from the GENCODE file, \code{tx_id} being the Ensembl transcript ID, \code{gene_id} being the Entrez ID coming from a mapping of gene symbol to Entrez ID, \code{symbol} being the gene_name from the GENCODE file, and the \code{type} being \code{[genome]_lncrna_gencode}. 14 | } 15 | \description{ 16 | Using the \code{AnnotationHub} package, retrieve transcript level lncRNA annotations for either human (GRCh38) or mouse (GRCm38).
If the genome is 'hg19', use the permalink from GENCODE and \code{rtracklayer::import()} to download and process. 17 | } 18 | -------------------------------------------------------------------------------- /man/builtin_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{builtin_annotations} 4 | \alias{builtin_annotations} 5 | \title{Function listing which annotations are available.} 6 | \usage{ 7 | builtin_annotations() 8 | } 9 | \value{ 10 | A character vector of available annotations. 11 | } 12 | \description{ 13 | This includes the shortcuts. The \code{expand_annotations()} function helps 14 | handle the shortcuts. 15 | } 16 | \examples{ 17 | builtin_annotations() 18 | 19 | } 20 | -------------------------------------------------------------------------------- /man/builtin_genomes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{builtin_genomes} 4 | \alias{builtin_genomes} 5 | \title{Function returning supported TxDb.* genomes} 6 | \usage{ 7 | builtin_genomes() 8 | } 9 | \value{ 10 | A character vector of genomes for supported TxDb.* packages 11 | } 12 | \description{ 13 | Function returning supported TxDb.* genomes 14 | } 15 | \examples{ 16 | builtin_genomes() 17 | 18 | } 19 | -------------------------------------------------------------------------------- /man/check_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{check_annotations} 4 | \alias{check_annotations} 5 | \title{Function to check for valid annotations} 6 | \usage{ 7 | check_annotations(annotations) 8 | } 9 | \arguments{ 10 | \item{annotations}{A character vector of 
annotations possibly using the shortcuts} 11 | } 12 | \value{ 13 | If all the checks on the annotations pass, returns NULL to allow code to move forward. 14 | } 15 | \description{ 16 | Gives errors if any annotations are not in builtin_annotations() (and they are not in the required custom format), basicgenes are used, or the genome prefixes are not the same for all annotations. 17 | } 18 | -------------------------------------------------------------------------------- /man/expand_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{expand_annotations} 4 | \alias{expand_annotations} 5 | \title{Function to expand annotation shortcuts} 6 | \usage{ 7 | expand_annotations(annotations) 8 | } 9 | \arguments{ 10 | \item{annotations}{A character vector of annotations, possibly using the shortcut accessors} 11 | } 12 | \value{ 13 | A vector of data accession-ized names that are ordered from upstream to downstream in the case of knownGenes and islands to interCGI in the case of cpgs. 
14 | } 15 | \description{ 16 | Function to expand annotation shortcuts 17 | } 18 | -------------------------------------------------------------------------------- /man/get_cellline_from_code.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_cellline_from_code} 4 | \alias{get_cellline_from_code} 5 | \title{Function to return cell line from chromatin annotation code} 6 | \usage{ 7 | get_cellline_from_code(code) 8 | } 9 | \arguments{ 10 | \item{code}{The annotation code, used in \code{build_annotations()}.} 11 | } 12 | \value{ 13 | A string of the cell line used in a chromatin annotation code 14 | } 15 | \description{ 16 | Function to return cell line from chromatin annotation code 17 | } 18 | -------------------------------------------------------------------------------- /man/get_cellline_from_shortcut.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_cellline_from_shortcut} 4 | \alias{get_cellline_from_shortcut} 5 | \title{Function to return cell line from chromatin annotation shortcut} 6 | \usage{ 7 | get_cellline_from_shortcut(shortcut) 8 | } 9 | \arguments{ 10 | \item{shortcut}{The annotation shortcut, used in \code{build_annotations()}.} 11 | } 12 | \value{ 13 | A string of the cell line used in a chromatin annotation shortcut 14 | } 15 | \description{ 16 | Function to return cell line from chromatin annotation shortcut 17 | } 18 | -------------------------------------------------------------------------------- /man/get_orgdb_name.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_orgdb_name} 4 | \alias{get_orgdb_name} 5 | \title{Function 
to get correct org.* package name based on genome} 6 | \usage{ 7 | get_orgdb_name(genome = annotatr::builtin_genomes()) 8 | } 9 | \arguments{ 10 | \item{genome}{A string giving the genome assembly.} 11 | } 12 | \value{ 13 | A string giving the correct org for org.db packages. e.g. hg19 -> Hs. 14 | } 15 | \description{ 16 | Function to get correct org.* package name based on genome 17 | } 18 | -------------------------------------------------------------------------------- /man/get_txdb_name.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{get_txdb_name} 4 | \alias{get_txdb_name} 5 | \title{Function to get correct TxDb.* package name based on genome} 6 | \usage{ 7 | get_txdb_name(genome = annotatr::builtin_genomes()) 8 | } 9 | \arguments{ 10 | \item{genome}{A string giving the genome assembly.} 11 | } 12 | \value{ 13 | A string giving the name of the correct TxDb.* package name based on \code{genome}. 
14 | } 15 | \description{ 16 | Function to get correct TxDb.* package name based on genome 17 | } 18 | -------------------------------------------------------------------------------- /man/plot_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualize.R 3 | \name{plot_annotation} 4 | \alias{plot_annotation} 5 | \title{Plot the number of regions per annotation} 6 | \usage{ 7 | plot_annotation( 8 | annotated_regions, 9 | annotated_random, 10 | annotation_order = NULL, 11 | plot_title, 12 | x_label, 13 | y_label, 14 | quiet = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.} 19 | 20 | \item{annotated_random}{The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}.} 21 | 22 | \item{annotation_order}{A character vector which doubles as the subset of annotations desired for the plot as well as the ordering. If \code{NULL}, all annotations are displayed.} 23 | 24 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.} 25 | 26 | \item{x_label}{A string used for the x-axis label. If missing, no x-axis label is displayed.} 27 | 28 | \item{y_label}{A string used for the y-axis label. If missing, no y-axis label is displayed.} 29 | 30 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 31 | } 32 | \value{ 33 | A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited. 34 | } 35 | \description{ 36 | Given a \code{GRanges} of annotated regions, plot the number of regions with the corresponding genomic annotations used in \code{annotation_order}. If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once in the corresponding bar plot. 
For example, if a region were annotated to multiple exons, it would only count once toward the exon bar in the plot, but if it were annotated to an exon and an intron, it would count towards both. 37 | } 38 | \examples{ 39 | ######################################################################## 40 | # An example of ChIP-seq peaks with signalValue used for score 41 | 42 | # Get premade CpG annotations 43 | data('annotations', package = 'annotatr') 44 | 45 | chip_bed = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr') 46 | chip_regions = read_regions(con = chip_bed, genome = 'hg19') 47 | 48 | chip_rnd = randomize_regions(regions = chip_regions) 49 | 50 | chip_annots = annotate_regions( 51 | regions = chip_regions, 52 | annotations = annotations, 53 | ignore.strand = TRUE) 54 | 55 | chip_rnd_annots = annotate_regions( 56 | regions = chip_rnd, 57 | annotations = annotations, 58 | ignore.strand = TRUE) 59 | 60 | annots_order = c( 61 | 'hg19_cpg_islands', 62 | 'hg19_cpg_shores') 63 | 64 | p_annots = plot_annotation(annotated_regions = chip_annots, 65 | annotation_order = annots_order) 66 | p_annots_rnd = plot_annotation(annotated_regions = chip_annots, 67 | annotated_random = chip_rnd_annots, annotation_order = annots_order) 68 | 69 | } 70 | -------------------------------------------------------------------------------- /man/plot_categorical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualize.R 3 | \name{plot_categorical} 4 | \alias{plot_categorical} 5 | \title{Plot a categorical data variable over another} 6 | \usage{ 7 | plot_categorical( 8 | annotated_regions, 9 | annotated_random, 10 | x, 11 | fill = NULL, 12 | x_order = NULL, 13 | fill_order = NULL, 14 | position = "stack", 15 | plot_title, 16 | legend_title, 17 | x_label, 18 | y_label, 19 | quiet = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | 
\item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.} 24 | 25 | \item{annotated_random}{The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}. Random regions can only be used with \code{fill == 'annot.type'}.} 26 | 27 | \item{x}{One of 'annot.type' or a categorical data column, indicating whether annotation classes or data classes will appear on the x-axis.} 28 | 29 | \item{fill}{One of 'annot.type', a categorical data column, or \code{NULL}, indicating whether annotation classes or data classes will fill the bars. If \code{NULL} then the bars will be the total counts of the x classes.} 30 | 31 | \item{x_order}{A character vector that subsets and orders the x classes. Default \code{NULL}, uses existing values.} 32 | 33 | \item{fill_order}{A character vector that subsets and orders the fill classes. Default \code{NULL}, uses existing values.} 34 | 35 | \item{position}{A string which has the same possible values as in \code{ggplot2::geom_bar(..., position)}, i.e., 'stack', 'fill', 'dodge', etc.} 36 | 37 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.} 38 | 39 | \item{legend_title}{A string used for the legend title to describe fills (if fill is not \code{NULL}). Default displays corresponding variable name.} 40 | 41 | \item{x_label}{A string used for the x-axis label. If missing, corresponding variable name used.} 42 | 43 | \item{y_label}{A string used for the y-axis label. If missing, corresponding variable name used.} 44 | 45 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 46 | } 47 | \value{ 48 | A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}. 49 | } 50 | \description{ 51 | Given a \code{GRanges} of annotated regions from \code{annotate_regions()}, visualize the distribution of categorical data \code{fill} in categorical data \code{x}.
A bar representing the distribution of all \code{fill} in \code{x} will be added according to the contents of \code{fill}. This is the distribution over all values of \code{x}. Additionally, when \code{annotated_random} is not missing, a "Random Regions" bar shows the distribution of random regions over \code{fill}. 52 | } 53 | \details{ 54 | For example, suppose a differentially methylated region has the categorical label hyper, and is annotated to a promoter, a 5UTR, two exons, and an intron. Each annotation will appear in the All bar once. Likewise for the hyper bar if the differential methylation status is chosen as \code{x} with \code{annot.type} chosen as \code{fill}. 55 | } 56 | \examples{ 57 | # Get premade CpG annotations 58 | data('annotations', package = 'annotatr') 59 | 60 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 61 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 62 | dm_regions = read_regions(con = dm_file, extraCols = extraCols, genome = 'hg19', 63 | rename_score = 'pval', rename_name = 'DM_status', format = 'bed') 64 | dm_regions = dm_regions[1:1000] 65 | 66 | dm_annots = annotate_regions( 67 | regions = dm_regions, 68 | annotations = annotations, 69 | ignore.strand = TRUE) 70 | 71 | dm_order = c( 72 | 'hyper', 73 | 'hypo') 74 | cpg_order = c( 75 | 'hg19_cpg_islands', 76 | 'hg19_cpg_shores', 77 | 'hg19_cpg_shelves', 78 | 'hg19_cpg_inter') 79 | 80 | dm_vn = plot_categorical( 81 | annotated_regions = dm_annots, 82 | x = 'DM_status', 83 | fill = 'annot.type', 84 | x_order = dm_order, 85 | fill_order = cpg_order, 86 | position = 'fill', 87 | legend_title = 'knownGene Annotations', 88 | x_label = 'DM status', 89 | y_label = 'Proportion') 90 | 91 | # Create randomized regions 92 | dm_rnd_regions = randomize_regions(regions = dm_regions) 93 | dm_rnd_annots = annotate_regions( 94 | regions = dm_rnd_regions, 95 | annotations = annotations, 96 | ignore.strand = TRUE) 97 | 98 |
dm_vn_rnd = plot_categorical( 99 | annotated_regions = dm_annots, 100 | annotated_random = dm_rnd_annots, 101 | x = 'DM_status', 102 | fill = 'annot.type', 103 | x_order = dm_order, 104 | fill_order = cpg_order, 105 | position = 'fill', 106 | legend_title = 'knownGene Annotations', 107 | x_label = 'DM status', 108 | y_label = 'Proportion') 109 | 110 | } 111 | -------------------------------------------------------------------------------- /man/plot_coannotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualize.R 3 | \name{plot_coannotations} 4 | \alias{plot_coannotations} 5 | \title{Plot pair-wise annotations across regions} 6 | \usage{ 7 | plot_coannotations( 8 | annotated_regions, 9 | annotation_order = NULL, 10 | plot_title, 11 | axes_label, 12 | quiet = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.} 17 | 18 | \item{annotation_order}{A character vector which doubles as the subset of annotations desired for plot as well as the ordering. If \code{NULL}, all annotations are displayed.} 19 | 20 | \item{plot_title}{A string used for the title of the plot. If missing, no plot title label is displayed.} 21 | 22 | \item{axes_label}{A string used for the axis labels. If missing, corresponding variable name used.} 23 | 24 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 25 | } 26 | \value{ 27 | A \code{ggplot} object which can be viewed by calling it, saved with \code{ggplot2::ggsave}, or edited. 28 | } 29 | \description{ 30 | All co-occurring annotations associated with a region are computed and displayed as a heatmap. 31 | } 32 | \details{ 33 | As with \code{plot_annotation()}, the number in each cell is the number of unique regions annotated to the pair of annotations. 
34 | 35 | For example, if a region is annotated to both a CpG shore and to two different exons simultaneously, the region will only be counted once in the CpG shore / exon cell. NOTE, this same region will count once in both the CpG shore and exon cells on the diagonal. 36 | } 37 | \examples{ 38 | # Get premade CpG annotations 39 | data('annotations', package = 'annotatr') 40 | 41 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 42 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 43 | dm_regions = read_regions(con = dm_file, extraCols = extraCols, 44 | rename_score = 'pval', rename_name = 'DM_status', format = 'bed') 45 | dm_regions = dm_regions[1:1000] 46 | 47 | dm_annots = annotate_regions( 48 | regions = dm_regions, 49 | annotations = annotations, 50 | ignore.strand = TRUE) 51 | 52 | all_order = c( 53 | 'hg19_cpg_islands', 54 | 'hg19_cpg_shores', 55 | 'hg19_cpg_shelves', 56 | 'hg19_cpg_inter') 57 | 58 | dm_vs_ca = plot_coannotations( 59 | annotated_regions = dm_annots, 60 | annotation_order = all_order, 61 | axes_label = 'Annotations', 62 | plot_title = 'Co-occurrence of Annotations') 63 | 64 | } 65 | -------------------------------------------------------------------------------- /man/plot_numerical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualize.R 3 | \name{plot_numerical} 4 | \alias{plot_numerical} 5 | \title{Plot numerical data over regions or regions summarized over annotations} 6 | \usage{ 7 | plot_numerical( 8 | annotated_regions, 9 | x, 10 | y, 11 | facet, 12 | facet_order, 13 | bin_width = 10, 14 | plot_title, 15 | x_label, 16 | y_label, 17 | legend_facet_label, 18 | legend_cum_label, 19 | quiet = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | \item{annotated_regions}{A \code{GRanges} returned from \code{annotate_regions()}. 
If the data is not summarized, the data is at the region level. If it is summarized, it represents the average or standard deviation of the regions by the character vector used for \code{by} in \code{summarize_numerical()}.} 24 | 25 | \item{x}{A string indicating the column of the \code{GRanges} to use for the x-axis.} 26 | 27 | \item{y}{A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.} 28 | 29 | \item{facet}{A string, or character vector of two strings, indicating which categorical variable(s) in the \code{GRanges} to make \code{ggplot2} facets over. When two facets are given, the first entry is the vertical facet and the second entry is the horizontal facet. Default is \code{annot.type}.} 30 | 31 | \item{facet_order}{A character vector, or list of character vectors if \code{facet} has length 2, which gives the order of the facets, and can be used to subset the column in the \code{GRanges} used for the \code{facet}. For example, if \code{facet = 'annot.type'}, then the annotations may be subsetted to just CpG annotations. Default is \code{NULL}, meaning all annotations in their default order are used.} 32 | 33 | \item{bin_width}{An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is \code{NULL}.} 34 | 35 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.} 36 | 37 | \item{x_label}{A string used for the x-axis label. If missing, no x-axis label is displayed.} 38 | 39 | \item{y_label}{A string used for the y-axis label. If missing, no y-axis label is displayed.} 40 | 41 | \item{legend_facet_label}{A string used to label the gray bar portion of the legend. Defaults to "x in facet".} 42 | 43 | \item{legend_cum_label}{A string used to label the red outline portion of the legend.
Defaults to "All in x".} 44 | 45 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 46 | } 47 | \value{ 48 | A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}. 49 | } 50 | \description{ 51 | This function produces either histograms over \code{facet}, or x-y scatterplots over \code{facet}. In the case of histograms over facets, the All distribution (hollow histogram with red outline) is the distribution of \code{x} over all the regions in the data. The facet specific distributions (solid gray) are the distribution of \code{x} over the regions in each facet. For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution, but will count once each in the CpG island and promoter facet distributions. 52 | } 53 | \examples{ 54 | # An example with multi-columned data 55 | 56 | # Get premade CpG annotations 57 | data('annotations', package = 'annotatr') 58 | 59 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 60 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 61 | dm_regions = read_regions(con = dm_file, extraCols = extraCols, 62 | rename_score = 'pval', rename_name = 'DM_status', format = 'bed') 63 | dm_regions = dm_regions[1:1000] 64 | 65 | # Annotate the regions 66 | dm_annots = annotate_regions( 67 | regions = dm_regions, 68 | annotations = annotations, 69 | ignore.strand = TRUE) 70 | 71 | # Plot histogram of group 1 methylation rates across the CpG annotations. 72 | # NOTE: Overall distribution (everything in \code{facet_order}) 73 | # is plotted in each facet for comparison. 
74 | dm_vs_regions_mu1 = plot_numerical( 75 | annotated_regions = dm_annots, 76 | x = 'mu1', 77 | facet = 'annot.type', 78 | facet_order = c('hg19_cpg_islands','hg19_cpg_shores', 79 | 'hg19_cpg_shelves','hg19_cpg_inter'), 80 | bin_width = 5, 81 | plot_title = 'Group 1 Methylation over CpG Annotations', 82 | x_label = 'Group 1 Methylation') 83 | 84 | # Plot histogram of group 1 methylation rates across the CpG annotations 85 | # crossed with DM_status 86 | dm_vs_regions_diffmeth = plot_numerical( 87 | annotated_regions = dm_annots, 88 | x = 'diff_meth', 89 | facet = c('annot.type','DM_status'), 90 | facet_order = list( 91 | c('hg19_genes_promoters','hg19_genes_5UTRs','hg19_cpg_islands'), 92 | c('hyper','hypo','none')), 93 | bin_width = 5, 94 | plot_title = 'Group 0 Region Methylation In Genes', 95 | x_label = 'Methylation Difference') 96 | 97 | # Can also use the result of annotate_regions() to plot two numerical 98 | # data columns against each other for each region, and facet by annotations. 99 | dm_vs_regions_annot = plot_numerical( 100 | annotated_regions = dm_annots, 101 | x = 'mu0', 102 | y = 'mu1', 103 | facet = 'annot.type', 104 | facet_order = c('hg19_cpg_islands','hg19_cpg_shores', 105 | 'hg19_cpg_shelves','hg19_cpg_inter'), 106 | plot_title = 'Region Methylation: Group 0 vs Group 1', 107 | x_label = 'Group 0', 108 | y_label = 'Group 1') 109 | 110 | # Another example, but using differential methylation status as the facets. 
111 | dm_vs_regions_name = plot_numerical( 112 | annotated_regions = dm_annots, 113 | x = 'mu0', 114 | y = 'mu1', 115 | facet = 'DM_status', 116 | facet_order = c('hyper','hypo','none'), 117 | plot_title = 'Region Methylation: Group 0 vs Group 1', 118 | x_label = 'Group 0', 119 | y_label = 'Group 1') 120 | 121 | } 122 | -------------------------------------------------------------------------------- /man/plot_numerical_coannotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualize.R 3 | \name{plot_numerical_coannotations} 4 | \alias{plot_numerical_coannotations} 5 | \title{Plot numerical data occurring in pairs of annotations} 6 | \usage{ 7 | plot_numerical_coannotations( 8 | annotated_regions, 9 | x, 10 | y, 11 | annot1, 12 | annot2, 13 | bin_width = 10, 14 | plot_title, 15 | x_label, 16 | y_label, 17 | legend_facet_label, 18 | legend_cum_label, 19 | quiet = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | \item{annotated_regions}{A \code{GRanges} returned from \code{annotate_regions()}.} 24 | 25 | \item{x}{A string indicating the column of the \code{GRanges} to use for the x-axis.} 26 | 27 | \item{y}{A string indicating the column of the \code{GRanges} to use for the y-axis. If missing, a histogram over \code{x} will be plotted. If not missing, a scatterplot is plotted.} 28 | 29 | \item{annot1}{A string indicating the first annotation type.} 30 | 31 | \item{annot2}{A string indicating the second annotation type.} 32 | 33 | \item{bin_width}{An integer indicating the bin width of the histogram used for score. Default 10. Select something appropriate for the data. NOTE: This is only used if \code{y} is \code{NULL}.} 34 | 35 | \item{plot_title}{A string used for the title of the plot. If missing, no title is displayed.} 36 | 37 | \item{x_label}{A string used for the x-axis label. 
If missing, no x-axis label is displayed.} 38 | 39 | \item{y_label}{A string used for the y-axis label. If missing, no y-axis label is displayed.} 40 | 41 | \item{legend_facet_label}{A string used to label the gray bar portion of the legend. Defaults to "x in annot pair".} 42 | 43 | \item{legend_cum_label}{A string used to label the red outline portion of the legend. Defaults to "All x".} 44 | 45 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 46 | } 47 | \value{ 48 | A \code{ggplot} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}. 49 | } 50 | \description{ 51 | Plot numerical data associated with regions occurring in \code{annot1}, \code{annot2} and in both. As with \code{plot_numerical()}, the result is a plot of histograms or x-y scatterplots. 52 | } 53 | \details{ 54 | For example, a CpG with associated percent methylation annotated to a CpG island and a promoter will count once in the All distribution and once in the CpG island / promoter facet distribution. However, a CpG associated only with a promoter will count once in the All distribution and once in the promoter / promoter distribution. 55 | } 56 | \examples{ 57 | # Get premade CpG annotations 58 | data('annotations', package = 'annotatr') 59 | 60 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 61 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 62 | dm_regions = read_regions(con = dm_file, extraCols = extraCols, 63 | rename_score = 'pval', rename_name = 'DM_status', format = 'bed') 64 | dm_regions = dm_regions[1:1000] 65 | 66 | dm_annots = annotate_regions( 67 | regions = dm_regions, 68 | annotations = annotations, 69 | ignore.strand = TRUE) 70 | 71 | dm_vs_num_co = plot_numerical_coannotations( 72 | annotated_regions = dm_annots, 73 | x = 'mu0', 74 | annot1 = 'hg19_cpg_islands', 75 | annot2 = 'hg19_cpg_shelves', 76 | bin_width = 5, 77 | plot_title = 'Group 0 Perc. Meth. 
in CpG Islands and Promoters', 78 | x_label = 'Percent Methylation') 79 | 80 | } 81 | -------------------------------------------------------------------------------- /man/randomize_regions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/randomize.R 3 | \name{randomize_regions} 4 | \alias{randomize_regions} 5 | \title{Randomize Regions} 6 | \usage{ 7 | randomize_regions( 8 | regions, 9 | allow.overlaps = TRUE, 10 | per.chromosome = TRUE, 11 | quiet = FALSE 12 | ) 13 | } 14 | \arguments{ 15 | \item{regions}{A \code{GRanges} object from \code{read_regions}.} 16 | 17 | \item{allow.overlaps}{A logical stating whether random regions can overlap input regions (TRUE) or not (FALSE). Default TRUE.} 18 | 19 | \item{per.chromosome}{A logical stating whether the random regions should remain on the same chromosome (TRUE) or not (FALSE). Default TRUE.} 20 | 21 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 22 | } 23 | \value{ 24 | A \code{GRanges} object of randomized regions based on \code{regions} from \code{read_regions()}. NOTE: Data associated with the original regions is not attached to the randomized regions. 25 | } 26 | \description{ 27 | \code{randomize_regions} is a wrapper function for \code{regioneR::randomizeRegions()} that simplifies the creation of randomized regions for an input set of regions read with \code{read_regions()}. It relies on the \code{seqlengths} of \code{regions} in order to build the appropriate \code{genome} object for \code{regioneR::randomizeRegions()}. 28 | } 29 | \details{ 30 | NOTE: The data associated with the input \code{regions} are not passed on to the random regions. 
31 | } 32 | \examples{ 33 | # Create random region set based on ENCODE ChIP-seq data 34 | file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr') 35 | r = read_regions(con = file, genome = 'hg19') 36 | 37 | random_r = randomize_regions(regions = r) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /man/read_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{read_annotations} 4 | \alias{read_annotations} 5 | \title{Read custom annotations} 6 | \usage{ 7 | read_annotations(con, name, genome = NA, format, extraCols = character(), ...) 8 | } 9 | \arguments{ 10 | \item{con}{A path, URL, connection or BEDFile object. See \code{rtracklayer::import.bed()} documentation.} 11 | 12 | \item{name}{A string for the name of the annotations to be used in the name of the object, [genome]_custom_[name]} 13 | 14 | \item{genome}{From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'. An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available.} 15 | 16 | \item{format}{From \code{rtracklayer::import()}: The format of the output. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'.} 17 | 18 | \item{extraCols}{From \code{rtracklayer::import.bed()}: A character vector in the same form as 'colClasses' from 'read.table'. It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. 
This enables parsing of the various BEDX+Y formats.} 19 | 20 | \item{...}{Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.} 21 | } 22 | \value{ 23 | A \code{GRanges} object stored in \code{annotatr_cache}. To view a custom annotation, do \code{annotatr_cache$get('[genome]_custom_[name]')}. To add a custom annotation to the set of annotations, include \code{'[genome]_custom_[name]'} in the call to \code{build_annotations()}. See example below. 24 | } 25 | \description{ 26 | \code{read_annotations()} is a wrapper for the \code{rtracklayer::import()} function that creates a \code{GRanges} object matching the structure of annotations built with \code{build_annotations()}. The structure is a \code{GRanges} whose \code{mcols()} have the names \code{c('id','tx_id','gene_id','symbol','type')}. 27 | } 28 | \examples{ 29 | 30 | # Read in a BED3 file as a custom annotation 31 | file = system.file('extdata', 'test_annotations_3.bed', package='annotatr') 32 | read_annotations(con = file, name = 'test', genome = 'hg19') 33 | build_annotations(genome = 'hg19', annotations = 'hg19_custom_test') 34 | 35 | print(annotatr_cache$get('hg19_custom_test')) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /man/read_regions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{read_regions} 4 | \alias{read_regions} 5 | \title{Read genomic regions in BEDX+Y format} 6 | \usage{ 7 | read_regions( 8 | con, 9 | genome = NA, 10 | format, 11 | extraCols = character(), 12 | rename_name, 13 | rename_score, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{con}{A path, URL, connection or BEDFile object. See \code{rtracklayer::import()} documentation.} 19 | 20 | \item{genome}{From \code{rtracklayer::import()}: The identifier of a genome, or NA if unknown. Typically, this is a UCSC identifier like 'hg19'.
An attempt will be made to derive the \code{seqinfo} on the return value using either an installed BSgenome package or UCSC, if network access is available.} 21 | 22 | \item{format}{From \code{rtracklayer::import()}: The format of the output. If not missing, should be one of 'bed', 'bed15', 'bedGraph' or 'bedpe'. If missing and 'con' is a filename, the format is derived from the file extension. This argument is unnecessary when 'con' is a derivative of 'RTLFile'.} 23 | 24 | \item{extraCols}{From \code{rtracklayer::import()}: A character vector in the same form as 'colClasses' from 'read.table'. It should indicate the name and class of each extra/special column to read from the BED file. As BED does not encode column names, these are assumed to be the last columns in the file. This enables parsing of the various BEDX+Y formats.} 25 | 26 | \item{rename_name}{A string to rename the name column of the BED file. For example, if the name column actually contains a categorical variable.} 27 | 28 | \item{rename_score}{A string to rename the score column of the BED file. For example, if the score column represents a quantity about the data besides the score in the BED specification.} 29 | 30 | \item{...}{Parameters to pass onto the format-specific method of \code{rtracklayer::import()}.} 31 | } 32 | \value{ 33 | A \code{GRanges} object. 34 | } 35 | \description{ 36 | \code{read_regions()} reads genomic regions by calling the \code{rtracklayer::import()} function. This function can automatically deal with BEDX files from BED3 to BED6. For BED6+Y, the \code{extraCols} argument should be used to correctly interpret the extra columns. 37 | } 38 | \details{ 39 | NOTE: The \code{name} (4th) and \code{score} (5th) columns are so named. If these columns have a particular meaning for your data, they should be renamed with the \code{rename_name} and/or \code{rename_score} parameters. 
40 | } 41 | \examples{ 42 | 43 | # Example of reading a BED6+3 file where the last 3 columns are non-standard 44 | file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 45 | extraCols = c(diff_meth = 'numeric', mu0 = 'numeric', mu1 = 'numeric') 46 | gr = read_regions(con = file, genome = 'hg19', extraCols = extraCols, format = 'bed', 47 | rename_name = 'DM_status', rename_score = 'pval') 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/reformat_hmm_codes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{reformat_hmm_codes} 4 | \alias{reformat_hmm_codes} 5 | \title{Function to recode classes from chromHMM type column} 6 | \usage{ 7 | reformat_hmm_codes(hmm_codes) 8 | } 9 | \arguments{ 10 | \item{hmm_codes}{in the original form from UCSC Genome Browser track.} 11 | } 12 | \value{ 13 | A character vector of chromHMM classes with numbers and underscores removed. 
14 | } 15 | \description{ 16 | Function to recode classes from chromHMM type column 17 | } 18 | -------------------------------------------------------------------------------- /man/subset_order_tbl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{subset_order_tbl} 4 | \alias{subset_order_tbl} 5 | \title{Function to subset a tbl_df or grouped_df by a column} 6 | \usage{ 7 | subset_order_tbl(tbl, col, col_order) 8 | } 9 | \arguments{ 10 | \item{tbl}{A \code{tbl_df} or \code{grouped_df}.} 11 | 12 | \item{col}{A string indicating which column of \code{tbl} to subset and order.} 13 | 14 | \item{col_order}{A character vector indicating the order of \code{col}.} 15 | } 16 | \value{ 17 | A modified version of \code{tbl} with \code{col} subsetted by \code{col_order}. 18 | } 19 | \description{ 20 | Function to subset a tbl_df or grouped_df by a column 21 | } 22 | -------------------------------------------------------------------------------- /man/summarize_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{summarize_annotations} 4 | \alias{summarize_annotations} 5 | \title{Summarize annotation counts} 6 | \usage{ 7 | summarize_annotations(annotated_regions, annotated_random, quiet = FALSE) 8 | } 9 | \arguments{ 10 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.} 11 | 12 | \item{annotated_random}{The \code{GRanges} result of \code{annotate_regions()} on the randomized regions created from \code{randomize_regions()}.} 13 | 14 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 15 | } 16 | \value{ 17 | A \code{tbl_df} of the number of regions per annotation type.
18 | } 19 | \description{ 20 | Given a \code{GRanges} of annotated regions, count the number of regions in each annotation type. If \code{annotated_random} is not \code{NULL}, then the same is computed for the random regions. 21 | } 22 | \details{ 23 | If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both. 24 | } 25 | \examples{ 26 | ### An example of ChIP-seq peaks with signalValue 27 | 28 | # Get premade CpG annotations 29 | data('annotations', package = 'annotatr') 30 | 31 | file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr') 32 | r = read_regions(con = file, genome = 'hg19') 33 | 34 | a = annotate_regions( 35 | regions = r, 36 | annotations = annotations, 37 | ignore.strand = TRUE, 38 | quiet = FALSE) 39 | 40 | rnd = randomize_regions(regions = r) 41 | 42 | rnd_annots = annotate_regions( 43 | regions = rnd, 44 | annotations = annotations, 45 | ignore.strand = TRUE, 46 | quiet = FALSE) 47 | 48 | # Summarize the annotated regions without randomized regions 49 | s = summarize_annotations(annotated_regions = a) 50 | 51 | # Summarize the annotated regions with randomized regions 52 | s_rnd = summarize_annotations( 53 | annotated_regions = a, 54 | annotated_random = rnd_annots) 55 | 56 | } 57 | -------------------------------------------------------------------------------- /man/summarize_categorical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{summarize_categorical} 4 | \alias{summarize_categorical} 5 | \title{Summarize categorical data over groupings of annotated regions} 6 | \usage{ 7 | summarize_categorical( 8 | annotated_regions, 9 | by = 
c("annot.type", "annot.id"), 10 | quiet = FALSE 11 | ) 12 | } 13 | \arguments{ 14 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.} 15 | 16 | \item{by}{A character vector to group the data in \code{as.data.frame(annotated_regions)} by and tally over. Default is \code{c('annot.type', 'annot.id')}.} 17 | 18 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 19 | } 20 | \value{ 21 | A grouped \code{dplyr::tbl_df} of the counts of groupings according to the \code{by} vector. 22 | } 23 | \description{ 24 | Given a \code{GRanges} of annotated regions, count the number of regions when the annotations are grouped \code{by} categorical columns. 25 | } 26 | \details{ 27 | If a region is annotated to multiple annotations of the same \code{annot.type}, the region will only be counted once. For example, if a region were annotated to multiple exons, it would only count once toward the exons, but if it were annotated to an exon and an intron, it would count towards both. 
28 | } 29 | \examples{ 30 | 31 | # Get premade CpG annotations 32 | data('annotations', package = 'annotatr') 33 | 34 | r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr') 35 | extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character') 36 | r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage') 37 | 38 | a = annotate_regions( 39 | regions = r, 40 | annotations = annotations, 41 | ignore.strand = TRUE) 42 | 43 | sc = summarize_categorical( 44 | annotated_regions = a, 45 | by = c('annot.type', 'name'), 46 | quiet = FALSE) 47 | 48 | } 49 | -------------------------------------------------------------------------------- /man/summarize_numerical.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{summarize_numerical} 4 | \alias{summarize_numerical} 5 | \title{Summarize numerical data over groupings of annotated regions} 6 | \usage{ 7 | summarize_numerical( 8 | annotated_regions, 9 | by = c("annot.type", "annot.id"), 10 | over, 11 | quiet = FALSE 12 | ) 13 | } 14 | \arguments{ 15 | \item{annotated_regions}{The \code{GRanges} result of \code{annotate_regions()}.} 16 | 17 | \item{by}{A character vector of the columns of \code{as.data.frame(annotated_regions)} to group over. Default is \code{c('annot.type', 'annot.id')}.} 18 | 19 | \item{over}{A character vector of the numerical columns in \code{as.data.frame(annotated_regions)} to \code{count}, take the \code{mean}, and take the \code{sd} over after grouping according to the \code{by} column. NOTE: If more than one value is used, the naming scheme for the resulting \code{dplyr::tbl} summary columns is \code{COLNAME_n}, \code{COLNAME_mean}, \code{COLNAME_sd}.
If \code{over} has length one, then the column names are \code{n}, \code{mean}, \code{sd}.} 20 | 21 | \item{quiet}{Print progress messages (FALSE) or not (TRUE).} 22 | } 23 | \value{ 24 | A grouped \code{dplyr::tbl_df}, and the \code{count}, \code{mean}, and \code{sd} of the \code{cols} \code{by} the groupings. 25 | } 26 | \description{ 27 | Given a \code{GRanges} of annotated regions, summarize numerical data columns based on a grouping. 28 | } 29 | \details{ 30 | NOTE: We do not take the distinct values of \code{seqnames}, \code{start}, \code{end}, \code{annot.type} as in the other \code{summarize_*()} functions because in the case of a region that intersected two distinct exons, using \code{distinct()} would destroy the information of the mean of the numerical column over one of the exons, which is not desirable. 31 | } 32 | \examples{ 33 | ### Test on a very simple bed file to demonstrate different options 34 | 35 | # Get premade CpG annotations 36 | data('annotations', package = 'annotatr') 37 | 38 | r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr') 39 | extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character') 40 | r = read_regions(con = r_file, genome = 'hg19', extraCols = extraCols, rename_score = 'coverage') 41 | 42 | a = annotate_regions( 43 | regions = r, 44 | annotations = annotations, 45 | ignore.strand = TRUE) 46 | 47 | # Testing over normal by 48 | sn1 = summarize_numerical( 49 | annotated_regions = a, 50 | by = c('annot.type', 'annot.id'), 51 | over = c('coverage', 'mu1', 'mu0'), 52 | quiet = FALSE) 53 | 54 | # Testing over a different by 55 | sn2 = summarize_numerical( 56 | annotated_regions = a, 57 | by = c('diff_exp'), 58 | over = c('coverage', 'mu1', 'mu0')) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /man/tidy_annotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated 
by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{tidy_annotations} 4 | \alias{tidy_annotations} 5 | \title{Function to tidy up annotation accessors for visualization} 6 | \usage{ 7 | tidy_annotations(annotations) 8 | } 9 | \arguments{ 10 | \item{annotations}{A character vector of annotations, in the order they are to appear in the visualization.} 11 | } 12 | \value{ 13 | A list of mappings from original annotation names to names ready for visualization. 14 | } 15 | \description{ 16 | Function to tidy up annotation accessors for visualization 17 | } 18 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(annotatr) 3 | 4 | test_check("annotatr") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_1_utils.R: -------------------------------------------------------------------------------- 1 | context('Test utility functions') 2 | 3 | test_that('Test get_*_name() functions', { 4 | expect_error( 5 | get_txdb_name(genome = 'hg18'), 6 | 'should be one of') 7 | 8 | expect_error( 9 | get_orgdb_name(genome = 'hg18'), 10 | 'should be one of') 11 | }) 12 | # The names of the tidy_annotations() result must be the display labels, in the same order as the input codes 13 | test_that('Test tidy_annotations()', { 14 | hg19_annots = c('hg19_cpg_islands', 'hg19_cpg_inter', 'hg19_genes_firstexons', 'hg19_genes_intronexonboundaries', 'hg19_genes_exonintronboundaries', 'hg19_lncrna_gencode', 'hg19_chromatin_Gm12878-ActivePromoter') 15 | mm9_annots = c('mm9_cpg_islands','mm9_genes_exonsCDSs','mm9_cpg_inter') 16 | rn4_custom_annots = c('rn4_custom_cpgislands','rn4_custom_TFBS') 17 | 18 | hg19_tidy_annots = tidy_annotations(hg19_annots) 19 | mm9_tidy_annots = tidy_annotations(mm9_annots) 20 | rn4_tidy_annots = tidy_annotations(rn4_custom_annots) 21 | 22 | expect_equal( all(names(hg19_tidy_annots) == c('CpG islands', 'interCGI', 'first exons',
'intron/exon boundaries', 'exon/intron boundaries', 'GENCODE lncRNA', 'Gm12878-ActivePromoter')), expected = TRUE) 23 | expect_equal( all(names(mm9_tidy_annots) == c('CpG islands', 'exonsCDSs', 'interCGI')), expected = TRUE) 24 | expect_equal( all(names(rn4_tidy_annots) == c('cpgislands', 'TFBS')), expected = TRUE) 25 | 26 | }) 27 | 28 | test_that('Test check_annotations()', { 29 | annots1 = c('hg17_genes_promoters','hg19_cpgs') 30 | annots2 = c('hello','hg19_genes_promoters','hg19_cpgs') 31 | annots3 = c('hg19_genes_promoters', 'mm9_cpg_islands') 32 | 33 | expect_error( check_annotations(annots1), 'not supported. See builtin_annotations()' ) 34 | expect_error( check_annotations(annots2), 'not supported. See builtin_annotations()' ) 35 | expect_error( check_annotations(annots3), 'genome prefix on all annotations must be the same' ) 36 | }) 37 | 38 | test_that('Test expand_annotations()', { 39 | annots1 = c('hg19_genes_promoters', 'hg19_genes_exons') 40 | 41 | annots2 = c('mm9_basicgenes', 'mm9_cpgs') 42 | expanded_annots2 = c('mm9_cpg_islands', 'mm9_cpg_shores', 'mm9_cpg_shelves', 'mm9_cpg_inter', 'mm9_genes_1to5kb', 'mm9_genes_promoters', 'mm9_genes_5UTRs', 'mm9_genes_exons', 'mm9_genes_introns', 'mm9_genes_3UTRs') 43 | 44 | annots3 = c('hg19_cpg_shores', 'hg19_cpgs') 45 | expanded_annots3 = c('hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves','hg19_cpg_inter') 46 | 47 | annots4 = c('hg19_Hepg2-chromatin') 48 | expanded_annots4 = c('hg19_chromatin_Hepg2-ActivePromoter','hg19_chromatin_Hepg2-WeakPromoter','hg19_chromatin_Hepg2-PoisedPromoter','hg19_chromatin_Hepg2-StrongEnhancer','hg19_chromatin_Hepg2-WeakEnhancer','hg19_chromatin_Hepg2-Insulator','hg19_chromatin_Hepg2-TxnTransition','hg19_chromatin_Hepg2-TxnElongation','hg19_chromatin_Hepg2-WeakTxn','hg19_chromatin_Hepg2-Repressed','hg19_chromatin_Hepg2-Heterochrom/lo','hg19_chromatin_Hepg2-Repetitive/CNV') 49 | 50 | expect_equal( dplyr::setequal(expand_annotations(annots1), annots1), expected
= TRUE ) 51 | expect_equal( dplyr::setequal(expand_annotations(annots2), expanded_annots2), expected = TRUE ) 52 | expect_equal( dplyr::setequal(expand_annotations(annots3), expanded_annots3), expected = TRUE ) 53 | expect_equal( dplyr::setequal(expand_annotations(annots4), 54 | expanded_annots4), expected = TRUE) 55 | }) 56 | -------------------------------------------------------------------------------- /tests/testthat/test_2_read.R: -------------------------------------------------------------------------------- 1 | context('Test read module') 2 | 3 | ################################################################################ 4 | # Test warnings in read_regions() 5 | 6 | test_that('Test rename_* warnings', { 7 | file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr') 8 | 9 | expect_warning( 10 | read_regions(con = file, format = 'bed', rename_name = 'hello'), 11 | 'Ignoring rename_name parameter because') 12 | 13 | expect_warning( 14 | read_regions(con = file, format = 'bed', rename_score = 'score'), 15 | 'Ignoring rename_score parameter because') 16 | }) 17 | 18 | ################################################################################ 19 | # Test BED3-6+ and bedGraph 20 | 21 | test_that('Test BED3', { 22 | file = system.file('extdata', 'test_BED3.bed', package = 'annotatr') 23 | r = read_regions(con = file, format = 'bed') 24 | 25 | expect_true(is(r, 'GRanges')) 26 | }) 27 | 28 | test_that('Test BED4', { 29 | file = system.file('extdata', 'test_BED4.bed', package = 'annotatr') 30 | r = read_regions(con = file, format = 'bed') 31 | 32 | expect_true(is(r, 'GRanges')) 33 | }) 34 | 35 | test_that('Test BED5', { 36 | file = system.file('extdata', 'test_BED5.bed', package = 'annotatr') 37 | r = read_regions(con = file, format = 'bed') 38 | 39 | expect_true(is(r, 'GRanges')) 40 | }) 41 | 42 | test_that('Test BED6', { 43 | file = system.file('extdata', 'test_BED6.bed', package = 'annotatr') 44 | r = read_regions(con = file, 
format = 'bed') 45 | 46 | expect_true(is(r, 'GRanges')) 47 | }) 48 | 49 | test_that('Test BED6+ with renaming', { 50 | file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 51 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 52 | r = read_regions(con = file, extraCols = extraCols, rename_score = 'pval', rename_name = 'DM_status', format = 'bed') 53 | 54 | expect_true(is(r, 'GRanges')) 55 | }) 56 | 57 | test_that('Test bedGraph', { 58 | file = system.file('extdata', 'test_bedGraph.bedGraph', package = 'annotatr') 59 | r = read_regions(con = file, format = 'bedGraph') 60 | 61 | expect_true(is(r, 'GRanges')) 62 | }) 63 | 64 | ################################################################################ 65 | # Test 66 | 67 | test_that('Test custom BED3 with no genome and a name', { 68 | file = system.file('extdata', 'test_annotations_3.bed', package = 'annotatr') 69 | read_annotations(con = file, name = 'test', format = 'bed') 70 | 71 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_test'))) == c('id','tx_id','gene_id','symbol','type')) ) 72 | }) 73 | 74 | test_that('Test custom BED3 with no name and a genome', { 75 | file = system.file('extdata', 'test_annotations_3.bed', package = 'annotatr') 76 | read_annotations(con = file, genome = 'hg19', format = 'bed') 77 | 78 | expect_true( all(colnames(mcols(annotatr_cache$get('hg19_custom_annotations'))) == c('id','tx_id','gene_id','symbol','type')) ) 79 | }) 80 | 81 | test_that('Test custom BED3 with no name or genome', { 82 | file = system.file('extdata', 'test_annotations_3.bed', package = 'annotatr') 83 | read_annotations(con = file, format = 'bed') 84 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_annotations'))) == c('id','tx_id','gene_id','symbol','type')) ) 85 | }) 86 | 87 | test_that('Test custom BED4', { 88 | file = system.file('extdata', 'test_annotations_4.bed', package = 'annotatr') 89 | read_annotations(con 
= file, format = 'bed') 90 | 91 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_test'))) == c('id','tx_id','gene_id','symbol','type')) ) 92 | }) 93 | 94 | test_that('Test custom BED5', { 95 | file = system.file('extdata', 'test_annotations_5.bed', package = 'annotatr') 96 | read_annotations(con = file, format = 'bed') 97 | 98 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_test'))) == c('id','tx_id','gene_id','symbol','type')) ) 99 | }) 100 | 101 | test_that('Test custom BED6', { 102 | file = system.file('extdata', 'test_annotations_6.bed', package = 'annotatr') 103 | read_annotations(con = file, name = 'six', format = 'bed') 104 | 105 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_six'))) == c('id','tx_id','gene_id','symbol','type')) ) 106 | }) 107 | 108 | test_that('Test custom BED6 with gene_id', { 109 | file = system.file('extdata', 'test_annotations_6_gene.bed', package = 'annotatr') 110 | extraCols = c(gene_id = 'character') 111 | read_annotations(con = file, name = 'geneid', format = 'bed', extraCols = extraCols) 112 | 113 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_geneid'))) == c('id','tx_id','gene_id','symbol','type')) ) 114 | }) 115 | 116 | test_that('Test custom BED6 with symbol', { 117 | file = system.file('extdata', 'test_annotations_6_symbol.bed', package = 'annotatr') 118 | extraCols = c(symbol = 'character') 119 | read_annotations(con = file, name = 'symbol', format = 'bed', extraCols = extraCols) 120 | 121 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_symbol'))) == c('id','tx_id','gene_id','symbol','type')) ) 122 | }) 123 | 124 | test_that('Test custom BED6 with symbol nad gene_id', { 125 | file = system.file('extdata', 'test_annotations_6_tx_gene_symbol.bed', package = 'annotatr') 126 | extraCols = c(gene_id = 'character', symbol = 'character', tx_id = 'character') 127 | read_annotations(con = file, name = 'txgenesymbol', format = 'bed', 
extraCols = extraCols) 128 | 129 | expect_true( all(colnames(mcols(annotatr_cache$get('genome_custom_txgenesymbol'))) == c('id','tx_id','gene_id','symbol','type')) ) 130 | }) 131 | -------------------------------------------------------------------------------- /tests/testthat/test_3_build_annotations.R: -------------------------------------------------------------------------------- 1 | context('Test build annotations module') 2 | 3 | ################################################################################ 4 | # Test errors 5 | 6 | test_that('Test error for non-existent custom annotations', { 7 | expect_error( 8 | build_annotations(genome = 'hg19', annotations = 'hg19_custom_ezh2'), 9 | 'not in annotatr_cache' 10 | ) 11 | }) 12 | 13 | ################################################################################ 14 | # Test annotations that aren't otherwise tested 15 | # intergenic, cds, firstexons, and both boundaries 16 | 17 | # test_that('Test all annotations', { 18 | # annots = c('hg19_basicgenes', 'hg19_cpgs', 'hg19_genes_intergenic', 'hg19_genes_cds', 'hg19_genes_firstexons', 'hg19_genes_intronexonboundaries', 'hg19_genes_exonintronboundaries', 'hg19_lncrna_gencode', 'hg19_Gm12878-chromatin', 'hg19_H1hesc-chromatin', 'hg19_Hepg2-chromatin', 'hg19_Hmec-chromatin', 'hg19_Hsmm-chromatin', 'hg19_Huvec-chromatin', 'hg19_K562-chromatin', 'hg19_Nhek-chromatin', 'hg19_Nhlf-chromatin') 19 | # annotations = build_annotations(genome = 'hg19', annotations = annots) 20 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 21 | # 22 | # annots = c('hg38_basicgenes', 'hg38_cpgs', 'hg38_genes_intergenic', 'hg38_genes_cds', 'hg38_genes_firstexons', 'hg38_genes_intronexonboundaries', 'hg38_genes_exonintronboundaries', 'hg38_lncrna_gencode', 'hg38_enhancers_fantom') 23 | # annotations = build_annotations(genome = 'hg38', annotations = annots) 24 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 
25 | # 26 | # annots = c('mm10_basicgenes', 'mm10_cpgs', 'mm10_genes_intergenic', 'mm10_genes_cds', 'mm10_genes_firstexons', 'mm10_genes_intronexonboundaries', 'mm10_genes_exonintronboundaries', 'mm10_lncrna_gencode', 'mm10_enhancers_fantom') 27 | # annotations = build_annotations(genome = 'mm10', annotations = annots) 28 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 29 | # 30 | # annots = c('mm9_basicgenes', 'mm9_cpgs', 'mm9_genes_intergenic', 'mm9_genes_cds', 'mm9_genes_firstexons', 'mm9_genes_intronexonboundaries', 'mm9_genes_exonintronboundaries', 'mm9_enhancers_fantom') 31 | # annotations = build_annotations(genome = 'mm9', annotations = annots) 32 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 33 | # 34 | # annots = c('rn4_basicgenes', 'rn4_cpgs', 'rn4_genes_intergenic', 'rn4_genes_cds', 'rn4_genes_firstexons', 'rn4_genes_intronexonboundaries', 'rn4_genes_exonintronboundaries') 35 | # annotations = build_annotations(genome = 'rn4', annotations = annots) 36 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 37 | # 38 | # annots = c('rn5_basicgenes', 'rn5_cpgs', 'rn5_genes_intergenic', 'rn5_genes_cds', 'rn5_genes_firstexons', 'rn5_genes_intronexonboundaries', 'rn5_genes_exonintronboundaries') 39 | # annotations = build_annotations(genome = 'rn5', annotations = annots) 40 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 41 | # 42 | # annots = c('rn6_basicgenes', 'rn6_cpgs', 'rn6_genes_intergenic', 'rn6_genes_cds', 'rn6_genes_firstexons', 'rn6_genes_intronexonboundaries', 'rn6_genes_exonintronboundaries') 43 | # annotations = build_annotations(genome = 'rn6', annotations = annots) 44 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 45 | # 46 | # annots = c('dm3_basicgenes', 'dm3_genes_intergenic', 'dm3_genes_cds', 'dm3_genes_firstexons', 
'dm3_genes_intronexonboundaries', 'dm3_genes_exonintronboundaries') 47 | # annotations = build_annotations(genome = 'dm3', annotations = annots) 48 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 49 | # 50 | # annots = c('dm6_basicgenes', 'dm6_genes_intergenic', 'dm6_genes_cds', 'dm6_genes_firstexons', 'dm6_genes_intronexonboundaries', 'dm6_genes_exonintronboundaries') 51 | # annotations = build_annotations(genome = 'dm6', annotations = annots) 52 | # expect_true( dplyr::setequal(unique(annotations$type), expand_annotations(annots)) ) 53 | # }) 54 | -------------------------------------------------------------------------------- /tests/testthat/test_4_intersect.R: -------------------------------------------------------------------------------- 1 | context('Test intersect/annotate module') 2 | 3 | ################################################################################ 4 | # Test errors 5 | 6 | test_that('Test error thrown for non-GRanges regions object in annotate_regions()',{ 7 | annotations = c('hg19_cpg_islands') 8 | 9 | bed = system.file('extdata', 'test_intersect.bed', package = 'annotatr') 10 | r = suppressMessages(read_regions(con = bed, format = 'bed')) 11 | 12 | expect_error( 13 | annotate_regions( 14 | regions = bed, 15 | annotations = annotations, 16 | ignore.strand = TRUE, 17 | quiet = TRUE), 18 | "regions object is not GRanges") 19 | 20 | expect_error( 21 | annotate_regions( 22 | regions = r, 23 | annotations = bed, 24 | ignore.strand = TRUE, 25 | quiet = TRUE), 26 | "annotations object is not GRanges") 27 | 28 | a_file = system.file('extdata', 'test_annotation_nooverlap.bed', package = 'annotatr') 29 | read_annotations(con = a_file, name = 'test') 30 | annotations = build_annotations(genome = 'hg19', annotations = 'genome_custom_test') 31 | expect_error( 32 | annotate_regions( 33 | regions = r, 34 | annotations = annotations, 35 | ignore.strand = TRUE, 36 | quiet = TRUE), 37 | "No annotations intersect 
the regions") 38 | }) 39 | 40 | ################################################################################ 41 | # Test annotate_regions() 42 | 43 | test_that('Test a la carte annotations in annotate_regions()',{ 44 | # Get premade CpG annotations 45 | annots = expand_annotations('hg19_cpgs') 46 | data('annotations', package = 'annotatr') 47 | 48 | bed = system.file('extdata', 'test_intersect.bed', package = 'annotatr') 49 | r = read_regions(con = bed, format = 'bed') 50 | 51 | i = annotate_regions( 52 | regions = r, 53 | annotations = annotations, 54 | ignore.strand = TRUE, 55 | quiet = TRUE) 56 | 57 | expect_true( all(unique(i$annot$type) %in% expand_annotations(annots)) ) 58 | }) 59 | 60 | test_that('Test a la carte and shortcut annotations in annotate_regions()',{ 61 | data('annotations', package = 'annotatr') 62 | 63 | file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 'annotatr') 64 | r = read_regions(con = file, format = 'bed') 65 | 66 | i = annotate_regions( 67 | regions = r, 68 | annotations = annotations, 69 | ignore.strand = TRUE, 70 | quiet = TRUE) 71 | 72 | expect_true( all(unique(i$annot$type) %in% c('hg19_cpg_islands', 'hg19_cpg_shores', 'hg19_cpg_shelves', 'hg19_cpg_inter')) ) 73 | }) 74 | 75 | test_that('Custom annotations work in annotate_regions()', { 76 | r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr') 77 | extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character') 78 | r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage') 79 | 80 | a_file = system.file('extdata', 'test_annotations_3.bed', package='annotatr') 81 | read_annotations(con = a_file, name = 'TFBS', genome = 'hg19') 82 | 83 | annots = c('hg19_custom_TFBS', 'hg19_cpgs') 84 | annotations = build_annotations(genome = 'hg19', annotations = annots) 85 | 86 | a = annotate_regions( 87 | regions = r, 88 | annotations = annotations, 89 | ignore.strand = TRUE, 90 | quiet = 
TRUE) 91 | 92 | expect_equal(length(a) == 10, expected = TRUE) 93 | }) 94 | 95 | test_that('annotate_regions() works with only custom annotations', { 96 | r_file = system.file('extdata', 'test_read_multiple_data_nohead.bed', package='annotatr') 97 | extraCols = c(pval = 'numeric', mu1 = 'integer', mu0 = 'integer', diff_exp = 'character') 98 | r = read_regions(con = r_file, extraCols = extraCols, rename_score = 'coverage') 99 | 100 | a_file = system.file('extdata', 'test_annotations_3.bed', package='annotatr') 101 | read_annotations(con = a_file, name = 'TFBS') 102 | annotations = build_annotations(genome = 'hg19', annotations = 'genome_custom_TFBS') 103 | 104 | a = annotate_regions( 105 | regions = r, 106 | annotations = annotations, 107 | ignore.strand = TRUE, 108 | quiet = FALSE) 109 | 110 | expect_equal(length(a) == 5, expected = TRUE) 111 | }) 112 | 113 | test_that('annotate_regions() uses minoverlap correctly', { 114 | file = system.file('extdata', 'test_BED3.bed', package = 'annotatr') 115 | r = read_regions(con = file, format = 'bed') 116 | 117 | a_file = system.file('extdata', 'test_annotations_minoverlap.bed', package='annotatr') 118 | read_annotations(con = a_file, name = 'TFBS') 119 | annotations = build_annotations(genome = 'hg19', annotations = 'genome_custom_TFBS') 120 | 121 | a = annotate_regions( 122 | regions = r, 123 | annotations = annotations, 124 | minoverlap = 5) 125 | 126 | expect_equal(length(a) == 2, expected = TRUE) 127 | expect_true(all(GenomicRanges::start(a) == c(10791,28801))) 128 | }) 129 | -------------------------------------------------------------------------------- /tests/testthat/test_5_randomize.R: -------------------------------------------------------------------------------- 1 | context('Test randomize module') 2 | 3 | ################################################################################ 4 | # Setup objects for plot_categorical() 5 | 6 | file = system.file('extdata', 'Gm12878_Stat3_chr2.bed.gz', package = 
'annotatr') 7 | regions_genome = read_regions(con = file, genome = 'hg19', format = 'bed') 8 | regions_nogenome = read_regions(con = file, format = 'bed') 9 | 10 | ################################################################################ 11 | # Test errors 12 | 13 | test_that('Test errors', { 14 | expect_error( 15 | randomize_regions(regions = 'hello', allow.overlaps = TRUE, per.chromosome = TRUE), 16 | 'regions must have class GRanges') 17 | expect_error( 18 | randomize_regions(regions = regions_nogenome), 19 | 'GRanges object must have a valid genome' 20 | ) 21 | }) 22 | 23 | ################################################################################ 24 | # Test randomize_regions() 25 | 26 | test_that('Test randomized regions', { 27 | random_regions = randomize_regions( 28 | regions = regions_genome, 29 | allow.overlaps = TRUE, 30 | per.chromosome = TRUE) 31 | 32 | expect_equal(class(random_regions)[1], expected = 'GRanges') 33 | expect_equal(length(random_regions), expected = length(regions_genome)) 34 | }) 35 | -------------------------------------------------------------------------------- /tests/testthat/test_6_summarize.R: -------------------------------------------------------------------------------- 1 | context('Test summarize module') 2 | 3 | data('annotations', package = 'annotatr') 4 | 5 | bed = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 6 | extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric') 7 | r = suppressMessages(read_regions(con = bed, genome = 'hg19', extraCols = extraCols, rename_score = 'pval', rename_name = 'DM_status', format = 'bed')) 8 | r = r[1:1000] 9 | r$cancer_status = 'Cancer' 10 | r2 = r 11 | r2$cancer_status = 'NoCancer' 12 | 13 | r_dup = c(r,r2) 14 | 15 | a = suppressMessages(annotate_regions( 16 | regions = r, 17 | annotations = annotations, 18 | ignore.strand = TRUE, 19 | quiet = TRUE)) 20 | 21 | a_dup = suppressMessages(annotate_regions( 22 | regions = 
r_dup, 23 | annotations = annotations, 24 | ignore.strand = TRUE, 25 | quiet = TRUE)) 26 | 27 | rnd = suppressMessages(randomize_regions(regions = r)) 28 | 29 | rnd_annot = suppressMessages(annotate_regions( 30 | regions = rnd, 31 | annotations = annotations, 32 | ignore.strand = TRUE, 33 | quiet = TRUE)) 34 | 35 | ################################################################################ 36 | # Test errors 37 | 38 | test_that('Test for error with over=NULL in summarize_numerical()',{ 39 | expect_error(summarize_numerical(annotated_regions = a), 40 | 'over cannot be missing') 41 | }) 42 | 43 | ################################################################################ 44 | # Test summarize functions 45 | 46 | test_that('Test summarize_annotations()', { 47 | s = summarize_annotations(annotated_regions = a, quiet = FALSE) 48 | 49 | srand = summarize_annotations( 50 | annotated_regions = a, 51 | annotated_random = rnd_annot, 52 | quiet = FALSE) 53 | 54 | # NOTE: For small data it is possible that the random regions won't 55 | # intersect all CpG types so the second test may fail. Moreover, 56 | # if you are going to compute fold changes, corresponding random 57 | # rows may be missing if the data is too small... 
58 | expect_equal( sum(s[['n']]), expected = 1064) 59 | expect_equal( nrow(srand), expected = 8) 60 | }) 61 | 62 | test_that('Test summarize_numerical()', { 63 | s = summarize_numerical( 64 | annotated_regions = a, 65 | by = c('annot.type', 'annot.id'), 66 | over = 'diff_meth', 67 | quiet = TRUE) 68 | 69 | expect_equal( mean(s[['mean']]), expected = 2.424537, tolerance = 0.01) 70 | }) 71 | 72 | test_that('Test summarize_numerical() and summarize_categorical() over small data', { 73 | # Testing summarize_numerical() 74 | sn1 = summarize_numerical( 75 | annotated_regions = a, 76 | by = c('annot.type', 'annot.id'), 77 | over = 'diff_meth', 78 | quiet = FALSE) 79 | sn2 = summarize_numerical( 80 | annotated_regions = a, 81 | by = c('DM_status'), 82 | over = c('diff_meth', 'mu1', 'mu0'), 83 | quiet = TRUE) 84 | 85 | # Testing summarize_categorical() 86 | sc1 = summarize_categorical( 87 | annotated_regions = a, 88 | by = c('annot.type', 'DM_status'), 89 | quiet = FALSE) 90 | 91 | # Testing maintanence of duplicate regions with different categories 92 | sc2 = summarize_categorical( 93 | annotated_regions = a_dup, 94 | by = c('annot.type', 'cancer_status'), 95 | quiet = FALSE) 96 | 97 | expect_equal( sn1[['mean']][which(sn1[['annot.id']] == 'inter:8599')], expected = -1.0066888, tolerance = 0.01) 98 | expect_equal( sn2[['mu0_mean']][which(sn2[['DM_status']] == 'hyper')], expected = 16.34614, tolerance = 0.01) 99 | 100 | expect_equal( sc1[['n']][which(sc1[['annot.type']] == 'hg19_cpg_inter' & sc1[,'DM_status'] == 'hyper')], expected = 19) 101 | 102 | expect_true( sc2[['n']][which(sc2[['annot.type']] == 'hg19_cpg_inter' & sc2[,'cancer_status'] == 'Cancer')] == sc2[['n']][which(sc2[['annot.type']] == 'hg19_cpg_inter' & sc2[,'cancer_status'] == 'NoCancer')] ) 103 | }) 104 | -------------------------------------------------------------------------------- /tests/testthat/test_7_visualize.R: -------------------------------------------------------------------------------- 1 | 
context('Test plot module')

################################################################################
# Setup annotation objects
data('annotations', package = 'annotatr')

################################################################################
# Setup region and annotated-region objects shared by the plot tests below

dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr')
extraCols = c(diff_meth = 'numeric', mu1 = 'numeric', mu0 = 'numeric')
dm_regions = suppressMessages(read_regions(con = dm_file, genome = 'hg19', extraCols = extraCols, rename_score = 'pval', rename_name = 'DM_status', format = 'bed'))
# Keep only the first 1000 regions for the tests
dm_regions = dm_regions[1:1000]
dm_regions$cancer_status = 'Cancer'
dm_regions2 = dm_regions
dm_regions2$cancer_status = 'NoCancer'

# Same coordinates twice, differing only in the cancer_status column
duplicate_regions = c(dm_regions, dm_regions2)

dm_random_regions = suppressMessages(randomize_regions(regions = dm_regions))

dm_annots = suppressMessages(annotate_regions(
    regions = dm_regions,
    annotations = annotations,
    ignore.strand = TRUE,
    quiet = TRUE))

dm_dup_annots = suppressMessages(annotate_regions(
    regions = duplicate_regions,
    annotations = annotations,
    ignore.strand = TRUE,
    quiet = TRUE))

dm_random_annots = suppressMessages(annotate_regions(
    regions = dm_random_regions,
    annotations = annotations,
    ignore.strand = TRUE,
    quiet = TRUE))

################################################################################
# Setup order vectors and plots that will work

dm_order = c(
    'hyper',
    'hypo',
    'none')
cpgs_order = c(
    'hg19_cpg_islands',
    'hg19_cpg_shores',
    'hg19_cpg_shelves',
    'hg19_cpg_inter')

################################################################################
# Test plot_annotation()

test_that('Test plot_annotation() errors', {
    # None of the requested order elements match, so a warning is expected
    expect_warning(
        plot_annotation(annotated_regions = dm_annots, annotation_order = c('hypor','hype','')),
        'elements in col_order that are not present')
})

test_that('Test plot_annotation() success', {
    # Minimal call: only the required argument
    dm_va_min = plot_annotation(annotated_regions = dm_annots)

    # Explicit ordering and labels
    dm_va = plot_annotation(
        annotated_regions = dm_annots,
        annotation_order = cpgs_order,
        plot_title = 'Testing plot title',
        x_label = 'Test x-label',
        y_label = 'Test y-label')

    # With annotated random regions supplied alongside the data
    dm_va_rnd = plot_annotation(
        annotated_regions = dm_annots,
        annotated_random = dm_random_annots,
        annotation_order = NULL,
        plot_title = 'Testing dodged bars',
        x_label = 'Annotation Type',
        y_label = 'Count')

    expect_equal( dplyr::setequal(class(dm_va_min), c('gg','ggplot')), expected = TRUE)
    expect_equal( dplyr::setequal(class(dm_va), c('gg','ggplot')), expected = TRUE)
    expect_equal( dplyr::setequal(class(dm_va_rnd), c('gg','ggplot')), expected = TRUE)
})

################################################################################
# Test plot_coannotations()

test_that('Test plot_coannotations() success', {

    dm_vs_ca = plot_coannotations(
        annotated_regions = dm_annots,
        annotation_order = cpgs_order,
        axes_label = 'Annotations',
        plot_title = 'Co-occurrence of Annotations')

    expect_equal( dplyr::setequal(class(dm_vs_ca), c('gg','ggplot')), expected = TRUE)
})

################################################################################
# Test plot_numerical()

test_that('Test plot_numerical() success', {

    # x only, faceted over annotation type
    dm_vs_regions_mu1 = plot_numerical(
        annotated_regions = dm_annots,
        x = 'mu1',
        facet = 'annot.type',
        facet_order = c('hg19_cpg_islands','hg19_cpg_shores','hg19_cpg_shelves','hg19_cpg_inter'),
        bin_width = 5,
        plot_title = 'Group 1 Methylation over CpG Annotations',
        x_label = 'Group 1 Methylation',
        legend_facet_label = 'Group 1 Methylation Rate in Annotation',
        legend_cum_label = 'Overall Group 1 Methylation Rate')

    # x and y, faceted over annotation type
    dm_vs_regions_annot = plot_numerical(
        annotated_regions = dm_annots,
        x = 'mu0',
        y = 'mu1',
        facet = 'annot.type',
        facet_order = c('hg19_cpg_islands','hg19_cpg_shores','hg19_cpg_shelves','hg19_cpg_inter'),
        plot_title = 'Region Methylation: Group 0 vs Group 1',
        x_label = 'Group 0',
        y_label = 'Group 1')

    # x and y, faceted over a data column
    dm_vs_regions_name = plot_numerical(
        annotated_regions = dm_annots,
        x = 'mu0',
        y = 'mu1',
        facet = 'DM_status',
        facet_order = c('hyper','hypo','none'),
        plot_title = 'Region Methylation: Group 0 vs Group 1',
        x_label = 'Group 0',
        y_label = 'Group 1')

    # x only, two facet columns with an order given for each
    dm_vs_regions_mu12 = plot_numerical(
        annotated_regions = dm_annots,
        x = 'mu1',
        facet = c('annot.type', 'DM_status'),
        facet_order = list(c('hg19_cpg_islands','hg19_cpg_shores'), c('hyper','hypo','none')),
        plot_title = 'Region Methylation: Group 0 vs Group 1',
        x_label = 'Group 0',
        y_label = 'Group 1')

    # x and y, two facet columns with a NULL order for the first
    dm_vs_regions_name2 = plot_numerical(
        annotated_regions = dm_annots,
        x = 'mu0',
        y = 'mu1',
        facet = c('annot.type', 'DM_status'),
        facet_order = list(NULL, c('hyper','hypo','none')),
        plot_title = 'Region Methylation: Group 0 vs Group 1',
        x_label = 'Group 0',
        y_label = 'Group 1')

    expect_equal( dplyr::setequal(class(dm_vs_regions_mu1), c('gg','ggplot')), expected = TRUE)
    expect_equal( dplyr::setequal(class(dm_vs_regions_annot), c('gg','ggplot')), expected = TRUE)
    expect_equal( dplyr::setequal(class(dm_vs_regions_name), c('gg','ggplot')), expected = TRUE)
    # The two-facet plots were previously built but never checked
    expect_equal( dplyr::setequal(class(dm_vs_regions_mu12), c('gg','ggplot')), expected = TRUE)
    expect_equal( dplyr::setequal(class(dm_vs_regions_name2), c('gg','ggplot')), expected = TRUE)
})

################################################################################
# Test plot_numerical_coannotations()

test_that('Test plot_numerical_coannotations()', { 163 | dm_vs_num_co1 = plot_numerical_coannotations( 164 | annotated_regions = dm_annots, 165 | x = 'mu0', 166 | annot1 = 'hg19_cpg_islands', 167 | annot2 = 'hg19_cpg_shores', 168 | bin_width = 5, 169 | plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters', 170 | x_label = 'Percent Methylation', 171 | legend_facet_label = 'Perc. Methylation in annotation pair', 172 | legend_cum_label = 'Overall Perc. Methylation') 173 | 174 | dm_vs_num_co2 = plot_numerical_coannotations( 175 | annotated_regions = dm_annots, 176 | x = 'mu0', 177 | y = 'mu1', 178 | annot1 = 'hg19_cpg_islands', 179 | annot2 = 'hg19_cpg_shores', 180 | bin_width = 5, 181 | plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters', 182 | x_label = 'Percent Methylation', 183 | y_label = 'Percent Methylation') 184 | 185 | expect_equal( dplyr::setequal(class(dm_vs_num_co1), c('gg','ggplot')), expected = TRUE) 186 | expect_equal( dplyr::setequal(class(dm_vs_num_co2), c('gg','ggplot')), expected = TRUE) 187 | }) 188 | 189 | ################################################################################ 190 | # Test plot_categorical() 191 | 192 | test_that('Test plot_categorical() errors', { 193 | expect_error( 194 | plot_categorical( 195 | annotated_regions = dm_annots), 196 | 'argument "x" is missing') 197 | 198 | expect_error( 199 | plot_categorical( 200 | annotated_regions = dm_annots, 201 | x = 'testing'), 202 | 'column name used for x does not exist in annotated_regions') 203 | 204 | expect_error( 205 | plot_categorical( 206 | annotated_regions = dm_annots, 207 | x = 'DM_status', 208 | fill = 'testing'), 209 | 'column name used for fill does not exist in annotated_regions') 210 | 211 | expect_error( 212 | plot_categorical( 213 | annotated_regions = dm_annots, 214 | x = 'DM_status', 215 | fill = 'DM_status'), 216 | 'x cannot equal fill') 217 | 218 | expect_error( 219 | plot_categorical( 220 | annotated_regions = dm_annots, 221 | x = 
'DM_status', 222 | fill = 'annot.type', 223 | position = 'no'), 224 | 'position must be one of "stack", "fill"') 225 | 226 | expect_warning( 227 | plot_categorical( 228 | annotated_regions = dm_annots, 229 | x = 'DM_status', 230 | fill = 'annot.type', 231 | x_order = cpgs_order), 232 | 'elements in col_order that are not present') 233 | 234 | expect_warning( 235 | plot_categorical( 236 | annotated_regions = dm_annots, 237 | x = 'DM_status', 238 | fill = 'annot.type', 239 | fill_order = dm_order), 240 | 'elements in col_order that are not present') 241 | }) 242 | 243 | test_that('Test plot_categorical() error for random regions and non annot fill', { 244 | expect_error( 245 | plot_categorical( 246 | annotated_regions = dm_annots, 247 | annotated_random = dm_random_annots, 248 | x = 'annot.type', 249 | fill = 'DM_status', 250 | x_order = cpgs_order, 251 | fill_order = dm_order, 252 | position = 'fill', 253 | legend_title = 'Annotations', 254 | plot_title = 'DM status by CpG Annotation Proportions', 255 | x_label = 'DM status', 256 | y_label = 'Proportion'), 257 | 'since data from the original regions are not transferred to the random regions') 258 | }) 259 | 260 | test_that('Test plot_categorical() success', { 261 | dm_vn_min = plot_categorical( 262 | annotated_regions = dm_annots, 263 | x = 'annot.type') 264 | 265 | dm_vn = plot_categorical( 266 | annotated_regions = dm_annots, 267 | x = 'DM_status', 268 | fill = 'annot.type', 269 | x_order = dm_order, 270 | fill_order = cpgs_order, 271 | position = 'fill', 272 | legend_title = 'knownGene Annotations', 273 | plot_title = 'DM status in knownGene Annots.', 274 | x_label = 'DM status', 275 | y_label = 'Proportion') 276 | 277 | dm_vn_rnd = plot_categorical( 278 | annotated_regions = dm_annots, 279 | annotated_random = dm_random_annots, 280 | x = 'DM_status', 281 | fill = 'annot.type', 282 | x_order = dm_order, 283 | fill_order = cpgs_order, 284 | position = 'fill', 285 | legend_title = 'Annotations', 286 | plot_title = 
'DM status by CpG Annotation Proportions', 287 | x_label = 'DM status', 288 | y_label = 'Proportion') 289 | 290 | expect_equal( dplyr::setequal(class(dm_vn_min), c('gg','ggplot')), expected = TRUE) 291 | expect_equal( dplyr::setequal(class(dm_vn), c('gg','ggplot')), expected = TRUE) 292 | expect_equal( dplyr::setequal(class(dm_vn_rnd), c('gg','ggplot')), expected = TRUE) 293 | }) 294 | -------------------------------------------------------------------------------- /vignettes/annotatr-vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "`annotatr`: Making sense of genomic regions" 3 | author: "Raymond G. Cavalcante" 4 | date: "`r Sys.Date()`" 5 | output: 6 | BiocStyle::html_document 7 | vignette: > 8 | %\VignetteIndexEntry{annotatr} 9 | %\VignetteEngine{knitr::rmarkdown} 10 | %\VignetteEncoding{UTF-8} 11 | --- 12 | 13 | # Introduction 14 | 15 | Genomic regions resulting from next-generation sequencing experiments and bioinformatics pipelines are more meaningful when annotated to genomic features. A SNP occurring in an exon, or an enhancer, is likely of greater interest than one occurring in an inter-genic region. It may be of interest to find that a particular transcription factor overwhelmingly binds in promoters, while another binds mostly in 3’UTRs. Hyper-methylation at promoters containing a CpG island may indicate different regulatory regimes in one condition compared to another. 16 | 17 | `annotatr` provides genomic annotations and a set of functions to read, intersect, summarize, and visualize genomic regions in the context of genomic annotations. 
18 | 19 | # Installation 20 | 21 | The release version of `annotatr` is available via [Bioconductor](http://bioconductor.org/packages/annotatr/), and can be installed as follows: 22 | 23 | ```{r, eval=FALSE} 24 | if (!requireNamespace("BiocManager", quietly=TRUE)) 25 | install.packages("BiocManager") 26 | BiocManager::install("annotatr") 27 | ``` 28 | 29 | The development version of `annotatr` can be obtained via the [GitHub repository](https://github.com/rcavalcante/annotatr) or [Bioconductor](https://bioconductor.org/packages/devel/bioc/html/annotatr.html). It is easiest to install development versions with the [`devtools`](https://cran.r-project.org/web/packages/devtools/index.html) package as follows: 30 | 31 | ```{r, eval=FALSE} 32 | devtools::install_github('rcavalcante/annotatr') 33 | ``` 34 | 35 | Changelogs for development releases will be detailed on [GitHub releases](https://github.com/rcavalcante/annotatr/releases). 36 | 37 | # Annotations 38 | 39 | There are three types of annotations available to annotatr: 40 | 41 | 1. Built-in annotations including CpG annotations, genic annotations, enhancers, GENCODE lncRNAs, and chromatin states from chromHMM. Base data for each of these annotations is retrieved and processed in some way. See each below for details on data source and processing. 42 | 2. AnnotationHub annotations include any GRanges resource within the Bioconductor AnnotationHub web resource. 43 | 3. Custom annotations provided by the user. 44 | 45 | ## CpG Annotations 46 | 47 | The CpG islands are the basis for all CpG annotations, and are given by the `AnnotationHub` package for the given organism. CpG shores are defined as 2Kb upstream/downstream from the ends of the CpG islands, less the CpG islands. CpG shelves are defined as another 2Kb upstream/downstream of the farthest upstream/downstream limits of the CpG shores, less the CpG islands and CpG shores. The remaining genomic regions make up the inter-CGI annotation. 
48 | 49 | CpG annotations are available for hg19, hg38, mm9, mm10, rn4, rn5, and rn6. 50 | 51 | ![Schematic of CpG annotations.](annotatr_cpgs.jpeg) 52 | 53 | ## Genic Annotations 54 | 55 | The genic annotations are determined by functions from `GenomicFeatures` and data from the `TxDb.*` and `org.*.eg.db` packages. Genic annotations include 1-5Kb upstream of the TSS, the promoter (< 1Kb upstream of the TSS), 5'UTR, first exons, exons, introns, CDS, 3'UTR, and intergenic regions (the intergenic regions exclude the previous list of annotations). The schematic below illustrates the relationship between the different annotations as extracted from the `TxDb.*` packages via `GenomicFeatures` functions. 56 | 57 | ![Schematic of knownGene annotations.](annotatr_genes.jpeg) 58 | 59 | Also included in genic annotations are intronexon and exonintron boundaries. These annotations are 200bp upstream/downstream of any boundary between an exon and intron. Note that the boundaries are defined with respect to the strand of the gene. 60 | 61 | Non-intergenic gene annotations include Entrez ID and gene symbol information where it exists. The `org.*.eg.db` packages for the appropriate organisms are used to provide gene IDs and gene symbols. 62 | 63 | The genic annotations have populated `tx_id`, `gene_id`, and `symbol` columns. Respectively, they are the knownGene transcript name, Entrez Gene ID, and gene symbol. 64 | 65 | Genic annotations are available for hg19, hg38, mm9, mm10, rn4, rn5, rn6, dm3, and dm6. 66 | 67 | ## FANTOM5 Permissive Enhancers 68 | 69 | FANTOM5 permissive enhancers were determined from bi-directional CAGE transcription as in [Andersson et al. (2014)](http://www.nature.com/nature/journal/v507/n7493/full/nature12787.html), and are downloaded and processed for hg19 and mm9 from the [FANTOM5](http://fantom.gsc.riken.jp/5/datafiles/phase2.0/extra/Enhancers/) resource.
Using the `rtracklayer::liftOver()` function, enhancers from hg19 are lifted to hg38, and mm9 to mm10. 70 | 71 | ## GENCODE lncRNA transcripts 72 | 73 | The long non-coding RNA (lncRNA) annotations are from [GENCODE](https://www.gencodegenes.org) for hg19, hg38, and mm10. The lncRNA transcripts are used, and we plan to include the lncRNA introns/exons at a later date. The lncRNA annotations have populated `tx_id`, `gene_id`, and `symbol` columns. Respectively, they are the Ensembl transcript name, Entrez Gene ID, and gene symbol. As per the `transcript_type` field in the GENCODE annotations, the [biotypes](https://www.gencodegenes.org/gencode_biotypes.html) are given in the `id` column. 74 | 75 | ## Chromatin states from ChromHMM 76 | 77 | Chromatin states determined by chromHMM ([Ernst and Kellis (2012)](http://www.nature.com/nmeth/journal/v9/n3/full/nmeth.1906.html)) in hg19 are available for nine cell lines (Gm12878, H1hesc, Hepg2, Hmec, Hsmm, Huvec, K562, Nhek, and Nhlf) via the UCSC Genome Browser tracks. Annotations for all states can be built using a shortcut like `hg19_Gm12878-chromatin`, or specific chromatin states can be accessed via codes like `hg19_chromatin_Gm12878-StrongEnhancer` or `hg19_chromatin_Gm12878-Repressed`. 78 | 79 | ## `AnnotationHub` Annotations 80 | 81 | The `AnnotationHub` Bioconductor package is a client for the AnnotationHub web resource. From the package description: 82 | 83 | > The AnnotationHub web resource provides a central location where genomic files (e.g., VCF, bed, wig) and other resources from standard locations (e.g., UCSC, Ensembl) can be discovered. The resource includes metadata about each resource, e.g., a textual description, tags, and date of modification. The client creates and manages a local cache of files retrieved by the user, helping with quick and reproducible access.
84 | 85 | Using the `build_ah_annots()` function, users can turn any resource of class `GRanges` into an annotation for use in `annotatr`. As an example, we create annotations for H3K4me3 ChIP-seq peaks in Gm12878 cells. 86 | 87 | ```{r, echo=FALSE} 88 | suppressWarnings(suppressMessages(suppressPackageStartupMessages(library(annotatr)))) 89 | ``` 90 | 91 | ```{r, warning = FALSE, message = FALSE} 92 | # Create a named vector for the AnnotationHub accession codes with desired names 93 | h3k4me3_codes = c('Gm12878' = 'AH23256') 94 | # Fetch ah_codes from AnnotationHub and create annotations annotatr understands 95 | build_ah_annots(genome = 'hg19', ah_codes = h3k4me3_codes, annotation_class = 'H3K4me3') 96 | # The annotations as they appear in annotatr_cache 97 | ah_names = c('hg19_H3K4me3_Gm12878') 98 | 99 | print(annotatr_cache$get('hg19_H3K4me3_Gm12878')) 100 | ``` 101 | 102 | ## Custom Annotations 103 | 104 | Users may load their own annotations from BED files using the `read_annotations()` function, which uses the `rtracklayer::import()` function. The output is a `GRanges` with `mcols()` for `id`, `tx_id`, `gene_id`, `symbol`, and `type`. If a user wants to include `tx_id`, `gene_id`, and/or `symbol` in their custom annotations they can be included as extra columns on a BED6 input file.
105 | 106 | ```{r, warning = FALSE, message = FALSE} 107 | ## Use ENCODE ChIP-seq peaks for EZH2 in GM12878 108 | ## These files contain chr, start, and end columns 109 | ezh2_file = system.file('extdata', 'Gm12878_Ezh2_peak_annotations.txt.gz', package = 'annotatr') 110 | 111 | ## Custom annotation objects are given names of the form genome_custom_name 112 | read_annotations(con = ezh2_file, genome = 'hg19', name = 'ezh2', format = 'bed') 113 | 114 | print(annotatr_cache$get('hg19_custom_ezh2')) 115 | ``` 116 | 117 | To see what is in the `annotatr_cache` environment, do the following: 118 | 119 | ```{r, warning = FALSE, message = FALSE} 120 | print(annotatr_cache$list_env()) 121 | ``` 122 | 123 | # Usage 124 | 125 | The following example is based on the results of testing for differential methylation of genomic regions between two conditions using [methylSig](https://github.com/sartorlab/methylSig). The file (`inst/extdata/IDH2mut_v_NBM_multi_data_chr9.txt.gz`) contains chromosome locations, as well as categorical and numerical data columns, and provides a good example of the flexibility of `annotatr`. 126 | 127 | ## Reading Genomic Regions 128 | 129 | `read_regions()` uses the `rtracklayer::import()` function to read in BED files and convert them to `GRanges` objects. The `name` and `score` columns in a normal BED file can be used for categorical and numeric data, respectively. Additionally, an arbitrary number of categorical and numeric data columns can be appended to a BED6 file. The `extraCols` parameter is used for this purpose, and the `rename_name` and `rename_score` columns allow users to give more descriptive names to these columns. 130 | 131 | ```{r, warning = FALSE, message = FALSE} 132 | # This file in inst/extdata represents regions tested for differential 133 | # methylation between two conditions. Additionally, there are columns 134 | # reporting the p-value on the test for differential meth., the 135 | # meth. 
difference between the two groups, and the group meth. rates. 136 | dm_file = system.file('extdata', 'IDH2mut_v_NBM_multi_data_chr9.txt.gz', package = 'annotatr') 137 | extraCols = c(diff_meth = 'numeric', mu0 = 'numeric', mu1 = 'numeric') 138 | dm_regions = read_regions(con = dm_file, genome = 'hg19', extraCols = extraCols, format = 'bed', 139 | rename_name = 'DM_status', rename_score = 'pval') 140 | # Use less regions to speed things up 141 | dm_regions = dm_regions[1:2000] 142 | print(dm_regions) 143 | ``` 144 | 145 | ## Annotating Regions 146 | 147 | Users may select annotations a la carte via the accessors listed with `builtin_annotations()`, shortcuts, or use custom annotations as described above. The `hg19_cpgs` shortcut annotates regions to CpG islands, CpG shores, CpG shelves, and inter-CGI. The `hg19_basicgenes` shortcut annotates regions to 1-5Kb, promoters, 5'UTRs, exons, introns, and 3'UTRs. Shortcuts for other `builtin_genomes()` are accessed in a similar way. 148 | 149 | `annotate_regions()` requires a `GRanges` object (either the result of `read_regions()` or an existing object), a `GRanges` object of the `annotations`, and a logical value indicating whether to `ignore.strand` when calling `GenomicRanges::findOverlaps()`. The positive integer `minoverlap` is also passed to `GenomicRanges::findOverlaps()` and specifies the minimum overlap required for a region to be assigned to an annotation. 150 | 151 | Before annotating regions, the annotations must be built with `build_annotations()`, which requires a character vector of desired annotation codes.
152 | 153 | ```{r, warning = FALSE, message = FALSE} 154 | # Select annotations for intersection with regions 155 | # Note inclusion of custom annotation, and use of shortcuts 156 | annots = c('hg19_cpgs', 'hg19_basicgenes', 'hg19_genes_intergenic', 157 | 'hg19_genes_intronexonboundaries', 158 | 'hg19_custom_ezh2', 'hg19_H3K4me3_Gm12878') 159 | 160 | # Build the annotations (a single GRanges object) 161 | annotations = build_annotations(genome = 'hg19', annotations = annots) 162 | 163 | # Intersect the regions we read in with the annotations 164 | dm_annotated = annotate_regions( 165 | regions = dm_regions, 166 | annotations = annotations, 167 | ignore.strand = TRUE, 168 | quiet = FALSE) 169 | # A GRanges object is returned 170 | print(dm_annotated) 171 | ``` 172 | 173 | The `annotate_regions()` function returns a `GRanges`, but it may be more convenient to manipulate a coerced `data.frame`. For example, 174 | 175 | ```{r, warning = FALSE, message = FALSE} 176 | # Coerce to a data.frame 177 | df_dm_annotated = data.frame(dm_annotated) 178 | 179 | # See the GRanges column of dm_annotaed expanded 180 | print(head(df_dm_annotated)) 181 | 182 | # Subset based on a gene symbol, in this case NOTCH1 183 | notch1_subset = subset(df_dm_annotated, annot.symbol == 'NOTCH1') 184 | print(head(notch1_subset)) 185 | ``` 186 | 187 | ## Randomizing Regions 188 | 189 | Given a set of annotated regions, it is important to know how the annotations compare to those of a randomized set of regions. The `randomize_regions()` function is a wrapper of `regioneR::randomizeRegions()` from the [`regioneR`](http://bioconductor.org/packages/release/bioc/html/regioneR.html) package that creates a set of random regions given a `GRanges` object. After creating the random set, they must be annotated with `annotate_regions()` for later use. Only `builtin_genomes()` can be used in our wrapper function. 
Downstream functions that support using random region annotations are `summarize_annotations()`, `plot_annotation()`, and `plot_categorical()`. 190 | 191 | It is important to note that if the regions to be randomized have a particular property, for example they are CpGs, the `randomize_regions()` wrapper will not preserve that property! Instead, we recommend using `regioneR::resampleRegions()` with `universe` being the superset of the data regions you want to sample from. 192 | 193 | ```{r, warning = FALSE, message = FALSE} 194 | # Randomize the input regions 195 | dm_random_regions = randomize_regions( 196 | regions = dm_regions, 197 | allow.overlaps = TRUE, 198 | per.chromosome = TRUE) 199 | 200 | # Annotate the random regions using the same annotations as above 201 | # These will be used in later functions 202 | dm_random_annotated = annotate_regions( 203 | regions = dm_random_regions, 204 | annotations = annotations, 205 | ignore.strand = TRUE, 206 | quiet = TRUE) 207 | ``` 208 | 209 | ## Summarizing Over Annotations 210 | 211 | When there is no categorical or numerical information associated with the regions, `summarize_annotations()` is the only possible summarization function to use. It gives the counts of regions in each annotation type (see example below). If there is categorical and/or numerical information, then `summarize_numerical()` and/or `summarize_categorical()` may be used. Using random region annotations is only available for `summarize_annotations()`. 
212 | 213 | ```{r, warning = FALSE, message = FALSE} 214 | # Find the number of regions per annotation type 215 | dm_annsum = summarize_annotations( 216 | annotated_regions = dm_annotated, 217 | quiet = TRUE) 218 | print(dm_annsum) 219 | 220 | # Find the number of regions per annotation type 221 | # and the number of random regions per annotation type 222 | dm_annsum_rnd = summarize_annotations( 223 | annotated_regions = dm_annotated, 224 | annotated_random = dm_random_annotated, 225 | quiet = TRUE) 226 | print(dm_annsum_rnd) 227 | 228 | # Take the mean of the diff_meth column across all regions 229 | # occurring in an annotation. 230 | dm_numsum = summarize_numerical( 231 | annotated_regions = dm_annotated, 232 | by = c('annot.type', 'annot.id'), 233 | over = c('diff_meth'), 234 | quiet = TRUE) 235 | print(dm_numsum) 236 | 237 | # Count the occurrences of classifications in the DM_status 238 | # column across the annotation types. 239 | dm_catsum = summarize_categorical( 240 | annotated_regions = dm_annotated, 241 | by = c('annot.type', 'DM_status'), 242 | quiet = TRUE) 243 | print(dm_catsum) 244 | ``` 245 | 246 | ## Plotting 247 | 248 | The 5 plot functions described below are to be used on the object returned by `annotate_regions()`. The plot functions return an object of type `ggplot` that can be viewed (`print`), saved (`ggsave`), or modified with additional `ggplot2` code. 249 | 250 | ### Plotting Regions per Annotation 251 | 252 | ```{r, fig.align='center', fig.cap='Number of DM regions per annotation.', fig.height=6, fig.width=6, fig.show = 'hold', warning = FALSE, message = FALSE} 253 | # View the number of regions per annotation. This function 254 | # is useful when there is no classification or data 255 | # associated with the regions. 
256 | annots_order = c( 257 | 'hg19_custom_ezh2', 258 | 'hg19_H3K4me3_Gm12878', 259 | 'hg19_genes_1to5kb', 260 | 'hg19_genes_promoters', 261 | 'hg19_genes_5UTRs', 262 | 'hg19_genes_exons', 263 | 'hg19_genes_intronexonboundaries', 264 | 'hg19_genes_introns', 265 | 'hg19_genes_3UTRs', 266 | 'hg19_genes_intergenic') 267 | dm_vs_kg_annotations = plot_annotation( 268 | annotated_regions = dm_annotated, 269 | annotation_order = annots_order, 270 | plot_title = '# of Sites Tested for DM annotated on chr9', 271 | x_label = 'knownGene Annotations', 272 | y_label = 'Count') 273 | print(dm_vs_kg_annotations) 274 | ``` 275 | 276 | The `plot_annotation()` can also use the annotated random regions in the `annotated_random` argument to plot the number of random regions per annotation type next to the number of input data regions. 277 | 278 | ```{r, fig.align='center', fig.cap='Number of DM regions per annotation with randomized regions.', fig.height=6, fig.width=6, fig.show = 'hold', warning = FALSE, message = FALSE} 279 | # View the number of regions per annotation and include the annotation 280 | # of randomized regions 281 | annots_order = c( 282 | 'hg19_custom_ezh2', 283 | 'hg19_H3K4me3_Gm12878', 284 | 'hg19_genes_1to5kb', 285 | 'hg19_genes_promoters', 286 | 'hg19_genes_5UTRs', 287 | 'hg19_genes_exons', 288 | 'hg19_genes_intronexonboundaries', 289 | 'hg19_genes_introns', 290 | 'hg19_genes_3UTRs', 291 | 'hg19_genes_intergenic') 292 | dm_vs_kg_annotations_wrandom = plot_annotation( 293 | annotated_regions = dm_annotated, 294 | annotated_random = dm_random_annotated, 295 | annotation_order = annots_order, 296 | plot_title = 'Dist. 
of Sites Tested for DM (with rndm.)', 297 | x_label = 'Annotations', 298 | y_label = 'Count') 299 | print(dm_vs_kg_annotations_wrandom) 300 | ``` 301 | 302 | ### Plotting Regions Occurring in Pairs of Annotations 303 | 304 | ```{r, fig.align='center', fig.cap='Number of DM regions per pair of annotations.', fig.height=8, fig.width=8, fig.show = 'hold', warning = FALSE, message = FALSE} 305 | # View a heatmap of regions occurring in pairs of annotations 306 | annots_order = c( 307 | 'hg19_custom_ezh2', 308 | 'hg19_H3K4me3_Gm12878', 309 | 'hg19_genes_promoters', 310 | 'hg19_genes_5UTRs', 311 | 'hg19_genes_exons', 312 | 'hg19_genes_introns', 313 | 'hg19_genes_3UTRs', 314 | 'hg19_genes_intergenic') 315 | dm_vs_coannotations = plot_coannotations( 316 | annotated_regions = dm_annotated, 317 | annotation_order = annots_order, 318 | axes_label = 'Annotations', 319 | plot_title = 'Regions in Pairs of Annotations') 320 | print(dm_vs_coannotations) 321 | ``` 322 | 323 | ### Plotting Numerical Data Over Regions 324 | 325 | With numerical data, the `plot_numerical()` function plots a single variable (histogram) or two variables (scatterplot) at the region level, faceting over the categorical variable of choice. It is possible to include two categorical variables to facet over (see below). Note, when the plot is a histogram, the distribution over all regions is plotted within each facet. 
326 | 327 | ```{r, fig.align='center', fig.cap='Methylation Rates in Group 0 for Regions Over DM Status.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 328 | dm_vs_regions_annot = plot_numerical( 329 | annotated_regions = dm_annotated, 330 | x = 'mu0', 331 | facet = 'annot.type', 332 | facet_order = c('hg19_genes_1to5kb','hg19_genes_promoters', 333 | 'hg19_genes_5UTRs','hg19_genes_3UTRs', 'hg19_custom_ezh2', 334 | 'hg19_genes_intergenic', 'hg19_cpg_islands'), 335 | bin_width = 5, 336 | plot_title = 'Group 0 Region Methylation In Genes', 337 | x_label = 'Group 0') 338 | print(dm_vs_regions_annot) 339 | ``` 340 | 341 | ```{r, fig.align='center', fig.cap='Methylation Differences for Regions Over DM Status and Annotation Type.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 342 | dm_vs_regions_annot2 = plot_numerical( 343 | annotated_regions = dm_annotated, 344 | x = 'diff_meth', 345 | facet = c('annot.type','DM_status'), 346 | facet_order = list(c('hg19_genes_promoters','hg19_genes_5UTRs','hg19_cpg_islands'), c('hyper','hypo','none')), 347 | bin_width = 5, 348 | plot_title = 'Group 0 Region Methylation In Genes', 349 | x_label = 'Methylation Difference') 350 | print(dm_vs_regions_annot2) 351 | ``` 352 | 353 | ```{r, fig.align='center', fig.cap='Methylation Rates in Regions Over DM Status in Group 0 vs Group 1.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 354 | dm_vs_regions_name = plot_numerical( 355 | annotated_regions = dm_annotated, 356 | x = 'mu0', 357 | y = 'mu1', 358 | facet = 'annot.type', 359 | facet_order = c('hg19_genes_1to5kb','hg19_genes_promoters', 360 | 'hg19_genes_5UTRs','hg19_genes_3UTRs', 'hg19_custom_ezh2', 361 | 'hg19_genes_intergenic', 'hg19_cpg_islands', 'hg19_cpg_shores'), 362 | plot_title = 'Region Methylation: Group 0 vs Group 1', 363 | x_label = 'Group 0', 364 | y_label = 'Group 1') 365 | print(dm_vs_regions_name) 366 | ``` 367 | 368 | The 
`plot_numerical_coannotations()` function shows the distribution of numerical data for regions occurring in any two annotations, as well as in one or the other annotation. For example, the following plot shows CpG methylation rates for CpGs occurring in just promoters, just CpG islands, and both promoters and CpG islands. 369 | 370 | ```{r, fig.align='center', fig.cap='Group 0 methylation Rates in Regions in promoters, CpG islands, and both.', fig.height=5, fig.width=12, fig.show='hold', warning = FALSE, message = FALSE} 371 | dm_vs_num_co = plot_numerical_coannotations( 372 | annotated_regions = dm_annotated, 373 | x = 'mu0', 374 | annot1 = 'hg19_cpg_islands', 375 | annot2 = 'hg19_genes_promoters', 376 | bin_width = 5, 377 | plot_title = 'Group 0 Perc. Meth. in CpG Islands and Promoters', 378 | x_label = 'Percent Methylation') 379 | print(dm_vs_num_co) 380 | ``` 381 | 382 | ### Plotting Categorical Data 383 | 384 | ```{r, fig.align='center', fig.cap='Differential methylation classification with counts of CpG annotations.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 385 | # View the counts of CpG annotations in data classes 386 | 387 | # The orders for the x-axis labels. This is also a subset 388 | # of the labels (hyper, hypo, none). 389 | x_order = c( 390 | 'hyper', 391 | 'hypo') 392 | # The orders for the fill labels. Can also use this 393 | # parameter to subset annotation types to fill. 394 | fill_order = c( 395 | 'hg19_cpg_islands', 396 | 'hg19_cpg_shores', 397 | 'hg19_cpg_shelves', 398 | 'hg19_cpg_inter') 399 | # Make a barplot of the data class where each bar 400 | # is composed of the counts of CpG annotations.
401 | dm_vs_cpg_cat1 = plot_categorical( 402 | annotated_regions = dm_annotated, x='DM_status', fill='annot.type', 403 | x_order = x_order, fill_order = fill_order, position='stack', 404 | plot_title = 'DM Status by CpG Annotation Counts', 405 | legend_title = 'Annotations', 406 | x_label = 'DM status', 407 | y_label = 'Count') 408 | print(dm_vs_cpg_cat1) 409 | ``` 410 | 411 | ```{r, fig.align='center', fig.cap='Differential methylation classification with proportion of CpG annotations.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 412 | # Use the same order vectors as the previous code block, 413 | # but use proportional fill instead of counts. 414 | 415 | # Make a barplot of the data class where each bar 416 | # is composed of the *proportion* of CpG annotations. 417 | dm_vs_cpg_cat2 = plot_categorical( 418 | annotated_regions = dm_annotated, x='DM_status', fill='annot.type', 419 | x_order = x_order, fill_order = fill_order, position='fill', 420 | plot_title = 'DM Status by CpG Annotation Proportions', 421 | legend_title = 'Annotations', 422 | x_label = 'DM status', 423 | y_label = 'Proportion') 424 | print(dm_vs_cpg_cat2) 425 | ``` 426 | 427 | As with `plot_annotation()` one may add annotations for random regions to the `annotated_random` parameter of `plot_categorical()`. The result is a Random Regions bar representing the distribution of random regions for the categorical variable used for `fill`. NOTE: Random regions can only be added when `fill = 'annot.type'`. 
428 | 429 | ```{r, fig.align='center', fig.cap='Differential methylation classification with proportion of CpG annotations and random regions.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 430 | # Add in the randomized annotations for "Random Regions" bar 431 | 432 | # Make a barplot of the data class where each bar 433 | # is composed of the *proportion* of CpG annotations, and 434 | # includes "All" regions tested for DM and "Random Regions" 435 | # regions consisting of randomized regions. 436 | dm_vs_cpg_cat_random = plot_categorical( 437 | annotated_regions = dm_annotated, annotated_random = dm_random_annotated, 438 | x='DM_status', fill='annot.type', 439 | x_order = x_order, fill_order = fill_order, position='fill', 440 | plot_title = 'DM Status by CpG Annotation Proportions', 441 | legend_title = 'Annotations', 442 | x_label = 'DM status', 443 | y_label = 'Proportion') 444 | print(dm_vs_cpg_cat_random) 445 | ``` 446 | 447 | ```{r, fig.align='center', fig.cap='Basic gene annotations with proportions of DM classification.', fig.height=6, fig.width=6, fig.show='hold', warning = FALSE, message = FALSE} 448 | # View the proportions of data classes in knownGene annotations 449 | 450 | # The orders for the x-axis labels. 451 | x_order = c( 452 | 'hg19_custom_ezh2', 453 | 'hg19_genes_1to5kb', 454 | 'hg19_genes_promoters', 455 | 'hg19_genes_5UTRs', 456 | 'hg19_genes_exons', 457 | 'hg19_genes_introns', 458 | 'hg19_genes_3UTRs', 459 | 'hg19_genes_intergenic') 460 | # The orders for the fill labels. 
461 | fill_order = c( 462 | 'hyper', 463 | 'hypo', 464 | 'none') 465 | dm_vs_kg_cat = plot_categorical( 466 | annotated_regions = dm_annotated, x='annot.type', fill='DM_status', 467 | x_order = x_order, fill_order = fill_order, position='fill', 468 | legend_title = 'DM Status', 469 | x_label = 'knownGene Annotations', 470 | y_label = 'Proportion') 471 | print(dm_vs_kg_cat) 472 | ``` 473 | -------------------------------------------------------------------------------- /vignettes/annotatr_cpgs.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/vignettes/annotatr_cpgs.jpeg -------------------------------------------------------------------------------- /vignettes/annotatr_genes.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rcavalcante/annotatr/a675e1f3401bfdb270b06add469f9bafbc11efe3/vignettes/annotatr_genes.jpeg --------------------------------------------------------------------------------