├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── lint.yml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS ├── R ├── aggregate.R ├── buildReferenceData.R ├── chrom-plots.R ├── content-plots.R ├── data.R ├── feature-plots.R ├── loadData.R ├── neighbor-distances.R ├── package.R ├── partition-plots.R ├── qthist.R ├── specificity-plots.R ├── utility.R └── zalias.R ├── README.md ├── _pkgdown.yaml ├── data-raw ├── TSS_hg19.R ├── bedfiles.R ├── chromSizes_hg19.R └── geneModels_hg19.R ├── data ├── TSS_hg19.rda ├── cellTypeMetadata.rda ├── chromSizes_hg19.rda ├── datalist ├── exampleOpenSignalMatrix_hg19.rda ├── geneModels_hg19.rda ├── setB_100.rda └── vistaEnhancers.rda ├── inst ├── CITATION └── extdata │ ├── C_elegans_cropped_example.fa.gz │ ├── C_elegans_cropped_example.gtf.gz │ ├── example_cell_matrix.txt │ ├── setB_100.bed.gz │ └── vistaEnhancers.bed.gz ├── long_vignettes ├── full-power.Rmd └── render-long-vignettes.R ├── man ├── BSdtToGRanges.Rd ├── GenomicDistributions-package.Rd ├── TSS_hg19.Rd ├── binBSGenome.Rd ├── binChroms.Rd ├── binRegion.Rd ├── calcChromBins.Rd ├── calcChromBinsRef.Rd ├── calcChromBinsRefSlow.Rd ├── calcCumulativePartitions.Rd ├── calcCumulativePartitionsRef.Rd ├── calcDinuclFreq.Rd ├── calcDinuclFreqRef.Rd ├── calcExpectedPartitions.Rd ├── calcExpectedPartitionsRef.Rd ├── calcFeatureDist.Rd ├── calcFeatureDistRefTSS.Rd ├── calcGCContent.Rd ├── calcGCContentRef.Rd ├── calcNearestNeighbors.Rd ├── calcNeighborDist.Rd ├── calcPartitions.Rd ├── calcPartitionsRef.Rd ├── calcSummarySignal.Rd ├── calcWidth.Rd ├── cellTypeMetadata.Rd ├── chromSizes_hg19.Rd ├── dot-requireAndReturn.Rd ├── dot-validateInputs.Rd ├── dtToGr.Rd ├── dtToGrInternal.Rd ├── exampleOpenSignalMatrix_hg19.Rd ├── geneModels_hg19.Rd ├── genomePartitionList.Rd ├── getChromSizes.Rd ├── getChromSizesFromFasta.Rd ├── getGeneModels.Rd ├── getGeneModelsFromGTF.Rd ├── getGenomeBins.Rd ├── getReferenceData.Rd ├── getTssFromGTF.Rd ├── 
grToDt.Rd ├── labelCuts.Rd ├── loadBSgenome.Rd ├── loadEnsDb.Rd ├── neighbordt.Rd ├── nlist.Rd ├── plotChromBins.Rd ├── plotCumulativePartitions.Rd ├── plotDinuclFreq.Rd ├── plotExpectedPartitions.Rd ├── plotFeatureDist.Rd ├── plotGCContent.Rd ├── plotNeighborDist.Rd ├── plotPartitions.Rd ├── plotQTHist.Rd ├── plotSummarySignal.Rd ├── retrieveFile.Rd ├── setB_100.Rd ├── splitDataTable.Rd ├── theme_blank_facet_label.Rd └── vistaEnhancers.Rd ├── tests ├── testthat.R └── testthat │ ├── testChrom.R │ ├── testGCContent.R │ ├── testNeighborDist.R │ ├── testOpenChromatin.R │ ├── testPartitions.R │ └── test_all.R └── vignettes ├── figures-full-power ├── GC-content-1.png ├── TSS-plot-1.png ├── TSS-plot-closeup-1.png ├── chrom-bin-plot-1.png ├── cumulative-partitions-1.png ├── custom-cumulative-partitions-1.png ├── custom-expected-partition-plot-1.png ├── custom-partition-plot-1.png ├── dinuc-content-1.png ├── expected-partition-plot-1.png ├── gene-distance-plot-1.png ├── neighbor-distance-distribution-1.png ├── open-signal-1.png ├── partition-plot-1.png ├── partition-plot-proportional-1.png └── width-distribution-1.png ├── full-power.Rmd └── intro.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | .git 2 | .travis.yml 3 | _pkgdown.yaml 4 | long_vignettes 5 | data-raw 6 | ^.*\.Rproj$ 7 | ^\.Rproj\.user$ 8 | ^doc$ 9 | ^Meta$ 10 | ^\.github$ 11 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | - dev 6 | pull_request: 7 | branches: 8 | - master 9 | - dev 10 | 11 | name: Install and lint 12 | 13 | jobs: 14 | install-and-lint: 15 | runs-on: 
macos-latest 16 | env: 17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - uses: r-lib/actions/setup-r@v1 22 | 23 | - name: Query dependencies 24 | run: | 25 | install.packages('remotes') 26 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 27 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 28 | shell: Rscript {0} 29 | 30 | - name: Restore R package cache 31 | uses: actions/cache@v2 32 | with: 33 | path: ${{ env.R_LIBS_USER }} 34 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 35 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 36 | 37 | - name: Install dependencies 38 | run: | 39 | install.packages(c("remotes")) 40 | remotes::install_deps(dependencies = TRUE) 41 | remotes::install_cran("lintr") 42 | shell: Rscript {0} 43 | 44 | - name: Install package 45 | run: R CMD INSTALL . 46 | 47 | - name: Lint 48 | run: lintr::lint_package(linters = lintr::with_defaults(assignment_linter=NULL)) 49 | shell: Rscript {0} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/* 2 | .Rproj.user 3 | .RData 4 | .Rhistory 5 | *.Rproj 6 | *.bed.gz 7 | *.sqlite 8 | 9 | # OS generated files 10 | .DS_Store 11 | .DS_Store? 
12 | ._* 13 | .Spotlight-V100 14 | .Trashes 15 | ehthumbs.db 16 | Thumbs.db 17 | 18 | # Gedit temporary files 19 | *~ 20 | 21 | # libreoffice lock files: 22 | .~lock* 23 | doc 24 | Meta 25 | /doc/ 26 | /Meta/ 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | - dev 5 | language: r 6 | r: bioc-release -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: GenomicDistributions 2 | Version: 1.17.1 3 | Date: 2025-04-23 4 | Title: GenomicDistributions: fast analysis of genomic intervals with Bioconductor 5 | Description: If you have a set of genomic ranges, this package can help you with 6 | visualization and comparison. It produces several kinds of plots, for example: 7 | Chromosome distribution plots, which visualize how your regions are distributed 8 | over chromosomes; feature distance distribution plots, which visualize how 9 | your regions are distributed relative to a feature of interest, like 10 | Transcription Start Sites (TSSs); genomic partition plots, which visualize 11 | how your regions overlap given genomic features such as promoters, introns, 12 | exons, or intergenic regions. It also makes it easy to compare one set of 13 | ranges to another. 
14 | Authors@R: c( 15 | person("Kristyna", "Kupkova", role=c("aut", "cre"), 16 | email = "kristynakupkova@gmail.com"), 17 | person("Jose", "Verdezoto", role="aut"), 18 | person("Tessa", "Danehy", role="aut"), 19 | person("John", "Lawson", role="aut"), 20 | person("Jose", "Verdezoto", role="aut"), 21 | person("Michal", "Stolarczyk", role="aut"), 22 | person("Jason", "Smith", role="aut"), 23 | person("Bingjie", "Xue", role="aut"), 24 | person("Sophia", "Rogers", role="aut"), 25 | person("John", "Stubbs", role="aut"), 26 | person(given=c("Nathan", "C."), "Sheffield", 27 | email = "nathan@code.databio.org", role="aut")) 28 | Depends: 29 | R (>= 4.0), 30 | IRanges, 31 | GenomicRanges 32 | Imports: 33 | data.table, 34 | ggplot2, 35 | reshape2, 36 | methods, 37 | utils, 38 | Biostrings, 39 | plyr, 40 | dplyr, 41 | scales, 42 | broom, 43 | GenomeInfoDb, 44 | stats 45 | Suggests: 46 | AnnotationFilter, 47 | rtracklayer, 48 | testthat, 49 | knitr, 50 | BiocStyle, 51 | rmarkdown, 52 | GenomicDistributionsData 53 | Enhances: 54 | BSgenome, 55 | extrafont, 56 | ensembldb, 57 | GenomicFeatures 58 | LazyData: true 59 | VignetteBuilder: knitr 60 | License: BSD_2_clause + file LICENSE 61 | biocViews: Software, GenomeAnnotation, GenomeAssembly, DataRepresentation, Sequencing, 62 | Coverage, FunctionalGenomics, Visualization 63 | RoxygenNote: 7.3.2 64 | URL: http://code.databio.org/GenomicDistributions 65 | BugReports: http://github.com/databio/GenomicDistributions 66 | Encoding: UTF-8 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017 2 | COPYRIGHT HOLDER: Nathan Sheffield -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(binBSGenome) 4 | export(binChroms) 5 | 
export(binRegion) 6 | export(calcChromBins) 7 | export(calcChromBinsRef) 8 | export(calcCumulativePartitions) 9 | export(calcCumulativePartitionsRef) 10 | export(calcDinuclFreq) 11 | export(calcDinuclFreqRef) 12 | export(calcExpectedPartitions) 13 | export(calcExpectedPartitionsRef) 14 | export(calcFeatureDist) 15 | export(calcFeatureDistRefTSS) 16 | export(calcGCContent) 17 | export(calcGCContentRef) 18 | export(calcNearestNeighbors) 19 | export(calcNeighborDist) 20 | export(calcPartitions) 21 | export(calcPartitionsRef) 22 | export(calcSummarySignal) 23 | export(calcWidth) 24 | export(dtToGr) 25 | export(genomePartitionList) 26 | export(getChromSizes) 27 | export(getChromSizesFromFasta) 28 | export(getGeneModels) 29 | export(getGeneModelsFromGTF) 30 | export(getGenomeBins) 31 | export(getTssFromGTF) 32 | export(loadBSgenome) 33 | export(loadEnsDb) 34 | export(nlist) 35 | export(plotChromBins) 36 | export(plotCumulativePartitions) 37 | export(plotDinuclFreq) 38 | export(plotExpectedPartitions) 39 | export(plotFeatureDist) 40 | export(plotGCContent) 41 | export(plotNeighborDist) 42 | export(plotPartitions) 43 | export(plotQTHist) 44 | export(plotSummarySignal) 45 | export(retrieveFile) 46 | import(dplyr) 47 | import(ggplot2) 48 | importFrom(Biostrings,alphabetFrequency) 49 | importFrom(Biostrings,readDNAStringSet) 50 | importFrom(GenomicRanges,GRanges) 51 | importFrom(GenomicRanges,GRangesList) 52 | importFrom(GenomicRanges,elementMetadata) 53 | importFrom(GenomicRanges,granges) 54 | importFrom(GenomicRanges,makeGRangesFromDataFrame) 55 | importFrom(GenomicRanges,seqnames) 56 | importFrom(GenomicRanges,strand) 57 | importFrom(IRanges,IRanges) 58 | importFrom(IRanges,Views) 59 | importFrom(data.table,":=") 60 | importFrom(data.table,as.data.table) 61 | importFrom(data.table,copy) 62 | importFrom(data.table,data.table) 63 | importFrom(data.table,foverlaps) 64 | importFrom(data.table,fread) 65 | importFrom(data.table,is.data.table) 66 | 
importFrom(data.table,rbindlist) 67 | importFrom(data.table,setDT) 68 | importFrom(data.table,setattr) 69 | importFrom(data.table,setcolorder) 70 | importFrom(data.table,setkey) 71 | importFrom(data.table,setnames) 72 | importFrom(data.table,setorder) 73 | importFrom(data.table,tstrsplit) 74 | importFrom(methods,is) 75 | importFrom(reshape2,melt) 76 | importFrom(stats,chisq.test) 77 | importFrom(utils,data) 78 | importFrom(utils,download.file) 79 | importFrom(utils,getAnywhere) 80 | importFrom(utils,globalVariables) 81 | importFrom(utils,installed.packages) 82 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | # Change log 2 | All notable changes to this project will be documented in this file. Here we 3 | will document changes to major new releases only (not point releases). 4 | 5 | ## [1.3.3] -- 2022-01-27 6 | - Updated cumulative distribution feature overlap plots and description 7 | 8 | ## [1.3.2] -- 2022-01-20 9 | - Bioconductor released new version 10 | - Cell specificity plots are now more generic signal summary plots - the calc function is now "calcSummarySignal", plot function "plotSummarySignal" 11 | - Added calculation of Chi-square p-values of expected partitions as default output of "calcExpectedPartitionsRef" 12 | - Chi-square p-values are now optionally shown as stars on top of each partition output by "plotExpectedPartitions" 13 | - Chromosome distribution calculations have been optimized to substantially reduce running time 14 | - Default plotting of neighbor distance now includes an X-axis log scale to account for outliers more intuitively 15 | - Corrected plotting of partition overlap for multiple region sets - now sums to 100 by group 16 | - Intro vignette has been updated to include "calcSummarySignal" and "plotSummarySignal" functions along with expected partitions p-values calculation and updated X-axis scale of neighbor 
dist plots 17 | 18 | ## [1.1.2] -- 2020-07-07 19 | - Added functions to calculate and plot dinucleotide frequencies 20 | 21 | ## [1.1.1] -- 2020-06-03 22 | 23 | - Package now on Bioconductor under 1.0.0 (dev version 1.1.0) - bump to 1.1.1 24 | - Added sorting option to feature distance plot 25 | - Added stack bar option to partition plots 26 | - Fixed ordering of chromosome distribution 27 | - Added possibility to calculate proportional overlap in partition plots 28 | - Expected partition distribution now calculated based on annotation class object sizes 29 | - Improved definition of introns in gene partition lists 30 | - Improved memory and time performance 31 | - In full power vignette added option to add custom annotation classes in partition plots 32 | - Added functions to create annotations out of GTF files 33 | - New vignette - how to build custom reference data 34 | 35 | ## [0.99.0] -- 2020-05-26 36 | 37 | - Update R version to 4.0 38 | - Bioconductor submission 39 | 40 | ## [0.8] -- 2020-05-20 41 | 42 | - Added tiled version to feature distance plot 43 | - Added percentage plots to feature distances 44 | - Reduced data shipped with package 45 | - Reduced verbosity of calc functions 46 | - Simplified functions for calculating overlaps 47 | - Added UTRs to gene models 48 | - Moved data producing functions to data-raw 49 | - Improved 'full power' vignette 50 | - Bug fixes for edge cases, like plotting distribution with only one value 51 | 52 | ## [0.7] -- 2020-04-11 53 | 54 | - Revamped TSS distance distribution plots to better reflect the scale of distances 55 | - Added new versions of partition plots 56 | - Added new cell-type specificity plot 57 | - Added new quantile-trimmed histogram for width distribution plots 58 | - Added new unit tests and updated coding style for bioconductor 59 | 60 | ## [0.6] -- 2019-09-20 61 | 62 | - Added functions to calculate GC content 63 | 64 | ## [0.5] -- 2018-04-30 65 | 66 | - Built-in data added for mm9 assembly 67 | - 
Functions can now accept TxDb objects, in addition to EnsDb objects 68 | 69 | ## [0.4] -- 2018-04-05 70 | 71 | - Add partition plots 72 | 73 | ## [0.3] -- 2018-03-05 74 | 75 | - Revamp all function names, make functions more parallel and modular 76 | - Dramatic increase in speed for feature distance plots 77 | 78 | ## [0.2] -- 2018-03-02 79 | 80 | - Make bins the same size, instead of having the same number of bins per chrom 81 | - Divest dependency on BSGenome and ensembldb by integrating basic data 82 | 83 | ## [0.1] -- 2018-02-01 84 | 85 | - First version released 86 | -------------------------------------------------------------------------------- /R/aggregate.R: -------------------------------------------------------------------------------- 1 | 2 | # Quick way to count overlaps between a region set and one or more other 3 | # region sets. 4 | # 5 | # Count how many regions 6 | # from the first region set (queryRegionDT) overlap with each of the regions 7 | # from the other region set/s. 8 | # Uses only the midpoint of the first region set when finding overlaps. 9 | # 10 | # @param queryRegionDT data.frame/data.table. Must have "chr" and "start" 11 | # columns. 12 | # @param regionsGRL GRangesList or GRanges. E.g. Binned chromosomes, with 13 | # each chromosome as a GRanges object in the GRangesList. 14 | # @return a data.table with the following columns: 15 | # regionID, chr, start, end, withinGroupID, regionGroupID, N 16 | # the coordinates refer to the regions from regionsGRL. 17 | # "regionGroupID" refers to which GRanges from regionsGRL the given 18 | # region was a member of. "withinGroupID" refers to the index of the given 19 | # region within its GRanges object. 
# "regionID" has the index for the given region as if all the GRanges from
# regionsGRL were combined into a single GRanges object
# The "N" column has the counts for number of query regions
# overlapping with that regionsGRL region
calcOLCount = function(queryRegionDT, regionsGRL) {
    # Assert that regionsGRL is a GRL.
    # If regionsGRL is given as a GRanges, we convert to GRL
    if (methods::is(regionsGRL, "GRanges")) {
        regionsGRL = GRangesList(regionsGRL)
    } else if (!methods::is(regionsGRL, "GRangesList")) {
        stop("regionsGRL is not a GRanges or GRangesList object")
    }

    # convert query regions to just the midpoint
    if ("end" %in% colnames(queryRegionDT)) {
        # assign to "start" since BSdtToGRanges keeps the start coord
        queryRegionDT$start = round((queryRegionDT$start +
                                         queryRegionDT$end)/2)
    }

    # only keeps start column
    bsgr = BSdtToGRanges(list(queryRegionDT))

    # It's required to do a findoverlaps on each region individually,
    # Not on a GRL, because of the way overlaps with GRLs work. So,
    # we must convert the GRL to a GR, but we must keep track of which
    # regions came from which group.
    regionsGR = unlist(regionsGRL)

    regionsGRL.length = lapply(regionsGRL, length)

    # Build a table to keep track of which regions belong to which group.
    # seq_len() (not seq()) so that an empty group contributes no IDs;
    # seq(0) would yield c(1, 0) and misalign withinGroupID.
    region2group = data.table(
        regionID=seq_along(regionsGR),
        chr=as.vector(seqnames(regionsGR)),
        start=as.vector(start(regionsGR)),
        end=as.vector(end(regionsGR)),
        withinGroupID=as.vector(unlist(lapply(regionsGRL.length, seq_len))),
        regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length))
    setkey(region2group, regionID)

    message("Finding overlaps...")
    fo = findOverlaps(bsgr[[1]], regionsGR)

    message("Setting regionIDs...")
    # Only the subject side of each hit is needed for counting. The query
    # table is deliberately neither re-keyed nor subset here: sorting it after
    # the overlaps were computed would no longer match queryHits(fo), and
    # setkey() would also reorder the caller's table by reference.
    if (length(fo) < 1) {
        warning("No overlapping regions in the given region list;
            please expand your regionsGRL")
        return(NULL)
    }
    # record which region each query midpoint overlapped.
    hits = data.table(regionID=subjectHits(fo))

    # Now actually do the aggregate: number of query hits per region
    message("Combining...")
    bsCombined = hits[, .N, by=list(regionID)]
    setkey(bsCombined, regionID)

    # Attach the region coordinates / group bookkeeping to the counts
    e = region2group[bsCombined,]
    setkey(e, regionID)
    return(e)
}

--------------------------------------------------------------------------------
/R/buildReferenceData.R:
--------------------------------------------------------------------------------
#' Read local or remote file
#'
#' @param source a string that is either a path to a local or remote GTF
#' @param destDir a string that indicates the path to the directory where
#' the downloaded GTF file should be stored.
#' If not provided,
#' a temporary directory will be used.
#'
#' @return a string with the path to the retrieved (local or downloaded) file
#' @export
#'
#' @examples
#' CElegansGtfCropped = system.file("extdata",
#'                                  "C_elegans_cropped_example.gtf.gz",
#'                                  package="GenomicDistributions")
#' CElegansGtf = retrieveFile(CElegansGtfCropped)
retrieveFile = function(source, destDir=NULL){
    if (is.null(destDir)) destDir = tempdir()
    # download file, if not local
    if (!file.exists(source)) {
        destFile = file.path(destDir, basename(source))
        if (file.exists(destFile)){
            # a previous download is reused rather than fetched again
            message("File exists: ", destFile)
        }else{
            message("File will be saved in: ", destFile)
            download.file(url = source, destfile = destFile)
        }
    }else{
        destFile = source
        message("Got local file: ", destFile)
    }

    return(destFile)
}


#' Get transcription start sites (TSSs) from a remote or local GTF file
#'
#' @param source a string that is either a path to a local or remote GTF
#' @param destDir a string that indicates the path to the directory where
#'     the downloaded GTF file should be stored
#' @param convertEnsemblUCSC a logical indicating whether Ensembl style
#'     chromosome annotation should be changed to UCSC style
#' @param filterProteinCoding a logical indicating if TSSs should be only
#'     protein-coding genes (default = TRUE)
#'
#' @return a GRanges object with the TSS ranges produced by \code{promoters}
#'     for each retained gene record
#'
#' @import dplyr
#' @export
#'
#' @examples
#' CElegansGtfCropped = system.file("extdata",
#'                                  "C_elegans_cropped_example.gtf.gz",
#'                                  package="GenomicDistributions")
#' CElegansTss = getTssFromGTF(CElegansGtfCropped, TRUE)
getTssFromGTF = function(source, convertEnsemblUCSC=FALSE, destDir=NULL,
                         filterProteinCoding=TRUE){
    GtfDf = as.data.frame(rtracklayer::import(retrieveFile(source, destDir)))

    # TSSs are always derived from gene records. Previously the type filter
    # was applied only together with the biotype filter, so
    # filterProteinCoding=FALSE produced a "TSS" for every GTF row.
    subsetGtfDf = GtfDf %>%
        dplyr::filter(type == "gene")
    if (filterProteinCoding) {
        subsetGtfDf = subsetGtfDf %>%
            dplyr::filter(gene_biotype == "protein_coding")
    }

    gr = makeGRangesFromDataFrame(subsetGtfDf, keep.extra.columns = TRUE)
    feats = promoters(gr, 1, 1)
    # change from Ensembl style chromosome annotation to UCSC style
    if(convertEnsemblUCSC)
        seqlevels(feats) = paste0("chr", seqlevels(feats))
    feats
}


#' Get gene models from a remote or local GTF file
#'
#' @param source a string that is either a path to a local or remote GTF
#' @param destDir a string that indicates the path to the directory where
#'     the downloaded GTF file should be stored
#' @param features a vector of strings with feature identifiers to
#'     include in the result list
#' @param convertEnsemblUCSC a logical indicating whether Ensembl style
#'     chromosome annotation should be changed to UCSC style
#' @param filterProteinCoding a logical indicating if only records of
#'     protein-coding genes should be used (default = TRUE)
#'
#' @return a list of GRanges objects, one element per requested feature
#'
#' @import dplyr
#' @export
#'
#' @examples
#' CElegansGtfCropped = system.file("extdata",
#'                                  "C_elegans_cropped_example.gtf.gz",
#'                                  package="GenomicDistributions")
#' features = c("gene", "exon", "three_prime_utr", "five_prime_utr")
#' CElegansGeneModels = getGeneModelsFromGTF(CElegansGtfCropped, features, TRUE)
getGeneModelsFromGTF = function(source,
                                features,
                                convertEnsemblUCSC = FALSE,
                                destDir = NULL,
                                filterProteinCoding=TRUE) {
    GtfDf = as.data.frame(rtracklayer::import(retrieveFile(source, destDir)))

    if (filterProteinCoding) {
        subsetGtfDf = GtfDf %>%
            dplyr::filter(gene_biotype == "protein_coding")
    } else {
        subsetGtfDf = GtfDf
    }

    retList = list()
    message("Extracting features: ", paste(features, collapse = ", "))
    for (feat in features) {
        # records of this feature type -> standard chromosomes only ->
        # de-duplicated -> overlapping ranges merged
        featGR = GenomicRanges::reduce(
            unique(GenomeInfoDb::keepStandardChromosomes(
                GenomicRanges::makeGRangesFromDataFrame(
                    subsetGtfDf %>% filter(type == feat),
                    keep.extra.columns = TRUE),
                pruning.mode = "coarse")))
        # change from Ensembl style chromosome annotation to UCSC style
        if (convertEnsemblUCSC)
            seqlevels(featGR) = paste0("chr", seqlevels(featGR))
        retList[[feat]] = featGR
    }
    retList
}


#' Get chromosome sizes from a remote or local FASTA file
#'
#' @param source a string that is either a path to a
#'     local or remote FASTA
#' @param destDir a string that indicates the path to the
#'     directory where the downloaded FASTA file should be stored
#' @param convertEnsemblUCSC a logical indicating whether Ensembl style
#'     chromosome annotation should be changed to UCSC style (add chr)
#' @return a named vector of sequence lengths
#' @importFrom Biostrings readDNAStringSet
#' @export
#'
#' @examples
#' CElegansFasteCropped = system.file("extdata",
#'                                    "C_elegans_cropped_example.fa.gz",
#'                                    package="GenomicDistributions")
#' CElegansChromSizes = getChromSizesFromFasta(CElegansFasteCropped)
getChromSizesFromFasta = function(source, destDir=NULL,
                                  convertEnsemblUCSC=FALSE) {
    fastaPath = retrieveFile(source, destDir)
    fastaStringSet = readDNAStringSet(fastaPath)
    oriNames = fastaStringSet@ranges@NAMES
    # keep only the first whitespace-delimited token of each FASTA header
    names = vapply(oriNames, function(x){
        strsplit(x, " ")[[1]][1]
    }, character(1))
    chromSizes = fastaStringSet@ranges@width
    if(convertEnsemblUCSC){
        names(chromSizes) = paste0("chr", names)
    } else{
        names(chromSizes) = names
    }
    chromSizes
}

--------------------------------------------------------------------------------
/R/chrom-plots.R:
--------------------------------------------------------------------------------

#' Divide regions into roughly equal bins
#'
#' Given a start coordinate, end coordinate, and number of bins to divide,
#' this function will split the regions into that many bins.
#' Bins will be only approximately the same size, due to rounding.
#' (they should not be more than 1 different).
#'
#' Use case: take a set of regions, like CG islands, and bin them; now you can
#' aggregate signal scores across the bins, giving you an aggregate signal
#' in bins across many regions of the same type.
#'
#' In theory, this just runs on 3 values, but you can run it inside a
#' data.table j expression to divide a bunch of regions in the same way.
#' @param start The starting coordinate
#' @param end The ending coordinate
#' @param binSize The size of bin to divide the genome into. You must supply
#'     either binSize (priority) or binCount.
#' @param binCount The number of bins to divide. If you do not supply binSize,
#'     you must supply binCount, which will be used to calculate the binSize.
#' @param indicator A vector with identifiers to keep with your bins, in case
#'     you are doing this on a long table with multiple segments concatenated
#'
#' @return
#' A data.table, expanded to nrow = number of bins, with these id columns:
#'         id: region ID
#'         binID: repeating ID (this is the value to aggregate across)
#'         ubinID: unique bin IDs
#' @export
#' @examples
#' Rbins = binRegion(1, 3000, 100, 1000)
#'
binRegion = function(start, end, binSize=NULL, binCount=NULL, indicator=NULL) {
    .validateInputs(list(start="numeric", end="numeric"))
    # scalar condition: use && rather than the vectorised &
    if (is.null(binSize) && is.null(binCount)) {
        stop("You must provide either binSize or binCount")
    }
    if (is.null(binSize)) {
        # bin size derived from the total span of all regions; kept at >= 1 so
        # that a large binCount cannot round it to 0 (which would give Inf/NaN
        # bin counts below)
        binSize = max(1, round(sum(end-start)/binCount))
    }
    # number of bins per region, with at least one bin for each region
    binCountByChrom = round((end-start)/binSize)
    binCountByChrom[binCountByChrom==0]=1
    # exact (unrounded) bin size for each region
    binSizeByChrom = (end-start)/(binCountByChrom)
    # for every region: break points 0..nBins scaled by that region's bin
    # size. NOTE(review): breaks start at 0 for every region, so the returned
    # coordinates run from 1 to (end-start) and are not shifted by `start` --
    # confirm this is intended for inputs that do not begin at 1.
    breaks = round(unlist(lapply(binCountByChrom,
                                 function(x) seq(from=0, to=x))) *
                       rep(binSizeByChrom, (binCountByChrom+1)))
    # positions in `breaks` of the last / first break of each region
    endpoints = cumsum(binCountByChrom + 1)
    startpoints = c(1, endpoints[-length(endpoints)]+1)

    dataTable = data.table(start=breaks[-endpoints]+1,
                           end=breaks[-startpoints],
                           id=rep((seq_along(start)), binCountByChrom),
                           binID=unlist(lapply(binCountByChrom,
                                               function(x) seq(from=1, to=x))),
                           ubinID=seq_along(breaks[-startpoints]),
                           key="id")

    if (!is.null(indicator)){
        # one indicator value per input region, repeated for each of its bins
        idCol = rep(indicator, binCountByChrom)
        dataTable = data.table(idCol, dataTable)
    }
    return(dataTable)
}

#' Bins a BSgenome object.
#'
#' Given a BSgenome object (to be loaded via \code{loadBSgenome}), and a number
#' of bins, this will bin that genome. It is a simple wrapper of the
#' \code{binChroms} function
#'
#' @param genome A UCSC-style string denoting reference assembly (e.g.
#'     'hg38')
#' @param binCount number of bins (total across the genome; it is passed
#'     unchanged to \code{binChroms})
#' @return A data.table object showing the region and bin IDs
#'     of the reference genome.
#' @export
#' @examples
#' \dontrun{
#' binCount = 1000
#' refGenomeBins = binBSGenome("hg19", binCount)
#' }
binBSGenome = function(genome, binCount) {
    .validateInputs(list(genome="character", binCount="numeric"))
    # chromosome lengths come from the loaded BSgenome object
    chromSizes = seqlengths(loadBSgenome(genome))
    binChroms(binCount, chromSizes)
}

#' Naively splits a chromosome into bins
#'
#' Given a named vector of chromosome sizes, this function produces
#' (roughly) evenly-sized bins across the chromosomes. It does not
#' account for assembly gaps or the like.
#'
#' @param binCount number of bins (total; *not* per chromosome)
#' @param chromSizes a named numeric vector with the size (length) of each
#'     chromosome.
#' @return A data.table object assigning a bin ID to each chromosome region.
#' @export
#' @examples
#' chromSizes = c(chr1=249250621, chr2=243199373, chr3=198022430)
#' cBins = binChroms(1000, chromSizes)
#'
binChroms = function(binCount, chromSizes) {
    .validateInputs(list(chromSizes="numeric", binCount="numeric"))
    # one row per chromosome, spanning 1..size
    chromRanges = data.table(chr=names(chromSizes), start=1, end=chromSizes)
    # bin all chromosomes in one call; the chromosome name travels along
    # as the indicator column of binRegion's output
    chromRanges[, binRegion(start, end, binCount=binCount, indicator=chr)]
}


#' Calculates the distribution of a query set over the genome
#'
#' Returns a data.table showing counts of regions from the query that overlap
#' with each bin.
#' In other words, where on which chromosomes are the ranges distributed?
#' You must provide binned regions. Only the midpoint of each query region is
#' used to test for overlap with the bin regions.
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param bins Pre-computed bins (as a GRangesList object) to aggregate
#'     over; for example, these could be genome bins
#' @return A data.table showing where on which chromosomes
#'     ranges are distributed. NULL is returned (after the warning issued by
#'     the overlap counting) when no query region overlaps any bin.
#' @export
#' @examples
#'
#' chromSizes = getChromSizes("hg19")
#' genomeBins  = getGenomeBins(chromSizes)
#' chromDistribution = calcChromBins(vistaEnhancers, genomeBins)
#'
#' vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
#' vistaSftd2 = GenomicRanges::shift(vistaEnhancers, 200000)
#' calcChromBins(vistaEnhancers, GRangesList(vistaSftd, vistaSftd2))
calcChromBins = function(query, bins) {
    .validateInputs(list(bins=c("GRanges","GRangesList"),
                         query=c("GRanges","GRangesList")))
    if (is(query, "GRangesList")) {
        # Recurse over each GRanges object
        x = lapply(query, calcChromBins, bins)
        # To accommodate multiple regions, we'll need to introduce a new 'name'
        # column to distinguish them.
        nameList = names(query)
        if(is.null(nameList)) {
            nameList = seq_along(query) # Fallback to sequential numbers
        }
        # NROW() rather than nrow(): a region set without any overlap yields
        # NULL, for which nrow() is NULL and would make vapply() fail
        rowCounts = vapply(x, NROW, integer(1))
        # Append names
        xb = rbindlist(x)
        if (sum(rowCounts) > 0) {
            xb$name = rep(nameList, rowCounts)
        }
        return(xb)
    }

    queryDT = grToDt(query)

    # This function will just count the number of regions.
    res = calcOLCount(queryDT, bins)
    # calcOLCount() warns and returns NULL when nothing overlaps; pass that on
    # instead of failing on the := below
    if (is.null(res)) {
        return(NULL)
    }

    # order chromosomes by current order.
    res[, chr:=factor(chr, levels=unique(res$chr))]
    return(res)
}

#' Returns the distribution of query over a reference assembly

#' Given a query set of elements (a GRanges object) and a reference assembly
#' (e.g. 'hg38'), this will aggregate and count the distribution of the query
#' elements across bins of the reference genome.
#' This is a helper function to
#' create features for common genomes. It is a wrapper of
#' \code{calcChromBins}, which is more general.
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector that will be used to grab chromosome
#'     sizes with \code{getChromSizes}
#' @param binCount Number of bins to divide the chromosomes into
#' @return A data.table showing the distribution of regions across bins of the
#'     reference genome.
#' @examples
#' ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
calcChromBinsRefSlow = function(query, refAssembly, binCount=3000) {
    .validateInputs(list(refAssembly="character",
                         query=c("GRanges","GRangesList")))
    # Bin the genome, then split the bin table on its "id" column and turn
    # each piece into a GRanges object (seqnames taken from "idCol").
    binsById = splitDataTable(binChroms(binCount, getChromSizes(refAssembly)),
                              "id")
    genomeBins = GRangesList(lapply(binsById, dtToGr, chr="idCol"))
    # Delegate the counting to the general implementation
    calcChromBins(query, genomeBins)
}


#' Returns the distribution of query over a reference assembly
#'
#' Given a query set of elements (a GRanges object) and a reference assembly
#' (*e.g.* 'hg38'), this will aggregate and count the distribution of the query
#' elements across bins of the reference genome. This is a helper function to
#' create features for common genomes. It is a wrapper of
#' \code{calcChromBins}, which is more general.
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector that will be used to grab chromosome
#'     sizes with \code{getChromSizes}
#' @param binCount Number of bins to divide the chromosomes into
#' @return A data.table showing the distribution of regions across bins of the
#'     reference genome.
#' @export
#' @examples
#' ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
calcChromBinsRef = function(query, refAssembly, binCount=3000) {
    .validateInputs(list(refAssembly="character",
                         query=c("GRanges","GRangesList")))
    if (is(query, "GRangesList")) {
        # Recurse over each GRanges object
        x = lapply(query, calcChromBinsRef, refAssembly, binCount)
        # To accommodate multiple regions, we'll need to introduce a new 'name'
        # column to distinguish them.
        nameList = names(query)
        if(is.null(nameList)) {
            nameList = seq_along(query) # Fallback to sequential numbers
        }
        # Append names
        xb = rbindlist(x)
        xb$name = rep(nameList, vapply(x, nrow, integer(1)))
        return(xb)
    }
    # Bin the genome
    chromSizes = getChromSizes(refAssembly)
    binnedDT = binChroms(binCount, chromSizes)
    queryDT = grToDt(query)
    setnames(binnedDT, "idCol", "chr")
    # Each query region is assigned to a bin by its midpoint. The previous
    # expression, start + (end-start), evaluated to the region *end*.
    # Integer division keeps the midpoint a whole coordinate, so it cannot
    # fall between the end of one bin and the start of the next.
    queryDT[, midpoint:=start + (end-start) %/% 2L]
    # Non-equi join: keep the bin whose [start, end] contains the midpoint,
    # then count the regions per bin and order by the bin's regionID.
    res = binnedDT[queryDT, .(chr, regionID=ubinID, withinGroupID=x.binID, start=x.start, end=x.end),
                   on=.(chr, start<=midpoint, end>=midpoint), nomatch=0L][, list(.N), by=list(chr, start, end, regionID, withinGroupID)][order(regionID),]
    # order chromosomes by current order.
    res[, chr:=factor(chr, levels=unique(res$chr))]
    return(res)
}



#' Plot distribution over chromosomes
#'
#' Plots the per-bin counts produced by, e.g., \code{calcChromBinsRef}
#' @param genomeAggregate A data.table or data.frame of per-bin counts, with
#'     columns \code{chr}, \code{regionID}, \code{withinGroupID} and \code{N}
#' @param plotTitle Title for plot.
#' @param ylim Limit of y-axes. Default "max" sets limit to N of biggest bin.
#' @return A ggplot object showing the distribution of the query
#'     regions over bins of
#'     the reference genome.
#' @export
#' @examples
#' agg = data.frame("regionID"=1:5, "chr"=rep(c("chr1"), 5),
#'                  "withinGroupID"=1:5, "N"=c(1,3,5,7,9))
#' ChromBins = plotChromBins(agg)
#'
plotChromBins = function(genomeAggregate,
    plotTitle="Distribution over chromosomes", ylim="max") {
    .validateInputs(list(genomeAggregate=c("data.table","data.frame")))

    if ("name" %in% names(genomeAggregate)){
        # It has multiple regions.
        # Work on a data.table copy: setkey() and := only work on a
        # data.table (a plain data.frame is an accepted input too), and
        # they would otherwise reorder/modify the caller's table by reference.
        genomeAggregate = data.table::setDT(data.table::copy(genomeAggregate))
        # sort the regions labels again
        setkey(genomeAggregate, regionID)
        genomeAggregate[, chr:=factor(chr, levels=unique(genomeAggregate$chr))]
        # and plot
        g = ggplot(genomeAggregate, aes(x=withinGroupID, y=N,
                                        fill=name, color=name))
    } else {
        # It's a single region
        g = ggplot(genomeAggregate, aes(x=withinGroupID, y=N))
    }
    g = g +
        xlab("Genome") +
        ylab("Number of regions") +
        geom_bar(stat="identity") + # Spread out to max width
        facet_grid(chr ~ .) + # Place chromosomes one on top of another
        theme_classic() + # Clean up cruft
        theme_blank_facet_label() + # No boxes around labels
        theme(panel.spacing=unit(0, "lines")) + # Reduce whitespace
        theme(strip.text.y=element_text(size=12, angle=0)) + # Rotate labels
        geom_hline(yintercept=0, color="#EEEEEE") + # Light chrom lines
        {if (ylim == "max") {
            # Single y tick at the tallest bin
            scale_y_continuous(breaks = c(max(genomeAggregate$N)),
                               limits = c(0, max(genomeAggregate$N)))
        } else {
            # User-supplied upper limit
            scale_y_continuous(breaks = ylim,
                               limits = c(0, ylim))
        }} +
        scale_x_continuous(breaks=c(0, max(genomeAggregate$withinGroupID)),
                           labels=c("Start", "End")) +
        theme(plot.title=element_text(hjust=0.5)) + # Center title
        ggtitle(plotTitle) +
        theme(legend.position="bottom")
    return(g)
}

#' Returns bins used in `calcChromBins` function
#'
#' Given a named vector of chromosome sizes, the function returns
#' GRangesList object with bins for each chromosome.
#'
#' @param chromSizes a named list of size (length) for each chromosome.
#' @param binCount number of bins (total; *not* per chromosome),
#'     defaults to 10,000
#' @return A GRangesList object with bins that separate chromosomes
#'     into equal parts.
#' @export
#' @examples
#' chromSizes = getChromSizes("hg19")
#' chromBins = getGenomeBins(chromSizes)
#'
getGenomeBins = function(chromSizes, binCount=10000) {
    # Accept double as well as integer sizes: binChroms(), which does the
    # actual work, validates chromSizes as "numeric" and its own example
    # passes doubles, so rejecting them here was needlessly strict.
    .validateInputs(list(chromSizes=c("integer","numeric")))

    binnedDT = binChroms(binCount, chromSizes)
    # One GRanges per value of the "id" column, collected in a GRangesList
    splitBinnedDT = splitDataTable(binnedDT, "id")
    listGR = lapply(splitBinnedDT, dtToGr, chr="idCol")
    genomeBins = GRangesList(listGR)
    return(genomeBins)
}
--------------------------------------------------------------------------------
/R/content-plots.R:
--------------------------------------------------------------------------------
#' Calculate GC content over genomic ranges
#'
#' Given a reference genome as a BSgenome object and some ranges on that
#' reference, this function will return a vector of the same length as the
#' granges object, with percent of Cs and Gs.
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions.
#' @param ref Reference genome BSgenome object.
#' @return A numeric vector or list of vectors with the GC percentage of
#'     the query regions.
#' @export
#' @examples
#' \dontrun{
#' bsg = loadBSgenome('hg19')
#' gcvec = calcGCContent(vistaEnhancers, bsg)
#' }
calcGCContent = function(query, ref) {
    .validateInputs(list(query=c("GRanges","GRangesList"),
                         ref="BSgenome"))
    if (is(query, "GRangesList")) {
        # Recurse over each GRanges object
        x = lapply(query, calcGCContent, ref)
        namelist = names(query)
        if (is.null(namelist)) {
            # Unnamed input: label the results with sequential numbers
            newnames = seq_along(query)
            namelist = newnames
            # Append names
            names(x) = namelist
        }
        return(x)
    }
    # Restrict the seqnames to known chromosomes
    query = GenomeInfoDb::keepStandardChromosomes(query, pruning.mode="coarse")
    v = IRanges::Views(ref, query)
    # drop=FALSE keeps the counts as a matrix even when there is a single
    # region; without it the one-row result collapses to a vector and the
    # row-wise sum fails.
    gcCounts = Biostrings::alphabetFrequency(v)[, c("C","G"), drop=FALSE]
    # (C + G) / region width, i.e. a fraction between 0 and 1
    gcvec = rowSums(gcCounts)/width(v)
    return(gcvec)
}


#' Calculate GC content over genomic ranges
#'
#' Given a reference genome assembly name and some ranges on that
#' reference, this function will return a vector of the same length as the
#' granges object, with the fraction of Cs and Gs.
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector specifying the reference genome
#'     assembly (*e.g.* 'hg19'). This will be used to load the reference
#'     genome with \code{loadBSgenome}.
#' @return A numeric vector or list of vectors with the GC fraction
#'     (between 0 and 1) of the query regions.
#' @export
#' @examples
#' \dontrun{
#' refAssembly = 'hg19'
#' GCcontent = calcGCContentRef(vistaEnhancers, refAssembly)
#' }
calcGCContentRef = function(query, refAssembly) {
    .validateInputs(list(query=c("GRanges","GRangesList"),
                         refAssembly="character"))
    # Resolve the assembly name to a BSgenome object, then delegate
    ref = loadBSgenome(refAssembly)
    return(calcGCContent(query, ref))
}

#' Plots a density distribution of GC vectors
#'
#' Given results from the \code{calcGCContent} function, this will produce a
#' density plot
#' @param gcvectors A numeric vector or list of numeric vectors of GC contents.
#'     List elements without names are labelled by their position.
#' @return A ggplot object plotting distribution of GC content in query regions.
#' @export
#' @examples
#' numVector = rnorm(400, mean=0.5, sd=0.1)
#' GCplot = plotGCContent(numVector)
#' vecs = list(example1 = rnorm(400, mean=0.5, sd=0.1),
#'             example2 = rnorm(600, mean=0.5, sd=0.1))
#' GCplot = plotGCContent(vecs)
#'
plotGCContent = function(gcvectors) {
    .validateInputs(list(gcvectors=c("numeric", "list")))

    if (is(gcvectors, "list")) {
        nameList = names(gcvectors)
        if (is.null(nameList)) {
            # Unnamed list: fall back to sequential labels. Without this,
            # rep(NULL, ...) yields no regionSet column and the
            # aggregation below fails.
            nameList = as.character(seq_along(gcvectors))
        }
        vectorLengths = lengths(gcvectors)
        # Long format: one row per region, tagged with its region set
        gcdfReshaped = data.frame(value = unlist(gcvectors),
                                  regionSet = rep(nameList, vectorLengths))
        # Per-set mean, drawn as a dashed vertical line
        meansdf = aggregate(gcdfReshaped$value,
                            list(gcdfReshaped$regionSet), mean)
        g = ggplot2::ggplot(gcdfReshaped, aes(x=value, colour=regionSet)) +
            geom_density() +
            geom_vline(data=meansdf, aes(xintercept=x, colour=Group.1),
                       linetype="dashed", size=0.5) +
            theme_classic() +
            theme(legend.position = "bottom")
    } else {
        # plot a single regionset
        gcdfReshaped = data.frame(value = gcvectors)
        g = ggplot2::ggplot(gcdfReshaped, aes(x=value)) +
            geom_density() +
            geom_vline(aes(xintercept=mean(value)),
                       color="red", linetype="dashed", size=0.5) +
            theme_classic()
    }
    g = g +
        ggtitle("GC content distribution") +
        theme(plot.title = element_text(hjust=0.5)) +
        xlab("GC content") +
        xlim(0,1)
    return(g)
}

#' Calculate dinucleotide content over genomic ranges
#'
#' Given a reference genome (BSgenome object) and ranges on the
#' reference, this function returns a data.frame with
#' counts of dinucleotides within the GRanges object.
#'
#' @param query A GRanges object with query sets
#' @param ref Reference genome BSgenome object
#' @param rawCounts a logical indicating whether the raw numbers should be
#'     displayed, rather than percentages (optional).
#' @return A data.frame with a \code{region} column and one column per
#'     dinucleotide (percentages per region, or raw counts if
#'     \code{rawCounts=TRUE}); a list of such data.frames for a GRangesList.
#' @export
#' @examples
#' \dontrun{
#' bsg = loadBSgenome('hg19')
#' DNF = calcDinuclFreq(vistaEnhancers, bsg)
#' }

calcDinuclFreq = function(query, ref, rawCounts=FALSE) {

    .validateInputs(list(query=c("GRanges","GRangesList"),
                         ref="BSgenome"))
    if (is(query, "GRangesList")) {

        # Recurse over each GRanges object
        x = lapply(query, calcDinuclFreq, ref, rawCounts=rawCounts)

        # return a list of dinucleotide dataframes across each GRanges object
        return(x)
    }
    # Restrict the seqnames to known chromosomes
    query = GenomeInfoDb::keepStandardChromosomes(query, pruning.mode="coarse")
    v = IRanges::Views(ref, query)
    # Region identifier of the form chr_start_end
    regionNames = data.frame(region = paste(seqnames(query),
                                            start(query),
                                            end(query), sep="_"))
    dnvec= Biostrings::dinucleotideFrequency(v)
    # calculate frequencies if raw counts not required
    if(!rawCounts){
        # Row-wise proportions, expressed in percent
        dnvec = prop.table(dnvec, margin = 1)*100
    }
    dnvec = cbind(regionNames, as.data.frame(dnvec))
    return(dnvec)
}


#' Calculate dinucleotide content over genomic ranges
#'
#' Given a reference genome
#' (given by its assembly name) and ranges on the
#' reference, this function returns a data.frame with
#' counts of dinucleotides within the GRanges object.
#'
#' @param query A GRanges object with query sets
#' @param refAssembly A character vector specifying the reference genome
#'     assembly (*e.g.* 'hg19'). This will be used to load the reference
#'     genome with \code{loadBSgenome}.
#' @param rawCounts a logical indicating whether the raw numbers should be
#'     displayed, rather than percentages (optional).
#' @return The result of \code{calcDinuclFreq} for the loaded reference:
#'     a data.frame of dinucleotide content per query region, or a list of
#'     them for a GRangesList.
#' @export
#' @examples
#' \dontrun{
#' query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
#' GRquery = rtracklayer::import(query)
#' refAssembly = 'hg19'
#' DNF = calcDinuclFreqRef(GRquery, refAssembly)
#' }

calcDinuclFreqRef = function(query, refAssembly, rawCounts=FALSE) {
    .validateInputs(list(query=c("GRanges","GRangesList"),
                         refAssembly="character"))
    # Resolve the assembly name to a BSgenome object and delegate
    calcDinuclFreq(query, loadBSgenome(refAssembly), rawCounts=rawCounts)
}


#' Plot dinucleotide content within region set(s)
#'
#' Given \code{calcDinuclFreq} or \code{calcDinuclFreqRef} results, this function
#' generates a violin plot of dinucleotide frequency
#'
#' @param DNFDataTable A data.table, data.frame, or a list of dinucleotide counts -
#'     results from \code{calcDinuclFreq} or \code{calcDinuclFreqRef}
#' @return A ggplot object plotting distribution of dinucleotide content in query regions
#' @export
#' @examples
#'
#' DNFDataTable = data.table::data.table(GC = rnorm(400, mean=0.5, sd=0.1),
#'                                       CG = rnorm(400, mean=0.5, sd=0.5),
#'                                       AT = rnorm(400, mean=0.5, sd=1),
#'                                       TA = rnorm(400, mean=0.5, sd=1.5))
#' DNFPlot = plotDinuclFreq(DNFDataTable)
#'
#' \dontrun{
#' query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
#' GRquery = rtracklayer::import(query)
#' refAssembly = 'hg19'
#' DNF = calcDinuclFreqRef(GRquery, refAssembly)
#' DNFPlot2 = plotDinuclFreq(DNF)
#' }

plotDinuclFreq = function(DNFDataTable) {
    .validateInputs(list(DNFDataTable=c("data.table","data.frame","list")))

    isList = is(DNFDataTable, "list")

    # Use "region" as the ID column when the input carries one, either in
    # any element of a list input or as a column of a single table.
    hasRegion = (isList &&
        any(vapply(DNFDataTable, function(x) any(names(x) == "region"),
                   logical(1)))) ||
        ((is(DNFDataTable, "data.frame") | is(DNFDataTable, "data.table")) &&
            ("region" %in% colnames(DNFDataTable)))
    if (hasRegion) {
        idCols = "region"
    } else {
        idCols = NULL
    }

    # Reshape to long format: one row per (region, dinucleotide)
    longDF = reshape2::melt(DNFDataTable, id.vars=idCols,
                            variable.name="dinucleotide",
                            value.name="frequency")

    # Violin plots. With multiple inputs, facet by dinucleotide and compare
    # the inputs (column L1 of the melted list) within each facet.
    if (isList) {
        p = ggplot2::ggplot(data=longDF,
                            ggplot2::aes(x=L1, y=frequency, fill=L1)) +
            facet_wrap(~dinucleotide, nrow=4) +
            theme_bw() +
            theme(strip.background=element_rect(fill="white")) +
            theme(strip.text=element_text(face="bold")) +
            theme(axis.text.x=element_text(angle=90, hjust=1)) +
            xlab(" ")
    } else {
        p = ggplot2::ggplot(data=longDF,
                            ggplot2::aes(x=dinucleotide, y=frequency)) +
            xlab("Dinucleotide") +
            theme_bw()
    }

    # Integer values mean raw counts; anything else is taken as percentages
    if (is(longDF[,"frequency"], "integer")) {
        yLabel = "Dinucleotide counts per region [n]"
    } else {
        yLabel = "Dinucleotide frequency per region [%]"
    }

    p = p +
        geom_violin(trim=TRUE, scale="width") +
        geom_boxplot(alpha=0.2, outlier.shape=NA) +
        ggtitle("Dinucleotide Frequency") +
        guides(fill="none") +
        theme(plot.title=element_text(hjust=0.5)) +
        ylab(yLabel)
    return(p)
}
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
#' hg19 chromosome sizes
#'
#' A dataset containing chromosome sizes for Homo Sapiens hg19 genome assembly
#'
#' @format A named vectors of lengths with one item per chromosome
#' @source BSgenome.Hsapiens.UCSC.hg19 package
#' @name chromSizes_hg19
#' @docType data
#' @keywords datasets
#' @usage data(chromSizes_hg19)
NULL


#' hg19 TSS locations
#'
#' A dataset containing transcription start site (TSS) locations for the
#' Homo Sapiens hg19 genome assembly
#'
#' @format A named vectors of lengths with one item per chromosome
#' @source EnsDb.Hsapiens.v75 package
#' @name TSS_hg19
#' @docType data
#' @keywords datasets
#' @usage data(TSS_hg19)
NULL

#' hg19 gene models
#'
#' A dataset containing gene models for Homo Sapiens hg19 genome assembly.
29 | #' 30 | #' @format A list of two GRanges objects, with genes and exons locations 31 | #' @source EnsDb.Hsapiens.v75 package 32 | #' @name geneModels_hg19 33 | #' @docType data 34 | #' @keywords datasets 35 | #' @usage data(geneModels_hg19) 36 | NULL 37 | 38 | 39 | #' Example hg19 open signal matrix 40 | #' 41 | #' A dataset containing a subset of open chromatin regions across all 42 | #' cell types defined by ENCODE for Homo Sapiens hg19 43 | #' 44 | #' Preparation steps: 45 | #' \enumerate{ 46 | #' \item{made a universe of regions by merging regions across 47 | #' cell types defined as opened in ENCODE} 48 | #' \item{took bigwig files from ENCODE for individual cell types, 49 | #' merged replicates, filtered out blacklisted sites} 50 | #' \item{evaluated the signal above regions defined by previous step} 51 | #' \item{performed quantile normalization} 52 | #' \item{subsetted it} 53 | #' } 54 | #' 55 | #' @format data.frame, rows represent whole selection of open 56 | #' chromatin regions across all cell types defined by ENCODE, columns are 57 | #' individual cell types and values are normalized open chromatin signal values.
58 | #' @source \url{http://big.databio.org/open_chromatin_matrix/openSignalMatrix_hg19_quantileNormalized_round4.txt.gz} 59 | #' @name exampleOpenSignalMatrix_hg19 60 | #' @docType data 61 | #' @keywords datasets 62 | #' @usage data(exampleOpenSignalMatrix_hg19) 63 | NULL 64 | 65 | 66 | #' Example BED file 67 | #' 68 | #' Example BED file read with rtracklayer::import 69 | #' 70 | #' @format GenomicRanges::GRanges 71 | #' @name vistaEnhancers 72 | #' @docType data 73 | #' @keywords datasets 74 | #' @usage data(vistaEnhancers) 75 | NULL 76 | 77 | 78 | #' Example BED file 79 | #' 80 | #' Example BED file read with rtracklayer::import 81 | #' 82 | #' @format GenomicRanges::GRanges 83 | #' @name setB_100 84 | #' @docType data 85 | #' @keywords datasets 86 | #' @usage data(setB_100) 87 | NULL 88 | 89 | 90 | #' Cell type metadata matrix 91 | #' 92 | #' Table that maps cell types to tissues and groups 93 | #' 94 | #' @format data.table with 3 columns (cellType, tissue and group) 95 | #' and 74 rows (one per cellType) 96 | #' @source self-curated dataset 97 | #' @name cellTypeMetadata 98 | #' @docType data 99 | #' @keywords datasets 100 | #' @usage data(cellTypeMetadata) 101 | NULL 102 | 103 | -------------------------------------------------------------------------------- /R/feature-plots.R: -------------------------------------------------------------------------------- 1 | # Old, slow version based on GRanges methods 2 | # 3 | # Find the distance to the nearest genomic feature. 4 | # 5 | # For a given query set of genomic regions, and a given feature set of 6 | # regions, this function will return the distance for each query region to its 7 | # closest feature. It ignores strand and returns the distance as positive or 8 | # negative, depending on whether the feature is upstream or downstream. 9 | # 10 | # This function is similar to the bioconductor distanceToNearest function, but 11 | # returns negative values for downstream distances instead of absolute values.
# This allows you to assess the relative location.
#
# @param query A GRanges or GRangesList object with query sets
# @param features A GRanges object with features to test distance to
#
# @return A vector of genomic distances for each query region relative to its
#     closest feature (a list of such vectors for a GRangesList). NA where
#     neither a preceding nor a following feature exists.
calcFeatureDistBioc = function(query, features) {
    .validateInputs(list(query=c("GRangesList","GRanges")))
    if (is(query, "GRangesList")) {
        # Recurse over each GRanges object (with this same implementation)
        x = lapply(query, calcFeatureDistBioc, features)
        return(x)
    }

    # Index of the feature each query region precedes / follows (NA if none)
    precedeInd = precede(query, features)
    preIndNA = is.na(precedeInd)
    followInd = follow(query, features)
    folIndNA = is.na(followInd)

    # Distance to the feature the query precedes, reported as negative
    preDist = rep(NA, length(query))
    preDist[!preIndNA] = -distance(query[!preIndNA],
                                   features[precedeInd[!preIndNA]])

    # Distance to the feature the query follows, reported as positive
    postDist = rep(NA, length(query))
    postDist[!folIndNA] = distance(query[!folIndNA],
                                   features[followInd[!folIndNA]])

    # Take the following-side distance where it is strictly closer, or
    # where it is the only one available. The index is a full-length
    # logical vector so positions stay aligned with the query; the earlier
    # code indexed with a vector shortened by the NA filter, which shifts
    # the positions whenever an NA is present.
    usePost = !is.na(postDist) & (is.na(preDist) | -preDist > postDist)
    dists = preDist
    dists[usePost] = postDist[usePost]
    return(dists)
}

#' Find the distance to the nearest genomic feature
#'
#' For a given query set of genomic regions, and a given feature set of
#' regions, this function will return the distance for each query region to its
#' closest feature. It ignores strand and returns the distance as positive or
#' negative, depending on whether the feature is upstream or downstream
#'
#' This function is similar to the bioconductor distanceToNearest function, but
#' returns negative values for downstream distances instead of absolute values.
#' This allows you to assess the relative location.
#'
#' @param query A GRanges or GRangesList object with query sets
#' @param features A GRanges object with features to test distance to
#'
#' @return A vector of genomic distances for each query region relative to its
#'     closest feature.
#' @export
#' @examples
#' vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
#' calcFeatureDist(vistaEnhancers, vistaSftd)
calcFeatureDist = function(query, features) {
    .validateInputs(list(query=c("GRangesList","GRanges")))
    if (is(query, "GRangesList")) {
        # One result vector per region set
        return(lapply(query, calcFeatureDist, features))
    }
    # Split both tables by chromosome so each query chromosome is matched
    # only against the features on that same chromosome.
    queryByChr = splitDataTable(grToDt(query), "chr")
    featuresByChr = splitDataTable(grToDt(features), "chr")
    # Chromosomes absent from the features yield a NULL entry here
    matchedFeatures = featuresByChr[names(queryByChr)]
    perChrDists = Map(DTNearest, queryByChr, matchedFeatures)
    # Flatten the per-chromosome results into one unnamed vector
    as.vector(unlist(perChrDists))
}

# Function uses data.table rolling join to identify the nearest features
# really quickly.
#
# @param DT1 A data.table object to be joined to a second data.table object.
# @param DT2 A second data.table object to join with DT1.
#
# @return A vector of distances from each DT1 region to its nearest DT2
#     region; NA for every DT1 row if DT2 is NULL, NULL if DT1 is NULL.
DTNearest = function(DT1, DT2) {
    #data.table::set(DT1, j=mid, value=start + round((end-start)/2))
    #data.table::set(DT2, j=mid, value=start + round((end-start)/2))
    if (is.null(DT1)) {
        # Nothing to measure
        return(NULL)
    }
    if (is.null(DT2)) {
        # No candidate features: one NA per DT1 row
        return(rep(NA, nrow(DT1)))
    }
    # Midpoint of every region; both tables are modified by reference
    DT1[, mid:=start + round((end-start)/2)]
    DT2[, mid:=start + round((end-start)/2)]
    # Sort by midpoint and mark "mid" as the sort key for the join below
    data.table::setorder(DT1, mid)
    data.table::setorder(DT2, mid)
    data.table::setattr(DT1, "sorted", "mid")
    data.table::setattr(DT2, "sorted", "mid")
    # Rolling join to the nearest DT2 row; the j expression is the matched
    # DT2 midpoint minus `mid`. An identical join without j used to run
    # just before this line and its result was thrown away; it is removed.
    DT2[J(DT1), start+round((end-start)/2)-mid, roll="nearest"]
}


#' Calculates the distribution of distances from a query set to closest TSS
#'
#' Given a query GRanges object and an assembly string, this function will grab
#' the TSS list for the given reference assembly and then calculate the distance
#' from each query feature to the closest TSS. It is a wrapper of
#' \code{calcFeatureDist} that uses built-in TSS features for a reference
#' assembly
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector specifying the reference genome
#'     assembly (*e.g.* 'hg19'). This will be used to grab TSS locations with
#'     \code{getTSSs}.
#' @return A vector of distances for each query region relative to TSSs.
#' @export
#' @examples
#' calcFeatureDistRefTSS(vistaEnhancers, "hg19")
calcFeatureDistRefTSS = function(query, refAssembly) {
    # Look up the TSS features for this assembly, then measure distances
    features = getTSSs(refAssembly)
    return(calcFeatureDist(query, features))
}


# Converts a nucleotide count into a label with abbreviation
# @param x base count (a single number)
# @return A label with 'kb' or 'mb' appended if appropriate; x itself,
#     unchanged, when its magnitude is below 1000
genomeLabel = function(x) {
    .validateInputs(list(x="numeric"))
    lab = x
    # Inclusive thresholds, so exactly 1e6 reads "1 mb" rather than
    # "1000 kb", and exactly 1e3 reads "1 kb" rather than a bare 1000.
    if (abs(x) >= 1e6){
        lab = paste0(round(x/1e6), " mb")
    }
    else if (abs(x) >= 1e3){
        lab = paste0(round(x/1e3), " kb")
    }
    return(lab)
}


#' Plots a histogram of distances to genomic features
#'
#' Given the results from \code{calcFeatureDist}, plots a histogram of
#' distances surrounding the features of interest
#'
#' @param dists Results from \code{calcFeatureDist}
#' @param bgdists Background distances. If provided, will plot a background
#'     distribution of expected distances
#' @param featureName Character vector for plot labels (optional).
#' @param numbers a logical indicating whether the raw numbers should be
#'     displayed, rather than percentages (optional).
#' @param nbins Number of bins on each side of the center point.
#' @param size Number of bases to include in plot on each side of the
#'     center point.
#' @param infBins Include catch-all bins on the sides?
#' @param tile Turn on a tile mode, which plots a tiled figure
#'     instead of a histogram.
#' @param labelOrder -- Enter "default" to order by order of user input (default);
#'     Enter "center" to order by value in tile in the closest proximity to the center
#'     of features (in case TSS is used - center is TSS) (center).
#' @return A ggplot2 plot object
#' @export
#' @examples
#' TSSdist = calcFeatureDistRefTSS(vistaEnhancers, "hg19")
#' f = plotFeatureDist(TSSdist, featureName="TSS")
plotFeatureDist = function(dists, bgdists=NULL, featureName="features",
                           numbers=FALSE, nbins=50, size=100000,
                           infBins=FALSE, tile=FALSE, labelOrder="default") {
    # Bin the distances; a list input yields a stacked table with a
    # 'name' column identifying each region set.
    df = cutDists(dists, divisions=NULL, nbins, size, infBins)

    if(is.list(dists)){
        nplots = length(dists)
    } else {
        nplots = 1
    }

    if (!is.null(bgdists)) {
        bgDistsDF = cutDists(bgdists, divisions=NULL, nbins, size, infBins)
        # bgDistsDF$Freq= scale(bgDistsDF$Freq, center=FALSE)
        bgDistsDF$Freq = (bgDistsDF$Freq / sum(bgDistsDF$Freq)) * 100
        # Repeat the background once per region set so it lines up with df
        df$bgFreq = rep(bgDistsDF$Freq, nplots)
        df$bgX = rep(seq_len(nrow(bgDistsDF)), nplots)
    }

    if ("name" %in% names(df)){
        # It has multiple regions
        if (!numbers) {
            # Convert counts to percentages within each region set
            df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100),
                         by = name]$"Freq.Per"
        }
        # Order the sets once, after Freq holds the values that are
        # plotted. An extra call before the normalisation was overwritten
        # by this one and has been dropped.
        df$name = sortingFunction(df, labelOrder, nbins)
        g = ggplot(df, aes(x=cuts, y=Freq, fill=name, color = name)) +
            facet_grid(. ~name)
    } else {
        if (!numbers) {
            df$Freq = (df$Freq / sum(df$Freq)) * 100
        }
        g = ggplot(df, aes(x=cuts, y=Freq))
    }

    if (!is.null(bgdists)) {

        # bgtrack = scale(smooth(bgDistsDF$Freq), center=FALSE)
        g = g +
            geom_line(stat="identity", aes(x=bgX,y=bgFreq),
                      color="gray", alpha=1, size=1.5) +
            geom_bar(stat="identity", aes(x=cuts,y=bgFreq),
                     fill="gray", alpha=0.8)
    }

    # find midpoint
    midx = nrow(df)/2/nplots
    barcount = nrow(df)/nplots
    minlabel = genomeLabel(-size)
    maxlabel = genomeLabel(size)
    # Only the outermost bars get an axis label
    edgeLabels = c(minlabel, rep("", barcount-2), maxlabel)

    if (tile) {
        if (!"name" %in% names(df)) {
            df$name = "Region set"
        }

        ncuts = length(unique(df$cuts))
        xs = rep(seq_len(ncuts), nplots)
        g = ggplot(df) +
            geom_raster(aes(x=xs, y=name, fill=Freq)) +
            scale_fill_gradient(low="navy", high="orange") +
            geom_point(aes(x=midx, y=0.5), color="black",
                       size=2, shape=17, alpha=0.8) +
            theme_classic() +
            labs(fill=ifelse(numbers,"Counts","Frequency (%)")) +
            theme(legend.position="bottom") +
            xlab(paste("Distance to", featureName)) +
            theme(axis.text.x=element_text(angle = 0, hjust = 0.5, vjust=0.5)) +
            scale_x_continuous(breaks=c(1, ncuts), labels=c(minlabel, maxlabel))
        return(g)
    }

    if ("name" %in% names(df)) {
        g = g +
            geom_bar(data=df, stat="identity", alpha=0.7)
    } else {
        g = g +
            geom_bar(data=df, stat="identity", fill="darkblue", alpha=0.7)
    }
    g = g + geom_point(aes(x=midx, y=0), color="tan2", size=2,
                       shape=17, alpha=0.8) +
        guides(fill="none") + # remove legend for geom_point
        theme_classic() +
        theme(aspect.ratio=1) +
        theme_blank_facet_label() +
        xlab(paste("Distance to", featureName)) +
        ylab(ifelse(numbers,"Counts","Frequency (%)")) +
        # theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust=0.5))
        theme(axis.text.x=element_text(angle = 0, hjust = 0.5, vjust=0.5)) +
        theme(plot.title = element_text(hjust = 0.5)) + # Center title
        ggtitle(paste("Distribution relative to", featureName)) +
        theme(legend.position="bottom") +
        theme(panel.spacing.x=unit(1, "lines")) +
        # A single x scale: a second scale_x_discrete() replaces the first
        # (with a message), so only the one with `expand` is kept.
        scale_x_discrete(labels=edgeLabels, expand=expansion(mult=0.035))

    return(g)
}

# Internal helper function for \code{plotFeatureDist}:
# orders datasets based on their order in the user provided list,
# or based on the value around feature center (in TSS based on TSS)
#
# @param df A data.table with variables "cuts" - based on created bins in
#     \code{plotFeatureDist} function , "Freq" - either frequency or raw
#     counts in a given bin, "name" - name of the dataset
# @param labelOrder The method used to order datasets. Options: "default"
#     orders datasets in a plot based on order of datasets in GRangesList
#     provided by user; "center" orders datasets based on value in a central
#     bin of the plot.
# @param nbins Number of bins on each side of the center point - input in
#     \code{plotFeatureDist} function.
# @return A factor of names in "df" input with levels sorted based on
#     sorting option.
sortingFunction = function(df, labelOrder="default", nbins=50){
    if (labelOrder == "default") {
        # Levels follow the order in which dataset names first appear in df
        return(factor(df$name, levels = unique(df$name)))
    }
    if (labelOrder == "center") {
        # Pick one row per dataset (every nbins*2 rows, starting at row
        # nbins), then rank the datasets by the Freq value in those rows.
        # NOTE(review): this assumes each dataset contributes exactly
        # nbins*2 rows to df -- confirm against the caller.
        centerRows = df[seq(nbins, nrow(df), by = (nbins*2)),]
        rankedRows = centerRows[order(centerRows$Freq, decreasing = TRUE),]
        return(factor(df$name, levels = rankedRows$name))
    }
    # Any other labelOrder value matches neither branch; nothing is returned.
}


# Internal helper function for \code{plotFeatureDist}
#
# Bins a vector of distances (or each vector in a list of distances) and
# counts how many values fall into each bin.
#
# @param dists A vector of genomic distances, or a list of such vectors.
# @param divisions A vector of bin break points to divide the dists into.
#     If NULL, breaks are built from \code{nbins}, \code{size} and
#     \code{infBins}.
# @param nbins Number of bins on each side of the center point.
# @param size Number of bases to include in plot on
#     each side of the center point.
# @param infBins Include catch-all bins on the sides?
# @return A data.table with columns "cuts" and "Freq"; for list input the
#     per-element tables are stacked and a "name" column identifies the
#     element each row came from.
cutDists = function(dists, divisions=NULL, nbins=50,
                    size=100000, infBins=TRUE) {
    if (is.null(divisions)) {
        # Symmetric break points around 0, optionally with open-ended bins
        poscuts = seq(0, size, by=size/nbins)
        divisions = sort(unique(c(-poscuts, poscuts)))
        if (infBins) {
            divisions = c(-Inf, divisions, Inf)
        }
    }
    if (is.list(dists)) {
        # Recurse over the list elements. infBins has to be forwarded
        # explicitly: it also controls how the bins are labelled, and the
        # recursive call would otherwise silently fall back to the default.
        x = lapply(dists, cutDists, divisions=divisions, infBins=infBins)

        # To accommodate multiple lists, we introduce a 'name' column
        # to distinguish them.
        nameList = names(dists)
        if (is.null(nameList)) {
            nameList = seq_along(dists)  # Fallback to sequential numbers
        }

        # Append names: one name per row of the corresponding table
        xb = rbindlist(x)
        xb$name = rep(nameList, vapply(x, nrow, integer(1)))

        return(xb)
    }

    labels = labelCuts(sort(divisions), collapse=" to ", infBins=infBins)
    cuts = cut(dists, divisions, labels)
    df = as.data.frame(table(cuts))
    setDT(df)  # converts to data.table by reference
    return(df)
}

--------------------------------------------------------------------------------
/R/loadData.R:
--------------------------------------------------------------------------------
#' Loads BSgenome objects from UCSC-style character vectors.
#'
#' This function will let you use a simple character vector (e.g. 'hg19') to
#' load and then return BSgenome objects. This lets you avoid having to use the
#' more complex annotation for a complete BSgenome object (e.g.
#' BSgenome.Hsapiens.UCSC.hg38.masked)
#'
#' @param genomeBuild One of 'hg19', 'hg38', 'mm10', 'mm9', or 'grch38'
#' @param masked Should we use the masked version? Default:TRUE
#' @return A BSgenome object corresponding to the provided genome build.
#' @export
#' @examples
#' \dontrun{
#' bsg = loadBSgenome('hg19')
#' }
loadBSgenome = function(genomeBuild, masked=TRUE) {
    if (!requireNamespace("BSgenome", quietly=TRUE)) {
        message("BSgenome package is not installed.")
    }
    # Convert the given string into the BSgenome notation.
    # switch() yields NULL when genomeBuild matches none of the entries.
    databasePkgString = switch (genomeBuild,
        grch38 = "BSgenome.Hsapiens.UCSC.hg38",
        hg38 = "BSgenome.Hsapiens.UCSC.hg38",
        hg19 = "BSgenome.Hsapiens.UCSC.hg19",
        mm10 = "BSgenome.Mmusculus.UCSC.mm10",
        mm9 = "BSgenome.Mmusculus.UCSC.mm9",
        bogus = "bogus" # a bogus genome for tests
    )

    # This check must run before the ".masked" suffix is appended:
    # paste0(NULL, ".masked") is ".masked", which is no longer NULL and
    # would let an unknown genome build slip through.
    if (is.null(databasePkgString)) {
        stop("I don't know how to map the string ", genomeBuild,
            " to a BSgenome")
    }

    if (masked) {
        databasePkgString = paste0(databasePkgString, ".masked")
    }
    return(.requireAndReturn(databasePkgString))
}

#' Load selected EnsDb library
#'
#' @param genomeBuild string, genome identifier
#'
#' @return loaded library
#' @export
#'
#' @examples
#' \dontrun{
#' loadEnsDb("hg19")
#' }
loadEnsDb = function(genomeBuild) {
    # switch() yields NULL when genomeBuild matches none of the entries
    databasePkgString = switch (genomeBuild,
        grch38 = "EnsDb.Hsapiens.v86",
        hg38 = "EnsDb.Hsapiens.v86",
        hg19 = "EnsDb.Hsapiens.v75",
        mm10 = "EnsDb.Mmusculus.v79",
        bogus = "bogus" # a bogus db for unit tests
    )

    if (is.null(databasePkgString)) {
        stop("I don't know how to map the string ", genomeBuild,
            " to a EnsDb")
    }
    return(.requireAndReturn(databasePkgString))
}

#' Returns built-in chrom sizes for a given reference assembly
#'
#' @param refAssembly A string identifier for the reference assembly
#' @return A vector with the chromosome sizes corresponding to a
#' specific genome assembly.
#' @export
#' @examples
#' getChromSizes("hg19")
getChromSizes = function(refAssembly) {
    datasetId = paste0("chromSizes_", refAssembly)

    # hg19 is looked up through the generic reference data getter
    if (refAssembly == "hg19") {
        return(getReferenceData(refAssembly, tagline="chromSizes_"))
    }

    if (!refAssembly %in% c("hg38", "mm10", "mm9")) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "or GenomicDistributionsData package,",
                   "please use getChromSizesFromFasta() to get chromosome sizes."))
    }

    # requireNamespace() is the recommended way to test whether a package is
    # usable; scanning the installed.packages() matrix is slow and matches
    # any cell of that matrix, not only package names.
    if (!requireNamespace("GenomicDistributionsData", quietly=TRUE)) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "and GenomicDistributionsData package is not installed"))
    }

    chromSizesDataset = switch(refAssembly,
        hg38 = GenomicDistributionsData::chromSizes_hg38(),
        mm10 = GenomicDistributionsData::chromSizes_mm10(),
        mm9 = GenomicDistributionsData::chromSizes_mm9())

    return(chromSizesDataset)
}


# Returns built-in TSSs for a given reference assembly
#
# @param refAssembly A string identifier for the reference assembly
# @return The TSS data object for the requested assembly; stops with an
#     error if the assembly is not one of hg19, hg38, mm10 or mm9, or if the
#     GenomicDistributionsData package is needed but not installed.
getTSSs = function(refAssembly) {
    datasetId = paste0("TSS_", refAssembly)

    # hg19 is looked up through the generic reference data getter
    if (refAssembly == "hg19") {
        return(getReferenceData(refAssembly, tagline="TSS_"))
    }

    if (!refAssembly %in% c("hg38", "mm10", "mm9")) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "or GenomicDistributionsData package,",
                   "please use getTssFromGTF() to get list of TSSs."))
    }

    if (!requireNamespace("GenomicDistributionsData", quietly=TRUE)) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "and GenomicDistributionsData package is not installed"))
    }

    TSSs = switch(refAssembly,
        hg38 = GenomicDistributionsData::TSS_hg38(),
        mm10 = GenomicDistributionsData::TSS_mm10(),
        mm9 = GenomicDistributionsData::TSS_mm9())

    return(TSSs)
}


#' Returns built-in gene models for a given reference assembly
#'
#' Some functions require gene models, which can be obtained from any source.
#' This function allows you to retrieve a few common built-in ones.
#' @param refAssembly A string identifier for the reference assembly
#' @return A list containing the gene models corresponding to a
#' specific reference assembly.
#' @export
#' @examples
#' getGeneModels("hg19")
getGeneModels = function(refAssembly) {
    datasetId = paste0("geneModels_", refAssembly)

    # hg19 is looked up through the generic reference data getter
    if (refAssembly == "hg19") {
        return(getReferenceData(refAssembly, tagline="geneModels_"))
    }

    if (!refAssembly %in% c("hg38", "mm10", "mm9")) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "or GenomicDistributionsData package,",
                   "please use getGeneModelsFromGTF() to get",
                   "gene models."))
    }

    # requireNamespace() is the recommended way to test whether a package is
    # usable; scanning the installed.packages() matrix is slow and matches
    # any cell of that matrix, not only package names.
    if (!requireNamespace("GenomicDistributionsData", quietly=TRUE)) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "and GenomicDistributionsData package is not installed"))
    }

    geneModelsDataset = switch(refAssembly,
        hg38 = GenomicDistributionsData::geneModels_hg38(),
        mm10 = GenomicDistributionsData::geneModels_mm10(),
        mm9 = GenomicDistributionsData::geneModels_mm9())

    return(geneModelsDataset)
}

#' Get reference data for a specified assembly
#'
#' This is a generic getter function that will return a data object requested,
#' if it is included in the built-in data with the GenomicDistributions package
#' or GenomicDistributionsData package (if installed).
#' Data objects can
#' be requested for different reference assemblies and data types (specified by
#' a tagline, which is a unique string identifying the data type).
#'
#' @param refAssembly Reference assembly string (e.g. 'hg38')
#' @param tagline The string that was used to identify data of a given type in
#'     the data building step. It's used for the filename so we know
#'     what to load, and is what makes this function generic (so it
#'     can load different data types).
#' @return A requested and included package data object.
getReferenceData = function(refAssembly, tagline) {
    datasetId = paste0(tagline, refAssembly)

    # First choice: data shipped with GenomicDistributions itself
    dataset = .getDataFromPkg(id=datasetId, "GenomicDistributions")
    if (!is.null(dataset))
        return(dataset)

    # Second choice: the optional GenomicDistributionsData package.
    # requireNamespace() is used instead of scanning installed.packages(),
    # which is slow and matches any cell of the returned matrix.
    if (!requireNamespace("GenomicDistributionsData", quietly=TRUE))
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "and GenomicDistributionsData package is not installed"))
    dataset = .getDataFromPkg(id=datasetId, "GenomicDistributionsData")
    if (!is.null(dataset))
        return(dataset)

    stop(paste(datasetId, "not available in GenomicDistributions and",
               "GenomicDistributionsData packages"))
}

# Internal helper: load one dataset from a package's data directory.
#
# @param id Name of the dataset to load.
# @param pkg Name of the package to look in.
# @return The dataset object, or (invisibly) NULL if \code{pkg} lists no
#     dataset called \code{id}.
.getDataFromPkg = function(id, pkg){
    # query available datasets and convert the packageIQR object into a vector
    datasetListIQR = utils::data(package=pkg)
    datasetList = datasetListIQR$results[,"Item"]
    if (id %in% datasetList){
        # Load into this function's own environment, then hand the object back
        utils::data(list=id, package=pkg, envir=environment())
        return(get(id))
    }
    return(invisible(NULL))
}

--------------------------------------------------------------------------------
/R/neighbor-distances.R:
--------------------------------------------------------------------------------



#' Group regions from the same chromosome together and
#' calculate the distances of a region to its upstream and
#' downstream neighboring regions.
#' Distances are then lumped into a numeric vector.
#'
#' @param query A GRanges or GRangesList object.
#' @param correctRef A string indicating the reference genome
#'     to use if distances are corrected for the number of
#'     regions in a regionSet.
#'
#' @return A numeric vector or list with different vectors containing the
#'     distances of regions to their upstream/downstream neighbors.
#' @export
#' @examples
#' dist = calcNeighborDist(vistaEnhancers)
calcNeighborDist = function(query, correctRef="None") {
    .validateInputs(list(query=c("GRanges","GRangesList"),
                         correctRef=c("character")))

    # A GRangesList is processed one region set at a time
    if (is(query, "GRangesList")) {
        perSet = lapply(query, function(regionSet) {
            calcNeighborDist(regionSet, correctRef = correctRef)
        })
        # Unnamed lists get sequential numbers as names
        if (is.null(names(query))) {
            names(perSet) = seq_along(query)
        }
        return(perSet)
    }

    # Sort the regions, split them by chromosome and collect the
    # neighbor distances of every chromosome into one vector
    regionTable = grToDt(sort(query))
    byChrom = splitDataTable(regionTable, "chr")
    allDists = as.vector(unlist(lapply(byChrom, neighbordt)))

    # remove overlaps (reported as distance 0 by neighbordt)
    nonZero = allDists[!(allDists == "0")]

    # If we just want to look at the raw neighbor distances
    if (correctRef == "None") {
        return(nonZero)
    }

    # Correct for number of regions: log10(observed / expected distance)
    genomelen = sum(getChromSizes(correctRef))
    expectedDist = genomelen/nrow(regionTable) - mean(calcWidth(query))
    return(log10(nonZero/expectedDist))
}

#' Internal helper function to calculate distance
#' between neighboring regions.
#'
#' @param querydt A data table with regions grouped according to
#' chromosome.
#' @return A numeric vector with the distances in bp; nothing (NULL) is
#'     returned when fewer than two regions are supplied.
neighbordt = function(querydt) {
    nRegions = nrow(querydt)
    # there should be at least 2 regions for each chr
    if (nRegions > 1) {
        starts = querydt[["start"]]
        ends = querydt[["end"]]
        # Gap between each region and the next one in the table: the bases
        # strictly between the end of region i and the start of region i+1.
        # Computing it from start/end directly (rather than from
        # abs(diff(end)) minus the width) keeps the sign meaningful when a
        # region ends before its predecessor does, i.e. when it is nested.
        distancesVector = starts[-1] - ends[-nRegions] - 1
        # neg values represent overlaps between neighbor regions, set those to 0
        distancesVector[which(distancesVector < 0)] = 0
        return(distancesVector)
    }
}


#' Group regions from the same chromosome together and
#' compute the distance of a region to its nearest neighbor.
#' Distances are then lumped into a numeric vector.
#'
#' @param query A GRanges or GRangesList object.
#' @param correctRef A string indicating the reference genome
#' to use if Nearest neighbor distances are corrected for the
#' number of regions in a regionSet.
#'
#' @return A numeric vector or list of vectors containing the
#' distance of regions to their nearest neighbors.
#' @export
#' @examples
#' Nneighbors = calcNearestNeighbors(vistaEnhancers)
calcNearestNeighbors = function(query, correctRef="None") {
    .validateInputs(list(query=c("GRanges","GRangesList"),
                         correctRef=c("character")))
    # lapply if a GRangeslist is provided
    if (is(query, "GRangesList")) {
        dist = lapply(query,
            function(x){calcNearestNeighbors(x, correctRef = correctRef)})
        # Unnamed lists get sequential numbers as names
        if (is.null(names(query))) {
            names(dist) = seq_along(query)
        }
        return(dist)
    }

    # Raw (uncorrected) upstream/downstream neighbor distances
    # NOTE(review): this vector is lumped across chromosomes and has the
    # zero (overlap) distances removed by calcNeighborDist -- confirm that
    # pairing adjacent entries is the intended definition of "nearest".
    dist = calcNeighborDist(query)

    # Nothing to compare when no neighbor distance is available
    if (length(dist) == 0) {
        return(numeric(0))
    }

    # Calculate nearest neighbors in a vectorized manner: for every interior
    # position take the smaller of the two adjacent distances
    upstream = dist[-length(dist)]
    downstream = dist[-1]
    pairmins = pmin(upstream, downstream)
    # First and last distances are default nearest neighbors
    nNeighbors = c(dist[1], pairmins, dist[length(dist)])

    # Correct for number of regions: log10(observed / expected distance)
    if (!correctRef=="None") {
        chromSizes = getChromSizes(correctRef)
        genomelen = sum(chromSizes)
        meanWidth = mean(calcWidth(query))
        expectedDist = genomelen/length(query) - meanWidth
        correctedDist = log10(nNeighbors/expectedDist)
        return(correctedDist)
    } else {
        return(nNeighbors)
    }
}

#' Plot the distances from regions to their upstream/downstream neighbors
#' or nearest neighbors. Distances can be passed as either raw bp or
#' corrected for the number of regions (log10(obs/exp)), but this has
#' to be specified in the function parameters.
#'
#' @param dcvec A numeric vector or list of vectors containing distances
#' to upstream/downstream neighboring regions or to nearest neighbors.
#' Produced by \code{calcNeighborDist} or \code{calcNearestNeighbors}
#' @param correctedDist A logical indicating if the plot axis should
#' be adjusted to show distances corrected for the number of regions
#' in a regionset.
#' @param Nneighbors A logical indicating whether legend should be adjusted
#' if Nearest neighbors are being plotted. Default legend shows distances
#' to upstream/downstream neighbors.
#'
#' @return A ggplot density object showing the distribution of
#' raw or corrected distances.
#' @export
#' @examples
#' numVector = rnorm(400, mean=5, sd=0.1)
#' d = plotNeighborDist(numVector)
plotNeighborDist = function(dcvec, correctedDist=FALSE,
                            Nneighbors=FALSE) {
    .validateInputs(list(dcvec=c("numeric","list")))
    # if input is list, convert it to a data frame with
    # value and region set name, if input is vector - make a single
    # columns data.frame
    if (is(dcvec, "list")) {
        nameList = names(dcvec)
        # An unnamed list would give rep(NULL, ...) = NULL and therefore no
        # regionSet column at all; fall back to sequential labels instead
        if (is.null(nameList)) {
            nameList = as.character(seq_along(dcvec))
        }
        vectorLengths = lengths(dcvec)
        distReshaped = data.frame(value = unlist(dcvec),
                                  regionSet = rep(nameList, vectorLengths))
        g = ggplot2::ggplot(distReshaped, aes(x=value,
                                              fill=regionSet,
                                              colour=regionSet)) +
            geom_density(alpha=0.4)
    } else {
        distReshaped = data.frame(value = dcvec)
        g = ggplot2::ggplot(distReshaped, aes(x=value)) +
            geom_density()
    }
    if (correctedDist==TRUE) {
        # Corrected distances are already log10(obs/exp); 0 marks obs == exp
        g = g +
            xlab(expression(log[10](over(Obs, Exp)))) +
            geom_vline(xintercept = 0, linetype="dashed") +
            ggtitle("Corrected neighboring regions distance distribution")
    } else {
        # Raw bp distances are shown on a log10 axis with 10^x tick labels
        g = g +
            xlab(expression("bp distance")) +
            scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
                          labels = scales::trans_format("log10",
                                                        scales::math_format(10^.x))) +
            ggtitle("Neighboring regions distance distribution")
    }
    g = g +
        theme_classic() +
        theme(aspect.ratio=1,
              plot.title = element_text(hjust=0.5),
              legend.position = "bottom") +
        theme_blank_facet_label()

    # Adjust legend if plotting nearest neighbors
    if (Nneighbors==TRUE){
        g = g +
            labs(fill="regionSet Nneighbors",
                 colour="regionSet Nneighbors")
    }
    return(g)
}

--------------------------------------------------------------------------------
/R/package.R:
--------------------------------------------------------------------------------
# PACKAGE DOCUMENTATION
#' Produces summaries and plots of features distributed across genomes
#'
#' If you have a set of genomic ranges, the GenomicDistributions R package can
#' help you with some simple visualizations. Currently, it can produce two kinds
#' of plots: First, the chromosome distribution plot, which visualizes how your
#' regions are distributed over chromosomes; and second, the feature
#' distribution plot, which visualizes how your regions are distributed relative
#' to a feature of interest, like Transcription Start Sites (TSSs).
#'
"_PACKAGE"
#' @name GenomicDistributions
#' @author Nathan C.
Sheffield 14 | #' 15 | #' @references \url{http://github.com/databio/GenomicDistributions} 16 | #' @import ggplot2 17 | #' @importFrom GenomicRanges GRanges GRangesList elementMetadata strand 18 | #' seqnames granges makeGRangesFromDataFrame 19 | #' @importFrom data.table ":=" setDT data.table setkey fread setnames 20 | #' setcolorder rbindlist setattr setorder copy is.data.table 21 | #' tstrsplit as.data.table foverlaps 22 | #' @importFrom reshape2 melt 23 | #' @importFrom IRanges IRanges Views 24 | #' @importFrom Biostrings alphabetFrequency 25 | #' @importFrom methods is 26 | #' @importFrom stats chisq.test 27 | #' @importFrom utils installed.packages getAnywhere data globalVariables download.file 28 | 29 | NULL 30 | 31 | # You can either use 'import X' or 'importFrom X abcdefg'. importFrom is 32 | # better practice, but for ggplot2 we were simply importing so many functions 33 | # that it makes sense to just import the whole package 34 | # @importFrom ggplot2 ggplot aes facet_grid geom_jitter geom_line 35 | # geom_bar theme_classic xlab ylab geom_hline ylim 36 | # scale_color_discrete scale_x_discrete scale_y_discrete 37 | # scale_fill_brewer scale_color_manual scale_x_continuous 38 | # ggtitle geom_vline scale_fill_discrete xlim 39 | # scale_color_brewer theme element_blank unit 40 | # element_text geom_density geom_point guides geom_col 41 | # theme_bw scale_fill_manual 42 | 43 | 44 | # Because of some issues with NOTEs on R CMD check and CRAN submission, 45 | # (see here: http://stackoverflow.com/questions/9439256/) 46 | # I have to register stuff used in data.table as non-standard evaluation, 47 | # in order to pass some R CMD check NOTES. 
48 | if(getRversion() >= "2.15.1") { 49 | utils::globalVariables(c( 50 | "cuts", "mid", "J", "chr", "N", "regionID", "x", "name", "BSFilter", 51 | "start", "end", "findOverlaps", "queryHits", "subjectHits", "buildJ", 52 | "seqlengths", "IRanges", "seqlengths", "reduce", "seqlevels", "follow", 53 | "trim", "error", "nlist", "aggregate", "median", "bgDists", "Freq", "bgX", 54 | "bgFreq", "value", "regionSet", "Group.1", "cellType", "spaceLabel", 55 | "signal", "group", "medianBar", "partition", "Freq", "Freq", "cumsize", 56 | "frif", "aggregate", "withinGroupID", "lowerCaseTissue", "boxplot.stats", 57 | "median", "barplot", "legend", "promoters", "seqlevels", "width", 58 | "precede", "elementMetadata", ".N", ".SD", "colorRampPalette", "count", 59 | "countOverlaps", "distance", "elementMetadata<-", "elementNROWS", 60 | "expected", "log10OE", "pintersect", "plot_labels", "query", 61 | "regionGroupID", "seqlevels<-", "size", "tableCount", "V1", "queryPeak", 62 | "xid", "yid", "na.omit", "peakName", "mixedVar", 63 | "cellTypeMetadata", "tissueType", "boxStats", 64 | "tissue", ".", "Percent", "Var1", "maxStart", "start", 65 | "i.start", "minEnd", "i.end", "overlap", "gene_biotype", "dinucleotide", 66 | "frequency", "L1", "V4", "colName", "i", "j", ".x", "lowerColorColumn", 67 | "midpoint", "ubinID", "x.binID", "x.start", "x.end", "FreqPercent", 68 | "Chi.square.pval", "score", "type")) 69 | } 70 | 71 | 72 | -------------------------------------------------------------------------------- /R/qthist.R: -------------------------------------------------------------------------------- 1 | #' Calculate the widths of regions 2 | #' 3 | #' The length of a genomic region (the distance between the start and end) 4 | #' is called the width 5 | #' When given a query set of genomic regions, this function returns the width 6 | #' @param query A GRanges or GRangesList object with query sets 7 | #' @return A vector of the widths (end-start coordinates) of GRanges objects. 
#' @export
#' @examples
#' regWidths = calcWidth(vistaEnhancers)
calcWidth = function(query) {
    # A single region set: report the width of every region
    if (!is(query, "GRangesList")) {
        return(width(query))
    }
    # A GRangesList: recurse over each GRanges object, giving a list
    lapply(query, calcWidth)
}


#' Plot quantile-trimmed histogram
#'
#' Given the results from \code{calcWidth}, plots a histogram with
#' outliers trimmed.
#'
#' x-axis breaks for the frequency calculations are based on the "divisions"
#' results from helper function \code{calcDivisions}.
#'
#' @param x Data values to plot - vector or list of vectors
#' @param EndBarColor Color for the quantile bars on both ends of the graph
#' (optional)
#' @param MiddleBarColor Color for the bars in the middle of the graph
#' (optional)
#' @param quantThresh Quantile of data to be contained in each end bar (optional)
#' quantThresh values must be under .2, optimal size is under .1
#' @param bins The number of bins for the histogram to allocate data to.
#' (optional)
#' @param indep logical value which returns a list of plots that have had their
#' bins calculated independently; the normal version will plot them on the
#' same x and y axis.
#' @param numbers a logical indicating whether the raw numbers should be
#' displayed, rather than percentages (optional).
#' @return A ggplot2 plot object
#' @export
#' @examples
#' regWidths = calcWidth(vistaEnhancers)
#' qtHist = plotQTHist(regWidths)
#' qtHist2 = plotQTHist(regWidths, quantThresh=0.1)
plotQTHist = function(x, EndBarColor = "gray57", MiddleBarColor = "gray27",
    quantThresh=NULL, bins=NULL, indep=FALSE, numbers=FALSE) {
    if (indep) {
        if (is(x, "list") || is(x, "List")) {
            # One independently binned plot per list element. All
            # user-supplied settings are forwarded so that the individual
            # plots honour them too.
            plots = lapply(x, plotQTHist,
                           EndBarColor=EndBarColor,
                           MiddleBarColor=MiddleBarColor,
                           quantThresh=quantThresh, bins=bins,
                           indep=FALSE, numbers=numbers)
            plotNames = names(plots)
            for (i in seq_along(plots)){
                plots[[i]] = plots[[i]] + ggtitle(plotNames[i])
            }
            return(plots)
            # you can use grid.arrange like this to plot these
            # do.call("grid.arrange", x)
        }
    }
    output = calcDivisions(x, quantThresh=quantThresh, bins=bins)
    # if all x are the same - recalculate divisions
    divisionCheck = output[["divisions"]]
    if (length(divisionCheck) > length(unique(divisionCheck))){
        if (length(unique(divisionCheck)) == 3){
            # Only one finite break point left: build a single middle bin
            output[["divisions"]] = c(-Inf, divisionCheck[2],
                                      divisionCheck[2]+1, Inf)
            output[["bins"]] = 1
        } else {
            # Drop repeated break points and recount the middle bins
            output[["divisions"]] = unique(divisionCheck)
            output[["bins"]] = (length(unique(divisionCheck)) - 3)
        }
    }
    if(is(x, "List")){
        x = as.list(x)
    }
    if(is.list(x)){
        nplots = length(x)
    } else {
        nplots = 1
    }

    df = cutDists(x, divisions=output[["divisions"]])
    if ("name" %in% names(df)){
        # Several data sets: percentages are computed within each set
        if (!numbers)
            df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100),
                         by = name]$"Freq.Per"

        g = ggplot(df, aes(x=cuts, y=Freq, fill=name)) +
            facet_wrap(. ~name)
    } else {
        if (!numbers)
            df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100)]$"Freq.Per"
        g = ggplot(df, aes(x=cuts, y=Freq))
    }
    # Create a vector for the colors: end bar, middle bars, end bar,
    # repeated once per plot
    colors_vect = c(EndBarColor ,
        rep(MiddleBarColor, (length(output[["divisions"]])-3)), EndBarColor)
    colors_vect = rep(colors_vect, nplots)

    # Row indices of the first and last bar of every plot (the quantile bars)
    nbars = output[["bins"]]+2
    qbaridx = sort(c(seq(1, nbars*nplots, by=nbars),
                     seq(nbars, nbars*nplots, by=nbars)))

    g = g +
        geom_bar(stat="identity", fill = colors_vect) +
        theme_classic() +
        theme(aspect.ratio=1) +
        theme_blank_facet_label() +
        ylab("Frequency") +
        xlab("") +
        theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust=0.5)) +
        theme(plot.title = element_text(hjust = 0.5)) + # Center title
        ggtitle("Quantile Trimmed Histogram") +
        theme(legend.position="bottom") +
        # Label the quantile bars with the share of data they hold
        geom_text(aes(label= paste((output[["quantile"]]*100),"%", sep='')),
                  data=df[qbaridx,], hjust=-1, angle=90, size=2.5)

    if (!numbers){
        g = g + ylab("Percentage")
    }

    return(g)
}


# Internal helper function for \code{plotQTHist}
#
# If the bins or quantiles for the hist are specified by the user, those are
# used. Otherwise, this function is used to calculate 1) number of bins based
# on size of the dataset, and 2) quantiles based on bins.
#
# @param x A vector of GRanges x.
# @return A list of the divisions that will be used in plotting the histogram.
# @examples
# calcDivisions(runif(500)*1000)
calcDivisions = function(x, bins=NULL, quantThresh = NULL){
    # A list of vectors is pooled into one vector
    values = if (is.list(x)) unlist(x) else x

    # Number of bins: taken from the caller, or derived from the size of
    # the dataset (the size is clamped to the range 500..10000 first)
    if (is.null(bins)) {
        n = min(max(length(values), 500), 10000)
        nBins = round(n^.15 + (n/200))
    } else {
        nBins = bins
    }

    # Quantile held by each end bar: taken from the caller, or derived
    # from the number of bins with a minimum of 1% on each side
    if (is.null(quantThresh)) {
        tailQuantile = max(.01, round(25/(nBins))/100)
    } else {
        if (quantThresh > .2) {
            stop("quantThresh value must be less than .2, Optimal size is under .1")
        }
        tailQuantile = quantThresh
    }

    # Evenly spaced, rounded break points between the two quantiles,
    # framed by open-ended bins on both sides
    bounds = unname(stats::quantile(values,
                                    probs = c(tailQuantile, 1-tailQuantile)))
    innerBreaks = round(seq(bounds[1], bounds[2], length.out = nBins+1))
    list("bins"= nBins, "quantile"= tailQuantile,
         "divisions" = c(-Inf, innerBreaks, Inf))
}

--------------------------------------------------------------------------------
/R/utility.R:
--------------------------------------------------------------------------------
#' Checks class of the list of variables. To be used in functions
#'
#' @param checkList list of object to check, e.g.
#' list(varname=c("data.frame", "numeric")).
#' Multiple strings in the vector are treated as OR.
#' @return A warning if the wrong input class is provided.
#' @examples
#' x = function(var1) {
#'     cl = list(var1=c("numeric","character"))
#'     .validateInputs(cl)
#'     return(var1^2)
#' }
.validateInputs = function(checkList) {
    nms = names(checkList)
    for(i in seq_along(checkList)){
        clss = checkList[[i]]
        # Look the variable up by name in the calling function's frame
        x = get(nms[i], envir=parent.frame(1))
        # The variable passes if it is an instance of any listed class
        matchesAny = any(vapply(clss, function(cls) is(x, cls), logical(1)))
        if(!matchesAny)
            # class(x) can hold several entries; collapse them so that the
            # error is a single readable message
            stop(paste0(nms[i], " must be a ", paste(clss, collapse=" or "),
                        ". Got: ", paste(class(x), collapse=", ")))
    }
}


#' Checks to make sure a package object is installed,
#' and if so, returns it. If the library is not installed, it issues a warning
#' and returns NULL.
#'
#' @param BSgenomeString A BSgenome compatible genome string.
#' @return A BSgenome object if installed.
.requireAndReturn = function(BSgenomeString) {
    if (requireNamespace(BSgenomeString)) {
        # Return the first object found under the same name as the package
        return(utils::getAnywhere(BSgenomeString)$objs[[1]])
    }
    warning(BSgenomeString, " is not installed")
    return(NULL)
}


#' Efficiently split a data.table by a column in the table
#'
#' @param DT Data.table to split
#' @param split_factor Column to split, which can be a character vector
#' or an integer.
#' @return List of data.table objects, split by column
# @examples
# DT = data.table::data.table(letters, grp = rep(c("group1", "group2"), 13))
# splitDataTable(DT, "grp")
# splitDataTable(DT, 2)
splitDataTable = function(DT, split_factor) {
    # Resolve a column index to its name before the column is first used
    if (is.numeric(split_factor)) {
        split_factor = colnames(DT)[split_factor]
        message("Integer split_factor, changed to: ", split_factor)
    }
    splitValues = DT[[split_factor]]
    # Groups are returned in the order in which they first appear in DT
    factor_order = unique(splitValues)
    l = lapply(split(seq_len(nrow(DT)), splitValues),
               function(x) DT[x, ])
    # split() names its groups by the character form of the values, so
    # index by name; raw numeric or factor values would be taken as positions
    return(l[as.character(factor_order)])
}


#' Two utility functions for converting data.tables into GRanges objects
#'
#' @param DT A data.table representing genomic regions.
#' @param chr A string representing the chromosome column.
#' @param start A string representing the name of the start column.
#' @param end A string representing the name of the end column.
#' @param strand A string representing the name of the strand column.
#' @param name A string representing the name of the name column.
#' @param metaCols A string representing the name of the metadata column(s)
#' to include in the returned GRanges object.
#' @return A GRanges object.
dtToGrInternal = function(DT, chr, start, end=NA, strand=NA,
                            name=NA, metaCols=NA) {
    # Without an explicit end column, use a column called "end" if there is
    # one; otherwise build width-1 ranges from the start column.
    if (is.na(end)) {
        if ("end" %in% colnames(DT)) {
            end = "end"
        } else {
            end = start
        }
    }
    if (is.na(strand)) {
        strandValues = "*"
    } else {
        # Normalise on a local copy of the column named by `strand`. DT is
        # never modified, and the column does not need to be called "strand".
        strandValues = as.character(DT[[strand]])
        strandValues[strandValues %in% "1"] = "+"
        strandValues[strandValues %in% "-1"] = "-"
        # GRanges can only handle '*' for no strand, so replace any
        # non-accepted value with '*'
        strandValues[!strandValues %in% c("+", "-")] = "*"
    }
    gr = GRanges(seqnames=DT[[chr]],
                 ranges=IRanges(start=DT[[start]], end=DT[[end]]),
                 strand=strandValues)
    if (!is.na(name)) {
        names(gr) = DT[[name]]
    } else {
        # No name column: number the regions instead.
        names(gr) = seq_along(gr)
    }
    # metaCols may name several columns; the default NA means "none".
    metaCols = metaCols[!is.na(metaCols)]
    for (x in metaCols) {
        elementMetadata(gr)[[x]] = DT[[x]]
    }
    gr
}


#' Converts a data.table (DT) object to a GenomicRanges
#' (GR) object. Tries to be intelligent, guessing chr
#' and start, but you have to supply end or other
#' columns if you want them to be carried into the GR.
#'
#' @param DT A data.table representing genomic regions.
#' @param chr A string representing the chromosome column.
#' @param start A string representing the name of the start column.
#' @param end A string representing the name of the end column.
#' @param strand A string representing the name of the strand column.
#' @param name A string representing the name of the name column.
#' @param splitFactor A string representing the name of the column to use to
#'      split the data.table into multiple data.tables.
#' @param metaCols A string representing the name of the metadata column(s)
#'      to include in the returned GRanges object.
#' @return A GRanges object, or a list of GRanges objects (one per group)
#'      when \code{splitFactor} is given.
#' @export
#' @examples
#' start1 = c(seq(from=1, to = 2001, by = 1000), 800)
#' chrString1 = c(rep("chr1", 3), "chr2")
#' dt = data.table::data.table(chr=chrString1,
#'                             start=start1,
#'                             end=start1 + 250)
#' newGR = dtToGr(dt)
dtToGr = function(DT, chr="chr", start="start", end=NA, strand=NA, name=NA,
                    splitFactor=NA, metaCols=NA) {
    # No grouping requested (the default single NA): one GRanges for all
    # rows. The length check keeps the condition scalar when splitFactor is
    # a grouping vector.
    if (length(splitFactor) == 1 && is.na(splitFactor)) {
        return(dtToGrInternal(DT, chr, start, end, strand, name, metaCols))
    }

    # A single value that names a column means "group by that column";
    # anything else is used directly as the grouping vector.
    if (length(splitFactor) == 1 && splitFactor %in% colnames(DT)) {
        splitFactor = DT[[splitFactor]]
    }

    lapply(split(seq_len(nrow(DT)), splitFactor), function(x) {
        dtToGrInternal(DT[x,], chr, start, end, strand, name, metaCols)
    })
}


#' Convert a GenomicRanges into a data.table.
#'
#' @param GR A Granges object
#' @return A data.table object.
grToDt = function(GR) {
    DF = as.data.frame(elementMetadata(GR))
    if (ncol(DF) > 0) {
        # Carry the metadata columns along after chr/start/end.
        DT = data.table(chr=as.vector(seqnames(GR)),
                        start=start(GR), end=end(GR), DF)
    } else {
        DT = data.table(chr=as.vector(seqnames(GR)),
                        start=start(GR), end=end(GR))
    }
    return(DT)
}


#' Converts a list of data.tables (From BSreadbeds) into GRanges.
#'
#' @param dtList A list of data.tables
#' @return A GRangesList object.
BSdtToGRanges = function(dtList) {
    # One GRanges per input table, returned as a plain unnamed list.
    gList = vector("list", length(dtList))
    for (i in seq_along(dtList)) {
        # Key (sort) the table by chr and start, by reference.
        setkey(dtList[[i]], chr, start)
        dt = dtList[[i]]
        # Start and end are the same coordinate on purpose: end=start+1
        # targeted the CG rather than just the C and caused edge effects
        # when assigning Cs to tiled windows (changed Aug 2014).
        singleBase = IRanges(start=dt$start, end=dt$start)
        gList[[i]] = GRanges(seqnames=dt$chr,
                             ranges=singleBase,
                             strand=rep("*", nrow(dt)),
                             hitCount=dt$hitCount,
                             readCount=dt$readCount)
    }
    return(gList)
}


#' Clear ggplot facet label.
#'
#' Usually ggplot2 facets are labeled with boxes surrounding the label. This
#' function removes the box, so it's a simple label for each facet. The
#' major and minor panel grid lines are blanked as well.
#'
#' @return A ggplot theme
theme_blank_facet_label = function() {
    blankTheme = theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank()
    )
    return(blankTheme)
}


#' Creates labels based on a discretization definition.
#'
#' If you are building a histogram of binned values, you want to have labels for
#' your bins that correspond to the ranges you used to bin. This function takes
#' the breakpoints that define your bins and produces nice-looking labels for
#' your histogram plot.
#'
#' \code{labelCuts} will take a cut group, (e.g., a quantile division of
#' some signal), and give you clean labels (similar to the cut method).
#' @param breakPoints The exact values you want as boundaries for your bins
#' @param round_digits Number of digits to cut round labels to.
#' @param signif_digits Number of significant digits to specify.
#' @param collapse Character to separate the labels
#' @param infBins use >/< as labels on the edge bins
#' @return A vector of histogram axis labels.
# @examples
# labelCuts(seq(0,100,by=20))
labelCuts = function(breakPoints, round_digits=1,
                        signif_digits=3, collapse="-", infBins=FALSE) {
    # One row per bin: column 1 is the lower bound, column 2 the upper bound.
    roundedLabels = signif(round(
        cbind( breakPoints[-length(breakPoints)],breakPoints[-1]),
        round_digits), signif_digits)
    # set the Inf values to NA so formatC can add commas
    is.na(roundedLabels) = vapply(roundedLabels, is.infinite, logical(1))
    labelsWithCommas = formatC(roundedLabels, format="d",
                                big.mark=",")
    labels = apply(labelsWithCommas, 1, paste0, collapse=collapse)
    if (infBins) {
        # Open-ended edge bins: "<=" the second break point for the first
        # bin, ">" the second-to-last break point for the last bin.
        labels[1] = paste0("<=", formatC(breakPoints[2], format="d",
                                            big.mark=","))
        labels[length(labels)] = paste0(">",
                            formatC(breakPoints[length(breakPoints)-1],
                                    format="d", big.mark=","))
    }
    return(labels)
}

#' Nathan's magical named list function.
#' This function is a drop-in replacement for the base list() function,
#' which automatically names your list according to the names of the
#' variables used to construct it.
#' It seamlessly handles lists with some names and others absent,
#' not overwriting specified names while naming any unnamed parameters.
#' Took me awhile to figure this out.
#'
#' @param ... arguments passed to list()
#' @return A named list object.
#' @export
#' @examples
#' x=5
#' y=10
#' nlist(x,y) # returns list(x=5, y=10)
#' list(x,y) # returns unnamed list(5, 10)
nlist = function(...) {
    fcall = match.call(expand.dots=FALSE)
    l = list(...)
    # With no arguments the matched call has no `...` element to read names
    # from; behave like list() and return the empty list.
    if (length(l) == 0) {
        return(l)
    }
    if(!is.null(names(list(...)))) {
        # Some arguments were named explicitly: only fill in the blanks,
        # using the expression each unnamed argument was written as.
        names(l)[names(l) == ""] = fcall[[2]][names(l) == ""]
    } else {
        names(l) = fcall[[2]]
    }
    return(l)
}


-------------------------------------------------------------------------------- /R/zalias.R: --------------------------------------------------------------------------------
featureAggregateDistribution = calcChromBins
plotGenomeAggregate = plotChromBins
TSSDist = calcFeatureDistRefTSS
genomicPartitions = calcPartitionsRef
aggregateOverGenomeBins = calcChromBinsRef
featureDistanceDistribution = calcFeatureDist
assignPartitions = calcPartitions
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | | Master | Dev | 2 | |--------|-----| 3 | |[![Build Status](https://travis-ci.org/databio/GenomicDistributions.svg?branch=master)](https://travis-ci.org/databio/GenomicDistributions) | [![Build Status](https://travis-ci.org/databio/GenomicDistributions.svg?branch=dev)](https://travis-ci.org/databio/GenomicDistributions) | 4 | 5 | 6 | 7 | # GenomicDistributions 8 | 9 | An R package that provides functions for 1) calculating and 2) visualizing a variety of statistics for a collection of genomic ranges. If you have a set of genomic ranges, such as a BED file the GenomicDistributions R package can help you to explore, annotate, visualize,and compare it.
10 | 11 | ## Installing 12 | 13 | ### Main package 14 | 15 | With Bioconductor: 16 | 17 | ```r 18 | if (!requireNamespace("BiocManager", quietly = TRUE)) 19 | install.packages("BiocManager") 20 | BiocManager::install("GenomicDistributions") 21 | ``` 22 | 23 | Or from GitHub: 24 | 25 | ```r 26 | devtools::install_github("databio/GenomicDistributions") 27 | ``` 28 | 29 | ### Data package 30 | 31 | [GenomicDistributionsData](https://github.com/databio/GenomicDistributionsData): includes full data files, too large to include in GenomicDistributions 32 | 33 | 34 | ## Quick start 35 | 36 | See the vignettes for more information: http://code.databio.org/GenomicDistributions 37 | 38 | ## Building long vignettes 39 | 40 | The [long_vignettes](/long_vignettes) folder contains vignettes that require large external data and take a long time to run. Therefore, they should be pre-built. You can render them manually by running [long_vignettes/render-long-vignettes.R](long_vignettes/render-long-vignettes.R). This will use `knitr` to run the vignette and put the result into the `vignettes` folder, along with output figures. 41 | 42 | **Cite GenomicDistributions:**\ 43 | Kupkova K., Mosquera J.V., Smith J.P., Stolarczyk M., Danehy T., Lawson J.T., Rogers S., LeRoy N., Sheffield N.C. GenomicDistributions: fast analysis of genomic intervals with Bioconductor. *BMC Genomics* 23, 299 (2022).
https://doi.org/10.1186/s12864-022-08467-y 44 | -------------------------------------------------------------------------------- /_pkgdown.yaml: -------------------------------------------------------------------------------- 1 | 2 | template: 3 | params: 4 | bootswatch: yeti 5 | 6 | navbar: 7 | left: 8 | - text: Vignettes 9 | icon: fa-play-circle 10 | href: articles/index.html 11 | - text: Documentation 12 | icon: fa-pencil 13 | href: reference/index.html 14 | - text: GitHub 15 | icon: fa-github fa-lg 16 | href: https://github.com/databio/GenomicDistributions 17 | right: 18 | - text: Databio.org 19 | href: http://databio.org 20 | - text: Software & Data 21 | href: http://databio.org/software/ 22 | 23 | articles: 24 | - title: Vignettes 25 | contents: 26 | - intro 27 | - full-power 28 | 29 | -------------------------------------------------------------------------------- /data-raw/TSS_hg19.R: --------------------------------------------------------------------------------
# Build the hg19 TSS object and store it as package data.
library(usethis)
TSS_hg19 = GenomicDistributionsData::buildTSS("hg19")
usethis::use_data(TSS_hg19, overwrite=TRUE)
-------------------------------------------------------------------------------- /data-raw/bedfiles.R: --------------------------------------------------------------------------------
# Import the example BED files shipped in inst/extdata and store each one as
# package data under the file name's stem.
library(usethis)
fileNameList = c("vistaEnhancers.bed.gz", "setB_100.bed.gz")
for (fileName in fileNameList) {
    # Object name = file name up to the first dot.
    storedObjectName = strsplit(fileName, "\\.")[[1]][1]
    x = rtracklayer::import(system.file("extdata", fileName, package = "GenomicDistributions"))
    assign(storedObjectName, x)
    do.call("use_data", list(as.name(storedObjectName), overwrite = TRUE))
    # Drop the loop temporaries; the imported object is bound to `x`.
    rm(x, storedObjectName)
}
-------------------------------------------------------------------------------- /data-raw/chromSizes_hg19.R: --------------------------------------------------------------------------------
# Build the hg19 chromosome sizes object and store it as package data.
library(usethis)
chromSizes_hg19 = GenomicDistributionsData::buildChromSizes("hg19")
usethis::use_data(chromSizes_hg19, overwrite=TRUE)
-------------------------------------------------------------------------------- /data-raw/geneModels_hg19.R: --------------------------------------------------------------------------------
# Build the hg19 gene models object and store it as package data.
library(usethis)
geneModels_hg19 = GenomicDistributionsData::buildGeneModels("hg19")
# Save the object that was just built.
usethis::use_data(geneModels_hg19, overwrite=TRUE)
-------------------------------------------------------------------------------- /data/TSS_hg19.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/TSS_hg19.rda -------------------------------------------------------------------------------- /data/cellTypeMetadata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/cellTypeMetadata.rda -------------------------------------------------------------------------------- /data/chromSizes_hg19.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/chromSizes_hg19.rda -------------------------------------------------------------------------------- /data/datalist: -------------------------------------------------------------------------------- 1 | cellTypeMetadata 2 | chromSizes_hg19 3 | geneModels_hg19 4 | TSS_hg19 5 | exampleOpenSignalMatrix_hg19 6 | vistaEnhancers 7 | setB_100 -------------------------------------------------------------------------------- /data/exampleOpenSignalMatrix_hg19.rda: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/exampleOpenSignalMatrix_hg19.rda -------------------------------------------------------------------------------- /data/geneModels_hg19.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/geneModels_hg19.rda -------------------------------------------------------------------------------- /data/setB_100.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/setB_100.rda -------------------------------------------------------------------------------- /data/vistaEnhancers.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/vistaEnhancers.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite the GenomicDistributions package please use:") 2 | 3 | citEntry(entry="article", 4 | title = "GenomicDistributions: fast analysis of genomic intervals with Bioconductor", 5 | author = personList(as.person("Kristyna Kupkova" ), 6 | as.person("Jose Verdezoto Mosquera"), 7 | as.person("Jason P. Smith" ), 8 | as.person("Michal Stolarczyk"), 9 | as.person("Tessa L. Danehy"), 10 | as.person("John T. Lawson"), 11 | as.person("Bingjie Xue"), 12 | as.person("John T. Stubbs"), 13 | as.person("Nathan LeRoy"), 14 | as.person("Nathan C. 
Sheffield")), 15 | year = 2022, 16 | journal = "BMC Genomics", 17 | doi = "10.1186/S12864-022-08467-Y", 18 | url = "https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-022-08467-y", 19 | textVersion = 20 | "Kupkova K, Verdezoto Mosquera J, Smith JP, et al. (2022) GenomicDistributions: fast analysis of genomic intervals with Bioconductor. BMC Genomics. doi:10.1186/S12864-022-08467-Y") 21 | -------------------------------------------------------------------------------- /inst/extdata/C_elegans_cropped_example.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/C_elegans_cropped_example.fa.gz -------------------------------------------------------------------------------- /inst/extdata/C_elegans_cropped_example.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/C_elegans_cropped_example.gtf.gz -------------------------------------------------------------------------------- /inst/extdata/example_cell_matrix.txt: -------------------------------------------------------------------------------- 1 | V1 astrocyte_of_the_spinal_cord cardiac_fibroblast CD8-positive_xx_alpha-beta_T_cell endothelial_cell_of_umbilical_vein fibroblast_of_dermis fibroblast_of_pulmonary_artery fibroblast_of_skin_of_right_biceps fibroblast_of_the_conjunctiva glomerular_endothelial_cell lung_microvascular_endothelial_cell osteoblast skeletal_muscle_myoblast T-helper_17_cell 2 | chr1_8130458_8130903 0.127 0.0937 0.055 0.0309 0.0315 0.052 0.0735 0.0662 0.068 0.032 0.019 0.1183 0.1115 3 | chr1_8131775_8131925 0.0508 0.0321 0.0683 0.0563 0.0482 0.0452 0.0072 0.0286 0.026 0.0217 0.0378 0.0239 0.0903 4 | chr1_10732275_10732425 0.0236 0.0183 0.056 0.2383 0.0165 0.0194 0.0128 0.0074 0.0263 0.2841 
0.4417 0.012 0.0159 5 | chr1_10732475_10732645 0.0045 0.009 0.0406 0.2067 0.0177 0.0295 0.0192 0.0073 0.0479 0.1875 0.4553 0.0096 0.0103 6 | chr1_10852095_10852245 0.0338 0.0416 0.0397 0.0532 0.0102 0.0587 0.0197 0.0314 0.0718 0.0542 1 0.0308 0.0485 7 | chr1_10925155_10925365 0.0424 0.0181 0.041 0.046 0.0405 0.0263 0.0338 0.0392 0.0129 0.0063 0.09 0.0172 0.0091 8 | chr1_10965595_10965785 0.0606 0.0254 0.0442 0.0111 0.0387 0.0235 0.0116 0.0025 0.0042 0.0277 0.139 0.0299 0.0703 9 | chr1_33722335_33723625 0.5422 0.249 0.6172 0.4543 0.1972 0.3181 0.2779 0.3422 0.2329 0.2895 0.848 0.1536 0.4351 10 | chr1_33723635_33723908 0.0713 0.1258 0.1728 0.1867 0.1114 0.1013 0.0818 0.1358 0.0414 0.1763 0.8794 0.1207 0.0203 11 | chr1_33829095_33829245 0.0482 0.0555 0.0306 0.0228 0.0581 0.0579 0.0183 0.0604 0.0627 0.0468 0.0331 0.0536 0.0244 12 | chr1_33829603_33830118 0.8796 0.1994 0.0491 0.1314 0.1776 0.4936 0.4731 0.7528 0.0333 0.7031 0.0655 0.7978 0.0261 13 | chr1_38495115_38496257 0.3529 0.2145 0.5038 0.5988 0.103 0.254 0.3538 0.249 0.3269 0.2645 0.5682 0.1296 0.7671 14 | chr1_38560483_38560811 0.0367 0.0259 0.0712 0.036 0.0387 0.0279 0.0388 0.0967 0.0279 0.0439 0.0808 0.0283 0.0199 15 | chr1_38561602_38561752 0.0134 0.0591 0.0825 0.0413 0.0352 0.05 0.0377 0.0362 0.0332 0.0492 0.1444 0.0153 0.0307 16 | chr1_38627361_38627549 0.035 0.01 0.0082 0.1264 0.0636 0.0617 0.0151 0.0377 0.074 0.0414 0.0342 0.06 0.0252 17 | chr1_38627916_38627936 0.0575 0.0331 0.0245 0.0254 0.0348 0.0083 0.0303 0.0667 0.0042 0.0115 0.0218 0.0258 0.0489 18 | chr1_38657195_38657424 0.033 0.0414 0.0665 0.0597 0.0496 0.0265 0.007 0.0338 0.0167 0.0301 0.0739 0.0734 0.0036 19 | chr1_38735686_38735990 0.0323 0.126 0.0371 0.038 0.0272 0.0888 0.0311 0.0431 0.0381 0.0394 0.0628 0.0574 0.0779 20 | chr1_38736395_38736545 0.0357 0.0294 0.0624 0.0108 0.0209 0.0433 0.0617 0.0384 0.0042 0.0337 0.0579 0.0123 0.0857 21 | chr1_38791885_38792106 0.0225 0.0261 0.0364 0.0203 0.0521 0.0488 0.0722 0.0371 0.0337 0.0529 0.0818 
0.0714 0.0168 22 | chr1_38793215_38793664 0.0877 0.0631 0.0412 0.0931 0.0602 0.0482 0.0738 0.0417 0.054 0.0574 0.0533 0.0881 0.0321 23 | chr1_38802058_38802430 0.0172 0.0229 0.0269 0.0545 0.0266 0.0303 0.0431 0.0378 0.0156 0.0375 0.0261 0.071 0.0419 24 | chr1_39292040_39292245 0.0748 0.0337 0.0371 0.1152 0.0909 0.0477 0.0804 0.0263 0.0423 0.0402 0.0308 0.1064 0.081 25 | chr1_41711275_41711685 0.1858 0.1828 0.03 0.1141 0.0205 0.2247 0.1125 0.0181 0.0727 0.1484 0.1171 0.2079 0.0736 26 | chr1_44500999_44501245 0.0477 0.053 0.1087 0.0427 0.097 0.0729 0.0673 0.0363 0.035 0.1167 0.1681 0.0785 0.0649 27 | chr1_44990655_44990903 0.0396 0.103 0.0239 0.0611 0.0627 0.1072 0.0356 0.0196 0.0477 0.0132 0.0554 0.1881 0.0526 28 | chr1_51006708_51006761 0.0748 0.0145 0.0397 0.0361 0.0439 0.0197 0.032 0.0626 0.0596 0.0042 0.0971 0.0278 0.0582 29 | chr1_51034675_51034925 0.0574 0.0427 0.063 0.0323 0.014 0.0377 0.0191 0.0303 0.0522 0.0241 0.0096 0.0471 0.0449 30 | chr1_51035725_51036145 1 0.0678 0.0099 0.0299 0.0605 0.0486 0.1065 0.038 0.0335 0.0241 0.0806 0.1441 0.0218 31 | chr1_54924892_54925086 0.0508 0.0874 0.239 0.0851 0.2416 0.0926 0.167 0.0765 0.1011 0.1152 0.3239 0.0705 0.1913 32 | chr1_54926095_54926385 0.0393 0.071 1 0.0448 0.0772 0.0349 0.0669 0.0312 0.0253 0.0522 0.0448 0.0382 1 33 | chr1_54927515_54927665 0.0286 0.0261 0.1449 0.0495 0.0281 0.0539 0.0542 0.0764 0.0203 0.0688 0.4354 0.0308 0.0392 34 | chr1_54928470_54928670 0.0344 0.0436 0.1488 0.1447 0.0355 0.0632 0.0721 0.0308 0.0671 0.0628 0.0437 0.0303 0.0359 35 | chr1_54928798_54929565 0.0429 0.0659 0.1137 0.1724 0.0458 0.0925 0.0976 0.0583 0.1366 0.1063 0.1272 0.0341 0.0416 36 | chr1_59522090_59522425 0.0709 0.1789 0.0638 0.0427 0.4935 0.2201 0.0494 0.4556 0.0306 0.1736 0.0183 0.0638 0.0172 37 | chr1_59522915_59523325 1 1 0.0755 0.1297 1 1 1 1 0.1723 0.3904 0.3319 0.673 0.0498 38 | chr1_59523476_59523490 0.0634 0.1475 0.0812 0.0568 0.1271 0.125 0.1332 0.4081 0.073 0.0984 0.0218 0.0666 0.0098 39 | 
chr1_59523555_59523787 0.0811 0.3627 0.0646 0.0372 0.1693 0.1838 0.2624 0.7117 0.0376 0.0329 0.0457 0.0793 0.013 40 | chr1_60105442_60105459 0.0883 0.0815 0.0271 0.0255 0.0185 0.0122 0.0288 0.0386 0.0211 0.0224 0.0368 0.0143 0.131 41 | chr1_61086832_61087125 0.1248 0.0575 0.013 0.2817 0.0408 0.0812 0.0372 0.0544 0.0679 0.1186 0.0441 0.0371 0.0143 42 | chr1_61087275_61087621 0.4201 0.2548 0.0107 1 0.092 0.6451 0.2189 0.1832 1 1 0.0361 0.109 0.0495 43 | chr1_62045875_62046025 0.0753 0.0682 0.0559 0.0395 0.2332 0.0441 0.0286 0.0511 0.1098 0.0302 0.0527 0.0564 0.0633 44 | chr1_62046195_62046371 0.044 0.0717 0.0448 0.042 0.0798 0.0227 0.0213 0.0354 0.0532 0.0246 0.0212 0.0177 0.0475 45 | chr1_62053295_62053445 0.0311 0.0362 0.0238 0.0459 0.0369 0.0407 0.0372 0.0437 0.0167 0.0454 0.1605 0.0306 0.041 46 | chr1_62055155_62055445 0.2025 0.092 0.0362 0.0478 0.0827 0.051 0.1191 0.2308 0.0113 0.0645 0.0345 0.0487 0.0174 47 | chr1_63370375_63370525 0.0131 0.0301 0.01 0.0352 0.0679 0.0494 0.0426 0.0675 0.017 0.0232 0.0198 0.0073 0.0571 48 | chr1_63443879_63444045 0.0731 0.07 0.0354 0.0203 0.0342 0.0235 0.0206 0.0519 0.0383 0.0748 0.0096 0.0272 0.0349 49 | chr1_63464548_63464705 0.0425 0.0668 0.0077 0.0451 0.0393 0.0296 0.0171 0.0082 0.0651 0.0262 0.0631 0.0111 0.0115 50 | chr1_82663855_82664005 0.0189 0.0207 0.0045 0.025 0.0197 0.0508 0.0207 0.0173 0.01 0.005 0.0096 0.0249 0.0149 51 | chr1_83252855_83253573 0.0437 0.0476 0.0202 0.0774 0.0346 0.0534 0.0509 0.0199 0.0339 0.0275 0.0318 0.0253 0.0543 52 | chr1_83345462_83345963 0.0501 0.0135 0.0276 0.1716 0.1681 0.0398 0.0556 0.0769 0.2686 0.0281 0.0234 0.0092 0.0648 53 | chr1_83346168_83346650 0.0333 0.0523 0.0141 0.0436 0.024 0.0288 0.0162 0.029 0.0642 0.033 0.0096 0.0156 0.0639 54 | chr16_80951515_80951638 0.0208 0.0311 0.0278 0.0441 0.0281 0.0622 0.0533 0.0673 0.0688 0.0545 0.0096 0.0114 0.0118 55 | chr16_80381587_80381737 0.0584 0.0597 0.0088 0.0266 0.0588 0.0405 0.0241 0.0484 0.0042 0.0639 0.0416 0.0498 0.01 56 | 
chr16_79933304_79933525 0.0238 0.0319 0.022 0.0155 0.0232 0.0303 0.0448 0.0339 0.0042 0.0287 0.053 0.0162 0.0106 57 | chr16_79933655_79933805 0.0391 0.0506 0.0267 0.0401 0.0265 0.0629 0.0339 0.1073 0.1709 0.048 0.031 0.0518 0.0141 58 | chr16_79644375_79644525 0.0088 0.0081 0.0746 0.0155 0.0167 0.0371 0.0527 0.0172 0.0998 0.0338 0.0971 0.0544 0.0403 59 | chr16_79510878_79510900 0.04 0.02 0.0625 0.0628 0.0177 0.0481 0.0509 0.0187 0.0042 0.0437 0.0469 0.0549 0.1469 60 | chr16_79511057_79511207 0.0045 0.024 0.057 0.0387 0.0447 0.0057 0.0213 0.0518 0.0109 0.071 0.0433 0.0388 0.084 61 | chr16_79436935_79437295 0.0172 0.0399 0.0645 0.0357 0.0079 0.0392 0.0424 0.0403 0.0406 0.0214 0.0713 0.0336 0.0087 62 | chr16_79437296_79437446 0.0055 0.0373 0.0137 0.034 0.0061 0.0397 0.0293 0.0314 0.0201 0.0472 0.0544 0.0176 0.0223 63 | chr16_79437456_79437606 0.0102 0.0202 0.0301 0.0162 0.0544 0.0537 0.0497 0.043 0.0371 0.0491 0.0428 0.025 0.0184 64 | chr16_79397778_79397928 0.0298 0.0176 0.0683 0.0114 0.0352 0.0321 0.0222 0.0345 0.0246 0.0396 0.0226 0.0508 0.012 65 | chr16_79210975_79211205 0.003 0.0132 0.2555 0.0185 0.0191 0.0429 0.0437 0.0168 0.0116 0.0427 0.062 0.0226 0.045 66 | chr16_79211216_79211366 0.0236 0.0238 0.0602 0.0356 0.0149 0.0195 0.0171 0.0237 0.0086 0.0286 0.0279 0.0125 0.0364 67 | chr16_74363381_74363581 0.1219 0.2504 1 0.2249 0.2187 0.4715 0.2744 0.6045 0.0787 0.7063 0.1307 0.1501 0.6573 68 | chr16_74352560_74352860 0.045 0.0316 0.0537 0.0637 0.0384 0.0485 0.0596 0.0398 0.0871 0.0755 0.0329 0.0334 0.0257 69 | chr16_74353114_74353285 0.0526 0.0772 0.0775 0.0346 0.015 0.075 0.0686 0.0568 0.0591 0.0786 0.0526 0.0403 0.2131 70 | chr16_73981035_73981205 0.0121 0.0131 0.0435 0.0704 0.0409 0.0244 0.0371 0.0173 0.03 0.0191 0.0484 0.0042 0.0463 71 | chr16_73840165_73840315 0.0114 0.025 0.0234 0.0357 0.0105 0.0102 0.0098 0.0269 0.0133 0.0422 0.0199 0.0399 0.013 72 | chr16_73738690_73738840 0.0249 0.0493 0.0291 0.039 0.0213 0.0344 0.034 0.0541 0.0332 0.0335 0.031 0.0398 
0.0207 73 | chr16_73739597_73739747 0.1042 0.0178 0.0171 0.011 0.0562 0.011 0.0096 0.0374 0.0773 0.0313 0.0096 0.0061 0.0369 74 | chr16_73254855_73255062 0.025 0.0995 0.0277 0.0489 0.0161 0.075 0.159 0.1063 0.0202 0.0325 0.1465 0.0792 0.0118 75 | chr16_70680855_70681065 0.2686 0.3397 0.0468 0.0559 0.0797 0.2599 0.0837 0.1962 0.0211 0.1002 0.2779 0.1895 0.0516 76 | chr16_70681455_70681625 0.1126 0.1423 0.0878 0.0454 0.1012 0.1285 0.0665 0.0733 0.0732 0.1139 0.0537 0.1272 0.0304 77 | chr16_65651655_65651905 0.0148 0.0437 0.0351 0.0414 0.0293 0.0465 0.1174 0.0302 0.0265 0.0187 0.0096 0.2493 0.0355 78 | chr16_60575250_60575400 0.0662 0.0353 0.0025 0.0526 0.0173 0.0462 0.0528 0.0211 0.0091 0.0394 0.038 0.0233 0.0436 79 | chr16_60575497_60575647 0.0082 0.029 0.0408 0.0079 0.0512 0.0339 0.0256 0.0196 0.0271 0.0156 0.0096 0.0041 0.0228 80 | chr16_56418106_56418256 0.0162 0.0469 0.0535 0.0092 0.0079 0.0527 0.0271 0.063 0.0217 0.0416 0.076 0.0281 0.0607 81 | chr16_56328929_56329415 0.301 0.1369 0.0468 0.0541 0.0312 0.089 0.1607 0.0236 0.0376 0.0573 0.0751 0.0695 0.036 82 | chr16_56329475_56329625 0.0338 0.0661 0.0535 0.0426 0.0356 0.0369 0.0839 0.0446 0.0253 0.0376 0.164 0.0533 0.0339 83 | chr16_56329755_56330365 0.0719 0.0712 0.0466 0.0697 0.0393 0.0643 0.1246 0.0434 0.0366 0.062 0.0678 0.0611 0.0383 84 | chr16_55784515_55784725 0.0148 0.047 0.0369 0.0489 0.0169 0.0172 0.0167 0.0542 0.0077 0.035 0.0423 0.0236 0.0745 85 | chr16_55619388_55619665 0.0292 0.0266 0.0244 0.1022 0.0329 0.0452 0.0906 0.0281 0.0435 0.0251 0.0188 0.0403 0.0272 86 | chr16_55576385_55576997 0.5997 0.3098 0.0357 0.3485 0.2955 0.3126 0.5383 0.3117 0.4584 0.2641 0.0304 0.2038 0.0502 87 | chr16_55577189_55577339 0.2471 0.057 0.02 0.0642 0.0365 0.063 0.0592 0.0069 0.1024 0.0137 0.0096 0.0495 0.0634 88 | chr16_55577463_55577763 0.4379 0.3725 0.0212 0.3498 0.2549 0.5258 0.2463 0.5622 0.4622 0.2492 0.0242 0.1647 0.04 89 | chr16_55540715_55540865 0.0749 0.0479 0.0155 0.0337 0.0387 0.044 0.0454 0.0397 0.0351 
0.0251 0.0297 0.0123 0.0267 90 | chr16_55143715_55143865 0.026 0.0153 0.0157 0.0193 0.1472 0.0071 0.0509 0.0421 0.0163 0.0083 0.0178 0.0523 0.0326 91 | chr16_54987433_54987665 0.0157 0.0162 0.0468 0.0275 0.0922 0.0331 0.4583 0.0662 0.0418 0.0367 0.0235 0.0184 0.0148 92 | chr16_54948279_54948429 0.0662 0.0338 0.0139 0.0067 0.0301 0.0629 0.0966 0.0619 0.0302 0.0256 0.0096 0.0186 0.0092 93 | chr16_54948995_54949145 0.0125 0.0245 0.0052 0.0206 0.0564 0.0159 0.0827 0.0485 0.0284 0.022 0.0237 0.042 0.0254 94 | chr16_54726535_54726685 0.0148 0.0242 0.0277 0.041 0.0414 0.0253 0.0514 0.0163 0.0095 0.0344 0.0497 0.0317 0.0157 95 | chr16_54578712_54578959 0.0414 0.0523 0.0379 0.0161 0.058 0.0613 0.0325 0.0556 0.0222 0.0442 0.0223 0.0441 0.0384 96 | chr16_54564175_54564374 0.0155 0.076 0.0107 0.0379 0.0993 0.0568 0.0957 0.053 0.0279 0.0186 0.155 0.0923 0.0325 97 | chr16_53557235_53557485 0.044 0.0294 0.1621 0.0438 0.0244 0.0575 0.0238 0.0353 0.0333 0.0556 0.055 0.0498 0.0552 98 | chr16_53551513_53552185 0.013 0.0295 0.2388 0.0815 0.0237 0.0472 0.0392 0.048 0.2182 0.1048 0.0318 0.0341 0.1964 99 | chr16_53503965_53504158 0.05 0.0398 0.145 0.028 0.0251 0.0303 0.0419 0.0427 0.0253 0.0899 0.0096 0.031 0.0427 100 | chr16_53504415_53504565 0.0268 0.0409 0.0437 0.032 0.0236 0.0289 0.0107 0.0455 0.0042 0.0499 0.0198 0.047 1e-04 101 | chr16_53468535_53469618 0.6563 0.5969 1 1 0.4776 0.6457 0.3546 0.5479 1 1 0.5444 0.6336 1 102 | -------------------------------------------------------------------------------- /inst/extdata/setB_100.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/setB_100.bed.gz -------------------------------------------------------------------------------- /inst/extdata/vistaEnhancers.bed.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/vistaEnhancers.bed.gz -------------------------------------------------------------------------------- /long_vignettes/render-long-vignettes.R: -------------------------------------------------------------------------------- 1 | knitr::opts_knit$set(base.dir = 'vignettes/', progress = TRUE, verbose = TRUE) 2 | knitr::opts_chunk$set(fig.path="figures-full-power/") 3 | knitr::knit("long_vignettes/full-power.Rmd", "vignettes/full-power.Rmd") 4 | # knitr::opts_knit$set(base.dir = 'vignettes/', progress = TRUE, verbose = TRUE) 5 | # knitr::opts_chunk$set(fig.path="figures-GDData/") 6 | # knitr::knit("long_vignettes/GenomicDistributionsData.Rmd", "vignettes/GenomicDistributionsData.Rmd") 7 | -------------------------------------------------------------------------------- /man/BSdtToGRanges.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{BSdtToGRanges} 4 | \alias{BSdtToGRanges} 5 | \title{Converts a list of data.tables (From BSreadbeds) into GRanges.} 6 | \usage{ 7 | BSdtToGRanges(dtList) 8 | } 9 | \arguments{ 10 | \item{dtList}{A list of data.tables} 11 | } 12 | \value{ 13 | A GRangesList object. 14 | } 15 | \description{ 16 | Converts a list of data.tables (From BSreadbeds) into GRanges. 
17 | } 18 | -------------------------------------------------------------------------------- /man/GenomicDistributions-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/package.R 3 | \docType{package} 4 | \name{GenomicDistributions-package} 5 | \alias{GenomicDistributions} 6 | \alias{GenomicDistributions-package} 7 | \title{Produces summaries and plots of features distributed across genomes} 8 | \description{ 9 | If you have a set of genomic ranges, the GenomicDistributions R package can 10 | help you with some simple visualizations. Currently, it can produce two kinds 11 | of plots: First, the chromosome distribution plot, which visualizes how your 12 | regions are distributed over chromosomes; and second, the feature 13 | distribution plot, which visualizes how your regions are distributed relative 14 | to a feature of interest, like Transcription Start Sites (TSSs). 15 | } 16 | \seealso{ 17 | Useful links: 18 | \itemize{ 19 | \item \url{http://code.databio.org/GenomicDistributions} 20 | \item Report bugs at \url{http://github.com/databio/GenomicDistributions} 21 | } 22 | 23 | } 24 | \author{ 25 | \strong{Maintainer}: Kristyna Kupkova \email{kristynakupkova@gmail.com} 26 | 27 | Authors: 28 | \itemize{ 29 | \item Jose Verdezoto 30 | \item Tessa Danehy 31 | \item John Lawson 32 | \item Jose Verdezoto 33 | \item Michal Stolarczyk 34 | \item Jason Smith 35 | \item Bingjie Xue 36 | \item Sophia Rogers 37 | \item John Stubbs 38 | \item Nathan C. 
Sheffield \email{nathan@code.databio.org} 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/TSS_hg19.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{TSS_hg19} 5 | \alias{TSS_hg19} 6 | \title{hg19 TSS locations} 7 | \format{ 8 | A named vectors of lengths with one item per chromosome 9 | } 10 | \source{ 11 | EnsDb.Hsapiens.v75 package 12 | } 13 | \usage{ 14 | data(TSS_hg19) 15 | } 16 | \description{ 17 | A dataset containing TSS locations for Homo Sapiens hg19 genome assembly 18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/binBSGenome.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{binBSGenome} 4 | \alias{binBSGenome} 5 | \title{Bins a BSgenome object.} 6 | \usage{ 7 | binBSGenome(genome, binCount) 8 | } 9 | \arguments{ 10 | \item{genome}{A UCSC-style string denoting reference assembly (e.g. 'hg38')} 11 | 12 | \item{binCount}{number of bins per chromosome} 13 | } 14 | \value{ 15 | A data.table object showing the region and bin IDs 16 | of the reference genome. 17 | } 18 | \description{ 19 | Given a BSgenome object (to be loaded via \code{loadBSgenome}), and a number 20 | of bins, this will bin that genome.
It is a simple wrapper of the 21 | \code{binChroms} function 22 | } 23 | \examples{ 24 | \dontrun{ 25 | binCount = 1000 26 | refGenomeBins = binBSGenome("hg19", binCount) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/binChroms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{binChroms} 4 | \alias{binChroms} 5 | \title{Naively splits a chromosome into bins} 6 | \usage{ 7 | binChroms(binCount, chromSizes) 8 | } 9 | \arguments{ 10 | \item{binCount}{number of bins (total; *not* per chromosome)} 11 | 12 | \item{chromSizes}{a named list of size (length) for each chromosome.} 13 | } 14 | \value{ 15 | A data.table object assigning a bin ID to each chromosome region. 16 | } 17 | \description{ 18 | Given a list of chromosomes with corresponding sizes, this script will 19 | produce (roughly) evenly-sized bins across the chromosomes. It does not 20 | account for assembly gaps or the like. 21 | } 22 | \examples{ 23 | chromSizes = c(chr1=249250621, chr2=243199373, chr3=198022430) 24 | cBins = binChroms(1000, chromSizes) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/binRegion.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{binRegion} 4 | \alias{binRegion} 5 | \title{Divide regions into roughly equal bins} 6 | \usage{ 7 | binRegion(start, end, binSize = NULL, binCount = NULL, indicator = NULL) 8 | } 9 | \arguments{ 10 | \item{start}{The starting coordinate} 11 | 12 | \item{end}{The ending coordinate} 13 | 14 | \item{binSize}{The size of bin to divide the genome into. 
You must supply 15 | either binSize (priority) or binCount.} 16 | 17 | \item{binCount}{The number of bins to divide. If you do not supply binSize, 18 | you must supply binCount, which will be used to calculate the binSize.} 19 | 20 | \item{indicator}{A vector with identifiers to keep with your bins, in case 21 | you are doing this on a long table with multiple segments concatenated} 22 | } 23 | \value{ 24 | A data.table, expanded to nrow = number of bins, with these id columns: 25 | id: region ID 26 | binID: repeating ID (this is the value to aggregate across) 27 | ubinID: unique bin IDs 28 | } 29 | \description{ 30 | Given a start coordinate, end coordinate, and number of bins to divide, 31 | this function will split the regions into that many bins. 32 | Bins will be only approximately the same size, due to rounding. 33 | (they should not be more than 1 different). 34 | } 35 | \details{ 36 | Use case: take a set of regions, like CG islands, and bin them; now you can 37 | aggregate signal scores across the bins, giving you an aggregate signal 38 | in bins across many regions of the same type. 39 | 40 | In theory, this just runs on 3 values, but you can run it inside a 41 | data.table j expression to divide a bunch of regions in the same way. 
42 | } 43 | \examples{ 44 | Rbins = binRegion(1, 3000, 100, 1000) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /man/calcChromBins.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{calcChromBins} 4 | \alias{calcChromBins} 5 | \title{Calculates the distribution of a query set over the genome} 6 | \usage{ 7 | calcChromBins(query, bins) 8 | } 9 | \arguments{ 10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 11 | 12 | \item{bins}{Pre-computed bins (as a GRangesList object) to aggregate 13 | over; for example, these could be genome bins} 14 | } 15 | \value{ 16 | A data.table showing where on which chromosomes 17 | ranges are distributed. 18 | } 19 | \description{ 20 | Returns a data.table showing counts of regions from the query that overlap 21 | with each bin. 22 | In other words, where on which chromosomes are the ranges distributed? 23 | You must provide binned regions. Only the midpoint of each query region is 24 | used to test for overlap with the bin regions. 
25 | } 26 | \examples{ 27 | 28 | chromSizes = getChromSizes("hg19") 29 | genomeBins = getGenomeBins(chromSizes) 30 | chromDistribution = calcChromBins(vistaEnhancers, genomeBins) 31 | 32 | vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000) 33 | vistaSftd2 = GenomicRanges::shift(vistaEnhancers, 200000) 34 | calcChromBins(vistaEnhancers, GRangesList(vistaSftd, vistaSftd2)) 35 | } 36 | -------------------------------------------------------------------------------- /man/calcChromBinsRef.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{calcChromBinsRef} 4 | \alias{calcChromBinsRef} 5 | \title{Returns the distribution of query over a reference assembly 6 | Given a query set of elements (a GRanges object) and a reference assembly 7 | (*e.g.* 'hg38'), this will aggregate and count the distribution of the query 8 | elements across bins of the reference genome. This is a helper function to 9 | create features for common genomes. It is a wrapper of 10 | \code{calcChromBins}, which is more general.} 11 | \usage{ 12 | calcChromBinsRef(query, refAssembly, binCount = 3000) 13 | } 14 | \arguments{ 15 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 16 | 17 | \item{refAssembly}{A character vector that will be used to grab chromosome 18 | sizes with \code{getChromSizes}} 19 | 20 | \item{binCount}{Number of bins to divide the chromosomes into} 21 | } 22 | \value{ 23 | A data.table showing the distribution of regions across bins of the 24 | reference genome. 25 | } 26 | \description{ 27 | Returns the distribution of query over a reference assembly 28 | Given a query set of elements (a GRanges object) and a reference assembly 29 | (*e.g.* 'hg38'), this will aggregate and count the distribution of the query 30 | elements across bins of the reference genome.
This is a helper function to 31 | create features for common genomes. It is a wrapper of 32 | \code{calcChromBins}, which is more general. 33 | } 34 | \examples{ 35 | ChromBins = calcChromBinsRef(vistaEnhancers, "hg19") 36 | } 37 | -------------------------------------------------------------------------------- /man/calcChromBinsRefSlow.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{calcChromBinsRefSlow} 4 | \alias{calcChromBinsRefSlow} 5 | \title{Returns the distribution of query over a reference assembly 6 | Given a query set of elements (a GRanges object) and a reference assembly 7 | (*e.g.* 'hg38'), this will aggregate and count the distribution of the query 8 | elements across bins of the reference genome. This is a helper function to 9 | create features for common genomes. It is a wrapper of 10 | \code{calcChromBins}, which is more general.} 11 | \usage{ 12 | calcChromBinsRefSlow(query, refAssembly, binCount = 3000) 13 | } 14 | \arguments{ 15 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 16 | 17 | \item{refAssembly}{A character vector that will be used to grab chromosome 18 | sizes with \code{getChromSizes}} 19 | 20 | \item{binCount}{Number of bins to divide the chromosomes into} 21 | } 22 | \value{ 23 | A data.table showing the distribution of regions across bins of the 24 | reference genome. 25 | } 26 | \description{ 27 | Returns the distribution of query over a reference assembly 28 | Given a query set of elements (a GRanges object) and a reference assembly 29 | (*e.g.* 'hg38'), this will aggregate and count the distribution of the query 30 | elements across bins of the reference genome. This is a helper function to 31 | create features for common genomes. It is a wrapper of 32 | \code{calcChromBins}, which is more general.
33 | } 34 | \examples{ 35 | ChromBins = calcChromBinsRef(vistaEnhancers, "hg19") 36 | } 37 | -------------------------------------------------------------------------------- /man/calcCumulativePartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{calcCumulativePartitions} 4 | \alias{calcCumulativePartitions} 5 | \title{Calculates the cumulative distribution of overlaps between query and 6 | arbitrary genomic partitions} 7 | \usage{ 8 | calcCumulativePartitions(query, partitionList, remainder = "intergenic") 9 | } 10 | \arguments{ 11 | \item{query}{GRanges or GRangesList with regions to classify.} 12 | 13 | \item{partitionList}{An ORDERED and NAMED list of genomic partitions 14 | GRanges. This list must be in priority order; the input will be assigned 15 | to the first partition it overlaps.} 16 | 17 | \item{remainder}{Which partition do you want to account for 'everything 18 | else'?} 19 | } 20 | \value{ 21 | A data.frame assigning each element of a GRanges object to a 22 | partition from a previously provided partitionList. 23 | } 24 | \description{ 25 | Takes a GRanges object, then assigns each element to a partition from the 26 | provided partitionList, and then tallies the number of regions assigned to 27 | each partition. A typical example of partitions is promoter, exon, intron, 28 | etc; this function will yield the number of each for a query GRanges object 29 | There will be a priority order to these, to account for regions that may 30 | overlap multiple genomic partitions. 
31 | } 32 | \examples{ 33 | partitionList = genomePartitionList(geneModels_hg19$genesGR, 34 | geneModels_hg19$exonsGR, 35 | geneModels_hg19$threeUTRGR, 36 | geneModels_hg19$fiveUTRGR) 37 | calcCumulativePartitions(vistaEnhancers, partitionList) 38 | } 39 | -------------------------------------------------------------------------------- /man/calcCumulativePartitionsRef.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{calcCumulativePartitionsRef} 4 | \alias{calcCumulativePartitionsRef} 5 | \title{Calculates the cumulative distribution of overlaps for a query set to a 6 | reference assembly} 7 | \usage{ 8 | calcCumulativePartitionsRef(query, refAssembly) 9 | } 10 | \arguments{ 11 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 12 | 13 | \item{refAssembly}{A character vector specifying the reference genome 14 | assembly (*e.g.* 'hg19'). This will be used to grab chromosome sizes 15 | with \code{getTSSs}.} 16 | } 17 | \value{ 18 | A data.frame indicating the number of query region overlaps in 19 | several genomic partitions. 20 | } 21 | \description{ 22 | This function is a wrapper for \code{calcCumulativePartitions} that uses 23 | built-in partitions for a given reference genome assembly. 
24 | } 25 | \examples{ 26 | calcCumulativePartitionsRef(vistaEnhancers, "hg19") 27 | } 28 | -------------------------------------------------------------------------------- /man/calcDinuclFreq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/content-plots.R 3 | \name{calcDinuclFreq} 4 | \alias{calcDinuclFreq} 5 | \title{Calculate Dinucleotide content over genomic ranges} 6 | \usage{ 7 | calcDinuclFreq(query, ref, rawCounts = FALSE) 8 | } 9 | \arguments{ 10 | \item{query}{A GRanges object with query sets} 11 | 12 | \item{ref}{Reference genome BSgenome object} 13 | 14 | \item{rawCounts}{a logical indicating whether the raw numbers should be 15 | displayed, rather than percentages (optional).} 16 | } 17 | \value{ 18 | A data.table with counts of dinucleotides across the GRanges object 19 | } 20 | \description{ 21 | Given a reference genome (BSgenome object) and ranges on the 22 | reference, this function returns a data.table with 23 | counts of dinucleotides within the GRanges object. 24 | } 25 | \examples{ 26 | \dontrun{ 27 | bsg = loadBSgenome('hg19') 28 | DNF = calcDinuclFreq(vistaEnhancers, bsg) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/calcDinuclFreqRef.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/content-plots.R 3 | \name{calcDinuclFreqRef} 4 | \alias{calcDinuclFreqRef} 5 | \title{Calculate dinucleotide content over genomic ranges} 6 | \usage{ 7 | calcDinuclFreqRef(query, refAssembly, rawCounts = FALSE) 8 | } 9 | \arguments{ 10 | \item{query}{A GRanges object with query sets} 11 | 12 | \item{refAssembly}{A character vector specifying the reference genome 13 | assembly (*e.g.* 'hg19').
This will be used to grab chromosome sizes with 14 | \code{getTSSs}.} 15 | 16 | \item{rawCounts}{a logical indicating whether the raw numbers should be 17 | displayed, rather than percentages (optional).} 18 | } 19 | \value{ 20 | A data.table with dinucleotide content (percentages, or raw counts if 21 | rawCounts=TRUE) of the query regions. 22 | } 23 | \description{ 24 | Given a reference genome (BSgenome object) and ranges on the 25 | reference, this function returns a data.table with 26 | counts of dinucleotides within the GRanges object. 27 | } 28 | \examples{ 29 | \dontrun{ 30 | query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") 31 | GRquery = rtracklayer::import(query) 32 | refAssembly = 'hg19' 33 | DNF = calcDinuclFreqRef(GRquery, refAssembly) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /man/calcExpectedPartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{calcExpectedPartitions} 4 | \alias{calcExpectedPartitions} 5 | \title{Calculates expected partition overlap based on contribution of each 6 | feature (partition) to genome size. Expected and observed overlaps 7 | are then compared.} 8 | \usage{ 9 | calcExpectedPartitions( 10 | query, 11 | partitionList, 12 | genomeSize = NULL, 13 | remainder = "intergenic", 14 | bpProportion = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{query}{GRanges or GRangesList with regions to classify.} 19 | 20 | \item{partitionList}{An ORDERED (if bpProportion=FALSE) and NAMED 21 | list of genomic partitions GRanges. This list must be in 22 | priority order; the input will be assigned 23 | to the first partition it overlaps. However, if bpProportion=TRUE, 24 | the list does not need ordering.} 25 | 26 | \item{genomeSize}{The number of bases in the query genome.
In other words, 27 | the sum of all chromosome sizes.} 28 | 29 | \item{remainder}{Which partition do you want to account for 'everything 30 | else'?} 31 | 32 | \item{bpProportion}{logical indicating if overlaps should be calculated based 33 | on number of base pairs overlapping with each partition. 34 | bpProportion=FALSE does overlaps in priority order, 35 | bpProportion=TRUE counts number of overlapping 36 | base pairs between query and each partition.} 37 | } 38 | \value{ 39 | A data.frame assigning each element of a GRanges object to a 40 | partition from a previously provided partitionList. The data.frame also 41 | contains Chi-square p-values calculated for observed/expected 42 | overlaps on each individual partition. 43 | } 44 | \description{ 45 | Calculates expected partition overlap based on contribution of each 46 | feature (partition) to genome size. Expected and observed overlaps 47 | are then compared. 48 | } 49 | \examples{ 50 | partitionList = genomePartitionList(geneModels_hg19$genesGR, 51 | geneModels_hg19$exonsGR, 52 | geneModels_hg19$threeUTRGR, 53 | geneModels_hg19$fiveUTRGR) 54 | chromSizes = getChromSizes('hg19') 55 | genomeSize = sum(chromSizes) 56 | calcExpectedPartitions(vistaEnhancers, partitionList, genomeSize) 57 | } 58 | -------------------------------------------------------------------------------- /man/calcExpectedPartitionsRef.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{calcExpectedPartitionsRef} 4 | \alias{calcExpectedPartitionsRef} 5 | \title{Calculates the distribution of observed versus expected overlaps for a 6 | query set to a reference assembly} 7 | \usage{ 8 | calcExpectedPartitionsRef(query, refAssembly, bpProportion = FALSE) 9 | } 10 | \arguments{ 11 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 12 | 13 | \item{refAssembly}{A character
vector specifying the reference genome 14 | assembly (*e.g.* 'hg19'). This will be used to grab annotation 15 | models with \code{getGeneModels}, and chromosome sizes with \code{getChromSizes}} 16 | 17 | \item{bpProportion}{logical indicating if overlaps should be calculated based 18 | on number of base pairs overlapping with each partition. 19 | bpProportion=FALSE does overlaps in priority order, 20 | bpProportion=TRUE counts number of overlapping 21 | base pairs between query and each partition.} 22 | } 23 | \value{ 24 | A data.frame indicating the number of query region overlaps in 25 | several genomic partitions. 26 | } 27 | \description{ 28 | This function is a wrapper for \code{calcExpectedPartitions} that uses 29 | built-in partitions for a given reference genome assembly. 30 | } 31 | \examples{ 32 | calcExpectedPartitionsRef(vistaEnhancers, "hg19") 33 | } 34 | -------------------------------------------------------------------------------- /man/calcFeatureDist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/feature-plots.R 3 | \name{calcFeatureDist} 4 | \alias{calcFeatureDist} 5 | \title{Find the distance to the nearest genomic feature} 6 | \usage{ 7 | calcFeatureDist(query, features) 8 | } 9 | \arguments{ 10 | \item{query}{A GRanges or GRangesList object with query sets} 11 | 12 | \item{features}{A GRanges object with features to test distance to} 13 | } 14 | \value{ 15 | A vector of genomic distances for each query region relative to its 16 | closest feature. 17 | } 18 | \description{ 19 | For a given query set of genomic regions, and a given feature set of 20 | regions, this function will return the distance for each query region to its 21 | closest feature.
It ignores strand and returns the distance as positive or 22 | negative, depending on whether the feature is upstream or downstream 23 | } 24 | \details{ 25 | This function is similar to the bioconductor distanceToNearest function, but 26 | returns negative values for downstream distances instead of absolute values. 27 | This allows you to assess the relative location. 28 | } 29 | \examples{ 30 | vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000) 31 | calcFeatureDist(vistaEnhancers, vistaSftd) 32 | } 33 | -------------------------------------------------------------------------------- /man/calcFeatureDistRefTSS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/feature-plots.R 3 | \name{calcFeatureDistRefTSS} 4 | \alias{calcFeatureDistRefTSS} 5 | \title{Calculates the distribution of distances from a query set to closest TSS} 6 | \usage{ 7 | calcFeatureDistRefTSS(query, refAssembly) 8 | } 9 | \arguments{ 10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 11 | 12 | \item{refAssembly}{A character vector specifying the reference genome 13 | assembly (*e.g.* 'hg19'). This will be used to grab chromosome sizes with 14 | \code{getTSSs}.} 15 | } 16 | \value{ 17 | A vector of distances for each query region relative to TSSs. 18 | } 19 | \description{ 20 | Given a query GRanges object and an assembly string, this function will grab 21 | the TSS list for the given reference assembly and then calculate the distance 22 | from each query feature to the closest TSS. 
It is a wrapper of 23 | \code{calcFeatureDist} that uses built-in TSS features for a reference 24 | assembly 25 | } 26 | \examples{ 27 | calcFeatureDistRefTSS(vistaEnhancers, "hg19") 28 | } 29 | -------------------------------------------------------------------------------- /man/calcGCContent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/content-plots.R 3 | \name{calcGCContent} 4 | \alias{calcGCContent} 5 | \title{Calculate GC content over genomic ranges} 6 | \usage{ 7 | calcGCContent(query, ref) 8 | } 9 | \arguments{ 10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions.} 11 | 12 | \item{ref}{Reference genome BSgenome object.} 13 | } 14 | \value{ 15 | A numeric vector or list of vectors with the GC percentage of 16 | the query regions. 17 | } 18 | \description{ 19 | Given a reference genome as a BSgenome object and some ranges on that 20 | reference, this function will return a vector of the same length as the 21 | granges object, with percent of Cs and Gs. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | bsg = loadBSgenome('hg19') 26 | gcvec = calcGCContent(vistaEnhancers, bsg) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/calcGCContentRef.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/content-plots.R 3 | \name{calcGCContentRef} 4 | \alias{calcGCContentRef} 5 | \title{Calculate GC content over genomic ranges} 6 | \usage{ 7 | calcGCContentRef(query, refAssembly) 8 | } 9 | \arguments{ 10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 11 | 12 | \item{refAssembly}{A character vector specifying the reference genome 13 | assembly (*e.g.* 'hg19').
This will be used to grab chromosome sizes with 14 | \code{getTSSs}.} 15 | } 16 | \value{ 17 | A numeric vector or list of vectors with the GC percentage of 18 | the query regions. 19 | } 20 | \description{ 21 | Given a reference genome as a BSgenome object and some ranges on that 22 | reference, this function will return a vector of the same length as the 23 | granges object, with percent of Cs and Gs. 24 | } 25 | \examples{ 26 | \dontrun{ 27 | refAssembly = 'hg19' 28 | GCcontent = calcGCContentRef(vistaEnhancers, refAssembly) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/calcNearestNeighbors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/neighbor-distances.R 3 | \name{calcNearestNeighbors} 4 | \alias{calcNearestNeighbors} 5 | \title{Group regions from the same chromosome together and 6 | compute the distance of a region to its nearest neighbor. 7 | Distances are then lumped into a numeric vector.} 8 | \usage{ 9 | calcNearestNeighbors(query, correctRef = "None") 10 | } 11 | \arguments{ 12 | \item{query}{A GRanges or GRangesList object.} 13 | 14 | \item{correctRef}{A string indicating the reference genome 15 | to use if Nearest neighbor distances are corrected for the 16 | number of regions in a regionSet.} 17 | } 18 | \value{ 19 | A numeric vector or list of vectors containing the 20 | distance of regions to their nearest neighbors. 21 | } 22 | \description{ 23 | Group regions from the same chromosome together and 24 | compute the distance of a region to its nearest neighbor. 25 | Distances are then lumped into a numeric vector. 
26 | } 27 | \examples{ 28 | Nneighbors = calcNearestNeighbors(vistaEnhancers) 29 | } 30 | -------------------------------------------------------------------------------- /man/calcNeighborDist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/neighbor-distances.R 3 | \name{calcNeighborDist} 4 | \alias{calcNeighborDist} 5 | \title{Group regions from the same chromosome together and 6 | calculate the distances of a region to its upstream and 7 | downstream neighboring regions. 8 | Distances are then lumped into a numeric vector.} 9 | \usage{ 10 | calcNeighborDist(query, correctRef = "None") 11 | } 12 | \arguments{ 13 | \item{query}{A GRanges or GRangesList object.} 14 | 15 | \item{correctRef}{A string indicating the reference genome 16 | to use if distances are corrected for the number of 17 | regions in a regionSet.} 18 | } 19 | \value{ 20 | A numeric vector or list with different vectors containing the 21 | distances of regions to their upstream/downstream neighbors. 22 | } 23 | \description{ 24 | Group regions from the same chromosome together and 25 | calculate the distances of a region to its upstream and 26 | downstream neighboring regions. 27 | Distances are then lumped into a numeric vector. 
28 | } 29 | \examples{ 30 | dist = calcNeighborDist(vistaEnhancers) 31 | } 32 | -------------------------------------------------------------------------------- /man/calcPartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{calcPartitions} 4 | \alias{calcPartitions} 5 | \title{Calculates the distribution of overlaps between 6 | query and arbitrary genomic partitions} 7 | \usage{ 8 | calcPartitions( 9 | query, 10 | partitionList, 11 | remainder = "intergenic", 12 | bpProportion = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{query}{GRanges or GRangesList with regions to classify} 17 | 18 | \item{partitionList}{an ORDERED (if bpProportion=FALSE) and NAMED list of 19 | genomic partitions GRanges. This list must be in priority order; the 20 | input will be assigned to the first partition it overlaps. 21 | If bpProportion=TRUE, the list does not need ordering.} 22 | 23 | \item{remainder}{A character vector to assign any query regions that do 24 | not overlap with anything in the partitionList. Defaults to "intergenic"} 25 | 26 | \item{bpProportion}{logical indicating if overlaps should be calculated based 27 | on number of base pairs overlapping with each partition. 28 | bpProportion=FALSE does overlaps in priority order, 29 | bpProportion=TRUE counts number of overlapping 30 | base pairs between query and each partition.} 31 | } 32 | \value{ 33 | A data.frame assigning each element of a GRanges object to a 34 | partition from a previously provided partitionList. 35 | } 36 | \description{ 37 | Takes a GRanges object, then assigns each element to a partition from the 38 | provided partitionList, and then tallies the number of regions assigned to 39 | each partition.
A typical example of partitions is promoter, exon, intron, 40 | etc; this function will yield the number of each for a query GRanges object 41 | There will be a priority order to these, to account for regions that may 42 | overlap multiple genomic partitions. 43 | } 44 | \examples{ 45 | partitionList = genomePartitionList(geneModels_hg19$genesGR, 46 | geneModels_hg19$exonsGR, 47 | geneModels_hg19$threeUTRGR, 48 | geneModels_hg19$fiveUTRGR) 49 | calcPartitions(vistaEnhancers, partitionList) 50 | } 51 | -------------------------------------------------------------------------------- /man/calcPartitionsRef.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{calcPartitionsRef} 4 | \alias{calcPartitionsRef} 5 | \title{Calculates the distribution of overlaps for a query set to a reference 6 | assembly} 7 | \usage{ 8 | calcPartitionsRef(query, refAssembly, bpProportion = FALSE) 9 | } 10 | \arguments{ 11 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions} 12 | 13 | \item{refAssembly}{A character vector specifying the reference genome 14 | assembly (*e.g.* 'hg19'). This will be used to grab annotation 15 | models with \code{getGeneModels}} 16 | 17 | \item{bpProportion}{logical indicating if overlaps should be calculated 18 | based on number of base pairs overlapping with each partition. 19 | bpProportion=FALSE does overlaps in priority order, 20 | bpProportion=TRUE counts number of overlapping 21 | base pairs between query and each partition.} 22 | } 23 | \value{ 24 | A data.frame indicating the number of query region overlaps in 25 | several genomic partitions. 26 | } 27 | \description{ 28 | This function is a wrapper for \code{calcPartitions} 29 | and \code{calcPartitionPercents} that uses built-in 30 | partitions for a given reference genome assembly. 
31 | } 32 | \examples{ 33 | calcPartitionsRef(vistaEnhancers, "hg19") 34 | } 35 | -------------------------------------------------------------------------------- /man/calcSummarySignal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/specificity-plots.R 3 | \name{calcSummarySignal} 4 | \alias{calcSummarySignal} 5 | \title{The function calcSummarySignal takes the input BED file(s) 6 | in form of GRanges or GRangesList object, overlaps 7 | it with all defined open chromatin regions across 8 | conditions (e.g. cell types) and returns a matrix, 9 | where each row is the input genomic region 10 | (if overlap was found), each column is a condition, 11 | and the value is a mean signal from regions where 12 | overlap was found.} 13 | \usage{ 14 | calcSummarySignal(query, signalMatrix) 15 | } 16 | \arguments{ 17 | \item{query}{Genomic regions to be analyzed. Can be GRanges or GRangesList 18 | object.} 19 | 20 | \item{signalMatrix}{Matrix with signal values in predefined regions, where 21 | rows are predefined genomic regions, columns are conditions 22 | (e.g. cell types in which the signal was measured). 23 | First column contains information about the genomic region in 24 | following form: chr_start_end. 25 | Can be either data.frame or data.table object.} 26 | } 27 | \value{ 28 | A list with named components: 29 | signalSummaryMatrix - data.table with cell specific open chromatin signal 30 | values for query regions 31 | matrixStats - data.frame containing boxplot stats for individual 32 | cell type 33 | } 34 | \description{ 35 | The function calcSummarySignal takes the input BED file(s) 36 | in form of GRanges or GRangesList object, overlaps 37 | it with all defined open chromatin regions across 38 | conditions (e.g.
cell types) and returns a matrix, 39 | where each row is the input genomic region 40 | (if overlap was found), each column is a condition, 41 | and the value is a mean signal from regions where 42 | overlap was found. 43 | } 44 | \examples{ 45 | signalSummaryList = calcSummarySignal(vistaEnhancers, exampleOpenSignalMatrix_hg19) 46 | } 47 | -------------------------------------------------------------------------------- /man/calcWidth.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qthist.R 3 | \name{calcWidth} 4 | \alias{calcWidth} 5 | \title{Calculate the widths of regions} 6 | \usage{ 7 | calcWidth(query) 8 | } 9 | \arguments{ 10 | \item{query}{A GRanges or GRangesList object with query sets} 11 | } 12 | \value{ 13 | A vector of the widths (end-start coordinates) of GRanges objects. 14 | } 15 | \description{ 16 | The length of a genomic region (the distance between the start and end) 17 | is called the width. 18 | When given a query set of genomic regions, this function returns the width 19 | } 20 | \examples{ 21 | regWidths = calcWidth(vistaEnhancers) 22 | } 23 | -------------------------------------------------------------------------------- /man/cellTypeMetadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cellTypeMetadata} 5 | \alias{cellTypeMetadata} 6 | \title{Table that maps cell types to tissues and groups} 7 | \format{ 8 | data.table with 3 columns (cellType, tissue and group) 9 | and 74 rows (one per cellType) 10 | } 11 | \source{ 12 | self-curated dataset 13 | } 14 | \usage{ 15 | data(cellTypeMetadata) 16 | } 17 | \description{ 18 | Table that maps cell types to tissues and groups 19 | } 20 | \keyword{datasets} 21 | 
-------------------------------------------------------------------------------- /man/chromSizes_hg19.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{chromSizes_hg19} 5 | \alias{chromSizes_hg19} 6 | \title{hg19 chromosome sizes} 7 | \format{ 8 | A named vector of lengths with one item per chromosome 9 | } 10 | \source{ 11 | BSgenome.Hsapiens.UCSC.hg19 package 12 | } 13 | \usage{ 14 | data(chromSizes_hg19) 15 | } 16 | \description{ 17 | A dataset containing chromosome sizes for Homo Sapiens hg19 genome assembly 18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/dot-requireAndReturn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{.requireAndReturn} 4 | \alias{.requireAndReturn} 5 | \title{Checks to make sure a package object is installed, 6 | and if so, returns it. If the library is not installed, it issues a warning 7 | and returns NULL.} 8 | \usage{ 9 | .requireAndReturn(BSgenomeString) 10 | } 11 | \arguments{ 12 | \item{BSgenomeString}{A BSgenome compatible genome string.} 13 | } 14 | \value{ 15 | A BSgenome object if installed. 16 | } 17 | \description{ 18 | Checks to make sure a package object is installed, 19 | and if so, returns it. If the library is not installed, it issues a warning 20 | and returns NULL. 
21 | } 22 | -------------------------------------------------------------------------------- /man/dot-validateInputs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{.validateInputs} 4 | \alias{.validateInputs} 5 | \title{Checks class of the list of variables. To be used in functions} 6 | \usage{ 7 | .validateInputs(checkList) 8 | } 9 | \arguments{ 10 | \item{checkList}{list of objects to check, e.g. 11 | list(varname=c("data.frame", "numeric")). 12 | Multiple strings in the vector are treated as OR.} 13 | } 14 | \value{ 15 | A warning if the wrong input class is provided. 16 | } 17 | \description{ 18 | Checks class of the list of variables. To be used in functions 19 | } 20 | \examples{ 21 | x = function(var1) { 22 | cl = list(var1=c("numeric","character")) 23 | .validateInputs(cl) 24 | return(var1^2) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/dtToGr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{dtToGr} 4 | \alias{dtToGr} 5 | \title{Converts a data.table (DT) object to a GenomicRanges 6 | (GR) object. 
Tries to be intelligent, guessing chr 7 | and start, but you have to supply end or other 8 | columns if you want them to be carried into the GR.} 9 | \usage{ 10 | dtToGr( 11 | DT, 12 | chr = "chr", 13 | start = "start", 14 | end = NA, 15 | strand = NA, 16 | name = NA, 17 | splitFactor = NA, 18 | metaCols = NA 19 | ) 20 | } 21 | \arguments{ 22 | \item{DT}{A data.table representing genomic regions.} 23 | 24 | \item{chr}{A string representing the chromosome column.} 25 | 26 | \item{start}{A string representing the name of the start column.} 27 | 28 | \item{end}{A string representing the name of the end column.} 29 | 30 | \item{strand}{A string representing the name of the strand column.} 31 | 32 | \item{name}{A string representing the name of the name column.} 33 | 34 | \item{splitFactor}{A string representing the name of the column to use to 35 | split the data.table into multiple data.tables.} 36 | 37 | \item{metaCols}{A string representing the name of the metadata column(s) 38 | to include in the returned GRanges object.} 39 | } 40 | \value{ 41 | A GRanges object. 42 | } 43 | \description{ 44 | Converts a data.table (DT) object to a GenomicRanges 45 | (GR) object. Tries to be intelligent, guessing chr 46 | and start, but you have to supply end or other 47 | columns if you want them to be carried into the GR. 
48 | } 49 | \examples{ 50 | start1 = c(seq(from=1, to = 2001, by = 1000), 800) 51 | chrString1 = c(rep("chr1", 3), "chr2") 52 | dt = data.table::data.table(chr=chrString1, 53 | start=start1, 54 | end=start1 + 250) 55 | newGR = dtToGr(dt) 56 | } 57 | -------------------------------------------------------------------------------- /man/dtToGrInternal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{dtToGrInternal} 4 | \alias{dtToGrInternal} 5 | \title{Two utility functions for converting data.tables into GRanges objects} 6 | \usage{ 7 | dtToGrInternal(DT, chr, start, end = NA, strand = NA, name = NA, metaCols = NA) 8 | } 9 | \arguments{ 10 | \item{DT}{A data.table representing genomic regions.} 11 | 12 | \item{chr}{A string representing the chromosome column.} 13 | 14 | \item{start}{A string representing the name of the start column.} 15 | 16 | \item{end}{A string representing the name of the end column.} 17 | 18 | \item{strand}{A string representing the name of the strand column.} 19 | 20 | \item{name}{A string representing the name of the name column.} 21 | 22 | \item{metaCols}{A string representing the name of the metadata column(s) 23 | to include in the returned GRanges object.} 24 | } 25 | \value{ 26 | A GRanges object. 
27 | } 28 | \description{ 29 | Two utility functions for converting data.tables into GRanges objects 30 | } 31 | -------------------------------------------------------------------------------- /man/exampleOpenSignalMatrix_hg19.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{exampleOpenSignalMatrix_hg19} 5 | \alias{exampleOpenSignalMatrix_hg19} 6 | \title{A dataset containing a subset of open chromatin regions across all 7 | cell types defined by ENCODE for Homo Sapiens hg19} 8 | \format{ 9 | data.frame, rows represent whole selection of open 10 | chromatin regions across all cell types defined by ENCODE, columns are 11 | individual cell types and values are normalized open chromatin signal values. 12 | } 13 | \source{ 14 | \url{http://big.databio.org/open_chromatin_matrix/openSignalMatrix_hg19_quantileNormalized_round4.txt.gz} 15 | } 16 | \usage{ 17 | data(exampleOpenSignalMatrix_hg19) 18 | } 19 | \description{ 20 | Preparation steps: 21 | \enumerate{ 22 | \item{made a universe of regions by merging regions across 23 | cell types defined as opened in ENCODE} 24 | \item{took bigwig files from ENCODE for individual cell types, 25 | merged replicates, filtered out blacklisted sites} 26 | \item{evaluated the signal above regions defined by previous step} 27 | \item{performed quantile normalization} 28 | \item{subsetted it} 29 | } 30 | } 31 | \keyword{datasets} 32 | -------------------------------------------------------------------------------- /man/geneModels_hg19.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{geneModels_hg19} 5 | \alias{geneModels_hg19} 6 | \title{hg19 gene models} 7 | \format{ 8 | A list of two GRanges objects, with genes and exons 
locations 9 | } 10 | \source{ 11 | EnsDb.Hsapiens.v75 package 12 | } 13 | \usage{ 14 | data(geneModels_hg19) 15 | } 16 | \description{ 17 | A dataset containing gene models for Homo Sapiens hg19 genome assembly. 18 | } 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /man/genomePartitionList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{genomePartitionList} 4 | \alias{genomePartitionList} 5 | \title{Create a basic genome partition list of genes, exons, introns, UTRs, and 6 | intergenic} 7 | \usage{ 8 | genomePartitionList( 9 | genesGR, 10 | exonsGR, 11 | threeUTRGR = NULL, 12 | fiveUTRGR = NULL, 13 | getCorePromoter = TRUE, 14 | getProxPromoter = TRUE, 15 | corePromSize = 100, 16 | proxPromSize = 2000 17 | ) 18 | } 19 | \arguments{ 20 | \item{genesGR}{a GRanges object of gene coordinates} 21 | 22 | \item{exonsGR}{a GRanges object of exons coordinates} 23 | 24 | \item{threeUTRGR}{a GRanges object of 3' UTRs} 25 | 26 | \item{fiveUTRGR}{a GRanges object of 5' UTRs} 27 | 28 | \item{getCorePromoter}{option specifying if core promoters should be 29 | extracted; defaults to TRUE} 30 | 31 | \item{getProxPromoter}{option specifying if proximal promoters should be 32 | extracted; defaults to TRUE} 33 | 34 | \item{corePromSize}{size of core promoter (in bp) upstream from TSS, 35 | default value = 100} 36 | 37 | \item{proxPromSize}{size of proximal promoter (in bp) upstream from TSS, 38 | default value = 2000} 39 | } 40 | \value{ 41 | A list of GRanges objects, each corresponding to a partition of the 42 | genome. Partitions include proximal and core promoters, exons and 43 | introns. 
44 | } 45 | \description{ 46 | Given GRanges for genes, and a GRanges for exons, returns a list of GRanges 47 | corresponding to various breakdown of the genome, based on the given 48 | annotations; it gives you proximal and core promoters, exons, and introns. 49 | } 50 | \details{ 51 | To be used as a partitionList for \code{calcPartitions}. 52 | } 53 | \examples{ 54 | partitionList = genomePartitionList(geneModels_hg19$genesGR, 55 | geneModels_hg19$exonsGR, 56 | geneModels_hg19$threeUTRGR, 57 | geneModels_hg19$fiveUTRGR) 58 | } 59 | -------------------------------------------------------------------------------- /man/getChromSizes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/loadData.R 3 | \name{getChromSizes} 4 | \alias{getChromSizes} 5 | \title{Returns built-in chrom sizes for a given reference assembly} 6 | \usage{ 7 | getChromSizes(refAssembly) 8 | } 9 | \arguments{ 10 | \item{refAssembly}{A string identifier for the reference assembly} 11 | } 12 | \value{ 13 | A vector with the chromosome sizes corresponding to a 14 | specific genome assembly. 
15 | } 16 | \description{ 17 | Returns built-in chrom sizes for a given reference assembly 18 | } 19 | \examples{ 20 | getChromSizes("hg19") 21 | } 22 | -------------------------------------------------------------------------------- /man/getChromSizesFromFasta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/buildReferenceData.R 3 | \name{getChromSizesFromFasta} 4 | \alias{getChromSizesFromFasta} 5 | \title{Get chromosome sizes from a remote or local FASTA file} 6 | \usage{ 7 | getChromSizesFromFasta(source, destDir = NULL, convertEnsemblUCSC = FALSE) 8 | } 9 | \arguments{ 10 | \item{source}{a string that is either a path to a 11 | local or remote FASTA} 12 | 13 | \item{destDir}{a string that indicates the path to the 14 | directory where the downloaded FASTA file should be stored} 15 | 16 | \item{convertEnsemblUCSC}{a logical indicating whether Ensembl style 17 | chromosome annotation should be changed to UCSC style (add chr)} 18 | } 19 | \value{ 20 | a named vector of sequence lengths 21 | } 22 | \description{ 23 | Get chromosome sizes from a remote or local FASTA file 24 | } 25 | \examples{ 26 | CElegansFasteCropped = system.file("extdata", 27 | "C_elegans_cropped_example.fa.gz", 28 | package="GenomicDistributions") 29 | CElegansChromSizes = getChromSizesFromFasta(CElegansFasteCropped) 30 | } 31 | -------------------------------------------------------------------------------- /man/getGeneModels.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/loadData.R 3 | \name{getGeneModels} 4 | \alias{getGeneModels} 5 | \title{Returns built-in gene models for a given reference assembly} 6 | \usage{ 7 | getGeneModels(refAssembly) 8 | } 9 | \arguments{ 10 | \item{refAssembly}{A string identifier for the reference assembly} 11 | } 12 | \value{ 13 
| A list containing the gene models corresponding to a 14 | specific reference assembly. 15 | } 16 | \description{ 17 | Some functions require gene models, which can be obtained from any source. 18 | This function allows you to retrieve a few common built-in ones. 19 | } 20 | \examples{ 21 | getGeneModels("hg19") 22 | } 23 | -------------------------------------------------------------------------------- /man/getGeneModelsFromGTF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/buildReferenceData.R 3 | \name{getGeneModelsFromGTF} 4 | \alias{getGeneModelsFromGTF} 5 | \title{Get gene models from a remote or local GTF file} 6 | \usage{ 7 | getGeneModelsFromGTF( 8 | source, 9 | features, 10 | convertEnsemblUCSC = FALSE, 11 | destDir = NULL, 12 | filterProteinCoding = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{source}{a string that is either a path to a local or remote GTF} 17 | 18 | \item{features}{a vector of strings with feature identifiers to 19 | include in the result list} 20 | 21 | \item{convertEnsemblUCSC}{a logical indicating whether Ensembl style 22 | chromosome annotation should be changed to UCSC style} 23 | 24 | \item{destDir}{a string that indicates the path to the directory where 25 | the downloaded GTF file should be stored} 26 | 27 | \item{filterProteinCoding}{a logical indicating if TSSs should be only 28 | protein-coding genes (default = TRUE)} 29 | } 30 | \value{ 31 | a list of GRanges objects 32 | } 33 | \description{ 34 | Get gene models from a remote or local GTF file 35 | } 36 | \examples{ 37 | CElegansGtfCropped = system.file("extdata", 38 | "C_elegans_cropped_example.gtf.gz", 39 | package="GenomicDistributions") 40 | features = c("gene", "exon", "three_prime_utr", "five_prime_utr") 41 | CElegansGeneModels = getGeneModelsFromGTF(CElegansGtfCropped, features, TRUE) 42 | } 43 | 
-------------------------------------------------------------------------------- /man/getGenomeBins.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{getGenomeBins} 4 | \alias{getGenomeBins} 5 | \title{Returns bins used in `calcChromBins` function 6 | Given a named vector of chromosome sizes, the function returns 7 | GRangesList object with bins for each chromosome.} 8 | \usage{ 9 | getGenomeBins(chromSizes, binCount = 10000) 10 | } 11 | \arguments{ 12 | \item{chromSizes}{a named list of size (length) for each chromosome.} 13 | 14 | \item{binCount}{number of bins (total; *not* per chromosome), 15 | defaults to 10,000} 16 | } 17 | \value{ 18 | A GRangesList object with bins that separate chromosomes 19 | into equal parts. 20 | } 21 | \description{ 22 | Returns bins used in `calcChromBins` function 23 | Given a named vector of chromosome sizes, the function returns 24 | GRangesList object with bins for each chromosome. 25 | } 26 | \examples{ 27 | chromSizes = getChromSizes("hg19") 28 | chromBins = getGenomeBins(chromSizes) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /man/getReferenceData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/loadData.R 3 | \name{getReferenceData} 4 | \alias{getReferenceData} 5 | \title{Get reference data for a specified assembly} 6 | \usage{ 7 | getReferenceData(refAssembly, tagline) 8 | } 9 | \arguments{ 10 | \item{refAssembly}{Reference assembly string (e.g. 'hg38')} 11 | 12 | \item{tagline}{The string that was used to identify data of a given type in 13 | the data building step. 
It's used for the filename so we know 14 | what to load, and is what makes this function generic (so it 15 | can load different data types).} 16 | } 17 | \value{ 18 | A requested and included package data object. 19 | } 20 | \description{ 21 | This is a generic getter function that will return a data object requested, 22 | if it is included in the built-in data with the GenomicDistributions package 23 | or GenomicDistributionsData package (if installed). Data objects can 24 | be requested for different reference assemblies and data types (specified by 25 | a tagline, which is a unique string identifying the data type). 26 | } 27 | -------------------------------------------------------------------------------- /man/getTssFromGTF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/buildReferenceData.R 3 | \name{getTssFromGTF} 4 | \alias{getTssFromGTF} 5 | \title{Get transcription start sites (TSSs) from a remote or local GTF file} 6 | \usage{ 7 | getTssFromGTF( 8 | source, 9 | convertEnsemblUCSC = FALSE, 10 | destDir = NULL, 11 | filterProteinCoding = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{source}{a string that is either a path to a local or remote GTF} 16 | 17 | \item{convertEnsemblUCSC}{a logical indicating whether Ensembl style 18 | chromosome annotation should be changed to UCSC style} 19 | 20 | \item{destDir}{a string that indicates the path to the directory where 21 | the downloaded GTF file should be stored} 22 | 23 | \item{filterProteinCoding}{a logical indicating if TSSs should be only 24 | protein-coding genes (default = TRUE)} 25 | } 26 | \value{ 27 | a list of GRanges objects 28 | } 29 | \description{ 30 | Get transcription start sites (TSSs) from a remote or local GTF file 31 | } 32 | \examples{ 33 | CElegansGtfCropped = system.file("extdata", 34 | "C_elegans_cropped_example.gtf.gz", 35 | package="GenomicDistributions") 36 | 
CElegansTss = getTssFromGTF(CElegansGtfCropped, TRUE) 37 | } 38 | -------------------------------------------------------------------------------- /man/grToDt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{grToDt} 4 | \alias{grToDt} 5 | \title{Convert a GenomicRanges into a data.table.} 6 | \usage{ 7 | grToDt(GR) 8 | } 9 | \arguments{ 10 | \item{GR}{A GRanges object} 11 | } 12 | \value{ 13 | A data.table object. 14 | } 15 | \description{ 16 | Convert a GenomicRanges into a data.table. 17 | } 18 | -------------------------------------------------------------------------------- /man/labelCuts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{labelCuts} 4 | \alias{labelCuts} 5 | \title{Creates labels based on a discretization definition.} 6 | \usage{ 7 | labelCuts( 8 | breakPoints, 9 | round_digits = 1, 10 | signif_digits = 3, 11 | collapse = "-", 12 | infBins = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{breakPoints}{The exact values you want as boundaries for your bins} 17 | 18 | \item{round_digits}{Number of digits to cut round labels to.} 19 | 20 | \item{signif_digits}{Number of significant digits to specify.} 21 | 22 | \item{collapse}{Character to separate the labels} 23 | 24 | \item{infBins}{use >/< as labels on the edge bins} 25 | } 26 | \value{ 27 | A vector of histogram axis labels. 28 | } 29 | \description{ 30 | If you are building a histogram of binned values, you want to have labels for 31 | your bins that correspond to the ranges you used to bin. This function takes 32 | the breakpoints that define your bins and produces nice-looking labels for 33 | your histogram plot. 
34 | } 35 | \details{ 36 | \code{labelCuts} will take a cut group, (e.g., a quantile division of 37 | some signal), and give you clean labels (similar to the cut method). 38 | } 39 | -------------------------------------------------------------------------------- /man/loadBSgenome.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/loadData.R 3 | \name{loadBSgenome} 4 | \alias{loadBSgenome} 5 | \title{Loads BSgenome objects from UCSC-style character vectors.} 6 | \usage{ 7 | loadBSgenome(genomeBuild, masked = TRUE) 8 | } 9 | \arguments{ 10 | \item{genomeBuild}{One of 'hg19', 'hg38', 'mm10', 'mm9', or 'grch38'} 11 | 12 | \item{masked}{Should we use the masked version? Default: TRUE} 13 | } 14 | \value{ 15 | A BSgenome object corresponding to the provided genome build. 16 | } 17 | \description{ 18 | This function will let you use a simple character vector (e.g. 'hg19') to 19 | load and then return BSgenome objects. This lets you avoid having to use the 20 | more complex annotation for a complete BSgenome object (e.g. 
21 | BSgenome.Hsapiens.UCSC.hg38.masked) 22 | } 23 | \examples{ 24 | \dontrun{ 25 | bsg = loadBSgenome('hg19') 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/loadEnsDb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/loadData.R 3 | \name{loadEnsDb} 4 | \alias{loadEnsDb} 5 | \title{Load selected EnsDb library} 6 | \usage{ 7 | loadEnsDb(genomeBuild) 8 | } 9 | \arguments{ 10 | \item{genomeBuild}{string, genome identifier} 11 | } 12 | \value{ 13 | loaded library 14 | } 15 | \description{ 16 | Load selected EnsDb library 17 | } 18 | \examples{ 19 | \dontrun{ 20 | loadEnsDb("hg19") 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /man/neighbordt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/neighbor-distances.R 3 | \name{neighbordt} 4 | \alias{neighbordt} 5 | \title{Internal helper function to calculate distance 6 | between neighboring regions.} 7 | \usage{ 8 | neighbordt(querydt) 9 | } 10 | \arguments{ 11 | \item{querydt}{A data table with regions grouped according to 12 | chromosome.} 13 | } 14 | \value{ 15 | A numeric vector with the distances in bp 16 | } 17 | \description{ 18 | Internal helper function to calculate distance 19 | between neighboring regions. 20 | } 21 | -------------------------------------------------------------------------------- /man/nlist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{nlist} 4 | \alias{nlist} 5 | \title{Nathan's magical named list function. 
6 | This function is a drop-in replacement for the base list() function, 7 | which automatically names your list according to the names of the 8 | variables used to construct it. 9 | It seamlessly handles lists with some names and others absent, 10 | not overwriting specified names while naming any unnamed parameters. 11 | Took me awhile to figure this out.} 12 | \usage{ 13 | nlist(...) 14 | } 15 | \arguments{ 16 | \item{...}{arguments passed to list()} 17 | } 18 | \value{ 19 | A named list object. 20 | } 21 | \description{ 22 | Nathan's magical named list function. 23 | This function is a drop-in replacement for the base list() function, 24 | which automatically names your list according to the names of the 25 | variables used to construct it. 26 | It seamlessly handles lists with some names and others absent, 27 | not overwriting specified names while naming any unnamed parameters. 28 | Took me awhile to figure this out. 29 | } 30 | \examples{ 31 | x=5 32 | y=10 33 | nlist(x,y) # returns list(x=5, y=10) 34 | list(x,y) # returns unnamed list(5, 10) 35 | } 36 | -------------------------------------------------------------------------------- /man/plotChromBins.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chrom-plots.R 3 | \name{plotChromBins} 4 | \alias{plotChromBins} 5 | \title{Plot distribution over chromosomes} 6 | \usage{ 7 | plotChromBins( 8 | genomeAggregate, 9 | plotTitle = "Distribution over chromosomes", 10 | ylim = "max" 11 | ) 12 | } 13 | \arguments{ 14 | \item{genomeAggregate}{The output from the genomicDistribution function} 15 | 16 | \item{plotTitle}{Title for plot.} 17 | 18 | \item{ylim}{Limit of y-axes. Default "max" sets limit to N of biggest bin.} 19 | } 20 | \value{ 21 | A ggplot object showing the distribution of the query 22 | regions over bins of 23 | the reference genome. 
24 | } 25 | \description{ 26 | Plots result from \code{genomicDistribution} calculation 27 | } 28 | \examples{ 29 | agg = data.frame("regionID"=1:5, "chr"=rep(c("chr1"), 5), 30 | "withinGroupID"=1:5, "N"=c(1,3,5,7,9)) 31 | ChromBins = plotChromBins(agg) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/plotCumulativePartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{plotCumulativePartitions} 4 | \alias{plotCumulativePartitions} 5 | \title{Plot the cumulative distribution of regions in features} 6 | \usage{ 7 | plotCumulativePartitions(assignedPartitions, feature_names = NULL) 8 | } 9 | \arguments{ 10 | \item{assignedPartitions}{Results from \code{calcCumulativePartitions}} 11 | 12 | \item{feature_names}{An optional character vector of feature names, in the 13 | same order as the GenomicRanges or GenomicRangesList object.} 14 | } 15 | \value{ 16 | A ggplot object of the cumulative distribution of regions in 17 | features. 18 | } 19 | \description{ 20 | This function plots the cumulative distribution of regions across a 21 | feature set. 
22 | } 23 | \examples{ 24 | p = calcCumulativePartitionsRef(vistaEnhancers, "hg19") 25 | cumuPlot = plotCumulativePartitions(p) 26 | } 27 | -------------------------------------------------------------------------------- /man/plotDinuclFreq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/content-plots.R 3 | \name{plotDinuclFreq} 4 | \alias{plotDinuclFreq} 5 | \title{Plot dinucleotide content within region set(s)} 6 | \usage{ 7 | plotDinuclFreq(DNFDataTable) 8 | } 9 | \arguments{ 10 | \item{DNFDataTable}{A data.table, data.frame, or a list of dinucleotide counts - 11 | results from \code{calcDinuclFreq} or \code{calcDinuclFreqRef}} 12 | } 13 | \value{ 14 | A ggplot object plotting distribution of dinucleotide content in query regions 15 | } 16 | \description{ 17 | Given \code{calcDinuclFreq} or \code{calcDinuclFreqRef} results, this function 18 | generates a violin plot of dinucleotide frequency 19 | } 20 | \examples{ 21 | 22 | DNFDataTable = data.table::data.table(GC = rnorm(400, mean=0.5, sd=0.1), 23 | CG = rnorm(400, mean=0.5, sd=0.5), 24 | AT = rnorm(400, mean=0.5, sd=1), 25 | TA = rnorm(400, mean=0.5, sd=1.5)) 26 | DNFPlot = plotDinuclFreq(DNFDataTable) 27 | 28 | \dontrun{ 29 | query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") 30 | GRquery = rtracklayer::import(query) 31 | refAssembly = 'hg19' 32 | DNF = calcDinuclFreqRef(GRquery, refAssembly) 33 | DNFPlot2 = plotDinuclFreq(DNF) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /man/plotExpectedPartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{plotExpectedPartitions} 4 | \alias{plotExpectedPartitions} 5 | \title{Produces a barplot showing how query 
regions of interest are distributed 6 | relative to the expected distribution across a given partition list} 7 | \usage{ 8 | plotExpectedPartitions(expectedPartitions, feature_names = NULL, pval = FALSE) 9 | } 10 | \arguments{ 11 | \item{expectedPartitions}{A data.frame holding the frequency of assignment 12 | to each of the partitions, the expected number of each partition, and 13 | the log10 of the observed over expected. Produced by 14 | \code{calcExpectedPartitions}.} 15 | 16 | \item{feature_names}{Character vector with labels for the partitions 17 | (optional). By default it will use the names from the first argument.} 18 | 19 | \item{pval}{Logical indicating whether Chi-square p-values should be added 20 | for each partition.} 21 | } 22 | \value{ 23 | A ggplot object using a barplot to show the distribution of the 24 | query regions across a given partition list. 25 | } 26 | \description{ 27 | Produces a barplot showing how query regions of interest are distributed 28 | relative to the expected distribution across a given partition list 29 | } 30 | \examples{ 31 | p = calcExpectedPartitionsRef(vistaEnhancers, "hg19") 32 | expectedPlot = plotExpectedPartitions(p) 33 | } 34 | -------------------------------------------------------------------------------- /man/plotFeatureDist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/feature-plots.R 3 | \name{plotFeatureDist} 4 | \alias{plotFeatureDist} 5 | \title{Plots a histogram of distances to genomic features} 6 | \usage{ 7 | plotFeatureDist( 8 | dists, 9 | bgdists = NULL, 10 | featureName = "features", 11 | numbers = FALSE, 12 | nbins = 50, 13 | size = 1e+05, 14 | infBins = FALSE, 15 | tile = FALSE, 16 | labelOrder = "default" 17 | ) 18 | } 19 | \arguments{ 20 | \item{dists}{Results from \code{featureDistribution}} 21 | 22 | \item{bgdists}{Background distances. 
If provided, will plot a background 23 | distribution of expected distances} 24 | 25 | \item{featureName}{Character vector for plot labels (optional).} 26 | 27 | \item{numbers}{a logical indicating whether the raw numbers should be 28 | displayed, rather than percentages (optional).} 29 | 30 | \item{nbins}{Number of bins on each side of the center point.} 31 | 32 | \item{size}{Number of bases to include in plot on each side of the 33 | center point.} 34 | 35 | \item{infBins}{Include catch-all bins on the sides?} 36 | 37 | \item{tile}{Turn on a tile mode, which plots a tiled figure 38 | instead of a histogram.} 39 | 40 | \item{labelOrder}{-- Enter "default" to order by order of user input (default); 41 | Enter "center" to order by value in tile in the closest proximity to the center 42 | of features (in case TSS is used - center is TSS) (center).} 43 | } 44 | \value{ 45 | A ggplot2 plot object 46 | } 47 | \description{ 48 | Given the results from \code{featureDistribution}, plots a histogram of 49 | distances surrounding the features of interest 50 | } 51 | \examples{ 52 | TSSdist = calcFeatureDistRefTSS(vistaEnhancers, "hg19") 53 | f = plotFeatureDist(TSSdist, featureName="TSS") 54 | } 55 | -------------------------------------------------------------------------------- /man/plotGCContent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/content-plots.R 3 | \name{plotGCContent} 4 | \alias{plotGCContent} 5 | \title{Plots a density distribution of GC vectors. 6 | Given results from the \code{calcGCContent} function, this will produce a 7 | density plot} 8 | \usage{ 9 | plotGCContent(gcvectors) 10 | } 11 | \arguments{ 12 | \item{gcvectors}{A numeric vector or list of numeric vectors of GC contents.} 13 | } 14 | \value{ 15 | A ggplot object plotting distribution of GC content in query regions.
16 | } 17 | \description{ 18 | Plots a density distribution of GC vectors. 19 | Given results from the \code{calcGCContent} function, this will produce a 20 | density plot 21 | } 22 | \examples{ 23 | numVector = rnorm(400, mean=0.5, sd=0.1) 24 | GCplot = plotGCContent(numVector) 25 | vecs = list(example1 = rnorm(400, mean=0.5, sd=0.1), 26 | example2 = rnorm(600, mean=0.5, sd=0.1)) 27 | GCplot = plotGCContent(vecs) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/plotNeighborDist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/neighbor-distances.R 3 | \name{plotNeighborDist} 4 | \alias{plotNeighborDist} 5 | \title{Plot the distances from regions to their upstream/downstream neighbors 6 | or nearest neighbors. Distances can be passed as either raw bp or 7 | corrected for the number of regions (log10(obs/exp)), but this has 8 | to be specified in the function parameters.} 9 | \usage{ 10 | plotNeighborDist(dcvec, correctedDist = FALSE, Nneighbors = FALSE) 11 | } 12 | \arguments{ 13 | \item{dcvec}{A numeric vector or list of vectors containing distances 14 | to upstream/downstream neighboring regions or to nearest neighbors. 15 | Produced by \code{calcNeighborDist} or \code{calcNearestNeighbors}} 16 | 17 | \item{correctedDist}{A logical indicating if the plot axis should 18 | be adjusted to show distances corrected for the number of regions 19 | in a regionset.} 20 | 21 | \item{Nneighbors}{A logical indicating whether legend should be adjusted 22 | if Nearest neighbors are being plotted. Default legend shows distances 23 | to upstream/downstream neighbors.} 24 | } 25 | \value{ 26 | A ggplot density object showing the distribution of 27 | raw or corrected distances. 28 | } 29 | \description{ 30 | Plot the distances from regions to their upstream/downstream neighbors 31 | or nearest neighbors.
Distances can be passed as either raw bp or 32 | corrected for the number of regions (log10(obs/exp)), but this has 33 | to be specified in the function parameters. 34 | } 35 | \examples{ 36 | numVector = rnorm(400, mean=5, sd=0.1) 37 | d = plotNeighborDist(numVector) 38 | } 39 | -------------------------------------------------------------------------------- /man/plotPartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/partition-plots.R 3 | \name{plotPartitions} 4 | \alias{plotPartitions} 5 | \title{Produces a barplot showing how query regions of interest are distributed 6 | across a given partition list} 7 | \usage{ 8 | plotPartitions(assignedPartitions, numbers = FALSE, stacked = FALSE) 9 | } 10 | \arguments{ 11 | \item{assignedPartitions}{A table holding the frequency of assignment to 12 | each of the partitions. Produced by \code{calcPartitions}} 13 | 14 | \item{numbers}{logical indicating whether raw overlaps should be 15 | plotted instead of the default percentages} 16 | 17 | \item{stacked}{logical indicating that data should be plotted as stacked 18 | bar plot} 19 | } 20 | \value{ 21 | A ggplot object using a barplot to show the distribution 22 | of the query 23 | regions across a given partition list. 24 | } 25 | \description{ 26 | This function can be used to test a GRanges object against any arbitrary 27 | list of genome partitions. The partition list is a priority-ordered list of 28 | GRanges objects. Each region in the query will be assigned to a given 29 | partition that it overlaps with the highest priority. 
30 | } 31 | \examples{ 32 | p = calcPartitionsRef(vistaEnhancers, "hg19") 33 | partPlot = plotPartitions(p) 34 | partCounts = plotPartitions(p, numbers=TRUE) 35 | partPlot = plotPartitions(p, stacked=TRUE) 36 | } 37 | -------------------------------------------------------------------------------- /man/plotQTHist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qthist.R 3 | \name{plotQTHist} 4 | \alias{plotQTHist} 5 | \title{Plot quantile-trimmed histogram} 6 | \usage{ 7 | plotQTHist( 8 | x, 9 | EndBarColor = "gray57", 10 | MiddleBarColor = "gray27", 11 | quantThresh = NULL, 12 | bins = NULL, 13 | indep = FALSE, 14 | numbers = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{x}{Data values to plot - vector or list of vectors} 19 | 20 | \item{EndBarColor}{Color for the quantile bars on both ends of the graph 21 | (optional)} 22 | 23 | \item{MiddleBarColor}{Color for the bars in the middle of the graph 24 | (optional)} 25 | 26 | \item{quantThresh}{Quantile of data to be contained in each end bar (optional) 27 | quantThresh values must be under .2, optimal size is under .1} 28 | 29 | \item{bins}{The number of bins for the histogram to allocate data to. 30 | (optional)} 31 | 32 | \item{indep}{logical value which returns a list of plots that have had their 33 | bins calculated independently; the normal version will plot them on the 34 | same x and y axis.} 35 | 36 | \item{numbers}{a logical indicating whether the raw numbers should be 37 | displayed, rather than percentages (optional).} 38 | } 39 | \value{ 40 | A ggplot2 plot object 41 | } 42 | \description{ 43 | Given the results from \code{calcWidth}, plots a histogram with 44 | outliers trimmed. 45 | } 46 | \details{ 47 | x-axis breaks for the frequency calculations are based on the "divisions" 48 | results from helper function \code{calcDivisions}. 
49 | } 50 | \examples{ 51 | regWidths = calcWidth(vistaEnhancers) 52 | qtHist = plotQTHist(regWidths) 53 | qtHist2 = plotQTHist(regWidths, quantThresh=0.1) 54 | } 55 | -------------------------------------------------------------------------------- /man/plotSummarySignal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/specificity-plots.R 3 | \name{plotSummarySignal} 4 | \alias{plotSummarySignal} 5 | \title{The function plotSummarySignal visualizes the signalSummaryMatrix obtained from 6 | \code{calcSummarySignal}.} 7 | \usage{ 8 | plotSummarySignal( 9 | signalSummaryList, 10 | plotType = "barPlot", 11 | metadata = NULL, 12 | colorColumn = NULL, 13 | filterGroupColumn = NULL, 14 | filterGroup = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{signalSummaryList}{Output list from \code{calcSummarySignal} function.} 19 | 20 | \item{plotType}{Options are: "jitter" - jitter plot with box plot on top, 21 | "boxPlot" - box plot without individual points and outliers, 22 | "barPlot" (default) - bar height represents the median signal value 23 | for a given cell type, 24 | "violinPlot" - violin plot with medians.} 25 | 26 | \item{metadata}{(optional) data.table used for grouping columns from 27 | 'signalMatrix' into categories, that are then plotted with different colors. 28 | Must contain variable 'colName' that contains all the condition column names 29 | from 'signalMatrix'.} 30 | 31 | \item{colorColumn}{(optional only if metadata provided) column name from 32 | 'metadata' table that will be used as grouping variable for coloring.} 33 | 34 | \item{filterGroupColumn}{(optional only if metadata provided and 35 | 'filterGroup' specified) allows user to plot specified subgroups only.
36 | String specifying the column name in 'metadata' from which groups will 37 | be filtered (groups are specified in 'filterGroup')} 38 | 39 | \item{filterGroup}{(optional only if 'metadata' and 'filterGroupColumn' 40 | provided) - string (or vector of strings) of groups from 41 | 'filterGroupColumn' to be plotted.} 42 | } 43 | \value{ 44 | A ggplot object. 45 | } 46 | \description{ 47 | The function plotSummarySignal visualizes the signalSummaryMatrix obtained from 48 | \code{calcSummarySignal}. 49 | } 50 | \examples{ 51 | signalSummaryList = calcSummarySignal(vistaEnhancers, exampleOpenSignalMatrix_hg19) 52 | metadata = cellTypeMetadata 53 | plotSignal = plotSummarySignal(signalSummaryList) 54 | 55 | plotSignalTissueColor = plotSummarySignal(signalSummaryList = signalSummaryList, 56 | plotType = "jitter", metadata = metadata, colorColumn = "tissueType") 57 | 58 | plotSignalFiltered = plotSummarySignal(signalSummaryList = signalSummaryList, 59 | plotType = "violinPlot", metadata = metadata, colorColumn = "tissueType", 60 | filterGroupColumn = "tissueType", filterGroup = c("skin", "blood")) 61 | } 62 | -------------------------------------------------------------------------------- /man/retrieveFile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/buildReferenceData.R 3 | \name{retrieveFile} 4 | \alias{retrieveFile} 5 | \title{Read local or remote file} 6 | \usage{ 7 | retrieveFile(source, destDir = NULL) 8 | } 9 | \arguments{ 10 | \item{source}{a string that is either a path to a local or remote GTF} 11 | 12 | \item{destDir}{a string that indicates the path to the directory where 13 | the downloaded GTF file should be stored.
If not provided, 14 | a temporary directory will be used.} 15 | } 16 | \value{ 17 | data.frame retrieved file path 18 | } 19 | \description{ 20 | Read local or remote file 21 | } 22 | \examples{ 23 | CElegansGtfCropped = system.file("extdata", 24 | "C_elegans_cropped_example.gtf.gz", 25 | package="GenomicDistributions") 26 | CElegansGtf = retrieveFile(CElegansGtfCropped) 27 | } 28 | -------------------------------------------------------------------------------- /man/setB_100.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{setB_100} 5 | \alias{setB_100} 6 | \title{Example BED file read with rtracklayer::import} 7 | \format{ 8 | GenomicRanges::GRanges 9 | } 10 | \usage{ 11 | data(setB_100) 12 | } 13 | \description{ 14 | Example BED file read with rtracklayer::import 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/splitDataTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{splitDataTable} 4 | \alias{splitDataTable} 5 | \title{Efficiently split a data.table by a column in the table} 6 | \usage{ 7 | splitDataTable(DT, split_factor) 8 | } 9 | \arguments{ 10 | \item{DT}{Data.table to split} 11 | 12 | \item{split_factor}{Column to split, which can be a character vector 13 | or an integer.} 14 | } 15 | \value{ 16 | List of data.table objects, split by column 17 | } 18 | \description{ 19 | Efficiently split a data.table by a column in the table 20 | } 21 | -------------------------------------------------------------------------------- /man/theme_blank_facet_label.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % 
Please edit documentation in R/utility.R 3 | \name{theme_blank_facet_label} 4 | \alias{theme_blank_facet_label} 5 | \title{Clear ggplot facet label.} 6 | \usage{ 7 | theme_blank_facet_label() 8 | } 9 | \value{ 10 | A ggplot theme 11 | } 12 | \description{ 13 | Usually ggplot2 facets are labeled with boxes surrounding the label. This 14 | function removes the box, so it's a simple label for each facet. 15 | } 16 | -------------------------------------------------------------------------------- /man/vistaEnhancers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{vistaEnhancers} 5 | \alias{vistaEnhancers} 6 | \title{Example BED file read with rtracklayer::import} 7 | \format{ 8 | GenomicRanges::GRanges 9 | } 10 | \usage{ 11 | data(vistaEnhancers) 12 | } 13 | \description{ 14 | Example BED file read with rtracklayer::import 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library("GenomicDistributions") 3 | 4 | test_check("GenomicDistributions") 5 | -------------------------------------------------------------------------------- /tests/testthat/testChrom.R: -------------------------------------------------------------------------------- 1 | # library(GenomicDistributions) 2 | library(testthat) 3 | library(data.table) 4 | 5 | # data: example region set, a copy shifted by 100 kb, and both combined in a GRangesList 6 | query = vistaEnhancers 7 | querySftd = GenomicRanges::shift(query, 100000) 8 | queryList = GRangesList(q1=query, q2=querySftd) 9 | 10 | context("general") 11 | test_that("binRegion works with binSize and binCount", { 12 | for(s in seq(1, 100, by=50)){ 13 | for(e in seq(1000, 10000, by=5000)){ 14 | expect_visible(binRegion(start=s, end=e, binSize=10)) 15 | expect_visible(binRegion(start=s,
end=e, binCount=10)) 16 | } 17 | } 18 | }) 19 | 20 | test_that("calcChromBinsRef works with list input", { 21 | expect_visible(calcChromBinsRef(queryList, "hg19")) 22 | }) 23 | 24 | context("result") 25 | test_that("binRegion returns result of correct length", { 26 | expect_equal( 27 | binRegion(start=1, end=100, binSize=10), 28 | binRegion(start=1, end=100, binCount=10) 29 | ) 30 | expect_length(binRegion(start=1, end=100, binSize=10), 5) 31 | expect_equal(NROW(binRegion(start=1, end=100, binSize=10)), 10) 32 | }) 33 | 34 | test_that("calcChromBinsRef returns a proper object type, length and includes all the regions", { 35 | result = calcChromBinsRef(query, "hg19") 36 | expect_is(result, "data.table") 37 | expect_length(result, 6) 38 | expect_equal(sum(result$N), length(query)) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/testGCContent.R: -------------------------------------------------------------------------------- 1 | # COMMENTED OUT DUE TO BSgenome.Hsapiens.UCSC..masked PACKAGES 2 | # DEPENDENCIES, WHICH ARE NOT INCLUDED IN REQUIREMENTS DUE TO SIZE 3 | 4 | # # lib 5 | # library(testthat) 6 | # 7 | # # data 8 | # featureFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") 9 | # feats = rtracklayer::import(featureFile) 10 | # refs = c("hg38", "hg19") 11 | # 12 | # # tests 13 | # context("general") 14 | # test_that("calcGCContent works", { 15 | # for(r in refs){ 16 | # expect_visible(calcGCContentRef(feats, r)) 17 | # } 18 | # }) 19 | # 20 | # context("result") 21 | # test_that("calcGCContent yields results of proper length", { 22 | # expect_equal(length(calcGCContentRef(feats, "hg19")), length(feats)) 23 | # }) 24 | # 25 | # test_that("calcGCContent yields a numeric result", { 26 | # expect_true(is.numeric(calcGCContentRef(feats, "hg19"))) 27 | # }) 28 | # 29 | # test_that("calcGCContent yields a numeric in range 0-1", { 30 | # x = calcGCContentRef(feats, "hg19")
31 | # for(i in x){ 32 | # expect_gt(i, 0) 33 | # expect_lt(i, 1) 34 | # } 35 | # }) 36 | -------------------------------------------------------------------------------- /tests/testthat/testNeighborDist.R: -------------------------------------------------------------------------------- 1 | # lib 2 | library(data.table) 3 | library(testthat) 4 | library(GenomicDistributions) 5 | 6 | # data: example region set, a copy shifted by 100 kb, and both combined in a GRangesList 7 | query = vistaEnhancers 8 | querySftd = GenomicRanges::shift(query, 100000) 9 | queryList = GRangesList(q1=query, q2=querySftd) 10 | 11 | # tests 12 | context("general") 13 | test_that("calcNeighborDist works", { 14 | lapply(queryList, function(x) expect_visible(calcNeighborDist(x))) 15 | }) 16 | 17 | context("result") 18 | test_that("calcNeighborDist returns a result of a proper class", { 19 | expect_true(is(calcNeighborDist(query), "numeric")) 20 | expect_true(is(calcNeighborDist(queryList), "list" )) 21 | }) 22 | 23 | test_that("calcNeighborDist returns the same result for a shifted region set", { 24 | expect_equal(calcNeighborDist(query), calcNeighborDist(querySftd)) 25 | }) 26 | 27 | # test_that("calcNeighborDist yields a numeric in range 0-10", { 28 | # x = calcNeighborDist(query) 29 | # for(i in x){ 30 | # expect_gt(i, 0) 31 | # expect_lt(i, 10) 32 | # } 33 | #}) 34 | -------------------------------------------------------------------------------- /tests/testthat/testOpenChromatin.R: -------------------------------------------------------------------------------- 1 | # lib 2 | library(data.table) 3 | library(testthat) 4 | library(GenomicDistributions) 5 | # data: example signal matrix, example region set, and a copy shifted by 100 bp 6 | cellMatrix = exampleOpenSignalMatrix_hg19 7 | query = vistaEnhancers 8 | querySftd = GenomicRanges::shift(query, 100) 9 | queryList = GRangesList(q1=query, q2=querySftd) 10 | 11 | # tests 12 | context("general") 13 | test_that("calcSummarySignal works", { 14 | expect_visible(calcSummarySignal(query, cellMatrix)) 15 | expect_visible(calcSummarySignal(querySftd, cellMatrix)) 16 | }) 17 | 18 |
test_that("calcSummarySignal works with multiple queries", { 19 | expect_visible(calcSummarySignal(queryList, cellMatrix)) 20 | }) 21 | 22 | context("result") 23 | test_that("calcSummarySignal returns a result of a proper class", { 24 | expect_true(is(calcSummarySignal(query, cellMatrix), "list")) 25 | expect_true(is(calcSummarySignal(query, cellMatrix)[[1]], "data.table")) 26 | expect_true(is(calcSummarySignal(query, cellMatrix)[[2]], "data.frame")) 27 | }) 28 | 29 | test_that("calcSummarySignal returns different results for different queries", { 30 | expect_false(identical(calcSummarySignal(query, cellMatrix)[[1]], 31 | calcSummarySignal(querySftd, cellMatrix)[[1]])) 32 | }) 33 | 34 | test_that("calcSummarySignal combines results from multi-query runs", { 35 | ql = GRangesList(q1=query, q2=query) 36 | expect_true(NROW(calcSummarySignal(query, cellMatrix)[[1]])*2 == 37 | NROW(calcSummarySignal(ql, cellMatrix)[[1]])) 38 | expect_true(NROW(calcSummarySignal(query, cellMatrix)[[2]])*2 == 39 | NROW(calcSummarySignal(ql, cellMatrix)[[2]])) 40 | }) 41 | 42 | -------------------------------------------------------------------------------- /tests/testthat/testPartitions.R: -------------------------------------------------------------------------------- 1 | # lib 2 | library(data.table) 3 | library(testthat) 4 | library(GenomicDistributions) 5 | 6 | # data: example region set, a copy shifted by 100 kb, and both combined in a GRangesList 7 | query = vistaEnhancers 8 | querySftd = GenomicRanges::shift(query, 100000) 9 | queryList = GRangesList(q1=query, q2=querySftd) 10 | 11 | # tests 12 | context("general") 13 | test_that("calcPartitionsRef works", { 14 | lapply(queryList, function(x) expect_visible(calcPartitionsRef(x, "hg19"))) 15 | }) 16 | 17 | context("result") 18 | test_that("calcPartitionsRef returns a result of a proper class", { 19 | expect_true(is(calcPartitionsRef(query, "hg19"), "data.frame")) 20 | }) 21 | 22 | test_that("calcPartitionsRef returns a result of a proper length", { 23 | expect_length(calcPartitionsRef(query, "hg19"), 2) 24 |
expect_equal(NROW(calcPartitionsRef(query, "hg19")), 7) 25 | }) 26 | 27 | test_that("calcPartitionsRef returns different results for different queries", { 28 | expect_false(all(calcPartitionsRef(query, "hg19")$Freq == 29 | calcPartitionsRef(querySftd, "hg19")$Freq)) 30 | }) 31 | -------------------------------------------------------------------------------- /tests/testthat/test_all.R: -------------------------------------------------------------------------------- 1 | # Unit tests 2 | library(GenomicDistributions) 3 | 4 | context("Testthat context...") 5 | 6 | ############################################################################# 7 | # Test data should be toy examples you can work out by hand; 8 | # that way you can calculate by hand and compare to the output of the function 9 | 10 | # toy data for testing functions 11 | # if altered, tests relying on these objects will be disrupted 12 | start1 = c(seq(from=1, to = 2001, by = 1000), 800) 13 | start2 = c(seq(from=126, to = 2126, by = 1000), 100, 2500) 14 | chrString1 = c(rep("chr1", 3), "chr2") 15 | chrString2 = c(chrString1, "chr3") 16 | 17 | origCoordDT1 = data.table(chr=chrString1, 18 | start = start1, 19 | end = start1 + 250) 20 | origCoordDT2 = data.table(chr=chrString2, 21 | start=start2, 22 | end=start2+150) 23 | coordDT1 = copy(origCoordDT1) 24 | coordDT2 = copy(origCoordDT2) 25 | 26 | testGR1 = dtToGr(coordDT1) 27 | testGR2 = dtToGr(coordDT2) 28 | testGR3 = GenomicRanges::shift(testGR2, 1000) 29 | testGR4 = GenomicRanges::shift(testGR2, 2500) 30 | testGR5 = GenomicRanges::shift(testGR2, 4000) 31 | ############################################################################### 32 | 33 | # test for calcOLCount 34 | # reset test data in case it was changed by another unit test section 35 | coordDT1 = copy(origCoordDT1) 36 | coordDT2 = copy(origCoordDT2) 37 | testGR1 = dtToGr(coordDT1) 38 | testGR2 = dtToGr(coordDT2) 39 | test_that("calcOLCount", { 40 | 41 | # uses midpoint coordinate of
queryRegionDT 42 | testGRList = GRangesList(dtToGr(data.table(chr=c("chr1", "chr1"), 43 | start = c(1, 2001), 44 | end = c(2000, 4000))), 45 | dtToGr(data.table(chr=c("chr2", "chr2"), 46 | start = c(1, 2001), 47 | end = c(2000, 4000))), 48 | dtToGr(data.table(chr=c("chr3", "chr3"), 49 | start = c(1, 2001), 50 | end = c(2000, 4000)))) 51 | olCount1 = calcOLCount(queryRegionDT = coordDT2, regionsGRL = testGRList) 52 | expect_equal(olCount1$N, c(2, 1, 1, 1)) 53 | expect_equal(olCount1$regionGroupID, c(1, 1, 2, 3)) 54 | 55 | # only expect one overlap: chr2 56 | olCount2 = calcOLCount(coordDT2, dtToGr(data.table(chr=c("chr1", "chr1", "chr2"), 57 | start = c(1, 250, 170), 58 | end = c(150, 300, 180)))) 59 | olCount2=as.data.frame(olCount2) 60 | expectedOut = data.frame(regionID=3, chr="chr2", start=170, end=180, withinGroupID=3, regionGroupID=1, N=1, stringsAsFactors = FALSE) 61 | expect_equal(olCount2, expectedOut) 62 | }) 63 | 64 | 65 | 66 | 67 | # "featureDistanceDistribution" function is now named "calcFeatureDist" 68 | # reset test data in case it was changed by another unit test section 69 | # and select just one chromosome - since DTNearest is a helper function calculating 70 | # distances within one chromosome 71 | coordDT1 = copy(origCoordDT1) 72 | coordDT2 = copy(origCoordDT2) 73 | testGR1 = dtToGr(coordDT1) 74 | testGR2 = dtToGr(coordDT2) 75 | test_that("featureDistribution", { 76 | 77 | ############# old 78 | # queryFile = system.file("extdata", "setB_100.bed.gz", package="GenomicDistributions") 79 | # query = rtracklayer::import(queryFile) 80 | # 81 | # featureExample = GenomicRanges::shift(query, round(rnorm(length(query), 0,1000))) 82 | # fdd = featureDistanceDistribution(query, featureExample) 83 | # featureFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") 84 | # feats = rtracklayer::import(featureFile) 85 | 86 | #' featureDistance = featureDistanceDistribution(query, feats) 87 | #'
expect_equal(sum(is.na(featureDistance)), -3) 88 | #' expect_equal(sum(featureDistance, na.rm=TRUE), 743969) 89 | ############# old 90 | 91 | coordDT1$end[1] = 100 92 | coordDT1$start[2] = 200 93 | coordDT1$end[2] = 400 94 | testGR1 = dtToGr(coordDT1) 95 | # DTNearest 96 | # @param DT1 data.table Has start and end column 97 | # @param DT2 98 | # @return numeric vector. Distance from region set to closest other region set. 99 | # Distance from the midpoint of each region to the midpoint. 100 | nearestVec = DTNearest(coordDT1, coordDT2) 101 | nearestVec 102 | expect_equal(nearestVec, c(124, -99, 276, 75)) 103 | 104 | 105 | # DTNearest ignores chromosome completely. By design. 106 | # DTNearest shouldn't be used with data from different chromosomes. 107 | # Suggested to split by chromosome when such case presents (e.g chrom1). 108 | DT1chrom1 = coordDT1[coordDT1$chr == "chr1"] 109 | DT2chrom1 = coordDT2[coordDT2$chr == "chr1"] 110 | nearestVec2C1 = DTNearest(DT2chrom1, DT1chrom1) 111 | expect_equal(nearestVec2C1, c(99, -901, -75)) 112 | 113 | featureDistance = calcFeatureDist(testGR1, testGR2) 114 | featureDistance 115 | expect_equal(featureDistance, c(150, -99, 75, -750)) 116 | featureDistance2 = calcFeatureDist(testGR2, testGR1) 117 | featureDistance2 118 | 119 | expect_equal(featureDistance2, c( 99, -901, -75, 750, NA)) 120 | 121 | # coordDT1$chr = "chr2" 122 | # testGR1 = dtToGr(coordDT1) 123 | # featureDistance = calcFeatureDist(testGR1, testGR2) 124 | # featureDistance 125 | # featureDistance2 = calcFeatureDist(testGR2, testGR1) 126 | # featureDistance2 127 | 128 | 129 | }) 130 | 131 | #' queryDT = GenomicDistributions:::grToDt(query) 132 | #' featureDT = GenomicDistributions:::grToDt(features) 133 | #' queryDTs = GenomicDistributions:::splitDataTable(queryDT, "chr") 134 | #' featureDTs = GenomicDistributions:::splitDataTable(featureDT, "chr") 135 | #' as.vector(unlist(mapply(queryDTs, featureDTs[names(queryDTs)], FUN=DTNearest))) 136 | 137 | 138 | 139 |
test_that("Genome aggregate", { 140 | queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") 141 | query = rtracklayer::import(queryFile) 142 | # First, calculate the distribution: 143 | x = aggregateOverGenomeBins(query, "hg19") 144 | # Then, plot the result: 145 | # plotGenomeAggregate(x) 146 | }) 147 | 148 | 149 | # "genomicPartitions" function changed to "calcPartitionsRef" 150 | 151 | test_that("Partitions", { 152 | 153 | ################### old 154 | #queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") 155 | #query = rtracklayer::import(queryFile) 156 | #gp = genomicPartitions(query, "hg38") 157 | #gp = genomicPartitions(query, "hg19") 158 | #gp = genomicPartitions(query, "mm10") 159 | #gp = genomicPartitions(query, "mm9") 160 | #plotPartitions(gp) 161 | ################### old 162 | 163 | # test calcPartitions() 164 | # GenomePartitionList 165 | promCore = GenomicRanges::reduce(trim(promoters(testGR2, upstream=100, downstream=0))) 166 | promProx = GenomicRanges::reduce(trim(promoters(testGR2, upstream=2000, downstream=0))) 167 | promoterProx = GenomicRanges::setdiff(promProx, promCore) 168 | 169 | # remove any possible overlaps between classes 170 | testGR5 = GenomicRanges::setdiff(testGR5, testGR4) 171 | testGR3 = GenomicRanges::setdiff(testGR3, testGR4) 172 | testGR3 = GenomicRanges::setdiff(testGR3, testGR5) 173 | 174 | nonThree = GenomicRanges::setdiff(testGR2, testGR4) 175 | nonThreeFive = GenomicRanges::setdiff(nonThree, testGR5) 176 | intronGR = GenomicRanges::setdiff(nonThreeFive, testGR3) 177 | 178 | partList = list(promoterCore=promCore, 179 | promoterProx=promoterProx, 180 | threeUTR=testGR4, 181 | fiveUTR=testGR5, 182 | exon=testGR3, 183 | intron=intronGR) 184 | 185 | gp = genomePartitionList(testGR2, testGR3, testGR4, testGR5) 186 | expect_equal(gp, partList) 187 | 188 | # calcPartitions 189 | partition
= rep(0, length(testGR1)) 190 | for (i in seq_along(partList)) { 191 | ols = countOverlaps(testGR1[partition==0], partList[[i]]) 192 | partition[partition==0][ols > 0] = names(partList)[[i]] 193 | } 194 | partition[partition=="0"] = "intergenic" 195 | testPartitions = data.frame(table(partition)) 196 | 197 | testPartitionNames = c("promoterCore", "promoterProx", "threeUTR", "fiveUTR", 198 | "exon", "intron", "intergenic") 199 | if (!all(testPartitionNames %in% testPartitions$partition)){ 200 | notIncluded = testPartitionNames[!(testPartitionNames %in% 201 | testPartitions$partition)] 202 | addRows = data.frame(partition = notIncluded, 203 | Freq = rep(0, length(notIncluded))) 204 | testPartitions = rbind(testPartitions, addRows) 205 | } 206 | 207 | Partitions = calcPartitions(testGR1, partList) 208 | expect_equal(Partitions, testPartitions) 209 | 210 | }) 211 | 212 | test_that("Neighbor distances", { 213 | 214 | testGRdt = grToDt(sort(testGR1)) 215 | splitdt = splitDataTable(testGRdt, "chr") 216 | chromTest = splitdt[[1]] 217 | # Compare bp distance generated by neighbordt 218 | distancesExp = neighbordt(chromTest) 219 | # Calculated by hand c(749, 749) 220 | expect_equal(distancesExp, c(749, 749)) 221 | 222 | # Compare distances from calcNeighborDist 223 | distances = calcNeighborDist(testGR1) 224 | expect_equal(distances, c(749, 749)) 225 | 226 | }) 227 | 228 | test_that("Nearest Neighbor distances", { 229 | # Expected values: a region's nearest neighbor is the smaller of its upstream and downstream gaps; the first and last regions have only one gap 230 | testGR2dt = grToDt(sort(testGR2)) 231 | splitdt2 = splitDataTable(testGR2dt, "chr") 232 | chromTest2 = splitdt2[[1]] 233 | # Compare bp distance generated by neighbordt 234 | nearestDistancesExp = neighbordt(chromTest2) 235 | up = nearestDistancesExp[-length(nearestDistancesExp)] 236 | down = nearestDistancesExp[-1] 237 | dt = data.table(i=up, j=down) 238 | pairmins = dt[, pmin(i, j)] 239 | nNeighbors = c(nearestDistancesExp[1], pairmins, 240 | nearestDistancesExp[length(nearestDistancesExp)]) 241 | 242 | # Calculated by hand c(849, 849, 849) 243 | expect_equal(nNeighbors, rep(849,
3)) 244 | 245 | # Compare distances from calcNearestNeighbors 246 | nearestNeighborsTest = calcNearestNeighbors(testGR2) 247 | expect_equal(nearestNeighborsTest, rep(849, 3)) 248 | 249 | }) 250 | 251 | -------------------------------------------------------------------------------- /vignettes/figures-full-power/GC-content-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/GC-content-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/TSS-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/TSS-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/TSS-plot-closeup-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/TSS-plot-closeup-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/chrom-bin-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/chrom-bin-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/cumulative-partitions-1.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/cumulative-partitions-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/custom-cumulative-partitions-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/custom-cumulative-partitions-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/custom-expected-partition-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/custom-expected-partition-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/custom-partition-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/custom-partition-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/dinuc-content-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/dinuc-content-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/expected-partition-plot-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/expected-partition-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/gene-distance-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/gene-distance-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/neighbor-distance-distribution-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/neighbor-distance-distribution-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/open-signal-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/open-signal-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/partition-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/partition-plot-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/partition-plot-proportional-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/partition-plot-proportional-1.png -------------------------------------------------------------------------------- /vignettes/figures-full-power/width-distribution-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/width-distribution-1.png --------------------------------------------------------------------------------