├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── lint.yml
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS
├── R
    ├── aggregate.R
    ├── buildReferenceData.R
    ├── chrom-plots.R
    ├── content-plots.R
    ├── data.R
    ├── feature-plots.R
    ├── loadData.R
    ├── neighbor-distances.R
    ├── package.R
    ├── partition-plots.R
    ├── qthist.R
    ├── specificity-plots.R
    ├── utility.R
    └── zalias.R
├── README.md
├── _pkgdown.yaml
├── data-raw
    ├── TSS_hg19.R
    ├── bedfiles.R
    ├── chromSizes_hg19.R
    └── geneModels_hg19.R
├── data
    ├── TSS_hg19.rda
    ├── cellTypeMetadata.rda
    ├── chromSizes_hg19.rda
    ├── datalist
    ├── exampleOpenSignalMatrix_hg19.rda
    ├── geneModels_hg19.rda
    ├── setB_100.rda
    └── vistaEnhancers.rda
├── inst
    ├── CITATION
    └── extdata
    │   ├── C_elegans_cropped_example.fa.gz
    │   ├── C_elegans_cropped_example.gtf.gz
    │   ├── example_cell_matrix.txt
    │   ├── setB_100.bed.gz
    │   └── vistaEnhancers.bed.gz
├── long_vignettes
    ├── full-power.Rmd
    └── render-long-vignettes.R
├── man
    ├── BSdtToGRanges.Rd
    ├── GenomicDistributions-package.Rd
    ├── TSS_hg19.Rd
    ├── binBSGenome.Rd
    ├── binChroms.Rd
    ├── binRegion.Rd
    ├── calcChromBins.Rd
    ├── calcChromBinsRef.Rd
    ├── calcChromBinsRefSlow.Rd
    ├── calcCumulativePartitions.Rd
    ├── calcCumulativePartitionsRef.Rd
    ├── calcDinuclFreq.Rd
    ├── calcDinuclFreqRef.Rd
    ├── calcExpectedPartitions.Rd
    ├── calcExpectedPartitionsRef.Rd
    ├── calcFeatureDist.Rd
    ├── calcFeatureDistRefTSS.Rd
    ├── calcGCContent.Rd
    ├── calcGCContentRef.Rd
    ├── calcNearestNeighbors.Rd
    ├── calcNeighborDist.Rd
    ├── calcPartitions.Rd
    ├── calcPartitionsRef.Rd
    ├── calcSummarySignal.Rd
    ├── calcWidth.Rd
    ├── cellTypeMetadata.Rd
    ├── chromSizes_hg19.Rd
    ├── dot-requireAndReturn.Rd
    ├── dot-validateInputs.Rd
    ├── dtToGr.Rd
    ├── dtToGrInternal.Rd
    ├── exampleOpenSignalMatrix_hg19.Rd
    ├── geneModels_hg19.Rd
    ├── genomePartitionList.Rd
    ├── getChromSizes.Rd
    ├── getChromSizesFromFasta.Rd
    ├── getGeneModels.Rd
    ├── getGeneModelsFromGTF.Rd
    ├── getGenomeBins.Rd
    ├── getReferenceData.Rd
    ├── getTssFromGTF.Rd
    ├── grToDt.Rd
    ├── labelCuts.Rd
    ├── loadBSgenome.Rd
    ├── loadEnsDb.Rd
    ├── neighbordt.Rd
    ├── nlist.Rd
    ├── plotChromBins.Rd
    ├── plotCumulativePartitions.Rd
    ├── plotDinuclFreq.Rd
    ├── plotExpectedPartitions.Rd
    ├── plotFeatureDist.Rd
    ├── plotGCContent.Rd
    ├── plotNeighborDist.Rd
    ├── plotPartitions.Rd
    ├── plotQTHist.Rd
    ├── plotSummarySignal.Rd
    ├── retrieveFile.Rd
    ├── setB_100.Rd
    ├── splitDataTable.Rd
    ├── theme_blank_facet_label.Rd
    └── vistaEnhancers.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── testChrom.R
    │   ├── testGCContent.R
    │   ├── testNeighborDist.R
    │   ├── testOpenChromatin.R
    │   ├── testPartitions.R
    │   └── test_all.R
└── vignettes
    ├── figures-full-power
        ├── GC-content-1.png
        ├── TSS-plot-1.png
        ├── TSS-plot-closeup-1.png
        ├── chrom-bin-plot-1.png
        ├── cumulative-partitions-1.png
        ├── custom-cumulative-partitions-1.png
        ├── custom-expected-partition-plot-1.png
        ├── custom-partition-plot-1.png
        ├── dinuc-content-1.png
        ├── expected-partition-plot-1.png
        ├── gene-distance-plot-1.png
        ├── neighbor-distance-distribution-1.png
        ├── open-signal-1.png
        ├── partition-plot-1.png
        ├── partition-plot-proportional-1.png
        └── width-distribution-1.png
    ├── full-power.Rmd
    └── intro.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | .git
 2 | .travis.yml
 3 | _pkgdown.yaml
 4 | long_vignettes
 5 | data-raw
 6 | ^.*\.Rproj$
 7 | ^\.Rproj\.user$
 8 | ^doc$
 9 | ^Meta$
10 | ^\.github$
11 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - master
 5 |       - dev
 6 |   pull_request:
 7 |     branches:
 8 |       - master
 9 |       - dev
10 | # Installs the package and lints it on pushes / pull requests to master, dev.
11 | name: Install and lint
12 | 
13 | jobs:
14 |   install-and-lint:
15 |     runs-on: macos-latest
16 |     env:
17 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
18 |     steps:
19 |       - uses: actions/checkout@v4
20 | 
21 |       - uses: r-lib/actions/setup-r@v2
22 |       # Snapshot the dependency list and R version; both feed the cache key.
23 |       - name: Query dependencies
24 |         run: |
25 |           install.packages('remotes')
26 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
27 |           writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
28 |         shell: Rscript {0}
29 | 
30 |       - name: Restore R package cache
31 |         uses: actions/cache@v4
32 |         with:
33 |           path: ${{ env.R_LIBS_USER }}
34 |           key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
35 |           restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
36 | 
37 |       - name: Install dependencies
38 |         run: |
39 |           install.packages(c("remotes"))
40 |           remotes::install_deps(dependencies = TRUE)
41 |           remotes::install_cran("lintr")
42 |         shell: Rscript {0}
43 | 
44 |       - name: Install package
45 |         run: R CMD INSTALL .
46 |       # assignment_linter is switched off; the package code assigns with "=".
47 |       - name: Lint
48 |         run: lintr::lint_package(linters = lintr::linters_with_defaults(assignment_linter = NULL))
49 |         shell: Rscript {0}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | docs/*
 2 | .Rproj.user
 3 | .RData
 4 | .Rhistory
 5 | *.Rproj
 6 | *.bed.gz
 7 | *.sqlite
 8 | 
 9 | # OS generated files 
10 | .DS_Store
11 | .DS_Store?
12 | ._*
13 | .Spotlight-V100
14 | .Trashes
15 | ehthumbs.db
16 | Thumbs.db
17 | 
18 | # Gedit temporary files 
19 | *~
20 | 
21 | # libreoffice lock files:
22 | .~lock*
23 | doc
24 | Meta
25 | /doc/
26 | /Meta/
27 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | branches:
2 |   only:
3 |   - master
4 |   - dev
5 | language: r
6 | r: bioc-release


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: GenomicDistributions
 2 | Version: 1.17.1
 3 | Date: 2025-04-23
 4 | Title: GenomicDistributions: fast analysis of genomic intervals with Bioconductor
 5 | Description: If you have a set of genomic ranges, this package can help you with
 6 |  visualization and comparison. It produces several kinds of plots, for example:
 7 |  Chromosome distribution plots, which visualize how your regions are distributed
 8 |  over chromosomes; feature distance distribution plots, which visualize how
 9 |  your regions are distributed relative to a feature of interest, like
10 |  Transcription Start Sites (TSSs); genomic partition plots, which visualize
11 |  how your regions overlap given genomic features such as promoters, introns,
12 |  exons, or intergenic regions. It also makes it easy to compare one set of
13 |  ranges to another.
14 | Authors@R: c(
15 |     person("Kristyna", "Kupkova", role=c("aut", "cre"),
16 |            email = "kristynakupkova@gmail.com"),
17 |     person("Jose", "Verdezoto", role="aut"),
18 |     person("Tessa", "Danehy", role="aut"),
19 |     person("John", "Lawson", role="aut"),
20 |     person("Michal", "Stolarczyk", role="aut"),
21 |     person("Jason", "Smith", role="aut"),
22 |     person("Bingjie", "Xue", role="aut"),
23 |     person("Sophia", "Rogers", role="aut"),
24 |     person("John", "Stubbs", role="aut"),
25 |     person(given=c("Nathan", "C."), family="Sheffield",
26 |            email = "nathan@code.databio.org",
27 |            role="aut"))
28 | Depends:
29 |     R (>= 4.0),
30 |     IRanges,
31 |     GenomicRanges
32 | Imports:
33 |     data.table,
34 |     ggplot2,
35 |     reshape2,
36 |     methods,
37 |     utils,
38 |     Biostrings,
39 |     plyr,
40 |     dplyr,
41 |     scales,
42 |     broom,
43 |     GenomeInfoDb,
44 |     stats
45 | Suggests:
46 |     AnnotationFilter,
47 |     rtracklayer,
48 |     testthat,
49 |     knitr,
50 |     BiocStyle,
51 |     rmarkdown,
52 |     GenomicDistributionsData
53 | Enhances:
54 |     BSgenome,
55 |     extrafont,
56 |     ensembldb,
57 |     GenomicFeatures
58 | LazyData: true
59 | VignetteBuilder: knitr
60 | License: BSD_2_clause + file LICENSE
61 | biocViews: Software, GenomeAnnotation, GenomeAssembly, DataRepresentation, Sequencing,
62 |             Coverage, FunctionalGenomics, Visualization
63 | RoxygenNote: 7.3.2
64 | URL: http://code.databio.org/GenomicDistributions
65 | BugReports: http://github.com/databio/GenomicDistributions
66 | Encoding: UTF-8
67 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2017
2 | COPYRIGHT HOLDER: Nathan Sheffield


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(binBSGenome)
 4 | export(binChroms)
 5 | export(binRegion)
 6 | export(calcChromBins)
 7 | export(calcChromBinsRef)
 8 | export(calcCumulativePartitions)
 9 | export(calcCumulativePartitionsRef)
10 | export(calcDinuclFreq)
11 | export(calcDinuclFreqRef)
12 | export(calcExpectedPartitions)
13 | export(calcExpectedPartitionsRef)
14 | export(calcFeatureDist)
15 | export(calcFeatureDistRefTSS)
16 | export(calcGCContent)
17 | export(calcGCContentRef)
18 | export(calcNearestNeighbors)
19 | export(calcNeighborDist)
20 | export(calcPartitions)
21 | export(calcPartitionsRef)
22 | export(calcSummarySignal)
23 | export(calcWidth)
24 | export(dtToGr)
25 | export(genomePartitionList)
26 | export(getChromSizes)
27 | export(getChromSizesFromFasta)
28 | export(getGeneModels)
29 | export(getGeneModelsFromGTF)
30 | export(getGenomeBins)
31 | export(getTssFromGTF)
32 | export(loadBSgenome)
33 | export(loadEnsDb)
34 | export(nlist)
35 | export(plotChromBins)
36 | export(plotCumulativePartitions)
37 | export(plotDinuclFreq)
38 | export(plotExpectedPartitions)
39 | export(plotFeatureDist)
40 | export(plotGCContent)
41 | export(plotNeighborDist)
42 | export(plotPartitions)
43 | export(plotQTHist)
44 | export(plotSummarySignal)
45 | export(retrieveFile)
46 | import(dplyr)
47 | import(ggplot2)
48 | importFrom(Biostrings,alphabetFrequency)
49 | importFrom(Biostrings,readDNAStringSet)
50 | importFrom(GenomicRanges,GRanges)
51 | importFrom(GenomicRanges,GRangesList)
52 | importFrom(GenomicRanges,elementMetadata)
53 | importFrom(GenomicRanges,granges)
54 | importFrom(GenomicRanges,makeGRangesFromDataFrame)
55 | importFrom(GenomicRanges,seqnames)
56 | importFrom(GenomicRanges,strand)
57 | importFrom(IRanges,IRanges)
58 | importFrom(IRanges,Views)
59 | importFrom(data.table,":=")
60 | importFrom(data.table,as.data.table)
61 | importFrom(data.table,copy)
62 | importFrom(data.table,data.table)
63 | importFrom(data.table,foverlaps)
64 | importFrom(data.table,fread)
65 | importFrom(data.table,is.data.table)
66 | importFrom(data.table,rbindlist)
67 | importFrom(data.table,setDT)
68 | importFrom(data.table,setattr)
69 | importFrom(data.table,setcolorder)
70 | importFrom(data.table,setkey)
71 | importFrom(data.table,setnames)
72 | importFrom(data.table,setorder)
73 | importFrom(data.table,tstrsplit)
74 | importFrom(methods,is)
75 | importFrom(reshape2,melt)
76 | importFrom(stats,chisq.test)
77 | importFrom(utils,data)
78 | importFrom(utils,download.file)
79 | importFrom(utils,getAnywhere)
80 | importFrom(utils,globalVariables)
81 | importFrom(utils,installed.packages)
82 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | # Change log
 2 | All notable changes to this project will be documented in this file. Here we
 3 | will document changes to major new releases only (not point releases).
 4 | 
 5 | ## [1.3.3] -- 2022-01-27
 6 |   - Updated cumulative distribution feature overlap plots and description
 7 | 
 8 | ## [1.3.2] -- 2022-01-20
 9 |   - Bioconductor released new version
10 |   - Cell specificity plots are now more generic signal summary plots - the calc function is now "calcSummarySignal", plot function "plotSummarySignal" 
11 |   - Added calculation of Chi-square p-values of expected partitions as default output of "calcExpectedPartitionsRef"
12 |   - Chi-square p-values are now optionally shown as stars on top of each partition output by "plotExpectedPartitions"
13 |   - Chromosome distribution calculations have been optimized to substantially reduce running time 
14 |   - Default plotting of neighbor distance now includes an X-axis log scale to account for outliers more intuitively 
15 |   - Corrected plotting of partition overlap for multiple region sets - now sums to 100 by group
16 |   - Intro vignette has been updated to include "calcSummarySignal" and "plotSummarySignal" functions along with expected partitions p-values calculation and updated X-axis scale of neighbor dist plots
17 | 
18 | ## [1.1.2] -- 2020-07-07
19 |   - Added functions to calculate and plot dinucleotide frequencies
20 | 
21 | ## [1.1.1] -- 2020-06-03
22 | 
23 |   - Package now on Bioconductor under 1.0.0 (dev version 1.1.0) - bump to 1.1.1
24 |   - Added sorting option to feature distance plot
25 |   - Added stack bar option to partition plots
26 |   - Fixed ordering of chromosome distribution
27 |   - Added possibility to calculate proportional overlap in partition plots
28 |   - Expected partition distribution now calculated based on annotation class object sizes
29 |   - Improved definition of introns in gene partition lists
30 |   - Improved memory and time performance
31 |   - In full power vignette added option to add custom annotation classes in partition plots
32 |   - Added functions to create annotations out of GTF files
33 |   - New vignette - how to build custom reference data
34 | 
35 | ## [0.99.0] -- 2020-05-26
36 | 
37 |         - Update R version to 4.0
38 |         - Bioconductor submission
39 | 
40 | ## [0.8] -- 2020-05-20
41 | 	
42 | 	- Added tiled version to feature distance plot
43 | 	- Added percentage plots to feature distances
44 | 	- Reduced data shipped with package
45 | 	- Reduced verbosity of calc functions
46 | 	- Simplified functions for calculating overlaps
47 | 	- Added UTRs to gene models
48 | 	- Moved data producing functions to data-raw
49 | 	- Improved 'full power' vignette
50 | 	- Bug fixes for edge cases, like plotting distribution with only one value
51 | 
52 | ## [0.7] -- 2020-04-11
53 | 
54 | 	- Revamped TSS distance distribution plots to better reflect the scale of distances
55 | 	- Added new versions of partition plots
56 | 	- Added new cell-type specificity plot
57 | 	- Added new quantile-trimmed histogram for width distribution plots
58 | 	- Added new unit tests and updated coding style for bioconductor
59 | 
60 | ## [0.6] -- 2019-09-20
61 | 
62 | 	- Added functions to calculate GC content
63 | 
64 | ## [0.5] -- 2018-04-30
65 | 
66 | 	- Built-in data added for mm9 assembly
67 | 	- Functions can now accept TxDb objects, in addition to EnsDb objects
68 | 
69 | ## [0.4] -- 2018-04-05
70 | 
71 | 	- Add partition plots
72 | 
73 | ## [0.3] -- 2018-03-05
74 | 
75 | 	- Revamp all function names, make functions more parallel and modular
76 | 	- Dramatic increase in speed for feature distance plots
77 | 
78 | ## [0.2] -- 2018-03-02
79 | 
80 | 	- Make bins the same size, instead of having the same number of bins per chrom
81 | 	- Divest dependency on BSGenome and ensembldb by integrating basic data
82 | 
83 | ## [0.1] -- 2018-02-01
84 | 
85 | 	- First version released
86 | 


--------------------------------------------------------------------------------
/R/aggregate.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # Quick way to count overlaps between a region set and one or more other
 3 | # region sets.
 4 | #
 5 | # Counts how many regions from the first region set (queryRegionDT)
 6 | # overlap with each of the regions from the other region set/s.
 7 | # Only the midpoint of each query region is used when finding overlaps.
 8 | # Emits a warning and returns NULL when nothing overlaps.
 9 | #
10 | # @param queryRegionDT data.frame/data.table. Must have "chr" and "start"
11 | # columns.
12 | # @param regionsGRL GRangesList or GRanges. E.g. Binned chromosomes, with
13 | # each chromosome as a GRanges object in the GRangesList.
14 | # @return a data.table with the following columns:
15 | # regionID, chr, start, end, withinGroupID, regionGroupID, N
16 | # the coordinates refer to the regions from regionsGRL.
17 | # "regionGroupID" refers to which GRanges from regionsGRL the given
18 | # region was a member of. "withinGroupID" refers to the index of the given
19 | # region within its GRanges object.
20 | # "regionID" has the index for the given region as if all the GRanges from
21 | # regionsGRL were combined into a single GRanges object.
22 | # The "N" column has the counts for number of query regions
23 | # overlapping with that regionsGRL region.
24 | calcOLCount = function(queryRegionDT, regionsGRL) {
25 |     # Accept a single GRanges by wrapping it in a GRangesList; any other
26 |     # class is rejected.
27 |     if (methods::is(regionsGRL, "GRanges")) {
28 |         regionsGRL = GRangesList(regionsGRL)
29 |     } else if (!methods::is(regionsGRL, "GRangesList")) {
30 |         stop("regionsGRL is not a GRanges or GRangesList object")
31 |     }
32 | 
33 |     # Convert query regions to just the midpoint. It is assigned to "start"
34 |     # since BSdtToGRanges keeps the start coord.
35 |     if ("end" %in% colnames(queryRegionDT)) {
36 |         queryRegionDT$start = round((queryRegionDT$start +
37 |                                      queryRegionDT$end)/2)
38 |     }
39 | 
40 |     # only keeps start column
41 |     bsgr = BSdtToGRanges(list(queryRegionDT))
42 | 
43 |     # It's required to do a findOverlaps on each region individually,
44 |     # not on a GRL, because of the way overlaps with GRLs work. So,
45 |     # we must convert the GRL to a GR, but we must keep track of which
46 |     # regions came from which group.
47 |     regionsGR = unlist(regionsGRL)
48 | 
49 |     # Number of regions in each group, as a plain integer vector so it can
50 |     # be used directly by sequence() and rep() below.
51 |     groupSizes = as.integer(unlist(lapply(regionsGRL, length)))
52 | 
53 |     # Build a table to keep track of which regions belong to which group.
54 |     # sequence() numbers the regions within each group; unlike seq(n), it
55 |     # contributes nothing for an empty group (seq(0) would yield c(1, 0)
56 |     # and break the column lengths).
57 |     region2group = data.table(
58 |         regionID=seq_along(regionsGR),
59 |         chr=as.vector(seqnames(regionsGR)),
60 |         start=as.vector(start(regionsGR)),
61 |         end=as.vector(end(regionsGR)),
62 |         withinGroupID=sequence(groupSizes),
63 |         regionGroupID=rep(seq_along(regionsGRL), groupSizes))
64 |     setkey(region2group, regionID)
65 | 
66 |     message("Finding overlaps...")
67 |     fo = findOverlaps(bsgr[[1]], regionsGR)
68 | 
69 |     # NOTE: setkey() sorts queryRegionDT by chr and start; data.table does
70 |     # this by reference, so a data.table passed in without an "end" column
71 |     # may come back reordered -- TODO confirm against callers.
72 |     setkey(queryRegionDT, chr, start)
73 | 
74 |     message("Setting regionIDs...")
75 |     # Restrict to the query rows that overlap any region.
76 |     queryRegionDT = queryRegionDT[queryHits(fo),]
77 | 
78 |     # Nothing overlapped: warn and return NULL instead of aggregating.
79 |     if (NROW(queryRegionDT) < 1) {
80 |         warning("No overlapping regions in the given region list; ",
81 |                 "please expand your regionsGRL")
82 |         return(NULL)
83 |     }
84 |     # Record which region they overlapped.
85 |     queryRegionDT[,regionID:=subjectHits(fo)]
86 | 
87 |     # Now actually do the aggregate. .N is the number of rows in each
88 |     # by-group, i.e. the number of query midpoints that hit the region.
89 |     message("Combining...")
90 |     bsCombined = queryRegionDT[, .N, by=list(regionID)]
91 |     setkey(bsCombined, regionID)
92 | 
93 |     # Join the counts onto the region bookkeeping table; only regions with
94 |     # at least one overlapping query region are returned.
95 |     e = region2group[bsCombined,]
96 |     setkey(e, regionID)
97 |     return(e)
98 | }
99 | 


--------------------------------------------------------------------------------
/R/buildReferenceData.R:
--------------------------------------------------------------------------------
  1 | #' Read local or remote file
  2 | #'
  3 | #' @param source a string: path to a local file or URL of a remote file
  4 | #' @param destDir a string that indicates the path to the directory where
  5 | #'       a downloaded file should be stored (created if missing). If not
  6 | #'       provided, a temporary directory will be used.
  7 | #'
  8 | #' @return a string with the path to the local file
  9 | #' @export
 10 | #'
 11 | #' @examples
 12 | #' CElegansGtfCropped = system.file("extdata", 
 13 | #'                                  "C_elegans_cropped_example.gtf.gz", 
 14 | #'                                  package="GenomicDistributions")
 15 | #' CElegansGtf = retrieveFile(CElegansGtfCropped)
 16 | retrieveFile = function(source, destDir=NULL){
 17 |     if (file.exists(source)) {
 18 |         message("Got local file: ", source)
 19 |         return(source)
 20 |     }
 21 |     # Not a local path: treat source as a URL and download into destDir
 22 |     if (is.null(destDir)) destDir = tempdir()
 23 |     if (!dir.exists(destDir)) dir.create(destDir, recursive = TRUE)
 24 |     destFile = file.path(destDir, basename(source))
 25 |     if (file.exists(destFile)) {
 26 |         message("File exists: ", destFile)
 27 |     } else {
 28 |         message("File will be saved in: ", destFile)
 29 |         # mode="wb" writes the downloaded bytes unchanged on every platform
 30 |         download.file(url = source, destfile = destFile, mode = "wb")
 31 |     }
 32 |     return(destFile)
 33 | }
 34 | 
 35 | 
 36 | #' Get transcription start sites (TSSs) from a remote or local GTF file
 37 | #'
 38 | #' @param source a string that is either a path to a local or remote GTF
 39 | #' @param destDir a string that indicates the path to the directory where 
 40 | #'        the downloaded GTF file should be stored
 41 | #' @param convertEnsemblUCSC a logical indicating whether Ensembl style 
 42 | #'        chromosome annotation should be changed to UCSC style
 43 | #' @param filterProteinCoding a logical; if TRUE (default) only records of
 44 | #'        type "gene" with a "protein_coding" gene_biotype are used, if
 45 | #'        FALSE every record of the GTF is used
 46 | #'
 47 | #' @return a GRanges object with the TSS positions
 48 | #'
 49 | #' @import dplyr
 50 | #' @export
 51 | #'
 52 | #' @examples
 53 | #' CElegansGtfCropped = system.file("extdata", 
 54 | #'                                  "C_elegans_cropped_example.gtf.gz", 
 55 | #'                                  package="GenomicDistributions")
 56 | #' CElegansTss = getTssFromGTF(CElegansGtfCropped, TRUE)
 57 | getTssFromGTF = function(source, convertEnsemblUCSC=FALSE, destDir=NULL,
 58 |                          filterProteinCoding=TRUE){
 59 |     gtfPath = retrieveFile(source, destDir)
 60 |     gtfRecords = as.data.frame(rtracklayer::import(gtfPath))
 61 |     if (filterProteinCoding) {
 62 |         gtfRecords = dplyr::filter(gtfRecords,
 63 |                                    gene_biotype == "protein_coding",
 64 |                                    type == "gene")
 65 |     }
 66 |     geneRanges = makeGRangesFromDataFrame(gtfRecords, keep.extra.columns = TRUE)
 67 |     # window of 1 bp upstream and 1 bp downstream around each TSS
 68 |     tss = promoters(geneRanges, 1, 1)
 69 |     # Ensembl -> UCSC naming: prefix every seqlevel with "chr"
 70 |     if (convertEnsemblUCSC) seqlevels(tss) = paste0("chr", seqlevels(tss))
 71 |     tss
 72 | }
 73 | 
 74 | 
 75 | #' Get gene models from a remote or local GTF file
 76 | #'
 77 | #' @param source a string that is either a path to a local or remote GTF
 78 | #' @param destDir a string that indicates the path to the directory where
 79 | #'        the downloaded GTF file should be stored
 80 | #' @param features a vector of strings with the feature identifiers to
 81 | #'        include in the result list
 82 | #' @param convertEnsemblUCSC a logical indicating whether Ensembl style 
 83 | #'        chromosome annotation should be changed to UCSC style
 84 | #' @param filterProteinCoding a logical indicating if the gene models
 85 | #'        should be restricted to protein-coding genes (default = TRUE)
 86 | #'
 87 | #' @return a list of GRanges objects, named by feature
 88 | #'
 89 | #' @import dplyr
 90 | #' @export
 91 | #'
 92 | #' @examples
 93 | #' CElegansGtfCropped = system.file("extdata", 
 94 | #'                                  "C_elegans_cropped_example.gtf.gz", 
 95 | #'                                  package="GenomicDistributions")
 96 | #' features = c("gene", "exon", "three_prime_utr", "five_prime_utr")
 97 | #' CElegansGeneModels = getGeneModelsFromGTF(CElegansGtfCropped, features, TRUE)
 98 | getGeneModelsFromGTF = function(source,
 99 |                                  features,
100 |                                  convertEnsemblUCSC = FALSE,
101 |                                  destDir = NULL,
102 |                                  filterProteinCoding=TRUE) {
103 |     gtfPath = retrieveFile(source, destDir)
104 |     gtfRecords = as.data.frame(rtracklayer::import(gtfPath))
105 |     if (filterProteinCoding) {
106 |         gtfRecords = dplyr::filter(gtfRecords,
107 |                                    gene_biotype == "protein_coding")
108 |     }
109 |     message("Extracting features: ", paste(features, collapse = ", "))
110 |     geneModels = list()
111 |     for (feat in features) {
112 |         # records of this feature type -> GRanges on standard chromosomes
113 |         featRecords = filter(gtfRecords, type == feat)
114 |         featGR = GenomicRanges::makeGRangesFromDataFrame(
115 |             featRecords, keep.extra.columns = TRUE)
116 |         featGR = GenomeInfoDb::keepStandardChromosomes(
117 |             featGR, pruning.mode = "coarse")
118 |         # drop duplicated ranges, then merge the remaining ones
119 |         featGR = GenomicRanges::reduce(unique(featGR))
120 |         # change from Ensembl style chromosome annotation to UCSC style
121 |         if (convertEnsemblUCSC) {
122 |             seqlevels(featGR) = paste0("chr", seqlevels(featGR))
123 |         }
124 |         geneModels[[feat]] = featGR
125 |     }
126 |     geneModels
127 | }
128 | 
129 | 
130 | #' Get chromosome sizes from a remote or local FASTA file
131 | #'
132 | #' @param source a string that is either a path to a  
133 | #'        local or remote FASTA
134 | #' @param destDir a string that indicates the path to the 
135 | #'        directory where the downloaded FASTA file should be stored
136 | #' @param convertEnsemblUCSC a logical indicating whether Ensembl style 
137 | #'        chromosome annotation should be changed to UCSC style (add chr)
138 | #' @return a named vector of sequence lengths
139 | #' @importFrom Biostrings readDNAStringSet
140 | #' @export
141 | #'
142 | #' @examples
143 | #' CElegansFastaCropped = system.file("extdata", 
144 | #'                                    "C_elegans_cropped_example.fa.gz", 
145 | #'                                    package="GenomicDistributions")
146 | #' CElegansChromSizes = getChromSizesFromFasta(CElegansFastaCropped)
147 | getChromSizesFromFasta = function(source, destDir=NULL,
148 |                                   convertEnsemblUCSC=FALSE) {
149 |     # Fetch the FASTA (downloading it if needed) and read its sequences
150 |     sequences = readDNAStringSet(retrieveFile(source, destDir))
151 |     # Sequence ID = FASTA header text up to the first space
152 |     firstToken = function(header) strsplit(header, " ")[[1]][1]
153 |     seqIDs = vapply(sequences@ranges@NAMES, firstToken, character(1))
154 |     if (convertEnsemblUCSC) {
155 |         # Ensembl -> UCSC naming: prefix "chr"
156 |         seqIDs = paste0("chr", seqIDs)
157 |     }
158 |     # Sequence widths are the chromosome sizes; label them with the IDs
159 |     chromSizes = sequences@ranges@width
160 |     names(chromSizes) = seqIDs
161 |     chromSizes
162 | }
163 | 


--------------------------------------------------------------------------------
/R/chrom-plots.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Divide regions into roughly equal bins
  3 | #'
  4 | #' Given a start coordinate, end coordinate, and number of bins to divide, 
  5 | #' this function will split the regions into that many bins.
  6 | #' Bin sizes are only approximately equal due to rounding (should differ by <= 1).
  7 | #'
  8 | #' Use case: take a set of regions, like CG islands, and bin them; now you can
  9 | #' aggregate signal scores across the bins, giving you an aggregate signal
 10 | #' in bins across many regions of the same type.
 11 | #'
 12 | #' In theory, this just runs on 3 values, but you can run it inside a 
 13 | #' data.table j expression to divide a bunch of regions in the same way.
 14 | #' Bin coordinates are computed from the region widths (end - start): the
 15 | #' first bin of every region starts at 1; start is not added as an offset.
 16 | #' @param start The starting coordinate
 17 | #' @param end The ending coordinate
 18 | #' @param binSize The size of bin to divide the genome into. You must supply
 19 | #'     either binSize (priority) or binCount.
 20 | #' @param binCount The number of bins to divide. If you do not supply binSize,
 21 | #'     you must supply binCount, which will be used to calculate the binSize.
 22 | #' @param indicator A vector with identifiers to keep with your bins, in case
 23 | #'     you are doing this on a long table with multiple segments concatenated
 24 | #'
 25 | #' @return A data.table with one row per bin (columns start, end) and these
 26 | #'     id columns (plus idCol, holding indicator, when it is supplied):
 27 | #'     id: region ID
 28 | #'     binID: repeating ID (this is the value to aggregate across)
 29 | #'     ubinID: unique bin IDs
 30 | #' @export
 31 | #' @examples
 32 | #' Rbins = binRegion(1, 3000, 100, 1000)
 33 | binRegion = function(start, end, binSize=NULL, binCount=NULL, indicator=NULL) {
 34 |     .validateInputs(list(start="numeric", end="numeric"))
 35 |     if (is.null(binSize) && is.null(binCount)) {
 36 |         stop("You must provide either binSize or binCount")
 37 |     }
 38 |     regionWidths = end - start
 39 |     if (is.null(binSize)) {
 40 |         binSize = round(sum(regionWidths)/binCount)
 41 |     }
 42 |     # Bins per region; a region that rounds to zero bins still gets one
 43 |     nBins = round(regionWidths/binSize)
 44 |     nBins = replace(nBins, nBins == 0, 1)
 45 |     exactBinWidth = regionWidths/nBins
 46 |     # Break positions 0..width for each region, all regions concatenated
 47 |     breakIdx = unlist(lapply(nBins, function(k) seq(from=0, to=k)))
 48 |     breaks = round(breakIdx * rep(exactBinWidth, nBins + 1))
 49 |     # Positions within breaks of the last / first break of every region
 50 |     lastBreak = cumsum(nBins + 1)
 51 |     firstBreak = c(1, lastBreak[-length(lastBreak)] + 1)
 52 |     binEnds = breaks[-firstBreak]
 53 |     binned = data.table(start=breaks[-lastBreak] + 1,
 54 |             end=binEnds,
 55 |             id=rep(seq_along(start), nBins),
 56 |             binID=unlist(lapply(nBins, function(k) seq(from=1, to=k))),
 57 |             ubinID=seq_along(binEnds),
 58 |             key="id")
 59 |     if (is.null(indicator)) {
 60 |         return(binned)
 61 |     }
 62 |     return(data.table(idCol=rep(indicator, nBins), binned))
 63 | }
 64 | 
 65 | #' Bins a BSgenome object.
 66 | #'
 67 | #' Given a BSgenome object (to be loaded via \code{loadBSgenome}), and a number
 68 | #' of bins, this will bin that genome. It is a simple wrapper of the
 69 | #' \code{binChroms} function
 70 | #' 
 71 | #' @param genome A UCSC-style string denoting reference assembly (e.g. 'hg38')
 72 | #' @param binCount number of bins per chromosome
 73 | #' @return A data.table object showing the region and bin IDs 
 74 | #'         of the reference genome.
 75 | #' @export
 76 | #' @examples
 77 | #' \dontrun{
 78 | #' binCount = 1000
 79 | #' refGenomeBins = binBSGenome("hg19", binCount)
 80 | #' }
 81 | binBSGenome = function(genome, binCount) {
 82 |     .validateInputs(list(genome="character", binCount="numeric"))
 83 |     # Chromosome lengths come straight from the loaded BSgenome object.
 84 |     chromLengths = seqlengths(loadBSgenome(genome))
 85 |     binChroms(binCount, chromLengths)
 86 | }
 87 | 
 88 | #' Naively splits a chromosome into bins
 89 | #' 
 90 | #' Given a list of chromosomes with corresponding sizes, this script will
 91 | #' produce (roughly) evenly-sized bins across the chromosomes. It does not
 92 | #' account for assembly gaps or the like.
 93 | #' 
 94 | #' @param binCount number of bins (total; *not* per chromosome)
 95 | #' @param chromSizes a named list of size (length) for each chromosome.
 96 | #' @return A data.table object assigning a bin ID to each chromosome region.
 97 | #' @export
 98 | #' @examples 
 99 | #' chromSizes = c(chr1=249250621, chr2=243199373, chr3=198022430)
100 | #' cBins = binChroms(1000, chromSizes)
101 | #' 
102 | binChroms = function(binCount, chromSizes) {
103 |     .validateInputs(list(chromSizes="numeric", binCount="numeric"))
104 |     # One row per chromosome, spanning position 1 to the chromosome length.
105 |     chromRanges = data.table(chr=names(chromSizes), start=1, end=chromSizes)
106 |     # Bin all chromosomes in a single call; binCount is the genome-wide total
107 |     # and the chromosome name is carried along as the 'idCol' column.
108 |     chromRanges[, binRegion(start, end, binCount=binCount, indicator=chr)]
109 | }
110 | 
111 | 
112 | #' Calculates the distribution of a query set over the genome
113 | #' 
114 | #' Returns a data.table showing counts of regions from the query that overlap
115 | #' with each bin.
116 | #' In other words, where on which chromosomes are the ranges distributed?
117 | #' You must provide binned regions. Only the midpoint of each query region is
118 | #' used to test for overlap with the bin regions.
119 | #' 
120 | #' @param query A GenomicRanges or GenomicRangesList object with query regions
121 | #' @param bins Pre-computed bins (as a GRangesList object) to aggregate
122 | #'    over; for example, these could be genome bins
123 | #' @return A data.table showing where on which chromosomes 
124 | #'    ranges are distributed.
125 | #' @export
126 | #' @examples
127 | #' 
128 | #' chromSizes = getChromSizes("hg19")
129 | #' genomeBins  = getGenomeBins(chromSizes)
130 | #' chromDistribution = calcChromBins(vistaEnhancers, genomeBins)
131 | #' 
132 | #' vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
133 | #' vistaSftd2 = GenomicRanges::shift(vistaEnhancers, 200000)
134 | #' calcChromBins(vistaEnhancers, GRangesList(vistaSftd, vistaSftd2))
135 | calcChromBins = function(query, bins) {
136 |     .validateInputs(list(bins=c("GRanges","GRangesList"),
137 |                          query=c("GRanges","GRangesList")))
138 |     if (is(query, "GRangesList")) {
139 |         # Several region sets: compute each one separately, then stack the
140 |         # per-set tables and tag every row with the set it came from.
141 |         perSet = lapply(query, calcChromBins, bins)
142 |         setLabels = names(query)
143 |         if (is.null(setLabels)) {
144 |             # Unnamed list: label the sets 1, 2, 3, ...
145 |             setLabels = seq_along(query)
146 |         }
147 |         rowsPerSet = vapply(perSet, nrow, integer(1))
148 |         stacked = rbindlist(perSet)
149 |         stacked$name = rep(setLabels, rowsPerSet)
150 |         return(stacked)
151 |     }
152 | 
153 |     # Single region set: the per-bin counting is delegated to calcOLCount.
154 |     regionTable = grToDt(query)
155 |     binCounts = calcOLCount(regionTable, bins)
156 | 
157 |     # Freeze the chromosome order as it currently appears in the result, so
158 |     # the factor levels follow that order.
159 |     binCounts[, chr:=factor(chr, levels=unique(binCounts$chr))]
160 |     return(binCounts)
161 | }
162 | 
163 | #' Returns the distribution of query over a reference assembly
164 | #'
165 | #' Given a query set of elements (a GRanges object) and a reference assembly
166 | #' (*e.g.* 'hg38'), this will aggregate and count the distribution of the query
167 | #' elements across bins of the reference genome. This is a helper function to
168 | #' create features for common genomes. It is a wrapper of
169 | #' \code{calcChromBins}, which is more general.
170 | #'
171 | #' @param query A GenomicRanges or GenomicRangesList object with query regions
172 | #' @param refAssembly A character vector that will be used to grab chromosome
173 | #'     sizes with \code{getChromSizes}
174 | #' @param binCount Number of bins to divide the chromosomes into
175 | #' @return A data.table showing the distribution of regions across bins of the
176 | #' reference genome.
177 | #' @examples 
178 | #' ChromBins = calcChromBinsRefSlow(vistaEnhancers, "hg19")
179 | calcChromBinsRefSlow = function(query, refAssembly, binCount=3000) {
180 |     .validateInputs(list(refAssembly="character", 
181 |                            query=c("GRanges","GRangesList")))
182 |     # Build genome-wide bins from the assembly's chromosome sizes.
183 |     binTable = binChroms(binCount, getChromSizes(refAssembly))
184 |     # Split the bin table on its "id" column and convert each piece to a
185 |     # GRanges, taking the sequence names from the "idCol" column.
186 |     binsByRegion = lapply(splitDataTable(binTable, "id"), dtToGr, chr="idCol")
187 |     # Hand the binned genome to the generic counting routine.
188 |     return(calcChromBins(query, GRangesList(binsByRegion)))
189 | }
190 | 
191 | 
192 | #' Returns the distribution of query over a reference assembly
193 | #'
194 | #' Given a query set of elements (a GRanges object) and a reference assembly
195 | #' (*e.g.* 'hg38'), this will aggregate and count the distribution of the query
196 | #' elements across bins of the reference genome. This is a helper function to
197 | #' create features for common genomes. It is a wrapper of
198 | #' \code{calcChromBins}, which is more general.
199 | #'
200 | #' @param query A GenomicRanges or GenomicRangesList object with query regions
201 | #' @param refAssembly A character vector that will be used to grab chromosome
202 | #'     sizes with \code{getChromSizes}
203 | #' @param binCount Number of bins to divide the chromosomes into
204 | #' @return A data.table showing the distribution of regions across bins of the
205 | #' reference genome.
206 | #' @export
207 | #' @examples 
208 | #' ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
209 | calcChromBinsRef = function(query, refAssembly, binCount=3000) {
210 |     .validateInputs(list(refAssembly="character",
211 |                          query=c("GRanges","GRangesList")))
212 |     if (is(query, "GRangesList")) {
213 |         # Recurse over each region set, then stack and label rows by set.
214 |         x = lapply(query, calcChromBinsRef, refAssembly, binCount)
215 |         nameList = names(query)
216 |         if (is.null(nameList)) {
217 |             nameList = seq_along(query) # Fallback to sequential numbers
218 |         }
219 |         xb = rbindlist(x)
220 |         xb$name = rep(nameList, vapply(x, nrow, integer(1)))
221 |         return(xb)
222 |     }
223 |     # Bin the genome; binChroms stores the chromosome name in 'idCol'.
224 |     binnedDT = binChroms(binCount, getChromSizes(refAssembly))
225 |     setnames(binnedDT, "idCol", "chr")
226 |     queryDT = grToDt(query)
227 |     # Represent each query region by its midpoint. (This used to be
228 |     # start + (end-start), i.e. the region end rather than the midpoint.)
229 |     queryDT[, midpoint:=start + round((end-start)/2)]
230 |     # Non-equi join: keep the bin whose [start, end] contains the midpoint,
231 |     # then count the query regions that landed in each bin.
232 |     hits = binnedDT[queryDT, .(chr, regionID=ubinID, withinGroupID=x.binID, start=x.start, end=x.end), 
233 |                     on=.(chr, start<=midpoint, end>=midpoint), nomatch=0L]
234 |     res = hits[, list(.N), by=list(chr, start, end, regionID, withinGroupID)][order(regionID),]
235 |     res[, chr:=factor(chr, levels=unique(res$chr))]
236 |     return(res)
237 | }
238 | 
239 | 
240 | 
241 | #' Plot distribution over chromosomes
242 | #' 
243 | #' Plots result from \code{genomicDistribution} calculation
244 | #' @param genomeAggregate The output from the genomicDistribution function
245 | #' @param plotTitle Title for plot.
246 | #' @param ylim Limit of y-axes. Default "max" sets limit to N of biggest bin.
247 | #' @return A ggplot object showing the distribution of the query 
248 | #'     regions over bins of
249 | #' the reference genome.
250 | #' @export
251 | #' @examples
252 | #' agg = data.frame("regionID"=1:5, "chr"=rep(c("chr1"), 5), 
253 | #'                 "withinGroupID"=1:5, "N"=c(1,3,5,7,9))  
254 | #' ChromBins = plotChromBins(agg)
255 | #' 
256 | plotChromBins = function(genomeAggregate,
257 |                            plotTitle="Distribution over chromosomes", ylim="max") {
258 |     .validateInputs(list(genomeAggregate=c("data.table","data.frame")))
259 |     # Work on a data.table copy: setkey() and := below need a data.table
260 |     # (a plain data.frame used to fail here) and must not alter the input.
261 |     genomeAggregate = data.table(genomeAggregate)
262 |     if ("name" %in% names(genomeAggregate)){
263 |         # Multiple region sets: sort by bin, keep chromosomes in that order
264 |         setkey(genomeAggregate, regionID)
265 |         genomeAggregate[, chr:=factor(chr, levels=unique(genomeAggregate$chr))]
266 |         g = ggplot(genomeAggregate, aes(x=withinGroupID, y=N, 
267 |                                         fill=name, color=name))
268 |     } else {
269 |         # It's a single region
270 |         g = ggplot(genomeAggregate, aes(x=withinGroupID, y=N))
271 |     }
272 |     g = g +
273 |         xlab("Genome") + 
274 |         ylab("Number of regions") +
275 |         geom_bar(stat="identity") + # Spread out to max width
276 |         facet_grid(chr ~ .) + # Place chromosomes one on top of another
277 |         theme_classic() + # Clean up cruft
278 |         theme_blank_facet_label() + # No boxes around labels
279 |         theme(panel.spacing=unit(0, "lines")) + # Reduce whitespace
280 |         theme(strip.text.y=element_text(size=12, angle=0)) + # Rotate labels
281 |         geom_hline(yintercept=0, color="#EEEEEE") + # Light chrom lines
282 |         {if (identical(ylim, "max")) { # default: scale to the biggest bin
283 |             scale_y_continuous(breaks = c(max(genomeAggregate$N)),
284 |                                limits = c(0, max(genomeAggregate$N)))
285 |         } else {
286 |             scale_y_continuous(breaks = ylim,
287 |                                limits = c(0, ylim))
288 |         }} +
289 |     scale_x_continuous(breaks=c(0, max(genomeAggregate$withinGroupID)), labels=c("Start", "End")) +
290 |     theme(plot.title=element_text(hjust=0.5)) + # Center title
291 |     ggtitle(plotTitle) +
292 |     theme(legend.position="bottom")
293 |     return(g)
294 | }
295 | 
296 | #' Returns bins used in `calcChromBins` function
297 | #'
298 | #' Given a named vector of chromosome sizes, the function returns a
299 | #' GRangesList object with bins for each chromosome.
300 | #'
301 | #' @param chromSizes a named list of size (length) for each chromosome.
302 | #' @param binCount number of bins (total; *not* per chromosome), 
303 | #'        defaults to 10,000
304 | #' @return A GRangesList object with bins that separate chromosomes
305 | #'         into equal parts.
306 | #' @export
307 | #' @examples 
308 | #' chromSizes = getChromSizes("hg19")
309 | #' chromBins  = getGenomeBins(chromSizes)
310 | #' 
311 | getGenomeBins = function(chromSizes, binCount=10000) {
312 |   # Accept double sizes too, matching binChroms (which validates "numeric").
313 |   .validateInputs(list(chromSizes=c("integer", "numeric")))
314 |   binnedDT = binChroms(binCount, chromSizes)
315 |   # Split the bins on the "id" column and convert each piece to a GRanges,
316 |   # taking the sequence names from the "idCol" column.
317 |   listGR = lapply(splitDataTable(binnedDT, "id"), dtToGr, chr="idCol")
318 |   return(GRangesList(listGR))
319 | }
320 | 


--------------------------------------------------------------------------------
/R/content-plots.R:
--------------------------------------------------------------------------------
  1 | #' Calculate GC content over genomic ranges
  2 | #' 
  3 | #' Given a reference genome as a BSgenome object and some ranges on that
  4 | #' reference, this function will return a vector of the same length as the
  5 | #' granges object, with percent of Cs and Gs.
  6 | #' 
  7 | #' @param query  A GenomicRanges or GenomicRangesList object with query regions.
  8 | #' @param ref Reference genome BSgenome object.
  9 | #' @return A numeric vector or list of vectors with the GC percentage of 
 10 | #'     the query regions.
 11 | #' @export
 12 | #' @examples
 13 | #' \dontrun{
 14 | #' bsg = loadBSgenome('hg19')
 15 | #' gcvec = calcGCContent(vistaEnhancers, bsg)
 16 | #' }
 17 | calcGCContent = function(query, ref) {
 18 |     .validateInputs(list(query=c("GRanges","GRangesList"),
 19 |                          ref="BSgenome"))
 20 |     if (is(query, "GRangesList")) {
 21 |         # Recurse over each GRanges object
 22 |         x = lapply(query, calcGCContent, ref)
 23 |         if (is.null(names(query))) {
 24 |             # Unnamed list: label the results 1, 2, 3, ...
 25 |             names(x) = seq_along(query)
 26 |         }
 27 |         return(x)
 28 |     }
 29 |     # Restrict the seqnames to known chromosomes
 30 |     query = GenomeInfoDb::keepStandardChromosomes(query, pruning.mode="coarse")
 31 |     v = IRanges::Views(ref, query)
 32 |     # drop=FALSE keeps the counts a matrix even for a single region; without
 33 |     # it the subset collapses to a vector and the row-wise sum fails.
 34 |     cg = Biostrings::alphabetFrequency(v)[, c("C","G"), drop=FALSE]
 35 |     gcvec = rowSums(cg)/width(v)
 36 |     return(gcvec)
 37 | }
 38 | 
 39 | 
 40 | #' Calculate GC content over genomic ranges
 41 | #' 
 42 | #' Given a reference genome as a BSgenome object and some ranges on that
 43 | #' reference, this function will return a vector of the same length as the
 44 | #' granges object, with percent of Cs and Gs.
 45 | #' 
 46 | #' @param query A GenomicRanges or GenomicRangesList object with query regions
 47 | #' @param refAssembly A character vector specifying the reference genome
 48 | #'     assembly (*e.g.* 'hg19'). This will be used to load the reference
 49 | #'     genome with \code{loadBSgenome}.
 50 | #' @return A numeric vector or list of vectors with the GC percentage of 
 51 | #'     the query regions.
 52 | #' @export
 53 | #' @examples
 54 | #' \dontrun{
 55 | #' refAssembly = 'hg19'
 56 | #' GCcontent = calcGCContentRef(vistaEnhancers, refAssembly)
 57 | #' } 
 58 | calcGCContentRef = function(query, refAssembly) {
 59 |     .validateInputs(list(query=c("GRanges","GRangesList"),
 60 |                          refAssembly="character"))
 61 |     bsgenome = loadBSgenome(refAssembly)  # assembly name -> genome object
 62 |     calcGCContent(query, bsgenome)
 63 | }
 64 | 
 65 | #' Plots a density distribution of GC vectors
 66 | #'
 67 | #' Given results from the \code{calcGCContent} function, this will produce a
 68 | #' density plot
 69 | #' @param gcvectors A numeric vector or list of numeric vectors of GC contents.
 70 | #' @return A ggplot object plotting distribution of GC content in query regions.
 71 | #' @export
 72 | #' @examples
 73 | #' numVector = rnorm(400, mean=0.5, sd=0.1)
 74 | #' GCplot = plotGCContent(numVector)
 75 | #' vecs = list(example1 = rnorm(400, mean=0.5, sd=0.1), 
 76 | #'             example2 = rnorm(600, mean=0.5, sd=0.1))
 77 | #' GCplot = plotGCContent(vecs)
 78 | #' 
 79 | plotGCContent = function(gcvectors) {
 80 |     .validateInputs(list(gcvectors=c("numeric", "list")))
 81 |     if (is(gcvectors, "list")) {
 82 |         nameList = names(gcvectors)
 83 |         # Unnamed list: fall back to "1", "2", ... (character = discrete groups)
 84 |         if (is.null(nameList)) nameList = as.character(seq_along(gcvectors))
 85 |         gcdfReshaped = data.frame(value = unlist(gcvectors),
 86 |                                   regionSet = rep(nameList, lengths(gcvectors)))
 87 |         meansdf = aggregate(gcdfReshaped$value, 
 88 |                             list(gcdfReshaped$regionSet), mean)
 89 |         g = ggplot2::ggplot(gcdfReshaped, aes(x=value, colour=regionSet)) +
 90 |             geom_density() +
 91 |             geom_vline(data=meansdf, aes(xintercept=x, colour=Group.1),
 92 |                        linetype="dashed", size=0.5) +
 93 |             theme_classic() +
 94 |             theme(legend.position = "bottom")
 95 |     } else {
 96 |         # plot a single regionset
 97 |         gcdfReshaped = data.frame(value = gcvectors)
 98 |         g = ggplot2::ggplot(gcdfReshaped, aes(x=value)) + 
 99 |             geom_density() + 
100 |             geom_vline(aes(xintercept=mean(value)),
101 |                        color="red", linetype="dashed", size=0.5) + 
102 |             theme_classic()
103 |     }    
104 |     g = g + 
105 |         ggtitle("GC content distribution") + 
106 |         theme(plot.title = element_text(hjust=0.5)) +
107 |         xlab("GC content") + 
108 |         xlim(0,1) 
109 |     return(g)
110 | }
111 | 
112 | #' Calculate dinucleotide content over genomic ranges
113 | #' 
114 | #' Given a reference genome (BSgenome object) and ranges on the
115 | #' reference, this function returns a data.table with 
116 | #' counts of dinucleotides within the GRanges object.
117 | #' 
118 | #' @param query A GRanges object with query sets
119 | #' @param ref Reference genome BSgenome object
120 | #' @param rawCounts a logical indicating whether the raw numbers should be 
121 | #'     displayed, rather than percentages (optional).
122 | #' @return A data.table with counts of dinucleotides across the GRanges object
123 | #' @export
124 | #' @examples
125 | #' \dontrun{ 
126 | #' bsg = loadBSgenome('hg19')
127 | #' DNF = calcDinuclFreq(vistaEnhancers, bsg)
128 | #' }
129 | 
130 | calcDinuclFreq = function(query, ref, rawCounts=FALSE) {
131 |     .validateInputs(list(query=c("GRanges","GRangesList"),
132 |                          ref="BSgenome"))
133 |     if (is(query, "GRangesList")) {
134 |         # Several region sets: compute one result table per set and return
135 |         # them together as a list.
136 |         perSet = lapply(query, calcDinuclFreq, ref, rawCounts=rawCounts)
137 |         return(perSet)
138 |     }
139 |     # Restrict the seqnames to known chromosomes
140 |     query = GenomeInfoDb::keepStandardChromosomes(query, pruning.mode="coarse")
141 |     # Label every region as "<seqname>_<start>_<end>".
142 |     regionLabels = paste(seqnames(query), start(query), end(query), sep="_")
143 |     regionNames = data.frame(region = regionLabels)
144 |     # Count dinucleotides in the reference sequence under each region.
145 |     refViews = IRanges::Views(ref, query)
146 |     dnCounts = Biostrings::dinucleotideFrequency(refViews)
147 |     if (!rawCounts) {
148 |         # Calculate frequencies if raw counts are not required: row-wise
149 |         # proportions, scaled to percent.
150 |         dnCounts = prop.table(dnCounts, margin = 1)*100
151 |     }
152 |     # The region label becomes the first column of the result.
153 |     result = cbind(regionNames, as.data.frame(dnCounts))
154 |     return(result)
155 | }
156 | 
157 | 
158 | #' Calculate dinucleotide content over genomic ranges
159 | #' 
160 | #' Given a reference genome (BSgenome object) and ranges on the
161 | #' reference, this function returns a data.table with 
162 | #' counts of dinucleotides within the GRanges object.
163 | #' 
164 | #' @param query A GRanges object with query sets
165 | #' @param refAssembly A character vector specifying the reference genome
166 | #'     assembly (*e.g.* 'hg19'). This will be used to load the reference
167 | #'     genome with \code{loadBSgenome}.
168 | #' @param rawCounts a logical indicating whether the raw numbers should be 
169 | #'     displayed, rather than percentages (optional).
170 | #' @return A data.frame (or a list of data.frames for a GRangesList query)
171 | #'     with dinucleotide frequencies, or raw counts, for each query region.
172 | #' @export
173 | #' @examples
174 | #' \dontrun{
175 | #'query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
176 | #'GRquery = rtracklayer::import(query)
177 | #'refAssembly = 'hg19'
178 | #'DNF = calcDinuclFreqRef(GRquery, refAssembly)
179 | #' } 
180 | 
181 | calcDinuclFreqRef = function(query, refAssembly, rawCounts=FALSE) {
182 |     # Thin wrapper around calcDinuclFreq that takes an assembly name
183 |     # (e.g. 'hg19') instead of a BSgenome object.
184 |     .validateInputs(list(query=c("GRanges","GRangesList"),
185 |                          refAssembly="character"))
186 |     # Resolve the assembly name to its reference genome object.
187 |     bsgenome = loadBSgenome(refAssembly)
188 |     # rawCounts is forwarded unchanged (FALSE: percentages, TRUE: counts).
189 |     calcDinuclFreq(query, bsgenome, rawCounts=rawCounts)
190 | }
191 | 
192 | 
193 | #' Plot dinucleotide content within region set(s)
194 | #' 
195 | #' Given \code{calcDinuclFreq} or \code{calcDinuclFreqRef} results, this function 
196 | #' generates a violin plot of dinucleotide frequency
197 | #' 
198 | #' @param DNFDataTable A data.table, data.frame, or a list of dinucleotide counts - 
199 | #'                    results from \code{calcDinuclFreq} or \code{calcDinuclFreqRef}
200 | #' @return A ggplot object plotting distribution of dinucleotide content in query regions
201 | #' @export
202 | #' @examples
203 | #' 
204 | #' DNFDataTable = data.table::data.table(GC = rnorm(400, mean=0.5, sd=0.1), 
205 | #' CG = rnorm(400, mean=0.5, sd=0.5), 
206 | #' AT = rnorm(400, mean=0.5, sd=1), 
207 | #' TA = rnorm(400, mean=0.5, sd=1.5))
208 | #' DNFPlot =  plotDinuclFreq(DNFDataTable)
209 | #' 
210 | #' \dontrun{
211 | #' query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
212 | #' GRquery = rtracklayer::import(query)
213 | #' refAssembly = 'hg19'
214 | #' DNF = calcDinuclFreqRef(GRquery, refAssembly)
215 | #' DNFPlot2 =  plotDinuclFreq(DNF)
216 | #' } 
217 | 
218 | plotDinuclFreq = function(DNFDataTable) {
219 |   .validateInputs(list(DNFDataTable=c("data.table","data.frame","list")))
220 |   
221 |   # Does the input carry a "region" label column? For a list, it is enough
222 |   # that any element has one; for a single table, check its column names.
223 |   isListInput = is(DNFDataTable, "list")
224 |   regionInList = isListInput &&
225 |     any(vapply(DNFDataTable, function(x) any(names(x) == "region"), logical(1)))
226 |   regionInTable = !regionInList &&
227 |     (is(DNFDataTable, "data.frame") | is(DNFDataTable, "data.table")) &&
228 |     ("region" %in% colnames(DNFDataTable))
229 |   
230 |   # Reshape the data for plotting (long format). The label column, when
231 |   # present, is kept as an id variable; otherwise there is none.
232 |   idColumn = if (regionInList || regionInTable) "region" else NULL
233 |   g = reshape2::melt(DNFDataTable, id.vars=idColumn,
234 |                      variable.name="dinucleotide", value.name="frequency")
235 |   
236 |   # Plot data as violin plots. With multiple inputs, make a facet for each
237 |   # dinucleotide and put the L1 column (presumably added by melt for list
238 |   # input) on the x axis, to make the plot easier to read.
239 |   if (isListInput) {
240 |     p = ggplot2::ggplot(data=g, ggplot2::aes(x=L1, y=frequency, fill=L1)) +
241 |       facet_wrap(~dinucleotide, nrow=4) +
242 |       theme_bw() +
243 |       theme(strip.background =element_rect(fill="white"))+
244 |       theme(strip.text = element_text(face = "bold")) +
245 |       theme(axis.text.x = element_text(angle=90, hjust=1)) +
246 |       xlab(" ")
247 |   } else {
248 |     p = ggplot2::ggplot(data=g, ggplot2::aes(x=dinucleotide, y=frequency)) +
249 |       xlab("Dinucleotide") +
250 |       theme_bw()
251 |   }
252 |   # Check if we have raw counts (integer column) or frequencies.
253 |   yLabel = if (is(g[,"frequency"], "integer")) {
254 |     "Dinucleotide counts per region [n]"
255 |   } else {
256 |     "Dinucleotide frequency per region [%]"
257 |   }
258 |   p = p +
259 |     geom_violin(trim=TRUE, scale = "width") +
260 |     geom_boxplot(alpha=0.2, outlier.shape = NA) +
261 |     ggtitle("Dinucleotide Frequency") +
262 |     guides(fill="none") +
263 |     theme(plot.title = element_text(hjust = 0.5)) +
264 |     ylab(yLabel)
265 |   return(p)
266 | }
267 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
  1 | #' hg19 chromosome sizes
  2 | #'
  3 | #' A dataset containing chromosome sizes for Homo Sapiens hg19 genome assembly
  4 | #'
  5 | #' @format A named vector of lengths with one item per chromosome
  6 | #' @source BSgenome.Hsapiens.UCSC.hg19 package
  7 | #' @name chromSizes_hg19
  8 | #' @docType data
  9 | #' @keywords datasets
 10 | #' @usage data(chromSizes_hg19)
 11 | NULL
 12 | 
 13 | 
 14 | #' hg19 TSS locations
 15 | #'
 16 | #' A dataset containing TSS locations for Homo Sapiens hg19 genome assembly
 17 | #'
 18 | #' @format A named vectors of lengths with one item per chromosome
 19 | #' @source EnsDb.Hsapiens.v75 package
 20 | #' @name TSS_hg19
 21 | #' @docType data
 22 | #' @keywords datasets
 23 | #' @usage data(TSS_hg19)
 24 | NULL
 25 | 
 26 | #' hg19 gene models
 27 | #'
 28 | #' A dataset containing gene models for Homo Sapiens hg19 genome assembly. 
 29 | #'
 30 | #' @format A list of two GRanges objects, with genes and exons locations
 31 | #' @source EnsDb.Hsapiens.v75 package
 32 | #' @name geneModels_hg19
 33 | #' @docType data
 34 | #' @keywords datasets
 35 | #' @usage data(geneModels_hg19)
 36 | NULL
 37 | 
 38 | 
 39 | #' Example hg19 open signal matrix 
 40 | #' 
 41 | #' A dataset containing a subset of open chromatin regions across all 
 42 | #' cell types defined by ENCODE for Homo Sapiens hg19
 43 | #'
 44 | #' Preparation steps:
 45 | #' \enumerate{
 46 | #'    \item{made a universe of regions by merging regions across 
 47 | #'        cell types defined as opened in ENCODE}
 48 | #'    \item{took bigwig files from ENCODE for individual cell types, 
 49 | #'        merged replicates, filtered out blacklisted sites}
 50 | #'    \item{evaluated the signal above regions defined by previous step}
 51 | #'    \item{performed quantile normalization}
 52 | #'    \item{subsetted it}
 53 | #' }
 54 | #'
 55 | #' @format data.frame, rows represent whole selection of open 
 56 | #' chromatin regions across all cell types defined by ENCODE, columns are 
 57 | #' individual cell types and values are normalized open chromatin signal values.
 58 | #' @source \url{http://big.databio.org/open_chromatin_matrix/openSignalMatrix_hg19_quantileNormalized_round4.txt.gz}
 59 | #' @name exampleOpenSignalMatrix_hg19
 60 | #' @docType data
 61 | #' @keywords datasets
 62 | #' @usage data(exampleOpenSignalMatrix_hg19)
 63 | NULL
 64 | 
 65 | 
 66 | #' Example BED file
 67 | #' 
 68 | #' Example BED file read with rtracklayer::import
 69 | #'
 70 | #' @format GenomicRanges::GRanges
 71 | #' @name vistaEnhancers
 72 | #' @docType data
 73 | #' @keywords datasets
 74 | #' @usage data(vistaEnhancers)
 75 | NULL
 76 | 
 77 | 
 78 | #' Example BED file
 79 | #' 
 80 | #' Example BED file read with rtracklayer::import
 81 | #'
 82 | #' @format GenomicRanges::GRanges
 83 | #' @name setB_100
 84 | #' @docType data
 85 | #' @keywords datasets
 86 | #' @usage data(setB_100)
 87 | NULL
 88 | 
 89 | 
 90 | #' Cell type metadata matrix
 91 | #' 
 92 | #' Table that maps cell types to tissues and groups
 93 | #'
 94 | #' @format data.table with 3 columns (cellType, tissue and group) 
 95 | #'     and 74 rows (one per cellType)
 96 | #' @source self-curated dataset
 97 | #' @name cellTypeMetadata
 98 | #' @docType data
 99 | #' @keywords datasets
100 | #' @usage data(cellTypeMetadata)
101 | NULL
102 | 
103 | 


--------------------------------------------------------------------------------
/R/feature-plots.R:
--------------------------------------------------------------------------------
  1 | # Old, slow version based on GRanges methods
  2 | #
  3 | # Find the distance to the nearest genomic feature.
  4 | # 
  5 | # For a given query set of genomic regions, and a given feature set of 
  6 | # regions, this function will return the distance for each query region to its
  7 | # closest feature. It ignores strand and returns the distance as positive or 
  8 | # negative, depending on whether the feature is upstream or downstream.
  9 | # 
 10 | # This function is similar to the bioconductor distanceToNearest function, but
 11 | # returns negative values for downstream distances instead of absolute values.
 12 | # This allows you to assess the relative location.
 13 | # 
 14 | # @param query A GRanges or GRangesList object with query sets
 15 | # @param features A GRanges object with features to test distance to
 16 | # 
 17 | # @return A vector of genomic distances for each query region relative to its 
 18 | #         closest feature.
 19 | calcFeatureDistBioc = function(query, features) {
 20 |     .validateInputs(list(query=c("GRangesList","GRanges")))
 21 |     if (is(query, "GRangesList")) {
 22 |         # Recurse with this same implementation (not the fast calcFeatureDist)
 23 |         x = lapply(query, calcFeatureDistBioc, features)
 24 |         return(x)
 25 |     }
 26 |     # Nearest feature on each side of every query region (NA if none).
 27 |     precedeInd = precede(query, features)
 28 |     preIndNA = is.na(precedeInd)
 29 |     followInd = follow(query, features)
 30 |     folIndNA = is.na(followInd)
 31 |     # Distances to features the query precedes are reported as negative.
 32 |     preDist = rep(NA, length(query))
 33 |     preDist[!preIndNA] = -distance(query[!preIndNA], 
 34 |                                    features[precedeInd[!preIndNA]])
 35 |     postDist = rep(NA, length(query))
 36 |     postDist[!folIndNA] = distance(query[!folIndNA], 
 37 |                                    features[followInd[!folIndNA]])
 38 |     # Take the following-side distance when it is strictly closer, or when
 39 |     # it is the only one available. The mask is full-length, so positions
 40 |     # stay aligned (the old NA-filtered mask was shorter and got recycled).
 41 |     usePost = is.na(preDist) | (!is.na(postDist) & -preDist > postDist)
 42 |     dists = preDist
 43 |     dists[usePost] = postDist[usePost]
 44 |     return(dists)
 45 | }
 46 | 
 47 | #' Find the distance to the nearest genomic feature
 48 | #' 
 49 | #' For a given query set of genomic regions, and a given feature set of 
 50 | #' regions, this function will return the distance for each query region to its
 51 | #' closest feature. It ignores strand and returns the distance as positive or 
 52 | #' negative, depending on whether the feature is upstream or downstream
 53 | #' 
 54 | #' This function is similar to the bioconductor distanceToNearest function, but
 55 | #' returns negative values for downstream distances instead of absolute values.
 56 | #' This allows you to assess the relative location.
 57 | #' 
 58 | #' @param query A GRanges or GRangesList object with query sets
 59 | #' @param features A GRanges object with features to test distance to
 60 | #' 
 61 | #' @return A vector of genomic distances for each query region relative to its 
 62 | #'     closest feature.
 63 | #' @export
 64 | #' @examples 
 65 | #' vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
 66 | #' calcFeatureDist(vistaEnhancers, vistaSftd) 
 67 | calcFeatureDist = function(query, features) {
 68 |     .validateInputs(list(query=c("GRangesList","GRanges")))
 69 |     if (is(query, "GRangesList")) {
 70 |         # One distance vector per region set, returned as a list.
 71 |         return(lapply(query, calcFeatureDist, features))
 72 |     }
 73 |     # Split both tables on the "chr" column so that regions are only
 74 |     # compared with features from the matching chromosome.
 75 |     queryByChr = splitDataTable(grToDt(query), "chr")
 76 |     featuresByChr = splitDataTable(grToDt(features), "chr")
 77 |     # Pair each query chromosome with its features (looked up by name).
 78 |     perChr = mapply(FUN=DTNearest, queryByChr, featuresByChr[names(queryByChr)])
 79 |     as.vector(unlist(perChr))
 80 | }
 81 | 
 82 | # Function uses data.table rolling join to identify the nearest features
 83 | # really quickly.
 84 | #
 85 | # @param DT1 A data.table object to be joined to a second data.table object.
 86 | # @param DT2 A second data.table object to join with DT1.
 87 | #
 88 | # @return A rolling joined data.table object.
 89 | DTNearest = function(DT1, DT2) {
 90 |     if (is.null(DT1)) {
 91 |         return(NULL)  # no query regions: nothing to report
 92 |     }
 93 |     if (is.null(DT2)) {
 94 |         return(rep(NA, nrow(DT1)))  # no features: every distance is NA
 95 |     }
 96 |     # Add (by reference) the midpoint of every region to both tables.
 97 |     DT1[, mid:=start + round((end-start)/2)]
 98 |     DT2[, mid:=start + round((end-start)/2)]
 99 |     # Sort both on the midpoint and flag them as sorted on it for the join.
100 |     data.table::setorder(DT1, mid)
101 |     data.table::setorder(DT2, mid)
102 |     data.table::setattr(DT1, "sorted", "mid")
103 |     data.table::setattr(DT2, "sorted", "mid")
104 |     # Single rolling join; an identical join whose result was unused is gone.
105 |     DT2[J(DT1), start+round((end-start)/2)-mid, roll="nearest"]
106 | }
107 | 
108 | 
#' Calculates the distribution of distances from a query set to closest TSS
#' 
#' Convenience wrapper around \code{calcFeatureDist}: the TSS annotation for
#' the requested reference assembly is fetched with \code{getTSSs} and used
#' as the feature set against which the query regions are measured.
#' 
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector specifying the reference genome
#'     assembly (*e.g.* 'hg19'). This will be used to look up the TSS
#'     annotation with \code{getTSSs}.
#' @return A vector of distances for each query region relative to TSSs.
#' @export
#' @examples 
#' calcFeatureDistRefTSS(vistaEnhancers, "hg19")
calcFeatureDistRefTSS = function(query, refAssembly) {
    # Resolve the TSS set first so an unsupported assembly fails early.
    tssRegions = getTSSs(refAssembly)
    calcFeatureDist(query, tssRegions)
}
129 | 
130 | 
# Converts a nucleotide count into a label with abbreviation
#
# @param x A single numeric base count (may be negative).
# @return `x` unchanged when its magnitude is below 1,000; otherwise a
#     character label, rounded to whole units, with ' kb' or ' mb' appended.
genomeLabel = function(x) {
    .validateInputs(list(x="numeric"))
    magnitude = abs(x)
    # Thresholds are inclusive so that exact multiples use the larger unit
    # (1e6 -> "1 mb" rather than "1000 kb", 1e3 -> "1 kb" rather than 1000).
    if (magnitude >= 1e6) {
        return(paste0(round(x/1e6), " mb"))
    }
    if (magnitude >= 1e3) {
        return(paste0(round(x/1e3), " kb"))
    }
    return(x)
}
145 | 
146 | 
#' Plots a histogram of distances to genomic features
#' 
#' Given the results from \code{calcFeatureDist} (or
#' \code{calcFeatureDistRefTSS}), plots a histogram of distances surrounding
#' the features of interest
#' 
#' @param dists Results from \code{calcFeatureDist}: a vector of distances,
#'     or a list of such vectors (one per region set).
#' @param bgdists Background distances. If provided, will plot a background
#'     distribution of expected distances 
#' @param featureName Character vector for plot labels (optional).
#' @param numbers a logical indicating whether the raw numbers should be 
#'     displayed, rather than percentages (optional).
#' @param nbins Number of bins on each side of the center point.
#' @param size Number of bases to include in plot on each side of the 
#'     center point.
#' @param infBins Include catch-all bins on the sides?
#' @param tile Turn on a tile mode, which plots a tiled figure 
#'     instead of a histogram.
#' @param labelOrder Only used when several region sets are plotted. 
#'     "default" keeps the sets in the order of the user input; "center" 
#'     orders the sets by their value in the bin next to the center of the 
#'     features (if TSSs are used, the center is the TSS).
#' @return A ggplot2 plot object
#' @export
#' @examples
#' TSSdist = calcFeatureDistRefTSS(vistaEnhancers, "hg19")
#' f = plotFeatureDist(TSSdist, featureName="TSS")
plotFeatureDist = function(dists, bgdists=NULL, featureName="features", 
                           numbers=FALSE, nbins=50, size=100000, 
                           infBins=FALSE, tile=FALSE, labelOrder="default") {
    df = cutDists(dists, divisions=NULL, nbins, size, infBins)
    
    if (is.list(dists)) {
        nplots = length(dists)
    } else {
        nplots = 1
    }

    if (!is.null(bgdists)) {
        bgDistsDF = cutDists(bgdists, divisions=NULL, nbins, size, infBins)
        # Background is always shown as a percentage of all background regions
        bgDistsDF$Freq = (bgDistsDF$Freq / sum(bgDistsDF$Freq)) * 100
        # Repeat the background once per region set so it lines up with df
        df$bgFreq = rep(bgDistsDF$Freq, nplots)
        df$bgX = rep(seq_len(nrow(bgDistsDF)), nplots)
    }
    
    if ("name" %in% names(df)) {
        # It has multiple regions
        # Fail early, with a clear message, on an unsupported ordering.
        labelOrder = match.arg(labelOrder, c("default", "center"))
        if (!numbers) {
            # Convert counts to percentages within each region set
            df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100), 
                         by = name]$"Freq.Per"
        }
        # Order the sets once, on the values that will actually be plotted
        df$name = sortingFunction(df, labelOrder, nbins)
        g = ggplot(df, aes(x=cuts, y=Freq, fill=name, color = name)) + 
            facet_grid(. ~name)
    } else {
        if (!numbers) {
            df$Freq = (df$Freq / sum(df$Freq)) * 100
        }
        g = ggplot(df, aes(x=cuts, y=Freq))
    }

    if (!is.null(bgdists)) {
        g = g + 
            geom_line(stat="identity", aes(x=bgX,y=bgFreq), 
                      color="gray", alpha=1, size=1.5) + 
            geom_bar(stat="identity", aes(x=cuts,y=bgFreq), 
                     fill="gray", alpha=0.8)
    }

    # find midpoint
    midx = nrow(df)/2/nplots
    barcount = nrow(df)/nplots
    minlabel = genomeLabel(-size)
    maxlabel = genomeLabel(size)
    # Only the outermost bars get an axis label
    edgeLabels = c(minlabel, rep("", barcount-2), maxlabel)

    if (tile) {
        if (!"name"  %in% names(df)) {
            df$name = "Region set"
        }

        ncuts = length(unique(df$cuts))
        xs = rep(seq_len(ncuts), nplots)
        g = ggplot(df) + 
            geom_raster(aes(x=xs, y=name, fill=Freq)) +
            scale_fill_gradient(low="navy", high="orange") +
            geom_point(aes(x=midx, y=0.5), color="black", 
                       size=2, shape=17, alpha=0.8) + 
            theme_classic() + 
            labs(fill=ifelse(numbers,"Counts","Frequency (%)")) +
            theme(legend.position="bottom") + 
            xlab(paste("Distance to", featureName)) +
            theme(axis.text.x=element_text(angle = 0, hjust = 0.5, 
                                           vjust=0.5)) +
            scale_x_continuous(breaks=c(1, ncuts), 
                               labels=c(minlabel, maxlabel))
        return(g)
    }
    
    if ("name" %in% names(df)) {
        g = g +
            geom_bar(data=df, stat="identity", alpha=0.7) 
    } else {
        g = g +
            geom_bar(data=df, stat="identity", fill="darkblue", alpha=0.7) 
    }
    g = g + geom_point(aes(x=midx, y=0), color="tan2", size=2, 
                       shape=17, alpha=0.8) +
        guides(fill="none") + # remove legend for geom_point
        theme_classic() + 
        theme(aspect.ratio=1) + 
        theme_blank_facet_label() + 
        xlab(paste("Distance to", featureName)) +
        ylab(ifelse(numbers,"Counts","Frequency (%)")) +
        theme(axis.text.x=element_text(angle = 0, hjust = 0.5, vjust=0.5)) + 
        theme(plot.title = element_text(hjust = 0.5)) + # Center title
        ggtitle(paste("Distribution relative to", featureName)) +
        theme(legend.position="bottom") + 
        theme(panel.spacing.x=unit(1, "lines")) + 
        # A single x scale: adding two makes ggplot2 replace the first one
        # and emit a "Scale for x is already present" message.
        scale_x_discrete(labels=edgeLabels, expand=expansion(mult=0.035))

    return(g)
}
269 | 
# Internal helper function for \code{plotFeatureDist}:
# orders datasets based on their order in the user provided list,
# or based on the value around the feature center (for TSSs, around the TSS)
#
# @param df A data.table with variables "cuts" - based on created bins in 
#    \code{plotFeatureDist} function, "Freq" - either frequency or raw 
#    counts in a given bin, "name" - name of the dataset. Rows of one dataset
#    are expected to be contiguous and every dataset to have the same number
#    of rows.
# @param labelOrder The method used to order datasets. Options: "default"
#    orders datasets in a plot based on order of datasets in GRangesList
#    provided by user; "center" orders datasets based on value in a central 
#    bin of the plot.
# @param nbins Number of bins on each side of the center point - input in 
#    \code{plotFeatureDist} function. Kept for backward compatibility; the
#    central bin is now located from the number of rows per dataset, which
#    also works when catch-all bins are present.
# @return A factor of names in "df" input with levels sorted based on 
#    sorting option.
sortingFunction = function(df, labelOrder="default", nbins=50){
    if (labelOrder == "default") {
        # Levels follow the order of first appearance in df
        return(factor(df$name, levels = unique(df$name)))
    }
    if (labelOrder == "center") {
        # Locate, within every dataset, the bin just left of the center.
        # With 2*nbins rows per dataset this is row `nbins` of each dataset;
        # deriving it from the data keeps the index aligned when extra
        # catch-all bins are present.
        nSets = length(unique(df$name))
        binsPerSet = nrow(df) / nSets
        centerIndex = seq(from = max(1, floor(binsPerSet / 2)),
                          by = binsPerSet, length.out = nSets)
        centerTiles = df[centerIndex,]
        # Sort labels by the central values and use them as factor levels
        orderTiles = centerTiles[order(centerTiles$Freq, decreasing = TRUE),]
        return(factor(df$name, levels = orderTiles$name))
    }
    # Previously an unknown option silently returned NULL, which dropped the
    # "name" column in the caller.
    stop("labelOrder must be either 'default' or 'center'")
}
305 | 
306 | 
# Internal helper function for \code{plotFeatureDist}
#
# Bins distances into intervals and counts how many fall into each one.
#
# @param dists A vector of genomic distances, or a list of such vectors.
# @param divisions A vector of break points to divide the dists into. If 
# NULL, symmetric breaks are built from \code{nbins} and \code{size}.
# @param nbins Number of bins on each side of the center point.
# @param size Number of bases to include in plot on 
# each side of the center point.
# @param infBins Include catch-all bins on the sides?
# @return A data.table with the frequency ("Freq") of dists per bin ("cuts").
# For list input the per-element tables are stacked and a "name" column 
# identifies the element each row came from.
cutDists = function(dists, divisions=NULL, nbins=50, 
                    size=100000, infBins=TRUE) {
    if (is.null(divisions)) {
        poscuts = seq(0, size, by=size/nbins)
        divisions = sort(unique(c(-poscuts, poscuts)))
        if (infBins) {
            divisions = c(-Inf, divisions, Inf)
        }
    }
    if (is.list(dists)) {
        # Pass infBins along: otherwise the per-element calls fall back to
        # the default (TRUE) and label the bins differently from a plain
        # vector input that uses the same divisions.
        x = lapply(dists, cutDists, divisions=divisions, infBins=infBins)

        # To accommodate multiple lists, we'll need to introduce a new 'name'
        # column to distinguish them.
        nameList = names(dists)
        if (is.null(nameList)) {
            nameList = seq_along(dists) # Fallback to sequential numbers
        }

        # Append names
        xb = rbindlist(x)
        xb$name = rep(nameList, vapply(x, nrow, integer(1)))

        return(xb)
    }

    labels = labelCuts(sort(divisions), collapse=" to ", infBins=infBins)
    cuts = cut(dists, divisions, labels)
    df = as.data.frame(table(cuts))
    setDT(df)
    return(df)
}
348 | 
349 | 


--------------------------------------------------------------------------------
/R/loadData.R:
--------------------------------------------------------------------------------
  1 | #' Loads BSgenome objects from UCSC-style character vectors.
  2 | #'
  3 | #' This function will let you use a simple character vector (e.g. 'hg19') to
  4 | #' load and then return BSgenome objects. This lets you avoid having to use the
  5 | #' more complex annotation for a complete BSgenome object (e.g.
  6 | #' BSgenome.Hsapiens.UCSC.hg38.masked)
  7 | #' 
  8 | #' @param genomeBuild One of 'hg19', 'hg38', 'mm10', 'mm9', or 'grch38'
  9 | #' @param masked Should we used the masked version? Default:TRUE
 10 | #' @return A BSgenome object corresponding to the provided genome build.
 11 | #' @export
 12 | #' @examples
 13 | #' \dontrun{
 14 | #' bsg = loadBSgenome('hg19')
 15 | #' }
 16 | loadBSgenome = function(genomeBuild, masked=TRUE) {
 17 |     # Convert the given string into the BSgenome notation
 18 |     if (!requireNamespace("BSgenome", quietly=TRUE)) {
 19 |         message("BSgenome package is not installed.")
 20 |     }
 21 |     databasePkgString = switch (genomeBuild,
 22 |                                 grch38 = "BSgenome.Hsapiens.UCSC.hg38",
 23 |                                 hg38 = "BSgenome.Hsapiens.UCSC.hg38",
 24 |                                 hg19 = "BSgenome.Hsapiens.UCSC.hg19",
 25 |                                 mm10 = "BSgenome.Mmusculus.UCSC.mm10",
 26 |                                 mm9 = "BSgenome.Mmusculus.UCSC.mm9",
 27 |                                 bogus = "bogus" # a bogus genome for tests
 28 |     )
 29 |     if (masked) {
 30 |         databasePkgString = paste0(databasePkgString, ".masked")
 31 |     }
 32 |     
 33 |     if (is.null(databasePkgString)) {
 34 |         stop("I don't know how to map the string ", genomeBuild,
 35 |              " to a BSgenome")
 36 |     }
 37 |     return(.requireAndReturn(databasePkgString))
 38 | }
 39 | 
 40 | #' Load selected EnsDb library
 41 | #'
 42 | #' @param genomeBuild string, genome identifier
 43 | #'
 44 | #' @return loaded library
 45 | #' @export
 46 | #'
 47 | #' @examples
 48 | #' \dontrun{
 49 | #' loadEnsDb("hg19")
 50 | #' }
 51 | loadEnsDb = function(genomeBuild) {
 52 |     databasePkgString = switch (genomeBuild,
 53 |                                 grch38 = "EnsDb.Hsapiens.v86",
 54 |                                 hg38 = "EnsDb.Hsapiens.v86",
 55 |                                 hg19 = "EnsDb.Hsapiens.v75",
 56 |                                 mm10 = "EnsDb.Mmusculus.v79",
 57 |                                 bogus = "bogus" # a bogus db for unit tests
 58 |     )
 59 |     
 60 |     if (is.null(databasePkgString)) {
 61 |         stop("I don't know how to map the string ", genomeBuild,
 62 |              " to a EnsDb")
 63 |     }
 64 |     return(.requireAndReturn(databasePkgString))
 65 | }
 66 | 
 67 | #' Returns built-in chrom sizes for a given reference assembly
 68 | #
 69 | #' @param refAssembly A string identifier for the reference assembly
 70 | #' @return A vector with the chromosome sizes corresponding to a 
 71 | #' specific genome assembly.
 72 | #' @export
 73 | #' @examples
 74 | #' getChromSizes("hg19")
 75 | getChromSizes = function(refAssembly) {
 76 |   datasetId = paste0("chromSizes_", refAssembly)
 77 |   
 78 |   if (refAssembly == "hg19"){
 79 |     
 80 |     chromSizesDataset = getReferenceData(refAssembly, tagline="chromSizes_")
 81 |     
 82 |   } else if (refAssembly == "hg38"){
 83 |     
 84 |     if (!"GenomicDistributionsData" %in% utils::installed.packages()){
 85 |       stop(paste(datasetId, "not available in GenomicDistributions package",
 86 |                  "and GenomicDistributionsData package is not installed"))
 87 |     } else {
 88 |       chromSizesDataset = GenomicDistributionsData::chromSizes_hg38()
 89 |     }
 90 |     
 91 |   } else if (refAssembly == "mm10"){
 92 |     
 93 |     if (!"GenomicDistributionsData" %in% utils::installed.packages()){
 94 |       stop(paste(datasetId, "not available in GenomicDistributions package",
 95 |                  "and GenomicDistributionsData package is not installed"))
 96 |     } else {
 97 |       chromSizesDataset = GenomicDistributionsData::chromSizes_mm10()
 98 |     }
 99 |     
100 |   } else if (refAssembly == "mm9"){
101 |     
102 |     if (!"GenomicDistributionsData" %in% utils::installed.packages()){
103 |       stop(paste(datasetId, "not available in GenomicDistributions package",
104 |                  "and GenomicDistributionsData package is not installed"))
105 |     } else {
106 |       chromSizesDataset = GenomicDistributionsData::chromSizes_mm9()
107 |     }
108 |     
109 |   } else {
110 |     stop(paste(datasetId, "not available in GenomicDistributions package",
111 |                "or GenomicDistributionsData package,",
112 |                "please use getChromSizesFromFasta() to get chromosome sizes."))
113 |   }
114 |   
115 |   return(chromSizesDataset)
116 | }
117 | 
118 | 
# Returns built-in TSSs for a given reference assembly
#
# 'hg19' is served through getReferenceData(); 'hg38', 'mm10' and 'mm9' are
# taken from the GenomicDistributionsData package, which has to be installed.
# Any other assembly is an error.
#
# @param refAssembly A string identifier for the reference assembly
# @return The TSS data object for the requested assembly.
getTSSs = function(refAssembly) { 
  datasetId = paste0("TSS_", refAssembly)
  
  if (refAssembly == "hg19") {
    return(getReferenceData(refAssembly, tagline="TSS_"))
  }
  
  if (!refAssembly %in% c("hg38", "mm10", "mm9")) {
    stop(paste(datasetId, "not available in GenomicDistributions package",
               "or GenomicDistributionsData package,",
               "please use getTssFromGTF() to get list of TSSs."))
  }
  
  # system.file() is the recommended way to test for one installed package;
  # installed.packages() reads the metadata of every installed package.
  if (!nzchar(system.file(package="GenomicDistributionsData"))) {
    stop(paste(datasetId, "not available in GenomicDistributions package",
               "and GenomicDistributionsData package is not installed"))
  }
  
  TSSs = switch(refAssembly,
    hg38 = GenomicDistributionsData::TSS_hg38(),
    mm10 = GenomicDistributionsData::TSS_mm10(),
    mm9 = GenomicDistributionsData::TSS_mm9()
  )
  
  return(TSSs)
}
164 | 
165 | 
#' Returns built-in gene models for a given reference assembly
#'
#' Some functions require gene models, which can be obtained from any source.
#' This function allows you to retrieve a few common built-in ones: 'hg19' is
#' served through \code{getReferenceData}; 'hg38', 'mm10' and 'mm9' are taken
#' from the GenomicDistributionsData package, which has to be installed.
#' @param refAssembly A string identifier for the reference assembly
#' @return A list containing the gene models corresponding to a
#' specific reference assembly.
#' @export
#' @examples
#' getGeneModels("hg19")
getGeneModels = function(refAssembly) { 
  datasetId = paste0("geneModels_", refAssembly)
  
  if (refAssembly == "hg19") {
    return(getReferenceData(refAssembly, tagline="geneModels_"))
  }
  
  if (!refAssembly %in% c("hg38", "mm10", "mm9")) {
    stop(paste(datasetId, "not available in GenomicDistributions package",
               "or GenomicDistributionsData package,",
               "please use getGeneModelsFromGTF() to get",
               "gene models."))
  }
  
  # system.file() is the recommended way to test for one installed package;
  # installed.packages() reads the metadata of every installed package.
  if (!nzchar(system.file(package="GenomicDistributionsData"))) {
    stop(paste(datasetId, "not available in GenomicDistributions package",
               "and GenomicDistributionsData package is not installed"))
  }
  
  geneModelsDataset = switch(refAssembly,
    hg38 = GenomicDistributionsData::geneModels_hg38(),
    mm10 = GenomicDistributionsData::geneModels_mm10(),
    mm9 = GenomicDistributionsData::geneModels_mm9()
  )
  
  return(geneModelsDataset)
}
219 | 
#' Get reference data for a specified assembly
#' 
#' This is a generic getter function that will return a data object requested,
#' if it is included in the built-in data with the GenomicDistributions package 
#' or GenomicDistributionsData package (if installed). Data objects can 
#' be requested for different reference assemblies and data types (specified by
#' a tagline, which is a unique string identifying the data type).
#' 
#' @param refAssembly Reference assembly string (e.g. 'hg38')
#' @param tagline The string that was used to identify data of a given type in 
#'     the data building step. It's used for the filename so we know
#'     what to load, and is what makes this function generic (so it 
#'     can load different data types).
#' @return A requested and included package data object.
getReferenceData = function(refAssembly, tagline) {
    # The dataset name is the tagline followed by the assembly,
    # e.g. "TSS_" + "hg19"
    datasetId = paste0(tagline, refAssembly)

    # First choice: data shipped with GenomicDistributions itself
    dataset = .getDataFromPkg(id=datasetId, "GenomicDistributions")
    if (!is.null(dataset)) {
        return(dataset)
    }

    # Second choice: the optional data package. system.file() is the
    # recommended way to test for one installed package; installed.packages()
    # reads the metadata of every installed package.
    if (!nzchar(system.file(package="GenomicDistributionsData"))) {
        stop(paste(datasetId, "not available in GenomicDistributions package",
                   "and GenomicDistributionsData package is not installed"))
    }
    dataset = .getDataFromPkg(id=datasetId, "GenomicDistributionsData")
    if (!is.null(dataset)) {
        return(dataset)
    }
    stop(paste(datasetId, "not available in GenomicDistributions and",
               "GenomicDistributionsData packages"))
}
249 | 
# Loads one named dataset from a package's data directory.
#
# @param id Name of the dataset to load.
# @param pkg Name of the package to look in.
# @return The dataset, or (invisibly) NULL when the package does not list a
#     dataset called `id`.
.getDataFromPkg = function(id, pkg){
    # Names of all datasets the package advertises
    availableItems = utils::data(package=pkg)$results[, "Item"]
    if (!(id %in% availableItems)) {
        return(invisible(NULL))
    }
    # Load into this function's own environment, then fetch it by name
    utils::data(list=id, package=pkg, envir=environment())
    get(id)
}
259 | 
260 | 


--------------------------------------------------------------------------------
/R/neighbor-distances.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | #' Group regions from the same chromosome together and
  5 | #' calculate the distances of a region to its upstream and
  6 | #' downstream neighboring regions.  
  7 | #' Distances are then lumped into a numeric vector. 
  8 | #'
  9 | #' @param query A GRanges or GRangesList object.
 10 | #' @param correctRef A string indicating the reference genome
 11 | #' to use if distances are corrected for the number of 
 12 | #' regions in a regionSet. 
 13 | #' 
 14 | #' @return A numeric vector or list with different vectors containing the
 15 | #'  distances of regions to their upstream/downstream neighbors.
 16 | #' @export
 17 | #' @examples 
 18 | #' dist = calcNeighborDist(vistaEnhancers)
 19 | calcNeighborDist =  function(query, correctRef="None") {
 20 |     .validateInputs(list(query=c("GRanges","GRangesList"),
 21 |                          correctRef=c("character")))
 22 |     # lapply if a GRangeslist is provided
 23 |     if (is(query, "GRangesList")) {
 24 |         dist = lapply(query,
 25 |                       function(x){calcNeighborDist(x, correctRef = correctRef)})
 26 |         namelist = names(query)
 27 |         if (is.null(namelist)) {
 28 |             newnames = seq_along(query)
 29 |             namelist = newnames
 30 |             # Append names
 31 |             names(dist) = namelist
 32 |         }
 33 |         return(dist)
 34 |     }
 35 |     querydt = grToDt(sort(query))
 36 |     querydts = splitDataTable(querydt, "chr")
 37 |     distanceVectors = lapply(querydts, neighbordt)
 38 |     d = as.vector(unlist(distanceVectors))
 39 |     # remove overlaps
 40 |     dcvec = d[!(d == "0")]
 41 |     # Correct for number of regions
 42 |     if (!correctRef=="None") {
 43 |         chromSizes = getChromSizes(correctRef)
 44 |         genomelen = sum(chromSizes)
 45 |         meanWidth = mean(calcWidth(query))
 46 |         expectedDist = genomelen/nrow(querydt) - meanWidth
 47 |         correctedDist = log10(dcvec/expectedDist)
 48 |         return(correctedDist)
 49 |     # If we just want to look at the raw neighbor distances
 50 |     } else {
 51 |         return(dcvec)
 52 |     }
 53 | }
 54 |   
 55 | #' Internal helper function to calculate distance 
 56 | #' between neighboring regions.
 57 | #'
 58 | #' @param querydt A data table with regions grouped according to
 59 | #' chromosome.
 60 | #' @return A numeric vector with the distances in bp 
 61 | neighbordt = function(querydt)  {
 62 |     # there should be at least 2 regions for each chr
 63 |     if (nrow(querydt) > 1) {
 64 |         endVect = abs(querydt[, diff(end)])
 65 |         regionWidth = querydt[, (end-start+1)]
 66 |         distancesVector = endVect - regionWidth[-1]
 67 |         # neg values represent overlaps between neighbor regions, set those to 0
 68 |         distancesVector[which(distancesVector < 0)] = 0 
 69 |         return(distancesVector)
 70 |   }
 71 | }
 72 | 
 73 | 
 74 | #' Group regions from the same chromosome together and
 75 | #' compute the distance of a region to its nearest neighbor. 
 76 | #' Distances are then lumped into a numeric vector. 
 77 | #'
 78 | #' @param query A GRanges or GRangesList object.
 79 | #' @param correctRef A string indicating the reference genome
 80 | #' to use if Nearest neighbor distances are corrected for the 
 81 | #' number of regions in a regionSet. 
 82 | #'
 83 | #' @return A numeric vector or list of vectors containing the
 84 | #'  distance of regions to their nearest neighbors.
 85 | #' @export
 86 | #' @examples 
 87 | #' Nneighbors = calcNearestNeighbors(vistaEnhancers)
 88 | calcNearestNeighbors = function(query, correctRef="None") {
 89 |     .validateInputs(list(query=c("GRanges","GRangesList"),
 90 |                          correctRef=c("character")))
 91 |     # lapply if a GRangeslist is provided
 92 |     if (is(query, "GRangesList")) {
 93 |         dist = lapply(query,
 94 |                       function(x){calcNearestNeighbors(x, correctRef = correctRef)})
 95 |         namelist = names(query)
 96 |         if (is.null(namelist)) {
 97 |             newnames = seq_along(query)
 98 |             namelist = newnames
 99 |             # Append names
100 |             names(dist) = namelist 
101 |         }
102 |         return(dist)
103 |     }
104 |     # Calculate nearest neighbors in a vectorized manner
105 |     dist = calcNeighborDist(query)
106 |     upstream = dist[-length(dist)]
107 |     downstream = dist[-1]
108 |     dt = data.table(i=upstream, j=downstream)
109 |     pairmins = dt[, pmin(i, j)]
110 |     # First and last distances are default nearest neighbors
111 |     nNeighbors = c(dist[1], pairmins, dist[length(dist)])
112 |     # Correct for number of regions
113 |     if (!correctRef=="None") {
114 |         chromSizes = getChromSizes(correctRef)
115 |         genomelen = sum(chromSizes)
116 |         meanWidth = mean(calcWidth(query))
117 |         expectedDist = genomelen/length(query) - meanWidth
118 |         correctedDist = log10(nNeighbors/expectedDist)
119 |         return(correctedDist)
120 |     } else {
121 |         return(nNeighbors)
122 |     }
123 | }
124 | 
#' Plot the distances from regions to their upstream/downstream neighbors
#' or nearest neighbors. Distances can be passed as either raw bp or
#' corrected for the number of regions (log10(obs/exp)), but this has
#' to be specified in the function parameters. 
#' 
#' @param dcvec A numeric vector or list of vectors containing distances 
#' to upstream/downstream neighboring regions or to nearest neighbors. 
#' Produced by \code{calcNeighborDist} or \code{calcNearestNeighbors}.
#' An unnamed list is labelled with sequential numbers.
#' @param correctedDist A logical indicating if the plot axis should
#' be adjusted to show distances corrected for the number of regions
#' in a regionset.
#' @param Nneighbors A logical indicating whether legend should be adjusted
#' if Nearest neighbors are being plotted. Default legend shows distances
#' to upstream/downstream neighbors.  
#'
#' @return A ggplot density object showing the distribution of
#' raw or corrected distances.  
#' @export
#' @examples
#' numVector = rnorm(400, mean=5, sd=0.1)
#' d = plotNeighborDist(numVector)
plotNeighborDist = function(dcvec, correctedDist=FALSE,
                            Nneighbors=FALSE) {
    .validateInputs(list(dcvec=c("numeric","list")))
    # if input is list, convert it to a data frame with 
    # value and region set name, if input is vector - make a single
    # columns data.frame
    if (is(dcvec, "list")) {
        nameList = names(dcvec)
        if (is.null(nameList)) {
            # Fallback to sequential numbers; without names the regionSet
            # column would be missing from the data frame. Character labels
            # keep the fill/colour aesthetics discrete.
            nameList = as.character(seq_along(dcvec))
        }
        distReshaped = data.frame(value = unlist(dcvec),
                                  regionSet = rep(nameList, lengths(dcvec)))
        g = ggplot2::ggplot(distReshaped, aes(x=value,
                                              fill=regionSet,
                                              colour=regionSet)) +
            geom_density(alpha=0.4)
    } else {
        distReshaped = data.frame(value = dcvec)
        g = ggplot2::ggplot(distReshaped, aes(x=value)) +
            geom_density() 
    }
    if (correctedDist==TRUE) {
        # Corrected values are already log10(obs/exp); mark the point where
        # observed equals expected
        g = g + 
            xlab(expression(log[10](over(Obs, Exp)))) +
            geom_vline(xintercept = 0, linetype="dashed") +
            ggtitle("Corrected neighboring regions distance distribution") 
    } else {
        # Raw bp distances are shown on a log10 axis
        g = g + 
            xlab(expression("bp distance")) +
            scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
                          labels = scales::trans_format("log10", 
                                                    scales::math_format(10^.x))) +
            ggtitle("Neighboring regions distance distribution") 
    }
    g = g + 
        theme_classic() +
        theme(aspect.ratio=1,
              plot.title = element_text(hjust=0.5),
              legend.position = "bottom") +
        theme_blank_facet_label()
  
    # Adjust legend if plotting nearest neighbors 
    if (Nneighbors==TRUE){
        g = g + 
            labs(fill="regionSet Nneighbors", 
                 colour="regionSet Nneighbors")
    }
    return(g)
}
194 | 
195 | 


--------------------------------------------------------------------------------
/R/package.R:
--------------------------------------------------------------------------------
 1 | # PACKAGE DOCUMENTATION
 2 | #' Produces summaries and plots of features distributed across genomes
 3 | #' 
 4 | #' If you have a set of genomic ranges, the GenomicDistributions R package can
 5 | #' help you with some simple visualizations. Currently, it can produce two kinds
 6 | #' of plots: First, the chromosome distribution plot, which visualizes how your
 7 | #' regions are distributed over chromosomes; and second, the feature
 8 | #' distribution plot, which visualizes how your regions are distributed relative
 9 | #' to a feature of interest, like Transcription Start Sites (TSSs).
10 | #'
11 | "_PACKAGE"
12 | #' @name GenomicDistributions
13 | #' @author Nathan C. Sheffield
14 | #'
15 | #' @references \url{http://github.com/databio/GenomicDistributions}
16 | #' @import ggplot2
17 | #' @importFrom GenomicRanges GRanges GRangesList elementMetadata strand
18 | #'             seqnames granges makeGRangesFromDataFrame
19 | #' @importFrom data.table ":=" setDT data.table setkey fread setnames 
20 | #'             setcolorder rbindlist setattr setorder copy is.data.table
21 | #'             tstrsplit as.data.table foverlaps
22 | #' @importFrom reshape2 melt
23 | #' @importFrom IRanges IRanges Views
24 | #' @importFrom Biostrings alphabetFrequency
25 | #' @importFrom methods is
26 | #' @importFrom stats chisq.test
27 | #' @importFrom utils installed.packages getAnywhere data globalVariables download.file
28 | 
29 | NULL
30 | 
31 | # You can either use 'import X' or 'importFrom X abcdefg'. importFrom  is
32 | # better practice, but for ggplot2 we were simply importing so many functions
33 | # that it makes  sense to just import the whole package
34 | # @importFrom ggplot2 ggplot aes facet_grid geom_jitter geom_line
35 | #             geom_bar theme_classic xlab ylab geom_hline ylim 
36 | #             scale_color_discrete scale_x_discrete scale_y_discrete 
37 | #             scale_fill_brewer scale_color_manual scale_x_continuous
38 | #             ggtitle geom_vline scale_fill_discrete xlim
39 | #             scale_color_brewer theme element_blank unit 
40 | #             element_text geom_density geom_point guides geom_col 
41 | #             theme_bw scale_fill_manual
42 | 
43 | 
44 | # R CMD check raises NOTEs for names that are only resolved via non-standard
45 | # evaluation, such as columns referenced inside data.table calls
46 | # (see http://stackoverflow.com/questions/9439256/). Declaring them as global
47 | # variables silences those NOTEs; the call is skipped on R older than 2.15.1.
48 | if(getRversion() >= "2.15.1") {
49 |     utils::globalVariables(c(
50 |         "cuts", "mid", "J", "chr", "N", "regionID", "x", "name", "BSFilter", 
51 |         "start", "end", "findOverlaps", "queryHits", "subjectHits", "buildJ",
52 |         "seqlengths", "IRanges", "seqlengths", "reduce", "seqlevels", "follow",
53 |         "trim", "error", "nlist", "aggregate", "median",  "bgDists", "Freq", "bgX",
54 |         "bgFreq", "value", "regionSet", "Group.1", "cellType", "spaceLabel", 
55 |         "signal", "group", "medianBar", "partition", "Freq", "Freq", "cumsize", 
56 |         "frif", "aggregate", "withinGroupID", "lowerCaseTissue", "boxplot.stats", 
57 |         "median", "barplot", "legend", "promoters", "seqlevels", "width", 
58 |         "precede", "elementMetadata", ".N", ".SD", "colorRampPalette", "count", 
59 |         "countOverlaps", "distance", "elementMetadata<-", "elementNROWS", 
60 |         "expected", "log10OE", "pintersect", "plot_labels", "query", 
61 |         "regionGroupID", "seqlevels<-", "size", "tableCount", "V1", "queryPeak", 
62 |         "xid", "yid", "na.omit", "peakName", "mixedVar",
63 |         "cellTypeMetadata", "tissueType", "boxStats",
64 |         "tissue", ".", "Percent", "Var1", "maxStart", "start",
65 |         "i.start", "minEnd", "i.end", "overlap", "gene_biotype", "dinucleotide",
66 |         "frequency", "L1", "V4", "colName", "i", "j", ".x", "lowerColorColumn",
67 |         "midpoint", "ubinID", "x.binID", "x.start", "x.end", "FreqPercent",
68 |         "Chi.square.pval", "score", "type"))
69 | }
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/R/qthist.R:
--------------------------------------------------------------------------------
  1 | #' Calculate the widths of regions
  2 | #' 
  3 | #' The length of a genomic region (the distance between the start and end)
  4 | #' is called the width. Given a query set of genomic regions, this function
  5 | #' returns the width of each region.
  6 | #' @param query A GRanges or GRangesList object with query sets
  7 | #' @return A vector of widths for a GRanges object; for a GRangesList, a
  8 | #'     list holding one such vector per GRanges object.
  9 | #' @export
 10 | #' @examples regWidths = calcWidth(vistaEnhancers)
 11 | calcWidth = function(query) {
 12 |     if (!is(query, "GRangesList")) {
 13 |         return(width(query))
 14 |     }
 15 |     # A GRangesList is handled by recursing into each of its GRanges
 16 |     lapply(query, calcWidth)
 17 | }
 18 | 
 19 | 
 20 | #' Plot quantile-trimmed histogram
 21 | #' 
 22 | #' Given the results from \code{calcWidth}, plots a histogram with 
 23 | #' outliers trimmed.
 24 | #' 
 25 | #' x-axis breaks for the frequency calculations are based on the "divisions" 
 26 | #' results from helper function \code{calcDivisions}.
 27 | #' 
 28 | #' @param x Data values to plot - vector or list of vectors
 29 | #' @param EndBarColor Color for the quantile bars on both ends of the graph
 30 | #'     (optional)
 31 | #' @param MiddleBarColor Color for the bars in the middle of the graph
 32 | #'     (optional)
 33 | #' @param quantThresh Quantile of data to be contained in each end bar (optional)
 34 | #' quantThresh values must be under .2, optimal size is under .1
 35 | #' @param bins The number of bins for the histogram to allocate data to.
 36 | #'     (optional)
 37 | #' @param indep logical value which returns a list of plots that have had their
 38 | #'     bins calculated independently; the normal version will plot them on the 
 39 | #'     same x and y axis. All other arguments are applied to every plot.
 40 | #' @param numbers a logical indicating whether the raw numbers should be 
 41 | #'     displayed, rather than percentages (optional).
 42 | #' @return A ggplot2 plot object (a list of them if indep is used on a list)
 43 | #' @export
 44 | #' @examples
 45 | #' regWidths = calcWidth(vistaEnhancers)
 46 | #' qtHist = plotQTHist(regWidths)
 47 | #' qtHist2 = plotQTHist(regWidths, quantThresh=0.1)
 48 | plotQTHist = function(x, EndBarColor = "gray57", MiddleBarColor = "gray27",
 49 |     quantThresh=NULL, bins=NULL, indep=FALSE, numbers=FALSE) {
 50 |     if (indep && (is(x, "list") || is(x, "List"))) {
 51 |         # Forward the user's settings so every independent plot honours them
 52 |         plots = lapply(x, plotQTHist, EndBarColor=EndBarColor,
 53 |                        MiddleBarColor=MiddleBarColor, quantThresh=quantThresh,
 54 |                        bins=bins, numbers=numbers)
 55 |         namesx = names(plots)
 56 |         for (i in seq_along(plots)){
 57 |             plots[[i]] = plots[[i]] + ggtitle(namesx[i])
 58 |         }
 59 |         # The list can be drawn with e.g. do.call("grid.arrange", plots)
 60 |         return(plots)
 61 |     }
 62 |     output = calcDivisions(x, quantThresh=quantThresh, bins=bins)
 63 |     # if all x are the same - recalculate divisions
 64 |     divisionCheck = output[["divisions"]]
 65 |     if (length(divisionCheck) > length(unique(divisionCheck))){
 66 |       if (length(unique(divisionCheck)) == 3){
 67 |         output[["divisions"]] = c(-Inf, divisionCheck[2], 
 68 |                                   divisionCheck[2]+1, Inf)
 69 |         output[["bins"]] = 1
 70 |       } else {
 71 |         output[["divisions"]] = unique(divisionCheck)
 72 |         output[["bins"]] = (length(unique(divisionCheck)) - 3)
 73 |       }
 74 |     }
 75 |     if(is(x, "List")){
 76 |         x = as.list(x)
 77 |     }
 78 |     if(is.list(x)){
 79 |         nplots = length(x)
 80 |     } else {
 81 |         nplots = 1
 82 |     }
 83 | 
 84 |     df = cutDists(x, divisions=output[["divisions"]])
 85 |     if ("name" %in% names(df)){
 86 |         if (!numbers)
 87 |             df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100), 
 88 |                          by = name]$"Freq.Per"
 89 | 
 90 |         g = ggplot(df, aes(x=cuts, y=Freq, fill=name)) + 
 91 |             facet_wrap(. ~name)
 92 |     } else {
 93 |         if (!numbers)
 94 |             df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100)]$"Freq.Per"
 95 |         g = ggplot(df, aes(x=cuts, y=Freq))
 96 |     }
 97 |     # Create a vector for the colors
 98 |     colors_vect = c(EndBarColor ,
 99 |         rep(MiddleBarColor, (length(output[["divisions"]])-3)), EndBarColor)
100 |     colors_vect = rep(colors_vect, nplots)
101 | 
102 |     nbars = output[["bins"]]+2
103 |     qbaridx = sort(c(seq(1, nbars*nplots, by=nbars),
104 |             seq(nbars, nbars*nplots, by=nbars)))
105 |   
106 |     g = g +
107 |         geom_bar(stat="identity", fill = colors_vect) + 
108 |         theme_classic() + 
109 |         theme(aspect.ratio=1) + 
110 |         theme_blank_facet_label() +
111 |         ylab("Frequency") +
112 |         xlab("") +
113 |         theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust=0.5)) +
114 |         theme(plot.title = element_text(hjust = 0.5)) + # Center title
115 |         ggtitle("Quantile Trimmed Histogram") +
116 |         theme(legend.position="bottom") +
117 |         geom_text(aes(label= paste((output[["quantile"]]*100),"%", sep='')),
118 |             data=df[qbaridx,], hjust=-1, angle=90, size=2.5)
119 | 
120 |     if (!numbers){
121 |       g = g + ylab("Percentage")
122 |     }
123 |     
124 |     return(g)
125 | }
126 | 
127 | 
128 | # Internal helper function for \code{plotQTHist}
129 | # 
130 | # Uses the number of bins and the quantile given by the user when they are
131 | # specified. Otherwise 1) the number of bins is derived from the size of
132 | # the dataset, and 2) the quantile is derived from the number of bins.
133 | #
134 | # @param x A vector of values, or a list of vectors (pooled together).
135 | # @return A list with elements "bins", "quantile" and "divisions".
136 | # @examples
137 | # calcDivisions(runif(500)*1000)
138 | calcDivisions = function(x, bins=NULL, quantThresh = NULL){
139 |   # Pool a list of vectors so all sets share one set of divisions
140 |   values = if (is.list(x)) unlist(x) else x
141 | 
142 |   # Number of bins: taken from the caller, else derived from the data size
143 |   if (is.null(bins)) {
144 |     # the size entering the formula is clamped to [500, 10000]
145 |     n = min(max(length(values), 500), 10000)
146 |     nBins = round(n^.15 + (n/200))
147 |   } else {
148 |     nBins = bins
149 |   }
150 | 
151 |   # Quantile cut off on each side: from the caller, else based on the bins
152 |   if (is.null(quantThresh)) {
153 |     # never less than 1% on each side
154 |     tailQuant = max(.01, round(25/nBins)/100)
155 |   } else {
156 |     if (quantThresh > .2) {
157 |       stop("quantThresh value must be less than .2, Optimal size is under .1")
158 |     }
159 |     tailQuant = quantThresh
160 |   }
161 | 
162 |   # nBins equal-width bins between the two quantiles, rounded to integers,
163 |   # plus an open-ended bin on either side
164 |   cutoffs = unname(stats::quantile(values, probs = c(tailQuant, 1-tailQuant)))
165 |   innerBreaks = round(seq(cutoffs[1], cutoffs[2], length.out = nBins+1))
166 |   list("bins" = nBins,
167 |        "quantile" = tailQuant,
168 |        "divisions" = c(-Inf, innerBreaks, Inf))
169 | }
170 | 


--------------------------------------------------------------------------------
/R/utility.R:
--------------------------------------------------------------------------------
  1 | #' Checks class of the list of variables. To be used in functions
  2 | #'
  3 | #' @param checkList list of object to check, e.g. 
  4 | #' list(varname=c("data.frame", "numeric")). Each name is looked up in the
  5 | #' calling function. Multiple strings in the vector are treated as OR.
  6 | #' @return Nothing useful; an error is raised if a variable matches none
  7 | #' of its accepted classes.
  8 | #' @examples
  9 | #' x = function(var1) {
 10 | #'     cl = list(var1=c("numeric","character"))
 11 | #'     .validateInputs(cl)
 12 | #'     return(var1^2)
 13 | #' }
 14 | .validateInputs = function(checkList) {
 15 |     nms = names(checkList)
 16 |     for(i in seq_along(checkList)){
 17 |         clss = checkList[[i]]
 18 |         x = get(nms[i], envir=parent.frame(1))
 19 |         matched = vapply(clss, function(cls) is(x, cls), logical(1))
 20 |         # class(x) may hold several classes; collapse them so the message
 21 |         # stays a single readable string
 22 |         if(!any(matched))
 23 |             stop(nms[i], " must be a ", paste(clss, collapse=" or "),
 24 |                  ".  Got: ", paste(class(x), collapse=", "))
 25 |     }
 26 | }
 27 | 
 28 | 
 29 | #' Returns the object named after a package, provided that package is
 30 | #' installed. If the package is not installed, a warning is issued and
 31 | #' NULL is returned instead.
 32 | #'
 33 | #' @param BSgenomeString A BSgenome compatible genome string.
 34 | #' @return A BSgenome object if installed, otherwise NULL.
 35 | .requireAndReturn = function(BSgenomeString) {
 36 |     if (!requireNamespace(BSgenomeString)) {
 37 |         warning(BSgenomeString, " is not installed")
 38 |         return(NULL)
 39 |     }
 40 |     utils::getAnywhere(BSgenomeString)$objs[[1]]
 41 | }
 42 | 
 43 | 
 44 | #' Efficiently split a data.table by a column in the table
 45 | #' 
 46 | #' @param DT Data.table to split
 47 | #' @param split_factor Column to split by: a column name or an integer index.
 48 | #' @return List of data.tables, one per value, ordered by first appearance
 49 | # @examples
 50 | # DT = data.table::data.table(letters, grp = rep(c("group1", "group2"), 13))
 51 | # splitDataTable(DT, "grp")
 52 | # splitDataTable(DT, 2)
 53 | splitDataTable = function(DT, split_factor) {
 54 |     # Resolve a column index to its name before the column is first read
 55 |     if (is.numeric(split_factor)) {
 56 |         split_factor = colnames(DT)[split_factor]
 57 |         message("Integer split_factor, changed to: ", split_factor)
 58 |     }
 59 |     groups = DT[[split_factor]]
 60 |     l = lapply(split(seq_len(nrow(DT)), groups), function(x) DT[x])
 61 |     # Select by name: numeric or factor values must not act as positions
 62 |     return(l[as.character(unique(groups))])
 63 | }
 64 | 
 65 | 
 66 | #' Internal helper that converts a data.table into a GRanges object
 67 | #'
 68 | #' Used by \code{dtToGr}. Strand values "1" and "-1" are translated to "+"
 69 | #' and "-"; anything else that is not "+" or "-" becomes "*". The input
 70 | #' table is left unmodified.
 71 | #' @param DT A data.table representing genomic regions.
 72 | #' @param chr A string representing the chromosome column.
 73 | #' @param start A string representing the name of the start column.
 74 | #' @param end A string representing the name of the end column.
 75 | #' @param strand A string representing the name of the strand column.
 76 | #' @param name A string representing the name of the name column.
 77 | #' @param metaCols A string representing the name of the metadata column(s)
 78 | #'     to include in the returned GRanges object.
 79 | #' @return A GRanges object.
 80 | dtToGrInternal = function(DT, chr, start, end=NA, strand=NA, 
 81 |                           name=NA, metaCols=NA) {
 82 |     if (is.na(end)) {
 83 |         # Fall back to an "end" column, else use the start column as end
 84 |         end = if ("end" %in% colnames(DT)) "end" else start
 85 |     }
 86 |     if (is.na(strand)) {
 87 |         strandVals = "*"
 88 |     } else {
 89 |         # Read the strand column into a local vector: the caller's table is
 90 |         # not modified by reference, and any column name works.
 91 |         strandVals = as.character(DT[[strand]])
 92 |         strandVals[strandVals %in% "1"] = "+"
 93 |         strandVals[strandVals %in% "-1"] = "-"
 94 |         # GRanges only accepts "+", "-" and "*"; map everything else to "*"
 95 |         strandVals[!(strandVals %in% c("+", "-"))] = "*"
 96 |     }
 97 |     gr = GRanges(seqnames=DT[[chr]],
 98 |                  ranges=IRanges(start=DT[[start]], end=DT[[end]]),
 99 |                  strand=strandVals)
100 |     if (!is.na(name)) {
101 |         names(gr) = DT[[name]]
102 |     } else {
103 |         names(gr) = seq_along(gr)
104 |     }
105 |     # metaCols may name several columns, so test the whole vector at once
106 |     if (!all(is.na(metaCols))) {
107 |         for (x in metaCols) {
108 |             elementMetadata(gr)[[x]] = DT[[x]]
109 |         }
110 |     }
111 |     gr
112 | }
113 | 
114 | 
115 | #' Converts a data.table (DT) object to a GenomicRanges 
116 | #' (GR) object. Tries to be intelligent, guessing chr 
117 | #' and start, but you have to supply end or other
118 | #' columns if you want them to be carried into the GR.
119 | #'
120 | #' @param DT A data.table representing genomic regions.
121 | #' @param chr A string representing the chromosome column.
122 | #' @param start A string representing the name of the start column.
123 | #' @param end A string representing the name of the end column.
124 | #' @param strand A string representing the name of the strand column.
125 | #' @param name A string representing the name of the name column.
126 | #' @param splitFactor A string representing the name of the column to use to
127 | #'     split the data.table into multiple data.tables, or a vector with one
128 | #'     grouping value per row.
129 | #' @param metaCols A string representing the name of the metadata column(s)
130 | #'     to include in the returned GRanges object.
131 | #' @return A GRanges object, or a list of GRanges objects (one per group)
132 | #'     if \code{splitFactor} is given.
133 | #' @export
134 | #' @examples 
135 | #' start1 = c(seq(from=1, to = 2001, by = 1000), 800)
136 | #' chrString1 = c(rep("chr1", 3), "chr2")
137 | #' dt = data.table::data.table(chr=chrString1,
138 | #'                             start=start1,
139 | #'                             end=start1 + 250)
140 | #' newGR = dtToGr(dt)                
141 | dtToGr = function(DT, chr="chr", start="start", end=NA, strand=NA, name=NA,
142 |                   splitFactor=NA, metaCols=NA) {
143 |     # Only a single NA means "do not split"; a longer splitFactor is a
144 |     # per-row grouping vector and must not be fed to if() directly.
145 |     if (length(splitFactor) == 1 && is.na(splitFactor)) {
146 |         return(dtToGrInternal(DT, chr, start, end, strand, name, metaCols))
147 |     }
148 | 
149 |     # A single column name is replaced by that column's values
150 |     if (length(splitFactor) == 1 && splitFactor %in% colnames(DT)) {
151 |         splitFactor = DT[[splitFactor]]
152 |     }
153 | 
154 |     lapply(split(seq_len(nrow(DT)), splitFactor), function(x) {
155 |         dtToGrInternal(DT[x,], chr, start, end, strand, name, metaCols)
156 |     })
157 | }
158 | 
159 | 
160 | #' Convert a GenomicRanges into a data.table.
161 | #'
162 | #' The result has columns chr, start and end, followed by the metadata
163 | #' columns of \code{GR} when it has any.
164 | #'
165 | #' @param GR A Granges object
166 | #' @return A data.table object.
167 | grToDt = function(GR) {
168 |     meta = as.data.frame(elementMetadata(GR))
169 |     chrom = as.vector(seqnames(GR))
170 |     if (ncol(meta) == 0) {
171 |         return(data.table(chr=chrom, start=start(GR), end=end(GR)))
172 |     }
173 |     data.table(chr=chrom, start=start(GR), end=end(GR), meta)
174 | }
175 | 
176 | 
177 | #' Converts a list of data.tables (From BSreadbeds) into GRanges.
178 | #'
179 | #' Every table is keyed by chr and start, then turned into a GRanges whose
180 | #' ranges start and end on the table's start coordinate, with strand "*".
181 | #' The hitCount and readCount columns are passed on to GRanges as well.
182 | #'
183 | #' @param dtList A list of data.tables
184 | #' @return A list holding one GRanges object per data.table, in the order
185 | #'     of \code{dtList}.
186 | BSdtToGRanges = function(dtList) {
187 |     lapply(seq_along(dtList), function(idx) {
188 |         # Key each table by chr and start before converting it
189 |         setkey(dtList[[idx]], chr, start)
190 |         dt = dtList[[idx]]
191 |         # Ranges start and end at the same coordinate. end=start+1 was used
192 |         # before Aug 2014, but that targets CG instead of just a C and
193 |         # caused edge-effect problems when assigning Cs to tiled windows.
194 |         GRanges(seqnames=dt$chr,
195 |                 ranges=IRanges(start=dt$start, end=dt$start),
196 |                 strand=rep("*", nrow(dt)),
197 |                 hitCount=dt$hitCount,
198 |                 readCount=dt$readCount)
199 |     })
200 | }
201 | 
202 | 
203 | #' Clear ggplot facet label.
204 | #'
205 | #' By default ggplot2 draws a box around each facet label. The theme
206 | #' returned here blanks that strip background, along with the major and
207 | #' minor panel grid lines, leaving a plain label for each facet.
208 | #'
209 | #' @return A ggplot theme
210 | theme_blank_facet_label = function() {
211 |     # One blank element is reused for all three theme settings
212 |     blank = element_blank()
213 |     theme(panel.grid.major = blank,
214 |           panel.grid.minor = blank,
215 |           strip.background = blank)
216 | }
217 | 
218 | 
219 | #' Creates labels based on a discretization definition.
220 | #' 
221 | #' When you build a histogram of binned values, the bins need labels that
222 | #' correspond to the ranges used for binning. This function takes the
223 | #' breakpoints that define your bins and produces nice-looking labels for
224 | #' your histogram plot.
225 | #' 
226 | #' \code{labelCuts} will take a cut group, (e.g., a quantile division of 
227 | #' some signal), and give you clean labels (similar to the cut method).
228 | #' @param breakPoints The exact values you want as boundaries for your bins
229 | #' @param round_digits Number of digits to cut round labels to. 
230 | #' @param signif_digits Number of significant digits to specify. 
231 | #' @param collapse Character to separate the labels
232 | #' @param infBins use >/< as labels on the edge bins
233 | #' @return A vector of histogram axis labels.
234 | # @examples 
235 | # labelCuts(seq(0,100,by=20))
236 | labelCuts = function(breakPoints, round_digits=1, 
237 |                      signif_digits=3, collapse="-", infBins=FALSE) {
238 |     nBreaks = length(breakPoints)
239 |     # One row per bin: lower edge in column 1, upper edge in column 2
240 |     binEdges = cbind(breakPoints[-nBreaks], breakPoints[-1])
241 |     binEdges = signif(round(binEdges, round_digits), signif_digits)
242 |     # Infinite edges become NA so that formatC can add the commas
243 |     binEdges[is.infinite(binEdges)] = NA
244 |     edgeText = formatC(binEdges, format="d", big.mark=",")
245 |     labels = apply(edgeText, 1, paste0, collapse=collapse)
246 |     if (infBins) {
247 |         # The outermost bins are relabelled with <= and > instead of a range
248 |         firstInner = formatC(breakPoints[2], format="d", big.mark=",")
249 |         lastInner = formatC(breakPoints[nBreaks-1], format="d", big.mark=",")
250 |         labels[1] = paste0("<=", firstInner)
251 |         labels[length(labels)] = paste0(">", lastInner)
252 |     }
253 |     return(labels)
254 | }
255 | 
256 | #' Nathan's magical named list function.
257 | #' This function is a drop-in replacement for the base list() function,
258 | #' which automatically names your list according to the names of the 
259 | #' variables used to construct it. Names you specify are kept as they are;
260 | #' only the parameters you leave unnamed get named.
261 | #'
262 | #' @param ... arguments passed to list()
263 | #' @return A named list object.
264 | #' @export
265 | #' @examples
266 | #' x=5
267 | #' y=10
268 | #' nlist(x,y) # returns list(x=5, y=10)
269 | #' list(x,y) # returns unnamed list(5, 10)
270 | nlist = function(...) {
271 |     # Unevaluated argument expressions, captured without expanding the dots
272 |     argExprs = match.call(expand.dots=FALSE)[[2]]
273 |     l = list(...)
274 |     if (is.null(names(l))) {
275 |         names(l) = argExprs
276 |     } else {
277 |         unnamed = names(l) == ""
278 |         names(l)[unnamed] = argExprs[unnamed]
279 |     }
280 |     return(l)
281 | }
282 | 
283 | 
284 | 


--------------------------------------------------------------------------------
/R/zalias.R:
--------------------------------------------------------------------------------
1 | featureAggregateDistribution = calcChromBins
2 | plotGenomeAggregate = plotChromBins
3 | TSSDist = calcFeatureDistRefTSS
4 | genomicPartitions = calcPartitionsRef
5 | aggregateOverGenomeBins = calcChromBinsRef
6 | featureDistanceDistribution = calcFeatureDist
7 | assignPartitions = calcPartitions
8 | # Aliases: each line above binds one more name to an existing function (presumably older names; verify).


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | | Master | Dev |
 2 | |--------|-----|
 3 | |[![Build Status](https://travis-ci.org/databio/GenomicDistributions.svg?branch=master)](https://travis-ci.org/databio/GenomicDistributions) | [![Build Status](https://travis-ci.org/databio/GenomicDistributions.svg?branch=dev)](https://travis-ci.org/databio/GenomicDistributions) |
 4 | 
 5 | 
 6 | 
 7 | # GenomicDistributions
 8 | 
 9 | An R package that provides functions for 1) calculating and 2) visualizing a variety of statistics for a collection of genomic ranges. If you have a set of genomic ranges, such as a BED file, the GenomicDistributions R package can help you to explore, annotate, visualize, and compare it.
10 | 
11 | ## Installing
12 | 
13 | ### Main package
14 | 
15 | With Bioconductor:
16 | 
17 | ```r
18 | if (!requireNamespace("BiocManager", quietly = TRUE))
19 | install.packages("BiocManager")
20 | BiocManager::install("GenomicDistributions")
21 | ```
22 | 
23 | Or from GitHub:
24 | 
25 | ```r
26 | devtools::install_github("databio/GenomicDistributions")
27 | ```
28 | 
29 | ### Data package
30 | 
31 | [GenomicDistributionsData](https://github.com/databio/GenomicDistributionsData): includes full data files, too large to include in GenomicDistributions
32 | 
33 | 
34 | ## Quick start
35 | 
36 | See the vignettes for more information: http://code.databio.org/GenomicDistributions
37 | 
38 | ## Building long vignettes
39 | 
40 | The [long_vignettes](/long_vignettes) folder contains vignettes that require large external data and take a long time to run. Therefore, they should be pre-built. You can render them manually by running [long_vignettes/render-long-vignettes.R](long_vignettes/render-long-vignettes.R). This will use `knitr` to run the vignette and put the result into the `vignettes` folder, along with output figures.
41 | 
42 | **Cite GenomicDistributions:**\
43 | Kupkova K., Mosquera J.V., Smith J.P., Stolarczyk M, Danehy T., Lawson J.T., Rogers S., LeRoy N., Sheffield N.C. GenomicDistributions: fast analysis of genomic intervals with Bioconductor. *BMC Genomics* 23, 299 (2022). https://doi.org/10.1186/s12864-022-08467-y
44 | 


--------------------------------------------------------------------------------
/_pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | template:
 3 |   params:
 4 |     bootswatch: yeti
 5 | 
 6 | navbar:
 7 |   left:
 8 |   - text: Vignettes
 9 |     icon: fa-play-circle
10 |     href: articles/index.html
11 |   - text: Documentation
12 |     icon: fa-pencil
13 |     href: reference/index.html
14 |   - text: GitHub
15 |     icon: fa-github fa-lg
16 |     href: https://github.com/databio/GenomicDistributions
17 |   right:
18 |   - text: Databio.org
19 |     href: http://databio.org
20 |   - text: Software & Data
21 |     href: http://databio.org/software/
22 | 
23 | articles:
24 |   - title: Vignettes
25 |     contents:
26 |       - intro
27 |       - full-power
28 | 
29 | 


--------------------------------------------------------------------------------
/data-raw/TSS_hg19.R:
--------------------------------------------------------------------------------
1 | library(usethis)
2 | TSS_hg19 = GenomicDistributionsData::buildTSS("hg19")  # build the object for genome "hg19"
3 | usethis::use_data(TSS_hg19, overwrite=TRUE)  # hand it to use_data; an existing copy may be overwritten
4 | 


--------------------------------------------------------------------------------
/data-raw/bedfiles.R:
--------------------------------------------------------------------------------
 1 | library(usethis)
 2 | fileNameList = c("vistaEnhancers.bed.gz", "setB_100.bed.gz")
 3 | for (fileName in fileNameList) {
 4 |     storedObjectName = strsplit(fileName, "\\.")[[1]][1]  # file name up to the first "."
 5 |     x = rtracklayer::import(system.file("extdata", fileName, package = "GenomicDistributions"))
 6 |     assign(storedObjectName, x)
 7 |     do.call("use_data", list(as.name(storedObjectName), overwrite = TRUE))
 8 |     rm(list = c("x", storedObjectName, "storedObjectName"))  # drop this iteration's objects
 9 | }
10 | 


--------------------------------------------------------------------------------
/data-raw/chromSizes_hg19.R:
--------------------------------------------------------------------------------
1 | library(usethis)
2 | chromSizes_hg19 = GenomicDistributionsData::buildChromSizes("hg19")  # build the object for genome "hg19"
3 | usethis::use_data(chromSizes_hg19, overwrite=TRUE)  # hand it to use_data; an existing copy may be overwritten
4 | 


--------------------------------------------------------------------------------
/data-raw/geneModels_hg19.R:
--------------------------------------------------------------------------------
1 | library(usethis)
2 | geneModels_hg19 = GenomicDistributionsData::buildGeneModels("hg19")
3 | usethis::use_data(geneModels_hg19, overwrite=TRUE)  # must name the object created on the line above
4 | 


--------------------------------------------------------------------------------
/data/TSS_hg19.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/TSS_hg19.rda


--------------------------------------------------------------------------------
/data/cellTypeMetadata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/cellTypeMetadata.rda


--------------------------------------------------------------------------------
/data/chromSizes_hg19.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/chromSizes_hg19.rda


--------------------------------------------------------------------------------
/data/datalist:
--------------------------------------------------------------------------------
1 | cellTypeMetadata
2 | chromSizes_hg19
3 | geneModels_hg19
4 | TSS_hg19
5 | exampleOpenSignalMatrix_hg19
6 | vistaEnhancers
7 | setB_100


--------------------------------------------------------------------------------
/data/exampleOpenSignalMatrix_hg19.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/exampleOpenSignalMatrix_hg19.rda


--------------------------------------------------------------------------------
/data/geneModels_hg19.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/geneModels_hg19.rda


--------------------------------------------------------------------------------
/data/setB_100.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/setB_100.rda


--------------------------------------------------------------------------------
/data/vistaEnhancers.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/data/vistaEnhancers.rda


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite the GenomicDistributions package please use:")
 2 | # bibentry() and c() replace the old-style citEntry() and personList() calls
 3 | bibentry(bibtype = "Article",
 4 |          title = "GenomicDistributions: fast analysis of genomic intervals with Bioconductor",
 5 |          author = c(as.person("Kristyna Kupkova" ),
 6 |            as.person("Jose Verdezoto Mosquera"),
 7 |            as.person("Jason P. Smith" ),
 8 |            as.person("Michal Stolarczyk"), 
 9 |            as.person("Tessa L. Danehy"), 
10 |            as.person("John T. Lawson"),
11 |            as.person("Bingjie Xue"),
12 |            as.person("John T. Stubbs"), 
13 |            as.person("Nathan LeRoy"), 
14 |            as.person("Nathan C. Sheffield")),
15 |          year = 2022,
16 |          journal = "BMC Genomics",
17 |          doi = "10.1186/S12864-022-08467-Y",
18 |          url = "https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-022-08467-y",
19 |          textVersion =
20 |              "Kupkova K, Verdezoto Mosquera J, Smith JP, et al. (2022) GenomicDistributions: fast analysis of genomic intervals with Bioconductor. BMC Genomics. doi:10.1186/S12864-022-08467-Y")
21 | 


--------------------------------------------------------------------------------
/inst/extdata/C_elegans_cropped_example.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/C_elegans_cropped_example.fa.gz


--------------------------------------------------------------------------------
/inst/extdata/C_elegans_cropped_example.gtf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/C_elegans_cropped_example.gtf.gz


--------------------------------------------------------------------------------
/inst/extdata/example_cell_matrix.txt:
--------------------------------------------------------------------------------
  1 | V1	astrocyte_of_the_spinal_cord	cardiac_fibroblast	CD8-positive_xx_alpha-beta_T_cell	endothelial_cell_of_umbilical_vein	fibroblast_of_dermis	fibroblast_of_pulmonary_artery	fibroblast_of_skin_of_right_biceps	fibroblast_of_the_conjunctiva	glomerular_endothelial_cell	lung_microvascular_endothelial_cell	osteoblast	skeletal_muscle_myoblast	T-helper_17_cell
  2 | chr1_8130458_8130903	0.127	0.0937	0.055	0.0309	0.0315	0.052	0.0735	0.0662	0.068	0.032	0.019	0.1183	0.1115
  3 | chr1_8131775_8131925	0.0508	0.0321	0.0683	0.0563	0.0482	0.0452	0.0072	0.0286	0.026	0.0217	0.0378	0.0239	0.0903
  4 | chr1_10732275_10732425	0.0236	0.0183	0.056	0.2383	0.0165	0.0194	0.0128	0.0074	0.0263	0.2841	0.4417	0.012	0.0159
  5 | chr1_10732475_10732645	0.0045	0.009	0.0406	0.2067	0.0177	0.0295	0.0192	0.0073	0.0479	0.1875	0.4553	0.0096	0.0103
  6 | chr1_10852095_10852245	0.0338	0.0416	0.0397	0.0532	0.0102	0.0587	0.0197	0.0314	0.0718	0.0542	1	0.0308	0.0485
  7 | chr1_10925155_10925365	0.0424	0.0181	0.041	0.046	0.0405	0.0263	0.0338	0.0392	0.0129	0.0063	0.09	0.0172	0.0091
  8 | chr1_10965595_10965785	0.0606	0.0254	0.0442	0.0111	0.0387	0.0235	0.0116	0.0025	0.0042	0.0277	0.139	0.0299	0.0703
  9 | chr1_33722335_33723625	0.5422	0.249	0.6172	0.4543	0.1972	0.3181	0.2779	0.3422	0.2329	0.2895	0.848	0.1536	0.4351
 10 | chr1_33723635_33723908	0.0713	0.1258	0.1728	0.1867	0.1114	0.1013	0.0818	0.1358	0.0414	0.1763	0.8794	0.1207	0.0203
 11 | chr1_33829095_33829245	0.0482	0.0555	0.0306	0.0228	0.0581	0.0579	0.0183	0.0604	0.0627	0.0468	0.0331	0.0536	0.0244
 12 | chr1_33829603_33830118	0.8796	0.1994	0.0491	0.1314	0.1776	0.4936	0.4731	0.7528	0.0333	0.7031	0.0655	0.7978	0.0261
 13 | chr1_38495115_38496257	0.3529	0.2145	0.5038	0.5988	0.103	0.254	0.3538	0.249	0.3269	0.2645	0.5682	0.1296	0.7671
 14 | chr1_38560483_38560811	0.0367	0.0259	0.0712	0.036	0.0387	0.0279	0.0388	0.0967	0.0279	0.0439	0.0808	0.0283	0.0199
 15 | chr1_38561602_38561752	0.0134	0.0591	0.0825	0.0413	0.0352	0.05	0.0377	0.0362	0.0332	0.0492	0.1444	0.0153	0.0307
 16 | chr1_38627361_38627549	0.035	0.01	0.0082	0.1264	0.0636	0.0617	0.0151	0.0377	0.074	0.0414	0.0342	0.06	0.0252
 17 | chr1_38627916_38627936	0.0575	0.0331	0.0245	0.0254	0.0348	0.0083	0.0303	0.0667	0.0042	0.0115	0.0218	0.0258	0.0489
 18 | chr1_38657195_38657424	0.033	0.0414	0.0665	0.0597	0.0496	0.0265	0.007	0.0338	0.0167	0.0301	0.0739	0.0734	0.0036
 19 | chr1_38735686_38735990	0.0323	0.126	0.0371	0.038	0.0272	0.0888	0.0311	0.0431	0.0381	0.0394	0.0628	0.0574	0.0779
 20 | chr1_38736395_38736545	0.0357	0.0294	0.0624	0.0108	0.0209	0.0433	0.0617	0.0384	0.0042	0.0337	0.0579	0.0123	0.0857
 21 | chr1_38791885_38792106	0.0225	0.0261	0.0364	0.0203	0.0521	0.0488	0.0722	0.0371	0.0337	0.0529	0.0818	0.0714	0.0168
 22 | chr1_38793215_38793664	0.0877	0.0631	0.0412	0.0931	0.0602	0.0482	0.0738	0.0417	0.054	0.0574	0.0533	0.0881	0.0321
 23 | chr1_38802058_38802430	0.0172	0.0229	0.0269	0.0545	0.0266	0.0303	0.0431	0.0378	0.0156	0.0375	0.0261	0.071	0.0419
 24 | chr1_39292040_39292245	0.0748	0.0337	0.0371	0.1152	0.0909	0.0477	0.0804	0.0263	0.0423	0.0402	0.0308	0.1064	0.081
 25 | chr1_41711275_41711685	0.1858	0.1828	0.03	0.1141	0.0205	0.2247	0.1125	0.0181	0.0727	0.1484	0.1171	0.2079	0.0736
 26 | chr1_44500999_44501245	0.0477	0.053	0.1087	0.0427	0.097	0.0729	0.0673	0.0363	0.035	0.1167	0.1681	0.0785	0.0649
 27 | chr1_44990655_44990903	0.0396	0.103	0.0239	0.0611	0.0627	0.1072	0.0356	0.0196	0.0477	0.0132	0.0554	0.1881	0.0526
 28 | chr1_51006708_51006761	0.0748	0.0145	0.0397	0.0361	0.0439	0.0197	0.032	0.0626	0.0596	0.0042	0.0971	0.0278	0.0582
 29 | chr1_51034675_51034925	0.0574	0.0427	0.063	0.0323	0.014	0.0377	0.0191	0.0303	0.0522	0.0241	0.0096	0.0471	0.0449
 30 | chr1_51035725_51036145	1	0.0678	0.0099	0.0299	0.0605	0.0486	0.1065	0.038	0.0335	0.0241	0.0806	0.1441	0.0218
 31 | chr1_54924892_54925086	0.0508	0.0874	0.239	0.0851	0.2416	0.0926	0.167	0.0765	0.1011	0.1152	0.3239	0.0705	0.1913
 32 | chr1_54926095_54926385	0.0393	0.071	1	0.0448	0.0772	0.0349	0.0669	0.0312	0.0253	0.0522	0.0448	0.0382	1
 33 | chr1_54927515_54927665	0.0286	0.0261	0.1449	0.0495	0.0281	0.0539	0.0542	0.0764	0.0203	0.0688	0.4354	0.0308	0.0392
 34 | chr1_54928470_54928670	0.0344	0.0436	0.1488	0.1447	0.0355	0.0632	0.0721	0.0308	0.0671	0.0628	0.0437	0.0303	0.0359
 35 | chr1_54928798_54929565	0.0429	0.0659	0.1137	0.1724	0.0458	0.0925	0.0976	0.0583	0.1366	0.1063	0.1272	0.0341	0.0416
 36 | chr1_59522090_59522425	0.0709	0.1789	0.0638	0.0427	0.4935	0.2201	0.0494	0.4556	0.0306	0.1736	0.0183	0.0638	0.0172
 37 | chr1_59522915_59523325	1	1	0.0755	0.1297	1	1	1	1	0.1723	0.3904	0.3319	0.673	0.0498
 38 | chr1_59523476_59523490	0.0634	0.1475	0.0812	0.0568	0.1271	0.125	0.1332	0.4081	0.073	0.0984	0.0218	0.0666	0.0098
 39 | chr1_59523555_59523787	0.0811	0.3627	0.0646	0.0372	0.1693	0.1838	0.2624	0.7117	0.0376	0.0329	0.0457	0.0793	0.013
 40 | chr1_60105442_60105459	0.0883	0.0815	0.0271	0.0255	0.0185	0.0122	0.0288	0.0386	0.0211	0.0224	0.0368	0.0143	0.131
 41 | chr1_61086832_61087125	0.1248	0.0575	0.013	0.2817	0.0408	0.0812	0.0372	0.0544	0.0679	0.1186	0.0441	0.0371	0.0143
 42 | chr1_61087275_61087621	0.4201	0.2548	0.0107	1	0.092	0.6451	0.2189	0.1832	1	1	0.0361	0.109	0.0495
 43 | chr1_62045875_62046025	0.0753	0.0682	0.0559	0.0395	0.2332	0.0441	0.0286	0.0511	0.1098	0.0302	0.0527	0.0564	0.0633
 44 | chr1_62046195_62046371	0.044	0.0717	0.0448	0.042	0.0798	0.0227	0.0213	0.0354	0.0532	0.0246	0.0212	0.0177	0.0475
 45 | chr1_62053295_62053445	0.0311	0.0362	0.0238	0.0459	0.0369	0.0407	0.0372	0.0437	0.0167	0.0454	0.1605	0.0306	0.041
 46 | chr1_62055155_62055445	0.2025	0.092	0.0362	0.0478	0.0827	0.051	0.1191	0.2308	0.0113	0.0645	0.0345	0.0487	0.0174
 47 | chr1_63370375_63370525	0.0131	0.0301	0.01	0.0352	0.0679	0.0494	0.0426	0.0675	0.017	0.0232	0.0198	0.0073	0.0571
 48 | chr1_63443879_63444045	0.0731	0.07	0.0354	0.0203	0.0342	0.0235	0.0206	0.0519	0.0383	0.0748	0.0096	0.0272	0.0349
 49 | chr1_63464548_63464705	0.0425	0.0668	0.0077	0.0451	0.0393	0.0296	0.0171	0.0082	0.0651	0.0262	0.0631	0.0111	0.0115
 50 | chr1_82663855_82664005	0.0189	0.0207	0.0045	0.025	0.0197	0.0508	0.0207	0.0173	0.01	0.005	0.0096	0.0249	0.0149
 51 | chr1_83252855_83253573	0.0437	0.0476	0.0202	0.0774	0.0346	0.0534	0.0509	0.0199	0.0339	0.0275	0.0318	0.0253	0.0543
 52 | chr1_83345462_83345963	0.0501	0.0135	0.0276	0.1716	0.1681	0.0398	0.0556	0.0769	0.2686	0.0281	0.0234	0.0092	0.0648
 53 | chr1_83346168_83346650	0.0333	0.0523	0.0141	0.0436	0.024	0.0288	0.0162	0.029	0.0642	0.033	0.0096	0.0156	0.0639
 54 | chr16_80951515_80951638	0.0208	0.0311	0.0278	0.0441	0.0281	0.0622	0.0533	0.0673	0.0688	0.0545	0.0096	0.0114	0.0118
 55 | chr16_80381587_80381737	0.0584	0.0597	0.0088	0.0266	0.0588	0.0405	0.0241	0.0484	0.0042	0.0639	0.0416	0.0498	0.01
 56 | chr16_79933304_79933525	0.0238	0.0319	0.022	0.0155	0.0232	0.0303	0.0448	0.0339	0.0042	0.0287	0.053	0.0162	0.0106
 57 | chr16_79933655_79933805	0.0391	0.0506	0.0267	0.0401	0.0265	0.0629	0.0339	0.1073	0.1709	0.048	0.031	0.0518	0.0141
 58 | chr16_79644375_79644525	0.0088	0.0081	0.0746	0.0155	0.0167	0.0371	0.0527	0.0172	0.0998	0.0338	0.0971	0.0544	0.0403
 59 | chr16_79510878_79510900	0.04	0.02	0.0625	0.0628	0.0177	0.0481	0.0509	0.0187	0.0042	0.0437	0.0469	0.0549	0.1469
 60 | chr16_79511057_79511207	0.0045	0.024	0.057	0.0387	0.0447	0.0057	0.0213	0.0518	0.0109	0.071	0.0433	0.0388	0.084
 61 | chr16_79436935_79437295	0.0172	0.0399	0.0645	0.0357	0.0079	0.0392	0.0424	0.0403	0.0406	0.0214	0.0713	0.0336	0.0087
 62 | chr16_79437296_79437446	0.0055	0.0373	0.0137	0.034	0.0061	0.0397	0.0293	0.0314	0.0201	0.0472	0.0544	0.0176	0.0223
 63 | chr16_79437456_79437606	0.0102	0.0202	0.0301	0.0162	0.0544	0.0537	0.0497	0.043	0.0371	0.0491	0.0428	0.025	0.0184
 64 | chr16_79397778_79397928	0.0298	0.0176	0.0683	0.0114	0.0352	0.0321	0.0222	0.0345	0.0246	0.0396	0.0226	0.0508	0.012
 65 | chr16_79210975_79211205	0.003	0.0132	0.2555	0.0185	0.0191	0.0429	0.0437	0.0168	0.0116	0.0427	0.062	0.0226	0.045
 66 | chr16_79211216_79211366	0.0236	0.0238	0.0602	0.0356	0.0149	0.0195	0.0171	0.0237	0.0086	0.0286	0.0279	0.0125	0.0364
 67 | chr16_74363381_74363581	0.1219	0.2504	1	0.2249	0.2187	0.4715	0.2744	0.6045	0.0787	0.7063	0.1307	0.1501	0.6573
 68 | chr16_74352560_74352860	0.045	0.0316	0.0537	0.0637	0.0384	0.0485	0.0596	0.0398	0.0871	0.0755	0.0329	0.0334	0.0257
 69 | chr16_74353114_74353285	0.0526	0.0772	0.0775	0.0346	0.015	0.075	0.0686	0.0568	0.0591	0.0786	0.0526	0.0403	0.2131
 70 | chr16_73981035_73981205	0.0121	0.0131	0.0435	0.0704	0.0409	0.0244	0.0371	0.0173	0.03	0.0191	0.0484	0.0042	0.0463
 71 | chr16_73840165_73840315	0.0114	0.025	0.0234	0.0357	0.0105	0.0102	0.0098	0.0269	0.0133	0.0422	0.0199	0.0399	0.013
 72 | chr16_73738690_73738840	0.0249	0.0493	0.0291	0.039	0.0213	0.0344	0.034	0.0541	0.0332	0.0335	0.031	0.0398	0.0207
 73 | chr16_73739597_73739747	0.1042	0.0178	0.0171	0.011	0.0562	0.011	0.0096	0.0374	0.0773	0.0313	0.0096	0.0061	0.0369
 74 | chr16_73254855_73255062	0.025	0.0995	0.0277	0.0489	0.0161	0.075	0.159	0.1063	0.0202	0.0325	0.1465	0.0792	0.0118
 75 | chr16_70680855_70681065	0.2686	0.3397	0.0468	0.0559	0.0797	0.2599	0.0837	0.1962	0.0211	0.1002	0.2779	0.1895	0.0516
 76 | chr16_70681455_70681625	0.1126	0.1423	0.0878	0.0454	0.1012	0.1285	0.0665	0.0733	0.0732	0.1139	0.0537	0.1272	0.0304
 77 | chr16_65651655_65651905	0.0148	0.0437	0.0351	0.0414	0.0293	0.0465	0.1174	0.0302	0.0265	0.0187	0.0096	0.2493	0.0355
 78 | chr16_60575250_60575400	0.0662	0.0353	0.0025	0.0526	0.0173	0.0462	0.0528	0.0211	0.0091	0.0394	0.038	0.0233	0.0436
 79 | chr16_60575497_60575647	0.0082	0.029	0.0408	0.0079	0.0512	0.0339	0.0256	0.0196	0.0271	0.0156	0.0096	0.0041	0.0228
 80 | chr16_56418106_56418256	0.0162	0.0469	0.0535	0.0092	0.0079	0.0527	0.0271	0.063	0.0217	0.0416	0.076	0.0281	0.0607
 81 | chr16_56328929_56329415	0.301	0.1369	0.0468	0.0541	0.0312	0.089	0.1607	0.0236	0.0376	0.0573	0.0751	0.0695	0.036
 82 | chr16_56329475_56329625	0.0338	0.0661	0.0535	0.0426	0.0356	0.0369	0.0839	0.0446	0.0253	0.0376	0.164	0.0533	0.0339
 83 | chr16_56329755_56330365	0.0719	0.0712	0.0466	0.0697	0.0393	0.0643	0.1246	0.0434	0.0366	0.062	0.0678	0.0611	0.0383
 84 | chr16_55784515_55784725	0.0148	0.047	0.0369	0.0489	0.0169	0.0172	0.0167	0.0542	0.0077	0.035	0.0423	0.0236	0.0745
 85 | chr16_55619388_55619665	0.0292	0.0266	0.0244	0.1022	0.0329	0.0452	0.0906	0.0281	0.0435	0.0251	0.0188	0.0403	0.0272
 86 | chr16_55576385_55576997	0.5997	0.3098	0.0357	0.3485	0.2955	0.3126	0.5383	0.3117	0.4584	0.2641	0.0304	0.2038	0.0502
 87 | chr16_55577189_55577339	0.2471	0.057	0.02	0.0642	0.0365	0.063	0.0592	0.0069	0.1024	0.0137	0.0096	0.0495	0.0634
 88 | chr16_55577463_55577763	0.4379	0.3725	0.0212	0.3498	0.2549	0.5258	0.2463	0.5622	0.4622	0.2492	0.0242	0.1647	0.04
 89 | chr16_55540715_55540865	0.0749	0.0479	0.0155	0.0337	0.0387	0.044	0.0454	0.0397	0.0351	0.0251	0.0297	0.0123	0.0267
 90 | chr16_55143715_55143865	0.026	0.0153	0.0157	0.0193	0.1472	0.0071	0.0509	0.0421	0.0163	0.0083	0.0178	0.0523	0.0326
 91 | chr16_54987433_54987665	0.0157	0.0162	0.0468	0.0275	0.0922	0.0331	0.4583	0.0662	0.0418	0.0367	0.0235	0.0184	0.0148
 92 | chr16_54948279_54948429	0.0662	0.0338	0.0139	0.0067	0.0301	0.0629	0.0966	0.0619	0.0302	0.0256	0.0096	0.0186	0.0092
 93 | chr16_54948995_54949145	0.0125	0.0245	0.0052	0.0206	0.0564	0.0159	0.0827	0.0485	0.0284	0.022	0.0237	0.042	0.0254
 94 | chr16_54726535_54726685	0.0148	0.0242	0.0277	0.041	0.0414	0.0253	0.0514	0.0163	0.0095	0.0344	0.0497	0.0317	0.0157
 95 | chr16_54578712_54578959	0.0414	0.0523	0.0379	0.0161	0.058	0.0613	0.0325	0.0556	0.0222	0.0442	0.0223	0.0441	0.0384
 96 | chr16_54564175_54564374	0.0155	0.076	0.0107	0.0379	0.0993	0.0568	0.0957	0.053	0.0279	0.0186	0.155	0.0923	0.0325
 97 | chr16_53557235_53557485	0.044	0.0294	0.1621	0.0438	0.0244	0.0575	0.0238	0.0353	0.0333	0.0556	0.055	0.0498	0.0552
 98 | chr16_53551513_53552185	0.013	0.0295	0.2388	0.0815	0.0237	0.0472	0.0392	0.048	0.2182	0.1048	0.0318	0.0341	0.1964
 99 | chr16_53503965_53504158	0.05	0.0398	0.145	0.028	0.0251	0.0303	0.0419	0.0427	0.0253	0.0899	0.0096	0.031	0.0427
100 | chr16_53504415_53504565	0.0268	0.0409	0.0437	0.032	0.0236	0.0289	0.0107	0.0455	0.0042	0.0499	0.0198	0.047	1e-04
101 | chr16_53468535_53469618	0.6563	0.5969	1	1	0.4776	0.6457	0.3546	0.5479	1	1	0.5444	0.6336	1
102 | 


--------------------------------------------------------------------------------
/inst/extdata/setB_100.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/setB_100.bed.gz


--------------------------------------------------------------------------------
/inst/extdata/vistaEnhancers.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/inst/extdata/vistaEnhancers.bed.gz


--------------------------------------------------------------------------------
/long_vignettes/render-long-vignettes.R:
--------------------------------------------------------------------------------
1 | # Knit long_vignettes/full-power.Rmd into vignettes/full-power.Rmd.
2 | # All paths are relative; presumably run from the package root (confirm).
3 | knitr::opts_knit$set(
4 |     base.dir = 'vignettes/',
5 |     progress = TRUE,
6 |     verbose = TRUE
7 | )
8 | # Prefix used for the figure files produced by the chunks.
9 | knitr::opts_chunk$set(fig.path = "figures-full-power/")
10 | knitr::knit(
11 |     input = "long_vignettes/full-power.Rmd",
12 |     output = "vignettes/full-power.Rmd"
13 | )
14 | # Disabled: the same recipe for the GenomicDistributionsData vignette.
15 | # knitr::opts_knit$set(base.dir = 'vignettes/', progress = TRUE, verbose = TRUE)
16 | # knitr::opts_chunk$set(fig.path="figures-GDData/")
17 | # knitr::knit("long_vignettes/GenomicDistributionsData.Rmd", "vignettes/GenomicDistributionsData.Rmd")
18 | 


--------------------------------------------------------------------------------
/man/BSdtToGRanges.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{BSdtToGRanges}
 4 | \alias{BSdtToGRanges}
 5 | \title{Converts a list of data.tables (From BSreadbeds) into GRanges.}
 6 | \usage{
 7 | BSdtToGRanges(dtList)
 8 | }
 9 | \arguments{
10 | \item{dtList}{A list of data.tables}
11 | }
12 | \value{
13 | A GRangesList object.
14 | }
15 | \description{
16 | Converts a list of data.tables (From BSreadbeds) into GRanges.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/GenomicDistributions-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/package.R
 3 | \docType{package}
 4 | \name{GenomicDistributions-package}
 5 | \alias{GenomicDistributions}
 6 | \alias{GenomicDistributions-package}
 7 | \title{Produces summaries and plots of features distributed across genomes}
 8 | \description{
 9 | If you have a set of genomic ranges, the GenomicDistributions R package can
10 | help you with some simple visualizations. Currently, it can produce two kinds
11 | of plots: First, the chromosome distribution plot, which visualizes how your
12 | regions are distributed over chromosomes; and second, the feature
13 | distribution plot, which visualizes how your regions are distributed relative
14 | to a feature of interest, like Transcription Start Sites (TSSs).
15 | }
16 | \seealso{
17 | Useful links:
18 | \itemize{
19 |   \item \url{http://code.databio.org/GenomicDistributions}
20 |   \item Report bugs at \url{http://github.com/databio/GenomicDistributions}
21 | }
22 | 
23 | }
24 | \author{
25 | \strong{Maintainer}: Kristyna Kupkova \email{kristynakupkova@gmail.com}
26 | 
27 | Authors:
28 | \itemize{
29 |   \item Jose Verdezoto
30 |   \item Tessa Danehy
 31 |   \item John Lawson
33 |   \item Michal Stolarczyk
34 |   \item Jason Smith
35 |   \item Bingjie Xue
36 |   \item Sophia Rogers
37 |   \item John Stubbs
38 |   \item Nathan C. Sheffield \email{nathan@code.databio.org}
39 | }
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/man/TSS_hg19.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{TSS_hg19}
 5 | \alias{TSS_hg19}
 6 | \title{hg19 TSS locations}
 7 | \format{
 8 | A named vectors of lengths with one item per chromosome
 9 | }
10 | \source{
11 | EnsDb.Hsapiens.v75 package
12 | }
13 | \usage{
14 | data(TSS_hg19)
15 | }
16 | \description{
 17 | A dataset containing transcription start site (TSS) locations for the Homo sapiens hg19 genome assembly
18 | }
19 | \keyword{datasets}
20 | 


--------------------------------------------------------------------------------
/man/binBSGenome.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{binBSGenome}
 4 | \alias{binBSGenome}
 5 | \title{Bins a BSgenome object.}
 6 | \usage{
 7 | binBSGenome(genome, binCount)
 8 | }
 9 | \arguments{
10 | \item{genome}{A UCSC-style string denoting reference assembly (e.g. 'hg38')}
11 | 
12 | \item{binCount}{number of bins per chromosome}
13 | }
14 | \value{
15 | A data.table object showing the region and bin IDs 
16 |         of the reference genome.
17 | }
18 | \description{
19 | Given a BSgenome object (to be loaded via \code{loadBSgenome}), and a number
20 | of bins, this will bin that genome. It is a simple wrapper of the
21 | \code{binChroms} function
22 | }
23 | \examples{
24 | \dontrun{
25 | binCount = 1000
26 | refGenomeBins = binBSGenome("hg19", binCount)
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/man/binChroms.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{binChroms}
 4 | \alias{binChroms}
 5 | \title{Naively splits a chromosome into bins}
 6 | \usage{
 7 | binChroms(binCount, chromSizes)
 8 | }
 9 | \arguments{
10 | \item{binCount}{number of bins (total; *not* per chromosome)}
11 | 
12 | \item{chromSizes}{a named list of size (length) for each chromosome.}
13 | }
14 | \value{
15 | A data.table object assigning a bin ID to each chromosome region.
16 | }
17 | \description{
18 | Given a list of chromosomes with corresponding sizes, this script will
19 | produce (roughly) evenly-sized bins across the chromosomes. It does not
20 | account for assembly gaps or the like.
21 | }
22 | \examples{
23 | chromSizes = c(chr1=249250621, chr2=243199373, chr3=198022430)
24 | cBins = binChroms(1000, chromSizes)
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/man/binRegion.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{binRegion}
 4 | \alias{binRegion}
 5 | \title{Divide regions into roughly equal bins}
 6 | \usage{
 7 | binRegion(start, end, binSize = NULL, binCount = NULL, indicator = NULL)
 8 | }
 9 | \arguments{
10 | \item{start}{The starting coordinate}
11 | 
12 | \item{end}{The ending coordinate}
13 | 
14 | \item{binSize}{The size of bin to divide the genome into. You must supply
15 | either binSize (priority) or binCount.}
16 | 
17 | \item{binCount}{The number of bins to divide. If you do not supply binSize,
18 | you must supply binCount, which will be used to calculate the binSize.}
19 | 
20 | \item{indicator}{A vector with identifiers to keep with your bins, in case
21 | you are doing this on a long table with multiple segments concatenated}
22 | }
23 | \value{
24 | A data.table, expanded to nrow = number of bins, with these id columns:
25 |     id: region ID
26 |     binID: repeating ID (this is the value to aggregate across)
27 |     ubinID: unique bin IDs
28 | }
29 | \description{
30 | Given a start coordinate, end coordinate, and number of bins to divide, 
31 | this function will split the regions into that many bins.
 32 | Bins will be only approximately the same size, due to rounding
 33 | (bin sizes should differ from one another by no more than 1).
34 | }
35 | \details{
36 | Use case: take a set of regions, like CG islands, and bin them; now you can
37 | aggregate signal scores across the bins, giving you an aggregate signal
38 | in bins across many regions of the same type.
39 | 
40 | In theory, this just runs on 3 values, but you can run it inside a 
41 | data.table j expression to divide a bunch of regions in the same way.
42 | }
43 | \examples{
44 | Rbins = binRegion(1, 3000, 100, 1000)
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/man/calcChromBins.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{calcChromBins}
 4 | \alias{calcChromBins}
 5 | \title{Calculates the distribution of a query set over the genome}
 6 | \usage{
 7 | calcChromBins(query, bins)
 8 | }
 9 | \arguments{
10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
11 | 
12 | \item{bins}{Pre-computed bins (as a GRangesList object) to aggregate
13 | over; for example, these could be genome bins}
14 | }
15 | \value{
 16 | A data.table showing where on each chromosome 
 17 |    the ranges are distributed.
18 | }
19 | \description{
20 | Returns a data.table showing counts of regions from the query that overlap
21 | with each bin.
 22 | In other words, where on each chromosome are the ranges distributed?
23 | You must provide binned regions. Only the midpoint of each query region is
24 | used to test for overlap with the bin regions.
25 | }
26 | \examples{
27 | 
28 | chromSizes = getChromSizes("hg19")
29 | genomeBins  = getGenomeBins(chromSizes)
30 | chromDistribution = calcChromBins(vistaEnhancers, genomeBins)
31 | 
32 | vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
33 | vistaSftd2 = GenomicRanges::shift(vistaEnhancers, 200000)
34 | calcChromBins(vistaEnhancers, GRangesList(vistaSftd, vistaSftd2))
35 | }
36 | 


--------------------------------------------------------------------------------
/man/calcChromBinsRef.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{calcChromBinsRef}
 4 | \alias{calcChromBinsRef}
  5 | \title{Returns the distribution of query over a reference assembly}
11 | \usage{
12 | calcChromBinsRef(query, refAssembly, binCount = 3000)
13 | }
14 | \arguments{
15 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
16 | 
17 | \item{refAssembly}{A character vector that will be used to grab chromosome
18 | sizes with \code{getChromSizes}}
19 | 
20 | \item{binCount}{Number of bins to divide the chromosomes into}
21 | }
22 | \value{
23 | A data.table showing the distribution of regions across bins of the
24 | reference genome.
25 | }
26 | \description{
 27 | Returns the distribution of query over a reference assembly.
 28 | Given a query set of elements (a GRanges object) and a reference assembly
 29 | (*e.g.* 'hg38'), this will aggregate and count the distribution of the query
 30 | elements across bins of the reference genome. This is a helper function to
 31 | create features for common genomes. It is a wrapper of
 32 | \code{calcChromBins}, which is more general.
33 | }
34 | \examples{
35 | ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
36 | }
37 | 


--------------------------------------------------------------------------------
/man/calcChromBinsRefSlow.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{calcChromBinsRefSlow}
 4 | \alias{calcChromBinsRefSlow}
  5 | \title{Returns the distribution of query over a reference assembly}
11 | \usage{
12 | calcChromBinsRefSlow(query, refAssembly, binCount = 3000)
13 | }
14 | \arguments{
15 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
16 | 
17 | \item{refAssembly}{A character vector that will be used to grab chromosome
18 | sizes with \code{getChromSizes}}
19 | 
20 | \item{binCount}{Number of bins to divide the chromosomes into}
21 | }
22 | \value{
23 | A data.table showing the distribution of regions across bins of the
24 | reference genome.
25 | }
26 | \description{
 27 | Returns the distribution of query over a reference assembly.
 28 | Given a query set of elements (a GRanges object) and a reference assembly
 29 | (*e.g.* 'hg38'), this will aggregate and count the distribution of the query
 30 | elements across bins of the reference genome. This is a helper function to
 31 | create features for common genomes. It is a wrapper of
 32 | \code{calcChromBins}, which is more general.
33 | }
34 | \examples{
35 | ChromBins = calcChromBinsRef(vistaEnhancers, "hg19")
36 | }
37 | 


--------------------------------------------------------------------------------
/man/calcCumulativePartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{calcCumulativePartitions}
 4 | \alias{calcCumulativePartitions}
 5 | \title{Calculates the cumulative distribution of overlaps between query and
 6 | arbitrary genomic partitions}
 7 | \usage{
 8 | calcCumulativePartitions(query, partitionList, remainder = "intergenic")
 9 | }
10 | \arguments{
11 | \item{query}{GRanges or GRangesList with regions to classify.}
12 | 
13 | \item{partitionList}{An ORDERED and NAMED list of genomic partitions
14 | GRanges. This list must be in priority order; the input will be assigned
15 | to the first partition it overlaps.}
16 | 
17 | \item{remainder}{Which partition do you want to account for 'everything
18 | else'?}
19 | }
20 | \value{
21 | A data.frame assigning each element of a GRanges object to a
22 |     partition from a previously provided partitionList.
23 | }
24 | \description{
25 | Takes a GRanges object, then assigns each element to a partition from the
26 | provided partitionList, and then tallies the number of regions assigned to
27 | each partition. A typical example of partitions is promoter, exon, intron,
 28 | etc; this function will yield the number of each for a query GRanges object.
29 | There will be a priority order to these, to account for regions that may
30 | overlap multiple genomic partitions.
31 | }
32 | \examples{
33 | partitionList = genomePartitionList(geneModels_hg19$genesGR,
34 |                                     geneModels_hg19$exonsGR,
35 |                                     geneModels_hg19$threeUTRGR,
36 |                                     geneModels_hg19$fiveUTRGR)
37 | calcCumulativePartitions(vistaEnhancers, partitionList)
38 | }
39 | 


--------------------------------------------------------------------------------
/man/calcCumulativePartitionsRef.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{calcCumulativePartitionsRef}
 4 | \alias{calcCumulativePartitionsRef}
 5 | \title{Calculates the cumulative distribution of overlaps for a query set to a
 6 | reference assembly}
 7 | \usage{
 8 | calcCumulativePartitionsRef(query, refAssembly)
 9 | }
10 | \arguments{
11 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
12 | 
13 | \item{refAssembly}{A character vector specifying the reference genome
14 | assembly (*e.g.* 'hg19'). This will be used to grab chromosome sizes
15 | with \code{getTSSs}.}
16 | }
17 | \value{
18 | A data.frame indicating the number of query region overlaps in
19 |     several genomic partitions.
20 | }
21 | \description{
22 | This function is a wrapper for \code{calcCumulativePartitions} that uses
23 | built-in partitions for a given reference genome assembly.
24 | }
25 | \examples{
26 | calcCumulativePartitionsRef(vistaEnhancers, "hg19")
27 | }
28 | 


--------------------------------------------------------------------------------
/man/calcDinuclFreq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/content-plots.R
 3 | \name{calcDinuclFreq}
 4 | \alias{calcDinuclFreq}
  5 | \title{Calculate dinucleotide content over genomic ranges}
 6 | \usage{
 7 | calcDinuclFreq(query, ref, rawCounts = FALSE)
 8 | }
 9 | \arguments{
10 | \item{query}{A GRanges object with query sets}
11 | 
12 | \item{ref}{Reference genome BSgenome object}
13 | 
14 | \item{rawCounts}{a logical indicating whether the raw numbers should be 
15 | displayed, rather than percentages (optional).}
16 | }
17 | \value{
18 | A data.table with counts of dinucleotides across the GRanges object
19 | }
20 | \description{
21 | Given a reference genome (BSgenome object) and ranges on the
22 | reference, this function returns a data.table with 
23 | counts of dinucleotides within the GRanges object.
24 | }
25 | \examples{
26 | \dontrun{ 
27 | bsg = loadBSgenome('hg19')
28 | DNF = calcDinuclFreq(vistaEnhancers, bsg)
29 | }
30 | }
31 | 


--------------------------------------------------------------------------------
/man/calcDinuclFreqRef.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/content-plots.R
 3 | \name{calcDinuclFreqRef}
 4 | \alias{calcDinuclFreqRef}
 5 | \title{Calculate dinucleotide content over genomic ranges}
 6 | \usage{
 7 | calcDinuclFreqRef(query, refAssembly, rawCounts = FALSE)
 8 | }
 9 | \arguments{
10 | \item{query}{A GRanges object with query sets}
11 | 
12 | \item{refAssembly}{A character vector specifying the reference genome
13 | assembly (*e.g.* 'hg19'). This will be used to grab chromosome sizes with
14 | \code{getTSSs}.}
15 | 
16 | \item{rawCounts}{a logical indicating whether the raw numbers should be 
17 | displayed, rather than percentages (optional).}
18 | }
19 | \value{
 20 | A data.table with counts of dinucleotides across 
 21 |     the query regions.
22 | }
23 | \description{
24 | Given a reference genome (BSgenome object) and ranges on the
25 | reference, this function returns a data.table with 
26 | counts of dinucleotides within the GRanges object.
27 | }
28 | \examples{
29 | \dontrun{
30 | query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
31 | GRquery = rtracklayer::import(query)
32 | refAssembly = 'hg19'
33 | DNF = calcDinuclFreqRef(GRquery, refAssembly)
34 | } 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/calcExpectedPartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{calcExpectedPartitions}
 4 | \alias{calcExpectedPartitions}
 5 | \title{Calculates expected partition overlap based on contribution of each
 6 | feature (partition) to genome size. Expected and observed overlaps
 7 | are then compared.}
 8 | \usage{
 9 | calcExpectedPartitions(
10 |   query,
11 |   partitionList,
12 |   genomeSize = NULL,
13 |   remainder = "intergenic",
14 |   bpProportion = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{query}{GRanges or GRangesList with regions to classify.}
19 | 
20 | \item{partitionList}{An ORDERED (if bpProportion=FALSE) and NAMED
21 | list of genomic partitions GRanges. This list must be in
22 | priority order; the input will be assigned
23 | to the first partition it overlaps. However, if bpProportion=TRUE,
24 | the list does not need ordering.}
25 | 
26 | \item{genomeSize}{The number of bases in the query genome. In other words,
27 | the sum of all chromosome sizes.}
28 | 
29 | \item{remainder}{Which partition do you want to account for 'everything
30 | else'?}
31 | 
32 | \item{bpProportion}{logical indicating if overlaps should be calculated based
33 | on number of base pairs overlapping with each partition.
34 | bpProportion=FALSE does overlaps in priority order,
35 | bpProportion=TRUE counts number of overlapping
36 | base pairs between query and each partition.}
37 | }
38 | \value{
39 | A data.frame assigning each element of a GRanges object to a
40 |     partition from a previously provided partitionList. The data.frame also
41 |     contains Chi-square p-values calculated for observed/expected
42 |     overlaps on each individual partition.
43 | }
44 | \description{
45 | Calculates expected partition overlap based on contribution of each
46 | feature (partition) to genome size. Expected and observed overlaps
47 | are then compared.
48 | }
49 | \examples{
50 | partitionList = genomePartitionList(geneModels_hg19$genesGR,
51 |                                     geneModels_hg19$exonsGR,
52 |                                     geneModels_hg19$threeUTRGR,
53 |                                     geneModels_hg19$fiveUTRGR)
54 | chromSizes = getChromSizes('hg19')
55 | genomeSize = sum(chromSizes)
56 | calcExpectedPartitions(vistaEnhancers, partitionList, genomeSize)
57 | }
58 | 


--------------------------------------------------------------------------------
/man/calcExpectedPartitionsRef.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{calcExpectedPartitionsRef}
 4 | \alias{calcExpectedPartitionsRef}
 5 | \title{Calculates the distribution of observed versus expected overlaps for a
 6 | query set to a reference assembly}
 7 | \usage{
 8 | calcExpectedPartitionsRef(query, refAssembly, bpProportion = FALSE)
 9 | }
10 | \arguments{
11 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
12 | 
13 | \item{refAssembly}{A character vector specifying the reference genome
14 | assembly (*e.g.* 'hg19'). This will be used to grab annotation
15 | models with \code{getGeneModels}, and chromosome sizes with \code{getChromSizes}.}
16 | 
17 | \item{bpProportion}{logical indicating if overlaps should be calculated based
18 | on number of base pairs overlapping with each partition.
19 | bpProportion=FALSE does overlaps in priority order,
20 | bpProportion=TRUE counts number of overlapping
21 | base pairs between query and each partition.}
22 | }
23 | \value{
24 | A data.frame indicating the number of query region overlaps in
25 |     several genomic partitions.
26 | }
27 | \description{
28 | This function is a wrapper for \code{calcExpectedPartitions} that uses
29 | built-in partitions for a given reference genome assembly.
30 | }
31 | \examples{
32 | calcExpectedPartitionsRef(vistaEnhancers, "hg19")
33 | }
34 | 


--------------------------------------------------------------------------------
/man/calcFeatureDist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/feature-plots.R
 3 | \name{calcFeatureDist}
 4 | \alias{calcFeatureDist}
 5 | \title{Find the distance to the nearest genomic feature}
 6 | \usage{
 7 | calcFeatureDist(query, features)
 8 | }
 9 | \arguments{
10 | \item{query}{A GRanges or GRangesList object with query sets}
11 | 
12 | \item{features}{A GRanges object with features to test distance to}
13 | }
14 | \value{
15 | A vector of genomic distances for each query region relative to its 
16 |     closest feature.
17 | }
18 | \description{
19 | For a given query set of genomic regions, and a given feature set of 
20 | regions, this function will return the distance for each query region to its
21 | closest feature. It ignores strand and returns the distance as positive or 
22 | negative, depending on whether the feature is upstream or downstream.
23 | }
24 | \details{
25 | This function is similar to the bioconductor distanceToNearest function, but
26 | returns negative values for downstream distances instead of absolute values.
27 | This allows you to assess the relative location.
28 | }
29 | \examples{
30 | vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
31 | calcFeatureDist(vistaEnhancers, vistaSftd) 
32 | }
33 | 


--------------------------------------------------------------------------------
/man/calcFeatureDistRefTSS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/feature-plots.R
 3 | \name{calcFeatureDistRefTSS}
 4 | \alias{calcFeatureDistRefTSS}
 5 | \title{Calculates the distribution of distances from a query set to closest TSS}
 6 | \usage{
 7 | calcFeatureDistRefTSS(query, refAssembly)
 8 | }
 9 | \arguments{
10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
11 | 
12 | \item{refAssembly}{A character vector specifying the reference genome
13 | assembly (*e.g.* 'hg19'). This will be used to grab TSS locations with
14 | \code{getTSSs}.}
15 | }
16 | \value{
17 | A vector of distances for each query region relative to TSSs.
18 | }
19 | \description{
20 | Given a query GRanges object and an assembly string, this function will grab
21 | the TSS list for the given reference assembly and then calculate the distance
22 | from each query feature to the closest TSS. It is a wrapper of
23 | \code{calcFeatureDist} that uses built-in TSS features for a reference
24 | assembly.
25 | }
26 | \examples{
27 | calcFeatureDistRefTSS(vistaEnhancers, "hg19")
28 | }
29 | 


--------------------------------------------------------------------------------
/man/calcGCContent.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/content-plots.R
 3 | \name{calcGCContent}
 4 | \alias{calcGCContent}
 5 | \title{Calculate GC content over genomic ranges}
 6 | \usage{
 7 | calcGCContent(query, ref)
 8 | }
 9 | \arguments{
10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions.}
11 | 
12 | \item{ref}{Reference genome BSgenome object.}
13 | }
14 | \value{
15 | A numeric vector or list of vectors with the GC percentage of 
16 |     the query regions.
17 | }
18 | \description{
19 | Given a reference genome as a BSgenome object and some ranges on that
20 | reference, this function will return a vector of the same length as the
21 | granges object, with percent of Cs and Gs.
22 | }
23 | \examples{
24 | \dontrun{
25 | bsg = loadBSgenome('hg19')
26 | gcvec = calcGCContent(vistaEnhancers, bsg)
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/man/calcGCContentRef.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/content-plots.R
 3 | \name{calcGCContentRef}
 4 | \alias{calcGCContentRef}
 5 | \title{Calculate GC content over genomic ranges}
 6 | \usage{
 7 | calcGCContentRef(query, refAssembly)
 8 | }
 9 | \arguments{
10 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
11 | 
12 | \item{refAssembly}{A character vector specifying the reference genome
13 | assembly (*e.g.* 'hg19'). This will be used to load the matching
14 | BSgenome object with \code{loadBSgenome}.}
15 | }
16 | \value{
17 | A numeric vector or list of vectors with the GC percentage of 
18 |     the query regions.
19 | }
20 | \description{
21 | Given a reference genome as a BSgenome object and some ranges on that
22 | reference, this function will return a vector of the same length as the
23 | granges object, with percent of Cs and Gs.
24 | }
25 | \examples{
26 | \dontrun{
27 | refAssembly = 'hg19'
28 | GCcontent = calcGCContentRef(vistaEnhancers, refAssembly)
29 | } 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/calcNearestNeighbors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/neighbor-distances.R
 3 | \name{calcNearestNeighbors}
 4 | \alias{calcNearestNeighbors}
 5 | \title{Group regions from the same chromosome together and
 6 | compute the distance of a region to its nearest neighbor. 
 7 | Distances are then lumped into a numeric vector.}
 8 | \usage{
 9 | calcNearestNeighbors(query, correctRef = "None")
10 | }
11 | \arguments{
12 | \item{query}{A GRanges or GRangesList object.}
13 | 
14 | \item{correctRef}{A string indicating the reference genome
15 | to use if Nearest neighbor distances are corrected for the 
16 | number of regions in a regionSet.}
17 | }
18 | \value{
19 | A numeric vector or list of vectors containing the
20 |  distance of regions to their nearest neighbors.
21 | }
22 | \description{
23 | Group regions from the same chromosome together and
24 | compute the distance of a region to its nearest neighbor. 
25 | Distances are then lumped into a numeric vector.
26 | }
27 | \examples{
28 | Nneighbors = calcNearestNeighbors(vistaEnhancers)
29 | }
30 | 


--------------------------------------------------------------------------------
/man/calcNeighborDist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/neighbor-distances.R
 3 | \name{calcNeighborDist}
 4 | \alias{calcNeighborDist}
 5 | \title{Group regions from the same chromosome together and
 6 | calculate the distances of a region to its upstream and
 7 | downstream neighboring regions.  
 8 | Distances are then lumped into a numeric vector.}
 9 | \usage{
10 | calcNeighborDist(query, correctRef = "None")
11 | }
12 | \arguments{
13 | \item{query}{A GRanges or GRangesList object.}
14 | 
15 | \item{correctRef}{A string indicating the reference genome
16 | to use if distances are corrected for the number of 
17 | regions in a regionSet.}
18 | }
19 | \value{
20 | A numeric vector or list with different vectors containing the
21 |  distances of regions to their upstream/downstream neighbors.
22 | }
23 | \description{
24 | Group regions from the same chromosome together and
25 | calculate the distances of a region to its upstream and
26 | downstream neighboring regions.  
27 | Distances are then lumped into a numeric vector.
28 | }
29 | \examples{
30 | dist = calcNeighborDist(vistaEnhancers)
31 | }
32 | 


--------------------------------------------------------------------------------
/man/calcPartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{calcPartitions}
 4 | \alias{calcPartitions}
 5 | \title{Calculates the distribution of overlaps between
 6 | query and arbitrary genomic partitions}
 7 | \usage{
 8 | calcPartitions(
 9 |   query,
10 |   partitionList,
11 |   remainder = "intergenic",
12 |   bpProportion = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{query}{GRanges or GRangesList with regions to classify}
17 | 
18 | \item{partitionList}{an ORDERED (if bpProportion=FALSE) and NAMED list of
19 | genomic partitions GRanges. This list must be in priority order; the
20 | input will be assigned to the first partition it overlaps.
21 | However, if bpProportion=TRUE, the list does not need ordering.}
22 | 
23 | \item{remainder}{A character vector to assign any query regions that do
24 | not overlap with anything in the partitionList. Defaults to "intergenic"}
25 | 
26 | \item{bpProportion}{logical indicating if overlaps should be calculated based
27 | on number of base pairs overlapping with each partition.
28 | bpProportion=FALSE does overlaps in priority order,
29 | bpProportion=TRUE counts number of overlapping
30 | base pairs between query and each partition.}
31 | }
32 | \value{
33 | A data.frame assigning each element of a GRanges object to a
34 |     partition from a previously provided partitionList.
35 | }
36 | \description{
37 | Takes a GRanges object, then assigns each element to a partition from the
38 | provided partitionList, and then tallies the number of regions assigned to
39 | each partition. A typical example of partitions is promoter, exon, intron,
40 | etc; this function will yield the number of each for a query GRanges object.
41 | There will be a priority order to these, to account for regions that may
42 | overlap multiple genomic partitions.
43 | }
44 | \examples{
45 | partitionList = genomePartitionList(geneModels_hg19$genesGR,
46 |                                     geneModels_hg19$exonsGR,
47 |                                     geneModels_hg19$threeUTRGR,
48 |                                     geneModels_hg19$fiveUTRGR)
49 | calcPartitions(vistaEnhancers, partitionList)
50 | }
51 | 


--------------------------------------------------------------------------------
/man/calcPartitionsRef.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{calcPartitionsRef}
 4 | \alias{calcPartitionsRef}
 5 | \title{Calculates the distribution of overlaps for a query set to a reference
 6 | assembly}
 7 | \usage{
 8 | calcPartitionsRef(query, refAssembly, bpProportion = FALSE)
 9 | }
10 | \arguments{
11 | \item{query}{A GenomicRanges or GenomicRangesList object with query regions}
12 | 
13 | \item{refAssembly}{A character vector specifying the reference genome
14 | assembly (*e.g.* 'hg19'). This will be used to grab annotation
15 | models with \code{getGeneModels}}
16 | 
17 | \item{bpProportion}{logical indicating if overlaps should be calculated
18 | based on number of base pairs overlapping with each partition.
19 | bpProportion=FALSE does overlaps in priority order,
20 | bpProportion=TRUE counts number of overlapping
21 | base pairs between query and each partition.}
22 | }
23 | \value{
24 | A data.frame indicating the number of query region overlaps in
25 |     several genomic partitions.
26 | }
27 | \description{
28 | This function is a wrapper for \code{calcPartitions}
29 | and \code{calcPartitionPercents} that uses built-in
30 | partitions for a given reference genome assembly.
31 | }
32 | \examples{
33 | calcPartitionsRef(vistaEnhancers, "hg19")
34 | }
35 | 


--------------------------------------------------------------------------------
/man/calcSummarySignal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/specificity-plots.R
 3 | \name{calcSummarySignal}
 4 | \alias{calcSummarySignal}
 5 | \title{The function calcSummarySignal takes the input BED file(s) 
 6 | in form of GRanges or GRangesList object, overlaps 
 7 | it with all defined open chromatin regions across 
 8 | conditions (e.g. cell types) and returns a matrix, 
 9 | where each row is the input genomic region 
10 | (if overlap was found), each column is a condition, 
11 | and the value is a mean signal from regions where
12 | overlap was found.}
13 | \usage{
14 | calcSummarySignal(query, signalMatrix)
15 | }
16 | \arguments{
17 | \item{query}{Genomic regions to be analyzed. Can be GRanges or GRangesList 
18 | object.}
19 | 
20 | \item{signalMatrix}{Matrix with signal values in predefined regions, where
21 | rows are predefined genomic regions, columns are conditions 
22 | (e.g. cell types in which the signal was measured). 
23 | First column contains information about the genomic region in 
24 | following form: chr_start_end. 
25 | Can be either data.frame or data.table object.}
26 | }
27 | \value{
28 | A list with named components:
29 |            signalSummaryMatrix - data.table with cell specific open chromatin signal
30 |                           values for query regions
31 |            matrixStats - data.frame containing boxplot stats for individual 
32 |                           cell type
33 | }
34 | \description{
35 | The function calcSummarySignal takes the input BED file(s) 
36 | in form of GRanges or GRangesList object, overlaps 
37 | it with all defined open chromatin regions across 
38 | conditions (e.g. cell types) and returns a matrix, 
39 | where each row is the input genomic region 
40 | (if overlap was found), each column is a condition, 
41 | and the value is a mean signal from regions where
42 | overlap was found.
43 | }
44 | \examples{
45 | signalSummaryList = calcSummarySignal(vistaEnhancers, exampleOpenSignalMatrix_hg19)
46 | }
47 | 


--------------------------------------------------------------------------------
/man/calcWidth.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/qthist.R
 3 | \name{calcWidth}
 4 | \alias{calcWidth}
 5 | \title{Calculate the widths of regions}
 6 | \usage{
 7 | calcWidth(query)
 8 | }
 9 | \arguments{
10 | \item{query}{A GRanges or GRangesList object with query sets}
11 | }
12 | \value{
13 | A vector of the widths (end-start coordinates) of GRanges objects.
14 | }
15 | \description{
16 | The length of a genomic region (the distance between the start and end) 
17 | is called the width.
18 | When given a query set of genomic regions, this function returns the width.
19 | }
20 | \examples{
21 | regWidths = calcWidth(vistaEnhancers)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/cellTypeMetadata.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cellTypeMetadata}
 5 | \alias{cellTypeMetadata}
 6 | \title{Table that maps cell types to tissues and groups}
 7 | \format{
 8 | data.table with 3 columns (cellType, tissue and group) 
 9 |     and 74 rows (one per cellType)
10 | }
11 | \source{
12 | self-curated dataset
13 | }
14 | \usage{
15 | data(cellTypeMetadata)
16 | }
17 | \description{
18 | Table that maps cell types to tissues and groups
19 | }
20 | \keyword{datasets}
21 | 


--------------------------------------------------------------------------------
/man/chromSizes_hg19.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{chromSizes_hg19}
 5 | \alias{chromSizes_hg19}
 6 | \title{hg19 chromosome sizes}
 7 | \format{
 8 | A named vector of lengths with one item per chromosome
 9 | }
10 | \source{
11 | BSgenome.Hsapiens.UCSC.hg19 package
12 | }
13 | \usage{
14 | data(chromSizes_hg19)
15 | }
16 | \description{
17 | A dataset containing chromosome sizes for Homo Sapiens hg19 genome assembly
18 | }
19 | \keyword{datasets}
20 | 


--------------------------------------------------------------------------------
/man/dot-requireAndReturn.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{.requireAndReturn}
 4 | \alias{.requireAndReturn}
 5 | \title{Checks to make sure a package object is installed,
 6 | and if so, returns it. If the library is not installed, it issues a warning
 7 | and returns NULL.}
 8 | \usage{
 9 | .requireAndReturn(BSgenomeString)
10 | }
11 | \arguments{
12 | \item{BSgenomeString}{A BSgenome compatible genome string.}
13 | }
14 | \value{
15 | A BSgenome object if installed.
16 | }
17 | \description{
18 | Checks to make sure a package object is installed,
19 | and if so, returns it. If the library is not installed, it issues a warning
20 | and returns NULL.
21 | }
22 | 


--------------------------------------------------------------------------------
/man/dot-validateInputs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{.validateInputs}
 4 | \alias{.validateInputs}
 5 | \title{Checks class of the list of variables. To be used in functions}
 6 | \usage{
 7 | .validateInputs(checkList)
 8 | }
 9 | \arguments{
10 | \item{checkList}{list of object to check, e.g. 
11 | list(varname=c("data.frame", "numeric")). 
12 | Multiple strings in the vector are treated as OR.}
13 | }
14 | \value{
15 | A warning if the wrong input class is provided.
16 | }
17 | \description{
18 | Checks class of the list of variables. To be used in functions
19 | }
20 | \examples{
21 | x = function(var1) {
22 |     cl = list(var1=c("numeric","character"))
23 |     .validateInputs(cl)
24 |     return(var1^2)
25 | }
26 | }
27 | 


--------------------------------------------------------------------------------
/man/dtToGr.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{dtToGr}
 4 | \alias{dtToGr}
 5 | \title{Converts a data.table (DT) object to a GenomicRanges 
 6 | (GR) object. Tries to be intelligent, guessing chr 
 7 | and start, but you have to supply end or other
 8 | columns if you want them to be carried into the GR.}
 9 | \usage{
10 | dtToGr(
11 |   DT,
12 |   chr = "chr",
13 |   start = "start",
14 |   end = NA,
15 |   strand = NA,
16 |   name = NA,
17 |   splitFactor = NA,
18 |   metaCols = NA
19 | )
20 | }
21 | \arguments{
22 | \item{DT}{A data.table representing genomic regions.}
23 | 
24 | \item{chr}{A string representing the chromosome column.}
25 | 
26 | \item{start}{A string representing the name of the start column.}
27 | 
28 | \item{end}{A string representing the name of the end column.}
29 | 
30 | \item{strand}{A string representing the name of the strand column.}
31 | 
32 | \item{name}{A string representing the name of the name column.}
33 | 
34 | \item{splitFactor}{A string representing the name of the column to use to
35 | split the data.table into multiple data.tables.}
36 | 
37 | \item{metaCols}{A string representing the name of the metadata column(s)
38 | to include in the returned GRanges object.}
39 | }
40 | \value{
41 | A GRanges object.
42 | }
43 | \description{
44 | Converts a data.table (DT) object to a GenomicRanges 
45 | (GR) object. Tries to be intelligent, guessing chr 
46 | and start, but you have to supply end or other
47 | columns if you want them to be carried into the GR.
48 | }
49 | \examples{
50 | start1 = c(seq(from=1, to = 2001, by = 1000), 800)
51 | chrString1 = c(rep("chr1", 3), "chr2")
52 | dt = data.table::data.table(chr=chrString1,
53 |                             start=start1,
54 |                             end=start1 + 250)
55 | newGR = dtToGr(dt)                
56 | }
57 | 


--------------------------------------------------------------------------------
/man/dtToGrInternal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{dtToGrInternal}
 4 | \alias{dtToGrInternal}
 5 | \title{Two utility functions for converting data.tables into GRanges objects}
 6 | \usage{
 7 | dtToGrInternal(DT, chr, start, end = NA, strand = NA, name = NA, metaCols = NA)
 8 | }
 9 | \arguments{
10 | \item{DT}{A data.table representing genomic regions.}
11 | 
12 | \item{chr}{A string representing the chromosome column.}
13 | 
14 | \item{start}{A string representing the name of the start column.}
15 | 
16 | \item{end}{A string representing the name of the end column.}
17 | 
18 | \item{strand}{A string representing the name of the strand column.}
19 | 
20 | \item{name}{A string representing the name of the name column.}
21 | 
22 | \item{metaCols}{A string representing the name of the metadata column(s)
23 | to include in the returned GRanges object.}
24 | }
25 | \value{
26 | A GRanges object.
27 | }
28 | \description{
29 | Two utility functions for converting data.tables into GRanges objects
30 | }
31 | 


--------------------------------------------------------------------------------
/man/exampleOpenSignalMatrix_hg19.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{exampleOpenSignalMatrix_hg19}
 5 | \alias{exampleOpenSignalMatrix_hg19}
 6 | \title{A dataset containing a subset of open chromatin regions across all 
 7 | cell types defined by ENCODE for Homo Sapiens hg19}
 8 | \format{
 9 | data.frame, rows represent whole selection of open 
10 | chromatin regions across all cell types defined by ENCODE, columns are 
11 | individual cell types and values are normalized open chromatin signal values.
12 | }
13 | \source{
14 | \url{http://big.databio.org/open_chromatin_matrix/openSignalMatrix_hg19_quantileNormalized_round4.txt.gz}
15 | }
16 | \usage{
17 | data(exampleOpenSignalMatrix_hg19)
18 | }
19 | \description{
20 | Preparation steps:
21 | \enumerate{
22 |    \item{made a universe of regions by merging regions across 
23 |        cell types defined as opened in ENCODE}
24 |    \item{took bigwig files from ENCODE for individual cell types, 
25 |        merged replicates, filtered out blacklisted sites}
26 |    \item{evaluated the signal above regions defined by previous step}
27 |    \item{performed quantile normalization}
28 |    \item{subsetted it}
29 | }
30 | }
31 | \keyword{datasets}
32 | 


--------------------------------------------------------------------------------
/man/geneModels_hg19.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{geneModels_hg19}
 5 | \alias{geneModels_hg19}
 6 | \title{hg19 gene models}
 7 | \format{
 8 | A list of GRanges objects (genesGR, exonsGR, threeUTRGR, fiveUTRGR) with gene, exon, 3' UTR and 5' UTR locations
 9 | }
10 | \source{
11 | EnsDb.Hsapiens.v75 package
12 | }
13 | \usage{
14 | data(geneModels_hg19)
15 | }
16 | \description{
17 | A dataset containing gene models for Homo Sapiens hg19 genome assembly.
18 | }
19 | \keyword{datasets}
20 | 


--------------------------------------------------------------------------------
/man/genomePartitionList.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{genomePartitionList}
 4 | \alias{genomePartitionList}
 5 | \title{Create a basic genome partition list of genes, exons, introns, UTRs, and
 6 | intergenic}
 7 | \usage{
 8 | genomePartitionList(
 9 |   genesGR,
10 |   exonsGR,
11 |   threeUTRGR = NULL,
12 |   fiveUTRGR = NULL,
13 |   getCorePromoter = TRUE,
14 |   getProxPromoter = TRUE,
15 |   corePromSize = 100,
16 |   proxPromSize = 2000
17 | )
18 | }
19 | \arguments{
20 | \item{genesGR}{a GRanges object of gene coordinates}
21 | 
22 | \item{exonsGR}{a GRanges object of exons coordinates}
23 | 
24 | \item{threeUTRGR}{a GRanges object of 3' UTRs}
25 | 
26 | \item{fiveUTRGR}{a GRanges object of 5' UTRs}
27 | 
28 | \item{getCorePromoter}{option specifying if core promoters should be
29 | extracted; defaults to TRUE}
30 | 
31 | \item{getProxPromoter}{option specifying if proximal promoters should be
32 | extracted; defaults to TRUE}
33 | 
34 | \item{corePromSize}{size of core promoter (in bp) upstream from TSS,
35 | default value = 100}
36 | 
37 | \item{proxPromSize}{size of proximal promoter (in bp) upstream from TSS,
38 | default value = 2000}
39 | }
40 | \value{
41 | A list of GRanges objects, each corresponding to a partition of the
42 |     genome. Partitions include proximal and core promoters, exons and
43 |     introns.
44 | }
45 | \description{
46 | Given GRanges for genes, and a GRanges for exons, returns a list of GRanges
47 | corresponding to various breakdown of the genome, based on the given
48 | annotations; it gives you proximal and core promoters, exons, and introns.
49 | }
50 | \details{
51 | To be used as a partitionList for \code{calcPartitions}.
52 | }
53 | \examples{
54 | partitionList = genomePartitionList(geneModels_hg19$genesGR,
55 |                                     geneModels_hg19$exonsGR,
56 |                                     geneModels_hg19$threeUTRGR,
57 |                                     geneModels_hg19$fiveUTRGR)
58 | }
59 | 


--------------------------------------------------------------------------------
/man/getChromSizes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/loadData.R
 3 | \name{getChromSizes}
 4 | \alias{getChromSizes}
 5 | \title{Returns built-in chrom sizes for a given reference assembly}
 6 | \usage{
 7 | getChromSizes(refAssembly)
 8 | }
 9 | \arguments{
10 | \item{refAssembly}{A string identifier for the reference assembly}
11 | }
12 | \value{
13 | A vector with the chromosome sizes corresponding to a 
14 | specific genome assembly.
15 | }
16 | \description{
17 | Returns built-in chrom sizes for a given reference assembly
18 | }
19 | \examples{
20 | getChromSizes("hg19")
21 | }
22 | 


--------------------------------------------------------------------------------
/man/getChromSizesFromFasta.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/buildReferenceData.R
 3 | \name{getChromSizesFromFasta}
 4 | \alias{getChromSizesFromFasta}
 5 | \title{Get chromosome sizes from a remote or local FASTA file}
 6 | \usage{
 7 | getChromSizesFromFasta(source, destDir = NULL, convertEnsemblUCSC = FALSE)
 8 | }
 9 | \arguments{
10 | \item{source}{a string that is either a path to a
11 | local or remote FASTA}
12 | 
13 | \item{destDir}{a string that indicates the path to the 
14 | directory where the downloaded FASTA file should be stored}
15 | 
16 | \item{convertEnsemblUCSC}{a logical indicating whether Ensembl style 
17 | chromosome annotation should be changed to UCSC style (add chr)}
18 | }
19 | \value{
20 | a named vector of sequence lengths
21 | }
22 | \description{
23 | Get chromosome sizes from a remote or local FASTA file
24 | }
25 | \examples{
26 | CElegansFasteCropped = system.file("extdata", 
27 |                                    "C_elegans_cropped_example.fa.gz", 
28 |                                    package="GenomicDistributions")
29 | CElegansChromSizes = getChromSizesFromFasta(CElegansFasteCropped)
30 | }
31 | 


--------------------------------------------------------------------------------
/man/getGeneModels.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/loadData.R
 3 | \name{getGeneModels}
 4 | \alias{getGeneModels}
 5 | \title{Returns built-in gene models for a given reference assembly}
 6 | \usage{
 7 | getGeneModels(refAssembly)
 8 | }
 9 | \arguments{
10 | \item{refAssembly}{A string identifier for the reference assembly}
11 | }
12 | \value{
13 | A list containing the gene models corresponding to a
14 | specific reference assembly.
15 | }
16 | \description{
17 | Some functions require gene models, which can be obtained from any source.
18 | This function allows you to retrieve a few common built-in ones.
19 | }
20 | \examples{
21 | getGeneModels("hg19")
22 | }
23 | 


--------------------------------------------------------------------------------
/man/getGeneModelsFromGTF.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/buildReferenceData.R
 3 | \name{getGeneModelsFromGTF}
 4 | \alias{getGeneModelsFromGTF}
 5 | \title{Get gene models from a remote or local GTF file}
 6 | \usage{
 7 | getGeneModelsFromGTF(
 8 |   source,
 9 |   features,
10 |   convertEnsemblUCSC = FALSE,
11 |   destDir = NULL,
12 |   filterProteinCoding = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{source}{a string that is either a path to a local or remote GTF}
17 | 
18 | \item{features}{a vector of strings with feature identifiers to 
19 | include in the result list}
20 | 
21 | \item{convertEnsemblUCSC}{a logical indicating whether Ensembl style 
22 | chromosome annotation should be changed to UCSC style}
23 | 
24 | \item{destDir}{a string that indicates the path to the directory where
25 | the downloaded GTF file should be stored}
26 | 
27 | \item{filterProteinCoding}{a logical indicating if TSSs should be limited
28 | to protein-coding genes only (default = TRUE)}
29 | }
30 | \value{
31 | a list of GRanges objects
32 | }
33 | \description{
34 | Get gene models from a remote or local GTF file
35 | }
36 | \examples{
37 | CElegansGtfCropped = system.file("extdata", 
38 |                                  "C_elegans_cropped_example.gtf.gz", 
39 |                                  package="GenomicDistributions")
40 | features = c("gene", "exon", "three_prime_utr", "five_prime_utr")
41 | CElegansGeneModels = getGeneModelsFromGTF(CElegansGtfCropped, features, TRUE)
42 | }
43 | 


--------------------------------------------------------------------------------
/man/getGenomeBins.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{getGenomeBins}
 4 | \alias{getGenomeBins}
 5 | \title{Returns bins used in `calcChromBins` function
 6 | Given a named vector of chromosome sizes, the function returns
 7 | GRangesList object with bins for each chromosome.}
 8 | \usage{
 9 | getGenomeBins(chromSizes, binCount = 10000)
10 | }
11 | \arguments{
12 | \item{chromSizes}{a named vector of sizes (lengths) for each chromosome.}
13 | 
14 | \item{binCount}{number of bins (total; *not* per chromosome), 
15 | defaults to 10,000}
16 | }
17 | \value{
18 | A GRangesList object with bins that separate chromosomes
19 |         into equal parts.
20 | }
21 | \description{
22 | Returns bins used in `calcChromBins` function
23 | Given a named vector of chromosome sizes, the function returns
24 | GRangesList object with bins for each chromosome.
25 | }
26 | \examples{
27 | chromSizes = getChromSizes("hg19")
28 | chromBins  = getGenomeBins(chromSizes)
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/getReferenceData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/loadData.R
 3 | \name{getReferenceData}
 4 | \alias{getReferenceData}
 5 | \title{Get reference data for a specified assembly}
 6 | \usage{
 7 | getReferenceData(refAssembly, tagline)
 8 | }
 9 | \arguments{
10 | \item{refAssembly}{Reference assembly string (e.g. 'hg38')}
11 | 
12 | \item{tagline}{The string that was used to identify data of a given type in 
13 | the data building step. It's used for the filename so we know
14 | what to load, and is what makes this function generic (so it 
15 | can load different data types).}
16 | }
17 | \value{
18 | A requested and included package data object.
19 | }
20 | \description{
21 | This is a generic getter function that will return a data object requested,
22 | if it is included in the built-in data with the GenomicDistributions package 
23 | or GenomicDistributionsData package (if installed). Data objects can 
24 | be requested for different reference assemblies and data types (specified by
25 | a tagline, which is a unique string identifying the data type).
26 | }
27 | 


--------------------------------------------------------------------------------
/man/getTssFromGTF.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/buildReferenceData.R
 3 | \name{getTssFromGTF}
 4 | \alias{getTssFromGTF}
 5 | \title{Get transcription start sites (TSSs) from a remote or local GTF file}
 6 | \usage{
 7 | getTssFromGTF(
 8 |   source,
 9 |   convertEnsemblUCSC = FALSE,
10 |   destDir = NULL,
11 |   filterProteinCoding = TRUE
12 | )
13 | }
14 | \arguments{
15 | \item{source}{a string that is either a path to a local or remote GTF}
16 | 
17 | \item{convertEnsemblUCSC}{a logical indicating whether Ensembl style 
18 | chromosome annotation should be changed to UCSC style}
19 | 
20 | \item{destDir}{a string that indicates the path to the directory where 
21 | the downloaded GTF file should be stored}
22 | 
23 | \item{filterProteinCoding}{a logical indicating if TSSs should be limited
24 | to protein-coding genes only (default = TRUE)}
25 | }
26 | \value{
27 | a list of GRanges objects
28 | }
29 | \description{
30 | Get transcription start sites (TSSs) from a remote or local GTF file
31 | }
32 | \examples{
33 | CElegansGtfCropped = system.file("extdata", 
34 |                                  "C_elegans_cropped_example.gtf.gz", 
35 |                                  package="GenomicDistributions")
36 | CElegansTss = getTssFromGTF(CElegansGtfCropped, TRUE)
37 | }
38 | 


--------------------------------------------------------------------------------
/man/grToDt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{grToDt}
 4 | \alias{grToDt}
 5 | \title{Convert a GenomicRanges into a data.table.}
 6 | \usage{
 7 | grToDt(GR)
 8 | }
 9 | \arguments{
10 | \item{GR}{A GRanges object}
11 | }
12 | \value{
13 | A data.table object.
14 | }
15 | \description{
16 | Convert a GenomicRanges into a data.table.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/labelCuts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{labelCuts}
 4 | \alias{labelCuts}
 5 | \title{Creates labels based on a discretization definition.}
 6 | \usage{
 7 | labelCuts(
 8 |   breakPoints,
 9 |   round_digits = 1,
10 |   signif_digits = 3,
11 |   collapse = "-",
12 |   infBins = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{breakPoints}{The exact values you want as boundaries for your bins}
17 | 
18 | \item{round_digits}{Number of digits to round labels to.}
19 | 
20 | \item{signif_digits}{Number of significant digits to specify.}
21 | 
22 | \item{collapse}{Character to separate the labels}
23 | 
24 | \item{infBins}{use >/< as labels on the edge bins}
25 | }
26 | \value{
27 | A vector of histogram axis labels.
28 | }
29 | \description{
30 | If you are building a histogram of binned values, you want to have labels for
31 | your bins that correspond to the ranges you used to bin. This function takes
32 | the breakpoints that define your bins and produces nice-looking labels for
33 | your histogram plot.
34 | }
35 | \details{
36 | \code{labelCuts} will take a cut group, (e.g., a quantile division of 
37 | some signal), and give you clean labels (similar to the cut method).
38 | }
39 | 


--------------------------------------------------------------------------------
/man/loadBSgenome.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/loadData.R
 3 | \name{loadBSgenome}
 4 | \alias{loadBSgenome}
 5 | \title{Loads BSgenome objects from UCSC-style character vectors.}
 6 | \usage{
 7 | loadBSgenome(genomeBuild, masked = TRUE)
 8 | }
 9 | \arguments{
10 | \item{genomeBuild}{One of 'hg19', 'hg38', 'mm10', 'mm9', or 'grch38'}
11 | 
12 | \item{masked}{Should we use the masked version? Default: TRUE}
13 | }
14 | \value{
15 | A BSgenome object corresponding to the provided genome build.
16 | }
17 | \description{
18 | This function will let you use a simple character vector (e.g. 'hg19') to
19 | load and then return BSgenome objects. This lets you avoid having to use the
20 | more complex annotation for a complete BSgenome object (e.g.
21 | BSgenome.Hsapiens.UCSC.hg38.masked)
22 | }
23 | \examples{
24 | \dontrun{
25 | bsg = loadBSgenome('hg19')
26 | }
27 | }
28 | 


--------------------------------------------------------------------------------
/man/loadEnsDb.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/loadData.R
 3 | \name{loadEnsDb}
 4 | \alias{loadEnsDb}
 5 | \title{Load selected EnsDb library}
 6 | \usage{
 7 | loadEnsDb(genomeBuild)
 8 | }
 9 | \arguments{
10 | \item{genomeBuild}{string, genome identifier}
11 | }
12 | \value{
13 | loaded library
14 | }
15 | \description{
16 | Load selected EnsDb library
17 | }
18 | \examples{
19 | \dontrun{
20 | loadEnsDb("hg19")
21 | }
22 | }
23 | 


--------------------------------------------------------------------------------
/man/neighbordt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/neighbor-distances.R
 3 | \name{neighbordt}
 4 | \alias{neighbordt}
 5 | \title{Internal helper function to calculate distance 
 6 | between neighboring regions.}
 7 | \usage{
 8 | neighbordt(querydt)
 9 | }
10 | \arguments{
11 | \item{querydt}{A data table with regions grouped according to
12 | chromosome.}
13 | }
14 | \value{
15 | A numeric vector with the distances in bp
16 | }
17 | \description{
18 | Internal helper function to calculate distance 
19 | between neighboring regions.
20 | }
21 | 


--------------------------------------------------------------------------------
/man/nlist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{nlist}
 4 | \alias{nlist}
 5 | \title{Nathan's magical named list function.
 6 | This function is a drop-in replacement for the base list() function,
 7 | which automatically names your list according to the names of the 
 8 | variables used to construct it.
 9 | It seamlessly handles lists with some names and others absent,
10 | not overwriting specified names while naming any unnamed parameters.
11 | Took me awhile to figure this out.}
12 | \usage{
13 | nlist(...)
14 | }
15 | \arguments{
16 | \item{...}{arguments passed to list()}
17 | }
18 | \value{
19 | A named list object.
20 | }
21 | \description{
22 | Nathan's magical named list function.
23 | This function is a drop-in replacement for the base list() function,
24 | which automatically names your list according to the names of the 
25 | variables used to construct it.
26 | It seamlessly handles lists with some names and others absent,
27 | not overwriting specified names while naming any unnamed parameters.
28 | Took me awhile to figure this out.
29 | }
30 | \examples{
31 | x=5
32 | y=10
33 | nlist(x,y) # returns list(x=5, y=10)
34 | list(x,y) # returns unnamed list(5, 10)
35 | }
36 | 


--------------------------------------------------------------------------------
/man/plotChromBins.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chrom-plots.R
 3 | \name{plotChromBins}
 4 | \alias{plotChromBins}
 5 | \title{Plot distribution over chromosomes}
 6 | \usage{
 7 | plotChromBins(
 8 |   genomeAggregate,
 9 |   plotTitle = "Distribution over chromosomes",
10 |   ylim = "max"
11 | )
12 | }
13 | \arguments{
14 | \item{genomeAggregate}{The output from the genomicDistribution function}
15 | 
16 | \item{plotTitle}{Title for plot.}
17 | 
18 | \item{ylim}{Limit of y-axes. Default "max" sets limit to N of biggest bin.}
19 | }
20 | \value{
21 | A ggplot object showing the distribution of the query 
22 |     regions over bins of
23 | the reference genome.
24 | }
25 | \description{
26 | Plots result from \code{genomicDistribution} calculation
27 | }
28 | \examples{
29 | agg = data.frame("regionID"=1:5, "chr"=rep(c("chr1"), 5), 
30 |                 "withinGroupID"=1:5, "N"=c(1,3,5,7,9))  
31 | ChromBins = plotChromBins(agg)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/plotCumulativePartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{plotCumulativePartitions}
 4 | \alias{plotCumulativePartitions}
 5 | \title{Plot the cumulative distribution of regions in features}
 6 | \usage{
 7 | plotCumulativePartitions(assignedPartitions, feature_names = NULL)
 8 | }
 9 | \arguments{
10 | \item{assignedPartitions}{Results from \code{calcCumulativePartitions}}
11 | 
12 | \item{feature_names}{An optional character vector of feature names, in the
13 | same order as the GenomicRanges or GenomicRangesList object.}
14 | }
15 | \value{
16 | A ggplot object of the cumulative distribution of regions in
17 |     features.
18 | }
19 | \description{
20 | This function plots the cumulative distribution of regions across a
21 | feature set.
22 | }
23 | \examples{
24 | p = calcCumulativePartitionsRef(vistaEnhancers, "hg19")
25 | cumuPlot = plotCumulativePartitions(p)
26 | }
27 | 


--------------------------------------------------------------------------------
/man/plotDinuclFreq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/content-plots.R
 3 | \name{plotDinuclFreq}
 4 | \alias{plotDinuclFreq}
 5 | \title{Plot dinucleotide content within region set(s)}
 6 | \usage{
 7 | plotDinuclFreq(DNFDataTable)
 8 | }
 9 | \arguments{
10 | \item{DNFDataTable}{A data.table, data.frame, or a list of dinucleotide counts - 
11 | results from \code{calcDinuclFreq} or \code{calcDinuclFreqRef}}
12 | }
13 | \value{
14 | A ggplot object plotting distribution of dinucleotide content in query regions
15 | }
16 | \description{
17 | Given \code{calcDinuclFreq} or \code{calcDinuclFreqRef} results, this function 
18 | generates a violin plot of dinucleotide frequency
19 | }
20 | \examples{
21 | 
22 | DNFDataTable = data.table::data.table(GC = rnorm(400, mean=0.5, sd=0.1), 
23 | CG = rnorm(400, mean=0.5, sd=0.5), 
24 | AT = rnorm(400, mean=0.5, sd=1), 
25 | TA = rnorm(400, mean=0.5, sd=1.5))
26 | DNFPlot =  plotDinuclFreq(DNFDataTable)
27 | 
28 | \dontrun{
29 | query = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
30 | GRquery = rtracklayer::import(query)
31 | refAssembly = 'hg19'
32 | DNF = calcDinuclFreqRef(GRquery, refAssembly)
33 | DNFPlot2 =  plotDinuclFreq(DNF)
34 | } 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/plotExpectedPartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{plotExpectedPartitions}
 4 | \alias{plotExpectedPartitions}
 5 | \title{Produces a barplot showing how query regions of interest are distributed
 6 | relative to the expected distribution across a given partition list}
 7 | \usage{
 8 | plotExpectedPartitions(expectedPartitions, feature_names = NULL, pval = FALSE)
 9 | }
10 | \arguments{
11 | \item{expectedPartitions}{A data.frame holding the frequency of assignment
12 | to each of the partitions, the expected number of each partition, and
13 | the log10 of the observed over expected. Produced by
14 | \code{calcExpectedPartitions}.}
15 | 
16 | \item{feature_names}{Character vector with labels for the partitions
17 | (optional). By default it will use the names from the first argument.}
18 | 
19 | \item{pval}{Logical indicating whether Chi-square p-values should be added
20 | for each partition.}
21 | }
22 | \value{
23 | A ggplot object using a barplot to show the distribution of the
24 |     query regions across a given partition list.
25 | }
26 | \description{
27 | Produces a barplot showing how query regions of interest are distributed
28 | relative to the expected distribution across a given partition list
29 | }
30 | \examples{
31 | p = calcExpectedPartitionsRef(vistaEnhancers, "hg19")
32 | expectedPlot = plotExpectedPartitions(p)
33 | }
34 | 


--------------------------------------------------------------------------------
/man/plotFeatureDist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/feature-plots.R
 3 | \name{plotFeatureDist}
 4 | \alias{plotFeatureDist}
 5 | \title{Plots a histogram of distances to genomic features}
 6 | \usage{
 7 | plotFeatureDist(
 8 |   dists,
 9 |   bgdists = NULL,
10 |   featureName = "features",
11 |   numbers = FALSE,
12 |   nbins = 50,
13 |   size = 1e+05,
14 |   infBins = FALSE,
15 |   tile = FALSE,
16 |   labelOrder = "default"
17 | )
18 | }
19 | \arguments{
20 | \item{dists}{Results from \code{featureDistribution}}
21 | 
22 | \item{bgdists}{Background distances. If provided, will plot a background
23 | distribution of expected distances}
24 | 
25 | \item{featureName}{Character vector for plot labels (optional).}
26 | 
27 | \item{numbers}{a logical indicating whether the raw numbers should be 
28 | displayed, rather than percentages (optional).}
29 | 
30 | \item{nbins}{Number of bins on each side of the center point.}
31 | 
32 | \item{size}{Number of bases to include in plot on each side of the 
33 | center point.}
34 | 
35 | \item{infBins}{Include catch-all bins on the sides?}
36 | 
37 | \item{tile}{Turn on a tile mode, which plots a tiled figure 
38 | instead of a histogram.}
39 | 
40 | \item{labelOrder}{-- Enter "default" to order by order of user input (default); 
41 | Enter "center" to order by value in tile in the closest proximity to the center 
42 | of features (in case TSS is used - center is TSS) (center).}
43 | }
44 | \value{
45 | A ggplot2 plot object
46 | }
47 | \description{
48 | Given the results from \code{featureDistribution}, plots a histogram of
49 | distances surrounding the features of interest
50 | }
51 | \examples{
52 | TSSdist = calcFeatureDistRefTSS(vistaEnhancers, "hg19")
53 | f = plotFeatureDist(TSSdist, featureName="TSS")
54 | }
55 | 


--------------------------------------------------------------------------------
/man/plotGCContent.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/content-plots.R
 3 | \name{plotGCContent}
 4 | \alias{plotGCContent}
 5 | \title{Plots a density distribution of GC vectors
 6 | Given results from the \code{calcGCContent} function, this will produce a
 7 | density plot}
 8 | \usage{
 9 | plotGCContent(gcvectors)
10 | }
11 | \arguments{
12 | \item{gcvectors}{A numeric vector or list of numeric vectors of GC contents.}
13 | }
14 | \value{
15 | A ggplot object plotting distribution of GC content in query regions.
16 | }
17 | \description{
18 | Plots a density distribution of GC vectors
19 | Given results from the \code{calcGCContent} function, this will produce a
20 | density plot
21 | }
22 | \examples{
23 | numVector = rnorm(400, mean=0.5, sd=0.1)
24 | GCplot = plotGCContent(numVector)
25 | vecs = list(example1 = rnorm(400, mean=0.5, sd=0.1), 
26 |             example2 = rnorm(600, mean=0.5, sd=0.1))
27 | GCplot = plotGCContent(vecs)
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/man/plotNeighborDist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/neighbor-distances.R
 3 | \name{plotNeighborDist}
 4 | \alias{plotNeighborDist}
 5 | \title{Plot the distances from regions to their upstream/downstream neighbors
 6 | or nearest neighbors. Distances can be passed as either raw bp or
 7 | corrected for the number of regions (log10(obs/exp)), but this has
 8 | to be specified in the function parameters.}
 9 | \usage{
10 | plotNeighborDist(dcvec, correctedDist = FALSE, Nneighbors = FALSE)
11 | }
12 | \arguments{
13 | \item{dcvec}{A numeric vector or list of vectors containing distances 
14 | to upstream/downstream neighboring regions or to nearest neighbors. 
15 | Produced by \code{calcNeighborDist} or \code{calcNearestNeighbors}}
16 | 
17 | \item{correctedDist}{A logical indicating if the plot axis should
18 | be adjusted to show distances corrected for the number of regions
19 | in a regionset.}
20 | 
21 | \item{Nneighbors}{A logical indicating whether legend should be adjusted
22 | if Nearest neighbors are being plotted. Default legend shows distances
23 | to upstream/downstream neighbors.}
24 | }
25 | \value{
26 | A ggplot density object showing the distribution of
27 | raw or corrected distances.
28 | }
29 | \description{
30 | Plot the distances from regions to their upstream/downstream neighbors
31 | or nearest neighbors. Distances can be passed as either raw bp or
32 | corrected for the number of regions (log10(obs/exp)), but this has
33 | to be specified in the function parameters.
34 | }
35 | \examples{
36 | numVector = rnorm(400, mean=5, sd=0.1)
37 | d = plotNeighborDist(numVector)
38 | }
39 | 


--------------------------------------------------------------------------------
/man/plotPartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/partition-plots.R
 3 | \name{plotPartitions}
 4 | \alias{plotPartitions}
 5 | \title{Produces a barplot showing how query regions of interest are distributed
 6 | across a given partition list}
 7 | \usage{
 8 | plotPartitions(assignedPartitions, numbers = FALSE, stacked = FALSE)
 9 | }
10 | \arguments{
11 | \item{assignedPartitions}{A table holding the frequency of assignment to
12 | each of the partitions. Produced by \code{calcPartitions}}
13 | 
14 | \item{numbers}{logical indicating whether raw overlaps should be
15 | plotted instead of the default percentages}
16 | 
17 | \item{stacked}{logical indicating that data should be plotted as stacked
18 | bar plot}
19 | }
20 | \value{
21 | A ggplot object using a barplot to show the distribution
22 |     of the query
23 |  regions across a given partition list.
24 | }
25 | \description{
26 | This function can be used to test a GRanges object against any arbitrary
27 | list of genome partitions. The partition list is a priority-ordered list of
28 | GRanges objects. Each region in the query will be assigned to a given
29 | partition that it overlaps with the highest priority.
30 | }
31 | \examples{
32 | p = calcPartitionsRef(vistaEnhancers, "hg19")
33 | partPlot = plotPartitions(p)
34 | partCounts = plotPartitions(p, numbers=TRUE)
35 | partPlot = plotPartitions(p, stacked=TRUE)
36 | }
37 | 


--------------------------------------------------------------------------------
/man/plotQTHist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/qthist.R
 3 | \name{plotQTHist}
 4 | \alias{plotQTHist}
 5 | \title{Plot quantile-trimmed histogram}
 6 | \usage{
 7 | plotQTHist(
 8 |   x,
 9 |   EndBarColor = "gray57",
10 |   MiddleBarColor = "gray27",
11 |   quantThresh = NULL,
12 |   bins = NULL,
13 |   indep = FALSE,
14 |   numbers = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{x}{Data values to plot - vector or list of vectors}
19 | 
20 | \item{EndBarColor}{Color for the quantile bars on both ends of the graph
21 | (optional)}
22 | 
23 | \item{MiddleBarColor}{Color for the bars in the middle of the graph
24 | (optional)}
25 | 
26 | \item{quantThresh}{Quantile of data to be contained in each end bar (optional).
27 | quantThresh values must be under .2; optimal size is under .1}
28 | 
29 | \item{bins}{The number of bins for the histogram to allocate data to.
30 | (optional)}
31 | 
32 | \item{indep}{logical value which returns a list of plots that have had their
33 | bins calculated independently; the normal version will plot them on the 
34 | same x and y axis.}
35 | 
36 | \item{numbers}{a logical indicating whether the raw numbers should be 
37 | displayed, rather than percentages (optional).}
38 | }
39 | \value{
40 | A ggplot2 plot object
41 | }
42 | \description{
43 | Given the results from \code{calcWidth}, plots a histogram with 
44 | outliers trimmed.
45 | }
46 | \details{
47 | x-axis breaks for the frequency calculations are based on the "divisions" 
48 | results from helper function \code{calcDivisions}.
49 | }
50 | \examples{
51 | regWidths = calcWidth(vistaEnhancers)
52 | qtHist = plotQTHist(regWidths)
53 | qtHist2 = plotQTHist(regWidths, quantThresh=0.1)
54 | }
55 | 


--------------------------------------------------------------------------------
/man/plotSummarySignal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/specificity-plots.R
 3 | \name{plotSummarySignal}
 4 | \alias{plotSummarySignal}
 5 | \title{The function plotSummarySignal visualizes the signalSummaryMatrix obtained from
 6 | \code{calcSummarySignal}.}
 7 | \usage{
 8 | plotSummarySignal(
 9 |   signalSummaryList,
10 |   plotType = "barPlot",
11 |   metadata = NULL,
12 |   colorColumn = NULL,
13 |   filterGroupColumn = NULL,
14 |   filterGroup = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{signalSummaryList}{Output list from \code{calcSummarySignal} function.}
19 | 
20 | \item{plotType}{Options are: "jitter" - jitter plot with box plot on top,
21 | "boxPlot" - box plot without individual points and outliers,
22 | "barPlot" (default) - bar height represents the median signal value
23 | for a given cell type, 
24 | "violinPlot" - violin plot with medians.}
25 | 
26 | \item{metadata}{(optional) data.table used for grouping columns from 
27 | 'signalMatrix' into categories, that are then plotted with different colors. 
28 | Must contain variable 'colName' that contains all the condition column names 
29 | from 'signalMatrix'.}
30 | 
31 | \item{colorColumn}{(optional only if metadata provided) column name from 
32 | 'metadata' table that will be used as grouping variable for coloring.}
33 | 
34 | \item{filterGroupColumn}{(optional only if metadata provided and 
35 | 'filterGroup' specified) allows user to plot specified subgroups only. 
36 | String specifying the column name in 'metadata' from which groups will 
37 | be filtered (groups are specified in 'filterGroup')}
38 | 
39 | \item{filterGroup}{(optional only if 'metadata' and 'filterGroupColumn' 
40 | provided) - string (or vector of strings) of groups from 
41 | 'filterGroupColumn' to be plotted.}
42 | }
43 | \value{
44 | A ggplot object.
45 | }
46 | \description{
47 | The function plotSummarySignal visualizes the signalSummaryMatrix obtained from
48 | \code{calcSummarySignal}.
49 | }
50 | \examples{
51 | signalSummaryList = calcSummarySignal(vistaEnhancers, exampleOpenSignalMatrix_hg19)
52 | metadata = cellTypeMetadata
53 | plotSignal = plotSummarySignal(signalSummaryList)
54 | 
55 | plotSignalTissueColor = plotSummarySignal(signalSummaryList = signalSummaryList, 
56 | plotType = "jitter", metadata = metadata, colorColumn = "tissueType")
57 | 
58 | plotSignalFiltered = plotSummarySignal(signalSummaryList = signalSummaryList,
59 | plotType = "violinPlot", metadata = metadata, colorColumn = "tissueType", 
60 | filterGroupColumn = "tissueType", filterGroup = c("skin", "blood"))
61 | }
62 | 


--------------------------------------------------------------------------------
/man/retrieveFile.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/buildReferenceData.R
 3 | \name{retrieveFile}
 4 | \alias{retrieveFile}
 5 | \title{Read local or remote file}
 6 | \usage{
 7 | retrieveFile(source, destDir = NULL)
 8 | }
 9 | \arguments{
10 | \item{source}{a string that is either a path to a local or remote GTF}
11 | 
12 | \item{destDir}{a string that indicates the path to the directory where
13 | the downloaded GTF file should be stored. If not provided, 
14 | a temporary directory will be used.}
15 | }
16 | \value{
17 | data.frame retrieved file path
18 | }
19 | \description{
20 | Read local or remote file
21 | }
22 | \examples{
23 | CElegansGtfCropped = system.file("extdata", 
24 |                                  "C_elegans_cropped_example.gtf.gz", 
25 |                                  package="GenomicDistributions")
26 | CElegansGtf = retrieveFile(CElegansGtfCropped)
27 | }
28 | 


--------------------------------------------------------------------------------
/man/setB_100.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{setB_100}
 5 | \alias{setB_100}
 6 | \title{Example BED file read with rtracklayer::import}
 7 | \format{
 8 | GenomicRanges::GRanges
 9 | }
10 | \usage{
11 | data(setB_100)
12 | }
13 | \description{
14 | Example BED file read with rtracklayer::import
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/splitDataTable.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{splitDataTable}
 4 | \alias{splitDataTable}
 5 | \title{Efficiently split a data.table by a column in the table}
 6 | \usage{
 7 | splitDataTable(DT, split_factor)
 8 | }
 9 | \arguments{
10 | \item{DT}{Data.table to split}
11 | 
12 | \item{split_factor}{Column to split, which can be a character vector
13 | or an integer.}
14 | }
15 | \value{
16 | List of data.table objects, split by column
17 | }
18 | \description{
19 | Efficiently split a data.table by a column in the table
20 | }
21 | 


--------------------------------------------------------------------------------
/man/theme_blank_facet_label.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utility.R
 3 | \name{theme_blank_facet_label}
 4 | \alias{theme_blank_facet_label}
 5 | \title{Clear ggplot facet label.}
 6 | \usage{
 7 | theme_blank_facet_label()
 8 | }
 9 | \value{
10 | A ggplot theme
11 | }
12 | \description{
13 | Usually ggplot2 facets are labeled with boxes surrounding the label. This
14 | function removes the box, so it's a simple label for each facet.
15 | }
16 | 


--------------------------------------------------------------------------------
/man/vistaEnhancers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{vistaEnhancers}
 5 | \alias{vistaEnhancers}
 6 | \title{Example BED file read with rtracklayer::import}
 7 | \format{
 8 | GenomicRanges::GRanges
 9 | }
10 | \usage{
11 | data(vistaEnhancers)
12 | }
13 | \description{
14 | Example BED file read with rtracklayer::import
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library("GenomicDistributions")
3 | 
4 | test_check("GenomicDistributions")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/testChrom.R:
--------------------------------------------------------------------------------
 1 | # library(GenomicDistributions)
 2 | library(testthat)
 3 | library(data.table)
 4 | 
 5 | # data
 6 | query = vistaEnhancers
 7 | querySftd = GenomicRanges::shift(query, 100000)
 8 | queryList = GRangesList(q1=query, q2=querySftd)
 9 | 
10 | context("general")
11 | test_that("binRegion works with binSize and binCount", {
12 |     # Smoke test over every start/end pairing (starts 1 and 51, ends 1000 and 6000)
13 |     bounds = expand.grid(e=seq(1000, 10000, by=5000), s=seq(1, 100, by=50))
14 |     for(i in seq_len(nrow(bounds))){
15 |         expect_visible(binRegion(start=bounds$s[i], end=bounds$e[i], binSize=10))
16 |         expect_visible(binRegion(start=bounds$s[i], end=bounds$e[i], binCount=10))
17 |     }
18 | })
19 | 
20 | test_that("calcChromBinsRef works with list input", {
21 |     expect_visible(calcChromBinsRef(queryList, "hg19"))  # queryList: GRangesList of two region sets
22 | })
23 | 
24 | context("result")
25 | test_that("binRegion returns result of correct length", {
26 |     expect_equal(  # binSize=10 and binCount=10 must describe the same bins for 1-100
27 |         binRegion(start=1, end=100, binSize=10),
28 |         binRegion(start=1, end=100, binCount=10)
29 |     )
30 |     expect_length(binRegion(start=1, end=100, binSize=10), 5)  # length() of the result (column count if tabular)
31 |     expect_equal(NROW(binRegion(start=1, end=100, binSize=10)), 10)  # one row per bin
32 | })
33 | 
34 | test_that("calcChromBinsRef returns a proper object type, length and includes all the regions", {
35 |     result = calcChromBinsRef(query, "hg19")
36 |     expect_is(result, "data.table")
37 |     expect_length(result, 6)  # length() of a data.table is its column count
38 |     expect_equal(sum(result$N), length(query))  # bin counts must add up to the number of query regions
39 | })
40 | 


--------------------------------------------------------------------------------
/tests/testthat/testGCContent.R:
--------------------------------------------------------------------------------
 1 | # COMMENTED OUT DUE TO BSgenome.Hsapiens.UCSC.<genome>.masked PACKAGES 
 2 | # DEPENDENCIES, WHICH ARE NOT INCLUDED IN REQUIREMENTS DUE TO SIZE
 3 | 
 4 | # # lib
 5 | # library(testthat)
 6 | # 
 7 | # # data
 8 | # featureFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
 9 | # feats = rtracklayer::import(featureFile)
10 | # refs = c("hg38", "hg19")
11 | # 
12 | # # tests
13 | # context("general")
14 | # test_that("calcGCContent works", {
15 | #     for(r in refs){
16 | #         expect_visible(calcGCContentRef(feats, r))
17 | #     }
18 | # })
19 | # 
20 | # context("result")
21 | # test_that("calcGCContent yields results of proper length", {
22 | #     expect_equal(length(calcGCContentRef(feats, "hg19")), length(feats))
23 | # })
24 | # 
25 | # test_that("calcGCContent yields a numeric result", {
26 | #     expect_true(is.numeric(calcGCContentRef(feats, "hg19")))
27 | # })
28 | # 
29 | # test_that("calcGCContent yields a numeric in range 0-1", {
30 | #     x = calcGCContentRef(feats, "hg19")
31 | #     for(i in x){
32 | #         expect_gt(i, 0)    
33 | #         expect_lt(i, 1)
34 | #     }
35 | # })
36 | 


--------------------------------------------------------------------------------
/tests/testthat/testNeighborDist.R:
--------------------------------------------------------------------------------
 1 | # lib
 2 | library(data.table)
 3 | library(testthat)
 4 | library(GenomicDistributions)
 5 | 
 6 | # data
 7 | query = vistaEnhancers
 8 | querySftd = GenomicRanges::shift(query, 100000)
 9 | queryList = GRangesList(q1=query, q2=querySftd)
10 | 
11 | # tests
12 | context("general")
13 | test_that("calcNeighborDist works", {
14 |   for (i in seq_along(queryList)) expect_visible(calcNeighborDist(queryList[[i]]))  # each region set in turn
15 | })
16 | 
17 | context("result")
18 | test_that("calcNeighborDist returns a result of a proper class", {
19 |   expect_true(is(calcNeighborDist(query), "numeric"))  # single region set: numeric result expected
20 |   expect_true(is(calcNeighborDist(queryList), "list" ))  # GRangesList input: list result expected
21 | })
22 | 
23 | test_that("calcNeighborDist returns the same result for a shifted region set", {
24 |   expect_equal(calcNeighborDist(query), calcNeighborDist(querySftd))  # querySftd is query shifted by 100000
25 | })
26 | 
27 | # test_that("calcNeighborDist yields a numeric in range 0-10", {
28 | #     x = calcNeighborDist(query)
29 | #     for(i in x){
30 | #         expect_gt(i, 0)    
31 | #         expect_lt(i, 10)
32 | #     }
33 | #})
34 | 


--------------------------------------------------------------------------------
/tests/testthat/testOpenChromatin.R:
--------------------------------------------------------------------------------
 1 | # lib
 2 | library(data.table)
 3 | library(testthat)
 4 | library(GenomicDistributions)
 5 | # data
 6 | cellMatrix = exampleOpenSignalMatrix_hg19
 7 | query = vistaEnhancers
 8 | querySftd = GenomicRanges::shift(query, 100)
 9 | queryList = GRangesList(q1=query, q2=querySftd)
10 | 
11 | # tests
12 | context("general")
13 | test_that("calcSummarySignal works", {
14 |     expect_visible(calcSummarySignal(query, cellMatrix))
15 |     expect_visible(calcSummarySignal(querySftd, cellMatrix))  # querySftd is query shifted by 100
16 | })
17 | 
18 | test_that("calcSummarySignal works with multiple queries", {
19 |     expect_visible(calcSummarySignal(queryList, cellMatrix))  # queryList: GRangesList of two region sets
20 | })
21 | 
22 | context("result")
23 | test_that("calcSummarySignal returns a result of a proper class", {
24 |     expect_true(is(calcSummarySignal(query, cellMatrix), "list"))  # the result itself
25 |     expect_true(is(calcSummarySignal(query, cellMatrix)[[1]], "data.table"))  # its first element
26 |     expect_true(is(calcSummarySignal(query, cellMatrix)[[2]], "data.frame"))  # its second element
27 | })
28 | 
29 | test_that("calcSummarySignal returns different results for different queries", {
30 |     expect_false(identical(calcSummarySignal(query, cellMatrix)[[1]], 
31 |                            calcSummarySignal(querySftd, cellMatrix)[[1]]))  # only the first list element is compared
32 | })
33 | 
34 | test_that("calcSummarySignal combines results from multi-query runs", {
35 |     ql = GRangesList(q1=query, q2=query)  # same set twice, so twice the rows are expected
36 |     expect_equal(NROW(calcSummarySignal(query, cellMatrix)[[1]])*2, 
37 |                  NROW(calcSummarySignal(ql, cellMatrix)[[1]]))
38 |     expect_equal(NROW(calcSummarySignal(query, cellMatrix)[[2]])*2, 
39 |                  NROW(calcSummarySignal(ql, cellMatrix)[[2]]))
40 | })
41 | 
42 | 


--------------------------------------------------------------------------------
/tests/testthat/testPartitions.R:
--------------------------------------------------------------------------------
 1 | # lib
 2 | library(data.table)
 3 | library(testthat)
 4 | library(GenomicDistributions)
 5 | 
 6 | # data
 7 | query = vistaEnhancers
 8 | querySftd = GenomicRanges::shift(query, 100000)
 9 | queryList = GRangesList(q1=query, q2=querySftd)
10 | 
11 | # tests
12 | context("general")
13 | test_that("calcPartitionsRef works", {
14 |     for (i in seq_along(queryList)) expect_visible(calcPartitionsRef(queryList[[i]], "hg19"))  # each set in turn
15 | })
16 | 
17 | context("result")
18 | test_that("calcPartitionsRef returns a result of a proper class", {
19 |     expect_true(is(calcPartitionsRef(query, "hg19"), "data.frame"))  # single region set: data.frame expected
20 | })
21 | 
22 | test_that("calcPartitionsRef returns a result of a proper length", {
23 |     expect_length(calcPartitionsRef(query, "hg19"), 2)  # length() of a data.frame is its column count
24 |     expect_equal(NROW(calcPartitionsRef(query, "hg19")), 7)  # presumably one row per partition class
25 | })
26 | 
27 | test_that("calcPartitionsRef returns different results for different queries", {
28 |     expect_false(all(calcPartitionsRef(query, "hg19")$Freq == 
29 |                      calcPartitionsRef(querySftd, "hg19")$Freq))  # at least one Freq value must differ
30 | })
31 | 


--------------------------------------------------------------------------------
/tests/testthat/test_all.R:
--------------------------------------------------------------------------------
  1 | # Unit tests
  2 | library(GenomicDistributions)
  3 | 
  4 | context("Testthat context...")
  5 | 
  6 | #############################################################################
  7 | # Test data should be with toy examples you can work out by hand
  8 | # that way you can calculate by hand and compare to the output of the function
  9 | 
 10 | # toy data for testing functions
 11 | # if altered, tests relying on these objects will be disrupted
 12 | start1 = c(seq(from=1, to = 2001, by = 1000), 800)
 13 | start2 = c(seq(from=126, to = 2126, by = 1000), 100, 2500)
 14 | chrString1 = c(rep("chr1", 3), "chr2")
 15 | chrString2 = c(chrString1, "chr3")
 16 | 
 17 | origCoordDT1 = data.table(chr=chrString1,
 18 |                            start = start1,
 19 |                            end = start1 + 250)
 20 | origCoordDT2 = data.table(chr=chrString2,
 21 |                           start=start2,
 22 |                           end=start2+150)
 23 | coordDT1 = copy(origCoordDT1)
 24 | coordDT2 = copy(origCoordDT2)
 25 | 
 26 | testGR1 = dtToGr(coordDT1)
 27 | testGR2 = dtToGr(coordDT2)
 28 | testGR3 = GenomicRanges::shift(testGR2, 1000)
 29 | testGR4 = GenomicRanges::shift(testGR2, 2500)
 30 | testGR5 = GenomicRanges::shift(testGR2, 4000)
 31 | ###############################################################################
 32 | 
 33 | # test for calcOLCount
 34 | # reset test data in case it was changed by another unit test section
 35 | coordDT1 = copy(origCoordDT1)
 36 | coordDT2 = copy(origCoordDT2)
 37 | testGR1 = dtToGr(coordDT1)
 38 | testGR2 = dtToGr(coordDT2)
 39 | test_that("calcOLCount", {
 40 |     
 41 |     # uses midpoint coordinate of queryRegionDT; one two-region group per chromosome (chr1-chr3)
 42 |     testGRList = GRangesList(dtToGr(data.table(chr=c("chr1", "chr1"),
 43 |                                    start = c(1, 2001),
 44 |                                    end = c(2000, 4000))), 
 45 |                              dtToGr(data.table(chr=c("chr2", "chr2"),
 46 |                                                start = c(1, 2001),
 47 |                                                end = c(2000, 4000))),
 48 |                              dtToGr(data.table(chr=c("chr3", "chr3"),
 49 |                                                start = c(1, 2001),
 50 |                                                end = c(2000, 4000))))
 51 |     olCount1 = calcOLCount(queryRegionDT = coordDT2, regionsGRL = testGRList)
 52 |     expect_equal(olCount1$N, c(2, 1, 1, 1))  # chr1: midpoints 201, 1201 | 2201; chr2: 175; chr3: 2575
 53 |     expect_equal(olCount1$regionGroupID, c(1, 1, 2, 3))  # index of the GRangesList element that was hit
 54 |     
 55 |     # only expect one overlap: the chr2 query midpoint (175) lies inside chr2:170-180
 56 |     olCount2 = calcOLCount(coordDT2, dtToGr(data.table(chr=c("chr1", "chr1", "chr2"),
 57 |                                            start = c(1, 250, 170),
 58 |                                            end = c(150, 300, 180))))
 59 |     olCount2=as.data.frame(olCount2)
 60 |     expectedOut = data.frame(regionID=3, chr="chr2", start=170, end=180, withinGroupID=3, regionGroupID=1, N=1, stringsAsFactors = FALSE)
 61 |     expect_equal(olCount2, expectedOut)
 62 | })
 63 | 
 64 | 
 65 | 
 66 | 
 67 | # "featureDistanceDistribution" function is now named "calcFeatureDist"
 68 | # reset test data in case it was changed by another unit test section
 69 | # and select just one chromosome - since DTNearest is a helper function that
 70 | # calculates distances within one chromosome
 71 | coordDT1 = copy(origCoordDT1)
 72 | coordDT2 = copy(origCoordDT2)
 73 | testGR1 = dtToGr(coordDT1)
 74 | testGR2 = dtToGr(coordDT2)
 75 | test_that("featureDistribution",  {
 76 |     
 77 |     ############# old (superseded checks, kept commented out)
 78 |     # queryFile = system.file("extdata", "setB_100.bed.gz", package="GenomicDistributions")
 79 |     # query = rtracklayer::import(queryFile)
 80 |     # 
 81 |     # featureExample = GenomicRanges::shift(query, round(rnorm(length(query), 0,1000)))
 82 |     # fdd = featureDistanceDistribution(query, featureExample)
 83 |     # featureFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
 84 |     # feats = rtracklayer::import(featureFile)
 85 |     
 86 |     #' featureDistance = featureDistanceDistribution(query, feats)
 87 |     #' expect_equal(sum(is.na(featureDistance)), -3)
 88 |     #' expect_equal(sum(featureDistance, na.rm=TRUE), 743969)
 89 |     ############# old
 90 |     
 91 |     coordDT1$end[1] = 100  # first region becomes 1-100
 92 |     coordDT1$start[2] = 200  # second region becomes 200-400
 93 |     coordDT1$end[2] = 400
 94 |     testGR1 = dtToGr(coordDT1)
 95 |     # DTNearest
 96 |     # @param DT1 data.table Has start and end column
 97 |     # @param DT2 
 98 |     # @return numeric vector. Distance from region set to closest other region set.
 99 |     # Distance from the midpoint of each region to the midpoint.
100 |     nearestVec = DTNearest(coordDT1, coordDT2)
101 |     nearestVec  # bare expression, no effect here (presumably left over from interactive use)
102 |     expect_equal(nearestVec, c(124, -99, 276, 75))
103 |   
104 | 
105 |     # DTNearest ignores chromosome completely. By design.
106 |     # DTNearest shouldn't be used with data from different chromosomes.
107 |     # Splitting by chromosome is suggested when such a case arises (e.g. chr1 below).
108 |     DT1chrom1 = coordDT1[coordDT1$chr == "chr1"]
109 |     DT2chrom1 = coordDT2[coordDT2$chr == "chr1"]
110 |     nearestVec2C1 = DTNearest(DT2chrom1, DT1chrom1)
111 |     expect_equal(nearestVec2C1, c(99, -901, -75))
112 |     
113 |     featureDistance = calcFeatureDist(testGR1, testGR2)
114 |     featureDistance  # bare expression, no effect here
115 |     expect_equal(featureDistance, c(150, -99, 75, -750))
116 |     featureDistance2 = calcFeatureDist(testGR2, testGR1)
117 |     featureDistance2  # bare expression, no effect here
118 |     
119 |     expect_equal(featureDistance2, c( 99, -901, -75, 750, NA))  # NA: presumably the chr3 query, which has no chr3 feature
120 |     
121 |     # coordDT1$chr = "chr2"
122 |     # testGR1 = dtToGr(coordDT1)
123 |     # featureDistance = calcFeatureDist(testGR1, testGR2)
124 |     # featureDistance
125 |     # featureDistance2 = calcFeatureDist(testGR2, testGR1)
126 |     # featureDistance2
127 | 
128 |     
129 | })
130 | 
131 | #' queryDT = GenomicDistributions:::grToDt(query)
132 | #'        featureDT = GenomicDistributions:::grToDt(features)
133 | #'        queryDTs = GenomicDistributions:::splitDataTable(queryDT, "chr")
134 | #'        featureDTs = GenomicDistributions:::splitDataTable(featureDT, "chr")
135 | #'        as.vector(unlist(mapply(queryDTs, featureDTs[names(queryDTs)], FUN=DTNearest)))
136 | 
137 | 
138 | 
139 | test_that("Genome aggregate", {
140 |     queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
141 |     query = rtracklayer::import(queryFile)
142 |     # First, calculate the distribution; regexp = NA asserts the call raises no error:
143 |     expect_error(x <- aggregateOverGenomeBins(query, "hg19"), NA)
144 |     # Then, plot the result:
145 |     # plotGenomeAggregate(x)
146 | })
147 | 
148 | 
149 | # "genomicPartitions" function changed to "calcPartitionsRef"
150 | 
151 | test_that("Partitions", {
152 |     
153 |     ################### old
154 |     #queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions")
155 |     #query = rtracklayer::import(queryFile)
156 |     #gp = genomicPartitions(query, "hg38")
157 |     #gp = genomicPartitions(query, "hg19")
158 |     #gp = genomicPartitions(query, "mm10")
159 |     #gp = genomicPartitions(query, "mm9")
160 |     #plotPartitions(gp)
161 |     ################### old
162 |     
163 |     # Build the expected partition list by hand from the toy GRanges,
164 |     # then compare it with the output of genomePartitionList()
165 |     promCore = GenomicRanges::reduce(trim(promoters(testGR2, upstream=100, downstream=0)))
166 |     promProx = GenomicRanges::reduce(trim(promoters(testGR2, upstream=2000, downstream=0)))
167 |     promoterProx = GenomicRanges::setdiff(promProx, promCore)
168 |     
169 |     # remove any possible overlaps between classes (testGR4 kept whole, then testGR5, then testGR3)
170 |     testGR5 = GenomicRanges::setdiff(testGR5, testGR4)
171 |     testGR3 = GenomicRanges::setdiff(testGR3, testGR4)
172 |     testGR3 = GenomicRanges::setdiff(testGR3, testGR5)
173 |     
174 |     nonThree = GenomicRanges::setdiff(testGR2, testGR4)
175 |     nonThreeFive = GenomicRanges::setdiff(nonThree, testGR5)
176 |     intronGR = GenomicRanges::setdiff(nonThreeFive, testGR3)
177 |     
178 |     partList = list(promoterCore=GenomicRanges::reduce(trim(promoters(testGR2, upstream=100, downstream=0))),
179 |                     promoterProx=promoterProx, 
180 |                     threeUTR=testGR4, 
181 |                     fiveUTR=testGR5,
182 |                     exon=testGR3,
183 |                     intron=intronGR)
184 |   
185 |     gp = genomePartitionList(testGR2, testGR3, testGR4, testGR5)
186 |     expect_equal(gp, partList)
187 |     
188 |     # calcPartitions: give each testGR1 region the first partition in partList it overlaps
189 |     partition = rep(0, length(testGR1))
190 |     for (i in seq_along(partList)) {
191 |       ols = countOverlaps(testGR1[partition==0], partList[[i]])  # only regions not yet assigned
192 |       partition[partition==0][ols > 0] = names(partList)[[i]]
193 |     }
194 |     partition[partition=="0"] = "intergenic"  # whatever is still unassigned
195 |     testPartitions = data.frame(table(partition))
196 |     
197 |     testPartitionNames = c("promoterCore", "promoterProx", "threeUTR", "fiveUTR",
198 |                            "exon", "intron", "intergenic")
199 |     if (!all(testPartitionNames %in% testPartitions$partition)){  # add zero-count rows for absent classes
200 |       notIncluded = testPartitionNames[!(testPartitionNames %in% 
201 |                                            testPartitions$partition)]
202 |       addRows = data.frame(partition = notIncluded, 
203 |                            Freq = rep(0, length(notIncluded)))
204 |       testPartitions = rbind(testPartitions, addRows)
205 |     }
206 |     
207 |     Partitions = calcPartitions(testGR1, partList)
208 |     expect_equal(Partitions, testPartitions)
209 |     
210 | })
211 | 
212 | test_that("Neighbor distances", {
213 |   
214 |   # Regions of the first chromosome after sorting and splitting by chr
215 |   perChrom = splitDataTable(grToDt(sort(testGR1)), "chr")
216 |   firstChrom = perChrom[[1]]
217 |   # neighbordt() is checked against bp distances calculated by hand:
218 |   # both gaps between consecutive regions are 749
219 |   handComputed = c(749, 749)
220 |   expect_equal(neighbordt(firstChrom), handComputed)
221 |   
222 |   # calcNeighborDist() on the whole GRanges must give the same distances
223 |   wholeSetDistances = calcNeighborDist(testGR1)
224 |   expect_equal(wholeSetDistances, handComputed)
225 |   
226 | })
227 | 
228 | test_that("Nearest Neighbor distances", {
229 |   
230 |   testGR2dt = grToDt(sort(testGR2))
231 |   splitdt2 = splitDataTable(testGR2dt, "chr")
232 |   chromTest2 = splitdt2[[1]]
233 |   # Gaps between consecutive regions; a region's nearest neighbor is its smaller flanking gap
234 |   nearestDistancesExp = neighbordt(chromTest2)
235 |   up = nearestDistancesExp[-length(nearestDistancesExp)]  # every gap except the last
236 |   down = nearestDistancesExp[-1]  # every gap except the first
237 |   dt = data.table(i=up, j=down)
238 |   pairmins = dt[, pmin(i, j)]
239 |   nNeighbors = c(nearestDistancesExp[1], pairmins, 
240 |                  nearestDistancesExp[length(nearestDistancesExp)])
241 |   
242 |   # Calculated by hand c(849, 849, 849)
243 |   expect_equal(nNeighbors, rep(849, 3))
244 |   
245 |   # Compare distances from calcNearestNeighbors
246 |   nearestNeighborsTest = calcNearestNeighbors(testGR2)
247 |   expect_equal(nearestNeighborsTest, rep(849, 3))
248 |   
249 | })
250 | 
251 | 


--------------------------------------------------------------------------------
/vignettes/figures-full-power/GC-content-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/GC-content-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/TSS-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/TSS-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/TSS-plot-closeup-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/TSS-plot-closeup-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/chrom-bin-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/chrom-bin-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/cumulative-partitions-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/cumulative-partitions-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/custom-cumulative-partitions-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/custom-cumulative-partitions-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/custom-expected-partition-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/custom-expected-partition-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/custom-partition-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/custom-partition-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/dinuc-content-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/dinuc-content-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/expected-partition-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/expected-partition-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/gene-distance-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/gene-distance-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/neighbor-distance-distribution-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/neighbor-distance-distribution-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/open-signal-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/open-signal-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/partition-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/partition-plot-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/partition-plot-proportional-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/partition-plot-proportional-1.png


--------------------------------------------------------------------------------
/vignettes/figures-full-power/width-distribution-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/databio/GenomicDistributions/981f02c84a88e8ff73df432db1d11edfbb52f706/vignettes/figures-full-power/width-distribution-1.png


--------------------------------------------------------------------------------