├── data
    ├── BS.chr21.rda
    ├── dmrs.ex.rda
    └── annot.chr21.rda
├── inst
    ├── sticker
    │   ├── dmrseq.png
    │   └── sticker.R
    ├── scripts
    │   ├── get_dmrs.ex.R
    │   ├── get_annot.chr21.R
    │   └── get_BS.chr21.R
    ├── CITATION
    └── NEWS
├── .gitignore
├── man
    ├── meanDiff.Rd
    ├── dmrPlotAnnotations.Rd
    ├── annot.chr21.Rd
    ├── dmrs.ex.Rd
    ├── getAnnot.Rd
    ├── BS.chr21.Rd
    ├── plotEmpiricalDistribution.Rd
    ├── simDMRs.Rd
    ├── dmrseq.Rd
    └── plotDMRs.Rd
├── LICENSE
├── vignettes
    ├── dmrseqBib.bib
    └── dmrseq.Rmd
├── README.md
├── DESCRIPTION
├── NAMESPACE
└── R
    ├── meanDiff.R
    ├── getAnnot.R
    ├── plotEmpiricalDistribution.R
    ├── simDMRs.R
    ├── plotDMRs.R
    ├── internal_plotting_functions.R
    └── dmrseq.R


/data/BS.chr21.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/BS.chr21.rda


--------------------------------------------------------------------------------
/data/dmrs.ex.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/dmrs.ex.rda


--------------------------------------------------------------------------------
/data/annot.chr21.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/annot.chr21.rda


--------------------------------------------------------------------------------
/inst/sticker/dmrseq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/inst/sticker/dmrseq.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *.Rproj
6 | *.Rbuildignore
7 | vignettes/dmrseq_cache/*
8 | 


--------------------------------------------------------------------------------
/inst/sticker/sticker.R:
--------------------------------------------------------------------------------
 1 | imgurl <- "https://clipartion.com/wp-content/uploads/2015/11/bird-island-clipart-free-clip-art-images-830x301.png"
 2 | 
 3 | sticker(imgurl,
 4 |          package="dmrseq", 
 5 |          p_size=8, 
 6 |          s_x=1,
 7 |          s_y=1.2, 
 8 |          s_width=0.85, 
 9 |          s_height=1,
10 |          p_x = 0.95,
11 |          p_y = 0.6,
12 |          h_color = "gold2",
13 |          h_fill = "dodgerblue",
14 |          p_color = "gold2",
15 |          filename="./inst/sticker/dmrseq.png")


--------------------------------------------------------------------------------
/inst/scripts/get_dmrs.ex.R:
--------------------------------------------------------------------------------
 1 | ## The following script uses the dmrseq package to
 2 | ## constructs the dmrs.ex dataset,
 3 | ## an example of dmrseq output
 4 | ## included in the dmrseq package for example purposes
 5 | 
 6 | library(dmrseq)
 7 | 
 8 | # load example data 
 9 | data(BS.chr21)
10 | 
11 | # the covariate of interest is the "CellType" column of pData(BS.chr21)
12 | testCovariate <- "CellType"
13 | 
14 | # run dmrseq on a subset of the chromosome (20K CpGs)
15 | dmrs.ex <- dmrseq(bs=BS.chr21[240001:260000,],
16 |                   cutoff = 0.05,
17 |                   testCovariate=testCovariate)
18 | 
19 | save(dmrs.ex, file = "./data/dmrs.ex.rda")
20 | library(tools)
21 | resaveRdaFiles("./data/dmrs.ex.rda")
22 | 


--------------------------------------------------------------------------------
/inst/scripts/get_annot.chr21.R:
--------------------------------------------------------------------------------
 1 | ## The following script uses the dmrseq package to
 2 | ## constructs the annot.chr21 dataset,
 3 | ## an example of annotatr annotation to use in plotting
 4 | ## included in the dmrseq package for example purposes
 5 | 
 6 | library(dmrseq)
 7 | 
 8 | # load example data 
 9 | data(BS.chr21)
10 | 
11 | # get annotation information for hg19
12 | annot.chr21 <- getAnnot("hg19")
13 | 
14 | # only keep this information for chromosome 21 (for example dataset)
15 | annot.chr21[[1]] <- annot.chr21[[1]][seqnames(annot.chr21[[1]])=="chr21",]
16 | annot.chr21[[2]] <- annot.chr21[[2]][seqnames(annot.chr21[[2]])=="chr21",]
17 | 
18 | save(annot.chr21 , file = "./data/annot.chr21.rda")
19 | library(tools)
20 | resaveRdaFiles("./data/annot.chr21.rda")
21 | 


--------------------------------------------------------------------------------
/man/meanDiff.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/meanDiff.R
 3 | \name{meanDiff}
 4 | \alias{meanDiff}
 5 | \title{Function to compute raw mean methylation differences}
 6 | \usage{
 7 | meanDiff(bs, dmrs, testCovariate)
 8 | }
 9 | \arguments{
10 | \item{bs}{a \code{BSseq} object}
11 | 
12 | \item{dmrs}{a data.frame with one row per DMR. This can be in the format
13 | of \code{dmrseq} output, but at least should contain the indexStart and 
14 | indexEnd values of the regions of interest.}
15 | 
16 | \item{testCovariate}{a character indicating the covariate of interest in
17 | the \code{pData} slot of \code{bs}.}
18 | }
19 | \value{
20 | numeric vector of raw mean methylation differences.
21 | }
22 | \description{
23 | This function calculates raw mean methylation differences for the
24 | covariate of interest over a set of DMRs (or regions of interest), 
25 | assuming a simple two-group comparison.
26 | }
27 | \examples{
28 | 
29 | data(BS.chr21)
30 | data(dmrs.ex)
31 | rawDiff <- meanDiff(BS.chr21, dmrs=dmrs.ex, testCovariate="CellType")
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite dmrseq in publications, please use:")
 2 | 
 3 | citEntry(entry = "Article",
 4 |          title        = "Detection and accurate false discovery rate control of differentially methylated regions from whole genome bisulfite sequencing",
 5 |          author       = personList(as.person("Keegan Korthauer"),
 6 |                                    as.person("Sutirtha Chakraborty"),
 7 |                                    as.person("Yuval Benjamini"),
 8 |                                    as.person("Rafael A Irizarry")),
 9 |          journal      = "Biostatistics",
10 |          year         = "2018",
11 |          #volume       = "",
12 |          #number       = "",
13 |          pages        = "kxy007",
14 |          doi          = "https://doi.org/10.1093/biostatistics/kxy007",
15 |          textVersion = 
16 |          paste("Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R. A.", 
17 |                "Detection and accurate false discovery rate control of differentially methylated regions from whole genome bisulfite sequencing.",
18 | "Biostatistics kxy007, https://doi.org/10.1093/biostatistics/kxy007"))
19 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Keegan Korthauer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/man/dmrPlotAnnotations.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/internal_plotting_functions.R
 3 | \name{dmrPlotAnnotations}
 4 | \alias{dmrPlotAnnotations}
 5 | \title{Add annotations to DMR plots}
 6 | \usage{
 7 | dmrPlotAnnotations(gr, annoTrack)
 8 | }
 9 | \arguments{
10 | \item{gr}{a \code{GRanges} object that contains the DMRs to be
11 | plotted}
12 | 
13 | \item{annoTrack}{a \code{SimpleGRangesList} object with two elements.
14 | The first contains CpG category information in the first element (optional)
15 | coding gene sequence information in the second element (optional).
16 | At least one of these elements needs to be non-null in order for 
17 | any annotation to be plotted, but it is not necessary to contain
18 | both.}
19 | }
20 | \value{
21 | None
22 | }
23 | \description{
24 | Function to add visual representation of CpG categories 
25 | and/or coding 
26 | sequences to DMR plot
27 | }
28 | \details{
29 | An internal function that takes an annotation 
30 | \code{SimpleGRangesList}
31 | object that
32 | contains CpG category information in the first element (optional) and / or
33 | coding gene sequence information in the second element (optional). If neither
34 | of these are present, then nothing will be plotted.
35 | }
36 | 


--------------------------------------------------------------------------------
/man/annot.chr21.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper_functions.R
 3 | \docType{data}
 4 | \name{annot.chr21}
 5 | \alias{annot.chr21}
 6 | \title{annot.chr21: Annotation information for chromosome 21, hg38 genome}
 7 | \format{
 8 | a \code{GRangesList} object with two elements returned
 9 | by \code{\link{getAnnot}}. The first
10 | contains CpG category information in the first element (optional)
11 | coding gene sequence information in the second element (optional).
12 | At least one of these elements needs to be non-null in order for 
13 | any annotation to be plotted, but it is not necessary to contain
14 | both.
15 | }
16 | \source{
17 | Obtained from running
18 | \code{annoTrack} function and then subsetting the results to 
19 | only include chromosome 21. A script which executes these steps 
20 | and constructs the \code{annot.chr21} 
21 |  object may be found in \file{inst/scripts/get_annot.chr21.R}
22 | }
23 | \usage{
24 | data(annot.chr21)
25 | }
26 | \description{
27 | This is the annotation information returned from 
28 | \code{\link{getAnnot}}, subsetted for chromosome 21 for convenience
29 | in running the examples. The annotation is obtained using the 
30 | \code{annotatr} package.
31 | }
32 | \examples{
33 | data(annot.chr21)
34 | }
35 | \keyword{datasets}
36 | 


--------------------------------------------------------------------------------
/man/dmrs.ex.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper_functions.R
 3 | \docType{data}
 4 | \name{dmrs.ex}
 5 | \alias{dmrs.ex}
 6 | \title{dmrs.ex: Example results of DMRs}
 7 | \format{
 8 | a data.frame that contains the results of the inference. The
 9 |    data.frame contains one row for each candidate region, and 
10 |    10 columns, in the following order: 1. chr = 
11 |    chromosome, 2. start = 
12 |    start basepair position of the region, 3. end = end basepair position
13 |    of the region,
14 |    4. indexStart = the index of the region's first CpG, 
15 |    5. indexEnd = the index of the region's last CpG,
16 |    6. L = the number of CpGs contained in the region,
17 |    7. area = the sum of the smoothed beta values
18 |    8. beta = the coefficient value for the condition difference,
19 |    9. stat = the test statistic for the condition difference,
20 |    10. pval = the permutation p-value for the significance of the test
21 |    statistic, and 
22 |    11. qval = the q-value for the test statistic (adjustment
23 |    for multiple comparisons to control false discovery rate).
24 | }
25 | \source{
26 | Obtained from running the examples in \code{\link{dmrseq}}. 
27 | A script which executes these steps 
28 | and constructs the \code{dmrs.ex} 
29 |  object may be found in \file{inst/scripts/get_dmrs.ex.R}
30 | }
31 | \usage{
32 | data(dmrs.ex)
33 | }
34 | \description{
35 | Example output from \code{dmrseq} function run on the 
36 | example dataset \code{BS.chr21}.
37 | }
38 | \examples{
39 | data(dmrs.ex)
40 | }
41 | \keyword{datasets}
42 | 


--------------------------------------------------------------------------------
/man/getAnnot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getAnnot.R
 3 | \name{getAnnot}
 4 | \alias{getAnnot}
 5 | \title{Retrieve annotation information}
 6 | \usage{
 7 | getAnnot(genomeName)
 8 | }
 9 | \arguments{
10 | \item{genomeName}{a character object that indicates which organism is 
11 | under study. Use the function \code{builtin_genomes()} to see
12 | a character vector of available genome names to choose from (see 
13 | \code{annotatr} documentation for more details).}
14 | }
15 | \value{
16 | a \code{SimpleGRangesList} object with two elements returned
17 | by \code{\link{getAnnot}}. The first
18 | contains CpG category information in the first element (optional)
19 | coding gene sequence information in the second element (optional).
20 | At least one of these elements needs to be non-null in order for 
21 | any annotation to be plotted, but it is not necessary to contain
22 | both.
23 | }
24 | \description{
25 | Uses the \code{annotatr} package to retrieve annotation information (
26 | CpG category and gene coding sequences) for the \code{annoTrack} argument
27 | of \code{\link{plotDMRs}}. Allows for 5 
28 | re-tries if download fails (to allow for a spotty internet connection).
29 | }
30 | \details{
31 | Note that this package needs to attach the \code{annotatr} package, 
32 | and will
33 | return NULL if this cannot be done. You can still use the 
34 | \code{\link{plotDMRs}} function without this optional annotation step, 
35 | just by leaving the \code{annoTrack} argument as NULL.
36 | }
37 | \examples{
38 | 
39 | # get annotation information for hg19
40 | annoTrack <- getAnnot('hg19')
41 | 
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/vignettes/dmrseqBib.bib:
--------------------------------------------------------------------------------
 1 | @article{Hansen2012,
 2 |   author =	 {Hansen, Kasper D and Langmead, Benjamin and Irizarry, Rafael A},
 3 |   title =	 {{BSmooth: from whole genome bisulfite sequencing reads to differentially
 4 |                   methylated regions}},
 5 |   journal =	 {Genome Biology},
 6 |   year =	 {2012},
 7 |   volume =	 {13},
 8 |   number =	 {10},
 9 |   pages =	 {R83},
10 |   doi =		 {10.1186/gb-2012-13-10-r83},
11 |   pubmed =	 {23034175}
12 | }
13 | 
14 | @article{Lister2009,
15 |   author =	 {Lister, Ryan and Pelizzola, Mattia and Dowen, Robert H and Hawkins, R David and
16 |                   Hon, Gary C and Tonti-Filippini, Julian and Nery, Joseph R and Lee, Leonard and
17 |                   Ye, Zhen and Ngo, Que-Minh and Edsall, Lee and Antosiewicz-Bourget, Jessica and
18 |                   Stewart, Ron and Ruotti, Victor and Millar, A Harvey and Thomson, James A and Ren,
19 |                   Bing and Ecker, Joseph R},
20 |   title =	 {{Human DNA methylomes at base resolution show widespread epigenomic differences}},
21 |   journal =	 {Nature},
22 |   year =	 {2009},
23 |   volume =	 {462},
24 |   number =	 {7271},
25 |   pages =	 {315--322},
26 |   doi =		 {10.1038/nature08514},
27 |   pubmed =	 {19829295}
28 | }
29 | 
30 | @article {Korthauer183210,
31 | 	author = {Korthauer, Keegan and Chakraborty, Sutirtha and Benjamini, Yuval and Irizarry, Rafael A.},
32 | 	title = {Detection and accurate False Discovery Rate control of differentially methylated regions from Whole Genome Bisulfite Sequencing},
33 | 	year = {2018},
34 | 	doi = {10.1101/183210},
35 | 	journal = {Biostatistics}
36 | }
37 | 
38 | 
39 | 
40 | % Local Variables:
41 | % eval: (setq fill-column 100)
42 | % End:
43 | 


--------------------------------------------------------------------------------
/man/BS.chr21.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/helper_functions.R
 3 | \docType{data}
 4 | \name{BS.chr21}
 5 | \alias{BS.chr21}
 6 | \title{BS.chr21: Whole-genome bisulfite sequencing for chromosome 21 
 7 | from Lister et al.}
 8 | \format{
 9 | An object of class \code{BSseq}.
10 | }
11 | \source{
12 | Obtained from 
13 | \url{http://neomorph.salk.edu/human_methylome/data.html} specifically,
14 |  the files \url{mc_h1_r1.tar.gz}, \url{mc_h1_r2.tar.gz}, 
15 |  \url{mc_imr90_r1.tar.gz}, \url{mc_imr90_r2.tar.gz}
16 |  A script which downloads these files and constructs the \code{BS.chr21} 
17 |  object may be found in \file{inst/scripts/get_BS.chr21.R} - this was 
18 |  based off of and modified from the get_BS.chr22.R script in the 
19 |  \code{bsseq} package. The object constructed here contains a 
20 |  different chromosome (22 instead of 21), and two additional samples
21 |  (h1 and imr90 instead of just imr90) to enable identification of
22 |  cell type-DMRs for examples.
23 | }
24 | \usage{
25 | data(BS.chr21)
26 | }
27 | \description{
28 | This dataset represents chromosome 21 
29 | from the IMR90 and H1 cell lines sequenced in Lister et al.  
30 | Only CpG methylation are included. The two samples from 
31 | each cell line are two different extractions (ie. technical replicates), 
32 | and are pooled in the analysis in the original paper.
33 | }
34 | \details{
35 | All coordinates are in hg18.
36 | }
37 | \examples{
38 |   data(BS.chr21)
39 |   BS.chr21
40 | }
41 | \references{
42 | R Lister et al. \emph{Human DNA methylomes at base 
43 |  resolution show widespread epigenomic differences}. Nature (2009) 462,
44 |   315-322.
45 | }
46 | \keyword{datasets}
47 | 


--------------------------------------------------------------------------------
/inst/NEWS:
--------------------------------------------------------------------------------
 1 | Changes in Version 1.3.1 (2018-11-02)
 2 | _____________________________________
 3 | 
 4 | o dmrseq now requires that input BSseq objects be ordered (as the BSseq() 
 5 |   constructor no longer automatically orders loci).
 6 | 
 7 | Changes in Version 1.1.20 (2018-10-11)
 8 | _____________________________________
 9 | 
10 | o Minor bug fix for a rare situation that occured in the case of a multi-level 
11 |   covariate of interest if the region level model fitting procedure did not 
12 |   converge.
13 | 
14 | Changes in Version 1.1.2 (2018-05-09)
15 | _____________________________________
16 | 
17 | o The newly added `chrsPerChunk` argument specifies the number of chromosomes
18 |   to compute at a time (default is 1). 
19 | 
20 | Changes in Version 0.99.11 (2018-04-05)
21 | ______________________________________
22 | 
23 | o dmrseq now provides support for detecting large-scale methylation blocks. To
24 |   use this feature, specify `block=TRUE`, and increase the smoothing span 
25 |   parameters `minInSpan`, `bpSpan`, and `maxGapSmooth`. More details are 
26 |   provided in the documentation and vignette.
27 | 
28 | Changes in Version 0.99.8 (2018-03-21)
29 | ______________________________________
30 | 
31 | o dmrseq no longer requires balanced, two-group comparisons. To run using a 
32 |   continuous or categorial covariate with more than two groups, simply pass in
33 |   the name of a column in `pData` that contains this covariate. A continuous
34 |   covariate is assmued if the data type in the `testCovariate` slot is 
35 |   continuous, with the exception of if there are only two unique values (then a
36 |   two group comparison is carried out).
37 | 
38 | Version 0.99.6 (2018-03-02)
39 | ___________________________
40 | 
41 | o dmrseq is now available on Bioconductor devel (3.7) !
42 | 
43 | 


--------------------------------------------------------------------------------
/man/plotEmpiricalDistribution.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotEmpiricalDistribution.R
 3 | \name{plotEmpiricalDistribution}
 4 | \alias{plotEmpiricalDistribution}
 5 | \title{Plot the empirical distribution of the methylation beta vals or coverage}
 6 | \usage{
 7 | plotEmpiricalDistribution(
 8 |   bs,
 9 |   testCovariate = NULL,
10 |   bySample = FALSE,
11 |   type = "M",
12 |   adj = 2.5
13 | )
14 | }
15 | \arguments{
16 | \item{bs}{a BSseq object}
17 | 
18 | \item{testCovariate}{character specifying the column name of the
19 | \code{pData} slot of the BSseq object to include in the plot legend.}
20 | 
21 | \item{bySample}{logical whether to plot a separate line for each sample,
22 | even if the grouping \code{testCovariate} is specified.
23 | Default value is FALSE (so samples with the same value of 
24 | \code{testCovariate} will be collapsed into the same line). If 
25 | \code{testCovariate} is not specified, this parameter does not have an 
26 | effect and samples are automatically plotted separately.}
27 | 
28 | \item{type}{a character indicating which type of density to plot - the 
29 | methylation (beta) values ("M") or the coverage ("Cov"). Default is "M".}
30 | 
31 | \item{adj}{a numeric value for the \code{adjust} parameter to pass to the 
32 | \code{geom_line} function. Specifies how smooth the make the function.}
33 | }
34 | \value{
35 | a ggplot object
36 | }
37 | \description{
38 | Uses ggplot2 to plot smoothed density histograms of methylation
39 | proportions (beta values), or coverage. Methylation proportion densities
40 | are weighted by coverage.
41 | The number of curves plotted 
42 | will be equal to the number of different values of \code{testCovariate},
43 | unless \code{bySample} is TRUE. This can take quite some time to 
44 | execute for a large object, so it is recommended to first take a random
45 | sample of loci (say one million) before plotting.
46 | }
47 | \examples{
48 | 
49 | data(BS.chr21)
50 | 
51 | # plot beta values by sample group
52 | plotEmpiricalDistribution(BS.chr21, testCovariate="CellType")
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # dmrseq: Inference for differentially methylated regions (DMRs) from bisulfite sequencing
 2 | 
 3 | A central question in the analysis of bisulfite sequencing data 
 4 | is to detect regions (collections of 
 5 | neighboring CpGs) with systematic differences between conditions, 
 6 | as compared to within-condition variability. These so-called *Differentially
 7 | Methylated Regions* (DMRs) are thought to be more informative than single CpGs 
 8 | in terms of of biological function. 
 9 | 
10 | <p align="center">
11 |   <img src="/inst/sticker/dmrseq.png" height="300"/>
12 | </p>
13 | 
14 | The package **dmrseq** 
15 | provides a rigorous permutation-based approach to
16 | detect and perform inference for differential methylation by use of 
17 | generalized least squares models that account for inter-individual and 
18 | inter-CpG variability to generate region-level statistics that can be
19 | comparable across the genome. The framework performs well even
20 | on samples as small as two per group. 
21 | 
22 | ## Installation
23 | 
24 | **dmrseq** is available on 
25 | [Bioconductor](https://bioconductor.org/packages/dmrseq). You can install
26 | it with R version 3.5.0 or higher with the following commands:
27 | 
28 | ```
29 | install.packages("BiocManager")
30 | BiocManager::install("dmrseq")
31 | ```
32 | 
33 | ## Getting started
34 | 
35 | See the vignette for information on how to use the package to perform
36 | typical methylation analysis workflows.
37 | 
38 | ## Learn more
39 | 
40 | More details of the **dmrseq** framework can be found in the manuscript
41 | 
42 | > Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R.A.
43 | > Detection and accurate False Discovery Rate control of differentially 
44 | methylated regions from Whole Genome Bisulfite Sequencing
45 | > *Biostatistics*, 2018 (in press).
46 | > [BioRxiv:10.1101/183210](http://www.biorxiv.org/content/early/2017/08/31/183210)
47 | 
48 | 
49 | ## License/Copyright
50 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 
51 | This package is made available under an MIT license.  
52 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: dmrseq
 2 | Type: Package
 3 | Title: Detection and inference of differentially methylated regions 
 4 |   from Whole Genome Bisulfite Sequencing 
 5 | Version: 1.29.1
 6 | Authors@R: c(person("Keegan", "Korthauer", role = c("cre", "aut"), 
 7 |                     email = "keegan@stat.ubc.ca", 
 8 |                     comment = c(ORCID = "0000-0002-4565-1654")),
 9 |              person("Rafael", "Irizarry", role = "aut",
10 |                     email = "rafa@jimmy.harvard.edu",
11 |                     comment = c(ORCID = "0000-0002-3944-4309")),
12 |              person("Yuval", "Benjamini", role = "aut"),
13 |              person("Sutirtha", "Chakraborty", role = "aut"))
14 | Description: This package implements an approach for scanning the
15 |   genome to detect and perform accurate inference on differentially 
16 |   methylated regions from Whole Genome Bisulfite Sequencing data. 
17 |   The method is based on comparing detected regions 
18 |   to a pooled null distribution, that can be implemented even when as 
19 |   few as two samples per population are available. 
20 |   Region-level statistics are obtained by fitting a generalized 
21 |   least squares (GLS) regression model with a nested autoregressive  
22 |   correlated error structure for the effect of interest on transformed  
23 |   methylation proportions. 
24 | License: MIT + file LICENSE
25 | Depends: R (>= 3.5), bsseq
26 | Imports: GenomicRanges, 
27 |   nlme, 
28 |   ggplot2, 
29 |   S4Vectors,
30 |   RColorBrewer, 
31 |   bumphunter, 
32 |   DelayedMatrixStats (>= 1.1.13),
33 |   matrixStats,
34 |   BiocParallel,
35 |   outliers, 
36 |   methods, 
37 |   locfit, 
38 |   IRanges, 
39 |   grDevices, 
40 |   graphics, 
41 |   stats, 
42 |   utils,
43 |   annotatr, 
44 |   AnnotationHub, 
45 |   rtracklayer, 
46 |   GenomeInfoDb, 
47 |   splines
48 | Suggests: knitr, 
49 |   rmarkdown, 
50 |   BiocStyle,
51 |   TxDb.Hsapiens.UCSC.hg19.knownGene,
52 |   org.Hs.eg.db
53 | biocViews: ImmunoOncology, DNAMethylation, Epigenetics, MultipleComparison, Software,
54 |   Sequencing, DifferentialMethylation, WholeGenome, Regression, 
55 |   FunctionalGenomics
56 | LazyData: true
57 | VignetteBuilder: knitr
58 | RoxygenNote: 7.2.3
59 | 


--------------------------------------------------------------------------------
/man/simDMRs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simDMRs.R
 3 | \name{simDMRs}
 4 | \alias{simDMRs}
 5 | \title{Simulate Differentially Methylated Regions}
 6 | \usage{
 7 | simDMRs(bs, num.dmrs = 3000, delta.max0 = 0.3)
 8 | }
 9 | \arguments{
10 | \item{bs}{a BSseq object containing only control samples (from the same
11 | population) for which simulated DMRs will be added after dividing the 
12 | population into two artificial groups.}
13 | 
14 | \item{num.dmrs}{an integer specifying how many DMRs to add.}
15 | 
16 | \item{delta.max0}{a proportion value indicating the mode value for the
17 | difference in proportion of methylated CpGs in the simulated DMRs (the
18 | actual value will be drawn from a scaled Beta distribution centered at 
19 | this value). Default value is 0.3.}
20 | }
21 | \value{
22 | A named list object with 5 elements: (1) 
23 | \code{gr.dmrs} is a \code{GRanges} object with \code{num.dmrs}
24 | ranges that represent the random DMRs added. (2) \code{dmr.mncov} is a 
25 | numeric vector that contains the mean coverage in each simulated DMR. (3)
26 | \code{dmr.L} is a numeric vector that contains the number of CpGs in each 
27 | simulated DMR. (4) \code{bs} is the BSseq object that contains the 
28 | simulated DMRs. (5) \code{deltas} is a numeric vector that contains the 
29 | effect size used for each DMR.
30 | }
31 | \description{
32 | Add simulated DMRs to observed control data. Control data will be split
33 | into two (artificial) populations.
34 | }
35 | \examples{
36 | 
37 | # Add simulated DMRs to a BSseq dataset
38 | # This is just for illustrative purposes - ideally you would
39 | # add DMRs to a set of samples from the same condition (in our
40 | # example data, we have data from two different cell types)
41 | # In this case, we shuffle the samples by cell type to create
42 | # a null comparison.
43 | 
44 | data(BS.chr21)
45 | 
46 | BS.chr21.sim <- simDMRs(bs=BS.chr21[1:10000,c(1,3,2,4)], 
47 |                         num.dmrs=50)
48 | 
49 | # show the simulated DMRs GRanges object
50 | show(BS.chr21.sim$gr.dmrs)
51 | 
52 | # show the updated BSseq object that includes the simulated DMRs
53 | show(BS.chr21.sim$bs)
54 | 
55 | # examine effect sizes of the DMRs
56 | head(BS.chr21.sim$delta)
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(dmrseq)
 4 | export(getAnnot)
 5 | export(meanDiff)
 6 | export(plotDMRs)
 7 | export(plotEmpiricalDistribution)
 8 | export(simDMRs)
 9 | import(GenomicRanges)
10 | import(S4Vectors)
11 | import(annotatr)
12 | import(bsseq)
13 | import(ggplot2)
14 | import(nlme)
15 | importClassesFrom(bsseq,BSseq)
16 | importFrom(AnnotationHub,AnnotationHub)
17 | importFrom(AnnotationHub,query)
18 | importFrom(BiocParallel,MulticoreParam)
19 | importFrom(BiocParallel,bplapply)
20 | importFrom(BiocParallel,bpparam)
21 | importFrom(BiocParallel,register)
22 | importFrom(DelayedMatrixStats,colMedians)
23 | importFrom(DelayedMatrixStats,rowDiffs)
24 | importFrom(DelayedMatrixStats,rowMads)
25 | importFrom(DelayedMatrixStats,rowMeans2)
26 | importFrom(DelayedMatrixStats,rowSums2)
27 | importFrom(GenomeInfoDb,genome)
28 | importFrom(IRanges,IRanges)
29 | importFrom(RColorBrewer,brewer.pal)
30 | importFrom(bumphunter,clusterMaker)
31 | importFrom(bumphunter,getSegments)
32 | importFrom(grDevices,col2rgb)
33 | importFrom(grDevices,colorRampPalette)
34 | importFrom(grDevices,dev.off)
35 | importFrom(grDevices,hcl)
36 | importFrom(grDevices,pdf)
37 | importFrom(grDevices,rainbow)
38 | importFrom(grDevices,rgb)
39 | importFrom(graphics,arrows)
40 | importFrom(graphics,axis)
41 | importFrom(graphics,layout)
42 | importFrom(graphics,legend)
43 | importFrom(graphics,lines)
44 | importFrom(graphics,mtext)
45 | importFrom(graphics,par)
46 | importFrom(graphics,plot)
47 | importFrom(graphics,points)
48 | importFrom(graphics,rect)
49 | importFrom(graphics,rug)
50 | importFrom(graphics,text)
51 | importFrom(locfit,locfit)
52 | importFrom(locfit,lp)
53 | importFrom(matrixStats,rowRanges)
54 | importFrom(methods,is)
55 | importFrom(outliers,grubbs.test)
56 | importFrom(rtracklayer,liftOver)
57 | importFrom(splines,ns)
58 | importFrom(stats,anova)
59 | importFrom(stats,approxfun)
60 | importFrom(stats,as.formula)
61 | importFrom(stats,formula)
62 | importFrom(stats,lm)
63 | importFrom(stats,loess)
64 | importFrom(stats,median)
65 | importFrom(stats,model.matrix)
66 | importFrom(stats,p.adjust)
67 | importFrom(stats,predict)
68 | importFrom(stats,preplot)
69 | importFrom(stats,qt)
70 | importFrom(stats,quantile)
71 | importFrom(stats,rbeta)
72 | importFrom(stats,rbinom)
73 | importFrom(stats,runif)
74 | importFrom(utils,combn)
75 | importMethodsFrom(bsseq,pData)
76 | importMethodsFrom(bsseq,sampleNames)
77 | importMethodsFrom(bsseq,seqnames)
78 | importMethodsFrom(bsseq,start)
79 | importMethodsFrom(bsseq,width)
80 | 


--------------------------------------------------------------------------------
/R/meanDiff.R:
--------------------------------------------------------------------------------
 1 | #' Function to compute raw mean methylation differences
 2 | #' 
 3 | #' This function calculates raw mean methylation differences for the
 4 | #' covariate of interest over a set of DMRs (or regions of interest), 
 5 | #' assuming a simple two-group comparison.
 6 | #' 
 7 | #' @param bs a \code{BSseq} object
 8 | #' @param dmrs a data.frame with one row per DMR. This can be in the format
 9 | #' of \code{dmrseq} output, but at least should contain the indexStart and 
10 | #' indexEnd values of the regions of interest.
11 | #' @param testCovariate a character indicating the covariate of interest in
12 | #' the \code{pData} slot of \code{bs}.
13 | #' 
14 | #' @return numeric vector of raw mean methylation differences.
15 | #' 
16 | #' @importFrom DelayedMatrixStats rowMeans2
17 | #' 
18 | #' @export
19 | #' 
20 | #' @examples
21 | #' 
22 | #' data(BS.chr21)
23 | #' data(dmrs.ex)
24 | #' rawDiff <- meanDiff(BS.chr21, dmrs=dmrs.ex, testCovariate="CellType")
25 | #' 
26 | meanDiff <- function(bs, dmrs, testCovariate) {
27 |   # convert covariates to column numbers if characters
28 |   if (is.character(testCovariate)) {
29 |     testCovariate <- which(colnames(pData(bs)) == testCovariate)
30 |     if (length(testCovariate) == 0) {
31 |       stop("testCovariate not found in pData(). ", 
32 |            "Please specify a valid testCovariate")
33 |     }
34 |   }
35 |   
36 |   coeff <- seq(2,(2 + length(testCovariate) - 1))
37 |   testCov <- pData(bs)[, testCovariate]
38 |   if (length(unique(testCov)) == 1) {
39 |     message("Warning: only one unique value of the specified ", 
40 |             "covariate of interest.  Assuming null comparison and ", 
41 |             "splitting sample group into two equal groups")
42 |     testCov <- rep(1, length(testCov))
43 |     testCov[seq_len(round(length(testCov)/2))] <- 0
44 |   }
45 | 
46 |   design <- model.matrix(~testCov)
47 |   colnames(design)[coeff] <- colnames(pData(bs))[testCovariate]
48 |   
49 |   if (length(unique(design[, coeff])) != 2) {
50 |     message("Not a two-group comparison. Can't compute simple mean ",
51 |             "methylation differences. ", 
52 |             "Returning beta estimates instead")
53 |     return(dmrs$beta)
54 |   } else {
55 |     prop.mat <- getCoverage(bs, type = "M") / 
56 |                 getCoverage(bs, type = "Cov")
57 |     levs <- unique(design[, coeff])
58 |     
59 |     indexRanges <- IRanges(start(dmrs$index), end(dmrs$index))
60 |     prop.mat.dmr <- extractROWS(prop.mat, indexRanges)
61 |     prop.mat1.means <- DelayedMatrixStats::rowMeans2(prop.mat.dmr[,
62 |                                  design[, coeff] == levs[which.min(levs)]],
63 |                                  na.rm=TRUE)
64 |     prop.mat2.means <- DelayedMatrixStats::rowMeans2(prop.mat.dmr[,
65 |                                  design[, coeff] == levs[which.max(levs)]],
66 |                                  na.rm=TRUE)
67 |     
68 |     meanDiff <- IRanges::mean(IRanges::relist(prop.mat2.means - prop.mat1.means,
69 |                                           indexRanges), na.rm=TRUE)
70 |     
71 |     return(meanDiff)
72 |   }
73 | }
74 | 


--------------------------------------------------------------------------------
/inst/scripts/get_BS.chr21.R:
--------------------------------------------------------------------------------
 1 | ## The following script downloads and constructs the BS.chr21 dataset,
 2 | ## included in the dmrseq package
 3 | ## This script was modified from bsseq - includes two additional samples
 4 | ## compared to the example data in bsseq
 5 | 
 6 | library(dmrseq)
 7 | 
 8 | ## First we download.  Each file is slightly less than 200 MB
 9 | # you may need to change the 'method' in 'download.file' to suit
10 | # the utilities available on your OS
11 | 
12 | # cell type imr90, replicates 1 and 2
13 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_imr90_r1.tar.gz",
14 |               destfile = "mc_imr90_r1.tar.gz", method='curl')
15 | untar("mc_imr90_r1.tar.gz",  "mc_imr90_r1/mc_imr90_r1_21", compressed = TRUE)
16 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_imr90_r2.tar.gz",
17 |               destfile = "mc_imr90_r2.tar.gz", method='curl')
18 | untar("mc_imr90_r2.tar.gz",  "mc_imr90_r2/mc_imr90_r2_21", compressed = TRUE)
19 | 
20 | # cell type h1, replicates 1 and 2
21 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_h1_r1.tar.gz",
22 |               destfile = "mc_h1_r1.tar.gz", method='curl')
23 | untar("mc_h1_r1.tar.gz",  "mc_h1_r1/mc_h1_r1_21", compressed = TRUE)
24 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_h1_r2.tar.gz",
25 |               destfile = "mc_h1_r2.tar.gz", method='curl')
26 | untar("mc_h1_r2.tar.gz",  "mc_h1_r2/mc_h1_r2_21", compressed = TRUE)
27 | 
28 | 
29 | ## Now the workhorse function
30 | 
31 | read.lister <- function(file) {
32 |   dat <- read.table(file, skip = 1, row.names = NULL,
33 |                     col.names = c("chr", "pos", "strand", "context", "M", "Cov"),
34 |                     colClasses = c("character", "integer", "character",
35 |                                    "character", "integer", "integer"))
36 |   ## we remove all non-CpG calls.  This includes SNPs
37 |   dat <- dat[dat$context == "CG",]
38 |   dat$context <- NULL
39 |   dat$chr <- paste("chr", dat$chr, sep = "")
40 |   ## Now we need to handle that the data has separate lines for each strand
41 |   ## We join these
42 |   tmp <- dat[dat$strand == "+",]
43 |   BS.forward <- BSseq(pos = tmp$pos, chr = tmp$chr, 
44 |                       M = as.matrix(tmp$M, ncol = 1),
45 |                       Cov = as.matrix(tmp$Cov, ncol = 1), 
46 |                       sampleNames = "forward")
47 |   tmp <- dat[dat$strand == "-",]
48 |   BS.reverse <- BSseq(pos = tmp$pos - 1L, chr = tmp$chr, 
49 |                       M = as.matrix(tmp$M, ncol = 1),
50 |                       Cov = as.matrix(tmp$Cov, ncol = 1), 
51 |                       sampleNames = "reverse")
52 |   BS <- combine(BS.forward, BS.reverse)
53 |   BS <- collapseBSseq(BS, columns = c("a", "a"))
54 |   BS
55 | }
56 | 
57 | BS.imr90.r1 <- read.lister("mc_imr90_r1/mc_imr90_r1_21")
58 | sampleNames(BS.imr90.r1) <- "imr90.r1"
59 | BS.imr90.r2 <- read.lister("mc_imr90_r2/mc_imr90_r2_21")
60 | sampleNames(BS.imr90.r2) <- "imr90.r2"
61 | 
62 | BS.h1.r1 <- read.lister("mc_h1_r1/mc_h1_r1_21")
63 | sampleNames(BS.h1.r1) <- "h1.r1"
64 | BS.h1.r2 <- read.lister("mc_h1_r2/mc_h1_r2_21")
65 | sampleNames(BS.h1.r2) <- "h1.r2"
66 | 
67 | BS.chr21 <- combine(BS.imr90.r1, BS.imr90.r2, BS.h1.r1, BS.h1.r2)
68 | pData(BS.chr21)$CellType <- c(rep("imr90",2), rep("h1",2))
69 | pData(BS.chr21)$Rep <- rep(c("replicate1", "replicate2"),2)
70 | validObject(BS.chr21)
71 | pData(BS.chr21)
72 | 
73 | #remove any loci which have no coverage in one or more samples
74 | meth.levels.raw = getMeth(BS.chr21, type = "raw")
75 | no.hits = which(is.na(rowMeans(meth.levels.raw)) == TRUE)
76 | BS.chr21 = BS.chr21[-no.hits]
77 | 
78 | save(BS.chr21, file = "./data/BS.chr21.rda")
79 | library(tools)
80 | resaveRdaFiles("./data/BS.chr21.rda")
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/R/getAnnot.R:
--------------------------------------------------------------------------------
  1 | #' Retrieve annotation information 
  2 | #' 
  3 | #' Uses the \code{annotatr} package to retrieve annotation information (
  4 | #' CpG category and gene coding sequences) for the \code{annoTrack} argument
  5 | #' of \code{\link{plotDMRs}}. Allows for 5 
  6 | #' re-tries if download fails (to allow for a spotty internet connection).
  7 | #' 
  8 | #' @details Note that this package needs to attach the \code{annotatr} package, 
  9 | #' and will
 10 | #' return NULL if this cannot be done. You can still use the 
 11 | #' \code{\link{plotDMRs}} function without this optional annotation step, 
 12 | #' just by leaving the \code{annoTrack} argument as NULL.
 13 | #' 
 14 | #' @param genomeName a character object that indicates which organism is 
 15 | #' under study. Use the function \code{builtin_genomes()} to see
 16 | #' a character vector of available genome names to choose from (see 
 17 | #' \code{annotatr} documentation for more details).
 18 | #' 
 19 | #' @return a \code{SimpleGRangesList} object with two elements returned
 20 | #' by \code{\link{getAnnot}}. The first
 21 | #' contains CpG category information in the first element (optional)
 22 | #' coding gene sequence information in the second element (optional).
 23 | #' At least one of these elements needs to be non-null in order for 
 24 | #' any annotation to be plotted, but it is not necessary to contain
 25 | #' both.
 26 | #' 
 27 | #' @export
 28 | #' 
 29 | #' @import annotatr
 30 | #' @importFrom AnnotationHub AnnotationHub query
 31 | #' @importFrom rtracklayer liftOver
 32 | #' @importFrom GenomeInfoDb genome
 33 | #' 
 34 | #' @examples
 35 | #' 
 36 | #' # get annotation information for hg19
 37 | #' annoTrack <- getAnnot('hg19')
 38 | #' 
 39 | #' 
 40 | getAnnot <- function(genomeName) {
 41 |     requireNamespace("annotatr")
 42 |     liftTo <- NULL
 43 |     if(genomeName == 'hg18'){
 44 |         message("Genome ", genomeName, " will be built by lifting over ",
 45 |                 "hg19 annotations from annotatr")
 46 |         liftTo <- 'hg18'
 47 |         genomeName <- 'hg19'
 48 |     }else if (!genomeName %in% annotatr::builtin_genomes()) {
 49 |         message("Genome ", genomeName, " is not supported by ", 
 50 |                        "annotatr at this time")
 51 |         return(NULL)
 52 |     }
 53 |     
 54 |     if (is.null(genomeName)) {
 55 |         return(NULL)
 56 |     } else {
 57 |         annot_CpG <- paste0(c(genomeName, "_cpgs"), collapse = "")
 58 |         annot_genes <- paste0(c(genomeName, "_genes_cds"), collapse = "")
 59 |         
 60 |         # Build the annotations (a single GRanges object)
 61 |         
 62 |         # annotatr downloads files for genomes that aren't natively supported
 63 |         # (e.g. hg38)
 64 |         # Download has a nonzero fail rate; allow up to 5 retries before 
 65 |         # throwing an error
 66 |         
 67 |         for (attempt in seq_len(5)) {
 68 |             cpg <- try(annotatr::build_annotations(genome = genomeName, 
 69 |                                           annotations = annot_CpG), 
 70 |                 silent = TRUE)
 71 |             if (!is(cpg, "try-error")) {
 72 |                 message("Download of CpG annotation successful!")
 73 |                 fail1 <- 0
 74 |                 break
 75 |             } else {
 76 |                 message(cpg)
 77 |                 if (5 - attempt > 0)
 78 |                   message("Trying again (", 5 - attempt, " attempts remaining)")
 79 |                 fail1 <- 1
 80 |             }
 81 |         }
 82 |         
 83 |         for (attempt in seq_len(5)) {
 84 |             genes <- try(annotatr::build_annotations(genome = genomeName,
 85 |                                             annotations = annot_genes), 
 86 |                 silent = TRUE)
 87 |             if (!is(genes, "try-error")) {
 88 |                 message("Download of Gene annotation successful!")
 89 |                 fail2 <- 0
 90 |                 break
 91 |             } else {
 92 |                 message(genes)
 93 |                 if (5 - attempt > 0)
 94 |                   message("Trying again (", 5 - attempt, " attempts remaining)")
 95 |                 fail2 <- 1
 96 |             }
 97 |         }
 98 |         
 99 | 
100 |         if (fail1 == 0 && fail2 == 0) {
101 |             # lift over hg19 coordinates to hg18 using annotationHub
102 |             if (!is.null(liftTo)){
103 |               ah = AnnotationHub()
104 |               chainfiles <- query(ah , c(genomeName, liftTo, "chainfile"))
105 |               cf <- which(grepl(paste0(genomeName, "To", liftTo), 
106 |                                 chainfiles$title, 
107 |                                 ignore.case=TRUE))
108 |               if (length(cf) == 0){
109 |                 message("LiftOver from ", genomeName, " to ", liftTo, 
110 |                      " was unsucccessful")
111 |                 return(NULL)
112 |               }else if(length(cf) > 1){
113 |                 # take the first matching chain if more than one
114 |                 cf <- cf[1]
115 |               }
116 |               
117 |               chain <- chainfiles[[names(chainfiles)[cf]]]
118 |               
119 |               cpg.new <- unlist(liftOver(cpg, chain))
120 |               genes.new <- unlist(liftOver(genes, chain))
121 |               
122 |               GenomeInfoDb::genome(cpg) <- liftTo
123 |               GenomeInfoDb::genome(genes) <- liftTo
124 |             }
125 |            
126 |             keep <- which(!is.na(genes$symbol))
127 |             genes <- genes[keep, ]
128 |             cpg$type <- substr(cpg$type, 10, nchar(cpg$type))
129 |             
130 |             annot <- GRangesList(CpGs = cpg, Exons = genes, compress=FALSE)
131 |             return(annot)
132 |         } else {
133 |             message("Download with annotatr::build_annotations() failed; Annotation could not be retrieved.")
134 |             return(NULL)
135 |         }
136 |     }
137 | }
138 | 


--------------------------------------------------------------------------------
/R/plotEmpiricalDistribution.R:
--------------------------------------------------------------------------------
  1 | #' Plot the empirical distribution of the methylation beta vals or coverage
  2 | #' 
  3 | #' Uses ggplot2 to plot smoothed density histograms of methylation
  4 | #' proportions (beta values), or coverage. Methylation proportion densities
  5 | #' are weighted by coverage.
  6 | #' The number of curves plotted 
  7 | #' will be equal to the number of different values of \code{testCovariate},
  8 | #' unless \code{bySample} is TRUE. This can take quite some time to 
  9 | #' execute for a large object, so it is recommended to first take a random
 10 | #' sample of loci (say one million) before plotting.
 11 | #' 
 12 | #' @param bs a BSseq object
 13 | #'
 14 | #' @param testCovariate character specifying the column name of the
 15 | #' \code{pData} slot of the BSseq object to include in the plot legend.
 16 | #' 
 17 | #' @param bySample logical whether to plot a separate line for each sample,
 18 | #'  even if the grouping \code{testCovariate} is specified.
 19 | #'  Default value is FALSE (so samples with the same value of 
 20 | #'  \code{testCovariate} will be collapsed into the same line). If 
 21 | #'  \code{testCovariate} is not specified, this parameter does not have an 
 22 | #'  effect and samples are automatically plotted separately.
 23 | #' 
 24 | #' @param type a character indicating which type of density to plot - the 
 25 | #' methylation (beta) values ("M") or the coverage ("Cov"). Default is "M".
 26 | #' 
 27 | #' @param adj a numeric value for the \code{adjust} parameter to pass to the 
 28 | #' \code{geom_line} function. Specifies how smooth the make the function.
 29 | #' 
 30 | #' @return a ggplot object
 31 | #' 
 32 | #' @importFrom locfit locfit lp
 33 | #' 
 34 | #' @export
 35 | #' 
 36 | #' @examples 
 37 | #' 
 38 | #' data(BS.chr21)
 39 | #' 
 40 | #' # plot beta values by sample group
 41 | #' plotEmpiricalDistribution(BS.chr21, testCovariate="CellType")
 42 | #' 
 43 | plotEmpiricalDistribution <- function(bs, 
 44 |                                       testCovariate = NULL,
 45 |                                       bySample = FALSE,
 46 |                                       type = "M",
 47 |                                       adj = 2.5) {
 48 |     #satisfy check
 49 |     M <- Cov <- group <- wt <- NULL
 50 |   
 51 |     if (!(type %in% c("M", "Cov"))){
 52 |       stop("type must be either M or Cov")
 53 |     }
 54 |     
 55 |     if(is.null(testCovariate) & !bySample){
 56 |       message("No testCovariate specified; plotting each sample separately.")
 57 |       bySample = TRUE
 58 |     }
 59 |   
 60 |     meth.mat <- getCoverage(bs, type = "M")
 61 |     unmeth.mat <- getCoverage(bs, type = "Cov") - meth.mat
 62 |     
 63 |     meth.levelsm <- data.frame(meth.mat/
 64 |                                  (meth.mat + unmeth.mat))
 65 |     cov.matm <- data.frame((meth.mat + unmeth.mat))
 66 |     
 67 |     if (!is.null(testCovariate)) {
 68 |         if (sum(grepl(testCovariate, colnames(pData(bs)))) == 0) {
 69 |             stop("Error: no column in pData() found ", 
 70 |                 "that matches the testCovariate")
 71 |         } else if (length(grep(testCovariate, colnames(pData(bs)))) > 1) {
 72 |             stop("Error: testCovariate matches more ",
 73 |                 "than one column in pData()")
 74 |         }
 75 |         mC <- grep(testCovariate, colnames(pData(bs)))
 76 |         grouplab <- pData(bs)[,mC]
 77 |     }else{
 78 |        if(is.null(sampleNames(bs))){
 79 |          grouplab <- as.character(seq_len(ncol(bs)))
 80 |        }else{
 81 |          grouplab <- sampleNames(bs)
 82 |        }
 83 |     }
 84 |     
 85 |     meth.levelsm <- utils::stack(meth.levelsm)
 86 |     colnames(meth.levelsm)[1] <- "M"
 87 |     meth.levelsm$Cov <- utils::stack(cov.matm)$values
 88 |     
 89 |     if(is.null(sampleNames(bs))){
 90 |       meth.levelsm$sample <- sort(rep(seq_len(ncol(bs)), nrow(bs)))
 91 |     }else{
 92 |       meth.levelsm$sample <- unlist(lapply(sampleNames(bs), function(x) 
 93 |                                            rep(x, nrow(bs))))
 94 |     }
 95 |     
 96 |     if (!is.null(testCovariate)){
 97 |       meth.levelsm$group <- 
 98 |         unlist(lapply(seq_len(ncol(bs)), function(x) 
 99 |           rep(pData(bs)[x,mC], nrow(bs))))
100 |     }else{
101 |       meth.levelsm$group <- meth.levelsm$sample
102 |     }
103 |     
104 |     if (!bySample){
105 |       if (type=="M"){
106 |         # compute weights - sum over all samples in group ##
107 |         covtots <- rep(NA, ncol(cov.matm))
108 |         names(covtots) <- grouplab
109 |         for(l in unique(grouplab)){
110 |           covtots[names(covtots) == l] <- sum(colSums(cov.matm)[grouplab == l]) 
111 |         }
112 |         
113 |         wt.matm <- data.frame(t(t(cov.matm) / covtots)) 
114 |         meth.levelsm$wt <- utils::stack(wt.matm)$values 
115 |         
116 |         p1 <- ggplot(meth.levelsm, 
117 |                  aes(M, colour = group, group = group, weight = wt)) + 
118 |           geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) + 
119 |         xlab("Methylation Proportion") + 
120 |         theme_bw()
121 |       }else{
122 |         p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 
123 |                                colour = group, group = group)) + 
124 |           geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
125 |           scale_x_continuous(trans="log2") +
126 |         xlab("Coverage") + 
127 |           theme_bw()
128 |       }
129 |       p1 <- p1 + labs(colour = "Group")
130 |     }else{
131 |       if (type=="M"){
132 |         wt.matm <- data.frame(t(t(cov.matm) / colSums(cov.matm))) 
133 |         meth.levelsm$wt <- utils::stack(wt.matm)$values 
134 |         
135 |         if(identical(meth.levelsm$group, meth.levelsm$sample)){
136 |           p1 <- ggplot(meth.levelsm, 
137 |                        aes(M, colour = group, group = sample, weight = wt)) +
138 |             labs(color = "Sample")
139 |         }else{
140 |           if(ncol(bs) <= 12){
141 |             p1 <- ggplot(meth.levelsm, 
142 |                      aes(M, colour = group, group = sample, weight = wt,
143 |                          linetype = sample)) +
144 |                     labs(color = "Group", linetype = "Sample")
145 |           }else{
146 |             p1 <- ggplot(meth.levelsm, 
147 |                          aes(M, colour = group, group = sample, weight = wt)) +
148 |               labs(color = "Group")
149 |           }
150 |         }
151 |         p1 <- p1 + geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
152 |           xlab("Methylation Proportion") + 
153 |           theme_bw()  
154 |       }else{
155 |         if(identical(meth.levelsm$group, meth.levelsm$sample)){
156 |           p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 
157 |                                          colour = group, group = sample)) +
158 |             labs(color = "Sample")
159 |         }else{
160 |           if(ncol(bs) <= 12){
161 |             p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 
162 |                                          colour = group, group = sample,
163 |                                          linetype = sample)) +
164 |                   labs(color = "Group", linetype = "Sample")
165 |           }else{
166 |             p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 
167 |                                            colour = group, group = sample)) +
168 |               labs(color = "Group")
169 |           }
170 |         }
171 |         
172 |         p1 <- p1 + 
173 |           geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
174 |           scale_x_continuous(trans="log2") +
175 |           xlab("Coverage") + 
176 |           theme_bw()  
177 |       }
178 |     }
179 |     return(p1)
180 | }
181 | 
182 | 


--------------------------------------------------------------------------------
/man/dmrseq.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/dmrseq.R
  3 | \name{dmrseq}
  4 | \alias{dmrseq}
  5 | \title{Main function for detecting and evaluating significance of DMRs.}
  6 | \usage{
  7 | dmrseq(
  8 |   bs,
  9 |   testCovariate,
 10 |   adjustCovariate = NULL,
 11 |   cutoff = 0.1,
 12 |   minNumRegion = 5,
 13 |   smooth = TRUE,
 14 |   bpSpan = 1000,
 15 |   minInSpan = 30,
 16 |   maxGapSmooth = 2500,
 17 |   maxGap = 1000,
 18 |   verbose = TRUE,
 19 |   maxPerms = 10,
 20 |   matchCovariate = NULL,
 21 |   BPPARAM = bpparam(),
 22 |   stat = "stat",
 23 |   block = FALSE,
 24 |   blockSize = 5000,
 25 |   chrsPerChunk = 1
 26 | )
 27 | }
 28 | \arguments{
 29 | \item{bs}{bsseq object containing the methylation values as well as the 
 30 | phenotype matrix that contains sample level covariates}
 31 | 
 32 | \item{testCovariate}{Character value indicating which variable
 33 | (column name) in \code{pData(bs)} to test
 34 | for association of methylation levels. 
 35 | Can alternatively specify an integer value indicating
 36 | which of column of
 37 | \code{pData(bs)} to use. This is used to construct the 
 38 | design matrix for the test statistic calculation. To run using a 
 39 | continuous or categorial covariate with more than two groups, simply pass in
 40 | the name of a column in `pData` that contains this covariate. A continuous
 41 | covariate is assmued if the data type in the `testCovariate` slot is 
 42 | continuous, with the exception of if there are only two unique values 
 43 | (then a two group comparison is carried out).}
 44 | 
 45 | \item{adjustCovariate}{an (optional) character value or vector 
 46 | indicating which variables (column names) in \code{pData(bs)} 
 47 | will be adjusted for when 
 48 |  testing for the association of methylation value with the 
 49 |  \code{testCovariate}. 
 50 | Can alternatively specify an
 51 | integer value or vector indicating
 52 |  which of the columns of \code{pData(bs)} to adjust for.
 53 |  If not NULL (default), then this is also used to 
 54 |  construct the design matrix for the test statistic calculation.}
 55 | 
 56 | \item{cutoff}{scalar value that represents the absolute value (or a vector 
 57 | of two numbers representing a lower and upper bound) for the cutoff of 
 58 | the single CpG coefficient that is used to discover 
 59 | candidate regions. Default value is 0.10.}
 60 | 
 61 | \item{minNumRegion}{positive integer that represents the minimum number of
 62 | CpGs to consider for a candidate region. Default value is 5. 
 63 | Minimum value is 3.}
 64 | 
 65 | \item{smooth}{logical value that indicates whether or not to smooth the 
 66 | CpG level signal when discovering candidate regions.
 67 | Defaults to TRUE.}
 68 | 
 69 | \item{bpSpan}{a positive integer that represents the length in basepairs
 70 | of the smoothing span window if \code{smooth} is TRUE.  Default value is 
 71 | 1000.}
 72 | 
 73 | \item{minInSpan}{positive integer that represents the minimum number of
 74 | CpGs in a smoothing span window if \code{smooth} is TRUE.  
 75 | Default value is 30.}
 76 | 
 77 | \item{maxGapSmooth}{integer value representing maximum number of basepairs
 78 | in between neighboring CpGs to be included in the same 
 79 | cluster when performing smoothing (should generally be larger than
 80 | \code{maxGap})}
 81 | 
 82 | \item{maxGap}{integer value representing maximum number of basepairs in 
 83 | between neighboring CpGs to be included in the same DMR.}
 84 | 
 85 | \item{verbose}{logical value that indicates whether progress messages
 86 | should be printed to stdout. Defaults value is TRUE.}
 87 | 
 88 | \item{maxPerms}{a positive integer that represents the maximum number 
 89 | of permutations that will be used to generate the global null 
 90 | distribution of test statistics.  Default value is 10.}
 91 | 
 92 | \item{matchCovariate}{An (optional) character value 
 93 | indicating which variable (column name) of \code{pData(bs)} 
 94 | will be blocked for when 
 95 | constructing the permutations in order to
 96 | test for the association of methylation value with the 
 97 | \code{testCovariate}, only to be used when \code{testCovariate}
 98 | is a two-group factor and the number of permutations possible is less
 99 | than 500000.
100 | Alternatively, you can specify an integer value indicating
101 | which column of \code{pData(bs)} to block for.
102 | Blocking means that only permutations with balanced
103 | composition of \code{testCovariate} values will be used (for example if
104 | you have samples from different gender and this is not your covariate of
105 | interest, 
106 | it is recommended to use gender as a matching covariate to avoid one 
107 | of the permutations testing entirely males versus females; this violates
108 | the null hypothesis and will decrease power).
109 | If not NULL (default), then no blocking is performed.}
110 | 
111 | \item{BPPARAM}{a \code{BiocParallelParam} object to specify the parallel 
112 | backend. The default 
113 | option is \code{BiocParallel::bpparam()} which will automatically creates
114 | a cluster appropriate for the operating system.}
115 | 
116 | \item{stat}{a character vector indicating the name of the column of the 
117 | output to use as the region-level test statistic. Default value is 'stat'
118 | which is the region level-statistic designed to be comparable across the
119 | genome.
120 | It is not recommended to change this argument, but it can be done for
121 | experimental purposes. Possible values are: 'L' - the number of loci
122 | in the region, 'area' - the sum of the smoothed loci statistics,
123 | 'beta' - the effect size of the region, 'stat' - the test statistic for
124 | the region, or 'avg' - the average smoothed loci statistic.}
125 | 
126 | \item{block}{logical indicating whether to search for large-scale (low
127 | resolution) blocks of differential methylation (default is FALSE, which
128 | means that local DMRs are desired). If TRUE, the parameters for 
129 | \code{bpSpan}, \code{minInSpan}, and \code{maxGapSmooth} should be adjusted
130 | (increased) accordingly. This setting will also merge
131 | candidate regions that (1) are in the same direction and (2) are less than 
132 | 1kb apart with no covered CpGs separating them. The region-level model used 
133 | is also slightly modified - instead of a loci-specific intercept for each 
134 | CpG in theregion, the intercept term is modeled as a natural spline with 
135 | one interior knot per each 10kb of length (up to 10 interior knots).}
136 | 
137 | \item{blockSize}{numeric value indicating the minimum number of basepairs 
138 | to be considered a block (only used if \code{block}=TRUE). Default is 
139 | 5000 basepairs.}
140 | 
141 | \item{chrsPerChunk}{a positive integer value indicating the number of 
142 | chromosomes per chunk. The default is 1, meaning that the data will be 
143 | looped through one chromosome at a time. When pairing up multiple 
144 | chromosomes per chunk, sizes (in terms of numbers of CpGs) will be taken
145 | into consideration to balance the sizes of each chunk.}
146 | }
147 | \value{
148 | a \code{GRanges} object that contains the results of the inference. 
149 |    The object contains one row for each candidate region, sorted by q-value
150 |    and then chromosome. The standard 
151 |    \code{GRanges} chr, start, and end are included, along with at least
152 |    7 metadata
153 |    columns, in the following order: 
154 |    1. L = the number of CpGs contained in the region,
155 |    2. area = the sum of the smoothed beta values
156 |    3. beta = the coefficient value for the condition difference (there 
157 |       will be more than one column here if a multi-group comparison
158 |       was performed),
159 |    4. stat = the test statistic for the condition difference,
160 |    5. pval = the permutation p-value for the significance of the test
161 |    statistic, and 
162 |    6. qval = the q-value for the test statistic (adjustment
163 |    for multiple comparisons to control false discovery rate).
164 |    7. index = an \code{IRanges} containing the indices of the region's 
165 |       first CpG to last CpG.
166 | }
167 | \description{
168 | Performs a two-step approach that (1) detects candidate regions, and
169 | (2) scores candidate regions with an exchangeable (across the genome)
170 | statistic and evaluates statistical significance using a 
171 | permuation test on the pooled null distribution of scores.
172 | }
173 | \examples{
174 | 
175 | # load example data 
176 | data(BS.chr21)
177 | 
178 | # the covariate of interest is the 'CellType' column of pData(BS.chr21)
179 | testCovariate <- 'CellType'
180 | 
181 | # run dmrseq on a subset of the chromosome (10K CpGs)
182 | regions <- dmrseq(bs=BS.chr21[240001:250000,],
183 |                  cutoff = 0.05,
184 |                  testCovariate=testCovariate)
185 | 
186 | }
187 | \keyword{inference}
188 | 


--------------------------------------------------------------------------------
/man/plotDMRs.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/plotDMRs.R
  3 | \name{plotDMRs}
  4 | \alias{plotDMRs}
  5 | \title{Plot Differentially Methylated Regions}
  6 | \usage{
  7 | plotDMRs(
  8 |   BSseq,
  9 |   regions = NULL,
 10 |   testCovariate = NULL,
 11 |   extend = (end(regions) - start(regions) + 1)/2,
 12 |   main = "",
 13 |   addRegions = regions,
 14 |   annoTrack = NULL,
 15 |   col = NULL,
 16 |   lty = NULL,
 17 |   lwd = NULL,
 18 |   label = NULL,
 19 |   mainWithWidth = TRUE,
 20 |   regionCol = .alpha("#C77CFF", 0.2),
 21 |   addTicks = TRUE,
 22 |   addPoints = TRUE,
 23 |   pointsMinCov = 1,
 24 |   highlightMain = FALSE,
 25 |   qval = TRUE,
 26 |   stat = TRUE,
 27 |   verbose = TRUE,
 28 |   includeYlab = TRUE,
 29 |   compareTrack = NULL,
 30 |   labelCols = NULL,
 31 |   horizLegend = FALSE,
 32 |   addLines = TRUE,
 33 |   linesMinCov = 1
 34 | )
 35 | }
 36 | \arguments{
 37 | \item{BSseq}{An object of class BSseq.}
 38 | 
 39 | \item{regions}{A data.frame containing the DMRs (output from the main
 40 | \code{dmrseq}) function.}
 41 | 
 42 | \item{testCovariate}{integer value or vector indicating which of columns of
 43 | \code{pData(bs)} contains the covariate of interest. 
 44 | This is used to construct the sample labels and colors (unless this is
 45 | over-ridden by specifying \code{label}).}
 46 | 
 47 | \item{extend}{Describes how much the plotting region should be extended in
 48 | either direction. The total width of the plot is equal to the width of the 
 49 | region plus twice extend.}
 50 | 
 51 | \item{main}{The plot title. The default is to construct a title with 
 52 | information about which genomic region is being plotted.}
 53 | 
 54 | \item{addRegions}{A set of additional regions to be highlighted on the 
 55 | plots. Same format as the \code{regions} argument.}
 56 | 
 57 | \item{annoTrack}{a \code{GRangesList} object with two elements returned
 58 | by \code{\link{getAnnot}}. The first
 59 | contains CpG category information in the first element (optional)
 60 | coding gene sequence information in the second element (optional).
 61 | At least one of these elements needs to be non-null in order for 
 62 | any annotation to be plotted, but it is not necessary to contain
 63 | both.}
 64 | 
 65 | \item{col}{The color of the methylation estimates. It is recommended to 
 66 | leave this value as default (NULL), and specify a value of 
 67 | \code{testCovariate} to indicate which column of \code{pData(bs)}
 68 | to use as a factor for coloring the points and lines of the plot.
 69 | Alternatively, you can specify particular colors by 
 70 | passing this information through the \code{pData} slot of the
 71 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 72 | so, place the color value for each sample in a column titled \code{col}, 
 73 | and leave this argument as its default value of NULL. Alternatively,
 74 | you may specify a vector of color values (one for each sample), but 
 75 | you *must* make sure that this vector is in the same order as the samples
 76 | are in the BSseq object. If NULL and no \code{col} column is found in 
 77 | \code{pData}, then estimates are plotted in black for all samples.}
 78 | 
 79 | \item{lty}{The line type of the methylation estimates. It is recommended to
 80 | pass this information through the \code{pData} slot of the
 81 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 82 | so, place the line type value for each sample in a column titled \code{lty}, 
 83 | and leave this argument as its default value of NULL. Alternatively,
 84 | you may specify a vector of line type values (one for each sample), but 
 85 | you *must* make sure that this vector is in the same order as the samples
 86 | are in the BSseq object. If NULL and no \code{lty} column is found in 
 87 | \code{pData}, then estimates are plotted with \code{lty=1} for all samples.}
 88 | 
 89 | \item{lwd}{The line width of the methylation estimates. It is recommended to
 90 | pass this information through the \code{pData} slot of the
 91 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 92 | so, place the line width value for each sample in a column titled \code{lwd},
 93 | and leave this argument as its default value of NULL. Alternatively,
 94 | you may specify a vector of line width values (one for each sample), but 
 95 | you *must* make sure that this vector is in the same order as the samples
 96 | are in the BSseq object. If NULL and no \code{lwd} column is found in 
 97 | \code{pData}, then estimates are plotted with \code{lwd=1} for all samples.}
 98 | 
 99 | \item{label}{The condition/population labels for the plot legend. If NULL
100 | (default) this is taken from the \code{testCovariate} column of 
101 | \code{pData}. Alternatively, you can pass in labels by 
102 | adding this information through the \code{pData} slot of the
103 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
104 | so, place the labels for each sample in a column titled \code{label}, 
105 | and leave this argument as its default value of NULL.
106 | You may instead specify an arbitrary vector of labels (one for each sample),
107 | but be aware that you *must* make sure that this vector is in the same order 
108 | as the samples are in the BSseq object. If NULL, and \code{testCovariate} is
109 | also NULL and no \code{label} column is found in 
110 | \code{pData}, then no legend is created.}
111 | 
112 | \item{mainWithWidth}{logical value indicating whether the default title
113 | should include information about width of the plot region.}
114 | 
115 | \item{regionCol}{The color used for highlighting the region.}
116 | 
117 | \item{addTicks}{logical value indicating whether tick marks showing the
118 | location of methylation loci should be added. Default is TRUE.}
119 | 
120 | \item{addPoints}{logical value indicating whether the individual 
121 | methylation estimates be plotted as points.}
122 | 
123 | \item{pointsMinCov}{The minimum coverage a methylation loci need in 
124 | order for the raw methylation estimates to be plotted. Useful for filtering
125 | out low coverage loci. Only used if addPoints = TRUE. Default value is 1 
126 | (no filtering).}
127 | 
128 | \item{highlightMain}{logical value indicating whether the plot region 
129 | should be highlighted.}
130 | 
131 | \item{qval}{logical value indicating whether the region FDR estimate
132 | (q-value) should be displayed in the plot title. The value is extracted 
133 | from the \code{regions} argument.}
134 | 
135 | \item{stat}{logical value indicating whether the region statistic 
136 | should be displayed in the plot title. The value is extracted from the
137 | \code{regions} argument.}
138 | 
139 | \item{verbose}{logical value indicating whether progress messages 
140 | should be printed to the screen.}
141 | 
142 | \item{includeYlab}{a logical indicating whether to include the Y axis
143 | label 'Methylation' (useful to turn off if combining multiple region
144 | figures and you do not want to include redundant y axis label information)}
145 | 
146 | \item{compareTrack}{a named GRangesList object that contains up to four
147 | custom tracks (GRanges objects) which will be plotted below the region.
148 | Only one of `compareTrack` or `annoTrack` can be specified since there is
149 | only for plotting either the built in GpG category and exon tracks, *or* a
150 | custom set of tracks.}
151 | 
152 | \item{labelCols}{a character vector with names of the mcols slot of the 
153 | GRanges items in `compareTrack'. Only used if plotting custom
154 | tracks using the `compareTrack' argument. If specified, the (first) value
155 | in that column is printed along with a label that includes the name of the
156 | list item. If NULL (default), just the name of the track is printed.}
157 | 
158 | \item{horizLegend}{logical indicating whether the legend should be 
159 | horizontal instead of vertical (default FALSE). This is useful if you need
160 | to plot many labels and want to preserve whitespace.}
161 | 
162 | \item{addLines}{logical indicating whether to plot smooth lines between 
163 | points. Default is true. Can be useful to turn this off for very small 
164 | regions.}
165 | 
166 | \item{linesMinCov}{The minimum coverage a methylation loci need in 
167 | order to be used for plotting of smoothed lines. Useful for filtering
168 | out low coverage loci. Only used if addLines = TRUE. Default value is 1 
169 | (no filtering).}
170 | }
171 | \value{
172 | None (generates a plot)
173 | }
174 | \description{
175 | Generates trace plots of methylation proportions by genomic position.
176 | }
177 | \details{
178 | Creates aesthetially pleasing DMR plots. By default will plot individual
179 | points with size proportional to coverage, along with a smoothed line 
180 | for each sample. Elements will be colored by biological condition 
181 | (\code{label}). Also has functionality to add annotations below the main
182 | plot (CpG category, genes) if \code{annoTrack} is specified.
183 | }
184 | \examples{
185 | 
186 | # load the example data
187 | data(BS.chr21)
188 | 
189 | # load example results (computed with dmrseq function)
190 | data(dmrs.ex)
191 | 
192 | # get annotation information (using getAnnot function)
193 | # here we'll load the example annotation from chr21
194 | data(annot.chr21)
195 | 
196 | # plot the 1st DMR
197 | plotDMRs(BS.chr21, regions=dmrs.ex[1,], testCovariate=1,
198 |    annoTrack=annot.chr21)
199 | 
200 | }
201 | 


--------------------------------------------------------------------------------
/R/simDMRs.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Simulate Differentially Methylated Regions
  3 | #' 
  4 | #' Add simulated DMRs to observed control data. Control data will be split
  5 | #' into two (artificial) populations.
  6 | #' 
  7 | #' @param bs a BSseq object containing only control samples (from the same
  8 | #' population) for which simulated DMRs will be added after dividing the 
  9 | #' population into two artificial groups.
 10 | #' 
 11 | #' @param num.dmrs an integer specifying how many DMRs to add.
 12 | #' 
 13 | #' @param delta.max0 a proportion value indicating the mode value for the
 14 | #' difference in proportion of methylated CpGs in the simulated DMRs (the
 15 | #' actual value will be drawn from a scaled Beta distribution centered at 
 16 | #' this value). Default value is 0.3.
 17 | #' 
 18 | #' @return A named list object with 5 elements: (1) 
 19 | #' \code{gr.dmrs} is a \code{GRanges} object with \code{num.dmrs}
 20 | #' ranges that represent the random DMRs added. (2) \code{dmr.mncov} is a 
 21 | #' numeric vector that contains the mean coverage in each simulated DMR. (3)
 22 | #' \code{dmr.L} is a numeric vector that contains the number of CpGs in each 
 23 | #' simulated DMR. (4) \code{bs} is the BSseq object that contains the 
 24 | #' simulated DMRs. (5) \code{deltas} is a numeric vector that contains the 
 25 | #' effect size used for each DMR. 
 26 | #' 
 27 | #' @importFrom IRanges IRanges
 28 | #' 
 29 | #' @export
 30 | #' 
 31 | #' @examples 
 32 | #' 
 33 | #' # Add simulated DMRs to a BSseq dataset
 34 | #' # This is just for illustrative purposes - ideally you would
 35 | #' # add DMRs to a set of samples from the same condition (in our
 36 | #' # example data, we have data from two different cell types)
 37 | #' # In this case, we shuffle the samples by cell type to create
 38 | #' # a null comparison.
 39 | #' 
 40 | #' data(BS.chr21)
 41 | #' 
 42 | #' BS.chr21.sim <- simDMRs(bs=BS.chr21[1:10000,c(1,3,2,4)], 
 43 | #'                         num.dmrs=50)
 44 | #' 
 45 | #' # show the simulated DMRs GRanges object
 46 | #' show(BS.chr21.sim$gr.dmrs)
 47 | #' 
 48 | #' # show the updated BSseq object that includes the simulated DMRs
 49 | #' show(BS.chr21.sim$bs)
 50 | #' 
 51 | #' # examine effect sizes of the DMRs
 52 | #' head(BS.chr21.sim$delta)
 53 | #' 
 54 | simDMRs <- function(bs, num.dmrs = 3000, delta.max0 = 0.3) {
 55 |     # check that all loci have coverage in every sample
 56 |     zero.cov <- which(rowSums(as.matrix(getCoverage(bs, 
 57 |                                              type="Cov")) == 0) > 0)
 58 |     if (length(zero.cov) > 0){
 59 |       stop("Zero coverage found for at least one sample in ", length(zero.cov),
 60 |            " loci. Please filter for loci with coverage at least one in ",
 61 |            "all samples before passing to 'simDMRs'")
 62 |     }
 63 |   
 64 |     sampleSize <- floor(nrow(pData(bs))/2)
 65 |     
 66 |     # code to simulate DMRs if some number of simulated dmrs was specified
 67 |     message("Simulating DMRs for ", sampleSize, " vs ", ncol(bs) - sampleSize, 
 68 |             " comparison")
 69 |     triwt <- function(x, amp = 1, base = 0, width = 1, center = 0, 
 70 |                       deg = 3, dir = 1) {
 71 |         y <- dir * (((width/2)^deg - abs(x - center)^deg)/
 72 |                       (width/2)^deg)^deg * amp +  base
 73 |         y[abs(x - center) > ceiling(width/2)] <- base[abs(x - center) >
 74 |                                                         ceiling(width/2)]
 75 |         return(y)
 76 |     }
 77 |     
 78 |     meth.mat <- as.matrix(getCoverage(bs, type = "M"))
 79 |     unmeth.mat <- as.matrix(getCoverage(bs, type = "Cov")) - meth.mat
 80 |     chr <- as.character(seqnames(bs))
 81 |     pos <- start(bs)
 82 |     
 83 |     cluster <- bumphunter::clusterMaker(chr, pos, maxGap = 500)
 84 |     Indexes <- split(seq(along = cluster), cluster)
 85 |     lns <- lengths(Indexes)
 86 |     Indexes <- Indexes[lns >= 5 & lns <= 500]
 87 |     
 88 |     # sample regions with intermediate methylation values preferentially
 89 |     prop.mat <- rowMeans(meth.mat/(meth.mat + unmeth.mat))
 90 |     prop.mat <- unlist(lapply(Indexes, function(x) median(prop.mat[x])))
 91 |   
 92 |     dmrs.ind <- sample(seq_len(length(Indexes)), num.dmrs, replace = FALSE, 
 93 |                        prob = pmax(1 - sqrt(2) * 
 94 |                                      abs(0.5 - prop.mat)^0.5, 0))
 95 |     dmrs.ind <- Indexes[dmrs.ind]
 96 |     fnc <- function(index) {
 97 |         gr.dmr <- GRanges(seqnames = unique(as.character(seqnames(bs)[index])),
 98 |                           IRanges(start = min(start(bs)[index]), 
 99 |                                   end = max(start(bs)[index])))
100 |         return(gr.dmr)
101 |     }
102 |     ## GRanges Object for the Simulated DMRs
103 |     gr.dmrs <- suppressWarnings(Reduce("c", lapply(dmrs.ind, fnc)))
104 |     
105 |     ## Generating the Methylated and Unmethylated Read Counts 
106 |     ## for the CpG sites in the
107 |     ## DMRs and outside
108 |     
109 |     # set up null signal (smooth function of position) mcols(bs)$diff <- 0
110 |     Diff <- Diff2 <- rep(0, length(bs))
111 |     
112 |     dmr.mncov <- dmr.L <- deltas <- rep(NA, num.dmrs)
113 |     
114 |     for (u in seq_len(num.dmrs)) {
115 |         # coin flip for up or down
116 |         up <- 1 - 2 * (rbinom(1, 1, 0.5) == 1)
117 |         
118 |         # let effect size change randomly
119 |         delta.max <- delta.max0 + (rbeta(1, 2, 2) - 0.5)/3
120 |         deltas[u] <- delta.max
121 |         
122 |         # grab loci in the dmr
123 |         dmr.L[u] <- length(dmrs.ind[[u]])
124 |         prop.mat <- meth.mat[dmrs.ind[[u]], ]/(meth.mat[dmrs.ind[[u]], ] + 
125 |                                                  unmeth.mat[dmrs.ind[[u]], 
126 |             ])
127 |         
128 |         # change direction if baseline mean is near boundary
129 |         if (up == 1) {
130 |             if (mean(prop.mat) > 1 - delta.max) {
131 |                 up <- -1
132 |             }
133 |         } else if (up == -1) {
134 |             if (mean(prop.mat) < delta.max) {
135 |                 up <- 1
136 |             }
137 |         }
138 |         
139 |         # simulated mean as a smooth parabola added or subtracted 
140 |         # from baseline mean
141 |         last <- max(pos[dmrs.ind[[u]]])
142 |         first <- min(pos[dmrs.ind[[u]]])
143 |         width <- last - first
144 |         
145 |         # widen out so that first and last CpGs don't have a 
146 |         # difference of zero
147 |         last <- last + 0.2 * width
148 |         first <- first - 0.2 * width
149 |         width <- last - first
150 |         
151 |         mid <- round((last - first)/2 + first)
152 |         
153 |         # Diff.hit is the methylation percentage difference for each position in
154 |         # spiked in DMR; restricted to between -1 and 1 (negative indicates that
155 |         # difference is in the negative direction.
156 |         
157 |         Diff.hit <- round(triwt(pos[dmrs.ind[[u]]], amp = delta.max, 
158 |                                 base = Diff[dmrs.ind[[u]]], 
159 |             width = width, center = mid, deg = 4, dir = up), 4)
160 |         
161 |         # calculate the mean coverage in the DMR over all the samples and save 
162 |         # result in a vector dmr.mncov for exploring the characteristics of the
163 |         # detected / missed DMRs in simulation results.
164 |         mn.cov <- by(t(meth.mat[dmrs.ind[[u]], ] + unmeth.mat[dmrs.ind[[u]], ]),
165 |             factor(paste0("Condition", c(rep(1, sampleSize), 
166 |                                          rep(2, ncol(bs) - sampleSize)))), 
167 |             colMeans)
168 |         mn.cov <- rowMeans(cbind(mn.cov[[1]], mn.cov[[2]]))
169 |         dmr.mncov[u] <- mean(mn.cov)
170 |         
171 |         
172 |         # Conditional on the coverage for each site and sample combination,
173 |         # sample the
174 |         # number of methylated reads from a binomial distribution where the 
175 |         # probability
176 |         # parameter is taken as the observed estimate plus the Diff/Diff2 
177 |         # (depending on
178 |         # whether the sample is from condition 1 or condition 2 This will 
179 |         # induce sampling
180 |         # error into the number of reads observed in the simulation and 
181 |         # thus will be more
182 |         # realistic and a more practical evaluation of the methods
183 |         cov <- meth.mat[dmrs.ind[[u]], ] + unmeth.mat[dmrs.ind[[u]], ]
184 |         prop <- meth.mat[dmrs.ind[[u]], ]/cov
185 |         grp <- runif(1) < 0.5
186 |         ss <- ifelse(grp, sampleSize, ncol(bs) - sampleSize)
187 |         for (samp in seq_len(ss)) {
188 |             # randomly choose which condition is the one with the difference
189 |             if (grp) {
190 |                 # first generate M counts for sample samp of condition 1
191 |                 meth.mat[dmrs.ind[[u]], samp] <- rbinom(n=length(dmrs.ind[[u]]),
192 |                   size = cov[, samp], 
193 |                   prob = pmax(pmin(prop[, samp] + Diff.hit, 1), 
194 |                     0))
195 |                 
196 |                 # next assign the other Cov - M counts to unmethylated matrix
197 |                 unmeth.mat[dmrs.ind[[u]], samp] <- cov[, samp] -
198 |                   meth.mat[dmrs.ind[[u]], samp]
199 |                 
200 |             } else {
201 |                 # next generate M counts for sample samp of condition 2
202 |                 meth.mat[dmrs.ind[[u]], (sampleSize + samp)] <- 
203 |                         rbinom(n = length(dmrs.ind[[u]]),
204 |                   size = cov[, (sampleSize + samp)], 
205 |                   prob = pmax(pmin(prop[, (sampleSize + samp)] + 
206 |                                      Diff.hit, 1), 0))
207 |                 
208 |                 # next assign the other Cov - M counts to unmethylated matrix
209 |                 unmeth.mat[dmrs.ind[[u]], (sampleSize + samp)] <- cov[,
210 |                   (sampleSize + samp)] - meth.mat[dmrs.ind[[u]],
211 |                                                   (sampleSize + samp)]
212 |             }
213 |         }
214 |     }
215 | 
216 |     # get everything in order to run bumphunter functions
217 |     sampnames <- paste0("Condition", c(rep(1, sampleSize),
218 |                                        rep(2, ncol(bs) - sampleSize)), 
219 |                         "_Rep", c(seq_len(sampleSize), 
220 |                                   seq_len(ncol(bs) - sampleSize)))
221 |     colnames(meth.mat) <- colnames(unmeth.mat) <- sampnames 
222 |     bsNew <- BSseq(pos = pos, chr = chr, M = meth.mat, 
223 |                    Cov = (meth.mat + unmeth.mat), 
224 |                    sampleNames = sampnames)
225 |     
226 |     sim.dat.red <- list(gr.dmrs = gr.dmrs, dmr.mncov = dmr.mncov, dmr.L = dmr.L,
227 |         bs = bsNew, delta = deltas)
228 |     return(sim.dat.red)
229 | }
230 | 
231 | 


--------------------------------------------------------------------------------
/R/plotDMRs.R:
--------------------------------------------------------------------------------
  1 | #' Plot Differentially Methylated Regions
  2 | #'
  3 | #' Generates trace plots of methylation proportions by genomic position.
  4 | #' 
  5 | #' Creates aesthetially pleasing DMR plots. By default will plot individual
  6 | #' points with size proportional to coverage, along with a smoothed line 
  7 | #' for each sample. Elements will be colored by biological condition 
  8 | #' (\code{label}). Also has functionality to add annotations below the main
  9 | #' plot (CpG category, genes) if \code{annoTrack} is specified.
 10 | #' 
 11 | #' @param BSseq An object of class BSseq.
 12 | #' 
 13 | #' @param regions A data.frame containing the DMRs (output from the main
 14 | #' \code{dmrseq}) function.
 15 | #' 
 16 | #' @param extend Describes how much the plotting region should be extended in
 17 | #'  either direction. The total width of the plot is equal to the width of the 
 18 | #'  region plus twice extend.
 19 | #' 
 20 | #' @param main The plot title. The default is to construct a title with 
 21 | #' information about which genomic region is being plotted.
 22 | #' 
 23 | #' @param addRegions A set of additional regions to be highlighted on the 
 24 | #' plots. Same format as the \code{regions} argument.
 25 | #' 
 26 | #' @param annoTrack a \code{GRangesList} object with two elements returned
 27 | #' by \code{\link{getAnnot}}. The first
 28 | #' contains CpG category information in the first element (optional)
 29 | #' coding gene sequence information in the second element (optional).
 30 | #' At least one of these elements needs to be non-null in order for 
 31 | #' any annotation to be plotted, but it is not necessary to contain
 32 | #' both.
 33 | #' 
 34 | #' @param col The color of the methylation estimates. It is recommended to 
 35 | #' leave this value as default (NULL), and specify a value of 
 36 | #' \code{testCovariate} to indicate which column of \code{pData(bs)}
 37 | #' to use as a factor for coloring the points and lines of the plot.
 38 | #' Alternatively, you can specify particular colors by 
 39 | #' passing this information through the \code{pData} slot of the
 40 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 41 | #' so, place the color value for each sample in a column titled \code{col}, 
 42 | #' and leave this argument as its default value of NULL. Alternatively,
 43 | #' you may specify a vector of color values (one for each sample), but 
 44 | #' you *must* make sure that this vector is in the same order as the samples
 45 | #' are in the BSseq object. If NULL and no \code{col} column is found in 
 46 | #' \code{pData}, then estimates are plotted in black for all samples.
 47 | #' 
 48 | #' @param lty The line type of the methylation estimates. It is recommended to
 49 | #' pass this information through the \code{pData} slot of the
 50 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 51 | #' so, place the line type value for each sample in a column titled \code{lty}, 
 52 | #' and leave this argument as its default value of NULL. Alternatively,
 53 | #' you may specify a vector of line type values (one for each sample), but 
 54 | #' you *must* make sure that this vector is in the same order as the samples
 55 | #' are in the BSseq object. If NULL and no \code{lty} column is found in 
 56 | #' \code{pData}, then estimates are plotted with \code{lty=1} for all samples.
 57 | #' 
 58 | #' @param lwd The line width of the methylation estimates. It is recommended to
 59 | #' pass this information through the \code{pData} slot of the
 60 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 61 | #' so, place the line width value for each sample in a column titled \code{lwd},
 62 | #' and leave this argument as its default value of NULL. Alternatively,
 63 | #' you may specify a vector of line width values (one for each sample), but 
 64 | #' you *must* make sure that this vector is in the same order as the samples
 65 | #' are in the BSseq object. If NULL and no \code{lwd} column is found in 
 66 | #' \code{pData}, then estimates are plotted with \code{lwd=1} for all samples.
 67 | #' 
 68 | #' @param label The condition/population labels for the plot legend. If NULL
 69 | #' (default) this is taken from the \code{testCovariate} column of 
 70 | #' \code{pData}. Alternatively, you can pass in labels by 
 71 | #' adding this information through the \code{pData} slot of the
 72 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 
 73 | #' so, place the labels for each sample in a column titled \code{label}, 
 74 | #' and leave this argument as its default value of NULL.
 75 | #' You may instead specify an arbitrary vector of labels (one for each sample),
 76 | #' but be aware that you *must* make sure that this vector is in the same order 
 77 | #' as the samples are in the BSseq object. If NULL, and \code{testCovariate} is
 78 | #' also NULL and no \code{label} column is found in 
 79 | #' \code{pData}, then no legend is created.
 80 | #' 
 81 | #' @param mainWithWidth logical value indicating whether the default title
 82 | #' should include information about width of the plot region.
 83 | #' 
 84 | #' @param regionCol The color used for highlighting the region.
 85 | #' 
 86 | #' @param addTicks logical value indicating whether tick marks showing the
 87 | #'  location of methylation loci should be added. Default is TRUE.
 88 | #' 
 89 | #' @param addPoints logical value indicating whether the individual 
 90 | #' methylation estimates be plotted as points.
 91 | #' 
 92 | #' @param pointsMinCov The minimum coverage a methylation loci need in 
 93 | #' order for the raw methylation estimates to be plotted. Useful for filtering
 94 | #' out low coverage loci. Only used if addPoints = TRUE. Default value is 1 
 95 | #' (no filtering).
 96 | #' 
 97 | #' @param highlightMain logical value indicating whether the plot region 
 98 | #' should be highlighted.
 99 | #' 
100 | #' @param stat logical value indicating whether the region statistic 
101 | #' should be displayed in the plot title. The value is extracted from the
102 | #' \code{regions} argument.
103 | #' 
104 | #' @param qval logical value indicating whether the region FDR estimate  
105 | #' (q-value) should be displayed in the plot title. The value is extracted 
106 | #' from the \code{regions} argument.
107 | #' 
108 | #' @param verbose logical value indicating whether progress messages 
109 | #' should be printed to the screen.
110 | #' 
111 | #' @param testCovariate integer value or vector indicating which of columns of
112 | #'  \code{pData(bs)} contains the covariate of interest. 
113 | #'  This is used to construct the sample labels and colors (unless this is
114 | #'  over-ridden by specifying \code{label}).
115 | #'  
116 | #' @param includeYlab a logical indicating whether to include the Y axis
117 | #'  label 'Methylation' (useful to turn off if combining multiple region
118 | #'  figures and you do not want to include redundant y axis label information)
119 | #'  
120 | #' @param compareTrack a named GRangesList object that contains up to four
121 | #' custom tracks (GRanges objects) which will be plotted below the region.
122 | #' Only one of `compareTrack` or `annoTrack` can be specified since there is
123 | #' only for plotting either the built in GpG category and exon tracks, *or* a
124 | #' custom set of tracks.
125 | #' 
126 | #' @param labelCols a character vector with names of the mcols slot of the 
127 | #' GRanges items in `compareTrack'. Only used if plotting custom
128 | #' tracks using the `compareTrack' argument. If specified, the (first) value
129 | #' in that column is printed along with a label that includes the name of the
130 | #' list item. If NULL (default), just the name of the track is printed.
131 | #' 
132 | #' @param horizLegend logical indicating whether the legend should be 
133 | #' horizontal instead of vertical (default FALSE). This is useful if you need
134 | #' to plot many labels and want to preserve whitespace.
135 | #' 
136 | #' @param addLines logical indicating whether to plot smooth lines between 
137 | #' points. Default is true. Can be useful to turn this off for very small 
138 | #' regions.
139 | #' 
140 | #' @param linesMinCov The minimum coverage a methylation loci need in 
141 | #' order to be used for plotting of smoothed lines. Useful for filtering
142 | #' out low coverage loci. Only used if addLines = TRUE. Default value is 1 
143 | #' (no filtering).
144 | #' 
145 | #' @export
146 | #' 
147 | #' @return None (generates a plot)
148 | #' 
149 | #' @importFrom RColorBrewer brewer.pal
150 | #' @importFrom grDevices hcl rainbow
151 | #' @importFrom graphics arrows
152 | #' 
153 | #' @examples
154 | #' 
155 | #' # load the example data
156 | #' data(BS.chr21)
157 | #' 
158 | #' # load example results (computed with dmrseq function)
159 | #' data(dmrs.ex)
160 | #' 
161 | #' # get annotation information (using getAnnot function)
162 | #' # here we'll load the example annotation from chr21
163 | #' data(annot.chr21)
164 | #' 
165 | #' # plot the 1st DMR
166 | #' plotDMRs(BS.chr21, regions=dmrs.ex[1,], testCovariate=1,
167 | #'    annoTrack=annot.chr21)
168 | #' 
169 | plotDMRs <- function(BSseq, regions = NULL, testCovariate = NULL, 
170 |     extend = (end(regions) - start(regions) + 1)/2, main = "", 
171 |     addRegions = regions, annoTrack = NULL, col = NULL, 
172 |     lty = NULL, lwd = NULL, label = NULL, mainWithWidth = TRUE, 
173 |     regionCol = .alpha("#C77CFF", 
174 |         0.2), addTicks = TRUE, addPoints = TRUE, pointsMinCov = 1, 
175 |     highlightMain = FALSE, 
176 |     qval = TRUE, stat = TRUE, verbose = TRUE, includeYlab = TRUE, 
177 |     compareTrack = NULL, 
178 |     labelCols = NULL, horizLegend = FALSE,
179 |     addLines = TRUE, linesMinCov = 1) {
180 |     # adapted from plotManyRegions from bsseq plot to take 
181 |     # in a vector of qval values
182 |     # (1 per region in regions argument) to be displayed in
183 |     # the plot title.  set
184 |     # addPoints = TRUE to plot individual points sized by coverage
185 |     # and one smooth
186 |     # (loess) line per sample instead of a uniform-sized verbatim 
187 |     # line going through
188 |     # each observation
189 |     if (!addLines && !addPoints)
190 |       stop("At least one of addLines or addPoints must be true")
191 |     if (verbose) 
192 |         message("[plotDMRs] Plotting ", nrow(regions), " DMRs")
193 |     if (!is.null(regions)) {
194 |         if (is(regions, "data.frame")){
195 |             gr <- data.frame2GRanges(regions, keepColumns = FALSE)
196 |         }else{ 
197 |           gr <- regions
198 |         }
199 |         if (!is(gr, "GRanges")) 
200 |             stop("'regions' needs to be either a 'data.frame' ",
201 |                         " or a 'GRanges' ")
202 |     } else {
203 |         gr <- granges(BSseq)
204 |     }
205 |     gr <- resize(gr, width = 2 * extend + width(gr), fix = "center")
206 |     BSseq <- subsetByOverlaps(BSseq, gr)
207 |     
208 |     if (!is.null(annoTrack) && !is.null(compareTrack)) 
209 |         stop("Choose either annoTrack or compareTrack; can't plot both")
210 |     
211 |     if (length(start(BSseq)) == 0) 
212 |         stop("No overlap between BSseq data and regions")
213 |     if (!is.null(main) && length(main) != length(gr)) 
214 |         main <- rep(main, length = length(gr))
215 |     
216 |     if (length(extend) == 1) {
217 |         extend <- rep(extend, length(gr))
218 |     }
219 |     
220 |     if (!is.null(testCovariate)) {
221 |       coeff <- seq(2, (1 + length(testCovariate)))
222 |       testCov <- as.character(pData(BSseq)[, testCovariate])
223 |       if (length(unique(testCov)) > 2 && !is.numeric(testCov) && length(coeff) == 1)
224 |         coeff <- seq(coeff, coeff + length(unique(as.character(testCov))) - 2 )
225 |       
226 |       design <- model.matrix(~testCov)
227 |         
228 |         if (is.null(col) && !("col" %in% names(pData(BSseq)))) {
229 |             cov.unique <- unique(design[, coeff, drop = FALSE])
230 |             ncol <- nrow(cov.unique) 
231 |             
232 |             colors <- gg_color_hue(ncol)
233 |             if (ncol == 2) {
234 |                 colors <- c("mediumblue", "deeppink1")
235 |             }
236 |             colors <- cbind(cov.unique, 
237 |                             colors[rank(as.numeric(rowSums(cov.unique)), 
238 |                                         ties.method = "first")])
239 |             colmat <- colors[, -ncol(colors), drop = FALSE]
240 |             colmat <- apply(colmat, 2, as.numeric)
241 |             z <- colors[,ncol(colors)][
242 |                          match(data.frame(t(design[, coeff, drop = FALSE])), 
243 |                                data.frame(t(colmat)))]
244 |             
245 |             pData(BSseq)$col <- as.character(z)
246 |         }
247 |         
248 |         if (is.null(label)  && !("label" %in% names(pData(BSseq))))  {
249 |             pData(BSseq)$label <- paste0(pData(BSseq)[, testCovariate])
250 |         }
251 |     }
252 |     
253 |     if (!is.null(label) || "label" %in% names(pData(BSseq))) {
254 |         if(!is.null(label)){
255 |           labs <- label
256 |         }else{
257 |           labs <- pData(BSseq)[["label"]]
258 |         }
259 |         if(horizLegend){
260 |           wiggle <- max(nchar(labs)) * 0.4
261 |         }else{
262 |           wiggle <- length(unique(labs)) * 0.9
263 |         }
264 |         opar <- par(mar = c(0, 4.1, 0, wiggle), 
265 |                     oma = c(0, 0, 2.5, 1))
266 |     } else {
267 |         opar <- par(mar = c(0, 4.1, 0, 0), oma = c(0, 0, 2.5, 1))
268 |     }
269 |     on.exit(par(opar))
270 |     
271 |     for (ii in seq(along = gr)) {
272 |         if (verbose && ii%%100 == 0) {
273 |             cat(sprintf("..... Plotting region %d (out of %d)\n", ii, 
274 |                         nrow(regions)))
275 |         }
276 |         
277 |         .plotSingleDMR(BSseq = BSseq, region = regions[ii, ], 
278 |             extend = extend[ii], main = main[ii], col = col, lty = lty, 
279 |             lwd = lwd, label = label, addRegions = addRegions, 
280 |             regionCol = regionCol, mainWithWidth = mainWithWidth, 
281 |             annoTrack = annoTrack, addTicks = addTicks, addPoints = addPoints, 
282 |             pointsMinCov = pointsMinCov, highlightMain = highlightMain, 
283 |             qval = qval, stat = stat, includeYlab = includeYlab, 
284 |             compareTrack = compareTrack, labelCols = labelCols,
285 |             horizLegend = horizLegend, addLines = addLines, linesMinCov = linesMinCov)
286 |     }
287 | }
288 | 
289 | 
290 | 


--------------------------------------------------------------------------------
/R/internal_plotting_functions.R:
--------------------------------------------------------------------------------
  1 | #' @title Add annotations to DMR plots
  2 | #' 
  3 | #' @description Function to add visual representation of CpG categories 
  4 | #' and/or coding 
  5 | #' sequences to DMR plot
  6 | #' 
  7 | #' @details An internal function that takes an annotation 
  8 | #' \code{SimpleGRangesList}
  9 | #' object that
 10 | #' contains CpG category information in the first element (optional) and / or
 11 | #' coding gene sequence information in the second element (optional). If neither
 12 | #' of these are present, then nothing will be plotted.
 13 | #' 
 14 | #' @param gr a \code{GRanges} object that contains the DMRs to be
 15 | #'  plotted
 16 | #' 
 17 | #' @param annoTrack a \code{SimpleGRangesList} object with two elements.
 18 | #' The first contains CpG category information in the first element (optional)
 19 | #' coding gene sequence information in the second element (optional).
 20 | #' At least one of these elements needs to be non-null in order for 
 21 | #' any annotation to be plotted, but it is not necessary to contain
 22 | #' both.
 23 | #' 
 24 | #' @return None
 25 | #' 
 26 | dmrPlotAnnotations <- function(gr, annoTrack) {
 27 |     # Code adapted from bsseq package
 28 |     
 29 |     ## check may need to be modified
 30 |     if (!is(annoTrack, "SimpleGRangesList"))
 31 |         stop("'annoTrack' needs to be a 'SimpleGRangesList'")
 32 |     plot(start(gr), 1, type = "n", xaxt = "n", yaxt = "n", bty = "n", 
 33 |         ylim = c(0, length(annoTrack) + 0.5), xlim = c(start(gr), end(gr)), 
 34 |         xlab = "", ylab = "")
 35 |     
 36 |     # add legend
 37 |     vars <- list(Island = "Island   ", Shore = "Shore   ", Shelf = "Shelf   ", 
 38 |                  OpenSea1 = "Open ", OpenSea2 = "Sea")
 39 |     cols <- c("forestgreen", "goldenrod2", "dodgerblue", "blue3", "blue3")
 40 |     for (i in seq_len(length(vars))) {
 41 |         tmpvars <- vars
 42 |         tmpvars[-i] <- paste("phantom('", tmpvars[-i], "')", sep = "")
 43 |         expr <- paste(tmpvars, collapse = "*")
 44 |         text(start(gr), 1.8, parse(text = expr), col = cols[i], adj = c(0, 1),
 45 |              cex = 0.85)
 46 |     }
 47 |     
 48 |     lapply(seq(along = annoTrack), function(ii) {
 49 |         jj <- length(annoTrack) + 1 - ii
 50 |         ir <- subsetByOverlaps(annoTrack[[ii]], gr)
 51 |         start(ir) <- pmax(start(ir), start(gr))
 52 |         end(ir) <- pmin(end(ir), end(gr))
 53 |         
 54 |         if (ii == 2) {
 55 |             jj <- jj - 0.15
 56 |         }
 57 |         
 58 |         if (length(ir) > 0) {
 59 |             if (ii == 2) {
 60 |                 colourCount <- length(unique(ir$symbol))
 61 |                 getPalette <- colorRampPalette(.alpha(brewer.pal(max(
 62 |                   length(unique(ir$symbol)), 
 63 |                   3), "Dark2"), 0.4))
 64 |                 color.pal <- getPalette(colourCount)
 65 |                 names(color.pal) <- unique(ir$symbol)
 66 |                 map <- match(ir$symbol, names(color.pal))
 67 |                 color <- color.pal[map]
 68 |                 
 69 |                 bord <- "black"
 70 |             } else if (ii == 1) {
 71 |                 color <- ir$type
 72 |                 colvec <- c("blue3", "dodgerblue", "goldenrod2", "forestgreen")
 73 |                 names(colvec) <- c("inter", "shelves", "shores", "islands")
 74 |                 
 75 |                 for (ucol in unique(color)){
 76 |                   ix <- agrep(ucol, names(colvec))
 77 |                   if (length(ix) > 1)
 78 |                     stop("Ambiguous CpG annotation labels")
 79 |                   color[color == ucol] <- colvec[ix]
 80 |                 }
 81 |                 bord <- color
 82 |                 
 83 |                 rect(start(ir), jj - 0.06, end(ir), jj + 0.17, col = color, 
 84 |                      border = bord)
 85 |             }
 86 |             
 87 |             if (ii == 2) {
 88 |                 lastPos <- rep(NA, length(unique(ir$symbol)) - 1)
 89 |                 used <- NULL
 90 |                 for (k in seq_len(length(unique(ir$symbol)))) {
 91 |                   irk <- ir[ir$symbol == unique(ir$symbol)[k], ]
 92 |                   rect(min(start(irk)), jj - 0.065, max(end(irk)), jj + 0.065, 
 93 |                        col = .alpha("black", 
 94 |                     0.1), border = .alpha("black", 0.1))
 95 |                   if (!(unique(ir$symbol)[k] %in% used)) {
 96 |                     rwidth <- end(gr) - start(gr)
 97 |                     sg <- pmin(start(irk), end(irk))
 98 |                     eg <- pmax(start(irk), end(irk))
 99 |                     gwidth <- min(max(eg), end(gr)) - max(min(sg), start(gr))
100 |                     textPos <- max(min(sg), start(gr)) + gwidth/2
101 |                     jj.orig <- jj
102 |                     if (sum(!is.na(lastPos)) > 0) {
103 |                       separation <- (textPos - lastPos[k - 1])/rwidth
104 |                       if (abs(separation) <= 0.2 && k < 3) {
105 |                         jj <- jj - 0.29
106 |                       } else {
107 |                         separation <- min(abs((textPos - lastPos)/rwidth), 
108 |                                           na.rm = TRUE)
109 |                         if (abs(separation) <= 0.2) {
110 |                           jj <- jj - 0.29
111 |                         }
112 |                       }
113 |                     }
114 |                     lastPos[k] <- textPos
115 |                     text(textPos, jj - 0.375, labels = unique(irk$symbol), 
116 |                          cex = 0.85, 
117 |                       col = unique(color)[k])
118 |                     jj <- jj.orig
119 |                     used <- c(used, unique(ir$symbol)[k])
120 |                   }
121 |                   rect(sg, jj - 0.11, eg, jj + 0.12, col = unique(color)[k], 
122 |                        border = bord)
123 |                 }
124 |             }
125 |             
126 |         }
127 |         mtext(names(annoTrack)[ii], side = 2, at = jj, las = 1, line = 1)
128 |     })
129 |     
130 | }
131 | 
132 | .isColor <- function(x)
133 | {
134 |   res <- try(col2rgb(x),silent=TRUE)
135 |   return(!"try-error"%in%class(res))
136 | }
137 | 
138 | .dmrGetMeta <- function(object, col, lty, lwd, label) {
139 |     ## Assumes that object has pData and sampleNames methods Code adapted from 
140 |     ## bsseq package
141 |     
142 |     ## extract col
143 |     if (is.null(col)) {
144 |         if ("col" %in% names(pData(object))) 
145 |             col <- pData(object)[["col"]] else col <- rep("black", 
146 |                                                           nrow(pData(object)))
147 |     }else if (length(col) == 1){
148 |       if (col %in% names(pData(object)))
149 |         col <- pData(object)[[col]] else col <- rep("black", ncol(object))
150 |       if (!.isColor(col))
151 |         col <- rainbow(length(unique(col)))[as.numeric(as.factor(col))]
152 |     }
153 |     if (length(col) != ncol(object)) 
154 |         col <- rep(col, length.out = ncol(object))
155 |     if (is.null(names(col))) 
156 |         names(col) <- sampleNames(object)
157 |     
158 |     ## extract lty
159 |     if (is.null(lty)) {
160 |       if ("lty" %in% names(pData(object)))
161 |         lty <- pData(object)[["lty"]] else lty <- rep(1, ncol(object))
162 |     }else if (length(lty) == 1){
163 |       if (lty %in% names(pData(object)))
164 |         lty <- pData(object)[[lty]] else lty <- rep(1, ncol(object))
165 |       if (!is.numeric(lty))
166 |         lty <- as.numeric(as.factor(lty))
167 |     }
168 |     if (length(lty) != ncol(object)) 
169 |         lty <- rep(lty, length.out = ncol(object))
170 |     if (is.null(names(lty))) 
171 |         names(lty) <- sampleNames(object)
172 |     
173 |     # extract lwd
174 |     if (is.null(lwd)) {
175 |         if ("lwd" %in% names(pData(object))) 
176 |             lwd <- pData(object)[["lwd"]] else lwd <- rep(1.5, 
177 |                                                           nrow(pData(object)))
178 |     }else if (length(lwd) == 1){
179 |       if (lwd %in% names(pData(object)))
180 |         lwd <- pData(object)[[lwd]] else lwd <- rep(1, ncol(object))
181 |       if (!is.numeric(lwd))
182 |         lwd <- as.numeric(as.factor(lwd))
183 |     }
184 |     if (length(lwd) != ncol(object)) 
185 |         lwd <- rep(lwd, length.out = ncol(object))
186 |     if (is.null(names(lwd))) 
187 |         names(lwd) <- sampleNames(object)
188 |     
189 |     ## extract label
190 |     if (is.null(label)) {
191 |         if ("label" %in% names(pData(object))) 
192 |             label <- pData(object)[["label"]] else label <- rep(NA, 
193 |                                                                 ncol(object))
194 |     }else if (length(label) == 1){
195 |       if (label %in% names(pData(object)))
196 |         label <- pData(object)[[label]] else label <- rep(NA, ncol(object))
197 |         if (!is.character(label))
198 |           label <- as.character(label)
199 |     }
200 |     if (length(label) != ncol(object)) 
201 |         label <- rep(label, length.out = ncol(object))
202 |     if (is.null(names(label))) 
203 |         names(label) <- sampleNames(object)
204 |     
205 |     return(list(col = col, lty = lty, lwd = lwd, label = label))
206 | }
207 | 
208 | 
209 | .dmrPlotTitle <- function(gr, extend, main, mainWithWidth, 
210 |                           qval = NULL, stat = NULL) {
211 |     # this function creates the main title for DMR plots Code adapted from bsseq
212 |     # package
213 |     if (is.data.frame(gr)) 
214 |         gr <- data.frame2GRanges(gr)
215 |     if (length(gr) > 1) {
216 |         warning("plotTitle: gr has more than one element")
217 |         gr <- gr[1]
218 |     }
219 |     plotChr <- as.character(seqnames(gr))
220 |     plotRange <- c(start(gr), end(gr))
221 |     regionCoord <- sprintf("%s: %s - %s", plotChr, format(plotRange[1], 
222 |                                                           big.mark = ",", 
223 |         scientific = FALSE), format(plotRange[2], big.mark = ",", 
224 |                                     scientific = FALSE))
225 |     if (mainWithWidth) {
226 |         regionWidth <- sprintf("width = %s", format(width(gr), 
227 |                                                     big.mark = ",",
228 |                                                     scientific = FALSE))
229 |         # add optional labels to plot titles
230 |         if (!is.null(qval) && !is.null(stat)) {
231 |             regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
232 |                                                      scientific = FALSE))
233 |             regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",",
234 |                                                    scientific = FALSE))
235 |             regionCoord <- sprintf(paste0("%s (%s)\n%s, %s"), regionCoord,
236 |                                    regionWidth, regionStat, regionFDR)
237 |         } else if (!is.null(stat)) {
238 |             regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
239 |                                                      scientific = FALSE))
240 |             regionCoord <- sprintf(paste0("%s (%s)\n%s"), regionCoord,
241 |                                    regionWidth, regionStat)
242 |         } else if (!is.null(qval)) {
243 |             regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",", 
244 |                                                    scientific = FALSE))
245 |             regionCoord <- sprintf(paste0("%s (%s)\n%s"), regionCoord, 
246 |                                    regionWidth, regionFDR)
247 |         } else {
248 |             regionCoord <- sprintf("%s (%s)", regionCoord, regionWidth)
249 |         }
250 |     } else {
251 |         # add optional labels to plot titles
252 |         if (!is.null(qval) && !is.null(stat)) {
253 |             regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
254 |                                                      scientific = FALSE))
255 |             regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",", 
256 |                                                    scientific = FALSE))
257 |             regionCoord <- sprintf(paste0("%s\n%s, %s"), regionCoord, 
258 |                                    regionStat, regionFDR)
259 |         } else if (!is.null(stat)) {
260 |             regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
261 |                                                      scientific = FALSE))
262 |             regionCoord <- sprintf(paste0("%s\n%s"), regionCoord, regionStat)
263 |         } else if (!is.null(qval)) {
264 |             regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",",
265 |                                                    scientific = FALSE))
266 |             regionCoord <- sprintf(paste0("%s\n%s"), regionCoord, regionFDR)
267 |         } else {
268 |             regionCoord <- sprintf("%s", regionCoord)
269 |         }
270 |     }
271 |     if (main != "") {
272 |         main <- sprintf("%s\n%s", main, regionCoord)
273 |     } else {
274 |         main <- regionCoord
275 |     }
276 |     main
277 | }
278 | 
279 | .dmrPlotLegend <- function(plotRange, col, label, horizLegend) {
280 |     # this function plots a legend to the right of the plot to indicate
281 |     # whichcolor corresponds to which samples
282 |     
283 |     numUnique <- length(unique(paste0(col, label, sep = "")))
284 |     if (numUnique < length(col)) {
285 |         col <- unique(col)
286 |         label <- unique(label)
287 |     }
288 |     
289 |     if (!horizLegend){
290 |       for (lg in seq_len(length(label))) {
291 |          mtext(label[lg], side = 4, line = lg - 1, col = .darken(col[lg]), 
292 |                cex = 0.9, las = 0)
293 |       }
294 |     }else{
295 |       for (lg in seq_len(length(label))) {
296 |         mtext(label[lg], side = 4, line = 0.5, col = .darken(col[lg]), 
297 |               cex = 0.9, las = 1, at = 1.02 - 0.08*(lg-1))
298 |       }
299 |     }
300 | }
301 | 
302 | 
303 | # function to transform a given color specified by a character object to a
304 | # transparent version of that color
305 | .makeTransparent <- function(someColor, alpha = 130) {
306 |     newColor <- col2rgb(someColor)
307 |     apply(newColor, 2, function(curcoldata) {
308 |         rgb(red = curcoldata[1], green = curcoldata[2], blue = curcoldata[3], 
309 |             alpha = alpha, maxColorValue = 255)
310 |     })
311 | }
312 | 
313 | .alpha <- function(col, alpha = 1) {
314 |     if (missing(col)) 
315 |         stop("Please provide a vector of colours.")
316 |     apply(vapply(col, grDevices::col2rgb, matrix(rep(0,3)))/255, 2, 
317 |           function(x) grDevices::rgb(x[1], x[2], x[3], alpha = alpha))
318 | }
319 | 
320 | 
321 | .dmrPlotPoints <- function(x, y, z, col, pointsMinCov, maxCov, 
322 |                            regionWidth) {
323 |     # modified from .bsPlotPoints in bsseq added functionality for point size 
324 |     # to vary with coverage
325 |     
326 |     lwd <- 1.5
327 |     # make color of points semi-transparent so that overlapping points can 
328 |     # still be seen
329 |     col.points <- .makeTransparent(col)
330 |     
331 |     # if there are a lot of CpGs to plot (the case for a block-level analysis
332 |     # decrease the size of the plotted points since these can get very crowded
333 |     c1 <- pmax(-0.25 * atan(3 * (length(x) - 80)/80 * pi)/atan(3 * 
334 |                (80)/80 * pi) +  0.75, 0.5)
335 |     ptSize <- c1 * (sqrt(z)/sqrt(maxCov) + 0.25)
336 |     
337 |     points(x[z >= pointsMinCov], y[z >= pointsMinCov], col = col.points, pch = 16,
338 |         cex = ptSize)
339 | }
340 | 
341 | .darken <- function(color, factor = 1.4) {
342 |     col <- col2rgb(color)
343 |     col <- col/factor
344 |     col <- rgb(t(col), maxColorValue = 255)
345 |     col
346 | }
347 | 
348 | 
349 | 
350 | .dmrPlotLines <- function(x, y, z, col, lwd, linesMinCov, maxCov, 
351 |                           regionWidth, lty) {
352 |   
353 |   if (length(x) > 100 && !is.null(lwd)) {
354 |     lwd <- lwd + 1
355 |   } else if (length(x) > 100) {
356 |     lwd <- 2
357 |   }
358 |   
359 |   spn <- max(1 - (1/160)*sum(z >= linesMinCov), 0.75)
360 | 
361 |   y[y==1] <- 0.99
362 |   y[y==0] <- 0.01
363 |   logit <- function(p){ log(p/(1-p))}
364 |   inv.logit <- function(l){ exp(l) / (1 + exp(l)) }
365 |   
366 |   # don't interpolate smooth lines if there are fewer than 10 cpgs
367 |   if (length(x) >= 10) {
368 |     loess_fit <- loess(logit(y[z >= linesMinCov]) ~ x[z >= linesMinCov],
369 |                        weights = z[z >= linesMinCov], span = spn)
370 |     
371 |     xl <- seq(min(x[z >= linesMinCov], na.rm=TRUE), 
372 |               max(x[z >= linesMinCov], na.rm=TRUE), 
373 |               (max(x[z >= linesMinCov], na.rm=TRUE) - 
374 |                min(x[z >= linesMinCov], na.rm=TRUE))/500)
375 |     lines(xl, inv.logit(predict(loess_fit,xl)), 
376 |           col = .makeTransparent(.darken(col), 175), lwd = lwd,
377 |           lty = lty)
378 |     
379 |   }else{
380 |     lines(x[z >= linesMinCov], y[z >= linesMinCov], 
381 |           col = .makeTransparent(.darken(col), 175), lwd = lwd,
382 |           lty = lty)
383 |   }
384 | }
385 | 
386 | .dmrPlotSmoothData <- function(BSseq, region, extend, addRegions, col, lty, lwd,
387 |     label, regionCol, addTicks, addPoints, pointsMinCov, highlightMain, 
388 |     includeYlab = TRUE, horizLegend, addLines=TRUE, linesMinCov) {
389 |     # modified from .plotSmoothData in bsseq to allow non-smoothed regions
390 |     
391 |     gr <- bsseq.bsGetGr(BSseq, region, extend)
392 |     BSseq <- subsetByOverlaps(BSseq, gr)
393 |     BSseq2 <- subsetByOverlaps(BSseq, bsseq.bsGetGr(BSseq, region, extend = 0))
394 |     ## Extract basic information
395 |     sampleNames <- sampleNames(BSseq)
396 |     names(sampleNames) <- sampleNames
397 |     positions <- start(BSseq)
398 |     positions2 <- start(BSseq2)
399 |     rawPs <- as.matrix(bsseq::getMeth(BSseq, type = "raw"))
400 |     coverage <- as.matrix(bsseq::getCoverage(BSseq))
401 |     
402 |     ## get col, lwd, lty these are extracted from the pData data.frame that is 
403 |     ## part of the bsseq object colEtc is a list object that contains col, lty,
404 |     ## lwd and label which are used as plotting parameters label is a condition
405 |     ##label to use for adding a legend with condition names
406 |     
407 |     colEtc <- .dmrGetMeta(object = BSseq, col = col, lty = lty, lwd = lwd, 
408 |                           label = label)
409 |     
410 |     if (includeYlab) {
411 |         yl <- "Methylation"
412 |     } else {
413 |         yl <- ""
414 |     }
415 |     
416 |     ## The actual plotting starts here
417 |     plot(positions[1], 0.5, type = "n", xaxt = "n", yaxt = "n", ylim = c(0, 1),
418 |          xlim = c(start(gr), 
419 |         end(gr)), xlab = "", ylab = yl)
420 |     axis(side = 2, at = c(0, 0.25, 0.5, 0.75, 1), las = 1)
421 |     if (addTicks) 
422 |         rug(positions)
423 |     
424 |     if (is.list(addRegions) && !is.data.frame(addRegions)) {
425 |         if (length(addRegions) > 2) {
426 |             stop("Only two sets of regions can be highlighted")
427 |         }
428 |         if (length(regionCol) == 1) {
429 |             regionCol <- c(regionCol, .alpha("blue", 0.2))
430 |         }
431 |         bsseq.bsHighlightRegions(regions = addRegions[[1]], gr = gr, ylim = c(0,
432 |             1), regionCol = regionCol[1], highlightMain = highlightMain)
433 |         bsseq.bsHighlightRegions(regions = addRegions[[2]], gr = gr, ylim = c(0,
434 |             1), regionCol = regionCol[2], highlightMain = highlightMain)
435 |         
436 |     } else {
437 |         bsseq.bsHighlightRegions(regions = addRegions, gr = gr, ylim = c(0, 1), 
438 |                                  regionCol = regionCol, 
439 |             highlightMain = highlightMain)
440 |     }
441 |     
442 |     # add points first to avoid lines getting hidden by plotting many cpg points
443 |     if (addPoints) {
444 |         for(sampIdx in seq_len(ncol(BSseq))){
445 |           .dmrPlotPoints(positions, rawPs[, sampIdx], coverage[, sampIdx], 
446 |                          col = colEtc$col[sampIdx], 
447 |                          pointsMinCov = pointsMinCov, 
448 |                          maxCov = quantile(coverage, 0.95), 
449 |                          regionWidth = end(gr) - 
450 |                            start(gr))
451 |         }
452 |     }   
453 |     
454 |     if (addLines){
455 |        for(sampIdx in seq_len(ncol(BSseq))){
456 |             if (sum(coverage[, sampIdx] >= linesMinCov) > 1){
457 |               .dmrPlotLines(positions, rawPs[, sampIdx], coverage[, sampIdx], 
458 |                         col = colEtc$col[sampIdx], 
459 |                         lwd = colEtc$lwd[sampIdx],
460 |                         linesMinCov = linesMinCov, 
461 |                         maxCov = quantile(coverage, 0.95), 
462 |                         regionWidth = end(gr) - 
463 |                           start(gr),
464 |                         lty = colEtc$lty[sampIdx])
465 |             }
466 |         }
467 |     }
468 |     
469 |     # if colEtc$label contains characters that are not null or missing, then 
470 |     # create a legend which houses the labels as well as the colors that 
471 |     # correspond to them -> pass in both colEtc$label as well as colEtc$col
472 |     if (sum(!is.na(colEtc$label)) == length(colEtc$label)) {
473 |         .dmrPlotLegend(plotRange = c(start(gr), end(gr)), 
474 |                        colEtc$col, colEtc$label, horizLegend)
475 |     }
476 |     
477 | }
478 | 
479 | # function doesn't need to be exported; not a user-level function since a single
480 | # DMR can be plotted just fine with plotDMRs.
481 | .plotSingleDMR <- function(BSseq, region = NULL, extend = 0, main = "", 
482 |     addRegions = NULL, annoTrack = NULL, col = NULL, lty = NULL, lwd = NULL, 
483 |     label = NULL, mainWithWidth = TRUE, regionCol = .alpha("orchid1", 0.2), 
484 |     addTicks = TRUE, addPoints = FALSE, pointsMinCov = 5, highlightMain = FALSE,
485 |     qval = NULL, stat = NULL, includeYlab = TRUE, compareTrack = NULL, 
486 |     labelCols = NULL, horizLegend = FALSE, addLines = TRUE, linesMinCov = 1) {
487 |     
488 |     if(!is.null(annoTrack) || !is.null(compareTrack)){
489 |       layout(matrix(seq_len(2), ncol = 1), heights = c(2, 1.5))
490 |     }else{
491 |       layout(matrix(seq_len(2), ncol = 1), heights = c(2, 0.2))
492 |     }
493 |     .dmrPlotSmoothData(BSseq = BSseq, region = region, extend = extend, 
494 |         addRegions = addRegions, 
495 |         col = col, lty = lty, lwd = lwd, label = label, 
496 |         regionCol = regionCol, addTicks = addTicks, 
497 |         addPoints = addPoints, pointsMinCov = pointsMinCov, 
498 |         highlightMain = highlightMain, 
499 |         includeYlab = includeYlab, 
500 |         horizLegend = horizLegend, 
501 |         addLines = addLines,
502 |         linesMinCov = linesMinCov)
503 |     gr <- bsseq.bsGetGr(BSseq, region, extend)
504 |     
505 |     if (!is.null(main)) {
506 |         if (qval && stat) {
507 |             qval <- round(region$qval, 4)
508 |             stat <- round(region$stat, 3)
509 |             main <- .dmrPlotTitle(gr = region, extend = extend, main = main, 
510 |                                   mainWithWidth = mainWithWidth, 
511 |                                   qval = qval, stat = stat)
512 |         } else if (stat) {
513 |             stat <- round(region$stat, 3)
514 |             main <- .dmrPlotTitle(gr = region, extend = extend, main = main, 
515 |                                   mainWithWidth = mainWithWidth, stat = stat)
516 |         } else if (qval) {
517 |             qval <- round(region$qval, 4)
518 |             main <- .dmrPlotTitle(gr = region, extend = extend, main = main,
519 |                                   mainWithWidth = mainWithWidth, qval = qval)
520 |         } else {
521 |             main <- .dmrPlotTitle(gr = region, extend = extend, main = main,
522 |                                   mainWithWidth = mainWithWidth)
523 |         }
524 |         mtext(side = 3, text = main, outer = FALSE, cex = 0.8, line = 0)
525 |     }
526 |     
527 |     if (!is.null(annoTrack)) {
528 |         dmrPlotAnnotations(gr, annoTrack)
529 |     } else if (!is.null(compareTrack)) {
530 |         dmrPlotComparisons(gr, compareTrack, labelCols = labelCols)
531 |     }
532 | }
533 | 
534 | # pasting bsseq's .bsGetGr function since not exported
535 | bsseq.bsGetGr <- function(object, region, extend) {
536 |     if (is.null(region)) {
537 |         gr <- GRanges(seqnames = seqnames(object)[1], 
538 |                       ranges = IRanges(start = min(start(object)), 
539 |             end = max(start(object))))
540 |     } else {
541 |         if (is(region, "data.frame")){
542 |               gr <- data.frame2GRanges(region, keepColumns = FALSE)
543 |             }else{ 
544 |               gr <- region
545 |             }
546 |         if (!is(gr, "GRanges") || length(gr) != 1) 
547 |             stop("'region' needs to be either a 'data.frame' ",
548 |                 "(with a single row) or a 'GRanges' (with a single element)")
549 |         gr <- resize(gr, width = 2 * extend + width(gr), fix = "center")
550 |     }
551 |     gr
552 | }
553 | 
554 | # pasting bsseq's .bsHighlightRegions function since not exported
555 | bsseq.bsHighlightRegions <- function(regions, gr, ylim, regionCol, 
556 |                                      highlightMain) {
557 |     if (is.data.frame(regions)) 
558 |         regions <- data.frame2GRanges(regions)
559 |     if (highlightMain) 
560 |         regions <- c(regions, gr)
561 |     if (is.null(regions)) 
562 |         return(NULL)
563 |     regions <- subsetByOverlaps(regions, gr)
564 |     regions <- pintersect(regions, rep(gr, length(regions)))
565 |     if (length(regions) == 0) 
566 |         return(NULL)
567 |     rect(xleft = start(regions), xright = end(regions), ybottom = ylim[1], 
568 |          ytop = ylim[2], col = regionCol, border = NA)
569 | }
570 | 
571 | gg_color_hue <- function(n) {
572 |     hues <- seq(15, 375, length = n + 1)
573 |     hcl(h = hues, l = 65, c = 100)[seq_len(n)]
574 | }
575 | 
576 | # function to draw nonoverlapping comparison regions (up to 3) below the main
577 | # region plot (instead of annotations)
578 | dmrPlotComparisons <- function(gr, annoTrack, labelCols = NULL) {
579 |     
580 |     if (!is(annoTrack, "SimpleGRangesList"))
581 |       stop("'annoTrack' needs to be a 'SimpleGRangesList'")
582 |   
583 |     if (length(annoTrack) > 4) 
584 |         stop("Can't plot more than 4 tracks")
585 |     
586 |     plot(start(gr), 1, type = "n", xaxt = "n", yaxt = "n", bty = "n", 
587 |          ylim = c(0, length(annoTrack) + 0.5), xlim = c(start(gr), end(gr)),
588 |          xlab = "", ylab = "")
589 |     
590 |     colourCount <- length(annoTrack)
591 |     getPalette <- colorRampPalette(.alpha(brewer.pal(max(length(annoTrack), 3),
592 |                                                      "Dark2"), 
593 |         0.4))
594 |     color <- getPalette(colourCount)
595 |     bord <- "black"
596 |     
597 |     lapply(seq(along = annoTrack), function(ii) {
598 |         jj <- length(annoTrack) + 1 - ii
599 |         ir <- subsetByOverlaps(annoTrack[[ii]], gr)
600 |         start(ir) <- pmax(start(ir), start(gr))
601 |         end(ir) <- pmin(end(ir), end(gr))
602 |         
603 |         top.pos <- 4.1 - (4 - length(annoTrack))
604 |         bot.pos <- 0 + (4 - length(annoTrack)) * 0.1
605 |         jj.between <- (top.pos - bot.pos)/length(annoTrack)
606 |         
607 |         if (length(ir) > 0) {
608 |             jj <- top.pos - (ii - 1) * jj.between
609 |             
610 |             arrows(start(ir), jj, end(ir), jj, code = 3, length = 0.05, 
611 |                    angle = 90, col = .makeTransparent(color[ii], alpha = 185))
612 |             if (is.null(labelCols)) {
613 |                 text((end(gr) + start(gr))/2, jj - 0.4, names(annoTrack)[[ii]])
614 |             }else if(sum(labelCols %in% colnames(mcols(annoTrack[[ii]]))) > 0) {
615 |                 whichLabelCols <- match(labelCols, 
616 |                                         colnames(mcols(annoTrack[[ii]])))
617 |                 notmiss <- !is.na(whichLabelCols)
618 |                 whichLabelCols <- whichLabelCols[notmiss]
619 |                 if (class(unlist(mcols(annoTrack[[ii]])[1, 
620 |                                             whichLabelCols, drop = TRUE])) == 
621 |                   "numeric") {
622 |                   comps <- round(unlist(mcols(ir)[1, 
623 |                                             whichLabelCols, drop = TRUE]), 
624 |                     3)
625 |                 } else {
626 |                   comps <- unlist(mcols(ir)[1, whichLabelCols, drop = TRUE])
627 |                 }
628 |                 if (sum(grepl("\\.", labelCols)) > 0) {
629 |                   labelCols <- gsub("\\.", " ", labelCols)
630 |                 }
631 |                 Label <- paste0(names(annoTrack)[ii], ": ", 
632 |                                 paste0(labelCols[notmiss], 
633 |                   "=", comps, collapse = ", "))
634 |                 text((end(gr) + start(gr))/2, jj - 0.4, Label, 
635 |                      cex = 0.85, col = color[ii])
636 |             } else {
637 |                 text((end(gr) + start(gr))/2, jj - 0.4, names(annoTrack)[[ii]])
638 |             }
639 |         }
640 |     })
641 | }
642 | 


--------------------------------------------------------------------------------
/vignettes/dmrseq.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Analyzing Bisulfite-seq data with dmrseq"
  3 | author: "Keegan Korthauer"
  4 | date: "`r BiocStyle::doc_date()`"
  5 | package: "`r BiocStyle::pkg_ver('dmrseq')`"
  6 | bibliography: dmrseqBib.bib
  7 | abstract: >
  8 |   A basic task in the analysis of count data from Whole Genome 
  9 |   Bisulfite-Sequencing is the detection of differentially methylated regions. 
 10 |   The count data consist of, for each sample, the number of methylated 
 11 |   reads and the total number of reads covering CpG. 
 12 |   An important analysis question is to detect regions (collections of 
 13 |   neighboring CpGs) with systematic differences between conditions, 
 14 |   as compared to within-condition variability. These so-called Differentially
 15 |   Methylated Regions (DMRs) are thought to be more informative than single CpGs 
 16 |   in terms of of biological function. Although several methods exist
 17 |   to quantify and perform statistical inference on changes at the individual
 18 |   CpG level, detection of DMRs is still limited to aggregating signifiant 
 19 |   CpGs without proper inference at the region level. The package **dmrseq**
 20 |   addresses this gap by providing a rigorous permutation-based approach to
 21 |   detect and perform inference for differential methylation by use of 
 22 |   generalized least squares models that account for inter-individual and 
 23 |   inter-CpG variability to generate region-level statistics that can be
 24 |   comparable across the genome. This allows the framework to perform well even
 25 |   on samples as small as two per group. This vignette explains the
 26 |   use of the package and demonstrates typical workflows. This vignette was 
 27 |   generated with dmrseq package version `r packageVersion("dmrseq")`
 28 | output:
 29 |   BiocStyle::html_document:
 30 |     highlight: pygments
 31 |     toc_float: true
 32 |     fig_width: 5
 33 | vignette: >
 34 |   %\VignetteIndexEntry{Analyzing Bisulfite-seq data with dmrseq}
 35 |   %\VignetteEngine{knitr::rmarkdown}
 36 |   %\VignetteEncoding[utf8]{inputenc}
 37 | ---
 38 | 
 39 | 
 40 | <!-- This is the source document -->
 41 | 
 42 | 
 43 | ```{r setup, echo=FALSE, results="hide"}
 44 | knitr::opts_chunk$set(tidy=FALSE, cache=FALSE,
 45 |     dev="png", 
 46 |     message=FALSE, error=FALSE, warning=TRUE)
 47 | ```
 48 | 
 49 | # Quick start
 50 | 
 51 | **If you use dmrseq in published research, please cite:**
 52 | 
 53 | > Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R.A.
 54 | > Detection and accurate False Discovery Rate control of differentially 
 55 | methylated regions from Whole Genome Bisulfite Sequencing
 56 | > *Biostatistics*, 2018 (in press).
 57 | 
 58 | This package builds upon the 
 59 | [bsseq](http://bioconductor.org/packages/bsseq) package [@Hansen2012], 
 60 | which provides efficient storage and manipulation of bisulfite 
 61 | sequencing data and inference for differentially methylated CpGs. 
 62 | The main goal of **dmrseq** [@Korthauer183210]
 63 | is to provide inference for differentially methylated *regions*, or 
 64 | groups of CpGs.
 65 | 
 66 | Here we show the most basic steps for a differential methylation 
 67 | analysis. There are a variety of steps upstream of **dmrseq** that result
 68 | in the generation of counts of methylated reads and total reads covering each
 69 | CpG for each sample, including mapping of sequencing reads to a reference
 70 | genome with and without bisulfite conversion. You can use the software
 71 | of your preference for this step (one option is 
 72 | [Bismark](https://www.bioinformatics.babraham.ac.uk/projects/bismark/)), as
 73 | long as you are able to obtain counts of methylation and coverage (as 
 74 | opposed to solely methylation proportions, as discussed below).
 75 | 
 76 | This package uses a specific data structure to store and manipulate 
 77 | bisulfite sequencing data introduced by the **bsseq** package. This data 
 78 | structure is a *class* called `BSseq`. Objects of the class `BSseq` contain
 79 | all pertinent information for a bisulfite sequencing experiment, including 
 80 | the number of reads corresponding to methylation, and the total number 
 81 | of reads at each
 82 | CpG site, the location of each CpG site, and experimental metadata on the
 83 | samples. Note that here we focus on CpG methylation, since this is the 
 84 | most common form of methylation in humans and many other organisms; take
 85 | care when applying this method to other types of methylation and make sure 
 86 | that it will
 87 | be able to scale to the number of methylation sites, and that similar 
 88 | assumptions can be made regarding spatial correlation. Also note that 
 89 | the default settings for smoothing parameters and spacing/gap parameters
 90 | are set to values that we found useful, but may need to be altered for 
 91 | datasets for other organisms.
 92 | 
 93 | To store your data in a `BSseq` object, make sure you have the following
 94 | neccessary components:
 95 | 
 96 | 1. genomic positions, including chromosome and location, for methylation loci.
 97 | 
 98 | 2. a (matrix) of M (Methylation) values, describing the number of reads 
 99 | supporting methylation covering a single loci. 
100 | Each row in this matrix is a methylation loci and each column is a sample.
101 | 
102 | 3. a (matrix) of Cov (Coverage) values, 
103 | describing the total number of reads covering a single loci.
104 | Each row in this matrix is a methylation loci and each column is a sample.
105 | 
106 | The following code chunk assumes that `chr` and `pos` are vectors of 
107 | chromosome names and positions, respectively, for each CpG in the dataset. 
108 | You can also provide a `GRanges` object instead of `chr` and `pos`. It 
109 | also assumes that the matrices of methylation and coverage values (described
110 | above) are named `M` and `Cov`, respectively. Note, `M` and `Cov` can also
111 | be data stored on-disk (not in memory) using HDF5 files with the `HDF5Array` 
112 | package or `DelayedMatrix` with the `DelayedArray` package. 
113 | 
114 | The `sampleNames` and `trt` objects are 
115 | vectors with sample labels and condition labels for each sample. A condition
116 | label could be something like 
117 | treatment or control, a tissue type, or a continous measurement.
118 | This is the covariate for which you wish to test for differences in 
119 | methylation. Once the `BSseq` object is constructed and the sample covariate
120 | information is added, DMRs are obtained by running the `dmrseq` function.
121 | A continuous covariate is assumed if the data type of the `testCovariate` 
122 | arugment in `dmrseq` is 
123 | continuous, with the exception of if there are only two unique values 
124 | (then a two group comparison is carried out).
125 | 
126 | ```{r quickStart, eval=FALSE}
127 | bs <- BSseq(chr = chr, pos = pos,
128 |             M = M, Cov = Cov, 
129 |             sampleNames = sampleNames)
130 | pData(bs)$Condition <- trt
131 | 
132 | regions <- dmrseq(bs=bs, testCovariate="Condition")
133 | ```
134 | 
135 | For more information on constructing and manipulating `BSseq` objects, 
136 | see the [bsseq](http://bioconductor.org/packages/bsseq) vignettes. 
137 | 
138 | * If you used *Bismark* to align your bisulfite sequencing data,
139 |   you can use the `read.bismark` function to read bismark files
140 |   into `BSseq` objects. See below for more details.
141 | 
142 | # How to get help for dmrseq
143 | 
144 | Please post **dmrseq** questions to the 
145 | **Bioconductor support site**, which serves as a searchable knowledge
146 | base of questions and answers:
147 | 
148 | <https://support.bioconductor.org>
149 | 
150 | Posting a question and tagging with "dmrseq" will automatically send
151 | an alert to the package authors to respond on the support site.  See
152 | the first question in the list of [Frequently Asked Questions](#FAQ)
153 | (FAQ) for information about how to construct an informative post. 
154 | 
155 | # Input data
156 | 
157 | ## Why counts instead of methylation proportions?
158 | 
159 | As input, the **dmrseq** package expects count data as obtained, e.g.,
160 | from Bisulfite-sequencing. The value in the *i*-th row and the *j*-th column of
161 | the `M` matrix tells how many methylated reads can be assigned to CpG *i* 
162 | in sample *j*. Likewise, the value in the *i*-th row and the *j*-th column of
163 | the `Cov` matrix tells how many total reads can be assigned to CpG *i* 
164 | in sample *j*. Although we might be tempted to combine these matrices into
165 | one matrix that contains the methylation *proportion* (`M`/`Cov`) at each CpG
166 | site, it is critical to notice that this would be throwing away a lot of
167 | information. For example, some sites have much higher coverage than others,
168 | and naturally, we have more confidence in those with many reads mapping to them.
169 | If we only kept the proportions, a CpG with 2 out of 2 reads methylated would
170 | be treated the same as a CpG with 30 out of 30 reads methylated.
171 | 
172 | ## How many samples do I need?
173 | 
174 | To use **dmrseq**, you need to have at least 2 samples in each condition. 
175 | Without this replicates, it is impossible to distinguish between biological
176 | variability due to condition/covariate of interest, and inter-individual 
177 | variability within condition.
178 | 
179 | If your experiment contains additional samples, perhaps from other conditions
180 | that are not of interest in the current test, these should be filtered out 
181 | prior to running **dmrseq**. Rather than creating a new filtered object,
182 | the filtering step can be included in the call to the main function `dmrseq`.
183 | For more details, see the 
184 | [Filtering CpGs and samples Section](#filtering-cpgs-and-samples).
185 | 
186 | ## Bismark input
187 | 
188 | If you used Bismark for mapping and methylation level extraction, you can 
189 | use the `read.bismark` function from the **bsseq** package to read the 
190 | data directly into 
191 | a `BSeq` object. 
192 | 
193 | The following example is from the help page of the function. After running
194 | Bismark's methylation extractor, you should have output files with names 
195 | that end in `.bismark.cov.gz`. You can specify a vector of file names with
196 | the `file` argument, and a corresponding vector of `sampleNames`. It is 
197 | recommended that you set `rmZeroCov` to TRUE in order to remove CpGs with 
198 | no coverage in any of the samples, and set `strandCollapse` to TRUE in order
199 | to combine CpGs on opposite strands into one observation (since CpG methylation)
200 | is symmetric.
201 | 
202 | ```{r bismarkinput}
203 | library(dmrseq)
204 | infile <- system.file("extdata/test_data.fastq_bismark.bismark.cov.gz",
205 |                         package = 'bsseq')
206 | bismarkBSseq <- read.bismark(files = infile,
207 |                                rmZeroCov = TRUE,
208 |                                strandCollapse = FALSE,
209 |                                verbose = TRUE)
210 | bismarkBSseq
211 | ```
212 | 
213 | See the [bsseq](http://bioconductor.org/packages/bsseq) help pages for 
214 | more information on using this function.
215 | 
216 | ## Count matrix input
217 | 
218 | If you haven't used Bismark, but you have count data for number of methylated
219 | reads and total coverage for each CpG, along with their corresponding chromosome
220 | and position information, you can construct a `BSseq` object from scratch, 
221 | like below. Notice that the `M` and `Cov` matrices have the same dimension, and
222 | `chr` and `pos` have the same number of elements as rows in the count matrices
223 | (which corresponds to the number of CpGs). Also note that the number of columns
224 | in the count matrices matches the number of elements in `sampleNames` and the
225 | condition variable 'celltype`.
226 | 
227 | ```{r dissect, results="hide", echo=FALSE}
228 | data("BS.chr21")
229 | M <- getCoverage(BS.chr21, type="M")
230 | Cov <- getCoverage(BS.chr21, type="Cov")
231 | chr <- as.character(seqnames(BS.chr21))
232 | pos <- start(BS.chr21)
233 | celltype <- pData(BS.chr21)$CellType
234 | sampleNames <- sampleNames(BS.chr21)
235 | ```
236 | 
237 | ```{r fromScratch}
238 | head(M)
239 | head(Cov)
240 | head(chr)
241 | head(pos)
242 | 
243 | dim(M)
244 | dim(Cov)
245 | length(chr)
246 | length(pos)
247 | 
248 | print(sampleNames)
249 | print(celltype)
250 | 
251 | bs <- BSseq(chr = chr, pos = pos,
252 |             M = M, Cov = Cov, 
253 |             sampleNames = sampleNames)
254 | show(bs)
255 | ```
256 | ```{r cleanup, results="hide", echo=FALSE}
257 | rm(M, Cov, pos, chr, bismarkBSseq)
258 | ```
259 | 
260 | The example data contains CpGs from chromosome 21 for four samples 
261 | from @Lister2009. To load this data directly (already in the `BSseq` format), 
262 | simply type `data(BS.chr21)`.
263 | Two of the samples are replicates of the cell type 'imr90' 
264 | and the other two are replicates of the cell type 'h1'. Now that we have the
265 | data loaded into a `BSseq` object, we can use **dmrseq**
266 | to find regions of the genome where these two cell types have significantly
267 | different methylation levels. But first, we need to add the sample metadata
268 | that indicates which samples are from which cell type (the `celltype` 
269 | varialbe above). This information, which we call 'metadata', 
270 | will be used by the `dmrseq` function to decide
271 | which samples to compare to one another. The next section shows how to add
272 | this information to the `BSseq` object.
273 | 
274 | ## Sample metadata
275 | 
276 | To add sample metadata, including the covariate of interest, you can add it 
277 | to the 
278 | `BSseq` object by adding columns to the `pData` slot. You must have at least
279 | one column of `pData`, which contains the covariate of interest. Additional 
280 | columns are optional.
281 | 
282 | ```{r meta}
283 | pData(bs)$CellType <- celltype
284 | pData(bs)$Replicate <- substr(sampleNames, 
285 |                               nchar(sampleNames), nchar(sampleNames))
286 | 
287 | pData(bs)
288 | ```
289 | 
290 | We will then tell the `dmrseq` function which metadata variable to use
291 | for testing for methylation differences by setting the `testCovariate` 
292 | parameter equal to its column name. 
293 | 
294 | ## Smoothing
295 | 
296 | Note that unlike in **bsseq**, you do not need to carry out the smoothing step
297 | with a separate function. In addition, you should not use **bsseq**'s `BSmooth()`
298 | function to smooth the methylation levels, since **dmrseq** smooths in a very 
299 | different way. Briefly, **dmrseq** smooths methylation *differences*, so it 
300 | carries out the smoothing step once. This is automatically done with the main
301 | `dmrseq` function. **bsseq** on the other hand, smooths each sample 
302 | independently, so smoothing needs to be carried out once per sample.
303 | 
304 | ## Filtering CpGs and samples
305 | 
306 | For pairwise comparisons, **dmrseq** analyzes all CpGs that have at least one
307 | read in at least one sample per group.
308 | Thus, if your dataset contains CpGs with zero reads in every sample within a 
309 | group, you should filter them out prior to running `dmrseq`. Likewise,
310 | if your `bsseq` object contains extraneous samples that are part of the 
311 | experiment but not the differential methylation testing of interest, these
312 | should be filtered out as well.
313 | 
314 | Filtering `bsseq` objects is straightforward: 
315 | 
316 | * Subset rows to filter CpG loci
317 | * Subset columns to filter samples
318 | 
319 | If we wish to remove all CpGs that have no coverage in at least one sample 
320 | and only keep samples with a CellType of "imr90" or "h1", we would do so with:
321 | 
322 | ```{r, filter}
323 | # which loci and sample indices to keep
324 | loci.idx <- which(DelayedMatrixStats::rowSums2(getCoverage(bs, type="Cov")==0) == 0)
325 | sample.idx <- which(pData(bs)$CellType %in% c("imr90", "h1"))
326 | 
327 | bs.filtered <- bs[loci.idx, sample.idx]
328 | ```
329 | ```{r, results="hide", echo=FALSE}
330 | rm(bs.filtered)
331 | ```
332 | 
333 | Note that this is a trivial example, since our toy example object `BS.chr21` 
334 | already contains only loci with coverage at least one read in all samples as well
335 | as only samples from the "imr90" and "h1" conditions. 
336 | 
337 | Also note that instead of creating a separate object, the filtering step 
338 | can be combined with the call to `dmrseq` by replacing the `bs` input with a
339 | filtered version `bs[loci.idx, sample.idx]`.
340 | 
341 | ## Adjusting for covariates
342 | 
343 | There are two ways to adjust for covariates in the dmrseq model. The first way
344 | is to specify the `adjustCovariate` parameter of the `dmrseq()` function as 
345 | a column of the `pData()` slot that contains the covariate you
346 | would like to adjust for. This will include that covariate directly in the
347 | model. This is ideal if the adjustment covariate is continuous or has more
348 | than two groups.
349 | 
350 | The second way is to specify the `matchCovariate` parameter of the `dmrseq` 
351 | function as a column of the `pData()` slot that contains the covariate you
352 | would like to match on. This will restrict the permutations considered to only
353 | those where the `matchCovariate` is balanced. For example, the `matchCovariate` 
354 | could represent the sex of each sample. In that case, a permutation that 
355 | includes all males in one group and all females in another would not be 
356 | considered (since there is a plausible biological difference that may induce
357 | the null distribution to resemble non-null). This matching adjustment is ideal
358 | for two-group comparisons.
359 | 
360 | # Differentially Methylated Regions
361 | 
362 | The standard differential expression analysis steps are wrapped
363 | into a single function, `dmrseq`. The estimation steps performed
364 | by this function are described briefly below, as well as in 
365 | more detail in the **dmrseq** paper. Here we run the results for a subset
366 | of 20,000 CpGs in the interest of computation time.
367 | 
368 | ```{r mainfunction, message=TRUE, warning=TRUE}
369 | testCovariate <- "CellType"
370 | regions <- dmrseq(bs=bs[240001:260000,],
371 |                   cutoff = 0.05,
372 |                   testCovariate=testCovariate)
373 | ```
374 | 
375 | Progress messages are printed to the console if `verbose` is TRUE.
376 | The text, `condition h1 vs imr90`, tells you that positive methylation 
377 | differences mean h1 has higher methylation than imr90 (see below for
378 | more details). 
379 | 
380 | ## Output of dmrseq
381 | 
382 | The results object is a `GRanges` object with the coordiates 
383 | of each candidate region, and contains the following metadata columns (which 
384 | can be extracted with the `$` operator:
385 | 
386 | 1. `L` = the number of CpGs contained in the region,
387 | 2. `area` = the sum of the smoothed beta values
388 | 3. `beta` = the coefficient value for the condition difference (Note: if the 
389 |    test covariate is categorical with more than 2 groups, there will be
390 |    more than one beta column),
391 | 4. `stat` = the test statistic for the condition difference,
392 | 5. `pval` = the permutation _p_-value for the significance of the test
393 | statistic, and 
394 | 6. `qval` = the _q_-value for the test statistic (adjustment
395 | for multiple comparisons to control false discovery rate).
396 | 7. `index = an `IRanges` containing the indices of the region's 
397 | first CpG to last CpG.
398 | 
399 | ```{r, showresults}
400 | show(regions)
401 | ```
402 | 
403 | The above steps are carried out on a very small subset of data (20,000 CpGs).
404 | This package loads data into memory one chromosome at a 
405 | time. For on human data, this means objects with a few million 
406 | entries per sample (since there are roughly 28.2 million total CpGs in the human
407 | genome, and the largest chromosomes will have more than 2 million CpGs). 
408 | This means that whole-genome `BSseq` objects for several samples can use up
409 | several GB of RAM. In order to improve speed, the package allows for easy
410 | parallel processing of chromosomes, but be aware that using more cores will
411 | also require the use of more RAM.
412 | 
413 | To use more cores, use the `register` function of 
414 | [BiocParallel](http://bioconductor.org/packages/BiocParallel). For example,
415 | the following chunk (not evaluated here), would register 4 cores, and
416 | then the functions above would
417 | split computation over these cores. 
418 | 
419 | ```{r parallel, eval=FALSE}
420 | library("BiocParallel")
421 | register(MulticoreParam(4))
422 | ```
423 | ## Steps of the dmrseq method
424 | 
425 | **dmrseq** is a two-stage approach that first detects candidate regions and then
426 | explicitly evaluates statistical significance at the region level while 
427 | accounting for known sources of variability. 
428 | Candidate DMRs are defined by segmenting the genome into groups of CpGs 
429 | that show consistent evidence of differential methylation. 
430 | Because the methylation levels of neighboring CpGs are highly correlated, 
431 | we first smooth the signal to combat loss of power due to low coverage as done
432 | in **bsseq**. 
433 | 
434 | In the second stage, we compute a statistic for each candidate 
435 | DMR that takes into account variability between biological replicates 
436 | and spatial correlation among neighboring loci. Significance of each 
437 | region is assessed via a permutation procedure which uses a pooled null 
438 | distribution that can be generated from as few as two biological replicates,
439 | and false discovery rate is controlled using the Benjamini-Hochberg 
440 | procedure.
441 | 
442 | For more details, refer to the **dmrseq** paper [@Korthauer183210].
443 | 
444 | ## Detecting large-scale methylation blocks
445 | 
446 | The default smoothing parameters (`bpSpan`, `minInSpan`, and `maxGapSmooth`) 
447 | are designed to focus on local DMRs, generally in the range of hundreds to 
448 | thousands of bases. In some applications, such as cancer, it is of interest
449 | to effectively 'zoom out' in order to detect larger (lower-resolution) 
450 | methylation blocks on the order of hundreds of thousands to millions of bases.
451 | To do so, you can 
452 | set the `block` argument to true, which will only include candidate regions with
453 | at least `blockSize` basepairs (default = 5000). This setting will also merge
454 | candidate regions that (1) are in the same direction and (2) are less than 1kb
455 | apart with no covered CpGs separating them. The region-level model used is also
456 | slightly modified - instead of a loci-specific intercept for each CpG in the 
457 | region, the intercept term is modeled as a natural spline with one interior 
458 | knot per each 10kb of length (up to 10 interior knots).
459 | 
460 | In addition, detecting large-scale blocks requires that 
461 | the smoothing window be increased to minimize the impact of noisy local
462 | methylation measurements. To do so, the values of the 
463 | smoothing parameters should be increased. For example, to use a smoothing window
464 | that captures at least 500 CpGs or 50,000 basepairs that are spaced apart by no 
465 | more than 1e6 bases, use `minInSpan=500`, `bpSpan=5e4`, and `maxGapSmooth=1e6`. 
466 | In addition, to avoid a block being broken up simply due to a gap with no 
467 | covered CpGs, you can increase the `maxGap` parameter.
468 | 
469 | ```{r blocks, message=TRUE, warning=TRUE}
470 | testCovariate <- "CellType"
471 | blocks <- dmrseq(bs=bs[120001:125000,],
472 |                   cutoff = 0.05,
473 |                   testCovariate=testCovariate,
474 |                   block = TRUE,
475 |                   minInSpan = 500,
476 |                   bpSpan = 5e4,
477 |                   maxGapSmooth = 1e6,
478 |                   maxGap = 5e3)
479 | head(blocks)
480 | ```
481 | 
482 | The top hit is `r signif(width(blocks)[1]/1e3, 3)` thousand basepairs wide. 
483 | In general, it also might be advised to decrease the cutoff when detecting 
484 | blocks, since a smaller methylation
485 | difference might be biologically significant if it is maintained 
486 | over a large genomic region. Note that block-finding can be more computationally
487 | intensive since we are fitting region-level models to large numbers of CpGs at a
488 | time. In the toy example above we are only searching over 5,000 CpGs (which
489 | span 
490 | `r signif((max(end(bs[120001:125000,])) - 
491 | min(start(bs[120001:125000,])))/1e3,3)` 
492 | thousand basepairs), so we do not find enough null
493 | candidate regions to carry out inference and obtain significance levels.
494 | 
495 | # Exploring and exporting results
496 | 
497 | ## Explore how many regions were significant
498 | 
499 | How many regions were significant at the FDR (_q_-value) cutoff of 0.05? We
500 | can find this by counting how many values in the `qval` column of the `regions`
501 | object were less than 0.05. 
502 | You can also subset the regions by an FDR cutoff.
503 | 
504 | ```{r}
505 | sum(regions$qval < 0.05)
506 | 
507 | # select just the regions below FDR 0.05 and place in a new data.frame
508 | sigRegions <- regions[regions$qval < 0.05,]
509 | ```
510 | 
511 | ## Hypo- or Hyper- methylation?
512 | 
513 | You can determine the proportion of regions with hyper-methylation by counting
514 | how many had a positive direction of effect (positive statistic).
515 | 
516 | ```{r hyper}
517 | sum(sigRegions$stat > 0) / length(sigRegions)
518 | ```
519 | 
520 | To interpret the direction of effect, note that for a two-group comparison 
521 | **dmrseq** uses alphabetical order of the covariate of interest. 
522 | The condition with a higher alphabetical rank will become the reference category.
523 | For example, if
524 | the two conditions are "A" and "B", the "A" group will be the reference category,
525 | so a positive direction of effect means that 
526 | "B" is hyper-methylated relative to "A". Conversely, a negative direction of 
527 | effect means that "B" is hypo-methylated relative to "A". 
528 | 
529 | ## Plot DMRs 
530 | 
531 | It can be useful to visualize individual DMRs, so we provide a plotting 
532 | function that is based off of **bsseq**'s plotting functions. There is also
533 | functionality to add annotations using the 
534 | [annotatr](http://bioconductor.org/packages/annotatr) package to 
535 | see the nearby CpG categories (island, shore, shelf, open sea) and nearby
536 | coding sequences.
537 | 
538 | To retrieve annotations for genomes supported by **annotatr**, use the 
539 | helper function `getAnnot`, and pass this annotation object to the `plotDMRs` 
540 | function as the `annoTrack` parameter.
541 | 
542 | ```{r plot, out.width='\\textwidth', fig.height = 2.5, warning=FALSE}
543 | # get annotations for hg18
544 | annoTrack <- getAnnot("hg18")
545 | 
546 | plotDMRs(bs, regions=regions[1,], testCovariate="CellType",
547 |     annoTrack=annoTrack)
548 | ```
549 | 
550 | Here we also plot the top methylation block from the block analysis:
551 | 
552 | ```{r plotblock, out.width='\\textwidth', fig.height = 2.5, warning=FALSE}
553 | plotDMRs(bs, regions=blocks[1,], testCovariate="CellType",
554 |     annoTrack=annoTrack)
555 | ```
556 | 
557 | ## Plot distribution of methylation values and coverage
558 | 
559 | It can also be helpful to visualize overall distributions of methylation values
560 | and / or coverage. The function `plotEmpiricalDistribution` will plot the 
561 | methylation values of
562 | the covariate of interest (specified with `testCovariate`).
563 | 
564 | ```{r plot2, fig.height=3}
565 | plotEmpiricalDistribution(bs, testCovariate="CellType")
566 | ```
567 | 
568 | By changing the `type` argument to `Cov`, it will also plot the distribution of 
569 | coverage values. In addition, samples can be plotted separately by setting 
570 | `bySample` to true.
571 | 
572 | ```{r plot3, fig.height=3}
573 | plotEmpiricalDistribution(bs, testCovariate="CellType", 
574 |                           type="Cov", bySample=TRUE)
575 | ```
576 | 
577 | ## Exporting results to CSV files
578 | 
579 | A plain-text file of the results can be exported using the 
580 | base R functions *write.csv* or *write.delim*. 
581 | We suggest using a descriptive file name indicating the variable
582 | and levels which were tested.
583 | 
584 | ```{r export, eval=FALSE}
585 | write.csv(as.data.frame(regions), 
586 |           file="h1_imr90_results.csv")
587 | ```
588 | 
589 | ## Extract raw mean methylation differences
590 | 
591 | For a two-group comparison, it might be of interest to extract the raw mean
592 | methylation differences over the DMRs. This can be done with the helper function
593 | `meanDiff`. For example, we can extract the raw mean difference values for 
594 | the regions at FDR level 0.05 (using the `sigRegions` object created 
595 | in the section
596 | [Explore how many regions were significant](#explore-how-many-regions-were-significant)).
597 | 
598 | ```{r, meandiff}
599 | rawDiff <- meanDiff(bs, dmrs=sigRegions, testCovariate="CellType")
600 | str(rawDiff)
601 | ```
602 | 
603 | # Simulating DMRs
604 | 
605 | If you have multiple samples from the same condition (e.g. control samples), 
606 | the function `simDMRS` will split these into two artificial sample groups
607 | and then add _in silico_ DMRs. This can then be used to assess sensitivity
608 | and specificity of DMR approaches, since we hope to be able to recover the
609 | DMRs that were spiked in, but not identify too many other differences (since
610 | we don't expect any biological difference between the two artificial sample
611 | groups).
612 | 
613 | The use of this function is demonstrated below, although note that in this
614 | toy example, we do not have enough samples from the same biological condition to
615 | split into two groups, so instead we shuffle the cell types to create a null
616 | sample comparison.
617 | 
618 | ```{r, sim}
619 | data(BS.chr21)
620 | 
621 | # reorder samples to create a null comparison 
622 | BS.null <- BS.chr21[1:20000,c(1,3,2,4)]
623 | 
624 | # add 100 DMRs
625 | BS.chr21.sim <- simDMRs(bs=BS.null, num.dmrs=100)
626 | 
627 | # bsseq object with original null + simulated DMRs
628 | show(BS.chr21.sim$bs)
629 | 
630 | # coordinates of spiked-in DMRs
631 | show(BS.chr21.sim$gr.dmrs)
632 | 
633 | # effect sizes
634 | head(BS.chr21.sim$delta)
635 | ```
636 | 
637 | The resulting object is a list with the following elements:
638 | 
639 | * `gr.dmrs`: a `GRanges` object containing the coordinates of the true spiked
640 | in DMRs
641 | * `dmr.mncov`: a numeric vector containing the mean coverage of the 
642 | simulated DMRs
643 | * `dmr.L`: a numeric vector containing the sizes (number of CpG loci) of the
644 | simulated DMRs
645 | * `bs`: the `BSSeq` object containing the original null data + simulated DMRs
646 | * `delta`: a numeric vector of effect sizes (proportion differences) of the 
647 | simulated DMRs.
648 | 
649 | # Session info
650 | 
651 | ```{r sessionInfo}
652 | sessionInfo()
653 | ```
654 | 
655 | # References
656 | 


--------------------------------------------------------------------------------
/R/dmrseq.R:
--------------------------------------------------------------------------------
  1 | #' Main function for detecting and evaluating significance of DMRs.
  2 | #' 
  3 | #' Performs a two-step approach that (1) detects candidate regions, and
  4 | #' (2) scores candidate regions with an exchangeable (across the genome)
  5 | #' statistic and evaluates statistical significance using a 
  6 | #' permuation test on the pooled null distribution of scores.
  7 | #' 
  8 | #' @param bs bsseq object containing the methylation values as well as the 
  9 | #'   phenotype matrix that contains sample level covariates
 10 | #' @param testCovariate Character value indicating which variable
 11 | #'  (column name) in \code{pData(bs)} to test
 12 | #'  for association of methylation levels. 
 13 | #'  Can alternatively specify an integer value indicating
 14 | #'  which of column of
 15 | #'  \code{pData(bs)} to use. This is used to construct the 
 16 | #'  design matrix for the test statistic calculation. To run using a 
 17 | #'  continuous or categorial covariate with more than two groups, simply pass in
 18 | #'  the name of a column in `pData` that contains this covariate. A continuous
 19 | #'  covariate is assumued if the data type in the `testCovariate` slot is 
 20 | #'  continuous, with the exception of if there are only two unique values 
 21 | #'  (then a two group comparison is carried out).
 22 | #' @param adjustCovariate an (optional) character value or vector 
 23 | #' indicating which variables (column names) in \code{pData(bs)} 
 24 | #' will be adjusted for when 
 25 | #'  testing for the association of methylation value with the 
 26 | #'  \code{testCovariate}. 
 27 | #' Can alternatively specify an
 28 | #' integer value or vector indicating
 29 | #'  which of the columns of \code{pData(bs)} to adjust for.
 30 | #'  If not NULL (default), then this is also used to 
 31 | #'  construct the design matrix for the test statistic calculation.
 32 | #' @param matchCovariate An (optional) character value 
 33 | #'  indicating which variable (column name) of \code{pData(bs)} 
 34 | #'  will be blocked for when 
 35 | #'  constructing the permutations in order to
 36 | #'  test for the association of methylation value with the 
 37 | #'  \code{testCovariate}, only to be used when \code{testCovariate}
 38 | #'  is a two-group factor and the number of permutations possible is less
 39 | #'  than 500000.
 40 | #'  Alternatively, you can specify an integer value indicating
 41 | #'  which column of \code{pData(bs)} to block for.
 42 | #'  Blocking means that only permutations with balanced
 43 | #'  composition of \code{testCovariate} values will be used (for example if
 44 | #'  you have samples from different gender and this is not your covariate of
 45 | #'  interest, 
 46 | #'  it is recommended to use gender as a matching covariate to avoid one 
 47 | #'  of the permutations testing entirely males versus females; this violates
 48 | #'  the null hypothesis and will decrease power).
 49 | #'  If not NULL (default), then no blocking is performed.
 50 | #' @param minInSpan positive integer that represents the minimum number of
 51 | #'    CpGs in a smoothing span window if \code{smooth} is TRUE.  
 52 | #'    Default value is 30.
 53 | #' @param minNumRegion positive integer that represents the minimum number of
 54 | #'    CpGs to consider for a candidate region. Default value is 5. 
 55 | #'    Minimum value is 3.
 56 | #' @param cutoff scalar value that represents the absolute value (or a vector 
 57 | #'    of two numbers representing a lower and upper bound) for the cutoff of 
 58 | #'    the single CpG coefficient that is used to discover 
 59 | #'    candidate regions. Default value is 0.10.
 60 | #' @param smooth logical value that indicates whether or not to smooth the 
 61 | #'    CpG level signal when discovering candidate regions.
 62 | #'    Defaults to TRUE.
 63 | #' @param bpSpan a positive integer that represents the length in basepairs
 64 | #'    of the smoothing span window if \code{smooth} is TRUE.  Default value is 
 65 | #'    1000.
 66 | #' @param verbose logical value that indicates whether progress messages
 67 | #'    should be printed to stdout. Defaults value is TRUE.
 68 | #' @param BPPARAM a \code{BiocParallelParam} object to specify the parallel 
 69 | #'    backend. The default 
 70 | #'    option is \code{BiocParallel::bpparam()} which will automatically creates
 71 | #'    a cluster appropriate for the operating system. 
 72 | #' @param maxPerms a positive integer that represents the maximum number 
 73 | #'    of permutations that will be used to generate the global null 
 74 | #'    distribution of test statistics.  Default value is 10.
 75 | #' @param maxGap integer value representing maximum number of basepairs in 
 76 | #' between neighboring CpGs to be included in the same DMR. 
 77 | #' @param maxGapSmooth integer value representing maximum number of basepairs  
 78 | #' in between neighboring CpGs to be included in the same 
 79 | #' cluster when performing smoothing (should generally be larger than
 80 | #' \code{maxGap})
 81 | #' @param stat a character vector indicating the name of the column of the 
 82 | #'   output to use as the region-level test statistic. Default value is 'stat'
 83 | #'   which is the region level-statistic designed to be comparable across the
 84 | #'   genome.
 85 | #'   It is not recommended to change this argument, but it can be done for
 86 | #'   experimental purposes. Possible values are: 'L' - the number of loci
 87 | #'   in the region, 'area' - the sum of the smoothed loci statistics,
 88 | #'   'beta' - the effect size of the region, 'stat' - the test statistic for
 89 | #'   the region, or 'avg' - the average smoothed loci statistic.
 90 | #' @param block logical indicating whether to search for large-scale (low
 91 | #'  resolution) blocks of differential methylation (default is FALSE, which
 92 | #'  means that local DMRs are desired). If TRUE, the parameters for 
 93 | #'  \code{bpSpan}, \code{minInSpan}, and \code{maxGapSmooth} should be adjusted
 94 | #'  (increased) accordingly. This setting will also merge
 95 | #'  candidate regions that (1) are in the same direction and (2) are less than 
 96 | #'  1kb apart with no covered CpGs separating them. The region-level model used 
 97 | #'  is also slightly modified - instead of a loci-specific intercept for each 
 98 | #'  CpG in theregion, the intercept term is modeled as a natural spline with 
 99 | #'  one interior knot per each 10kb of length (up to 10 interior knots).
100 | #' @param blockSize numeric value indicating the minimum number of basepairs 
101 | #'  to be considered a block (only used if \code{block}=TRUE). Default is 
102 | #'  5000 basepairs.
103 | #' @param chrsPerChunk a positive integer value indicating the number of 
104 | #'  chromosomes per chunk. The default is 1, meaning that the data will be 
105 | #'  looped through one chromosome at a time. When pairing up multiple 
106 | #'  chromosomes per chunk, sizes (in terms of numbers of CpGs) will be taken
107 | #'  into consideration to balance the sizes of each chunk.
108 | #' @return a \code{GRanges} object that contains the results of the inference. 
109 | #'    The object contains one row for each candidate region, sorted by q-value
110 | #'    and then chromosome. The standard 
111 | #'    \code{GRanges} chr, start, and end are included, along with at least
112 | #'    7 metadata
113 | #'    columns, in the following order: 
114 | #'    1. L = the number of CpGs contained in the region,
115 | #'    2. area = the sum of the smoothed beta values
116 | #'    3. beta = the coefficient value for the condition difference (there 
117 | #'       will be more than one column here if a multi-group comparison
118 | #'       was performed),
119 | #'    4. stat = the test statistic for the condition difference,
120 | #'    5. pval = the permutation p-value for the significance of the test
121 | #'    statistic, and 
122 | #'    6. qval = the q-value for the test statistic (adjustment
123 | #'    for multiple comparisons to control false discovery rate).
124 | #'    7. index = an \code{IRanges} containing the indices of the region's 
125 | #'       first CpG to last CpG.
126 | #'
127 | #' @keywords inference
128 | #' @importFrom outliers grubbs.test
129 | #' @importFrom bumphunter clusterMaker getSegments
130 | #' @importFrom DelayedMatrixStats colMedians rowMads rowSums2 rowMeans2 rowDiffs
131 | #' @importFrom matrixStats rowRanges
132 | #' @importFrom stats formula anova as.formula
133 | #' 
134 | #' @importClassesFrom bsseq BSseq 
135 | #' @importMethodsFrom bsseq pData seqnames sampleNames start width 
136 | #' 
137 | #' @importFrom grDevices col2rgb colorRampPalette dev.off pdf rgb
138 | #' @importFrom graphics axis layout legend lines mtext par 
139 | #' plot points rect rug text
140 | #' @importFrom methods is
141 | #' @importFrom stats approxfun lm loess median model.matrix p.adjust
142 | #' predict preplot qt quantile rbeta rbinom runif
143 | #' @importFrom utils combn
144 | #' @importFrom BiocParallel bplapply register MulticoreParam bpparam
145 | #' @importFrom splines ns
146 | #'
147 | #' @import bsseq 
148 | #' @import GenomicRanges
149 | #' @import nlme
150 | #' @import annotatr
151 | #' @import ggplot2
152 | #' @import S4Vectors
153 | #' 
154 | #' @export
155 | #' 
156 | #' @examples
157 | #' 
158 | #' # load example data 
159 | #' data(BS.chr21)
160 | #' 
161 | #' # the covariate of interest is the 'CellType' column of pData(BS.chr21)
162 | #' testCovariate <- 'CellType'
163 | #' 
164 | #' # run dmrseq on a subset of the chromosome (10K CpGs)
165 | #' regions <- dmrseq(bs=BS.chr21[240001:250000,],
166 | #'                  cutoff = 0.05,
167 | #'                  testCovariate=testCovariate)
168 | #' 
169 | dmrseq <- function(bs, testCovariate, adjustCovariate = NULL, cutoff = 0.1, 
170 |                    minNumRegion = 5, smooth = TRUE, bpSpan = 1000, 
171 |                    minInSpan = 30, maxGapSmooth = 2500, maxGap = 1000, 
172 |                    verbose = TRUE,  
173 |                    maxPerms = 10, matchCovariate = NULL, 
174 |                    BPPARAM = bpparam(), stat = "stat", 
175 |                    block = FALSE, blockSize = 5000,
176 |                    chrsPerChunk = 1) {
177 |     
178 |     stopifnot(is(bs, "BSseq"))
179 |     
180 |     if (!(is.null(cutoff) || length(cutoff) %in% seq_len(2))) 
181 |         stop("'cutoff' has to be either NULL or a vector of length 1 or 2")
182 |     if (length(cutoff) == 2) 
183 |         cutoff <- sort(cutoff)
184 |     if (is.null(cutoff) | abs(cutoff) > 1 | abs(cutoff) == 0) 
185 |         stop("Must specify a value for cutoff between 0 and 1")
186 |     subverbose <- max(as.integer(verbose) - 1L, 0)
187 |     
188 |     if(minNumRegion < 3){
189 |       stop("minNumRegion must be at least 3")
190 |     }
191 |     
192 |     # check statistic name
193 |     if (!(stat %in% c("L", "area", "beta", "stat", "avg"))) {
194 |         stop("Specified '", stat, 
195 |             "' as the test statistic which is not ", 
196 |             "in the results. Please specify a valid name from one of ",
197 |             "L, area, beta, stat, or avg")
198 |     }
199 |     
200 |     # informative message about blocks if block=TRUE; check for increased
201 |     # smoothing window
202 |     if (block){
203 |       message("Searching for large scale blocks with at least ",
204 |               blockSize, " basepairs.")
205 |       
206 |       if(minInSpan < 100 && bpSpan < 2000 && maxGapSmooth < 1e5){
207 |         warning("When block=TRUE, it is recommended to increase the values ",
208 |                 "of minInSpan, bpSpan, and maxGapSmooth in order to widen ",
209 |                 "the smoothing window")
210 |       }
211 |     }
212 |     
213 |     # convert covariates to column numbers if characters
214 |     if (is.character(testCovariate)) {
215 |         if(length(testCovariate) > 1)
216 |           stop("Only one testCovariate can be specified")
217 |         if(is.character(adjustCovariate)){
218 |           if(sum(testCovariate %in% adjustCovariate) > 0)
219 |             stop("adjustCovariate can't contain testCovariate")
220 |         }
221 |         if(is.character(matchCovariate)){
222 |           if(sum(testCovariate %in% matchCovariate))
223 |             stop("matchCovariate can't contain testCovariate")
224 |         }
225 |         testCovariate <- which(colnames(pData(bs)) == testCovariate)
226 |         if (length(testCovariate) == 0) {
227 |             stop("testCovariate not found in pData(). ",
228 |                  "Please specify a valid testCovariate")
229 |         }
230 |     }
231 |     
232 |     if (is.character(adjustCovariate)) {
233 |         if(is.character(matchCovariate)){
234 |           if(matchCovariate == adjustCovariate)
235 |             stop("matchCovariate can't be identical to adjustCovariate")
236 |         }
237 |         adjustCovariate <- which(colnames(pData(bs)) %in% adjustCovariate)
238 |         if (length(adjustCovariate) == 0) {
239 |             stop("adjustCovariate not found in pData(). ",
240 |                 "Please specify a valid adjustCovariate")
241 |         }
242 |     }
243 |     
244 |     # check that chrsPerChunk value makes sense
245 |     if (chrsPerChunk != 1){
246 |       if (chrsPerChunk%%1 != 0){
247 |         stop("chrsPerChunk must be an integer")
248 |       }else if(chrsPerChunk < 1){
249 |         stop("chrsPerChunk must be strictly positive")
250 |       }else if(chrsPerChunk > length(unique(seqnames(bs)))){
251 |         stop("chrsPerChunk can't be larger than the total",
252 |              " number of chromosomes")
253 |       }
254 |     }
255 |     
256 |     # check that bs object is sorted since `bsseq::BSseq()` no longer 
257 |     # automatically sorts to ensure loci from same chr are indexed consecutively
258 |     if (is.unsorted(bs)) {
259 |       stop("'bs' must be sorted. Use 'sort(bs)'.")
260 |     }
261 |     
262 |     # construct the design matrix using the pData of bs
263 |     if (ncol(pData(bs)) < max(testCovariate, adjustCovariate)) {
264 |         stop("Error: pData(bs) has too few columns.  ","
265 |               Please specify valid ", 
266 |             "covariates to use in the analysis")
267 |     }
268 |     
269 |     coeff <- seq(2,(2 + length(testCovariate) - 1))
270 |     testCov <- pData(bs)[, testCovariate]
271 |     if (is.factor(testCov)) # drop unused levels of test
272 |       testCov <- droplevels(testCov)
273 |     
274 |     # check for missing values in testCov 
275 |     if (any(is.na(testCov))) {
276 |       stop("Missing values found in the testCovariate ",
277 |            "Please remove these samples from the analysis.")
278 |     }
279 |     
280 |     
281 |     fact <- TRUE
282 |     sampleSize <- table(testCov)[names(table(testCov)) %in% pData(bs)[,testCovariate]]
283 |     if (length(unique(testCov)) == 1) {
284 |         message("Warning: only one unique value of the specified ", 
285 |                 "covariate of interest.  Assuming null comparison and ",
286 |                 "splitting sample group into two equal groups")
287 |         testCov <- rep(1, length(testCov))
288 |         testCov[seq_len(round(length(testCov)/2))] <- 0
289 |     }else if (length(unique(testCov)) > 2 && !is.numeric(testCov)) {
290 |         message("Performing a global test of H0: no difference among ",
291 |                 length(unique(testCov)), " groups (assuming the test ",
292 |                 "covariate ", colnames(pData(bs))[testCovariate],
293 |                 " is a factor).")
294 |         coeff <- seq(coeff, coeff + length(unique(testCov)) - 2)
295 |     }else if (length(unique(testCov)) > 2 && is.numeric(testCov)) {
296 |         message("Assuming the test ",
297 |               "covariate ", colnames(pData(bs))[testCovariate],
298 |               " is continuous.")
299 |         fact <- FALSE
300 |     }else{
301 |         message("Assuming the test ",
302 |               "covariate ", colnames(pData(bs))[testCovariate],
303 |               " is a factor.")
304 |         if(min(sampleSize) < 2)
305 |           stop("At least one group has only one sample! ",
306 |                "Replicates are required to run dmrseq.")
307 |         testCov <- as.factor(testCov)
308 |     }
309 |     
310 |     if (!is.null(adjustCovariate)) {
311 |         mmdat <- data.frame(testCov = testCov)
312 |         adjustCov <- pData(bs)[, adjustCovariate, drop = FALSE]
313 |         
314 |         # check for missing values in adjustCovariate
315 |         if (any(is.na(adjustCov))) {
316 |             stop("Missing values found in the adjustCovariate. ",
317 |                  "Please remove these samples from the analysis.")
318 |         }
319 |         
320 |         # check for number of unique values per adjust cov
321 |         nunq <- apply(adjustCov, 2, function(x) length(unique(x)))
322 |         if (any(nunq < 2))
323 |           stop("At least one adjust covariate is constant across samples.",
324 |                " Please remove this covariate from the model and try again.")
325 |         
326 |         # remove any empty factor levels
327 |         for (f in 1:ncol(adjustCov)){
328 |           if (is.factor(adjustCov[,f]))
329 |             adjustCov[,f] <- droplevels(adjustCov[,f])
330 |         }
331 |         
332 |         mmdat <- cbind(mmdat, adjustCov)
333 |         frm <- paste0("~", paste0(colnames(mmdat), collapse = " + "))
334 |         design <- model.matrix(as.formula(frm), data=mmdat)
335 |         colnames(design)[coeff] <- colnames(pData(bs))[testCovariate]
336 |         coeff.adj <- (max(coeff) + 1):(ncol(design))
337 |     } else {
338 |         design <- model.matrix(~testCov)
339 |         colnames(design)[coeff] <- colnames(pData(bs))[testCovariate]
340 |         coeff.adj <- NULL
341 |     }
342 |     
343 |     # check model matrix is full rank
344 |     e <- eigen(crossprod(as.matrix(design)), symmetric = TRUE, only.values = TRUE)$values
345 |     if (! (e[1] > 0 && abs(e[length(e)]/e[1]) > 1e-13)){
346 |       stop("Design matrix is not full rank")
347 |     }
348 |     
349 |     # check for empty factor levels in design matrix
350 |     if (sum(colSums(design) == 0) > 0){
351 |       which.empty <- which(colSums(design) == 0)
352 |       design <- design[,-which.empty]
353 |     }
354 |     
355 |     # check that p <= n
356 |     if (ncol(bs) < ncol(design) + 1)
357 |       stop("Not enough degrees of freedom to estimate ", ncol(design)-1,
358 |            " covariates using ", ncol(bs), " samples. Please use a larger ",
359 |            "number of samples, or specify fewer adjust covariates.")
360 |   
361 |     # check for incompatible args
362 |     if (fact && !is.null(matchCovariate) && length(unique(testCov)) > 2)
363 |       stop("matchCovariate can't be used when testCovariate is not a 2-group ",
364 |            "factor. Perhaps you'd like to add an adjustCovariate instead?")
365 |     
366 |     if(!is.null(matchCovariate) && choose(nrow(design), min(sampleSize)) >= 5e5)
367 |       stop("matchCovariate can't be used when the sample size is large enough ",
368 |            "to yield more than 500000 possible permutations. ",
369 |            "Perhaps you'd like to add an adjustCovariate instead?")
370 |     
371 |     # check for interaction terms (not yet supported)
372 |     if (length(coeff) > 1 && any(rowSums(design[,coeff]) > 1))
373 |       stop("Interaction terms in testCovariate are not yet supported.")
374 |     
375 |     if (length(unique(testCov)) == 2) {
376 |         message("Condition: ",
377 |             unique(pData(bs)[, testCovariate][which(design[, coeff] == 1)]), 
378 |             " vs ", 
379 |             unique(pData(bs)[, testCovariate][which(design[, coeff] == 0)]))
380 |     }
381 |     if (!is.null(adjustCovariate)) {
382 |       message("Adjusting for covariate (s): ", 
383 |               paste(colnames(pData(bs))[adjustCovariate], collapse = ", "))
384 |     }
385 |     if (!is.null(matchCovariate)) {
386 |         if (length(matchCovariate) > 1)
387 |           stop("Covariate matching can only be carried out for one",
388 |               " covariate")
389 |         if (length(unique(testCov)) > 2)
390 |           stop("Covariate matching can only be carried out for 2-group",
391 |                " comparisons")
392 |         if (is.character(matchCovariate)) {
393 |             if (sum(grepl(matchCovariate, colnames(pData(bs)))) == 0) {
394 |                 stop("Error: no column in pData() found that matches ",
395 |                       "the matchCovariate")
396 |             } else if (length(grep(matchCovariate, colnames(pData(bs)))) > 1) {
397 |                 stop("Error: matchCovariate matches more than one ",
398 |                       "column in pData()")
399 |             }
400 |             mC <- grep(matchCovariate, colnames(pData(bs)))
401 |         } else {
402 |             stopifnot(matchCovariate <= ncol(pData(bs)))
403 |         }
404 |       message("Matching permutations on covariate: ", 
405 |               colnames(pData(bs))[mC])
406 |     }
407 |     
408 |     # check for loci with missing data
409 |     if (fact){
410 |       lev <- unique(pData(bs)[[testCovariate]])
411 |       filter <- NULL
412 |       for (l in seq_along(lev)){
413 |         filter <- rbind(filter,
414 |               1*(DelayedMatrixStats::rowSums2(getCoverage(bs)[,pData(bs)[[testCovariate]] == 
415 |                                            lev[l], drop = FALSE]) == 0))
416 |       }
417 |       filter <- which( apply(filter, 2, max) > 0 )
418 |   
419 |       if (length(filter) > 0) {
420 |         stop(length(filter), " loci have zero coverage in all samples ",
421 |              "of at least one condition. Please remove these loci ", 
422 |              "before running dmrseq")
423 |       }
424 |       
425 |     }else{
426 |       filter <- DelayedMatrixStats::rowSums2(getCoverage(bs)==0) >= ncol(bs) - 1
427 |       if(sum(filter) > 0)
428 |         stop(sum(filter), " loci have zero coverage in at least ",
429 |              ncol(bs) - 1, " samples. Please remove these loci ", 
430 |              "before running dmrseq")
431 |     }
432 |     
433 |     # register the parallel backend
434 |     BiocParallel::register(BPPARAM)
435 |     backend <- paste0("BiocParallel:", class(bpparam())[1])
436 |     
437 |     if (bpparam()$workers == 1) {
438 |       if (verbose) {
439 |         mes <- "Using a single core (backend: %s)."
440 |         message(sprintf(mes, backend))
441 |       }
442 |       parallel <- FALSE
443 |     } else {
444 |       if (verbose) {
445 |         mes <- paste0("Parallelizing using %s workers/cores ", 
446 |                       "(backend: %s).")
447 |         message(sprintf(mes, bpparam()$workers, backend))
448 |       }
449 |       parallel <- TRUE
450 |     }
451 |     message("Computing on ", chrsPerChunk, 
452 |             " chromosome(s) at a time.\n")
453 |     
454 |     message("Detecting candidate regions with coefficient larger than ",
455 |                    unique(abs(cutoff)), 
456 |            " in magnitude.")
457 |     OBS <- bumphunt(bs=bs, design = design, 
458 |                     coeff = coeff, coeff.adj = coeff.adj, minInSpan = minInSpan,
459 |                     minNumRegion = minNumRegion, cutoff = cutoff, 
460 |                     maxGap = maxGap, maxGapSmooth = maxGapSmooth, 
461 |                     smooth = smooth, bpSpan = bpSpan, verbose = verbose, 
462 |                     parallel = parallel, block = block, blockSize = blockSize,
463 |                     chrsPerChunk = chrsPerChunk, fact = fact,
464 |                     adjustCovariate = adjustCovariate)
465 |    
466 |     # check that at least one candidate region was found; if there were none 
467 |     # there is no need to go on to compute permutation tests...
468 |     
469 |     if (length(OBS) > 0) {
470 |         message("* ", nrow(OBS), " candidates detected")
471 |         FLIP <- NULL
472 |         # configure the permutation matrix for two group comparisons
473 |         if (length(unique(design[, coeff[1]])) == 2 && 
474 |             length(coeff) == 1 &&
475 |             choose(nrow(design), min(sampleSize)) < 5e5 ) {
476 |             if (verbose) {
477 |                 message("Performing balanced permutations of ",
478 |                         "condition across samples ", 
479 |                   "to generate a null distribution of region test statistics")
480 |             }
481 |             perms <- combn(seq(1, nrow(design)), min(sampleSize))
482 |             
483 |             # Remove redundant permutations (if balanced)
484 |             if (length(unique(table(design[,coeff]))) == 1){
485 |               perms <- perms[, seq_len(ncol(perms)/2)]
486 |             }
487 |             
488 |             # restrict to unique permutations that don't include any 
489 |             # groups consisting of all identical conditions
490 |             rmv <- NULL
491 |             for (p in seq_len(ncol(perms))){
492 |               if (length(unique(design[perms[,p],coeff])) == 1){
493 |                 rmv <- c(rmv, p)
494 |               }
495 |             }
496 |             if (length(rmv) > 0 )
497 |               perms <- perms[,-rmv]
498 |             
499 |             # subsample permutations based on similarity to original partition
500 |             # gives preference to those with the least similarity
501 |             if (maxPerms < ncol(perms)) {
502 |                similarity <- apply(perms, 2, function(x) {
503 |                  max(table(design[x,coeff]))
504 |                })
505 |                perms.all <- perms
506 |                perms <- NULL
507 |                levs <- sort(unique(similarity))
508 |                l <- 1
509 |                num <- 0
510 |                while(!(num == maxPerms) && l <= length(levs)) {
511 |                  keep <- sample(which(similarity == levs[l]), 
512 |                                 min(maxPerms-num, sum(similarity == levs[l])) )
513 |                  perms <- cbind(perms, perms.all[,keep])
514 |                  l <- l + 1
515 |                  num <- ncol(perms) 
516 |                }
517 |             }
518 |         } else {
519 |             # Next consider a multilevel, or continuous covariate where the 
520 |             # covariate will be permuted in an unrestricted manner
521 |             if (verbose) {
522 |                 message("Performing unrestricted permutation of", 
523 |                   " covariate of interest across samples ", 
524 |                   "to generate a null distribution of region test statistics")
525 |             }
526 |             perms <- as.matrix(seq_len(nrow(design)))
527 | 
528 |             for (p in seq_len(maxPerms)) {
529 |                 tries <- 0
530 |                 candidate <- sample(seq_len(nrow(design)), nrow(design))
531 |                 # check that the permutation is not a duplicate, and not 
532 |                 # equal to the original
533 |                 while ((sum(apply(perms, 2, function(x) 
534 |                                 all.equal(x, candidate)) == TRUE) > 0 || 
535 |                        sum(apply(perms, 2, function(x) 
536 |                          all.equal(x, rev(candidate))) == TRUE) > 0) &&
537 |                        tries <= 20) {
538 |                   candidate <- sample(seq(seq_len(nrow(design))), nrow(design))
539 |                   tries <- tries + 1
540 |                 }
541 |                 # save the permutation to the permutation matrix
542 |                 if (tries <= 20){
543 |                   perms <- cbind(perms, candidate)
544 |                 }
545 |             }
546 |             perms <- perms[,-1] # remove original
547 |         }
548 |         
549 |         pData.orig <- pData(bs)
550 |         levs <- unique(pData.orig[[testCovariate]])
551 |         # Now rerun on permuted designs and concatenate results
552 |         for (j in seq_len(ncol(perms))) {
553 |             if (verbose) {
554 |                 message("\nBeginning permutation ", j)
555 |             }
556 |             reorder <- perms[, j]
557 |             designr <- design
558 |             
559 |             if (length(unique(design[, coeff[1]])) == 2 && 
560 |                 length(coeff) == 1 && 
561 |                 !nrow(perms) == nrow(designr)) {
562 |                 designr[, coeff] <- 0
563 |                 designr[reorder, coeff] <- 1
564 |                 pData(bs)[[testCovariate]] <- levs[1]
565 |                 pData(bs)[[testCovariate]][reorder] <- levs[2]
566 |                 
567 |                 if (!all(sort(pData.orig[[testCovariate]]) ==
568 |                               sort(pData(bs)[[testCovariate]]))){
569 |                   designr[, coeff] <- 1
570 |                   designr[reorder, coeff] <- 0
571 |                   pData(bs)[[testCovariate]] <- levs[2]
572 |                   pData(bs)[[testCovariate]][reorder] <- levs[1]
573 |                 }
574 |                 
575 |                 xr <- NULL
576 |                 for (rd in seq_len(nrow(pData.orig))) {
577 |                   match <- which(pData.orig[[testCovariate]] %in%
578 |                               pData(bs)[rd,][[testCovariate]])
579 |                   taken <- which(match %in% xr)
580 |                   if (length(taken) > 0)
581 |                     match <- match[-taken]
582 |                   if (length(match) > 0)
583 |                     xr <- c(xr, match[1])
584 |                 }
585 |                 if(length(coeff.adj) > 0){
586 |                   pData(bs)[,adjustCovariate] <- 
587 |                     pData.orig[xr,adjustCovariate]
588 |                 }
589 |             } else {
590 |                 designr[, coeff] <- designr[reorder, coeff]
591 |                 pData(bs) <- pData.orig[reorder, , drop = FALSE]
592 |             }
593 |             
594 |             # if matchCovariate is not null, restrict permutations such that 
595 |             # null comparisons are balanced for the values of 
596 |             # pData$matchCovariate this avoids comparison of,
597 |             # say two different individuals in the null, that the comparison of 
598 |             # interest is tissue type. Not matching would mean the null is 
599 |             # really not null
600 |             if (!is.null(matchCovariate)) {
601 |                 permLabel <- paste0(paste0(pData(bs)[designr[, coeff[1]] == 1, 
602 |                                                      mC], collapse = "_"), 
603 |                   "vs", paste0(pData(bs)[(1 - designr[, coeff[1]]) == 1, 
604 |                                            mC], collapse = "_"))
605 |                 
606 |                 c1 <- unlist(strsplit(permLabel, "vs"))[1]
607 |                 c2 <- unlist(strsplit(permLabel, "vs"))[2]
608 |                 
609 |                 c1 <- unlist(strsplit(c1, "_"))
610 |                 c2 <- unlist(strsplit(c2, "_"))
611 |                 
612 |                 keepPerm <- 1 * (sum(c1 %in% c2) > 0 &&
613 |                                  sum(c2 %in% c1) > 0)
614 |                 
615 |                 if (keepPerm == 0) {
616 |                   if (verbose) {
617 |                     message(paste0("Skipping permutation ", 
618 |                                    gsub("vs", " vs ", permLabel)))
619 |                   }
620 |                   next
621 |                 }
622 |             } else {
623 |                 permLabel <- j
624 |             }
625 |             
626 |             res.flip.p <- bumphunt(bs=bs, design = designr, 
627 |                                    coeff = coeff, 
628 |                                    coeff.adj = coeff.adj,
629 |                                    minInSpan = minInSpan, 
630 |                                    minNumRegion = minNumRegion, cutoff = cutoff,
631 |                                    maxGap = maxGap, maxGapSmooth = maxGapSmooth,
632 |                                    smooth = smooth, bpSpan = bpSpan, 
633 |                                    verbose = verbose, parallel = parallel,
634 |                                    block = block, blockSize = blockSize,
635 |                                    chrsPerChunk = chrsPerChunk, fact = fact,
636 |                                    adjustCovariate = adjustCovariate)
637 |             
638 |             if (verbose) {
639 |               message("* ", j, " out of ", ncol(perms), 
640 |                       " permutations completed (",
641 |                       nrow(res.flip.p), " null candidates)")
642 |             }
643 |             
644 |             if (!is.null(res.flip.p)) {
645 |                 res.flip.p$permNum <- permLabel
646 |                 FLIP <- rbind(FLIP, res.flip.p)
647 |             }
648 |         }
649 |         
650 |         # restore original pData
651 |         pData(bs) <- pData.orig
652 |         
653 |         # if no candidates were found in permutation
654 |         # provide informative error message
655 |         if (is.null(FLIP)){
656 |           warning("No candidate regions found in permutation, so inference ",
657 |                "can't be carried out. ",
658 |                "Try decreasing the cutoff, or running on a larger ",
659 |                "dataset if you are currently using a subset.")
660 |           OBS$pval <- NA
661 |           OBS$qval <- NA
662 |         }else if (nrow(FLIP) < 0.05*nrow(OBS)){
663 |           message("Note: Very few null candidate regions were found.",
664 |                "For more accurate and sensitive inference, ",
665 |                "try decreasing the cutoff, or running on a larger ",
666 |                "dataset if you are currently using a subset.")
667 |         }
668 |         
669 |         if (!is.null(FLIP)){
670 |           # if there are more than 1 million candidate null regions, 
671 |           # take a random sample
672 |           # of 1 million of them
673 |           if (nrow(FLIP) > 1e+06) {
674 |             rs <- sample(seq_len(nrow(FLIP)), 1e+06, replace = FALSE)
675 |             FLIP <- FLIP[rs, ]
676 |           }
677 |           
678 |           # which column of results to use as test statistic ?  
679 |           # check statistic name
680 |           if (!(stat %in% c(colnames(OBS), "avg"))) {
681 |             stop("Specified '", stat, 
682 |                  "' as the test statistic which is not ", 
683 |                  "in the results. Please specify a valid name from one of ",
684 |                  "L, area, beta, or stat")
685 |           } else if (stat == "avg") {
686 |             OBS$avg <- OBS$area/OBS$L
687 |             FLIP$avg <- FLIP$area/FLIP$L
688 |           }
689 |           
690 |           whichStatO <- which(colnames(OBS) == stat)
691 |           whichStatF <- which(colnames(FLIP) == stat)
692 |           
693 |           # Faster way to compute the p-values that doesn't use multiple cores 
694 |           # Step 1: sort the permuted statistics vector
695 |           perm.ordered <- c(sort(abs(FLIP[, whichStatF]), 
696 |                                  method = "quick"), Inf)
697 |           
698 |           # Step 2: find the first instance in the sorted vector where the 
699 |           # permuted value is greater than the observed and use this to 
700 |           # determine the number of permuted values that are greater than or  
701 |           # equal to theobserved
702 |           pval <- rep(NA, nrow(OBS))
703 |           pval[!is.na(OBS[, whichStatO])] <- (1 + 
704 |                         vapply(abs(OBS[!is.na(OBS[, whichStatO]), whichStatO]),
705 |             function(x) length(perm.ordered) - min(which(x <= perm.ordered)),
706 |                         numeric(1))) / (1 + sum(!is.na(FLIP[, whichStatF])))
707 |           
708 |           # missing test statistics cause Inf for the p-value calculation 
709 |           # instead propagate the missing values
710 |           pval[abs(pval) == Inf] <- NA
711 |           
712 |           pval <- data.frame(x = pval, y = p.adjust(pval, method = "BH"))
713 |           
714 |           OBS$pval <- pval$x
715 |           OBS$qval <- pval$y
716 |         }
717 |         
718 |         # convert output into GRanges, with indexStart/indexEnd as IRanges
719 |         indexIR <- IRanges(OBS$indexStart, OBS$indexEnd)
720 |         OBS.gr <- makeGRangesFromDataFrame(OBS[,-c(4:5)], 
721 |                                            keep.extra.columns = TRUE)
722 |         OBS.gr$index <- indexIR
723 |         names(OBS.gr) <- NULL
724 |         
725 |         # sort on pval overall (currently sorted within chromsome)
726 |         OBS.gr <- OBS.gr[order(OBS.gr$pval, -abs(OBS.gr$stat)),]
727 |         
728 |         return(OBS.gr)
729 |     } else {
730 |         message("No candidate regions pass the cutoff of ", unique(abs(cutoff)))
731 |         return(NULL)
732 |     }
733 | }
734 | 


--------------------------------------------------------------------------------