├── data
├── BS.chr21.rda
├── dmrs.ex.rda
└── annot.chr21.rda
├── inst
├── sticker
│ ├── dmrseq.png
│ └── sticker.R
├── scripts
│ ├── get_dmrs.ex.R
│ ├── get_annot.chr21.R
│ └── get_BS.chr21.R
├── CITATION
└── NEWS
├── .gitignore
├── man
├── meanDiff.Rd
├── dmrPlotAnnotations.Rd
├── annot.chr21.Rd
├── dmrs.ex.Rd
├── getAnnot.Rd
├── BS.chr21.Rd
├── plotEmpiricalDistribution.Rd
├── simDMRs.Rd
├── dmrseq.Rd
└── plotDMRs.Rd
├── LICENSE
├── vignettes
├── dmrseqBib.bib
└── dmrseq.Rmd
├── README.md
├── DESCRIPTION
├── NAMESPACE
└── R
├── meanDiff.R
├── getAnnot.R
├── plotEmpiricalDistribution.R
├── simDMRs.R
├── plotDMRs.R
├── internal_plotting_functions.R
└── dmrseq.R
/data/BS.chr21.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/BS.chr21.rda
--------------------------------------------------------------------------------
/data/dmrs.ex.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/dmrs.ex.rda
--------------------------------------------------------------------------------
/data/annot.chr21.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/annot.chr21.rda
--------------------------------------------------------------------------------
/inst/sticker/dmrseq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/inst/sticker/dmrseq.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *.Rproj
6 | *.Rbuildignore
7 | vignettes/dmrseq_cache/*
8 |
--------------------------------------------------------------------------------
/inst/sticker/sticker.R:
--------------------------------------------------------------------------------
1 | imgurl <- "https://clipartion.com/wp-content/uploads/2015/11/bird-island-clipart-free-clip-art-images-830x301.png"
2 |
3 | sticker(imgurl,
4 | package="dmrseq",
5 | p_size=8,
6 | s_x=1,
7 | s_y=1.2,
8 | s_width=0.85,
9 | s_height=1,
10 | p_x = 0.95,
11 | p_y = 0.6,
12 | h_color = "gold2",
13 | h_fill = "dodgerblue",
14 | p_color = "gold2",
15 | filename="./inst/sticker/dmrseq.png")
--------------------------------------------------------------------------------
/inst/scripts/get_dmrs.ex.R:
--------------------------------------------------------------------------------
1 | ## The following script uses the dmrseq package to
2 | ## constructs the dmrs.ex dataset,
3 | ## an example of dmrseq output
4 | ## included in the dmrseq package for example purposes
5 |
6 | library(dmrseq)
7 |
8 | # load example data
9 | data(BS.chr21)
10 |
11 | # the covariate of interest is the "CellType" column of pData(BS.chr21)
12 | testCovariate <- "CellType"
13 |
14 | # run dmrseq on a subset of the chromosome (20K CpGs)
15 | dmrs.ex <- dmrseq(bs=BS.chr21[240001:260000,],
16 | cutoff = 0.05,
17 | testCovariate=testCovariate)
18 |
19 | save(dmrs.ex, file = "./data/dmrs.ex.rda")
20 | library(tools)
21 | resaveRdaFiles("./data/dmrs.ex.rda")
22 |
--------------------------------------------------------------------------------
/inst/scripts/get_annot.chr21.R:
--------------------------------------------------------------------------------
1 | ## The following script uses the dmrseq package to
2 | ## constructs the annot.chr21 dataset,
3 | ## an example of annotatr annotation to use in plotting
4 | ## included in the dmrseq package for example purposes
5 |
6 | library(dmrseq)
7 |
8 | # load example data
9 | data(BS.chr21)
10 |
11 | # get annotation information for hg19
12 | annot.chr21 <- getAnnot("hg19")
13 |
14 | # only keep this information for chromosome 21 (for example dataset)
15 | annot.chr21[[1]] <- annot.chr21[[1]][seqnames(annot.chr21[[1]])=="chr21",]
16 | annot.chr21[[2]] <- annot.chr21[[2]][seqnames(annot.chr21[[2]])=="chr21",]
17 |
18 | save(annot.chr21 , file = "./data/annot.chr21.rda")
19 | library(tools)
20 | resaveRdaFiles("./data/annot.chr21.rda")
21 |
--------------------------------------------------------------------------------
/man/meanDiff.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/meanDiff.R
3 | \name{meanDiff}
4 | \alias{meanDiff}
5 | \title{Function to compute raw mean methylation differences}
6 | \usage{
7 | meanDiff(bs, dmrs, testCovariate)
8 | }
9 | \arguments{
10 | \item{bs}{a \code{BSseq} object}
11 |
12 | \item{dmrs}{a data.frame with one row per DMR. This can be in the format
13 | of \code{dmrseq} output, but at least should contain the indexStart and
14 | indexEnd values of the regions of interest.}
15 |
16 | \item{testCovariate}{a character indicating the covariate of interest in
17 | the \code{pData} slot of \code{bs}.}
18 | }
19 | \value{
20 | numeric vector of raw mean methylation differences.
21 | }
22 | \description{
23 | This function calculates raw mean methylation differences for the
24 | covariate of interest over a set of DMRs (or regions of interest),
25 | assuming a simple two-group comparison.
26 | }
27 | \examples{
28 |
29 | data(BS.chr21)
30 | data(dmrs.ex)
31 | rawDiff <- meanDiff(BS.chr21, dmrs=dmrs.ex, testCovariate="CellType")
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | citHeader("To cite dmrseq in publications, please use:")
2 |
3 | citEntry(entry = "Article",
4 | title = "Detection and accurate false discovery rate control of differentially methylated regions from whole genome bisulfite sequencing",
5 | author = personList(as.person("Keegan Korthauer"),
6 | as.person("Sutirtha Chakraborty"),
7 | as.person("Yuval Benjamini"),
8 | as.person("Rafael A Irizarry")),
9 | journal = "Biostatistics",
10 | year = "2018",
11 | #volume = "",
12 | #number = "",
13 | pages = "kxy007",
14 | doi = "https://doi.org/10.1093/biostatistics/kxy007",
15 | textVersion =
16 | paste("Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R. A.",
17 | "Detection and accurate false discovery rate control of differentially methylated regions from whole genome bisulfite sequencing.",
18 | "Biostatistics kxy007, https://doi.org/10.1093/biostatistics/kxy007"))
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Keegan Korthauer
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/man/dmrPlotAnnotations.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/internal_plotting_functions.R
3 | \name{dmrPlotAnnotations}
4 | \alias{dmrPlotAnnotations}
5 | \title{Add annotations to DMR plots}
6 | \usage{
7 | dmrPlotAnnotations(gr, annoTrack)
8 | }
9 | \arguments{
10 | \item{gr}{a \code{GRanges} object that contains the DMRs to be
11 | plotted}
12 |
13 | \item{annoTrack}{a \code{SimpleGRangesList} object with two elements.
14 | The first contains CpG category information in the first element (optional)
15 | coding gene sequence information in the second element (optional).
16 | At least one of these elements needs to be non-null in order for
17 | any annotation to be plotted, but it is not necessary to contain
18 | both.}
19 | }
20 | \value{
21 | None
22 | }
23 | \description{
24 | Function to add visual representation of CpG categories
25 | and/or coding
26 | sequences to DMR plot
27 | }
28 | \details{
29 | An internal function that takes an annotation
30 | \code{SimpleGRangesList}
31 | object that
32 | contains CpG category information in the first element (optional) and / or
33 | coding gene sequence information in the second element (optional). If neither
34 | of these are present, then nothing will be plotted.
35 | }
36 |
--------------------------------------------------------------------------------
/man/annot.chr21.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/helper_functions.R
3 | \docType{data}
4 | \name{annot.chr21}
5 | \alias{annot.chr21}
6 | \title{annot.chr21: Annotation information for chromosome 21, hg38 genome}
7 | \format{
8 | a \code{GRangesList} object with two elements returned
9 | by \code{\link{getAnnot}}. The first
10 | contains CpG category information in the first element (optional)
11 | coding gene sequence information in the second element (optional).
12 | At least one of these elements needs to be non-null in order for
13 | any annotation to be plotted, but it is not necessary to contain
14 | both.
15 | }
16 | \source{
17 | Obtained from running
18 | \code{annoTrack} function and then subsetting the results to
19 | only include chromosome 21. A script which executes these steps
20 | and constructs the \code{annot.chr21}
21 | object may be found in \file{inst/scripts/get_annot.chr21.R}
22 | }
23 | \usage{
24 | data(annot.chr21)
25 | }
26 | \description{
27 | This is the annotation information returned from
28 | \code{\link{getAnnot}}, subsetted for chromosome 21 for convenience
29 | in running the examples. The annotation is obtained using the
30 | \code{annotatr} package.
31 | }
32 | \examples{
33 | data(annot.chr21)
34 | }
35 | \keyword{datasets}
36 |
--------------------------------------------------------------------------------
/man/dmrs.ex.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/helper_functions.R
3 | \docType{data}
4 | \name{dmrs.ex}
5 | \alias{dmrs.ex}
6 | \title{dmrs.ex: Example results of DMRs}
7 | \format{
8 | a data.frame that contains the results of the inference. The
9 | data.frame contains one row for each candidate region, and
10 | 10 columns, in the following order: 1. chr =
11 | chromosome, 2. start =
12 | start basepair position of the region, 3. end = end basepair position
13 | of the region,
14 | 4. indexStart = the index of the region's first CpG,
15 | 5. indexEnd = the index of the region's last CpG,
16 | 6. L = the number of CpGs contained in the region,
17 | 7. area = the sum of the smoothed beta values
18 | 8. beta = the coefficient value for the condition difference,
19 | 9. stat = the test statistic for the condition difference,
20 | 10. pval = the permutation p-value for the significance of the test
21 | statistic, and
22 | 11. qval = the q-value for the test statistic (adjustment
23 | for multiple comparisons to control false discovery rate).
24 | }
25 | \source{
26 | Obtained from running the examples in \code{\link{dmrseq}}.
27 | A script which executes these steps
28 | and constructs the \code{dmrs.ex}
29 | object may be found in \file{inst/scripts/get_dmrs.ex.R}
30 | }
31 | \usage{
32 | data(dmrs.ex)
33 | }
34 | \description{
35 | Example output from \code{dmrseq} function run on the
36 | example dataset \code{BS.chr21}.
37 | }
38 | \examples{
39 | data(dmrs.ex)
40 | }
41 | \keyword{datasets}
42 |
--------------------------------------------------------------------------------
/man/getAnnot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/getAnnot.R
3 | \name{getAnnot}
4 | \alias{getAnnot}
5 | \title{Retrieve annotation information}
6 | \usage{
7 | getAnnot(genomeName)
8 | }
9 | \arguments{
10 | \item{genomeName}{a character object that indicates which organism is
11 | under study. Use the function \code{builtin_genomes()} to see
12 | a character vector of available genome names to choose from (see
13 | \code{annotatr} documentation for more details).}
14 | }
15 | \value{
16 | a \code{SimpleGRangesList} object with two elements returned
17 | by \code{\link{getAnnot}}. The first
18 | contains CpG category information in the first element (optional)
19 | coding gene sequence information in the second element (optional).
20 | At least one of these elements needs to be non-null in order for
21 | any annotation to be plotted, but it is not necessary to contain
22 | both.
23 | }
24 | \description{
25 | Uses the \code{annotatr} package to retrieve annotation information (
26 | CpG category and gene coding sequences) for the \code{annoTrack} argument
27 | of \code{\link{plotDMRs}}. Allows for 5
28 | re-tries if download fails (to allow for a spotty internet connection).
29 | }
30 | \details{
31 | Note that this package needs to attach the \code{annotatr} package,
32 | and will
33 | return NULL if this cannot be done. You can still use the
34 | \code{\link{plotDMRs}} function without this optional annotation step,
35 | just by leaving the \code{annoTrack} argument as NULL.
36 | }
37 | \examples{
38 |
39 | # get annotation information for hg19
40 | annoTrack <- getAnnot('hg19')
41 |
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/vignettes/dmrseqBib.bib:
--------------------------------------------------------------------------------
1 | @article{Hansen2012,
2 | author = {Hansen, Kasper D and Langmead, Benjamin and Irizarry, Rafael A},
3 | title = {{BSmooth: from whole genome bisulfite sequencing reads to differentially
4 | methylated regions}},
5 | journal = {Genome Biology},
6 | year = {2012},
7 | volume = {13},
8 | number = {10},
9 | pages = {R83},
10 | doi = {10.1186/gb-2012-13-10-r83},
11 | pubmed = {23034175}
12 | }
13 |
14 | @article{Lister2009,
15 | author = {Lister, Ryan and Pelizzola, Mattia and Dowen, Robert H and Hawkins, R David and
16 | Hon, Gary C and Tonti-Filippini, Julian and Nery, Joseph R and Lee, Leonard and
17 | Ye, Zhen and Ngo, Que-Minh and Edsall, Lee and Antosiewicz-Bourget, Jessica and
18 | Stewart, Ron and Ruotti, Victor and Millar, A Harvey and Thomson, James A and Ren,
19 | Bing and Ecker, Joseph R},
20 | title = {{Human DNA methylomes at base resolution show widespread epigenomic differences}},
21 | journal = {Nature},
22 | year = {2009},
23 | volume = {462},
24 | number = {7271},
25 | pages = {315--322},
26 | doi = {10.1038/nature08514},
27 | pubmed = {19829295}
28 | }
29 |
30 | @article {Korthauer183210,
31 | author = {Korthauer, Keegan and Chakraborty, Sutirtha and Benjamini, Yuval and Irizarry, Rafael A.},
32 | title = {Detection and accurate False Discovery Rate control of differentially methylated regions from Whole Genome Bisulfite Sequencing},
33 | year = {2018},
34 | doi = {10.1101/183210},
35 | journal = {Biostatistics}
36 | }
37 |
38 |
39 |
40 | % Local Variables:
41 | % eval: (setq fill-column 100)
42 | % End:
43 |
--------------------------------------------------------------------------------
/man/BS.chr21.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/helper_functions.R
3 | \docType{data}
4 | \name{BS.chr21}
5 | \alias{BS.chr21}
6 | \title{BS.chr21: Whole-genome bisulfite sequencing for chromosome 21
7 | from Lister et al.}
8 | \format{
9 | An object of class \code{BSseq}.
10 | }
11 | \source{
12 | Obtained from
13 | \url{http://neomorph.salk.edu/human_methylome/data.html} specifically,
14 | the files \url{mc_h1_r1.tar.gz}, \url{mc_h1_r2.tar.gz},
15 | \url{mc_imr90_r1.tar.gz}, \url{mc_imr90_r2.tar.gz}
16 | A script which downloads these files and constructs the \code{BS.chr21}
17 | object may be found in \file{inst/scripts/get_BS.chr21.R} - this was
18 | based off of and modified from the get_BS.chr22.R script in the
19 | \code{bsseq} package. The object constructed here contains a
20 | different chromosome (22 instead of 21), and two additional samples
21 | (h1 and imr90 instead of just imr90) to enable identification of
22 | cell type-DMRs for examples.
23 | }
24 | \usage{
25 | data(BS.chr21)
26 | }
27 | \description{
28 | This dataset represents chromosome 21
29 | from the IMR90 and H1 cell lines sequenced in Lister et al.
30 | Only CpG methylation are included. The two samples from
31 | each cell line are two different extractions (ie. technical replicates),
32 | and are pooled in the analysis in the original paper.
33 | }
34 | \details{
35 | All coordinates are in hg18.
36 | }
37 | \examples{
38 | data(BS.chr21)
39 | BS.chr21
40 | }
41 | \references{
42 | R Lister et al. \emph{Human DNA methylomes at base
43 | resolution show widespread epigenomic differences}. Nature (2009) 462,
44 | 315-322.
45 | }
46 | \keyword{datasets}
47 |
--------------------------------------------------------------------------------
/inst/NEWS:
--------------------------------------------------------------------------------
1 | Changes in Version 1.3.1 (2018-11-02)
2 | _____________________________________
3 |
4 | o dmrseq now requires that input BSseq objects be ordered (as the BSseq()
5 | constructor no longer automatically orders loci).
6 |
7 | Changes in Version 1.1.20 (2018-10-11)
8 | _____________________________________
9 |
10 | o Minor bug fix for a rare situation that occured in the case of a multi-level
11 | covariate of interest if the region level model fitting procedure did not
12 | converge.
13 |
14 | Changes in Version 1.1.2 (2018-05-09)
15 | _____________________________________
16 |
17 | o The newly added `chrsPerChunk` argument specifies the number of chromosomes
18 | to compute at a time (default is 1).
19 |
20 | Changes in Version 0.99.11 (2018-04-05)
21 | ______________________________________
22 |
23 | o dmrseq now provides support for detecting large-scale methylation blocks. To
24 | use this feature, specify `block=TRUE`, and increase the smoothing span
25 | parameters `minInSpan`, `bpSpan`, and `maxGapSmooth`. More details are
26 | provided in the documentation and vignette.
27 |
28 | Changes in Version 0.99.8 (2018-03-21)
29 | ______________________________________
30 |
31 | o dmrseq no longer requires balanced, two-group comparisons. To run using a
32 | continuous or categorial covariate with more than two groups, simply pass in
33 | the name of a column in `pData` that contains this covariate. A continuous
34 | covariate is assmued if the data type in the `testCovariate` slot is
35 | continuous, with the exception of if there are only two unique values (then a
36 | two group comparison is carried out).
37 |
38 | Version 0.99.6 (2018-03-02)
39 | ___________________________
40 |
41 | o dmrseq is now available on Bioconductor devel (3.7) !
42 |
43 |
--------------------------------------------------------------------------------
/man/plotEmpiricalDistribution.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plotEmpiricalDistribution.R
3 | \name{plotEmpiricalDistribution}
4 | \alias{plotEmpiricalDistribution}
5 | \title{Plot the empirical distribution of the methylation beta vals or coverage}
6 | \usage{
7 | plotEmpiricalDistribution(
8 | bs,
9 | testCovariate = NULL,
10 | bySample = FALSE,
11 | type = "M",
12 | adj = 2.5
13 | )
14 | }
15 | \arguments{
16 | \item{bs}{a BSseq object}
17 |
18 | \item{testCovariate}{character specifying the column name of the
19 | \code{pData} slot of the BSseq object to include in the plot legend.}
20 |
21 | \item{bySample}{logical whether to plot a separate line for each sample,
22 | even if the grouping \code{testCovariate} is specified.
23 | Default value is FALSE (so samples with the same value of
24 | \code{testCovariate} will be collapsed into the same line). If
25 | \code{testCovariate} is not specified, this parameter does not have an
26 | effect and samples are automatically plotted separately.}
27 |
28 | \item{type}{a character indicating which type of density to plot - the
29 | methylation (beta) values ("M") or the coverage ("Cov"). Default is "M".}
30 |
31 | \item{adj}{a numeric value for the \code{adjust} parameter to pass to the
32 | \code{geom_line} function. Specifies how smooth the make the function.}
33 | }
34 | \value{
35 | a ggplot object
36 | }
37 | \description{
38 | Uses ggplot2 to plot smoothed density histograms of methylation
39 | proportions (beta values), or coverage. Methylation proportion densities
40 | are weighted by coverage.
41 | The number of curves plotted
42 | will be equal to the number of different values of \code{testCovariate},
43 | unless \code{bySample} is TRUE. This can take quite some time to
44 | execute for a large object, so it is recommended to first take a random
45 | sample of loci (say one million) before plotting.
46 | }
47 | \examples{
48 |
49 | data(BS.chr21)
50 |
51 | # plot beta values by sample group
52 | plotEmpiricalDistribution(BS.chr21, testCovariate="CellType")
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dmrseq: Inference for differentially methylated regions (DMRs) from bisulfite sequencing
2 |
3 | A central question in the analysis of bisulfite sequencing data
4 | is to detect regions (collections of
5 | neighboring CpGs) with systematic differences between conditions,
6 | as compared to within-condition variability. These so-called *Differentially
7 | Methylated Regions* (DMRs) are thought to be more informative than single CpGs
8 | in terms of of biological function.
9 |
10 |
11 |
12 |
13 |
14 | The package **dmrseq**
15 | provides a rigorous permutation-based approach to
16 | detect and perform inference for differential methylation by use of
17 | generalized least squares models that account for inter-individual and
18 | inter-CpG variability to generate region-level statistics that can be
19 | comparable across the genome. The framework performs well even
20 | on samples as small as two per group.
21 |
22 | ## Installation
23 |
24 | **dmrseq** is available on
25 | [Bioconductor](https://bioconductor.org/packages/dmrseq). You can install
26 | it with R version 3.5.0 or higher with the following commands:
27 |
28 | ```
29 | install.packages("BiocManager")
30 | BiocManager::install("dmrseq")
31 | ```
32 |
33 | ## Getting started
34 |
35 | See the vignette for information on how to use the package to perform
36 | typical methylation analysis workflows.
37 |
38 | ## Learn more
39 |
40 | More details of the **dmrseq** framework can be found in the manuscript
41 |
42 | > Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R.A.
43 | > Detection and accurate False Discovery Rate control of differentially
44 | methylated regions from Whole Genome Bisulfite Sequencing
45 | > *Biostatistics*, 2018 (in press).
46 | > [BioRxiv:10.1101/183210](http://www.biorxiv.org/content/early/2017/08/31/183210)
47 |
48 |
49 | ## License/Copyright
50 | [](https://opensource.org/licenses/MIT)
51 | This package is made available under an MIT license.
52 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: dmrseq
2 | Type: Package
3 | Title: Detection and inference of differentially methylated regions
4 | from Whole Genome Bisulfite Sequencing
5 | Version: 1.29.1
6 | Authors@R: c(person("Keegan", "Korthauer", role = c("cre", "aut"),
7 | email = "keegan@stat.ubc.ca",
8 | comment = c(ORCID = "0000-0002-4565-1654")),
9 | person("Rafael", "Irizarry", role = "aut",
10 | email = "rafa@jimmy.harvard.edu",
11 | comment = c(ORCID = "0000-0002-3944-4309")),
12 | person("Yuval", "Benjamini", role = "aut"),
13 | person("Sutirtha", "Chakraborty", role = "aut"))
14 | Description: This package implements an approach for scanning the
15 | genome to detect and perform accurate inference on differentially
16 | methylated regions from Whole Genome Bisulfite Sequencing data.
17 | The method is based on comparing detected regions
18 | to a pooled null distribution, that can be implemented even when as
19 | few as two samples per population are available.
20 | Region-level statistics are obtained by fitting a generalized
21 | least squares (GLS) regression model with a nested autoregressive
22 | correlated error structure for the effect of interest on transformed
23 | methylation proportions.
24 | License: MIT + file LICENSE
25 | Depends: R (>= 3.5), bsseq
26 | Imports: GenomicRanges,
27 | nlme,
28 | ggplot2,
29 | S4Vectors,
30 | RColorBrewer,
31 | bumphunter,
32 | DelayedMatrixStats (>= 1.1.13),
33 | matrixStats,
34 | BiocParallel,
35 | outliers,
36 | methods,
37 | locfit,
38 | IRanges,
39 | grDevices,
40 | graphics,
41 | stats,
42 | utils,
43 | annotatr,
44 | AnnotationHub,
45 | rtracklayer,
46 | GenomeInfoDb,
47 | splines
48 | Suggests: knitr,
49 | rmarkdown,
50 | BiocStyle,
51 | TxDb.Hsapiens.UCSC.hg19.knownGene,
52 | org.Hs.eg.db
53 | biocViews: ImmunoOncology, DNAMethylation, Epigenetics, MultipleComparison, Software,
54 | Sequencing, DifferentialMethylation, WholeGenome, Regression,
55 | FunctionalGenomics
56 | LazyData: true
57 | VignetteBuilder: knitr
58 | RoxygenNote: 7.2.3
59 |
--------------------------------------------------------------------------------
/man/simDMRs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/simDMRs.R
3 | \name{simDMRs}
4 | \alias{simDMRs}
5 | \title{Simulate Differentially Methylated Regions}
6 | \usage{
7 | simDMRs(bs, num.dmrs = 3000, delta.max0 = 0.3)
8 | }
9 | \arguments{
10 | \item{bs}{a BSseq object containing only control samples (from the same
11 | population) for which simulated DMRs will be added after dividing the
12 | population into two artificial groups.}
13 |
14 | \item{num.dmrs}{an integer specifying how many DMRs to add.}
15 |
16 | \item{delta.max0}{a proportion value indicating the mode value for the
17 | difference in proportion of methylated CpGs in the simulated DMRs (the
18 | actual value will be drawn from a scaled Beta distribution centered at
19 | this value). Default value is 0.3.}
20 | }
21 | \value{
22 | A named list object with 5 elements: (1)
23 | \code{gr.dmrs} is a \code{GRanges} object with \code{num.dmrs}
24 | ranges that represent the random DMRs added. (2) \code{dmr.mncov} is a
25 | numeric vector that contains the mean coverage in each simulated DMR. (3)
26 | \code{dmr.L} is a numeric vector that contains the number of CpGs in each
27 | simulated DMR. (4) \code{bs} is the BSseq object that contains the
28 | simulated DMRs. (5) \code{deltas} is a numeric vector that contains the
29 | effect size used for each DMR.
30 | }
31 | \description{
32 | Add simulated DMRs to observed control data. Control data will be split
33 | into two (artificial) populations.
34 | }
35 | \examples{
36 |
37 | # Add simulated DMRs to a BSseq dataset
38 | # This is just for illustrative purposes - ideally you would
39 | # add DMRs to a set of samples from the same condition (in our
40 | # example data, we have data from two different cell types)
41 | # In this case, we shuffle the samples by cell type to create
42 | # a null comparison.
43 |
44 | data(BS.chr21)
45 |
46 | BS.chr21.sim <- simDMRs(bs=BS.chr21[1:10000,c(1,3,2,4)],
47 | num.dmrs=50)
48 |
49 | # show the simulated DMRs GRanges object
50 | show(BS.chr21.sim$gr.dmrs)
51 |
52 | # show the updated BSseq object that includes the simulated DMRs
53 | show(BS.chr21.sim$bs)
54 |
55 | # examine effect sizes of the DMRs
56 | head(BS.chr21.sim$delta)
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(dmrseq)
4 | export(getAnnot)
5 | export(meanDiff)
6 | export(plotDMRs)
7 | export(plotEmpiricalDistribution)
8 | export(simDMRs)
9 | import(GenomicRanges)
10 | import(S4Vectors)
11 | import(annotatr)
12 | import(bsseq)
13 | import(ggplot2)
14 | import(nlme)
15 | importClassesFrom(bsseq,BSseq)
16 | importFrom(AnnotationHub,AnnotationHub)
17 | importFrom(AnnotationHub,query)
18 | importFrom(BiocParallel,MulticoreParam)
19 | importFrom(BiocParallel,bplapply)
20 | importFrom(BiocParallel,bpparam)
21 | importFrom(BiocParallel,register)
22 | importFrom(DelayedMatrixStats,colMedians)
23 | importFrom(DelayedMatrixStats,rowDiffs)
24 | importFrom(DelayedMatrixStats,rowMads)
25 | importFrom(DelayedMatrixStats,rowMeans2)
26 | importFrom(DelayedMatrixStats,rowSums2)
27 | importFrom(GenomeInfoDb,genome)
28 | importFrom(IRanges,IRanges)
29 | importFrom(RColorBrewer,brewer.pal)
30 | importFrom(bumphunter,clusterMaker)
31 | importFrom(bumphunter,getSegments)
32 | importFrom(grDevices,col2rgb)
33 | importFrom(grDevices,colorRampPalette)
34 | importFrom(grDevices,dev.off)
35 | importFrom(grDevices,hcl)
36 | importFrom(grDevices,pdf)
37 | importFrom(grDevices,rainbow)
38 | importFrom(grDevices,rgb)
39 | importFrom(graphics,arrows)
40 | importFrom(graphics,axis)
41 | importFrom(graphics,layout)
42 | importFrom(graphics,legend)
43 | importFrom(graphics,lines)
44 | importFrom(graphics,mtext)
45 | importFrom(graphics,par)
46 | importFrom(graphics,plot)
47 | importFrom(graphics,points)
48 | importFrom(graphics,rect)
49 | importFrom(graphics,rug)
50 | importFrom(graphics,text)
51 | importFrom(locfit,locfit)
52 | importFrom(locfit,lp)
53 | importFrom(matrixStats,rowRanges)
54 | importFrom(methods,is)
55 | importFrom(outliers,grubbs.test)
56 | importFrom(rtracklayer,liftOver)
57 | importFrom(splines,ns)
58 | importFrom(stats,anova)
59 | importFrom(stats,approxfun)
60 | importFrom(stats,as.formula)
61 | importFrom(stats,formula)
62 | importFrom(stats,lm)
63 | importFrom(stats,loess)
64 | importFrom(stats,median)
65 | importFrom(stats,model.matrix)
66 | importFrom(stats,p.adjust)
67 | importFrom(stats,predict)
68 | importFrom(stats,preplot)
69 | importFrom(stats,qt)
70 | importFrom(stats,quantile)
71 | importFrom(stats,rbeta)
72 | importFrom(stats,rbinom)
73 | importFrom(stats,runif)
74 | importFrom(utils,combn)
75 | importMethodsFrom(bsseq,pData)
76 | importMethodsFrom(bsseq,sampleNames)
77 | importMethodsFrom(bsseq,seqnames)
78 | importMethodsFrom(bsseq,start)
79 | importMethodsFrom(bsseq,width)
80 |
--------------------------------------------------------------------------------
/R/meanDiff.R:
--------------------------------------------------------------------------------
1 | #' Function to compute raw mean methylation differences
2 | #'
3 | #' This function calculates raw mean methylation differences for the
4 | #' covariate of interest over a set of DMRs (or regions of interest),
5 | #' assuming a simple two-group comparison.
6 | #'
7 | #' @param bs a \code{BSseq} object
8 | #' @param dmrs a data.frame with one row per DMR. This can be in the format
9 | #' of \code{dmrseq} output, but at least should contain the indexStart and
10 | #' indexEnd values of the regions of interest.
11 | #' @param testCovariate a character indicating the covariate of interest in
12 | #' the \code{pData} slot of \code{bs}.
13 | #'
14 | #' @return numeric vector of raw mean methylation differences.
15 | #'
16 | #' @importFrom DelayedMatrixStats rowMeans2
17 | #'
18 | #' @export
19 | #'
20 | #' @examples
21 | #'
22 | #' data(BS.chr21)
23 | #' data(dmrs.ex)
24 | #' rawDiff <- meanDiff(BS.chr21, dmrs=dmrs.ex, testCovariate="CellType")
25 | #'
26 | meanDiff <- function(bs, dmrs, testCovariate) {
27 | # convert covariates to column numbers if characters
28 | if (is.character(testCovariate)) {
29 | testCovariate <- which(colnames(pData(bs)) == testCovariate)
30 | if (length(testCovariate) == 0) {
31 | stop("testCovariate not found in pData(). ",
32 | "Please specify a valid testCovariate")
33 | }
34 | }
35 |
36 | coeff <- seq(2,(2 + length(testCovariate) - 1))
37 | testCov <- pData(bs)[, testCovariate]
38 | if (length(unique(testCov)) == 1) {
39 | message("Warning: only one unique value of the specified ",
40 | "covariate of interest. Assuming null comparison and ",
41 | "splitting sample group into two equal groups")
42 | testCov <- rep(1, length(testCov))
43 | testCov[seq_len(round(length(testCov)/2))] <- 0
44 | }
45 |
46 | design <- model.matrix(~testCov)
47 | colnames(design)[coeff] <- colnames(pData(bs))[testCovariate]
48 |
49 | if (length(unique(design[, coeff])) != 2) {
50 | message("Not a two-group comparison. Can't compute simple mean ",
51 | "methylation differences. ",
52 | "Returning beta estimates instead")
53 | return(dmrs$beta)
54 | } else {
55 | prop.mat <- getCoverage(bs, type = "M") /
56 | getCoverage(bs, type = "Cov")
57 | levs <- unique(design[, coeff])
58 |
59 | indexRanges <- IRanges(start(dmrs$index), end(dmrs$index))
60 | prop.mat.dmr <- extractROWS(prop.mat, indexRanges)
61 | prop.mat1.means <- DelayedMatrixStats::rowMeans2(prop.mat.dmr[,
62 | design[, coeff] == levs[which.min(levs)]],
63 | na.rm=TRUE)
64 | prop.mat2.means <- DelayedMatrixStats::rowMeans2(prop.mat.dmr[,
65 | design[, coeff] == levs[which.max(levs)]],
66 | na.rm=TRUE)
67 |
68 | meanDiff <- IRanges::mean(IRanges::relist(prop.mat2.means - prop.mat1.means,
69 | indexRanges), na.rm=TRUE)
70 |
71 | return(meanDiff)
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/inst/scripts/get_BS.chr21.R:
--------------------------------------------------------------------------------
1 | ## The following script downloads and constructs the BS.chr21 dataset,
2 | ## included in the dmrseq package
3 | ## This script was modified from bsseq - includes two additional samples
4 | ## compared to the example data in bsseq
5 |
6 | library(dmrseq)
7 |
8 | ## First we download. Each file is slightly less than 200 MB
9 | # you may need to change the 'method' in 'download.file' to suit
10 | # the utilities available on your OS
11 |
12 | # cell type imr90, replicates 1 and 2
13 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_imr90_r1.tar.gz",
14 | destfile = "mc_imr90_r1.tar.gz", method='curl')
15 | untar("mc_imr90_r1.tar.gz", "mc_imr90_r1/mc_imr90_r1_21", compressed = TRUE)
16 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_imr90_r2.tar.gz",
17 | destfile = "mc_imr90_r2.tar.gz", method='curl')
18 | untar("mc_imr90_r2.tar.gz", "mc_imr90_r2/mc_imr90_r2_21", compressed = TRUE)
19 |
20 | # cell type h1, replicates 1 and 2
21 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_h1_r1.tar.gz",
22 | destfile = "mc_h1_r1.tar.gz", method='curl')
23 | untar("mc_h1_r1.tar.gz", "mc_h1_r1/mc_h1_r1_21", compressed = TRUE)
24 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_h1_r2.tar.gz",
25 | destfile = "mc_h1_r2.tar.gz", method='curl')
26 | untar("mc_h1_r2.tar.gz", "mc_h1_r2/mc_h1_r2_21", compressed = TRUE)
27 |
28 |
29 | ## Now the workhorse function
30 |
31 | read.lister <- function(file) {
32 | dat <- read.table(file, skip = 1, row.names = NULL,
33 | col.names = c("chr", "pos", "strand", "context", "M", "Cov"),
34 | colClasses = c("character", "integer", "character",
35 | "character", "integer", "integer"))
36 | ## we remove all non-CpG calls. This includes SNPs
37 | dat <- dat[dat$context == "CG",]
38 | dat$context <- NULL
39 | dat$chr <- paste("chr", dat$chr, sep = "")
40 | ## Now we need to handle that the data has separate lines for each strand
41 | ## We join these
42 | tmp <- dat[dat$strand == "+",]
43 | BS.forward <- BSseq(pos = tmp$pos, chr = tmp$chr,
44 | M = as.matrix(tmp$M, ncol = 1),
45 | Cov = as.matrix(tmp$Cov, ncol = 1),
46 | sampleNames = "forward")
47 | tmp <- dat[dat$strand == "-",]
48 | BS.reverse <- BSseq(pos = tmp$pos - 1L, chr = tmp$chr,
49 | M = as.matrix(tmp$M, ncol = 1),
50 | Cov = as.matrix(tmp$Cov, ncol = 1),
51 | sampleNames = "reverse")
52 | BS <- combine(BS.forward, BS.reverse)
53 | BS <- collapseBSseq(BS, columns = c("a", "a"))
54 | BS
55 | }
56 |
57 | BS.imr90.r1 <- read.lister("mc_imr90_r1/mc_imr90_r1_21")
58 | sampleNames(BS.imr90.r1) <- "imr90.r1"
59 | BS.imr90.r2 <- read.lister("mc_imr90_r2/mc_imr90_r2_21")
60 | sampleNames(BS.imr90.r2) <- "imr90.r2"
61 |
62 | BS.h1.r1 <- read.lister("mc_h1_r1/mc_h1_r1_21")
63 | sampleNames(BS.h1.r1) <- "h1.r1"
64 | BS.h1.r2 <- read.lister("mc_h1_r2/mc_h1_r2_21")
65 | sampleNames(BS.h1.r2) <- "h1.r2"
66 |
67 | BS.chr21 <- combine(BS.imr90.r1, BS.imr90.r2, BS.h1.r1, BS.h1.r2)
68 | pData(BS.chr21)$CellType <- c(rep("imr90",2), rep("h1",2))
69 | pData(BS.chr21)$Rep <- rep(c("replicate1", "replicate2"),2)
70 | validObject(BS.chr21)
71 | pData(BS.chr21)
72 |
73 | #remove any loci which have no coverage in one or more samples
74 | meth.levels.raw = getMeth(BS.chr21, type = "raw")
75 | no.hits = which(is.na(rowMeans(meth.levels.raw)) == TRUE)
76 | BS.chr21 = BS.chr21[-no.hits]
77 |
78 | save(BS.chr21, file = "./data/BS.chr21.rda")
79 | library(tools)
80 | resaveRdaFiles("./data/BS.chr21.rda")
81 |
82 |
83 |
--------------------------------------------------------------------------------
/R/getAnnot.R:
--------------------------------------------------------------------------------
1 | #' Retrieve annotation information
2 | #'
3 | #' Uses the \code{annotatr} package to retrieve annotation information (
4 | #' CpG category and gene coding sequences) for the \code{annoTrack} argument
5 | #' of \code{\link{plotDMRs}}. Allows for 5
6 | #' re-tries if download fails (to allow for a spotty internet connection).
7 | #'
8 | #' @details Note that this package needs to attach the \code{annotatr} package,
9 | #' and will
10 | #' return NULL if this cannot be done. You can still use the
11 | #' \code{\link{plotDMRs}} function without this optional annotation step,
12 | #' just by leaving the \code{annoTrack} argument as NULL.
13 | #'
14 | #' @param genomeName a character object that indicates which organism is
15 | #' under study. Use the function \code{builtin_genomes()} to see
16 | #' a character vector of available genome names to choose from (see
17 | #' \code{annotatr} documentation for more details).
18 | #'
19 | #' @return a \code{SimpleGRangesList} object with two elements returned
20 | #' by \code{\link{getAnnot}}. The first
21 | #' contains CpG category information in the first element (optional)
22 | #' coding gene sequence information in the second element (optional).
23 | #' At least one of these elements needs to be non-null in order for
24 | #' any annotation to be plotted, but it is not necessary to contain
25 | #' both.
26 | #'
27 | #' @export
28 | #'
29 | #' @import annotatr
30 | #' @importFrom AnnotationHub AnnotationHub query
31 | #' @importFrom rtracklayer liftOver
32 | #' @importFrom GenomeInfoDb genome
33 | #'
34 | #' @examples
35 | #'
36 | #' # get annotation information for hg19
37 | #' annoTrack <- getAnnot('hg19')
38 | #'
39 | #'
40 | getAnnot <- function(genomeName) {
41 | requireNamespace("annotatr")
42 | liftTo <- NULL
43 | if(genomeName == 'hg18'){
44 | message("Genome ", genomeName, " will be built by lifting over ",
45 | "hg19 annotations from annotatr")
46 | liftTo <- 'hg18'
47 | genomeName <- 'hg19'
48 | }else if (!genomeName %in% annotatr::builtin_genomes()) {
49 | message("Genome ", genomeName, " is not supported by ",
50 | "annotatr at this time")
51 | return(NULL)
52 | }
53 |
54 | if (is.null(genomeName)) {
55 | return(NULL)
56 | } else {
57 | annot_CpG <- paste0(c(genomeName, "_cpgs"), collapse = "")
58 | annot_genes <- paste0(c(genomeName, "_genes_cds"), collapse = "")
59 |
60 | # Build the annotations (a single GRanges object)
61 |
62 | # annotatr downloads files for genomes that aren't natively supported
63 | # (e.g. hg38)
64 | # Download has a nonzero fail rate; allow up to 5 retries before
65 | # throwing an error
66 |
67 | for (attempt in seq_len(5)) {
68 | cpg <- try(annotatr::build_annotations(genome = genomeName,
69 | annotations = annot_CpG),
70 | silent = TRUE)
71 | if (!is(cpg, "try-error")) {
72 | message("Download of CpG annotation successful!")
73 | fail1 <- 0
74 | break
75 | } else {
76 | message(cpg)
77 | if (5 - attempt > 0)
78 | message("Trying again (", 5 - attempt, " attempts remaining)")
79 | fail1 <- 1
80 | }
81 | }
82 |
83 | for (attempt in seq_len(5)) {
84 | genes <- try(annotatr::build_annotations(genome = genomeName,
85 | annotations = annot_genes),
86 | silent = TRUE)
87 | if (!is(genes, "try-error")) {
88 | message("Download of Gene annotation successful!")
89 | fail2 <- 0
90 | break
91 | } else {
92 | message(genes)
93 | if (5 - attempt > 0)
94 | message("Trying again (", 5 - attempt, " attempts remaining)")
95 | fail2 <- 1
96 | }
97 | }
98 |
99 |
100 | if (fail1 == 0 && fail2 == 0) {
101 | # lift over hg19 coordinates to hg18 using annotationHub
102 | if (!is.null(liftTo)){
103 | ah = AnnotationHub()
104 | chainfiles <- query(ah , c(genomeName, liftTo, "chainfile"))
105 | cf <- which(grepl(paste0(genomeName, "To", liftTo),
106 | chainfiles$title,
107 | ignore.case=TRUE))
108 | if (length(cf) == 0){
109 | message("LiftOver from ", genomeName, " to ", liftTo,
110 | " was unsucccessful")
111 | return(NULL)
112 | }else if(length(cf) > 1){
113 | # take the first matching chain if more than one
114 | cf <- cf[1]
115 | }
116 |
117 | chain <- chainfiles[[names(chainfiles)[cf]]]
118 |
119 | cpg.new <- unlist(liftOver(cpg, chain))
120 | genes.new <- unlist(liftOver(genes, chain))
121 |
122 | GenomeInfoDb::genome(cpg) <- liftTo
123 | GenomeInfoDb::genome(genes) <- liftTo
124 | }
125 |
126 | keep <- which(!is.na(genes$symbol))
127 | genes <- genes[keep, ]
128 | cpg$type <- substr(cpg$type, 10, nchar(cpg$type))
129 |
130 | annot <- GRangesList(CpGs = cpg, Exons = genes, compress=FALSE)
131 | return(annot)
132 | } else {
133 | message("Download with annotatr::build_annotations() failed; Annotation could not be retrieved.")
134 | return(NULL)
135 | }
136 | }
137 | }
138 |
--------------------------------------------------------------------------------
/R/plotEmpiricalDistribution.R:
--------------------------------------------------------------------------------
1 | #' Plot the empirical distribution of the methylation beta vals or coverage
2 | #'
3 | #' Uses ggplot2 to plot smoothed density histograms of methylation
4 | #' proportions (beta values), or coverage. Methylation proportion densities
5 | #' are weighted by coverage.
6 | #' The number of curves plotted
7 | #' will be equal to the number of different values of \code{testCovariate},
8 | #' unless \code{bySample} is TRUE. This can take quite some time to
9 | #' execute for a large object, so it is recommended to first take a random
10 | #' sample of loci (say one million) before plotting.
11 | #'
12 | #' @param bs a BSseq object
13 | #'
14 | #' @param testCovariate character specifying the column name of the
15 | #' \code{pData} slot of the BSseq object to include in the plot legend.
16 | #'
17 | #' @param bySample logical whether to plot a separate line for each sample,
18 | #' even if the grouping \code{testCovariate} is specified.
19 | #' Default value is FALSE (so samples with the same value of
20 | #' \code{testCovariate} will be collapsed into the same line). If
21 | #' \code{testCovariate} is not specified, this parameter does not have an
22 | #' effect and samples are automatically plotted separately.
23 | #'
24 | #' @param type a character indicating which type of density to plot - the
25 | #' methylation (beta) values ("M") or the coverage ("Cov"). Default is "M".
26 | #'
27 | #' @param adj a numeric value for the \code{adjust} parameter to pass to the
28 | #' \code{geom_line} function. Specifies how smooth the make the function.
29 | #'
30 | #' @return a ggplot object
31 | #'
32 | #' @importFrom locfit locfit lp
33 | #'
34 | #' @export
35 | #'
36 | #' @examples
37 | #'
38 | #' data(BS.chr21)
39 | #'
40 | #' # plot beta values by sample group
41 | #' plotEmpiricalDistribution(BS.chr21, testCovariate="CellType")
42 | #'
43 | plotEmpiricalDistribution <- function(bs,
44 | testCovariate = NULL,
45 | bySample = FALSE,
46 | type = "M",
47 | adj = 2.5) {
48 | #satisfy check
49 | M <- Cov <- group <- wt <- NULL
50 |
51 | if (!(type %in% c("M", "Cov"))){
52 | stop("type must be either M or Cov")
53 | }
54 |
55 | if(is.null(testCovariate) & !bySample){
56 | message("No testCovariate specified; plotting each sample separately.")
57 | bySample = TRUE
58 | }
59 |
60 | meth.mat <- getCoverage(bs, type = "M")
61 | unmeth.mat <- getCoverage(bs, type = "Cov") - meth.mat
62 |
63 | meth.levelsm <- data.frame(meth.mat/
64 | (meth.mat + unmeth.mat))
65 | cov.matm <- data.frame((meth.mat + unmeth.mat))
66 |
67 | if (!is.null(testCovariate)) {
68 | if (sum(grepl(testCovariate, colnames(pData(bs)))) == 0) {
69 | stop("Error: no column in pData() found ",
70 | "that matches the testCovariate")
71 | } else if (length(grep(testCovariate, colnames(pData(bs)))) > 1) {
72 | stop("Error: testCovariate matches more ",
73 | "than one column in pData()")
74 | }
75 | mC <- grep(testCovariate, colnames(pData(bs)))
76 | grouplab <- pData(bs)[,mC]
77 | }else{
78 | if(is.null(sampleNames(bs))){
79 | grouplab <- as.character(seq_len(ncol(bs)))
80 | }else{
81 | grouplab <- sampleNames(bs)
82 | }
83 | }
84 |
85 | meth.levelsm <- utils::stack(meth.levelsm)
86 | colnames(meth.levelsm)[1] <- "M"
87 | meth.levelsm$Cov <- utils::stack(cov.matm)$values
88 |
89 | if(is.null(sampleNames(bs))){
90 | meth.levelsm$sample <- sort(rep(seq_len(ncol(bs)), nrow(bs)))
91 | }else{
92 | meth.levelsm$sample <- unlist(lapply(sampleNames(bs), function(x)
93 | rep(x, nrow(bs))))
94 | }
95 |
96 | if (!is.null(testCovariate)){
97 | meth.levelsm$group <-
98 | unlist(lapply(seq_len(ncol(bs)), function(x)
99 | rep(pData(bs)[x,mC], nrow(bs))))
100 | }else{
101 | meth.levelsm$group <- meth.levelsm$sample
102 | }
103 |
104 | if (!bySample){
105 | if (type=="M"){
106 | # compute weights - sum over all samples in group ##
107 | covtots <- rep(NA, ncol(cov.matm))
108 | names(covtots) <- grouplab
109 | for(l in unique(grouplab)){
110 | covtots[names(covtots) == l] <- sum(colSums(cov.matm)[grouplab == l])
111 | }
112 |
113 | wt.matm <- data.frame(t(t(cov.matm) / covtots))
114 | meth.levelsm$wt <- utils::stack(wt.matm)$values
115 |
116 | p1 <- ggplot(meth.levelsm,
117 | aes(M, colour = group, group = group, weight = wt)) +
118 | geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
119 | xlab("Methylation Proportion") +
120 | theme_bw()
121 | }else{
122 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1,
123 | colour = group, group = group)) +
124 | geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
125 | scale_x_continuous(trans="log2") +
126 | xlab("Coverage") +
127 | theme_bw()
128 | }
129 | p1 <- p1 + labs(colour = "Group")
130 | }else{
131 | if (type=="M"){
132 | wt.matm <- data.frame(t(t(cov.matm) / colSums(cov.matm)))
133 | meth.levelsm$wt <- utils::stack(wt.matm)$values
134 |
135 | if(identical(meth.levelsm$group, meth.levelsm$sample)){
136 | p1 <- ggplot(meth.levelsm,
137 | aes(M, colour = group, group = sample, weight = wt)) +
138 | labs(color = "Sample")
139 | }else{
140 | if(ncol(bs) <= 12){
141 | p1 <- ggplot(meth.levelsm,
142 | aes(M, colour = group, group = sample, weight = wt,
143 | linetype = sample)) +
144 | labs(color = "Group", linetype = "Sample")
145 | }else{
146 | p1 <- ggplot(meth.levelsm,
147 | aes(M, colour = group, group = sample, weight = wt)) +
148 | labs(color = "Group")
149 | }
150 | }
151 | p1 <- p1 + geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
152 | xlab("Methylation Proportion") +
153 | theme_bw()
154 | }else{
155 | if(identical(meth.levelsm$group, meth.levelsm$sample)){
156 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1,
157 | colour = group, group = sample)) +
158 | labs(color = "Sample")
159 | }else{
160 | if(ncol(bs) <= 12){
161 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1,
162 | colour = group, group = sample,
163 | linetype = sample)) +
164 | labs(color = "Group", linetype = "Sample")
165 | }else{
166 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1,
167 | colour = group, group = sample)) +
168 | labs(color = "Group")
169 | }
170 | }
171 |
172 | p1 <- p1 +
173 | geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) +
174 | scale_x_continuous(trans="log2") +
175 | xlab("Coverage") +
176 | theme_bw()
177 | }
178 | }
179 | return(p1)
180 | }
181 |
182 |
--------------------------------------------------------------------------------
/man/dmrseq.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dmrseq.R
3 | \name{dmrseq}
4 | \alias{dmrseq}
5 | \title{Main function for detecting and evaluating significance of DMRs.}
6 | \usage{
7 | dmrseq(
8 | bs,
9 | testCovariate,
10 | adjustCovariate = NULL,
11 | cutoff = 0.1,
12 | minNumRegion = 5,
13 | smooth = TRUE,
14 | bpSpan = 1000,
15 | minInSpan = 30,
16 | maxGapSmooth = 2500,
17 | maxGap = 1000,
18 | verbose = TRUE,
19 | maxPerms = 10,
20 | matchCovariate = NULL,
21 | BPPARAM = bpparam(),
22 | stat = "stat",
23 | block = FALSE,
24 | blockSize = 5000,
25 | chrsPerChunk = 1
26 | )
27 | }
28 | \arguments{
29 | \item{bs}{bsseq object containing the methylation values as well as the
30 | phenotype matrix that contains sample level covariates}
31 |
32 | \item{testCovariate}{Character value indicating which variable
33 | (column name) in \code{pData(bs)} to test
34 | for association of methylation levels.
35 | Can alternatively specify an integer value indicating
36 | which of column of
37 | \code{pData(bs)} to use. This is used to construct the
38 | design matrix for the test statistic calculation. To run using a
39 | continuous or categorial covariate with more than two groups, simply pass in
40 | the name of a column in `pData` that contains this covariate. A continuous
41 | covariate is assmued if the data type in the `testCovariate` slot is
42 | continuous, with the exception of if there are only two unique values
43 | (then a two group comparison is carried out).}
44 |
45 | \item{adjustCovariate}{an (optional) character value or vector
46 | indicating which variables (column names) in \code{pData(bs)}
47 | will be adjusted for when
48 | testing for the association of methylation value with the
49 | \code{testCovariate}.
50 | Can alternatively specify an
51 | integer value or vector indicating
52 | which of the columns of \code{pData(bs)} to adjust for.
53 | If not NULL (default), then this is also used to
54 | construct the design matrix for the test statistic calculation.}
55 |
56 | \item{cutoff}{scalar value that represents the absolute value (or a vector
57 | of two numbers representing a lower and upper bound) for the cutoff of
58 | the single CpG coefficient that is used to discover
59 | candidate regions. Default value is 0.10.}
60 |
61 | \item{minNumRegion}{positive integer that represents the minimum number of
62 | CpGs to consider for a candidate region. Default value is 5.
63 | Minimum value is 3.}
64 |
65 | \item{smooth}{logical value that indicates whether or not to smooth the
66 | CpG level signal when discovering candidate regions.
67 | Defaults to TRUE.}
68 |
69 | \item{bpSpan}{a positive integer that represents the length in basepairs
70 | of the smoothing span window if \code{smooth} is TRUE. Default value is
71 | 1000.}
72 |
73 | \item{minInSpan}{positive integer that represents the minimum number of
74 | CpGs in a smoothing span window if \code{smooth} is TRUE.
75 | Default value is 30.}
76 |
77 | \item{maxGapSmooth}{integer value representing maximum number of basepairs
78 | in between neighboring CpGs to be included in the same
79 | cluster when performing smoothing (should generally be larger than
80 | \code{maxGap})}
81 |
82 | \item{maxGap}{integer value representing maximum number of basepairs in
83 | between neighboring CpGs to be included in the same DMR.}
84 |
85 | \item{verbose}{logical value that indicates whether progress messages
86 | should be printed to stdout. Defaults value is TRUE.}
87 |
88 | \item{maxPerms}{a positive integer that represents the maximum number
89 | of permutations that will be used to generate the global null
90 | distribution of test statistics. Default value is 10.}
91 |
92 | \item{matchCovariate}{An (optional) character value
93 | indicating which variable (column name) of \code{pData(bs)}
94 | will be blocked for when
95 | constructing the permutations in order to
96 | test for the association of methylation value with the
97 | \code{testCovariate}, only to be used when \code{testCovariate}
98 | is a two-group factor and the number of permutations possible is less
99 | than 500000.
100 | Alternatively, you can specify an integer value indicating
101 | which column of \code{pData(bs)} to block for.
102 | Blocking means that only permutations with balanced
103 | composition of \code{testCovariate} values will be used (for example if
104 | you have samples from different gender and this is not your covariate of
105 | interest,
106 | it is recommended to use gender as a matching covariate to avoid one
107 | of the permutations testing entirely males versus females; this violates
108 | the null hypothesis and will decrease power).
109 | If not NULL (default), then no blocking is performed.}
110 |
111 | \item{BPPARAM}{a \code{BiocParallelParam} object to specify the parallel
112 | backend. The default
113 | option is \code{BiocParallel::bpparam()} which will automatically creates
114 | a cluster appropriate for the operating system.}
115 |
116 | \item{stat}{a character vector indicating the name of the column of the
117 | output to use as the region-level test statistic. Default value is 'stat'
118 | which is the region level-statistic designed to be comparable across the
119 | genome.
120 | It is not recommended to change this argument, but it can be done for
121 | experimental purposes. Possible values are: 'L' - the number of loci
122 | in the region, 'area' - the sum of the smoothed loci statistics,
123 | 'beta' - the effect size of the region, 'stat' - the test statistic for
124 | the region, or 'avg' - the average smoothed loci statistic.}
125 |
126 | \item{block}{logical indicating whether to search for large-scale (low
127 | resolution) blocks of differential methylation (default is FALSE, which
128 | means that local DMRs are desired). If TRUE, the parameters for
129 | \code{bpSpan}, \code{minInSpan}, and \code{maxGapSmooth} should be adjusted
130 | (increased) accordingly. This setting will also merge
131 | candidate regions that (1) are in the same direction and (2) are less than
132 | 1kb apart with no covered CpGs separating them. The region-level model used
133 | is also slightly modified - instead of a loci-specific intercept for each
134 | CpG in theregion, the intercept term is modeled as a natural spline with
135 | one interior knot per each 10kb of length (up to 10 interior knots).}
136 |
137 | \item{blockSize}{numeric value indicating the minimum number of basepairs
138 | to be considered a block (only used if \code{block}=TRUE). Default is
139 | 5000 basepairs.}
140 |
141 | \item{chrsPerChunk}{a positive integer value indicating the number of
142 | chromosomes per chunk. The default is 1, meaning that the data will be
143 | looped through one chromosome at a time. When pairing up multiple
144 | chromosomes per chunk, sizes (in terms of numbers of CpGs) will be taken
145 | into consideration to balance the sizes of each chunk.}
146 | }
147 | \value{
148 | a \code{GRanges} object that contains the results of the inference.
149 | The object contains one row for each candidate region, sorted by q-value
150 | and then chromosome. The standard
151 | \code{GRanges} chr, start, and end are included, along with at least
152 | 7 metadata
153 | columns, in the following order:
154 | 1. L = the number of CpGs contained in the region,
155 | 2. area = the sum of the smoothed beta values
156 | 3. beta = the coefficient value for the condition difference (there
157 | will be more than one column here if a multi-group comparison
158 | was performed),
159 | 4. stat = the test statistic for the condition difference,
160 | 5. pval = the permutation p-value for the significance of the test
161 | statistic, and
162 | 6. qval = the q-value for the test statistic (adjustment
163 | for multiple comparisons to control false discovery rate).
164 | 7. index = an \code{IRanges} containing the indices of the region's
165 | first CpG to last CpG.
166 | }
167 | \description{
168 | Performs a two-step approach that (1) detects candidate regions, and
169 | (2) scores candidate regions with an exchangeable (across the genome)
170 | statistic and evaluates statistical significance using a
171 | permuation test on the pooled null distribution of scores.
172 | }
173 | \examples{
174 |
175 | # load example data
176 | data(BS.chr21)
177 |
178 | # the covariate of interest is the 'CellType' column of pData(BS.chr21)
179 | testCovariate <- 'CellType'
180 |
181 | # run dmrseq on a subset of the chromosome (10K CpGs)
182 | regions <- dmrseq(bs=BS.chr21[240001:250000,],
183 | cutoff = 0.05,
184 | testCovariate=testCovariate)
185 |
186 | }
187 | \keyword{inference}
188 |
--------------------------------------------------------------------------------
/man/plotDMRs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plotDMRs.R
3 | \name{plotDMRs}
4 | \alias{plotDMRs}
5 | \title{Plot Differentially Methylated Regions}
6 | \usage{
7 | plotDMRs(
8 | BSseq,
9 | regions = NULL,
10 | testCovariate = NULL,
11 | extend = (end(regions) - start(regions) + 1)/2,
12 | main = "",
13 | addRegions = regions,
14 | annoTrack = NULL,
15 | col = NULL,
16 | lty = NULL,
17 | lwd = NULL,
18 | label = NULL,
19 | mainWithWidth = TRUE,
20 | regionCol = .alpha("#C77CFF", 0.2),
21 | addTicks = TRUE,
22 | addPoints = TRUE,
23 | pointsMinCov = 1,
24 | highlightMain = FALSE,
25 | qval = TRUE,
26 | stat = TRUE,
27 | verbose = TRUE,
28 | includeYlab = TRUE,
29 | compareTrack = NULL,
30 | labelCols = NULL,
31 | horizLegend = FALSE,
32 | addLines = TRUE,
33 | linesMinCov = 1
34 | )
35 | }
36 | \arguments{
37 | \item{BSseq}{An object of class BSseq.}
38 |
39 | \item{regions}{A data.frame containing the DMRs (output from the main
40 | \code{dmrseq}) function.}
41 |
42 | \item{testCovariate}{integer value or vector indicating which of columns of
43 | \code{pData(bs)} contains the covariate of interest.
44 | This is used to construct the sample labels and colors (unless this is
45 | over-ridden by specifying \code{label}).}
46 |
47 | \item{extend}{Describes how much the plotting region should be extended in
48 | either direction. The total width of the plot is equal to the width of the
49 | region plus twice extend.}
50 |
51 | \item{main}{The plot title. The default is to construct a title with
52 | information about which genomic region is being plotted.}
53 |
54 | \item{addRegions}{A set of additional regions to be highlighted on the
55 | plots. Same format as the \code{regions} argument.}
56 |
57 | \item{annoTrack}{a \code{GRangesList} object with two elements returned
58 | by \code{\link{getAnnot}}. The first
59 | contains CpG category information in the first element (optional)
60 | coding gene sequence information in the second element (optional).
61 | At least one of these elements needs to be non-null in order for
62 | any annotation to be plotted, but it is not necessary to contain
63 | both.}
64 |
65 | \item{col}{The color of the methylation estimates. It is recommended to
66 | leave this value as default (NULL), and specify a value of
67 | \code{testCovariate} to indicate which column of \code{pData(bs)}
68 | to use as a factor for coloring the points and lines of the plot.
69 | Alternatively, you can specify particular colors by
70 | passing this information through the \code{pData} slot of the
71 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do
72 | so, place the color value for each sample in a column titled \code{col},
73 | and leave this argument as its default value of NULL. Alternatively,
74 | you may specify a vector of color values (one for each sample), but
75 | you *must* make sure that this vector is in the same order as the samples
76 | are in the BSseq object. If NULL and no \code{col} column is found in
77 | \code{pData}, then estimates are plotted in black for all samples.}
78 |
79 | \item{lty}{The line type of the methylation estimates. It is recommended to
80 | pass this information through the \code{pData} slot of the
81 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do
82 | so, place the line type value for each sample in a column titled \code{lty},
83 | and leave this argument as its default value of NULL. Alternatively,
84 | you may specify a vector of line type values (one for each sample), but
85 | you *must* make sure that this vector is in the same order as the samples
86 | are in the BSseq object. If NULL and no \code{lty} column is found in
87 | \code{pData}, then estimates are plotted with \code{lty=1} for all samples.}
88 |
89 | \item{lwd}{The line width of the methylation estimates. It is recommended to
90 | pass this information through the \code{pData} slot of the
91 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do
92 | so, place the line width value for each sample in a column titled \code{lwd},
93 | and leave this argument as its default value of NULL. Alternatively,
94 | you may specify a vector of line width values (one for each sample), but
95 | you *must* make sure that this vector is in the same order as the samples
96 | are in the BSseq object. If NULL and no \code{lwd} column is found in
97 | \code{pData}, then estimates are plotted with \code{lwd=1} for all samples.}
98 |
99 | \item{label}{The condition/population labels for the plot legend. If NULL
100 | (default) this is taken from the \code{testCovariate} column of
101 | \code{pData}. Alternatively, you can pass in labels by
102 | adding this information through the \code{pData} slot of the
103 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do
104 | so, place the labels for each sample in a column titled \code{label},
105 | and leave this argument as its default value of NULL.
106 | You may instead specify an arbitrary vector of labels (one for each sample),
107 | but be aware that you *must* make sure that this vector is in the same order
108 | as the samples are in the BSseq object. If NULL, and \code{testCovariate} is
109 | also NULL and no \code{label} column is found in
110 | \code{pData}, then no legend is created.}
111 |
112 | \item{mainWithWidth}{logical value indicating whether the default title
113 | should include information about width of the plot region.}
114 |
115 | \item{regionCol}{The color used for highlighting the region.}
116 |
117 | \item{addTicks}{logical value indicating whether tick marks showing the
118 | location of methylation loci should be added. Default is TRUE.}
119 |
120 | \item{addPoints}{logical value indicating whether the individual
121 | methylation estimates be plotted as points.}
122 |
123 | \item{pointsMinCov}{The minimum coverage a methylation loci need in
124 | order for the raw methylation estimates to be plotted. Useful for filtering
125 | out low coverage loci. Only used if addPoints = TRUE. Default value is 1
126 | (no filtering).}
127 |
128 | \item{highlightMain}{logical value indicating whether the plot region
129 | should be highlighted.}
130 |
131 | \item{qval}{logical value indicating whether the region FDR estimate
132 | (q-value) should be displayed in the plot title. The value is extracted
133 | from the \code{regions} argument.}
134 |
135 | \item{stat}{logical value indicating whether the region statistic
136 | should be displayed in the plot title. The value is extracted from the
137 | \code{regions} argument.}
138 |
139 | \item{verbose}{logical value indicating whether progress messages
140 | should be printed to the screen.}
141 |
142 | \item{includeYlab}{a logical indicating whether to include the Y axis
143 | label 'Methylation' (useful to turn off if combining multiple region
144 | figures and you do not want to include redundant y axis label information)}
145 |
146 | \item{compareTrack}{a named GRangesList object that contains up to four
147 | custom tracks (GRanges objects) which will be plotted below the region.
148 | Only one of `compareTrack` or `annoTrack` can be specified since there is
149 | only for plotting either the built in GpG category and exon tracks, *or* a
150 | custom set of tracks.}
151 |
152 | \item{labelCols}{a character vector with names of the mcols slot of the
153 | GRanges items in `compareTrack'. Only used if plotting custom
154 | tracks using the `compareTrack' argument. If specified, the (first) value
155 | in that column is printed along with a label that includes the name of the
156 | list item. If NULL (default), just the name of the track is printed.}
157 |
158 | \item{horizLegend}{logical indicating whether the legend should be
159 | horizontal instead of vertical (default FALSE). This is useful if you need
160 | to plot many labels and want to preserve whitespace.}
161 |
162 | \item{addLines}{logical indicating whether to plot smooth lines between
163 | points. Default is true. Can be useful to turn this off for very small
164 | regions.}
165 |
166 | \item{linesMinCov}{The minimum coverage a methylation loci need in
167 | order to be used for plotting of smoothed lines. Useful for filtering
168 | out low coverage loci. Only used if addLines = TRUE. Default value is 1
169 | (no filtering).}
170 | }
171 | \value{
172 | None (generates a plot)
173 | }
174 | \description{
175 | Generates trace plots of methylation proportions by genomic position.
176 | }
177 | \details{
178 | Creates aesthetially pleasing DMR plots. By default will plot individual
179 | points with size proportional to coverage, along with a smoothed line
180 | for each sample. Elements will be colored by biological condition
181 | (\code{label}). Also has functionality to add annotations below the main
182 | plot (CpG category, genes) if \code{annoTrack} is specified.
183 | }
184 | \examples{
185 |
186 | # load the example data
187 | data(BS.chr21)
188 |
189 | # load example results (computed with dmrseq function)
190 | data(dmrs.ex)
191 |
192 | # get annotation information (using getAnnot function)
193 | # here we'll load the example annotation from chr21
194 | data(annot.chr21)
195 |
196 | # plot the 1st DMR
197 | plotDMRs(BS.chr21, regions=dmrs.ex[1,], testCovariate=1,
198 | annoTrack=annot.chr21)
199 |
200 | }
201 |
--------------------------------------------------------------------------------
/R/simDMRs.R:
--------------------------------------------------------------------------------
1 |
2 | #' Simulate Differentially Methylated Regions
3 | #'
4 | #' Add simulated DMRs to observed control data. Control data will be split
5 | #' into two (artificial) populations.
6 | #'
7 | #' @param bs a BSseq object containing only control samples (from the same
8 | #' population) for which simulated DMRs will be added after dividing the
9 | #' population into two artificial groups.
10 | #'
11 | #' @param num.dmrs an integer specifying how many DMRs to add.
12 | #'
13 | #' @param delta.max0 a proportion value indicating the mode value for the
14 | #' difference in proportion of methylated CpGs in the simulated DMRs (the
15 | #' actual value will be drawn from a scaled Beta distribution centered at
16 | #' this value). Default value is 0.3.
17 | #'
18 | #' @return A named list object with 5 elements: (1)
19 | #' \code{gr.dmrs} is a \code{GRanges} object with \code{num.dmrs}
20 | #' ranges that represent the random DMRs added. (2) \code{dmr.mncov} is a
21 | #' numeric vector that contains the mean coverage in each simulated DMR. (3)
22 | #' \code{dmr.L} is a numeric vector that contains the number of CpGs in each
23 | #' simulated DMR. (4) \code{bs} is the BSseq object that contains the
24 | #' simulated DMRs. (5) \code{deltas} is a numeric vector that contains the
25 | #' effect size used for each DMR.
26 | #'
27 | #' @importFrom IRanges IRanges
28 | #'
29 | #' @export
30 | #'
31 | #' @examples
32 | #'
33 | #' # Add simulated DMRs to a BSseq dataset
34 | #' # This is just for illustrative purposes - ideally you would
35 | #' # add DMRs to a set of samples from the same condition (in our
36 | #' # example data, we have data from two different cell types)
37 | #' # In this case, we shuffle the samples by cell type to create
38 | #' # a null comparison.
39 | #'
40 | #' data(BS.chr21)
41 | #'
42 | #' BS.chr21.sim <- simDMRs(bs=BS.chr21[1:10000,c(1,3,2,4)],
43 | #' num.dmrs=50)
44 | #'
45 | #' # show the simulated DMRs GRanges object
46 | #' show(BS.chr21.sim$gr.dmrs)
47 | #'
48 | #' # show the updated BSseq object that includes the simulated DMRs
49 | #' show(BS.chr21.sim$bs)
50 | #'
51 | #' # examine effect sizes of the DMRs
52 | #' head(BS.chr21.sim$delta)
53 | #'
54 | simDMRs <- function(bs, num.dmrs = 3000, delta.max0 = 0.3) {
55 | # check that all loci have coverage in every sample
56 | zero.cov <- which(rowSums(as.matrix(getCoverage(bs,
57 | type="Cov")) == 0) > 0)
58 | if (length(zero.cov) > 0){
59 | stop("Zero coverage found for at least one sample in ", length(zero.cov),
60 | " loci. Please filter for loci with coverage at least one in ",
61 | "all samples before passing to 'simDMRs'")
62 | }
63 |
64 | sampleSize <- floor(nrow(pData(bs))/2)
65 |
66 | # code to simulate DMRs if some number of simulated dmrs was specified
67 | message("Simulating DMRs for ", sampleSize, " vs ", ncol(bs) - sampleSize,
68 | " comparison")
69 | triwt <- function(x, amp = 1, base = 0, width = 1, center = 0,
70 | deg = 3, dir = 1) {
71 | y <- dir * (((width/2)^deg - abs(x - center)^deg)/
72 | (width/2)^deg)^deg * amp + base
73 | y[abs(x - center) > ceiling(width/2)] <- base[abs(x - center) >
74 | ceiling(width/2)]
75 | return(y)
76 | }
77 |
78 | meth.mat <- as.matrix(getCoverage(bs, type = "M"))
79 | unmeth.mat <- as.matrix(getCoverage(bs, type = "Cov")) - meth.mat
80 | chr <- as.character(seqnames(bs))
81 | pos <- start(bs)
82 |
83 | cluster <- bumphunter::clusterMaker(chr, pos, maxGap = 500)
84 | Indexes <- split(seq(along = cluster), cluster)
85 | lns <- lengths(Indexes)
86 | Indexes <- Indexes[lns >= 5 & lns <= 500]
87 |
88 | # sample regions with intermediate methylation values preferentially
89 | prop.mat <- rowMeans(meth.mat/(meth.mat + unmeth.mat))
90 | prop.mat <- unlist(lapply(Indexes, function(x) median(prop.mat[x])))
91 |
92 | dmrs.ind <- sample(seq_len(length(Indexes)), num.dmrs, replace = FALSE,
93 | prob = pmax(1 - sqrt(2) *
94 | abs(0.5 - prop.mat)^0.5, 0))
95 | dmrs.ind <- Indexes[dmrs.ind]
96 | fnc <- function(index) {
97 | gr.dmr <- GRanges(seqnames = unique(as.character(seqnames(bs)[index])),
98 | IRanges(start = min(start(bs)[index]),
99 | end = max(start(bs)[index])))
100 | return(gr.dmr)
101 | }
102 | ## GRanges Object for the Simulated DMRs
103 | gr.dmrs <- suppressWarnings(Reduce("c", lapply(dmrs.ind, fnc)))
104 |
105 | ## Generating the Methylated and Unmethylated Read Counts
106 | ## for the CpG sites in the
107 | ## DMRs and outside
108 |
109 | # set up null signal (smooth function of position) mcols(bs)$diff <- 0
110 | Diff <- Diff2 <- rep(0, length(bs))
111 |
112 | dmr.mncov <- dmr.L <- deltas <- rep(NA, num.dmrs)
113 |
114 | for (u in seq_len(num.dmrs)) {
115 | # coin flip for up or down
116 | up <- 1 - 2 * (rbinom(1, 1, 0.5) == 1)
117 |
118 | # let effect size change randomly
119 | delta.max <- delta.max0 + (rbeta(1, 2, 2) - 0.5)/3
120 | deltas[u] <- delta.max
121 |
122 | # grab loci in the dmr
123 | dmr.L[u] <- length(dmrs.ind[[u]])
124 | prop.mat <- meth.mat[dmrs.ind[[u]], ]/(meth.mat[dmrs.ind[[u]], ] +
125 | unmeth.mat[dmrs.ind[[u]],
126 | ])
127 |
128 | # change direction if baseline mean is near boundary
129 | if (up == 1) {
130 | if (mean(prop.mat) > 1 - delta.max) {
131 | up <- -1
132 | }
133 | } else if (up == -1) {
134 | if (mean(prop.mat) < delta.max) {
135 | up <- 1
136 | }
137 | }
138 |
139 | # simulated mean as a smooth parabola added or subtracted
140 | # from baseline mean
141 | last <- max(pos[dmrs.ind[[u]]])
142 | first <- min(pos[dmrs.ind[[u]]])
143 | width <- last - first
144 |
145 | # widen out so that first and last CpGs don't have a
146 | # difference of zero
147 | last <- last + 0.2 * width
148 | first <- first - 0.2 * width
149 | width <- last - first
150 |
151 | mid <- round((last - first)/2 + first)
152 |
153 | # Diff.hit is the methylation percentage difference for each position in
154 | # spiked in DMR; restricted to between -1 and 1 (negative indicates that
155 | # difference is in the negative direction.
156 |
157 | Diff.hit <- round(triwt(pos[dmrs.ind[[u]]], amp = delta.max,
158 | base = Diff[dmrs.ind[[u]]],
159 | width = width, center = mid, deg = 4, dir = up), 4)
160 |
161 | # calculate the mean coverage in the DMR over all the samples and save
162 | # result in a vector dmr.mncov for exploring the characteristics of the
163 | # detected / missed DMRs in simulation results.
164 | mn.cov <- by(t(meth.mat[dmrs.ind[[u]], ] + unmeth.mat[dmrs.ind[[u]], ]),
165 | factor(paste0("Condition", c(rep(1, sampleSize),
166 | rep(2, ncol(bs) - sampleSize)))),
167 | colMeans)
168 | mn.cov <- rowMeans(cbind(mn.cov[[1]], mn.cov[[2]]))
169 | dmr.mncov[u] <- mean(mn.cov)
170 |
171 |
172 | # Conditional on the coverage for each site and sample combination,
173 | # sample the
174 | # number of methylated reads from a binomial distribution where the
175 | # probability
176 | # parameter is taken as the observed estimate plus the Diff/Diff2
177 | # (depending on
178 | # whether the sample is from condition 1 or condition 2 This will
179 | # induce sampling
180 | # error into the number of reads observed in the simulation and
181 | # thus will be more
182 | # realistic and a more practical evaluation of the methods
183 | cov <- meth.mat[dmrs.ind[[u]], ] + unmeth.mat[dmrs.ind[[u]], ]
184 | prop <- meth.mat[dmrs.ind[[u]], ]/cov
185 | grp <- runif(1) < 0.5
186 | ss <- ifelse(grp, sampleSize, ncol(bs) - sampleSize)
187 | for (samp in seq_len(ss)) {
188 | # randomly choose which condition is the one with the difference
189 | if (grp) {
190 | # first generate M counts for sample samp of condition 1
191 | meth.mat[dmrs.ind[[u]], samp] <- rbinom(n=length(dmrs.ind[[u]]),
192 | size = cov[, samp],
193 | prob = pmax(pmin(prop[, samp] + Diff.hit, 1),
194 | 0))
195 |
196 | # next assign the other Cov - M counts to unmethylated matrix
197 | unmeth.mat[dmrs.ind[[u]], samp] <- cov[, samp] -
198 | meth.mat[dmrs.ind[[u]], samp]
199 |
200 | } else {
201 | # next generate M counts for sample samp of condition 2
202 | meth.mat[dmrs.ind[[u]], (sampleSize + samp)] <-
203 | rbinom(n = length(dmrs.ind[[u]]),
204 | size = cov[, (sampleSize + samp)],
205 | prob = pmax(pmin(prop[, (sampleSize + samp)] +
206 | Diff.hit, 1), 0))
207 |
208 | # next assign the other Cov - M counts to unmethylated matrix
209 | unmeth.mat[dmrs.ind[[u]], (sampleSize + samp)] <- cov[,
210 | (sampleSize + samp)] - meth.mat[dmrs.ind[[u]],
211 | (sampleSize + samp)]
212 | }
213 | }
214 | }
215 |
216 | # get everything in order to run bumphunter functions
217 | sampnames <- paste0("Condition", c(rep(1, sampleSize),
218 | rep(2, ncol(bs) - sampleSize)),
219 | "_Rep", c(seq_len(sampleSize),
220 | seq_len(ncol(bs) - sampleSize)))
221 | colnames(meth.mat) <- colnames(unmeth.mat) <- sampnames
222 | bsNew <- BSseq(pos = pos, chr = chr, M = meth.mat,
223 | Cov = (meth.mat + unmeth.mat),
224 | sampleNames = sampnames)
225 |
226 | sim.dat.red <- list(gr.dmrs = gr.dmrs, dmr.mncov = dmr.mncov, dmr.L = dmr.L,
227 | bs = bsNew, delta = deltas)
228 | return(sim.dat.red)
229 | }
230 |
231 |
--------------------------------------------------------------------------------
/R/plotDMRs.R:
--------------------------------------------------------------------------------
1 | #' Plot Differentially Methylated Regions
2 | #'
3 | #' Generates trace plots of methylation proportions by genomic position.
4 | #'
5 | #' Creates aesthetially pleasing DMR plots. By default will plot individual
6 | #' points with size proportional to coverage, along with a smoothed line
7 | #' for each sample. Elements will be colored by biological condition
8 | #' (\code{label}). Also has functionality to add annotations below the main
9 | #' plot (CpG category, genes) if \code{annoTrack} is specified.
10 | #'
11 | #' @param BSseq An object of class BSseq.
12 | #'
13 | #' @param regions A data.frame containing the DMRs (output from the main
14 | #' \code{dmrseq}) function.
15 | #'
16 | #' @param extend Describes how much the plotting region should be extended in
17 | #' either direction. The total width of the plot is equal to the width of the
18 | #' region plus twice extend.
19 | #'
20 | #' @param main The plot title. The default is to construct a title with
21 | #' information about which genomic region is being plotted.
22 | #'
23 | #' @param addRegions A set of additional regions to be highlighted on the
24 | #' plots. Same format as the \code{regions} argument.
25 | #'
26 | #' @param annoTrack a \code{GRangesList} object with two elements returned
27 | #' by \code{\link{getAnnot}}. The first
28 | #' contains CpG category information in the first element (optional)
29 | #' coding gene sequence information in the second element (optional).
30 | #' At least one of these elements needs to be non-null in order for
31 | #' any annotation to be plotted, but it is not necessary to contain
32 | #' both.
33 | #'
34 | #' @param col The color of the methylation estimates. It is recommended to
35 | #' leave this value as default (NULL), and specify a value of
36 | #' \code{testCovariate} to indicate which column of \code{pData(bs)}
37 | #' to use as a factor for coloring the points and lines of the plot.
38 | #' Alternatively, you can specify particular colors by
39 | #' passing this information through the \code{pData} slot of the
40 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do
41 | #' so, place the color value for each sample in a column titled \code{col},
42 | #' and leave this argument as its default value of NULL. Alternatively,
43 | #' you may specify a vector of color values (one for each sample), but
44 | #' you *must* make sure that this vector is in the same order as the samples
45 | #' are in the BSseq object. If NULL and no \code{col} column is found in
46 | #' \code{pData}, then estimates are plotted in black for all samples.
47 | #'
48 | #' @param lty The line type of the methylation estimates. It is recommended to
49 | #' pass this information through the \code{pData} slot of the
50 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do
51 | #' so, place the line type value for each sample in a column titled \code{lty},
52 | #' and leave this argument as its default value of NULL. Alternatively,
53 | #' you may specify a vector of line type values (one for each sample), but
54 | #' you *must* make sure that this vector is in the same order as the samples
55 | #' are in the BSseq object. If NULL and no \code{lty} column is found in
56 | #' \code{pData}, then estimates are plotted with \code{lty=1} for all samples.
57 | #'
58 | #' @param lwd The line width of the methylation estimates. It is recommended to
59 | #' pass this information through the \code{pData} slot of the
60 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do
61 | #' so, place the line width value for each sample in a column titled \code{lwd},
62 | #' and leave this argument as its default value of NULL. Alternatively,
63 | #' you may specify a vector of line width values (one for each sample), but
64 | #' you *must* make sure that this vector is in the same order as the samples
65 | #' are in the BSseq object. If NULL and no \code{lwd} column is found in
66 | #' \code{pData}, then estimates are plotted with \code{lwd=1} for all samples.
67 | #'
68 | #' @param label The condition/population labels for the plot legend. If NULL
69 | #' (default) this is taken from the \code{testCovariate} column of
70 | #' \code{pData}. Alternatively, you can pass in labels by
71 | #' adding this information through the \code{pData} slot of the
72 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do
73 | #' so, place the labels for each sample in a column titled \code{label},
74 | #' and leave this argument as its default value of NULL.
75 | #' You may instead specify an arbitrary vector of labels (one for each sample),
76 | #' but be aware that you *must* make sure that this vector is in the same order
77 | #' as the samples are in the BSseq object. If NULL, and \code{testCovariate} is
78 | #' also NULL and no \code{label} column is found in
79 | #' \code{pData}, then no legend is created.
80 | #'
81 | #' @param mainWithWidth logical value indicating whether the default title
82 | #' should include information about width of the plot region.
83 | #'
84 | #' @param regionCol The color used for highlighting the region.
85 | #'
86 | #' @param addTicks logical value indicating whether tick marks showing the
87 | #' location of methylation loci should be added. Default is TRUE.
88 | #'
89 | #' @param addPoints logical value indicating whether the individual
90 | #' methylation estimates be plotted as points.
91 | #'
92 | #' @param pointsMinCov The minimum coverage a methylation loci need in
93 | #' order for the raw methylation estimates to be plotted. Useful for filtering
94 | #' out low coverage loci. Only used if addPoints = TRUE. Default value is 1
95 | #' (no filtering).
96 | #'
97 | #' @param highlightMain logical value indicating whether the plot region
98 | #' should be highlighted.
99 | #'
100 | #' @param stat logical value indicating whether the region statistic
101 | #' should be displayed in the plot title. The value is extracted from the
102 | #' \code{regions} argument.
103 | #'
104 | #' @param qval logical value indicating whether the region FDR estimate
105 | #' (q-value) should be displayed in the plot title. The value is extracted
106 | #' from the \code{regions} argument.
107 | #'
108 | #' @param verbose logical value indicating whether progress messages
109 | #' should be printed to the screen.
110 | #'
111 | #' @param testCovariate integer value or vector indicating which of columns of
112 | #' \code{pData(bs)} contains the covariate of interest.
113 | #' This is used to construct the sample labels and colors (unless this is
114 | #' over-ridden by specifying \code{label}).
115 | #'
116 | #' @param includeYlab a logical indicating whether to include the Y axis
117 | #' label 'Methylation' (useful to turn off if combining multiple region
118 | #' figures and you do not want to include redundant y axis label information)
119 | #'
120 | #' @param compareTrack a named GRangesList object that contains up to four
121 | #' custom tracks (GRanges objects) which will be plotted below the region.
122 | #' Only one of `compareTrack` or `annoTrack` can be specified since there is
123 | #' only for plotting either the built in GpG category and exon tracks, *or* a
124 | #' custom set of tracks.
125 | #'
126 | #' @param labelCols a character vector with names of the mcols slot of the
127 | #' GRanges items in `compareTrack'. Only used if plotting custom
128 | #' tracks using the `compareTrack' argument. If specified, the (first) value
129 | #' in that column is printed along with a label that includes the name of the
130 | #' list item. If NULL (default), just the name of the track is printed.
131 | #'
132 | #' @param horizLegend logical indicating whether the legend should be
133 | #' horizontal instead of vertical (default FALSE). This is useful if you need
134 | #' to plot many labels and want to preserve whitespace.
135 | #'
136 | #' @param addLines logical indicating whether to plot smooth lines between
137 | #' points. Default is true. Can be useful to turn this off for very small
138 | #' regions.
139 | #'
140 | #' @param linesMinCov The minimum coverage a methylation loci need in
141 | #' order to be used for plotting of smoothed lines. Useful for filtering
142 | #' out low coverage loci. Only used if addLines = TRUE. Default value is 1
143 | #' (no filtering).
144 | #'
145 | #' @export
146 | #'
147 | #' @return None (generates a plot)
148 | #'
149 | #' @importFrom RColorBrewer brewer.pal
150 | #' @importFrom grDevices hcl rainbow
151 | #' @importFrom graphics arrows
152 | #'
153 | #' @examples
154 | #'
155 | #' # load the example data
156 | #' data(BS.chr21)
157 | #'
158 | #' # load example results (computed with dmrseq function)
159 | #' data(dmrs.ex)
160 | #'
161 | #' # get annotation information (using getAnnot function)
162 | #' # here we'll load the example annotation from chr21
163 | #' data(annot.chr21)
164 | #'
165 | #' # plot the 1st DMR
166 | #' plotDMRs(BS.chr21, regions=dmrs.ex[1,], testCovariate=1,
167 | #' annoTrack=annot.chr21)
168 | #'
169 | plotDMRs <- function(BSseq, regions = NULL, testCovariate = NULL,
170 | extend = (end(regions) - start(regions) + 1)/2, main = "",
171 | addRegions = regions, annoTrack = NULL, col = NULL,
172 | lty = NULL, lwd = NULL, label = NULL, mainWithWidth = TRUE,
173 | regionCol = .alpha("#C77CFF",
174 | 0.2), addTicks = TRUE, addPoints = TRUE, pointsMinCov = 1,
175 | highlightMain = FALSE,
176 | qval = TRUE, stat = TRUE, verbose = TRUE, includeYlab = TRUE,
177 | compareTrack = NULL,
178 | labelCols = NULL, horizLegend = FALSE,
179 | addLines = TRUE, linesMinCov = 1) {
180 | # adapted from plotManyRegions from bsseq plot to take
181 | # in a vector of qval values
182 | # (1 per region in regions argument) to be displayed in
183 | # the plot title. set
184 | # addPoints = TRUE to plot individual points sized by coverage
185 | # and one smooth
186 | # (loess) line per sample instead of a uniform-sized verbatim
187 | # line going through
188 | # each observation
189 | if (!addLines && !addPoints)
190 | stop("At least one of addLines or addPoints must be true")
191 | if (verbose)
192 | message("[plotDMRs] Plotting ", nrow(regions), " DMRs")
193 | if (!is.null(regions)) {
194 | if (is(regions, "data.frame")){
195 | gr <- data.frame2GRanges(regions, keepColumns = FALSE)
196 | }else{
197 | gr <- regions
198 | }
199 | if (!is(gr, "GRanges"))
200 | stop("'regions' needs to be either a 'data.frame' ",
201 | " or a 'GRanges' ")
202 | } else {
203 | gr <- granges(BSseq)
204 | }
205 | gr <- resize(gr, width = 2 * extend + width(gr), fix = "center")
206 | BSseq <- subsetByOverlaps(BSseq, gr)
207 |
208 | if (!is.null(annoTrack) && !is.null(compareTrack))
209 | stop("Choose either annoTrack or compareTrack; can't plot both")
210 |
211 | if (length(start(BSseq)) == 0)
212 | stop("No overlap between BSseq data and regions")
213 | if (!is.null(main) && length(main) != length(gr))
214 | main <- rep(main, length = length(gr))
215 |
216 | if (length(extend) == 1) {
217 | extend <- rep(extend, length(gr))
218 | }
219 |
220 | if (!is.null(testCovariate)) {
221 | coeff <- seq(2, (1 + length(testCovariate)))
222 | testCov <- as.character(pData(BSseq)[, testCovariate])
223 | if (length(unique(testCov)) > 2 && !is.numeric(testCov) && length(coeff) == 1)
224 | coeff <- seq(coeff, coeff + length(unique(as.character(testCov))) - 2 )
225 |
226 | design <- model.matrix(~testCov)
227 |
228 | if (is.null(col) && !("col" %in% names(pData(BSseq)))) {
229 | cov.unique <- unique(design[, coeff, drop = FALSE])
230 | ncol <- nrow(cov.unique)
231 |
232 | colors <- gg_color_hue(ncol)
233 | if (ncol == 2) {
234 | colors <- c("mediumblue", "deeppink1")
235 | }
236 | colors <- cbind(cov.unique,
237 | colors[rank(as.numeric(rowSums(cov.unique)),
238 | ties.method = "first")])
239 | colmat <- colors[, -ncol(colors), drop = FALSE]
240 | colmat <- apply(colmat, 2, as.numeric)
241 | z <- colors[,ncol(colors)][
242 | match(data.frame(t(design[, coeff, drop = FALSE])),
243 | data.frame(t(colmat)))]
244 |
245 | pData(BSseq)$col <- as.character(z)
246 | }
247 |
248 | if (is.null(label) && !("label" %in% names(pData(BSseq)))) {
249 | pData(BSseq)$label <- paste0(pData(BSseq)[, testCovariate])
250 | }
251 | }
252 |
253 | if (!is.null(label) || "label" %in% names(pData(BSseq))) {
254 | if(!is.null(label)){
255 | labs <- label
256 | }else{
257 | labs <- pData(BSseq)[["label"]]
258 | }
259 | if(horizLegend){
260 | wiggle <- max(nchar(labs)) * 0.4
261 | }else{
262 | wiggle <- length(unique(labs)) * 0.9
263 | }
264 | opar <- par(mar = c(0, 4.1, 0, wiggle),
265 | oma = c(0, 0, 2.5, 1))
266 | } else {
267 | opar <- par(mar = c(0, 4.1, 0, 0), oma = c(0, 0, 2.5, 1))
268 | }
269 | on.exit(par(opar))
270 |
271 | for (ii in seq(along = gr)) {
272 | if (verbose && ii%%100 == 0) {
273 | cat(sprintf("..... Plotting region %d (out of %d)\n", ii,
274 | nrow(regions)))
275 | }
276 |
277 | .plotSingleDMR(BSseq = BSseq, region = regions[ii, ],
278 | extend = extend[ii], main = main[ii], col = col, lty = lty,
279 | lwd = lwd, label = label, addRegions = addRegions,
280 | regionCol = regionCol, mainWithWidth = mainWithWidth,
281 | annoTrack = annoTrack, addTicks = addTicks, addPoints = addPoints,
282 | pointsMinCov = pointsMinCov, highlightMain = highlightMain,
283 | qval = qval, stat = stat, includeYlab = includeYlab,
284 | compareTrack = compareTrack, labelCols = labelCols,
285 | horizLegend = horizLegend, addLines = addLines, linesMinCov = linesMinCov)
286 | }
287 | }
288 |
289 |
290 |
--------------------------------------------------------------------------------
/R/internal_plotting_functions.R:
--------------------------------------------------------------------------------
1 | #' @title Add annotations to DMR plots
2 | #'
3 | #' @description Function to add visual representation of CpG categories
4 | #' and/or coding
5 | #' sequences to DMR plot
6 | #'
7 | #' @details An internal function that takes an annotation
8 | #' \code{SimpleGRangesList}
9 | #' object that
10 | #' contains CpG category information in the first element (optional) and / or
11 | #' coding gene sequence information in the second element (optional). If neither
12 | #' of these are present, then nothing will be plotted.
13 | #'
14 | #' @param gr a \code{GRanges} object that contains the DMRs to be
15 | #' plotted
16 | #'
17 | #' @param annoTrack a \code{SimpleGRangesList} object with two elements.
18 | #' The first contains CpG category information in the first element (optional)
19 | #' coding gene sequence information in the second element (optional).
20 | #' At least one of these elements needs to be non-null in order for
21 | #' any annotation to be plotted, but it is not necessary to contain
22 | #' both.
23 | #'
24 | #' @return None
25 | #'
26 | dmrPlotAnnotations <- function(gr, annoTrack) {
27 | # Code adapted from bsseq package
28 |
29 | ## check may need to be modified
30 | if (!is(annoTrack, "SimpleGRangesList"))
31 | stop("'annoTrack' needs to be a 'SimpleGRangesList'")
32 | plot(start(gr), 1, type = "n", xaxt = "n", yaxt = "n", bty = "n",
33 | ylim = c(0, length(annoTrack) + 0.5), xlim = c(start(gr), end(gr)),
34 | xlab = "", ylab = "")
35 |
36 | # add legend
37 | vars <- list(Island = "Island ", Shore = "Shore ", Shelf = "Shelf ",
38 | OpenSea1 = "Open ", OpenSea2 = "Sea")
39 | cols <- c("forestgreen", "goldenrod2", "dodgerblue", "blue3", "blue3")
40 | for (i in seq_len(length(vars))) {
41 | tmpvars <- vars
42 | tmpvars[-i] <- paste("phantom('", tmpvars[-i], "')", sep = "")
43 | expr <- paste(tmpvars, collapse = "*")
44 | text(start(gr), 1.8, parse(text = expr), col = cols[i], adj = c(0, 1),
45 | cex = 0.85)
46 | }
47 |
48 | lapply(seq(along = annoTrack), function(ii) {
49 | jj <- length(annoTrack) + 1 - ii
50 | ir <- subsetByOverlaps(annoTrack[[ii]], gr)
51 | start(ir) <- pmax(start(ir), start(gr))
52 | end(ir) <- pmin(end(ir), end(gr))
53 |
54 | if (ii == 2) {
55 | jj <- jj - 0.15
56 | }
57 |
58 | if (length(ir) > 0) {
59 | if (ii == 2) {
60 | colourCount <- length(unique(ir$symbol))
61 | getPalette <- colorRampPalette(.alpha(brewer.pal(max(
62 | length(unique(ir$symbol)),
63 | 3), "Dark2"), 0.4))
64 | color.pal <- getPalette(colourCount)
65 | names(color.pal) <- unique(ir$symbol)
66 | map <- match(ir$symbol, names(color.pal))
67 | color <- color.pal[map]
68 |
69 | bord <- "black"
70 | } else if (ii == 1) {
71 | color <- ir$type
72 | colvec <- c("blue3", "dodgerblue", "goldenrod2", "forestgreen")
73 | names(colvec) <- c("inter", "shelves", "shores", "islands")
74 |
75 | for (ucol in unique(color)){
76 | ix <- agrep(ucol, names(colvec))
77 | if (length(ix) > 1)
78 | stop("Ambiguous CpG annotation labels")
79 | color[color == ucol] <- colvec[ix]
80 | }
81 | bord <- color
82 |
83 | rect(start(ir), jj - 0.06, end(ir), jj + 0.17, col = color,
84 | border = bord)
85 | }
86 |
87 | if (ii == 2) {
88 | lastPos <- rep(NA, length(unique(ir$symbol)) - 1)
89 | used <- NULL
90 | for (k in seq_len(length(unique(ir$symbol)))) {
91 | irk <- ir[ir$symbol == unique(ir$symbol)[k], ]
92 | rect(min(start(irk)), jj - 0.065, max(end(irk)), jj + 0.065,
93 | col = .alpha("black",
94 | 0.1), border = .alpha("black", 0.1))
95 | if (!(unique(ir$symbol)[k] %in% used)) {
96 | rwidth <- end(gr) - start(gr)
97 | sg <- pmin(start(irk), end(irk))
98 | eg <- pmax(start(irk), end(irk))
99 | gwidth <- min(max(eg), end(gr)) - max(min(sg), start(gr))
100 | textPos <- max(min(sg), start(gr)) + gwidth/2
101 | jj.orig <- jj
102 | if (sum(!is.na(lastPos)) > 0) {
103 | separation <- (textPos - lastPos[k - 1])/rwidth
104 | if (abs(separation) <= 0.2 && k < 3) {
105 | jj <- jj - 0.29
106 | } else {
107 | separation <- min(abs((textPos - lastPos)/rwidth),
108 | na.rm = TRUE)
109 | if (abs(separation) <= 0.2) {
110 | jj <- jj - 0.29
111 | }
112 | }
113 | }
114 | lastPos[k] <- textPos
115 | text(textPos, jj - 0.375, labels = unique(irk$symbol),
116 | cex = 0.85,
117 | col = unique(color)[k])
118 | jj <- jj.orig
119 | used <- c(used, unique(ir$symbol)[k])
120 | }
121 | rect(sg, jj - 0.11, eg, jj + 0.12, col = unique(color)[k],
122 | border = bord)
123 | }
124 | }
125 |
126 | }
127 | mtext(names(annoTrack)[ii], side = 2, at = jj, las = 1, line = 1)
128 | })
129 |
130 | }
131 |
132 | .isColor <- function(x)
133 | {
134 | res <- try(col2rgb(x),silent=TRUE)
135 | return(!"try-error"%in%class(res))
136 | }
137 |
138 | .dmrGetMeta <- function(object, col, lty, lwd, label) {
139 | ## Assumes that object has pData and sampleNames methods Code adapted from
140 | ## bsseq package
141 |
142 | ## extract col
143 | if (is.null(col)) {
144 | if ("col" %in% names(pData(object)))
145 | col <- pData(object)[["col"]] else col <- rep("black",
146 | nrow(pData(object)))
147 | }else if (length(col) == 1){
148 | if (col %in% names(pData(object)))
149 | col <- pData(object)[[col]] else col <- rep("black", ncol(object))
150 | if (!.isColor(col))
151 | col <- rainbow(length(unique(col)))[as.numeric(as.factor(col))]
152 | }
153 | if (length(col) != ncol(object))
154 | col <- rep(col, length.out = ncol(object))
155 | if (is.null(names(col)))
156 | names(col) <- sampleNames(object)
157 |
158 | ## extract lty
159 | if (is.null(lty)) {
160 | if ("lty" %in% names(pData(object)))
161 | lty <- pData(object)[["lty"]] else lty <- rep(1, ncol(object))
162 | }else if (length(lty) == 1){
163 | if (lty %in% names(pData(object)))
164 | lty <- pData(object)[[lty]] else lty <- rep(1, ncol(object))
165 | if (!is.numeric(lty))
166 | lty <- as.numeric(as.factor(lty))
167 | }
168 | if (length(lty) != ncol(object))
169 | lty <- rep(lty, length.out = ncol(object))
170 | if (is.null(names(lty)))
171 | names(lty) <- sampleNames(object)
172 |
173 | # extract lwd
174 | if (is.null(lwd)) {
175 | if ("lwd" %in% names(pData(object)))
176 | lwd <- pData(object)[["lwd"]] else lwd <- rep(1.5,
177 | nrow(pData(object)))
178 | }else if (length(lwd) == 1){
179 | if (lwd %in% names(pData(object)))
180 | lwd <- pData(object)[[lwd]] else lwd <- rep(1, ncol(object))
181 | if (!is.numeric(lwd))
182 | lwd <- as.numeric(as.factor(lwd))
183 | }
184 | if (length(lwd) != ncol(object))
185 | lwd <- rep(lwd, length.out = ncol(object))
186 | if (is.null(names(lwd)))
187 | names(lwd) <- sampleNames(object)
188 |
189 | ## extract label
190 | if (is.null(label)) {
191 | if ("label" %in% names(pData(object)))
192 | label <- pData(object)[["label"]] else label <- rep(NA,
193 | ncol(object))
194 | }else if (length(label) == 1){
195 | if (label %in% names(pData(object)))
196 | label <- pData(object)[[label]] else label <- rep(NA, ncol(object))
197 | if (!is.character(label))
198 | label <- as.character(label)
199 | }
200 | if (length(label) != ncol(object))
201 | label <- rep(label, length.out = ncol(object))
202 | if (is.null(names(label)))
203 | names(label) <- sampleNames(object)
204 |
205 | return(list(col = col, lty = lty, lwd = lwd, label = label))
206 | }
207 |
208 |
209 | .dmrPlotTitle <- function(gr, extend, main, mainWithWidth,
210 | qval = NULL, stat = NULL) {
211 | # this function creates the main title for DMR plots Code adapted from bsseq
212 | # package
213 | if (is.data.frame(gr))
214 | gr <- data.frame2GRanges(gr)
215 | if (length(gr) > 1) {
216 | warning("plotTitle: gr has more than one element")
217 | gr <- gr[1]
218 | }
219 | plotChr <- as.character(seqnames(gr))
220 | plotRange <- c(start(gr), end(gr))
221 | regionCoord <- sprintf("%s: %s - %s", plotChr, format(plotRange[1],
222 | big.mark = ",",
223 | scientific = FALSE), format(plotRange[2], big.mark = ",",
224 | scientific = FALSE))
225 | if (mainWithWidth) {
226 | regionWidth <- sprintf("width = %s", format(width(gr),
227 | big.mark = ",",
228 | scientific = FALSE))
229 | # add optional labels to plot titles
230 | if (!is.null(qval) && !is.null(stat)) {
231 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
232 | scientific = FALSE))
233 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",",
234 | scientific = FALSE))
235 | regionCoord <- sprintf(paste0("%s (%s)\n%s, %s"), regionCoord,
236 | regionWidth, regionStat, regionFDR)
237 | } else if (!is.null(stat)) {
238 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
239 | scientific = FALSE))
240 | regionCoord <- sprintf(paste0("%s (%s)\n%s"), regionCoord,
241 | regionWidth, regionStat)
242 | } else if (!is.null(qval)) {
243 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",",
244 | scientific = FALSE))
245 | regionCoord <- sprintf(paste0("%s (%s)\n%s"), regionCoord,
246 | regionWidth, regionFDR)
247 | } else {
248 | regionCoord <- sprintf("%s (%s)", regionCoord, regionWidth)
249 | }
250 | } else {
251 | # add optional labels to plot titles
252 | if (!is.null(qval) && !is.null(stat)) {
253 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
254 | scientific = FALSE))
255 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",",
256 | scientific = FALSE))
257 | regionCoord <- sprintf(paste0("%s\n%s, %s"), regionCoord,
258 | regionStat, regionFDR)
259 | } else if (!is.null(stat)) {
260 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",",
261 | scientific = FALSE))
262 | regionCoord <- sprintf(paste0("%s\n%s"), regionCoord, regionStat)
263 | } else if (!is.null(qval)) {
264 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",",
265 | scientific = FALSE))
266 | regionCoord <- sprintf(paste0("%s\n%s"), regionCoord, regionFDR)
267 | } else {
268 | regionCoord <- sprintf("%s", regionCoord)
269 | }
270 | }
271 | if (main != "") {
272 | main <- sprintf("%s\n%s", main, regionCoord)
273 | } else {
274 | main <- regionCoord
275 | }
276 | main
277 | }
278 |
279 | .dmrPlotLegend <- function(plotRange, col, label, horizLegend) {
280 | # this function plots a legend to the right of the plot to indicate
281 | # whichcolor corresponds to which samples
282 |
283 | numUnique <- length(unique(paste0(col, label, sep = "")))
284 | if (numUnique < length(col)) {
285 | col <- unique(col)
286 | label <- unique(label)
287 | }
288 |
289 | if (!horizLegend){
290 | for (lg in seq_len(length(label))) {
291 | mtext(label[lg], side = 4, line = lg - 1, col = .darken(col[lg]),
292 | cex = 0.9, las = 0)
293 | }
294 | }else{
295 | for (lg in seq_len(length(label))) {
296 | mtext(label[lg], side = 4, line = 0.5, col = .darken(col[lg]),
297 | cex = 0.9, las = 1, at = 1.02 - 0.08*(lg-1))
298 | }
299 | }
300 | }
301 |
302 |
303 | # function to transform a given color specified by a character object to a
304 | # transparent version of that color
305 | .makeTransparent <- function(someColor, alpha = 130) {
306 | newColor <- col2rgb(someColor)
307 | apply(newColor, 2, function(curcoldata) {
308 | rgb(red = curcoldata[1], green = curcoldata[2], blue = curcoldata[3],
309 | alpha = alpha, maxColorValue = 255)
310 | })
311 | }
312 |
313 | .alpha <- function(col, alpha = 1) {
314 | if (missing(col))
315 | stop("Please provide a vector of colours.")
316 | apply(vapply(col, grDevices::col2rgb, matrix(rep(0,3)))/255, 2,
317 | function(x) grDevices::rgb(x[1], x[2], x[3], alpha = alpha))
318 | }
319 |
320 |
321 | .dmrPlotPoints <- function(x, y, z, col, pointsMinCov, maxCov,
322 | regionWidth) {
323 | # modified from .bsPlotPoints in bsseq added functionality for point size
324 | # to vary with coverage
325 |
326 | lwd <- 1.5
327 | # make color of points semi-transparent so that overlapping points can
328 | # still be seen
329 | col.points <- .makeTransparent(col)
330 |
331 | # if there are a lot of CpGs to plot (the case for a block-level analysis
332 | # decrease the size of the plotted points since these can get very crowded
333 | c1 <- pmax(-0.25 * atan(3 * (length(x) - 80)/80 * pi)/atan(3 *
334 | (80)/80 * pi) + 0.75, 0.5)
335 | ptSize <- c1 * (sqrt(z)/sqrt(maxCov) + 0.25)
336 |
337 | points(x[z >= pointsMinCov], y[z >= pointsMinCov], col = col.points, pch = 16,
338 | cex = ptSize)
339 | }
340 |
341 | .darken <- function(color, factor = 1.4) {
342 | col <- col2rgb(color)
343 | col <- col/factor
344 | col <- rgb(t(col), maxColorValue = 255)
345 | col
346 | }
347 |
348 |
349 |
350 | .dmrPlotLines <- function(x, y, z, col, lwd, linesMinCov, maxCov,
351 | regionWidth, lty) {
352 |
353 | if (length(x) > 100 && !is.null(lwd)) {
354 | lwd <- lwd + 1
355 | } else if (length(x) > 100) {
356 | lwd <- 2
357 | }
358 |
359 | spn <- max(1 - (1/160)*sum(z >= linesMinCov), 0.75)
360 |
361 | y[y==1] <- 0.99
362 | y[y==0] <- 0.01
363 | logit <- function(p){ log(p/(1-p))}
364 | inv.logit <- function(l){ exp(l) / (1 + exp(l)) }
365 |
366 | # don't interpolate smooth lines if there are fewer than 10 cpgs
367 | if (length(x) >= 10) {
368 | loess_fit <- loess(logit(y[z >= linesMinCov]) ~ x[z >= linesMinCov],
369 | weights = z[z >= linesMinCov], span = spn)
370 |
371 | xl <- seq(min(x[z >= linesMinCov], na.rm=TRUE),
372 | max(x[z >= linesMinCov], na.rm=TRUE),
373 | (max(x[z >= linesMinCov], na.rm=TRUE) -
374 | min(x[z >= linesMinCov], na.rm=TRUE))/500)
375 | lines(xl, inv.logit(predict(loess_fit,xl)),
376 | col = .makeTransparent(.darken(col), 175), lwd = lwd,
377 | lty = lty)
378 |
379 | }else{
380 | lines(x[z >= linesMinCov], y[z >= linesMinCov],
381 | col = .makeTransparent(.darken(col), 175), lwd = lwd,
382 | lty = lty)
383 | }
384 | }
385 |
386 | .dmrPlotSmoothData <- function(BSseq, region, extend, addRegions, col, lty, lwd,
387 | label, regionCol, addTicks, addPoints, pointsMinCov, highlightMain,
388 | includeYlab = TRUE, horizLegend, addLines=TRUE, linesMinCov) {
389 | # modified from .plotSmoothData in bsseq to allow non-smoothed regions
390 |
391 | gr <- bsseq.bsGetGr(BSseq, region, extend)
392 | BSseq <- subsetByOverlaps(BSseq, gr)
393 | BSseq2 <- subsetByOverlaps(BSseq, bsseq.bsGetGr(BSseq, region, extend = 0))
394 | ## Extract basic information
395 | sampleNames <- sampleNames(BSseq)
396 | names(sampleNames) <- sampleNames
397 | positions <- start(BSseq)
398 | positions2 <- start(BSseq2)
399 | rawPs <- as.matrix(bsseq::getMeth(BSseq, type = "raw"))
400 | coverage <- as.matrix(bsseq::getCoverage(BSseq))
401 |
402 | ## get col, lwd, lty these are extracted from the pData data.frame that is
403 | ## part of the bsseq object colEtc is a list object that contains col, lty,
404 | ## lwd and label which are used as plotting parameters label is a condition
405 | ##label to use for adding a legend with condition names
406 |
407 | colEtc <- .dmrGetMeta(object = BSseq, col = col, lty = lty, lwd = lwd,
408 | label = label)
409 |
410 | if (includeYlab) {
411 | yl <- "Methylation"
412 | } else {
413 | yl <- ""
414 | }
415 |
416 | ## The actual plotting starts here
417 | plot(positions[1], 0.5, type = "n", xaxt = "n", yaxt = "n", ylim = c(0, 1),
418 | xlim = c(start(gr),
419 | end(gr)), xlab = "", ylab = yl)
420 | axis(side = 2, at = c(0, 0.25, 0.5, 0.75, 1), las = 1)
421 | if (addTicks)
422 | rug(positions)
423 |
424 | if (is.list(addRegions) && !is.data.frame(addRegions)) {
425 | if (length(addRegions) > 2) {
426 | stop("Only two sets of regions can be highlighted")
427 | }
428 | if (length(regionCol) == 1) {
429 | regionCol <- c(regionCol, .alpha("blue", 0.2))
430 | }
431 | bsseq.bsHighlightRegions(regions = addRegions[[1]], gr = gr, ylim = c(0,
432 | 1), regionCol = regionCol[1], highlightMain = highlightMain)
433 | bsseq.bsHighlightRegions(regions = addRegions[[2]], gr = gr, ylim = c(0,
434 | 1), regionCol = regionCol[2], highlightMain = highlightMain)
435 |
436 | } else {
437 | bsseq.bsHighlightRegions(regions = addRegions, gr = gr, ylim = c(0, 1),
438 | regionCol = regionCol,
439 | highlightMain = highlightMain)
440 | }
441 |
442 | # add points first to avoid lines getting hidden by plotting many cpg points
443 | if (addPoints) {
444 | for(sampIdx in seq_len(ncol(BSseq))){
445 | .dmrPlotPoints(positions, rawPs[, sampIdx], coverage[, sampIdx],
446 | col = colEtc$col[sampIdx],
447 | pointsMinCov = pointsMinCov,
448 | maxCov = quantile(coverage, 0.95),
449 | regionWidth = end(gr) -
450 | start(gr))
451 | }
452 | }
453 |
454 | if (addLines){
455 | for(sampIdx in seq_len(ncol(BSseq))){
456 | if (sum(coverage[, sampIdx] >= linesMinCov) > 1){
457 | .dmrPlotLines(positions, rawPs[, sampIdx], coverage[, sampIdx],
458 | col = colEtc$col[sampIdx],
459 | lwd = colEtc$lwd[sampIdx],
460 | linesMinCov = linesMinCov,
461 | maxCov = quantile(coverage, 0.95),
462 | regionWidth = end(gr) -
463 | start(gr),
464 | lty = colEtc$lty[sampIdx])
465 | }
466 | }
467 | }
468 |
469 | # if colEtc$label contains characters that are not null or missing, then
470 | # create a legend which houses the labels as well as the colors that
471 | # correspond to them -> pass in both colEtc$label as well as colEtc$col
472 | if (sum(!is.na(colEtc$label)) == length(colEtc$label)) {
473 | .dmrPlotLegend(plotRange = c(start(gr), end(gr)),
474 | colEtc$col, colEtc$label, horizLegend)
475 | }
476 |
477 | }
478 |
479 | # function doesn't need to be exported; not a user-level function since a single
480 | # DMR can be plotted just fine with plotDMRs.
481 | .plotSingleDMR <- function(BSseq, region = NULL, extend = 0, main = "",
482 | addRegions = NULL, annoTrack = NULL, col = NULL, lty = NULL, lwd = NULL,
483 | label = NULL, mainWithWidth = TRUE, regionCol = .alpha("orchid1", 0.2),
484 | addTicks = TRUE, addPoints = FALSE, pointsMinCov = 5, highlightMain = FALSE,
485 | qval = NULL, stat = NULL, includeYlab = TRUE, compareTrack = NULL,
486 | labelCols = NULL, horizLegend = FALSE, addLines = TRUE, linesMinCov = 1) {
487 |
488 | if(!is.null(annoTrack) || !is.null(compareTrack)){
489 | layout(matrix(seq_len(2), ncol = 1), heights = c(2, 1.5))
490 | }else{
491 | layout(matrix(seq_len(2), ncol = 1), heights = c(2, 0.2))
492 | }
493 | .dmrPlotSmoothData(BSseq = BSseq, region = region, extend = extend,
494 | addRegions = addRegions,
495 | col = col, lty = lty, lwd = lwd, label = label,
496 | regionCol = regionCol, addTicks = addTicks,
497 | addPoints = addPoints, pointsMinCov = pointsMinCov,
498 | highlightMain = highlightMain,
499 | includeYlab = includeYlab,
500 | horizLegend = horizLegend,
501 | addLines = addLines,
502 | linesMinCov = linesMinCov)
503 | gr <- bsseq.bsGetGr(BSseq, region, extend)
504 |
505 | if (!is.null(main)) {
506 | if (qval && stat) {
507 | qval <- round(region$qval, 4)
508 | stat <- round(region$stat, 3)
509 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main,
510 | mainWithWidth = mainWithWidth,
511 | qval = qval, stat = stat)
512 | } else if (stat) {
513 | stat <- round(region$stat, 3)
514 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main,
515 | mainWithWidth = mainWithWidth, stat = stat)
516 | } else if (qval) {
517 | qval <- round(region$qval, 4)
518 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main,
519 | mainWithWidth = mainWithWidth, qval = qval)
520 | } else {
521 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main,
522 | mainWithWidth = mainWithWidth)
523 | }
524 | mtext(side = 3, text = main, outer = FALSE, cex = 0.8, line = 0)
525 | }
526 |
527 | if (!is.null(annoTrack)) {
528 | dmrPlotAnnotations(gr, annoTrack)
529 | } else if (!is.null(compareTrack)) {
530 | dmrPlotComparisons(gr, compareTrack, labelCols = labelCols)
531 | }
532 | }
533 |
534 | # pasting bsseq's .bsGetGr function since not exported
535 | bsseq.bsGetGr <- function(object, region, extend) {
536 | if (is.null(region)) {
537 | gr <- GRanges(seqnames = seqnames(object)[1],
538 | ranges = IRanges(start = min(start(object)),
539 | end = max(start(object))))
540 | } else {
541 | if (is(region, "data.frame")){
542 | gr <- data.frame2GRanges(region, keepColumns = FALSE)
543 | }else{
544 | gr <- region
545 | }
546 | if (!is(gr, "GRanges") || length(gr) != 1)
547 | stop("'region' needs to be either a 'data.frame' ",
548 | "(with a single row) or a 'GRanges' (with a single element)")
549 | gr <- resize(gr, width = 2 * extend + width(gr), fix = "center")
550 | }
551 | gr
552 | }
553 |
554 | # pasting bsseq's .bsHighlightRegions function since not exported
555 | bsseq.bsHighlightRegions <- function(regions, gr, ylim, regionCol,
556 | highlightMain) {
557 | if (is.data.frame(regions))
558 | regions <- data.frame2GRanges(regions)
559 | if (highlightMain)
560 | regions <- c(regions, gr)
561 | if (is.null(regions))
562 | return(NULL)
563 | regions <- subsetByOverlaps(regions, gr)
564 | regions <- pintersect(regions, rep(gr, length(regions)))
565 | if (length(regions) == 0)
566 | return(NULL)
567 | rect(xleft = start(regions), xright = end(regions), ybottom = ylim[1],
568 | ytop = ylim[2], col = regionCol, border = NA)
569 | }
570 |
571 | gg_color_hue <- function(n) {
572 | hues <- seq(15, 375, length = n + 1)
573 | hcl(h = hues, l = 65, c = 100)[seq_len(n)]
574 | }
575 |
576 | # function to draw nonoverlapping comparison regions (up to 3) below the main
577 | # region plot (instead of annotations)
578 | dmrPlotComparisons <- function(gr, annoTrack, labelCols = NULL) {
579 |
580 | if (!is(annoTrack, "SimpleGRangesList"))
581 | stop("'annoTrack' needs to be a 'SimpleGRangesList'")
582 |
583 | if (length(annoTrack) > 4)
584 | stop("Can't plot more than 4 tracks")
585 |
586 | plot(start(gr), 1, type = "n", xaxt = "n", yaxt = "n", bty = "n",
587 | ylim = c(0, length(annoTrack) + 0.5), xlim = c(start(gr), end(gr)),
588 | xlab = "", ylab = "")
589 |
590 | colourCount <- length(annoTrack)
591 | getPalette <- colorRampPalette(.alpha(brewer.pal(max(length(annoTrack), 3),
592 | "Dark2"),
593 | 0.4))
594 | color <- getPalette(colourCount)
595 | bord <- "black"
596 |
597 | lapply(seq(along = annoTrack), function(ii) {
598 | jj <- length(annoTrack) + 1 - ii
599 | ir <- subsetByOverlaps(annoTrack[[ii]], gr)
600 | start(ir) <- pmax(start(ir), start(gr))
601 | end(ir) <- pmin(end(ir), end(gr))
602 |
603 | top.pos <- 4.1 - (4 - length(annoTrack))
604 | bot.pos <- 0 + (4 - length(annoTrack)) * 0.1
605 | jj.between <- (top.pos - bot.pos)/length(annoTrack)
606 |
607 | if (length(ir) > 0) {
608 | jj <- top.pos - (ii - 1) * jj.between
609 |
610 | arrows(start(ir), jj, end(ir), jj, code = 3, length = 0.05,
611 | angle = 90, col = .makeTransparent(color[ii], alpha = 185))
612 | if (is.null(labelCols)) {
613 | text((end(gr) + start(gr))/2, jj - 0.4, names(annoTrack)[[ii]])
614 | }else if(sum(labelCols %in% colnames(mcols(annoTrack[[ii]]))) > 0) {
615 | whichLabelCols <- match(labelCols,
616 | colnames(mcols(annoTrack[[ii]])))
617 | notmiss <- !is.na(whichLabelCols)
618 | whichLabelCols <- whichLabelCols[notmiss]
619 | if (class(unlist(mcols(annoTrack[[ii]])[1,
620 | whichLabelCols, drop = TRUE])) ==
621 | "numeric") {
622 | comps <- round(unlist(mcols(ir)[1,
623 | whichLabelCols, drop = TRUE]),
624 | 3)
625 | } else {
626 | comps <- unlist(mcols(ir)[1, whichLabelCols, drop = TRUE])
627 | }
628 | if (sum(grepl("\\.", labelCols)) > 0) {
629 | labelCols <- gsub("\\.", " ", labelCols)
630 | }
631 | Label <- paste0(names(annoTrack)[ii], ": ",
632 | paste0(labelCols[notmiss],
633 | "=", comps, collapse = ", "))
634 | text((end(gr) + start(gr))/2, jj - 0.4, Label,
635 | cex = 0.85, col = color[ii])
636 | } else {
637 | text((end(gr) + start(gr))/2, jj - 0.4, names(annoTrack)[[ii]])
638 | }
639 | }
640 | })
641 | }
642 |
--------------------------------------------------------------------------------
/vignettes/dmrseq.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Analyzing Bisulfite-seq data with dmrseq"
3 | author: "Keegan Korthauer"
4 | date: "`r BiocStyle::doc_date()`"
5 | package: "`r BiocStyle::pkg_ver('dmrseq')`"
6 | bibliography: dmrseqBib.bib
7 | abstract: >
8 | A basic task in the analysis of count data from Whole Genome
9 | Bisulfite-Sequencing is the detection of differentially methylated regions.
10 | The count data consist of, for each sample, the number of methylated
11 | reads and the total number of reads covering CpG.
12 | An important analysis question is to detect regions (collections of
13 | neighboring CpGs) with systematic differences between conditions,
14 | as compared to within-condition variability. These so-called Differentially
15 | Methylated Regions (DMRs) are thought to be more informative than single CpGs
16 | in terms of of biological function. Although several methods exist
17 | to quantify and perform statistical inference on changes at the individual
18 | CpG level, detection of DMRs is still limited to aggregating signifiant
19 | CpGs without proper inference at the region level. The package **dmrseq**
20 | addresses this gap by providing a rigorous permutation-based approach to
21 | detect and perform inference for differential methylation by use of
22 | generalized least squares models that account for inter-individual and
23 | inter-CpG variability to generate region-level statistics that can be
24 | comparable across the genome. This allows the framework to perform well even
25 | on samples as small as two per group. This vignette explains the
26 | use of the package and demonstrates typical workflows. This vignette was
27 | generated with dmrseq package version `r packageVersion("dmrseq")`
28 | output:
29 | BiocStyle::html_document:
30 | highlight: pygments
31 | toc_float: true
32 | fig_width: 5
33 | vignette: >
34 | %\VignetteIndexEntry{Analyzing Bisulfite-seq data with dmrseq}
35 | %\VignetteEngine{knitr::rmarkdown}
36 | %\VignetteEncoding[utf8]{inputenc}
37 | ---
38 |
39 |
40 |
41 |
42 |
43 | ```{r setup, echo=FALSE, results="hide"}
44 | knitr::opts_chunk$set(tidy=FALSE, cache=FALSE,
45 | dev="png",
46 | message=FALSE, error=FALSE, warning=TRUE)
47 | ```
48 |
49 | # Quick start
50 |
51 | **If you use dmrseq in published research, please cite:**
52 |
53 | > Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R.A.
54 | > Detection and accurate False Discovery Rate control of differentially
55 | methylated regions from Whole Genome Bisulfite Sequencing
56 | > *Biostatistics*, 2018 (in press).
57 |
58 | This package builds upon the
59 | [bsseq](http://bioconductor.org/packages/bsseq) package [@Hansen2012],
60 | which provides efficient storage and manipulation of bisulfite
61 | sequencing data and inference for differentially methylated CpGs.
62 | The main goal of **dmrseq** [@Korthauer183210]
63 | is to provide inference for differentially methylated *regions*, or
64 | groups of CpGs.
65 |
66 | Here we show the most basic steps for a differential methylation
67 | analysis. There are a variety of steps upstream of **dmrseq** that result
68 | in the generation of counts of methylated reads and total reads covering each
69 | CpG for each sample, including mapping of sequencing reads to a reference
70 | genome with and without bisulfite conversion. You can use the software
71 | of your preference for this step (one option is
72 | [Bismark](https://www.bioinformatics.babraham.ac.uk/projects/bismark/)), as
73 | long as you are able to obtain counts of methylation and coverage (as
74 | opposed to solely methylation proportions, as discussed below).
75 |
76 | This package uses a specific data structure to store and manipulate
77 | bisulfite sequencing data introduced by the **bsseq** package. This data
78 | structure is a *class* called `BSseq`. Objects of the class `BSseq` contain
79 | all pertinent information for a bisulfite sequencing experiment, including
80 | the number of reads corresponding to methylation, and the total number
81 | of reads at each
82 | CpG site, the location of each CpG site, and experimental metadata on the
83 | samples. Note that here we focus on CpG methylation, since this is the
84 | most common form of methylation in humans and many other organisms; take
85 | care when applying this method to other types of methylation and make sure
86 | that it will
87 | be able to scale to the number of methylation sites, and that similar
88 | assumptions can be made regarding spatial correlation. Also note that
89 | the default settings for smoothing parameters and spacing/gap parameters
90 | are set to values that we found useful, but may need to be altered for
91 | datasets for other organisms.
92 |
93 | To store your data in a `BSseq` object, make sure you have the following
94 | neccessary components:
95 |
96 | 1. genomic positions, including chromosome and location, for methylation loci.
97 |
98 | 2. a (matrix) of M (Methylation) values, describing the number of reads
99 | supporting methylation covering a single loci.
100 | Each row in this matrix is a methylation loci and each column is a sample.
101 |
102 | 3. a (matrix) of Cov (Coverage) values,
103 | describing the total number of reads covering a single loci.
104 | Each row in this matrix is a methylation loci and each column is a sample.
105 |
106 | The following code chunk assumes that `chr` and `pos` are vectors of
107 | chromosome names and positions, respectively, for each CpG in the dataset.
108 | You can also provide a `GRanges` object instead of `chr` and `pos`. It
109 | also assumes that the matrices of methylation and coverage values (described
110 | above) are named `M` and `Cov`, respectively. Note, `M` and `Cov` can also
111 | be data stored on-disk (not in memory) using HDF5 files with the `HDF5Array`
112 | package or `DelayedMatrix` with the `DelayedArray` package.
113 |
114 | The `sampleNames` and `trt` objects are
115 | vectors with sample labels and condition labels for each sample. A condition
116 | label could be something like
117 | treatment or control, a tissue type, or a continous measurement.
118 | This is the covariate for which you wish to test for differences in
119 | methylation. Once the `BSseq` object is constructed and the sample covariate
120 | information is added, DMRs are obtained by running the `dmrseq` function.
121 | A continuous covariate is assumed if the data type of the `testCovariate`
122 | arugment in `dmrseq` is
123 | continuous, with the exception of if there are only two unique values
124 | (then a two group comparison is carried out).
125 |
126 | ```{r quickStart, eval=FALSE}
127 | bs <- BSseq(chr = chr, pos = pos,
128 | M = M, Cov = Cov,
129 | sampleNames = sampleNames)
130 | pData(bs)$Condition <- trt
131 |
132 | regions <- dmrseq(bs=bs, testCovariate="Condition")
133 | ```
134 |
135 | For more information on constructing and manipulating `BSseq` objects,
136 | see the [bsseq](http://bioconductor.org/packages/bsseq) vignettes.
137 |
138 | * If you used *Bismark* to align your bisulfite sequencing data,
139 | you can use the `read.bismark` function to read bismark files
140 | into `BSseq` objects. See below for more details.
141 |
142 | # How to get help for dmrseq
143 |
144 | Please post **dmrseq** questions to the
145 | **Bioconductor support site**, which serves as a searchable knowledge
146 | base of questions and answers:
147 |
148 |
149 |
150 | Posting a question and tagging with "dmrseq" will automatically send
151 | an alert to the package authors to respond on the support site. See
152 | the first question in the list of [Frequently Asked Questions](#FAQ)
153 | (FAQ) for information about how to construct an informative post.
154 |
155 | # Input data
156 |
157 | ## Why counts instead of methylation proportions?
158 |
159 | As input, the **dmrseq** package expects count data as obtained, e.g.,
160 | from Bisulfite-sequencing. The value in the *i*-th row and the *j*-th column of
161 | the `M` matrix tells how many methylated reads can be assigned to CpG *i*
162 | in sample *j*. Likewise, the value in the *i*-th row and the *j*-th column of
163 | the `Cov` matrix tells how many total reads can be assigned to CpG *i*
164 | in sample *j*. Although we might be tempted to combine these matrices into
165 | one matrix that contains the methylation *proportion* (`M`/`Cov`) at each CpG
166 | site, it is critical to notice that this would be throwing away a lot of
167 | information. For example, some sites have much higher coverage than others,
168 | and naturally, we have more confidence in those with many reads mapping to them.
169 | If we only kept the proportions, a CpG with 2 out of 2 reads methylated would
170 | be treated the same as a CpG with 30 out of 30 reads methylated.
171 |
172 | ## How many samples do I need?
173 |
174 | To use **dmrseq**, you need to have at least 2 samples in each condition.
175 | Without this replicates, it is impossible to distinguish between biological
176 | variability due to condition/covariate of interest, and inter-individual
177 | variability within condition.
178 |
179 | If your experiment contains additional samples, perhaps from other conditions
180 | that are not of interest in the current test, these should be filtered out
181 | prior to running **dmrseq**. Rather than creating a new filtered object,
182 | the filtering step can be included in the call to the main function `dmrseq`.
183 | For more details, see the
184 | [Filtering CpGs and samples Section](#filtering-cpgs-and-samples).
185 |
186 | ## Bismark input
187 |
188 | If you used Bismark for mapping and methylation level extraction, you can
189 | use the `read.bismark` function from the **bsseq** package to read the
190 | data directly into
191 | a `BSeq` object.
192 |
193 | The following example is from the help page of the function. After running
194 | Bismark's methylation extractor, you should have output files with names
195 | that end in `.bismark.cov.gz`. You can specify a vector of file names with
196 | the `file` argument, and a corresponding vector of `sampleNames`. It is
197 | recommended that you set `rmZeroCov` to TRUE in order to remove CpGs with
198 | no coverage in any of the samples, and set `strandCollapse` to TRUE in order
199 | to combine CpGs on opposite strands into one observation (since CpG methylation)
200 | is symmetric.
201 |
202 | ```{r bismarkinput}
203 | library(dmrseq)
204 | infile <- system.file("extdata/test_data.fastq_bismark.bismark.cov.gz",
205 | package = 'bsseq')
206 | bismarkBSseq <- read.bismark(files = infile,
207 | rmZeroCov = TRUE,
208 | strandCollapse = FALSE,
209 | verbose = TRUE)
210 | bismarkBSseq
211 | ```
212 |
213 | See the [bsseq](http://bioconductor.org/packages/bsseq) help pages for
214 | more information on using this function.
215 |
216 | ## Count matrix input
217 |
218 | If you haven't used Bismark, but you have count data for number of methylated
219 | reads and total coverage for each CpG, along with their corresponding chromosome
220 | and position information, you can construct a `BSseq` object from scratch,
221 | like below. Notice that the `M` and `Cov` matrices have the same dimension, and
222 | `chr` and `pos` have the same number of elements as rows in the count matrices
223 | (which corresponds to the number of CpGs). Also note that the number of columns
224 | in the count matrices matches the number of elements in `sampleNames` and the
225 | condition variable 'celltype`.
226 |
227 | ```{r dissect, results="hide", echo=FALSE}
228 | data("BS.chr21")
229 | M <- getCoverage(BS.chr21, type="M")
230 | Cov <- getCoverage(BS.chr21, type="Cov")
231 | chr <- as.character(seqnames(BS.chr21))
232 | pos <- start(BS.chr21)
233 | celltype <- pData(BS.chr21)$CellType
234 | sampleNames <- sampleNames(BS.chr21)
235 | ```
236 |
237 | ```{r fromScratch}
238 | head(M)
239 | head(Cov)
240 | head(chr)
241 | head(pos)
242 |
243 | dim(M)
244 | dim(Cov)
245 | length(chr)
246 | length(pos)
247 |
248 | print(sampleNames)
249 | print(celltype)
250 |
251 | bs <- BSseq(chr = chr, pos = pos,
252 | M = M, Cov = Cov,
253 | sampleNames = sampleNames)
254 | show(bs)
255 | ```
256 | ```{r cleanup, results="hide", echo=FALSE}
257 | rm(M, Cov, pos, chr, bismarkBSseq)
258 | ```
259 |
260 | The example data contains CpGs from chromosome 21 for four samples
261 | from @Lister2009. To load this data directly (already in the `BSseq` format),
262 | simply type `data(BS.chr21)`.
263 | Two of the samples are replicates of the cell type 'imr90'
264 | and the other two are replicates of the cell type 'h1'. Now that we have the
265 | data loaded into a `BSseq` object, we can use **dmrseq**
266 | to find regions of the genome where these two cell types have significantly
267 | different methylation levels. But first, we need to add the sample metadata
268 | that indicates which samples are from which cell type (the `celltype`
269 | varialbe above). This information, which we call 'metadata',
270 | will be used by the `dmrseq` function to decide
271 | which samples to compare to one another. The next section shows how to add
272 | this information to the `BSseq` object.
273 |
274 | ## Sample metadata
275 |
276 | To add sample metadata, including the covariate of interest, you can add it
277 | to the
278 | `BSseq` object by adding columns to the `pData` slot. You must have at least
279 | one column of `pData`, which contains the covariate of interest. Additional
280 | columns are optional.
281 |
282 | ```{r meta}
283 | pData(bs)$CellType <- celltype
284 | pData(bs)$Replicate <- substr(sampleNames,
285 | nchar(sampleNames), nchar(sampleNames))
286 |
287 | pData(bs)
288 | ```
289 |
290 | We will then tell the `dmrseq` function which metadata variable to use
291 | for testing for methylation differences by setting the `testCovariate`
292 | parameter equal to its column name.
293 |
294 | ## Smoothing
295 |
296 | Note that unlike in **bsseq**, you do not need to carry out the smoothing step
297 | with a separate function. In addition, you should not use **bsseq**'s `BSmooth()`
298 | function to smooth the methylation levels, since **dmrseq** smooths in a very
299 | different way. Briefly, **dmrseq** smooths methylation *differences*, so it
300 | carries out the smoothing step once. This is automatically done with the main
301 | `dmrseq` function. **bsseq** on the other hand, smooths each sample
302 | independently, so smoothing needs to be carried out once per sample.
303 |
304 | ## Filtering CpGs and samples
305 |
306 | For pairwise comparisons, **dmrseq** analyzes all CpGs that have at least one
307 | read in at least one sample per group.
308 | Thus, if your dataset contains CpGs with zero reads in every sample within a
309 | group, you should filter them out prior to running `dmrseq`. Likewise,
310 | if your `bsseq` object contains extraneous samples that are part of the
311 | experiment but not the differential methylation testing of interest, these
312 | should be filtered out as well.
313 |
314 | Filtering `bsseq` objects is straightforward:
315 |
316 | * Subset rows to filter CpG loci
317 | * Subset columns to filter samples
318 |
319 | If we wish to remove all CpGs that have no coverage in at least one sample
320 | and only keep samples with a CellType of "imr90" or "h1", we would do so with:
321 |
322 | ```{r, filter}
323 | # which loci and sample indices to keep
324 | loci.idx <- which(DelayedMatrixStats::rowSums2(getCoverage(bs, type="Cov")==0) == 0)
325 | sample.idx <- which(pData(bs)$CellType %in% c("imr90", "h1"))
326 |
327 | bs.filtered <- bs[loci.idx, sample.idx]
328 | ```
329 | ```{r, results="hide", echo=FALSE}
330 | rm(bs.filtered)
331 | ```
332 |
333 | Note that this is a trivial example, since our toy example object `BS.chr21`
334 | already contains only loci with coverage at least one read in all samples as well
335 | as only samples from the "imr90" and "h1" conditions.
336 |
337 | Also note that instead of creating a separate object, the filtering step
338 | can be combined with the call to `dmrseq` by replacing the `bs` input with a
339 | filtered version `bs[loci.idx, sample.idx]`.
340 |
341 | ## Adjusting for covariates
342 |
343 | There are two ways to adjust for covariates in the dmrseq model. The first way
344 | is to specify the `adjustCovariate` parameter of the `dmrseq()` function as
345 | a column of the `pData()` slot that contains the covariate you
346 | would like to adjust for. This will include that covariate directly in the
347 | model. This is ideal if the adjustment covariate is continuous or has more
348 | than two groups.
349 |
350 | The second way is to specify the `matchCovariate` parameter of the `dmrseq`
351 | function as a column of the `pData()` slot that contains the covariate you
352 | would like to match on. This will restrict the permutations considered to only
353 | those where the `matchCovariate` is balanced. For example, the `matchCovariate`
354 | could represent the sex of each sample. In that case, a permutation that
355 | includes all males in one group and all females in another would not be
356 | considered (since there is a plausible biological difference that may induce
357 | the null distribution to resemble non-null). This matching adjustment is ideal
358 | for two-group comparisons.
359 |
360 | # Differentially Methylated Regions
361 |
362 | The standard differential expression analysis steps are wrapped
363 | into a single function, `dmrseq`. The estimation steps performed
364 | by this function are described briefly below, as well as in
365 | more detail in the **dmrseq** paper. Here we run the results for a subset
366 | of 20,000 CpGs in the interest of computation time.
367 |
368 | ```{r mainfunction, message=TRUE, warning=TRUE}
369 | testCovariate <- "CellType"
370 | regions <- dmrseq(bs=bs[240001:260000,],
371 | cutoff = 0.05,
372 | testCovariate=testCovariate)
373 | ```
374 |
375 | Progress messages are printed to the console if `verbose` is TRUE.
376 | The text, `condition h1 vs imr90`, tells you that positive methylation
377 | differences mean h1 has higher methylation than imr90 (see below for
378 | more details).
379 |
380 | ## Output of dmrseq
381 |
382 | The results object is a `GRanges` object with the coordiates
383 | of each candidate region, and contains the following metadata columns (which
384 | can be extracted with the `$` operator:
385 |
386 | 1. `L` = the number of CpGs contained in the region,
387 | 2. `area` = the sum of the smoothed beta values
388 | 3. `beta` = the coefficient value for the condition difference (Note: if the
389 | test covariate is categorical with more than 2 groups, there will be
390 | more than one beta column),
391 | 4. `stat` = the test statistic for the condition difference,
392 | 5. `pval` = the permutation _p_-value for the significance of the test
393 | statistic, and
394 | 6. `qval` = the _q_-value for the test statistic (adjustment
395 | for multiple comparisons to control false discovery rate).
396 | 7. `index = an `IRanges` containing the indices of the region's
397 | first CpG to last CpG.
398 |
399 | ```{r, showresults}
400 | show(regions)
401 | ```
402 |
403 | The above steps are carried out on a very small subset of data (20,000 CpGs).
404 | This package loads data into memory one chromosome at a
405 | time. For on human data, this means objects with a few million
406 | entries per sample (since there are roughly 28.2 million total CpGs in the human
407 | genome, and the largest chromosomes will have more than 2 million CpGs).
408 | This means that whole-genome `BSseq` objects for several samples can use up
409 | several GB of RAM. In order to improve speed, the package allows for easy
410 | parallel processing of chromosomes, but be aware that using more cores will
411 | also require the use of more RAM.
412 |
413 | To use more cores, use the `register` function of
414 | [BiocParallel](http://bioconductor.org/packages/BiocParallel). For example,
415 | the following chunk (not evaluated here), would register 4 cores, and
416 | then the functions above would
417 | split computation over these cores.
418 |
419 | ```{r parallel, eval=FALSE}
420 | library("BiocParallel")
421 | register(MulticoreParam(4))
422 | ```
423 | ## Steps of the dmrseq method
424 |
425 | **dmrseq** is a two-stage approach that first detects candidate regions and then
426 | explicitly evaluates statistical significance at the region level while
427 | accounting for known sources of variability.
428 | Candidate DMRs are defined by segmenting the genome into groups of CpGs
429 | that show consistent evidence of differential methylation.
430 | Because the methylation levels of neighboring CpGs are highly correlated,
431 | we first smooth the signal to combat loss of power due to low coverage as done
432 | in **bsseq**.
433 |
434 | In the second stage, we compute a statistic for each candidate
435 | DMR that takes into account variability between biological replicates
436 | and spatial correlation among neighboring loci. Significance of each
437 | region is assessed via a permutation procedure which uses a pooled null
438 | distribution that can be generated from as few as two biological replicates,
439 | and false discovery rate is controlled using the Benjamini-Hochberg
440 | procedure.
441 |
442 | For more details, refer to the **dmrseq** paper [@Korthauer183210].
443 |
444 | ## Detecting large-scale methylation blocks
445 |
446 | The default smoothing parameters (`bpSpan`, `minInSpan`, and `maxGapSmooth`)
447 | are designed to focus on local DMRs, generally in the range of hundreds to
448 | thousands of bases. In some applications, such as cancer, it is of interest
449 | to effectively 'zoom out' in order to detect larger (lower-resolution)
450 | methylation blocks on the order of hundreds of thousands to millions of bases.
451 | To do so, you can
452 | set the `block` argument to true, which will only include candidate regions with
453 | at least `blockSize` basepairs (default = 5000). This setting will also merge
454 | candidate regions that (1) are in the same direction and (2) are less than 1kb
455 | apart with no covered CpGs separating them. The region-level model used is also
456 | slightly modified - instead of a loci-specific intercept for each CpG in the
457 | region, the intercept term is modeled as a natural spline with one interior
458 | knot per each 10kb of length (up to 10 interior knots).
459 |
460 | In addition, detecting large-scale blocks requires that
461 | the smoothing window be increased to minimize the impact of noisy local
462 | methylation measurements. To do so, the values of the
463 | smoothing parameters should be increased. For example, to use a smoothing window
464 | that captures at least 500 CpGs or 50,000 basepairs that are spaced apart by no
465 | more than 1e6 bases, use `minInSpan=500`, `bpSpan=5e4`, and `maxGapSmooth=1e6`.
466 | In addition, to avoid a block being broken up simply due to a gap with no
467 | covered CpGs, you can increase the `maxGap` parameter.
468 |
469 | ```{r blocks, message=TRUE, warning=TRUE}
470 | testCovariate <- "CellType"
471 | blocks <- dmrseq(bs=bs[120001:125000,],
472 | cutoff = 0.05,
473 | testCovariate=testCovariate,
474 | block = TRUE,
475 | minInSpan = 500,
476 | bpSpan = 5e4,
477 | maxGapSmooth = 1e6,
478 | maxGap = 5e3)
479 | head(blocks)
480 | ```
481 |
482 | The top hit is `r signif(width(blocks)[1]/1e3, 3)` thousand basepairs wide.
483 | In general, it also might be advised to decrease the cutoff when detecting
484 | blocks, since a smaller methylation
485 | difference might be biologically significant if it is maintained
486 | over a large genomic region. Note that block-finding can be more computationally
487 | intensive since we are fitting region-level models to large numbers of CpGs at a
488 | time. In the toy example above we are only searching over 5,000 CpGs (which
489 | span
490 | `r signif((max(end(bs[120001:125000,])) -
491 | min(start(bs[120001:125000,])))/1e3,3)`
492 | thousand basepairs), so we do not find enough null
493 | candidate regions to carry out inference and obtain significance levels.
494 |
495 | # Exploring and exporting results
496 |
497 | ## Explore how many regions were significant
498 |
499 | How many regions were significant at the FDR (_q_-value) cutoff of 0.05? We
500 | can find this by counting how many values in the `qval` column of the `regions`
501 | object were less than 0.05.
502 | You can also subset the regions by an FDR cutoff.
503 |
504 | ```{r}
505 | sum(regions$qval < 0.05)
506 |
507 | # select just the regions below FDR 0.05 and place in a new data.frame
508 | sigRegions <- regions[regions$qval < 0.05,]
509 | ```
510 |
511 | ## Hypo- or Hyper- methylation?
512 |
513 | You can determine the proportion of regions with hyper-methylation by counting
514 | how many had a positive direction of effect (positive statistic).
515 |
516 | ```{r hyper}
517 | sum(sigRegions$stat > 0) / length(sigRegions)
518 | ```
519 |
520 | To interpret the direction of effect, note that for a two-group comparison
521 | **dmrseq** uses alphabetical order of the covariate of interest.
522 | The condition with a higher alphabetical rank will become the reference category.
523 | For example, if
524 | the two conditions are "A" and "B", the "A" group will be the reference category,
525 | so a positive direction of effect means that
526 | "B" is hyper-methylated relative to "A". Conversely, a negative direction of
527 | effect means that "B" is hypo-methylated relative to "A".
528 |
529 | ## Plot DMRs
530 |
531 | It can be useful to visualize individual DMRs, so we provide a plotting
532 | function that is based off of **bsseq**'s plotting functions. There is also
533 | functionality to add annotations using the
534 | [annotatr](http://bioconductor.org/packages/annotatr) package to
535 | see the nearby CpG categories (island, shore, shelf, open sea) and nearby
536 | coding sequences.
537 |
538 | To retrieve annotations for genomes supported by **annotatr**, use the
539 | helper function `getAnnot`, and pass this annotation object to the `plotDMRs`
540 | function as the `annoTrack` parameter.
541 |
542 | ```{r plot, out.width='\\textwidth', fig.height = 2.5, warning=FALSE}
543 | # get annotations for hg18
544 | annoTrack <- getAnnot("hg18")
545 |
546 | plotDMRs(bs, regions=regions[1,], testCovariate="CellType",
547 | annoTrack=annoTrack)
548 | ```
549 |
550 | Here we also plot the top methylation block from the block analysis:
551 |
552 | ```{r plotblock, out.width='\\textwidth', fig.height = 2.5, warning=FALSE}
553 | plotDMRs(bs, regions=blocks[1,], testCovariate="CellType",
554 | annoTrack=annoTrack)
555 | ```
556 |
557 | ## Plot distribution of methylation values and coverage
558 |
559 | It can also be helpful to visualize overall distributions of methylation values
560 | and / or coverage. The function `plotEmpiricalDistribution` will plot the
561 | methylation values of
562 | the covariate of interest (specified with `testCovariate`).
563 |
564 | ```{r plot2, fig.height=3}
565 | plotEmpiricalDistribution(bs, testCovariate="CellType")
566 | ```
567 |
568 | By changing the `type` argument to `Cov`, it will also plot the distribution of
569 | coverage values. In addition, samples can be plotted separately by setting
570 | `bySample` to true.
571 |
572 | ```{r plot3, fig.height=3}
573 | plotEmpiricalDistribution(bs, testCovariate="CellType",
574 | type="Cov", bySample=TRUE)
575 | ```
576 |
577 | ## Exporting results to CSV files
578 |
579 | A plain-text file of the results can be exported using the
580 | base R functions *write.csv* or *write.delim*.
581 | We suggest using a descriptive file name indicating the variable
582 | and levels which were tested.
583 |
584 | ```{r export, eval=FALSE}
585 | write.csv(as.data.frame(regions),
586 | file="h1_imr90_results.csv")
587 | ```
588 |
589 | ## Extract raw mean methylation differences
590 |
591 | For a two-group comparison, it might be of interest to extract the raw mean
592 | methylation differences over the DMRs. This can be done with the helper function
593 | `meanDiff`. For example, we can extract the raw mean difference values for
594 | the regions at FDR level 0.05 (using the `sigRegions` object created
595 | in the section
596 | [Explore how many regions were significant](#explore-how-many-regions-were-significant)).
597 |
598 | ```{r, meandiff}
599 | rawDiff <- meanDiff(bs, dmrs=sigRegions, testCovariate="CellType")
600 | str(rawDiff)
601 | ```
602 |
603 | # Simulating DMRs
604 |
605 | If you have multiple samples from the same condition (e.g. control samples),
606 | the function `simDMRS` will split these into two artificial sample groups
607 | and then add _in silico_ DMRs. This can then be used to assess sensitivity
608 | and specificity of DMR approaches, since we hope to be able to recover the
609 | DMRs that were spiked in, but not identify too many other differences (since
610 | we don't expect any biological difference between the two artificial sample
611 | groups).
612 |
613 | The use of this function is demonstrated below, although note that in this
614 | toy example, we do not have enough samples from the same biological condition to
615 | split into two groups, so instead we shuffle the cell types to create a null
616 | sample comparison.
617 |
618 | ```{r, sim}
619 | data(BS.chr21)
620 |
621 | # reorder samples to create a null comparison
622 | BS.null <- BS.chr21[1:20000,c(1,3,2,4)]
623 |
624 | # add 100 DMRs
625 | BS.chr21.sim <- simDMRs(bs=BS.null, num.dmrs=100)
626 |
627 | # bsseq object with original null + simulated DMRs
628 | show(BS.chr21.sim$bs)
629 |
630 | # coordinates of spiked-in DMRs
631 | show(BS.chr21.sim$gr.dmrs)
632 |
633 | # effect sizes
634 | head(BS.chr21.sim$delta)
635 | ```
636 |
637 | The resulting object is a list with the following elements:
638 |
639 | * `gr.dmrs`: a `GRanges` object containing the coordinates of the true spiked
640 | in DMRs
641 | * `dmr.mncov`: a numeric vector containing the mean coverage of the
642 | simulated DMRs
643 | * `dmr.L`: a numeric vector containing the sizes (number of CpG loci) of the
644 | simulated DMRs
645 | * `bs`: the `BSSeq` object containing the original null data + simulated DMRs
646 | * `delta`: a numeric vector of effect sizes (proportion differences) of the
647 | simulated DMRs.
648 |
649 | # Session info
650 |
651 | ```{r sessionInfo}
652 | sessionInfo()
653 | ```
654 |
655 | # References
656 |
--------------------------------------------------------------------------------
/R/dmrseq.R:
--------------------------------------------------------------------------------
1 | #' Main function for detecting and evaluating significance of DMRs.
2 | #'
3 | #' Performs a two-step approach that (1) detects candidate regions, and
4 | #' (2) scores candidate regions with an exchangeable (across the genome)
5 | #' statistic and evaluates statistical significance using a
6 | #' permuation test on the pooled null distribution of scores.
7 | #'
8 | #' @param bs bsseq object containing the methylation values as well as the
9 | #' phenotype matrix that contains sample level covariates
10 | #' @param testCovariate Character value indicating which variable
11 | #' (column name) in \code{pData(bs)} to test
12 | #' for association of methylation levels.
13 | #' Can alternatively specify an integer value indicating
14 | #' which of column of
15 | #' \code{pData(bs)} to use. This is used to construct the
16 | #' design matrix for the test statistic calculation. To run using a
17 | #' continuous or categorial covariate with more than two groups, simply pass in
18 | #' the name of a column in `pData` that contains this covariate. A continuous
19 | #' covariate is assumued if the data type in the `testCovariate` slot is
20 | #' continuous, with the exception of if there are only two unique values
21 | #' (then a two group comparison is carried out).
22 | #' @param adjustCovariate an (optional) character value or vector
23 | #' indicating which variables (column names) in \code{pData(bs)}
24 | #' will be adjusted for when
25 | #' testing for the association of methylation value with the
26 | #' \code{testCovariate}.
27 | #' Can alternatively specify an
28 | #' integer value or vector indicating
29 | #' which of the columns of \code{pData(bs)} to adjust for.
30 | #' If not NULL (default), then this is also used to
31 | #' construct the design matrix for the test statistic calculation.
32 | #' @param matchCovariate An (optional) character value
33 | #' indicating which variable (column name) of \code{pData(bs)}
34 | #' will be blocked for when
35 | #' constructing the permutations in order to
36 | #' test for the association of methylation value with the
37 | #' \code{testCovariate}, only to be used when \code{testCovariate}
38 | #' is a two-group factor and the number of permutations possible is less
39 | #' than 500000.
40 | #' Alternatively, you can specify an integer value indicating
41 | #' which column of \code{pData(bs)} to block for.
42 | #' Blocking means that only permutations with balanced
43 | #' composition of \code{testCovariate} values will be used (for example if
44 | #' you have samples from different gender and this is not your covariate of
45 | #' interest,
46 | #' it is recommended to use gender as a matching covariate to avoid one
47 | #' of the permutations testing entirely males versus females; this violates
48 | #' the null hypothesis and will decrease power).
49 | #' If not NULL (default), then no blocking is performed.
50 | #' @param minInSpan positive integer that represents the minimum number of
51 | #' CpGs in a smoothing span window if \code{smooth} is TRUE.
52 | #' Default value is 30.
53 | #' @param minNumRegion positive integer that represents the minimum number of
54 | #' CpGs to consider for a candidate region. Default value is 5.
55 | #' Minimum value is 3.
56 | #' @param cutoff scalar value that represents the absolute value (or a vector
57 | #' of two numbers representing a lower and upper bound) for the cutoff of
58 | #' the single CpG coefficient that is used to discover
59 | #' candidate regions. Default value is 0.10.
60 | #' @param smooth logical value that indicates whether or not to smooth the
61 | #' CpG level signal when discovering candidate regions.
62 | #' Defaults to TRUE.
63 | #' @param bpSpan a positive integer that represents the length in basepairs
64 | #' of the smoothing span window if \code{smooth} is TRUE. Default value is
65 | #' 1000.
66 | #' @param verbose logical value that indicates whether progress messages
67 | #' should be printed to stdout. Defaults value is TRUE.
68 | #' @param BPPARAM a \code{BiocParallelParam} object to specify the parallel
69 | #' backend. The default
70 | #' option is \code{BiocParallel::bpparam()} which will automatically creates
71 | #' a cluster appropriate for the operating system.
72 | #' @param maxPerms a positive integer that represents the maximum number
73 | #' of permutations that will be used to generate the global null
74 | #' distribution of test statistics. Default value is 10.
75 | #' @param maxGap integer value representing maximum number of basepairs in
76 | #' between neighboring CpGs to be included in the same DMR.
77 | #' @param maxGapSmooth integer value representing maximum number of basepairs
78 | #' in between neighboring CpGs to be included in the same
79 | #' cluster when performing smoothing (should generally be larger than
80 | #' \code{maxGap})
81 | #' @param stat a character vector indicating the name of the column of the
82 | #' output to use as the region-level test statistic. Default value is 'stat'
83 | #' which is the region level-statistic designed to be comparable across the
84 | #' genome.
85 | #' It is not recommended to change this argument, but it can be done for
86 | #' experimental purposes. Possible values are: 'L' - the number of loci
87 | #' in the region, 'area' - the sum of the smoothed loci statistics,
88 | #' 'beta' - the effect size of the region, 'stat' - the test statistic for
89 | #' the region, or 'avg' - the average smoothed loci statistic.
90 | #' @param block logical indicating whether to search for large-scale (low
91 | #' resolution) blocks of differential methylation (default is FALSE, which
92 | #' means that local DMRs are desired). If TRUE, the parameters for
93 | #' \code{bpSpan}, \code{minInSpan}, and \code{maxGapSmooth} should be adjusted
94 | #' (increased) accordingly. This setting will also merge
95 | #' candidate regions that (1) are in the same direction and (2) are less than
96 | #' 1kb apart with no covered CpGs separating them. The region-level model used
97 | #' is also slightly modified - instead of a loci-specific intercept for each
98 | #' CpG in theregion, the intercept term is modeled as a natural spline with
99 | #' one interior knot per each 10kb of length (up to 10 interior knots).
100 | #' @param blockSize numeric value indicating the minimum number of basepairs
101 | #' to be considered a block (only used if \code{block}=TRUE). Default is
102 | #' 5000 basepairs.
103 | #' @param chrsPerChunk a positive integer value indicating the number of
104 | #' chromosomes per chunk. The default is 1, meaning that the data will be
105 | #' looped through one chromosome at a time. When pairing up multiple
106 | #' chromosomes per chunk, sizes (in terms of numbers of CpGs) will be taken
107 | #' into consideration to balance the sizes of each chunk.
108 | #' @return a \code{GRanges} object that contains the results of the inference.
109 | #' The object contains one row for each candidate region, sorted by q-value
110 | #' and then chromosome. The standard
111 | #' \code{GRanges} chr, start, and end are included, along with at least
112 | #' 7 metadata
113 | #' columns, in the following order:
114 | #' 1. L = the number of CpGs contained in the region,
115 | #' 2. area = the sum of the smoothed beta values
116 | #' 3. beta = the coefficient value for the condition difference (there
117 | #' will be more than one column here if a multi-group comparison
118 | #' was performed),
119 | #' 4. stat = the test statistic for the condition difference,
120 | #' 5. pval = the permutation p-value for the significance of the test
121 | #' statistic, and
122 | #' 6. qval = the q-value for the test statistic (adjustment
123 | #' for multiple comparisons to control false discovery rate).
124 | #' 7. index = an \code{IRanges} containing the indices of the region's
125 | #' first CpG to last CpG.
126 | #'
127 | #' @keywords inference
128 | #' @importFrom outliers grubbs.test
129 | #' @importFrom bumphunter clusterMaker getSegments
130 | #' @importFrom DelayedMatrixStats colMedians rowMads rowSums2 rowMeans2 rowDiffs
131 | #' @importFrom matrixStats rowRanges
132 | #' @importFrom stats formula anova as.formula
133 | #'
134 | #' @importClassesFrom bsseq BSseq
135 | #' @importMethodsFrom bsseq pData seqnames sampleNames start width
136 | #'
137 | #' @importFrom grDevices col2rgb colorRampPalette dev.off pdf rgb
138 | #' @importFrom graphics axis layout legend lines mtext par
139 | #' plot points rect rug text
140 | #' @importFrom methods is
141 | #' @importFrom stats approxfun lm loess median model.matrix p.adjust
142 | #' predict preplot qt quantile rbeta rbinom runif
143 | #' @importFrom utils combn
144 | #' @importFrom BiocParallel bplapply register MulticoreParam bpparam
145 | #' @importFrom splines ns
146 | #'
147 | #' @import bsseq
148 | #' @import GenomicRanges
149 | #' @import nlme
150 | #' @import annotatr
151 | #' @import ggplot2
152 | #' @import S4Vectors
153 | #'
154 | #' @export
155 | #'
156 | #' @examples
157 | #'
158 | #' # load example data
159 | #' data(BS.chr21)
160 | #'
161 | #' # the covariate of interest is the 'CellType' column of pData(BS.chr21)
162 | #' testCovariate <- 'CellType'
163 | #'
164 | #' # run dmrseq on a subset of the chromosome (10K CpGs)
165 | #' regions <- dmrseq(bs=BS.chr21[240001:250000,],
166 | #' cutoff = 0.05,
167 | #' testCovariate=testCovariate)
168 | #'
169 | dmrseq <- function(bs, testCovariate, adjustCovariate = NULL, cutoff = 0.1,
170 | minNumRegion = 5, smooth = TRUE, bpSpan = 1000,
171 | minInSpan = 30, maxGapSmooth = 2500, maxGap = 1000,
172 | verbose = TRUE,
173 | maxPerms = 10, matchCovariate = NULL,
174 | BPPARAM = bpparam(), stat = "stat",
175 | block = FALSE, blockSize = 5000,
176 | chrsPerChunk = 1) {
177 |
178 | stopifnot(is(bs, "BSseq"))
179 |
180 | if (!(is.null(cutoff) || length(cutoff) %in% seq_len(2)))
181 | stop("'cutoff' has to be either NULL or a vector of length 1 or 2")
182 | if (length(cutoff) == 2)
183 | cutoff <- sort(cutoff)
184 | if (is.null(cutoff) | abs(cutoff) > 1 | abs(cutoff) == 0)
185 | stop("Must specify a value for cutoff between 0 and 1")
186 | subverbose <- max(as.integer(verbose) - 1L, 0)
187 |
188 | if(minNumRegion < 3){
189 | stop("minNumRegion must be at least 3")
190 | }
191 |
192 | # check statistic name
193 | if (!(stat %in% c("L", "area", "beta", "stat", "avg"))) {
194 | stop("Specified '", stat,
195 | "' as the test statistic which is not ",
196 | "in the results. Please specify a valid name from one of ",
197 | "L, area, beta, stat, or avg")
198 | }
199 |
200 | # informative message about blocks if block=TRUE; check for increased
201 | # smoothing window
202 | if (block){
203 | message("Searching for large scale blocks with at least ",
204 | blockSize, " basepairs.")
205 |
206 | if(minInSpan < 100 && bpSpan < 2000 && maxGapSmooth < 1e5){
207 | warning("When block=TRUE, it is recommended to increase the values ",
208 | "of minInSpan, bpSpan, and maxGapSmooth in order to widen ",
209 | "the smoothing window")
210 | }
211 | }
212 |
213 | # convert covariates to column numbers if characters
214 | if (is.character(testCovariate)) {
215 | if(length(testCovariate) > 1)
216 | stop("Only one testCovariate can be specified")
217 | if(is.character(adjustCovariate)){
218 | if(sum(testCovariate %in% adjustCovariate) > 0)
219 | stop("adjustCovariate can't contain testCovariate")
220 | }
221 | if(is.character(matchCovariate)){
222 | if(sum(testCovariate %in% matchCovariate))
223 | stop("matchCovariate can't contain testCovariate")
224 | }
225 | testCovariate <- which(colnames(pData(bs)) == testCovariate)
226 | if (length(testCovariate) == 0) {
227 | stop("testCovariate not found in pData(). ",
228 | "Please specify a valid testCovariate")
229 | }
230 | }
231 |
232 | if (is.character(adjustCovariate)) {
233 | if(is.character(matchCovariate)){
234 | if(matchCovariate == adjustCovariate)
235 | stop("matchCovariate can't be identical to adjustCovariate")
236 | }
237 | adjustCovariate <- which(colnames(pData(bs)) %in% adjustCovariate)
238 | if (length(adjustCovariate) == 0) {
239 | stop("adjustCovariate not found in pData(). ",
240 | "Please specify a valid adjustCovariate")
241 | }
242 | }
243 |
244 | # check that chrsPerChunk value makes sense
245 | if (chrsPerChunk != 1){
246 | if (chrsPerChunk%%1 != 0){
247 | stop("chrsPerChunk must be an integer")
248 | }else if(chrsPerChunk < 1){
249 | stop("chrsPerChunk must be strictly positive")
250 | }else if(chrsPerChunk > length(unique(seqnames(bs)))){
251 | stop("chrsPerChunk can't be larger than the total",
252 | " number of chromosomes")
253 | }
254 | }
255 |
256 | # check that bs object is sorted since `bsseq::BSseq()` no longer
257 | # automatically sorts to ensure loci from same chr are indexed consecutively
258 | if (is.unsorted(bs)) {
259 | stop("'bs' must be sorted. Use 'sort(bs)'.")
260 | }
261 |
262 | # construct the design matrix using the pData of bs
263 | if (ncol(pData(bs)) < max(testCovariate, adjustCovariate)) {
264 | stop("Error: pData(bs) has too few columns. ","
265 | Please specify valid ",
266 | "covariates to use in the analysis")
267 | }
268 |
269 | coeff <- seq(2,(2 + length(testCovariate) - 1))
270 | testCov <- pData(bs)[, testCovariate]
271 | if (is.factor(testCov)) # drop unused levels of test
272 | testCov <- droplevels(testCov)
273 |
274 | # check for missing values in testCov
275 | if (any(is.na(testCov))) {
276 | stop("Missing values found in the testCovariate ",
277 | "Please remove these samples from the analysis.")
278 | }
279 |
280 |
281 | fact <- TRUE
282 | sampleSize <- table(testCov)[names(table(testCov)) %in% pData(bs)[,testCovariate]]
283 | if (length(unique(testCov)) == 1) {
284 | message("Warning: only one unique value of the specified ",
285 | "covariate of interest. Assuming null comparison and ",
286 | "splitting sample group into two equal groups")
287 | testCov <- rep(1, length(testCov))
288 | testCov[seq_len(round(length(testCov)/2))] <- 0
289 | }else if (length(unique(testCov)) > 2 && !is.numeric(testCov)) {
290 | message("Performing a global test of H0: no difference among ",
291 | length(unique(testCov)), " groups (assuming the test ",
292 | "covariate ", colnames(pData(bs))[testCovariate],
293 | " is a factor).")
294 | coeff <- seq(coeff, coeff + length(unique(testCov)) - 2)
295 | }else if (length(unique(testCov)) > 2 && is.numeric(testCov)) {
296 | message("Assuming the test ",
297 | "covariate ", colnames(pData(bs))[testCovariate],
298 | " is continuous.")
299 | fact <- FALSE
300 | }else{
301 | message("Assuming the test ",
302 | "covariate ", colnames(pData(bs))[testCovariate],
303 | " is a factor.")
304 | if(min(sampleSize) < 2)
305 | stop("At least one group has only one sample! ",
306 | "Replicates are required to run dmrseq.")
307 | testCov <- as.factor(testCov)
308 | }
309 |
310 | if (!is.null(adjustCovariate)) {
311 | mmdat <- data.frame(testCov = testCov)
312 | adjustCov <- pData(bs)[, adjustCovariate, drop = FALSE]
313 |
314 | # check for missing values in adjustCovariate
315 | if (any(is.na(adjustCov))) {
316 | stop("Missing values found in the adjustCovariate. ",
317 | "Please remove these samples from the analysis.")
318 | }
319 |
320 | # check for number of unique values per adjust cov
321 | nunq <- apply(adjustCov, 2, function(x) length(unique(x)))
322 | if (any(nunq < 2))
323 | stop("At least one adjust covariate is constant across samples.",
324 | " Please remove this covariate from the model and try again.")
325 |
326 | # remove any empty factor levels
327 | for (f in 1:ncol(adjustCov)){
328 | if (is.factor(adjustCov[,f]))
329 | adjustCov[,f] <- droplevels(adjustCov[,f])
330 | }
331 |
332 | mmdat <- cbind(mmdat, adjustCov)
333 | frm <- paste0("~", paste0(colnames(mmdat), collapse = " + "))
334 | design <- model.matrix(as.formula(frm), data=mmdat)
335 | colnames(design)[coeff] <- colnames(pData(bs))[testCovariate]
336 | coeff.adj <- (max(coeff) + 1):(ncol(design))
337 | } else {
338 | design <- model.matrix(~testCov)
339 | colnames(design)[coeff] <- colnames(pData(bs))[testCovariate]
340 | coeff.adj <- NULL
341 | }
342 |
343 | # check model matrix is full rank
344 | e <- eigen(crossprod(as.matrix(design)), symmetric = TRUE, only.values = TRUE)$values
345 | if (! (e[1] > 0 && abs(e[length(e)]/e[1]) > 1e-13)){
346 | stop("Design matrix is not full rank")
347 | }
348 |
349 | # check for empty factor levels in design matrix
350 | if (sum(colSums(design) == 0) > 0){
351 | which.empty <- which(colSums(design) == 0)
352 | design <- design[,-which.empty]
353 | }
354 |
355 | # check that p <= n
356 | if (ncol(bs) < ncol(design) + 1)
357 | stop("Not enough degrees of freedom to estimate ", ncol(design)-1,
358 | " covariates using ", ncol(bs), " samples. Please use a larger ",
359 | "number of samples, or specify fewer adjust covariates.")
360 |
361 | # check for incompatible args
362 | if (fact && !is.null(matchCovariate) && length(unique(testCov)) > 2)
363 | stop("matchCovariate can't be used when testCovariate is not a 2-group ",
364 | "factor. Perhaps you'd like to add an adjustCovariate instead?")
365 |
366 | if(!is.null(matchCovariate) && choose(nrow(design), min(sampleSize)) >= 5e5)
367 | stop("matchCovariate can't be used when the sample size is large enough ",
368 | "to yield more than 500000 possible permutations. ",
369 | "Perhaps you'd like to add an adjustCovariate instead?")
370 |
371 | # check for interaction terms (not yet supported)
372 | if (length(coeff) > 1 && any(rowSums(design[,coeff]) > 1))
373 | stop("Interaction terms in testCovariate are not yet supported.")
374 |
375 | if (length(unique(testCov)) == 2) {
376 | message("Condition: ",
377 | unique(pData(bs)[, testCovariate][which(design[, coeff] == 1)]),
378 | " vs ",
379 | unique(pData(bs)[, testCovariate][which(design[, coeff] == 0)]))
380 | }
381 | if (!is.null(adjustCovariate)) {
382 | message("Adjusting for covariate (s): ",
383 | paste(colnames(pData(bs))[adjustCovariate], collapse = ", "))
384 | }
385 | if (!is.null(matchCovariate)) {
386 | if (length(matchCovariate) > 1)
387 | stop("Covariate matching can only be carried out for one",
388 | " covariate")
389 | if (length(unique(testCov)) > 2)
390 | stop("Covariate matching can only be carried out for 2-group",
391 | " comparisons")
392 | if (is.character(matchCovariate)) {
393 | if (sum(grepl(matchCovariate, colnames(pData(bs)))) == 0) {
394 | stop("Error: no column in pData() found that matches ",
395 | "the matchCovariate")
396 | } else if (length(grep(matchCovariate, colnames(pData(bs)))) > 1) {
397 | stop("Error: matchCovariate matches more than one ",
398 | "column in pData()")
399 | }
400 | mC <- grep(matchCovariate, colnames(pData(bs)))
401 | } else {
402 | stopifnot(matchCovariate <= ncol(pData(bs)))
403 | }
404 | message("Matching permutations on covariate: ",
405 | colnames(pData(bs))[mC])
406 | }
407 |
408 | # check for loci with missing data
409 | if (fact){
410 | lev <- unique(pData(bs)[[testCovariate]])
411 | filter <- NULL
412 | for (l in seq_along(lev)){
413 | filter <- rbind(filter,
414 | 1*(DelayedMatrixStats::rowSums2(getCoverage(bs)[,pData(bs)[[testCovariate]] ==
415 | lev[l], drop = FALSE]) == 0))
416 | }
417 | filter <- which( apply(filter, 2, max) > 0 )
418 |
419 | if (length(filter) > 0) {
420 | stop(length(filter), " loci have zero coverage in all samples ",
421 | "of at least one condition. Please remove these loci ",
422 | "before running dmrseq")
423 | }
424 |
425 | }else{
426 | filter <- DelayedMatrixStats::rowSums2(getCoverage(bs)==0) >= ncol(bs) - 1
427 | if(sum(filter) > 0)
428 | stop(sum(filter), " loci have zero coverage in at least ",
429 | ncol(bs) - 1, " samples. Please remove these loci ",
430 | "before running dmrseq")
431 | }
432 |
433 | # register the parallel backend
434 | BiocParallel::register(BPPARAM)
435 | backend <- paste0("BiocParallel:", class(bpparam())[1])
436 |
437 | if (bpparam()$workers == 1) {
438 | if (verbose) {
439 | mes <- "Using a single core (backend: %s)."
440 | message(sprintf(mes, backend))
441 | }
442 | parallel <- FALSE
443 | } else {
444 | if (verbose) {
445 | mes <- paste0("Parallelizing using %s workers/cores ",
446 | "(backend: %s).")
447 | message(sprintf(mes, bpparam()$workers, backend))
448 | }
449 | parallel <- TRUE
450 | }
451 | message("Computing on ", chrsPerChunk,
452 | " chromosome(s) at a time.\n")
453 |
454 | message("Detecting candidate regions with coefficient larger than ",
455 | unique(abs(cutoff)),
456 | " in magnitude.")
457 | OBS <- bumphunt(bs=bs, design = design,
458 | coeff = coeff, coeff.adj = coeff.adj, minInSpan = minInSpan,
459 | minNumRegion = minNumRegion, cutoff = cutoff,
460 | maxGap = maxGap, maxGapSmooth = maxGapSmooth,
461 | smooth = smooth, bpSpan = bpSpan, verbose = verbose,
462 | parallel = parallel, block = block, blockSize = blockSize,
463 | chrsPerChunk = chrsPerChunk, fact = fact,
464 | adjustCovariate = adjustCovariate)
465 |
466 | # check that at least one candidate region was found; if there were none
467 | # there is no need to go on to compute permutation tests...
468 |
469 | if (length(OBS) > 0) {
470 | message("* ", nrow(OBS), " candidates detected")
471 | FLIP <- NULL
472 | # configure the permutation matrix for two group comparisons
473 | if (length(unique(design[, coeff[1]])) == 2 &&
474 | length(coeff) == 1 &&
475 | choose(nrow(design), min(sampleSize)) < 5e5 ) {
476 | if (verbose) {
477 | message("Performing balanced permutations of ",
478 | "condition across samples ",
479 | "to generate a null distribution of region test statistics")
480 | }
481 | perms <- combn(seq(1, nrow(design)), min(sampleSize))
482 |
483 | # Remove redundant permutations (if balanced)
484 | if (length(unique(table(design[,coeff]))) == 1){
485 | perms <- perms[, seq_len(ncol(perms)/2)]
486 | }
487 |
488 | # restrict to unique permutations that don't include any
489 | # groups consisting of all identical conditions
490 | rmv <- NULL
491 | for (p in seq_len(ncol(perms))){
492 | if (length(unique(design[perms[,p],coeff])) == 1){
493 | rmv <- c(rmv, p)
494 | }
495 | }
496 | if (length(rmv) > 0 )
497 | perms <- perms[,-rmv]
498 |
499 | # subsample permutations based on similarity to original partition
500 | # gives preference to those with the least similarity
501 | if (maxPerms < ncol(perms)) {
502 | similarity <- apply(perms, 2, function(x) {
503 | max(table(design[x,coeff]))
504 | })
505 | perms.all <- perms
506 | perms <- NULL
507 | levs <- sort(unique(similarity))
508 | l <- 1
509 | num <- 0
510 | while(!(num == maxPerms) && l <= length(levs)) {
511 | keep <- sample(which(similarity == levs[l]),
512 | min(maxPerms-num, sum(similarity == levs[l])) )
513 | perms <- cbind(perms, perms.all[,keep])
514 | l <- l + 1
515 | num <- ncol(perms)
516 | }
517 | }
518 | } else {
519 | # Next consider a multilevel, or continuous covariate where the
520 | # covariate will be permuted in an unrestricted manner
521 | if (verbose) {
522 | message("Performing unrestricted permutation of",
523 | " covariate of interest across samples ",
524 | "to generate a null distribution of region test statistics")
525 | }
526 | perms <- as.matrix(seq_len(nrow(design)))
527 |
528 | for (p in seq_len(maxPerms)) {
529 | tries <- 0
530 | candidate <- sample(seq_len(nrow(design)), nrow(design))
531 | # check that the permutation is not a duplicate, and not
532 | # equal to the original
533 | while ((sum(apply(perms, 2, function(x)
534 | all.equal(x, candidate)) == TRUE) > 0 ||
535 | sum(apply(perms, 2, function(x)
536 | all.equal(x, rev(candidate))) == TRUE) > 0) &&
537 | tries <= 20) {
538 | candidate <- sample(seq(seq_len(nrow(design))), nrow(design))
539 | tries <- tries + 1
540 | }
541 | # save the permutation to the permutation matrix
542 | if (tries <= 20){
543 | perms <- cbind(perms, candidate)
544 | }
545 | }
546 | perms <- perms[,-1] # remove original
547 | }
548 |
549 | pData.orig <- pData(bs)
550 | levs <- unique(pData.orig[[testCovariate]])
551 | # Now rerun on permuted designs and concatenate results
552 | for (j in seq_len(ncol(perms))) {
553 | if (verbose) {
554 | message("\nBeginning permutation ", j)
555 | }
556 | reorder <- perms[, j]
557 | designr <- design
558 |
559 | if (length(unique(design[, coeff[1]])) == 2 &&
560 | length(coeff) == 1 &&
561 | !nrow(perms) == nrow(designr)) {
562 | designr[, coeff] <- 0
563 | designr[reorder, coeff] <- 1
564 | pData(bs)[[testCovariate]] <- levs[1]
565 | pData(bs)[[testCovariate]][reorder] <- levs[2]
566 |
567 | if (!all(sort(pData.orig[[testCovariate]]) ==
568 | sort(pData(bs)[[testCovariate]]))){
569 | designr[, coeff] <- 1
570 | designr[reorder, coeff] <- 0
571 | pData(bs)[[testCovariate]] <- levs[2]
572 | pData(bs)[[testCovariate]][reorder] <- levs[1]
573 | }
574 |
575 | xr <- NULL
576 | for (rd in seq_len(nrow(pData.orig))) {
577 | match <- which(pData.orig[[testCovariate]] %in%
578 | pData(bs)[rd,][[testCovariate]])
579 | taken <- which(match %in% xr)
580 | if (length(taken) > 0)
581 | match <- match[-taken]
582 | if (length(match) > 0)
583 | xr <- c(xr, match[1])
584 | }
585 | if(length(coeff.adj) > 0){
586 | pData(bs)[,adjustCovariate] <-
587 | pData.orig[xr,adjustCovariate]
588 | }
589 | } else {
590 | designr[, coeff] <- designr[reorder, coeff]
591 | pData(bs) <- pData.orig[reorder, , drop = FALSE]
592 | }
593 |
594 | # if matchCovariate is not null, restrict permutations such that
595 | # null comparisons are balanced for the values of
596 | # pData$matchCovariate this avoids comparison of,
597 | # say two different individuals in the null, that the comparison of
598 | # interest is tissue type. Not matching would mean the null is
599 | # really not null
600 | if (!is.null(matchCovariate)) {
601 | permLabel <- paste0(paste0(pData(bs)[designr[, coeff[1]] == 1,
602 | mC], collapse = "_"),
603 | "vs", paste0(pData(bs)[(1 - designr[, coeff[1]]) == 1,
604 | mC], collapse = "_"))
605 |
606 | c1 <- unlist(strsplit(permLabel, "vs"))[1]
607 | c2 <- unlist(strsplit(permLabel, "vs"))[2]
608 |
609 | c1 <- unlist(strsplit(c1, "_"))
610 | c2 <- unlist(strsplit(c2, "_"))
611 |
612 | keepPerm <- 1 * (sum(c1 %in% c2) > 0 &&
613 | sum(c2 %in% c1) > 0)
614 |
615 | if (keepPerm == 0) {
616 | if (verbose) {
617 | message(paste0("Skipping permutation ",
618 | gsub("vs", " vs ", permLabel)))
619 | }
620 | next
621 | }
622 | } else {
623 | permLabel <- j
624 | }
625 |
626 | res.flip.p <- bumphunt(bs=bs, design = designr,
627 | coeff = coeff,
628 | coeff.adj = coeff.adj,
629 | minInSpan = minInSpan,
630 | minNumRegion = minNumRegion, cutoff = cutoff,
631 | maxGap = maxGap, maxGapSmooth = maxGapSmooth,
632 | smooth = smooth, bpSpan = bpSpan,
633 | verbose = verbose, parallel = parallel,
634 | block = block, blockSize = blockSize,
635 | chrsPerChunk = chrsPerChunk, fact = fact,
636 | adjustCovariate = adjustCovariate)
637 |
638 | if (verbose) {
639 | message("* ", j, " out of ", ncol(perms),
640 | " permutations completed (",
641 | nrow(res.flip.p), " null candidates)")
642 | }
643 |
644 | if (!is.null(res.flip.p)) {
645 | res.flip.p$permNum <- permLabel
646 | FLIP <- rbind(FLIP, res.flip.p)
647 | }
648 | }
649 |
650 | # restore original pData
651 | pData(bs) <- pData.orig
652 |
653 | # if no candidates were found in permutation
654 | # provide informative error message
655 | if (is.null(FLIP)){
656 | warning("No candidate regions found in permutation, so inference ",
657 | "can't be carried out. ",
658 | "Try decreasing the cutoff, or running on a larger ",
659 | "dataset if you are currently using a subset.")
660 | OBS$pval <- NA
661 | OBS$qval <- NA
662 | }else if (nrow(FLIP) < 0.05*nrow(OBS)){
663 | message("Note: Very few null candidate regions were found.",
664 | "For more accurate and sensitive inference, ",
665 | "try decreasing the cutoff, or running on a larger ",
666 | "dataset if you are currently using a subset.")
667 | }
668 |
669 | if (!is.null(FLIP)){
670 | # if there are more than 1 million candidate null regions,
671 | # take a random sample
672 | # of 1 million of them
673 | if (nrow(FLIP) > 1e+06) {
674 | rs <- sample(seq_len(nrow(FLIP)), 1e+06, replace = FALSE)
675 | FLIP <- FLIP[rs, ]
676 | }
677 |
678 | # which column of results to use as test statistic ?
679 | # check statistic name
680 | if (!(stat %in% c(colnames(OBS), "avg"))) {
681 | stop("Specified '", stat,
682 | "' as the test statistic which is not ",
683 | "in the results. Please specify a valid name from one of ",
684 | "L, area, beta, or stat")
685 | } else if (stat == "avg") {
686 | OBS$avg <- OBS$area/OBS$L
687 | FLIP$avg <- FLIP$area/FLIP$L
688 | }
689 |
690 | whichStatO <- which(colnames(OBS) == stat)
691 | whichStatF <- which(colnames(FLIP) == stat)
692 |
693 | # Faster way to compute the p-values that doesn't use multiple cores
694 | # Step 1: sort the permuted statistics vector
695 | perm.ordered <- c(sort(abs(FLIP[, whichStatF]),
696 | method = "quick"), Inf)
697 |
698 | # Step 2: find the first instance in the sorted vector where the
699 | # permuted value is greater than the observed and use this to
700 | # determine the number of permuted values that are greater than or
701 | # equal to theobserved
702 | pval <- rep(NA, nrow(OBS))
703 | pval[!is.na(OBS[, whichStatO])] <- (1 +
704 | vapply(abs(OBS[!is.na(OBS[, whichStatO]), whichStatO]),
705 | function(x) length(perm.ordered) - min(which(x <= perm.ordered)),
706 | numeric(1))) / (1 + sum(!is.na(FLIP[, whichStatF])))
707 |
708 | # missing test statistics cause Inf for the p-value calculation
709 | # instead propagate the missing values
710 | pval[abs(pval) == Inf] <- NA
711 |
712 | pval <- data.frame(x = pval, y = p.adjust(pval, method = "BH"))
713 |
714 | OBS$pval <- pval$x
715 | OBS$qval <- pval$y
716 | }
717 |
718 | # convert output into GRanges, with indexStart/indexEnd as IRanges
719 | indexIR <- IRanges(OBS$indexStart, OBS$indexEnd)
720 | OBS.gr <- makeGRangesFromDataFrame(OBS[,-c(4:5)],
721 | keep.extra.columns = TRUE)
722 | OBS.gr$index <- indexIR
723 | names(OBS.gr) <- NULL
724 |
725 | # sort on pval overall (currently sorted within chromsome)
726 | OBS.gr <- OBS.gr[order(OBS.gr$pval, -abs(OBS.gr$stat)),]
727 |
728 | return(OBS.gr)
729 | } else {
730 | message("No candidate regions pass the cutoff of ", unique(abs(cutoff)))
731 | return(NULL)
732 | }
733 | }
734 |
--------------------------------------------------------------------------------