├── data ├── BS.chr21.rda ├── dmrs.ex.rda └── annot.chr21.rda ├── inst ├── sticker │ ├── dmrseq.png │ └── sticker.R ├── scripts │ ├── get_dmrs.ex.R │ ├── get_annot.chr21.R │ └── get_BS.chr21.R ├── CITATION └── NEWS ├── .gitignore ├── man ├── meanDiff.Rd ├── dmrPlotAnnotations.Rd ├── annot.chr21.Rd ├── dmrs.ex.Rd ├── getAnnot.Rd ├── BS.chr21.Rd ├── plotEmpiricalDistribution.Rd ├── simDMRs.Rd ├── dmrseq.Rd └── plotDMRs.Rd ├── LICENSE ├── vignettes ├── dmrseqBib.bib └── dmrseq.Rmd ├── README.md ├── DESCRIPTION ├── NAMESPACE └── R ├── meanDiff.R ├── getAnnot.R ├── plotEmpiricalDistribution.R ├── simDMRs.R ├── plotDMRs.R ├── internal_plotting_functions.R └── dmrseq.R /data/BS.chr21.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/BS.chr21.rda -------------------------------------------------------------------------------- /data/dmrs.ex.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/dmrs.ex.rda -------------------------------------------------------------------------------- /data/annot.chr21.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/data/annot.chr21.rda -------------------------------------------------------------------------------- /inst/sticker/dmrseq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdkorthauer/dmrseq/HEAD/inst/sticker/dmrseq.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *.Rproj 6 | *.Rbuildignore 7 | vignettes/dmrseq_cache/* 8 | -------------------------------------------------------------------------------- /inst/sticker/sticker.R: -------------------------------------------------------------------------------- 1 | imgurl <- "https://clipartion.com/wp-content/uploads/2015/11/bird-island-clipart-free-clip-art-images-830x301.png" 2 | 3 | sticker(imgurl, 4 | package="dmrseq", 5 | p_size=8, 6 | s_x=1, 7 | s_y=1.2, 8 | s_width=0.85, 9 | s_height=1, 10 | p_x = 0.95, 11 | p_y = 0.6, 12 | h_color = "gold2", 13 | h_fill = "dodgerblue", 14 | p_color = "gold2", 15 | filename="./inst/sticker/dmrseq.png") -------------------------------------------------------------------------------- /inst/scripts/get_dmrs.ex.R: -------------------------------------------------------------------------------- 1 | ## The following script uses the dmrseq package to 2 | ## constructs the dmrs.ex dataset, 3 | ## an example of dmrseq output 4 | ## included in the dmrseq package for example purposes 5 | 6 | library(dmrseq) 7 | 8 | # load example data 9 | data(BS.chr21) 10 | 11 | # the covariate of interest is the "CellType" column of pData(BS.chr21) 12 | testCovariate <- "CellType" 13 | 14 | # run dmrseq on a subset of the chromosome (20K CpGs) 15 | dmrs.ex <- dmrseq(bs=BS.chr21[240001:260000,], 16 | cutoff = 0.05, 17 | testCovariate=testCovariate) 18 | 19 | save(dmrs.ex, file = "./data/dmrs.ex.rda") 20 | library(tools) 21 | resaveRdaFiles("./data/dmrs.ex.rda") 22 | -------------------------------------------------------------------------------- /inst/scripts/get_annot.chr21.R: -------------------------------------------------------------------------------- 1 | ## The following script uses the dmrseq package to 2 | ## constructs the annot.chr21 dataset, 3 | ## an example of annotatr annotation to use in plotting 4 | ## included in the dmrseq package for example purposes 5 | 6 | library(dmrseq) 7 | 8 | # load example data 9 | data(BS.chr21) 10 | 11 | # get annotation information for hg19 12 | annot.chr21 <- getAnnot("hg19") 13 | 14 | # only keep this information for chromosome 21 (for example dataset) 15 | annot.chr21[[1]] <- annot.chr21[[1]][seqnames(annot.chr21[[1]])=="chr21",] 16 | annot.chr21[[2]] <- annot.chr21[[2]][seqnames(annot.chr21[[2]])=="chr21",] 17 | 18 | save(annot.chr21 , file = "./data/annot.chr21.rda") 19 | library(tools) 20 | resaveRdaFiles("./data/annot.chr21.rda") 21 | -------------------------------------------------------------------------------- /man/meanDiff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/meanDiff.R 3 | \name{meanDiff} 4 | \alias{meanDiff} 5 | \title{Function to compute raw mean methylation differences} 6 | \usage{ 7 | meanDiff(bs, dmrs, testCovariate) 8 | } 9 | \arguments{ 10 | \item{bs}{a \code{BSseq} object} 11 | 12 | \item{dmrs}{a data.frame with one row per DMR. This can be in the format 13 | of \code{dmrseq} output, but at least should contain the indexStart and 14 | indexEnd values of the regions of interest.} 15 | 16 | \item{testCovariate}{a character indicating the covariate of interest in 17 | the \code{pData} slot of \code{bs}.} 18 | } 19 | \value{ 20 | numeric vector of raw mean methylation differences. 21 | } 22 | \description{ 23 | This function calculates raw mean methylation differences for the 24 | covariate of interest over a set of DMRs (or regions of interest), 25 | assuming a simple two-group comparison. 26 | } 27 | \examples{ 28 | 29 | data(BS.chr21) 30 | data(dmrs.ex) 31 | rawDiff <- meanDiff(BS.chr21, dmrs=dmrs.ex, testCovariate="CellType") 32 | 33 | } 34 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite dmrseq in publications, please use:") 2 | 3 | citEntry(entry = "Article", 4 | title = "Detection and accurate false discovery rate control of differentially methylated regions from whole genome bisulfite sequencing", 5 | author = personList(as.person("Keegan Korthauer"), 6 | as.person("Sutirtha Chakraborty"), 7 | as.person("Yuval Benjamini"), 8 | as.person("Rafael A Irizarry")), 9 | journal = "Biostatistics", 10 | year = "2018", 11 | #volume = "", 12 | #number = "", 13 | pages = "kxy007", 14 | doi = "https://doi.org/10.1093/biostatistics/kxy007", 15 | textVersion = 16 | paste("Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R. A.", 17 | "Detection and accurate false discovery rate control of differentially methylated regions from whole genome bisulfite sequencing.", 18 | "Biostatistics kxy007, https://doi.org/10.1093/biostatistics/kxy007")) 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Keegan Korthauer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /man/dmrPlotAnnotations.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/internal_plotting_functions.R 3 | \name{dmrPlotAnnotations} 4 | \alias{dmrPlotAnnotations} 5 | \title{Add annotations to DMR plots} 6 | \usage{ 7 | dmrPlotAnnotations(gr, annoTrack) 8 | } 9 | \arguments{ 10 | \item{gr}{a \code{GRanges} object that contains the DMRs to be 11 | plotted} 12 | 13 | \item{annoTrack}{a \code{SimpleGRangesList} object with two elements. 14 | The first contains CpG category information in the first element (optional) 15 | coding gene sequence information in the second element (optional). 16 | At least one of these elements needs to be non-null in order for 17 | any annotation to be plotted, but it is not necessary to contain 18 | both.} 19 | } 20 | \value{ 21 | None 22 | } 23 | \description{ 24 | Function to add visual representation of CpG categories 25 | and/or coding 26 | sequences to DMR plot 27 | } 28 | \details{ 29 | An internal function that takes an annotation 30 | \code{SimpleGRangesList} 31 | object that 32 | contains CpG category information in the first element (optional) and / or 33 | coding gene sequence information in the second element (optional). If neither 34 | of these are present, then nothing will be plotted. 35 | } 36 | -------------------------------------------------------------------------------- /man/annot.chr21.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helper_functions.R 3 | \docType{data} 4 | \name{annot.chr21} 5 | \alias{annot.chr21} 6 | \title{annot.chr21: Annotation information for chromosome 21, hg38 genome} 7 | \format{ 8 | a \code{GRangesList} object with two elements returned 9 | by \code{\link{getAnnot}}. The first 10 | contains CpG category information in the first element (optional) 11 | coding gene sequence information in the second element (optional). 12 | At least one of these elements needs to be non-null in order for 13 | any annotation to be plotted, but it is not necessary to contain 14 | both. 15 | } 16 | \source{ 17 | Obtained from running 18 | \code{annoTrack} function and then subsetting the results to 19 | only include chromosome 21. A script which executes these steps 20 | and constructs the \code{annot.chr21} 21 | object may be found in \file{inst/scripts/get_annot.chr21.R} 22 | } 23 | \usage{ 24 | data(annot.chr21) 25 | } 26 | \description{ 27 | This is the annotation information returned from 28 | \code{\link{getAnnot}}, subsetted for chromosome 21 for convenience 29 | in running the examples. The annotation is obtained using the 30 | \code{annotatr} package. 31 | } 32 | \examples{ 33 | data(annot.chr21) 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /man/dmrs.ex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helper_functions.R 3 | \docType{data} 4 | \name{dmrs.ex} 5 | \alias{dmrs.ex} 6 | \title{dmrs.ex: Example results of DMRs} 7 | \format{ 8 | a data.frame that contains the results of the inference. The 9 | data.frame contains one row for each candidate region, and 10 | 10 columns, in the following order: 1. chr = 11 | chromosome, 2. start = 12 | start basepair position of the region, 3. end = end basepair position 13 | of the region, 14 | 4. indexStart = the index of the region's first CpG, 15 | 5. indexEnd = the index of the region's last CpG, 16 | 6. L = the number of CpGs contained in the region, 17 | 7. area = the sum of the smoothed beta values 18 | 8. beta = the coefficient value for the condition difference, 19 | 9. stat = the test statistic for the condition difference, 20 | 10. pval = the permutation p-value for the significance of the test 21 | statistic, and 22 | 11. qval = the q-value for the test statistic (adjustment 23 | for multiple comparisons to control false discovery rate). 24 | } 25 | \source{ 26 | Obtained from running the examples in \code{\link{dmrseq}}. 27 | A script which executes these steps 28 | and constructs the \code{dmrs.ex} 29 | object may be found in \file{inst/scripts/get_dmrs.ex.R} 30 | } 31 | \usage{ 32 | data(dmrs.ex) 33 | } 34 | \description{ 35 | Example output from \code{dmrseq} function run on the 36 | example dataset \code{BS.chr21}. 37 | } 38 | \examples{ 39 | data(dmrs.ex) 40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/getAnnot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getAnnot.R 3 | \name{getAnnot} 4 | \alias{getAnnot} 5 | \title{Retrieve annotation information} 6 | \usage{ 7 | getAnnot(genomeName) 8 | } 9 | \arguments{ 10 | \item{genomeName}{a character object that indicates which organism is 11 | under study. Use the function \code{builtin_genomes()} to see 12 | a character vector of available genome names to choose from (see 13 | \code{annotatr} documentation for more details).} 14 | } 15 | \value{ 16 | a \code{SimpleGRangesList} object with two elements returned 17 | by \code{\link{getAnnot}}. The first 18 | contains CpG category information in the first element (optional) 19 | coding gene sequence information in the second element (optional). 20 | At least one of these elements needs to be non-null in order for 21 | any annotation to be plotted, but it is not necessary to contain 22 | both. 23 | } 24 | \description{ 25 | Uses the \code{annotatr} package to retrieve annotation information ( 26 | CpG category and gene coding sequences) for the \code{annoTrack} argument 27 | of \code{\link{plotDMRs}}. Allows for 5 28 | re-tries if download fails (to allow for a spotty internet connection). 29 | } 30 | \details{ 31 | Note that this package needs to attach the \code{annotatr} package, 32 | and will 33 | return NULL if this cannot be done. You can still use the 34 | \code{\link{plotDMRs}} function without this optional annotation step, 35 | just by leaving the \code{annoTrack} argument as NULL. 36 | } 37 | \examples{ 38 | 39 | # get annotation information for hg19 40 | annoTrack <- getAnnot('hg19') 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /vignettes/dmrseqBib.bib: -------------------------------------------------------------------------------- 1 | @article{Hansen2012, 2 | author = {Hansen, Kasper D and Langmead, Benjamin and Irizarry, Rafael A}, 3 | title = {{BSmooth: from whole genome bisulfite sequencing reads to differentially 4 | methylated regions}}, 5 | journal = {Genome Biology}, 6 | year = {2012}, 7 | volume = {13}, 8 | number = {10}, 9 | pages = {R83}, 10 | doi = {10.1186/gb-2012-13-10-r83}, 11 | pubmed = {23034175} 12 | } 13 | 14 | @article{Lister2009, 15 | author = {Lister, Ryan and Pelizzola, Mattia and Dowen, Robert H and Hawkins, R David and 16 | Hon, Gary C and Tonti-Filippini, Julian and Nery, Joseph R and Lee, Leonard and 17 | Ye, Zhen and Ngo, Que-Minh and Edsall, Lee and Antosiewicz-Bourget, Jessica and 18 | Stewart, Ron and Ruotti, Victor and Millar, A Harvey and Thomson, James A and Ren, 19 | Bing and Ecker, Joseph R}, 20 | title = {{Human DNA methylomes at base resolution show widespread epigenomic differences}}, 21 | journal = {Nature}, 22 | year = {2009}, 23 | volume = {462}, 24 | number = {7271}, 25 | pages = {315--322}, 26 | doi = {10.1038/nature08514}, 27 | pubmed = {19829295} 28 | } 29 | 30 | @article {Korthauer183210, 31 | author = {Korthauer, Keegan and Chakraborty, Sutirtha and Benjamini, Yuval and Irizarry, Rafael A.}, 32 | title = {Detection and accurate False Discovery Rate control of differentially methylated regions from Whole Genome Bisulfite Sequencing}, 33 | year = {2018}, 34 | doi = {10.1101/183210}, 35 | journal = {Biostatistics} 36 | } 37 | 38 | 39 | 40 | % Local Variables: 41 | % eval: (setq fill-column 100) 42 | % End: 43 | -------------------------------------------------------------------------------- /man/BS.chr21.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helper_functions.R 3 | \docType{data} 4 | \name{BS.chr21} 5 | \alias{BS.chr21} 6 | \title{BS.chr21: Whole-genome bisulfite sequencing for chromosome 21 7 | from Lister et al.} 8 | \format{ 9 | An object of class \code{BSseq}. 10 | } 11 | \source{ 12 | Obtained from 13 | \url{http://neomorph.salk.edu/human_methylome/data.html} specifically, 14 | the files \url{mc_h1_r1.tar.gz}, \url{mc_h1_r2.tar.gz}, 15 | \url{mc_imr90_r1.tar.gz}, \url{mc_imr90_r2.tar.gz} 16 | A script which downloads these files and constructs the \code{BS.chr21} 17 | object may be found in \file{inst/scripts/get_BS.chr21.R} - this was 18 | based off of and modified from the get_BS.chr22.R script in the 19 | \code{bsseq} package. The object constructed here contains a 20 | different chromosome (22 instead of 21), and two additional samples 21 | (h1 and imr90 instead of just imr90) to enable identification of 22 | cell type-DMRs for examples. 23 | } 24 | \usage{ 25 | data(BS.chr21) 26 | } 27 | \description{ 28 | This dataset represents chromosome 21 29 | from the IMR90 and H1 cell lines sequenced in Lister et al. 30 | Only CpG methylation are included. The two samples from 31 | each cell line are two different extractions (ie. technical replicates), 32 | and are pooled in the analysis in the original paper. 33 | } 34 | \details{ 35 | All coordinates are in hg18. 36 | } 37 | \examples{ 38 | data(BS.chr21) 39 | BS.chr21 40 | } 41 | \references{ 42 | R Lister et al. \emph{Human DNA methylomes at base 43 | resolution show widespread epigenomic differences}. Nature (2009) 462, 44 | 315-322. 45 | } 46 | \keyword{datasets} 47 | -------------------------------------------------------------------------------- /inst/NEWS: -------------------------------------------------------------------------------- 1 | Changes in Version 1.3.1 (2018-11-02) 2 | _____________________________________ 3 | 4 | o dmrseq now requires that input BSseq objects be ordered (as the BSseq() 5 | constructor no longer automatically orders loci). 6 | 7 | Changes in Version 1.1.20 (2018-10-11) 8 | _____________________________________ 9 | 10 | o Minor bug fix for a rare situation that occured in the case of a multi-level 11 | covariate of interest if the region level model fitting procedure did not 12 | converge. 13 | 14 | Changes in Version 1.1.2 (2018-05-09) 15 | _____________________________________ 16 | 17 | o The newly added `chrsPerChunk` argument specifies the number of chromosomes 18 | to compute at a time (default is 1). 19 | 20 | Changes in Version 0.99.11 (2018-04-05) 21 | ______________________________________ 22 | 23 | o dmrseq now provides support for detecting large-scale methylation blocks. To 24 | use this feature, specify `block=TRUE`, and increase the smoothing span 25 | parameters `minInSpan`, `bpSpan`, and `maxGapSmooth`. More details are 26 | provided in the documentation and vignette. 27 | 28 | Changes in Version 0.99.8 (2018-03-21) 29 | ______________________________________ 30 | 31 | o dmrseq no longer requires balanced, two-group comparisons. To run using a 32 | continuous or categorial covariate with more than two groups, simply pass in 33 | the name of a column in `pData` that contains this covariate. A continuous 34 | covariate is assmued if the data type in the `testCovariate` slot is 35 | continuous, with the exception of if there are only two unique values (then a 36 | two group comparison is carried out). 37 | 38 | Version 0.99.6 (2018-03-02) 39 | ___________________________ 40 | 41 | o dmrseq is now available on Bioconductor devel (3.7) ! 42 | 43 | -------------------------------------------------------------------------------- /man/plotEmpiricalDistribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotEmpiricalDistribution.R 3 | \name{plotEmpiricalDistribution} 4 | \alias{plotEmpiricalDistribution} 5 | \title{Plot the empirical distribution of the methylation beta vals or coverage} 6 | \usage{ 7 | plotEmpiricalDistribution( 8 | bs, 9 | testCovariate = NULL, 10 | bySample = FALSE, 11 | type = "M", 12 | adj = 2.5 13 | ) 14 | } 15 | \arguments{ 16 | \item{bs}{a BSseq object} 17 | 18 | \item{testCovariate}{character specifying the column name of the 19 | \code{pData} slot of the BSseq object to include in the plot legend.} 20 | 21 | \item{bySample}{logical whether to plot a separate line for each sample, 22 | even if the grouping \code{testCovariate} is specified. 23 | Default value is FALSE (so samples with the same value of 24 | \code{testCovariate} will be collapsed into the same line). If 25 | \code{testCovariate} is not specified, this parameter does not have an 26 | effect and samples are automatically plotted separately.} 27 | 28 | \item{type}{a character indicating which type of density to plot - the 29 | methylation (beta) values ("M") or the coverage ("Cov"). Default is "M".} 30 | 31 | \item{adj}{a numeric value for the \code{adjust} parameter to pass to the 32 | \code{geom_line} function. Specifies how smooth the make the function.} 33 | } 34 | \value{ 35 | a ggplot object 36 | } 37 | \description{ 38 | Uses ggplot2 to plot smoothed density histograms of methylation 39 | proportions (beta values), or coverage. Methylation proportion densities 40 | are weighted by coverage. 41 | The number of curves plotted 42 | will be equal to the number of different values of \code{testCovariate}, 43 | unless \code{bySample} is TRUE. This can take quite some time to 44 | execute for a large object, so it is recommended to first take a random 45 | sample of loci (say one million) before plotting. 46 | } 47 | \examples{ 48 | 49 | data(BS.chr21) 50 | 51 | # plot beta values by sample group 52 | plotEmpiricalDistribution(BS.chr21, testCovariate="CellType") 53 | 54 | } 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dmrseq: Inference for differentially methylated regions (DMRs) from bisulfite sequencing 2 | 3 | A central question in the analysis of bisulfite sequencing data 4 | is to detect regions (collections of 5 | neighboring CpGs) with systematic differences between conditions, 6 | as compared to within-condition variability. These so-called *Differentially 7 | Methylated Regions* (DMRs) are thought to be more informative than single CpGs 8 | in terms of of biological function. 9 | 10 |

11 | 12 |

13 | 14 | The package **dmrseq** 15 | provides a rigorous permutation-based approach to 16 | detect and perform inference for differential methylation by use of 17 | generalized least squares models that account for inter-individual and 18 | inter-CpG variability to generate region-level statistics that can be 19 | comparable across the genome. The framework performs well even 20 | on samples as small as two per group. 21 | 22 | ## Installation 23 | 24 | **dmrseq** is available on 25 | [Bioconductor](https://bioconductor.org/packages/dmrseq). You can install 26 | it with R version 3.5.0 or higher with the following commands: 27 | 28 | ``` 29 | install.packages("BiocManager") 30 | BiocManager::install("dmrseq") 31 | ``` 32 | 33 | ## Getting started 34 | 35 | See the vignette for information on how to use the package to perform 36 | typical methylation analysis workflows. 37 | 38 | ## Learn more 39 | 40 | More details of the **dmrseq** framework can be found in the manuscript 41 | 42 | > Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R.A. 43 | > Detection and accurate False Discovery Rate control of differentially 44 | methylated regions from Whole Genome Bisulfite Sequencing 45 | > *Biostatistics*, 2018 (in press). 46 | > [BioRxiv:10.1101/183210](http://www.biorxiv.org/content/early/2017/08/31/183210) 47 | 48 | 49 | ## License/Copyright 50 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 51 | This package is made available under an MIT license. 52 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: dmrseq 2 | Type: Package 3 | Title: Detection and inference of differentially methylated regions 4 | from Whole Genome Bisulfite Sequencing 5 | Version: 1.29.1 6 | Authors@R: c(person("Keegan", "Korthauer", role = c("cre", "aut"), 7 | email = "keegan@stat.ubc.ca", 8 | comment = c(ORCID = "0000-0002-4565-1654")), 9 | person("Rafael", "Irizarry", role = "aut", 10 | email = "rafa@jimmy.harvard.edu", 11 | comment = c(ORCID = "0000-0002-3944-4309")), 12 | person("Yuval", "Benjamini", role = "aut"), 13 | person("Sutirtha", "Chakraborty", role = "aut")) 14 | Description: This package implements an approach for scanning the 15 | genome to detect and perform accurate inference on differentially 16 | methylated regions from Whole Genome Bisulfite Sequencing data. 17 | The method is based on comparing detected regions 18 | to a pooled null distribution, that can be implemented even when as 19 | few as two samples per population are available. 20 | Region-level statistics are obtained by fitting a generalized 21 | least squares (GLS) regression model with a nested autoregressive 22 | correlated error structure for the effect of interest on transformed 23 | methylation proportions. 24 | License: MIT + file LICENSE 25 | Depends: R (>= 3.5), bsseq 26 | Imports: GenomicRanges, 27 | nlme, 28 | ggplot2, 29 | S4Vectors, 30 | RColorBrewer, 31 | bumphunter, 32 | DelayedMatrixStats (>= 1.1.13), 33 | matrixStats, 34 | BiocParallel, 35 | outliers, 36 | methods, 37 | locfit, 38 | IRanges, 39 | grDevices, 40 | graphics, 41 | stats, 42 | utils, 43 | annotatr, 44 | AnnotationHub, 45 | rtracklayer, 46 | GenomeInfoDb, 47 | splines 48 | Suggests: knitr, 49 | rmarkdown, 50 | BiocStyle, 51 | TxDb.Hsapiens.UCSC.hg19.knownGene, 52 | org.Hs.eg.db 53 | biocViews: ImmunoOncology, DNAMethylation, Epigenetics, MultipleComparison, Software, 54 | Sequencing, DifferentialMethylation, WholeGenome, Regression, 55 | FunctionalGenomics 56 | LazyData: true 57 | VignetteBuilder: knitr 58 | RoxygenNote: 7.2.3 59 | -------------------------------------------------------------------------------- /man/simDMRs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simDMRs.R 3 | \name{simDMRs} 4 | \alias{simDMRs} 5 | \title{Simulate Differentially Methylated Regions} 6 | \usage{ 7 | simDMRs(bs, num.dmrs = 3000, delta.max0 = 0.3) 8 | } 9 | \arguments{ 10 | \item{bs}{a BSseq object containing only control samples (from the same 11 | population) for which simulated DMRs will be added after dividing the 12 | population into two artificial groups.} 13 | 14 | \item{num.dmrs}{an integer specifying how many DMRs to add.} 15 | 16 | \item{delta.max0}{a proportion value indicating the mode value for the 17 | difference in proportion of methylated CpGs in the simulated DMRs (the 18 | actual value will be drawn from a scaled Beta distribution centered at 19 | this value). Default value is 0.3.} 20 | } 21 | \value{ 22 | A named list object with 5 elements: (1) 23 | \code{gr.dmrs} is a \code{GRanges} object with \code{num.dmrs} 24 | ranges that represent the random DMRs added. (2) \code{dmr.mncov} is a 25 | numeric vector that contains the mean coverage in each simulated DMR. (3) 26 | \code{dmr.L} is a numeric vector that contains the number of CpGs in each 27 | simulated DMR. (4) \code{bs} is the BSseq object that contains the 28 | simulated DMRs. (5) \code{deltas} is a numeric vector that contains the 29 | effect size used for each DMR. 30 | } 31 | \description{ 32 | Add simulated DMRs to observed control data. Control data will be split 33 | into two (artificial) populations. 34 | } 35 | \examples{ 36 | 37 | # Add simulated DMRs to a BSseq dataset 38 | # This is just for illustrative purposes - ideally you would 39 | # add DMRs to a set of samples from the same condition (in our 40 | # example data, we have data from two different cell types) 41 | # In this case, we shuffle the samples by cell type to create 42 | # a null comparison. 43 | 44 | data(BS.chr21) 45 | 46 | BS.chr21.sim <- simDMRs(bs=BS.chr21[1:10000,c(1,3,2,4)], 47 | num.dmrs=50) 48 | 49 | # show the simulated DMRs GRanges object 50 | show(BS.chr21.sim$gr.dmrs) 51 | 52 | # show the updated BSseq object that includes the simulated DMRs 53 | show(BS.chr21.sim$bs) 54 | 55 | # examine effect sizes of the DMRs 56 | head(BS.chr21.sim$delta) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(dmrseq) 4 | export(getAnnot) 5 | export(meanDiff) 6 | export(plotDMRs) 7 | export(plotEmpiricalDistribution) 8 | export(simDMRs) 9 | import(GenomicRanges) 10 | import(S4Vectors) 11 | import(annotatr) 12 | import(bsseq) 13 | import(ggplot2) 14 | import(nlme) 15 | importClassesFrom(bsseq,BSseq) 16 | importFrom(AnnotationHub,AnnotationHub) 17 | importFrom(AnnotationHub,query) 18 | importFrom(BiocParallel,MulticoreParam) 19 | importFrom(BiocParallel,bplapply) 20 | importFrom(BiocParallel,bpparam) 21 | importFrom(BiocParallel,register) 22 | importFrom(DelayedMatrixStats,colMedians) 23 | importFrom(DelayedMatrixStats,rowDiffs) 24 | importFrom(DelayedMatrixStats,rowMads) 25 | importFrom(DelayedMatrixStats,rowMeans2) 26 | importFrom(DelayedMatrixStats,rowSums2) 27 | importFrom(GenomeInfoDb,genome) 28 | importFrom(IRanges,IRanges) 29 | importFrom(RColorBrewer,brewer.pal) 30 | importFrom(bumphunter,clusterMaker) 31 | importFrom(bumphunter,getSegments) 32 | importFrom(grDevices,col2rgb) 33 | importFrom(grDevices,colorRampPalette) 34 | importFrom(grDevices,dev.off) 35 | importFrom(grDevices,hcl) 36 | importFrom(grDevices,pdf) 37 | importFrom(grDevices,rainbow) 38 | importFrom(grDevices,rgb) 39 | importFrom(graphics,arrows) 40 | importFrom(graphics,axis) 41 | importFrom(graphics,layout) 42 | importFrom(graphics,legend) 43 | importFrom(graphics,lines) 44 | importFrom(graphics,mtext) 45 | importFrom(graphics,par) 46 | importFrom(graphics,plot) 47 | importFrom(graphics,points) 48 | importFrom(graphics,rect) 49 | importFrom(graphics,rug) 50 | importFrom(graphics,text) 51 | importFrom(locfit,locfit) 52 | importFrom(locfit,lp) 53 | importFrom(matrixStats,rowRanges) 54 | importFrom(methods,is) 55 | importFrom(outliers,grubbs.test) 56 | importFrom(rtracklayer,liftOver) 57 | importFrom(splines,ns) 58 | importFrom(stats,anova) 59 | importFrom(stats,approxfun) 60 | importFrom(stats,as.formula) 61 | importFrom(stats,formula) 62 | importFrom(stats,lm) 63 | importFrom(stats,loess) 64 | importFrom(stats,median) 65 | importFrom(stats,model.matrix) 66 | importFrom(stats,p.adjust) 67 | importFrom(stats,predict) 68 | importFrom(stats,preplot) 69 | importFrom(stats,qt) 70 | importFrom(stats,quantile) 71 | importFrom(stats,rbeta) 72 | importFrom(stats,rbinom) 73 | importFrom(stats,runif) 74 | importFrom(utils,combn) 75 | importMethodsFrom(bsseq,pData) 76 | importMethodsFrom(bsseq,sampleNames) 77 | importMethodsFrom(bsseq,seqnames) 78 | importMethodsFrom(bsseq,start) 79 | importMethodsFrom(bsseq,width) 80 | -------------------------------------------------------------------------------- /R/meanDiff.R: -------------------------------------------------------------------------------- 1 | #' Function to compute raw mean methylation differences 2 | #' 3 | #' This function calculates raw mean methylation differences for the 4 | #' covariate of interest over a set of DMRs (or regions of interest), 5 | #' assuming a simple two-group comparison. 6 | #' 7 | #' @param bs a \code{BSseq} object 8 | #' @param dmrs a data.frame with one row per DMR. This can be in the format 9 | #' of \code{dmrseq} output, but at least should contain the indexStart and 10 | #' indexEnd values of the regions of interest. 11 | #' @param testCovariate a character indicating the covariate of interest in 12 | #' the \code{pData} slot of \code{bs}. 13 | #' 14 | #' @return numeric vector of raw mean methylation differences. 15 | #' 16 | #' @importFrom DelayedMatrixStats rowMeans2 17 | #' 18 | #' @export 19 | #' 20 | #' @examples 21 | #' 22 | #' data(BS.chr21) 23 | #' data(dmrs.ex) 24 | #' rawDiff <- meanDiff(BS.chr21, dmrs=dmrs.ex, testCovariate="CellType") 25 | #' 26 | meanDiff <- function(bs, dmrs, testCovariate) { 27 | # convert covariates to column numbers if characters 28 | if (is.character(testCovariate)) { 29 | testCovariate <- which(colnames(pData(bs)) == testCovariate) 30 | if (length(testCovariate) == 0) { 31 | stop("testCovariate not found in pData(). ", 32 | "Please specify a valid testCovariate") 33 | } 34 | } 35 | 36 | coeff <- seq(2,(2 + length(testCovariate) - 1)) 37 | testCov <- pData(bs)[, testCovariate] 38 | if (length(unique(testCov)) == 1) { 39 | message("Warning: only one unique value of the specified ", 40 | "covariate of interest. Assuming null comparison and ", 41 | "splitting sample group into two equal groups") 42 | testCov <- rep(1, length(testCov)) 43 | testCov[seq_len(round(length(testCov)/2))] <- 0 44 | } 45 | 46 | design <- model.matrix(~testCov) 47 | colnames(design)[coeff] <- colnames(pData(bs))[testCovariate] 48 | 49 | if (length(unique(design[, coeff])) != 2) { 50 | message("Not a two-group comparison. Can't compute simple mean ", 51 | "methylation differences. ", 52 | "Returning beta estimates instead") 53 | return(dmrs$beta) 54 | } else { 55 | prop.mat <- getCoverage(bs, type = "M") / 56 | getCoverage(bs, type = "Cov") 57 | levs <- unique(design[, coeff]) 58 | 59 | indexRanges <- IRanges(start(dmrs$index), end(dmrs$index)) 60 | prop.mat.dmr <- extractROWS(prop.mat, indexRanges) 61 | prop.mat1.means <- DelayedMatrixStats::rowMeans2(prop.mat.dmr[, 62 | design[, coeff] == levs[which.min(levs)]], 63 | na.rm=TRUE) 64 | prop.mat2.means <- DelayedMatrixStats::rowMeans2(prop.mat.dmr[, 65 | design[, coeff] == levs[which.max(levs)]], 66 | na.rm=TRUE) 67 | 68 | meanDiff <- IRanges::mean(IRanges::relist(prop.mat2.means - prop.mat1.means, 69 | indexRanges), na.rm=TRUE) 70 | 71 | return(meanDiff) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /inst/scripts/get_BS.chr21.R: -------------------------------------------------------------------------------- 1 | ## The following script downloads and constructs the BS.chr21 dataset, 2 | ## included in the dmrseq package 3 | ## This script was modified from bsseq - includes two additional samples 4 | ## compared to the example data in bsseq 5 | 6 | library(dmrseq) 7 | 8 | ## First we download. Each file is slightly less than 200 MB 9 | # you may need to change the 'method' in 'download.file' to suit 10 | # the utilities available on your OS 11 | 12 | # cell type imr90, replicates 1 and 2 13 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_imr90_r1.tar.gz", 14 | destfile = "mc_imr90_r1.tar.gz", method='curl') 15 | untar("mc_imr90_r1.tar.gz", "mc_imr90_r1/mc_imr90_r1_21", compressed = TRUE) 16 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_imr90_r2.tar.gz", 17 | destfile = "mc_imr90_r2.tar.gz", method='curl') 18 | untar("mc_imr90_r2.tar.gz", "mc_imr90_r2/mc_imr90_r2_21", compressed = TRUE) 19 | 20 | # cell type h1, replicates 1 and 2 21 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_h1_r1.tar.gz", 22 | destfile = "mc_h1_r1.tar.gz", method='curl') 23 | untar("mc_h1_r1.tar.gz", "mc_h1_r1/mc_h1_r1_21", compressed = TRUE) 24 | download.file(url = "ftp://ftpuser3:s3qu3nc3@neomorph.salk.edu/mc/mc_h1_r2.tar.gz", 25 | destfile = "mc_h1_r2.tar.gz", method='curl') 26 | untar("mc_h1_r2.tar.gz", "mc_h1_r2/mc_h1_r2_21", compressed = TRUE) 27 | 28 | 29 | ## Now the workhorse function 30 | 31 | read.lister <- function(file) { 32 | dat <- read.table(file, skip = 1, row.names = NULL, 33 | col.names = c("chr", "pos", "strand", "context", "M", "Cov"), 34 | colClasses = c("character", "integer", "character", 35 | "character", "integer", "integer")) 36 | ## we remove all non-CpG calls. This includes SNPs 37 | dat <- dat[dat$context == "CG",] 38 | dat$context <- NULL 39 | dat$chr <- paste("chr", dat$chr, sep = "") 40 | ## Now we need to handle that the data has separate lines for each strand 41 | ## We join these 42 | tmp <- dat[dat$strand == "+",] 43 | BS.forward <- BSseq(pos = tmp$pos, chr = tmp$chr, 44 | M = as.matrix(tmp$M, ncol = 1), 45 | Cov = as.matrix(tmp$Cov, ncol = 1), 46 | sampleNames = "forward") 47 | tmp <- dat[dat$strand == "-",] 48 | BS.reverse <- BSseq(pos = tmp$pos - 1L, chr = tmp$chr, 49 | M = as.matrix(tmp$M, ncol = 1), 50 | Cov = as.matrix(tmp$Cov, ncol = 1), 51 | sampleNames = "reverse") 52 | BS <- combine(BS.forward, BS.reverse) 53 | BS <- collapseBSseq(BS, columns = c("a", "a")) 54 | BS 55 | } 56 | 57 | BS.imr90.r1 <- read.lister("mc_imr90_r1/mc_imr90_r1_21") 58 | sampleNames(BS.imr90.r1) <- "imr90.r1" 59 | BS.imr90.r2 <- read.lister("mc_imr90_r2/mc_imr90_r2_21") 60 | sampleNames(BS.imr90.r2) <- "imr90.r2" 61 | 62 | BS.h1.r1 <- read.lister("mc_h1_r1/mc_h1_r1_21") 63 | sampleNames(BS.h1.r1) <- "h1.r1" 64 | BS.h1.r2 <- read.lister("mc_h1_r2/mc_h1_r2_21") 65 | sampleNames(BS.h1.r2) <- "h1.r2" 66 | 67 | BS.chr21 <- combine(BS.imr90.r1, BS.imr90.r2, BS.h1.r1, BS.h1.r2) 68 | pData(BS.chr21)$CellType <- c(rep("imr90",2), rep("h1",2)) 69 | pData(BS.chr21)$Rep <- rep(c("replicate1", "replicate2"),2) 70 | validObject(BS.chr21) 71 | pData(BS.chr21) 72 | 73 | #remove any loci which have no coverage in one or more samples 74 | meth.levels.raw = getMeth(BS.chr21, type = "raw") 75 | no.hits = which(is.na(rowMeans(meth.levels.raw)) == TRUE) 76 | BS.chr21 = BS.chr21[-no.hits] 77 | 78 | save(BS.chr21, file = "./data/BS.chr21.rda") 79 | library(tools) 80 | resaveRdaFiles("./data/BS.chr21.rda") 81 | 82 | 83 | -------------------------------------------------------------------------------- /R/getAnnot.R: -------------------------------------------------------------------------------- 1 | #' Retrieve annotation information 2 | #' 3 | #' Uses the \code{annotatr} package to retrieve annotation information ( 4 | #' CpG category and gene coding sequences) for the \code{annoTrack} argument 5 | #' of \code{\link{plotDMRs}}. Allows for 5 6 | #' re-tries if download fails (to allow for a spotty internet connection). 7 | #' 8 | #' @details Note that this package needs to attach the \code{annotatr} package, 9 | #' and will 10 | #' return NULL if this cannot be done. You can still use the 11 | #' \code{\link{plotDMRs}} function without this optional annotation step, 12 | #' just by leaving the \code{annoTrack} argument as NULL. 13 | #' 14 | #' @param genomeName a character object that indicates which organism is 15 | #' under study. Use the function \code{builtin_genomes()} to see 16 | #' a character vector of available genome names to choose from (see 17 | #' \code{annotatr} documentation for more details). 18 | #' 19 | #' @return a \code{SimpleGRangesList} object with two elements returned 20 | #' by \code{\link{getAnnot}}. The first 21 | #' contains CpG category information in the first element (optional) 22 | #' coding gene sequence information in the second element (optional). 23 | #' At least one of these elements needs to be non-null in order for 24 | #' any annotation to be plotted, but it is not necessary to contain 25 | #' both. 26 | #' 27 | #' @export 28 | #' 29 | #' @import annotatr 30 | #' @importFrom AnnotationHub AnnotationHub query 31 | #' @importFrom rtracklayer liftOver 32 | #' @importFrom GenomeInfoDb genome 33 | #' 34 | #' @examples 35 | #' 36 | #' # get annotation information for hg19 37 | #' annoTrack <- getAnnot('hg19') 38 | #' 39 | #' 40 | getAnnot <- function(genomeName) { 41 | requireNamespace("annotatr") 42 | liftTo <- NULL 43 | if(genomeName == 'hg18'){ 44 | message("Genome ", genomeName, " will be built by lifting over ", 45 | "hg19 annotations from annotatr") 46 | liftTo <- 'hg18' 47 | genomeName <- 'hg19' 48 | }else if (!genomeName %in% annotatr::builtin_genomes()) { 49 | message("Genome ", genomeName, " is not supported by ", 50 | "annotatr at this time") 51 | return(NULL) 52 | } 53 | 54 | if (is.null(genomeName)) { 55 | return(NULL) 56 | } else { 57 | annot_CpG <- paste0(c(genomeName, "_cpgs"), collapse = "") 58 | annot_genes <- paste0(c(genomeName, "_genes_cds"), collapse = "") 59 | 60 | # Build the annotations (a single GRanges object) 61 | 62 | # annotatr downloads files for genomes that aren't natively supported 63 | # (e.g. hg38) 64 | # Download has a nonzero fail rate; allow up to 5 retries before 65 | # throwing an error 66 | 67 | for (attempt in seq_len(5)) { 68 | cpg <- try(annotatr::build_annotations(genome = genomeName, 69 | annotations = annot_CpG), 70 | silent = TRUE) 71 | if (!is(cpg, "try-error")) { 72 | message("Download of CpG annotation successful!") 73 | fail1 <- 0 74 | break 75 | } else { 76 | message(cpg) 77 | if (5 - attempt > 0) 78 | message("Trying again (", 5 - attempt, " attempts remaining)") 79 | fail1 <- 1 80 | } 81 | } 82 | 83 | for (attempt in seq_len(5)) { 84 | genes <- try(annotatr::build_annotations(genome = genomeName, 85 | annotations = annot_genes), 86 | silent = TRUE) 87 | if (!is(genes, "try-error")) { 88 | message("Download of Gene annotation successful!") 89 | fail2 <- 0 90 | break 91 | } else { 92 | message(genes) 93 | if (5 - attempt > 0) 94 | message("Trying again (", 5 - attempt, " attempts remaining)") 95 | fail2 <- 1 96 | } 97 | } 98 | 99 | 100 | if (fail1 == 0 && fail2 == 0) { 101 | # lift over hg19 coordinates to hg18 using annotationHub 102 | if (!is.null(liftTo)){ 103 | ah = AnnotationHub() 104 | chainfiles <- query(ah , c(genomeName, liftTo, "chainfile")) 105 | cf <- which(grepl(paste0(genomeName, "To", liftTo), 106 | chainfiles$title, 107 | ignore.case=TRUE)) 108 | if (length(cf) == 0){ 109 | message("LiftOver from ", genomeName, " to ", liftTo, 110 | " was unsucccessful") 111 | return(NULL) 112 | }else if(length(cf) > 1){ 113 | # take the first matching chain if more than one 114 | cf <- cf[1] 115 | } 116 | 117 | chain <- chainfiles[[names(chainfiles)[cf]]] 118 | 119 | cpg.new <- unlist(liftOver(cpg, chain)) 120 | genes.new <- unlist(liftOver(genes, chain)) 121 | 122 | GenomeInfoDb::genome(cpg) <- liftTo 123 | GenomeInfoDb::genome(genes) <- liftTo 124 | } 125 | 126 | keep <- which(!is.na(genes$symbol)) 127 | genes <- genes[keep, ] 128 | cpg$type <- substr(cpg$type, 10, nchar(cpg$type)) 129 | 130 | annot <- GRangesList(CpGs = cpg, Exons = genes, compress=FALSE) 131 | return(annot) 132 | } else { 133 | message("Download with annotatr::build_annotations() failed; Annotation could not be retrieved.") 134 | return(NULL) 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /R/plotEmpiricalDistribution.R: -------------------------------------------------------------------------------- 1 | #' Plot the empirical distribution of the methylation beta vals or coverage 2 | #' 3 | #' Uses ggplot2 to plot smoothed density histograms of methylation 4 | #' proportions (beta values), or coverage. Methylation proportion densities 5 | #' are weighted by coverage. 6 | #' The number of curves plotted 7 | #' will be equal to the number of different values of \code{testCovariate}, 8 | #' unless \code{bySample} is TRUE. This can take quite some time to 9 | #' execute for a large object, so it is recommended to first take a random 10 | #' sample of loci (say one million) before plotting. 11 | #' 12 | #' @param bs a BSseq object 13 | #' 14 | #' @param testCovariate character specifying the column name of the 15 | #' \code{pData} slot of the BSseq object to include in the plot legend. 16 | #' 17 | #' @param bySample logical whether to plot a separate line for each sample, 18 | #' even if the grouping \code{testCovariate} is specified. 19 | #' Default value is FALSE (so samples with the same value of 20 | #' \code{testCovariate} will be collapsed into the same line). If 21 | #' \code{testCovariate} is not specified, this parameter does not have an 22 | #' effect and samples are automatically plotted separately. 23 | #' 24 | #' @param type a character indicating which type of density to plot - the 25 | #' methylation (beta) values ("M") or the coverage ("Cov"). Default is "M". 26 | #' 27 | #' @param adj a numeric value for the \code{adjust} parameter to pass to the 28 | #' \code{geom_line} function. Specifies how smooth the make the function. 29 | #' 30 | #' @return a ggplot object 31 | #' 32 | #' @importFrom locfit locfit lp 33 | #' 34 | #' @export 35 | #' 36 | #' @examples 37 | #' 38 | #' data(BS.chr21) 39 | #' 40 | #' # plot beta values by sample group 41 | #' plotEmpiricalDistribution(BS.chr21, testCovariate="CellType") 42 | #' 43 | plotEmpiricalDistribution <- function(bs, 44 | testCovariate = NULL, 45 | bySample = FALSE, 46 | type = "M", 47 | adj = 2.5) { 48 | #satisfy check 49 | M <- Cov <- group <- wt <- NULL 50 | 51 | if (!(type %in% c("M", "Cov"))){ 52 | stop("type must be either M or Cov") 53 | } 54 | 55 | if(is.null(testCovariate) & !bySample){ 56 | message("No testCovariate specified; plotting each sample separately.") 57 | bySample = TRUE 58 | } 59 | 60 | meth.mat <- getCoverage(bs, type = "M") 61 | unmeth.mat <- getCoverage(bs, type = "Cov") - meth.mat 62 | 63 | meth.levelsm <- data.frame(meth.mat/ 64 | (meth.mat + unmeth.mat)) 65 | cov.matm <- data.frame((meth.mat + unmeth.mat)) 66 | 67 | if (!is.null(testCovariate)) { 68 | if (sum(grepl(testCovariate, colnames(pData(bs)))) == 0) { 69 | stop("Error: no column in pData() found ", 70 | "that matches the testCovariate") 71 | } else if (length(grep(testCovariate, colnames(pData(bs)))) > 1) { 72 | stop("Error: testCovariate matches more ", 73 | "than one column in pData()") 74 | } 75 | mC <- grep(testCovariate, colnames(pData(bs))) 76 | grouplab <- pData(bs)[,mC] 77 | }else{ 78 | if(is.null(sampleNames(bs))){ 79 | grouplab <- as.character(seq_len(ncol(bs))) 80 | }else{ 81 | grouplab <- sampleNames(bs) 82 | } 83 | } 84 | 85 | meth.levelsm <- utils::stack(meth.levelsm) 86 | colnames(meth.levelsm)[1] <- "M" 87 | meth.levelsm$Cov <- utils::stack(cov.matm)$values 88 | 89 | if(is.null(sampleNames(bs))){ 90 | meth.levelsm$sample <- sort(rep(seq_len(ncol(bs)), nrow(bs))) 91 | }else{ 92 | meth.levelsm$sample <- unlist(lapply(sampleNames(bs), function(x) 93 | rep(x, nrow(bs)))) 94 | } 95 | 96 | if (!is.null(testCovariate)){ 97 | meth.levelsm$group <- 98 | unlist(lapply(seq_len(ncol(bs)), function(x) 99 | rep(pData(bs)[x,mC], nrow(bs)))) 100 | }else{ 101 | meth.levelsm$group <- meth.levelsm$sample 102 | } 103 | 104 | if (!bySample){ 105 | if (type=="M"){ 106 | # compute weights - sum over all samples in group ## 107 | covtots <- rep(NA, ncol(cov.matm)) 108 | names(covtots) <- grouplab 109 | for(l in unique(grouplab)){ 110 | covtots[names(covtots) == l] <- sum(colSums(cov.matm)[grouplab == l]) 111 | } 112 | 113 | wt.matm <- data.frame(t(t(cov.matm) / covtots)) 114 | meth.levelsm$wt <- utils::stack(wt.matm)$values 115 | 116 | p1 <- ggplot(meth.levelsm, 117 | aes(M, colour = group, group = group, weight = wt)) + 118 | geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) + 119 | xlab("Methylation Proportion") + 120 | theme_bw() 121 | }else{ 122 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 123 | colour = group, group = group)) + 124 | geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) + 125 | scale_x_continuous(trans="log2") + 126 | xlab("Coverage") + 127 | theme_bw() 128 | } 129 | p1 <- p1 + labs(colour = "Group") 130 | }else{ 131 | if (type=="M"){ 132 | wt.matm <- data.frame(t(t(cov.matm) / colSums(cov.matm))) 133 | meth.levelsm$wt <- utils::stack(wt.matm)$values 134 | 135 | if(identical(meth.levelsm$group, meth.levelsm$sample)){ 136 | p1 <- ggplot(meth.levelsm, 137 | aes(M, colour = group, group = sample, weight = wt)) + 138 | labs(color = "Sample") 139 | }else{ 140 | if(ncol(bs) <= 12){ 141 | p1 <- ggplot(meth.levelsm, 142 | aes(M, colour = group, group = sample, weight = wt, 143 | linetype = sample)) + 144 | labs(color = "Group", linetype = "Sample") 145 | }else{ 146 | p1 <- ggplot(meth.levelsm, 147 | aes(M, colour = group, group = sample, weight = wt)) + 148 | labs(color = "Group") 149 | } 150 | } 151 | p1 <- p1 + geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) + 152 | xlab("Methylation Proportion") + 153 | theme_bw() 154 | }else{ 155 | if(identical(meth.levelsm$group, meth.levelsm$sample)){ 156 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 157 | colour = group, group = sample)) + 158 | labs(color = "Sample") 159 | }else{ 160 | if(ncol(bs) <= 12){ 161 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 162 | colour = group, group = sample, 163 | linetype = sample)) + 164 | labs(color = "Group", linetype = "Sample") 165 | }else{ 166 | p1 <- ggplot(meth.levelsm, aes(Cov+0.1, 167 | colour = group, group = sample)) + 168 | labs(color = "Group") 169 | } 170 | } 171 | 172 | p1 <- p1 + 173 | geom_line(adjust = adj, alpha = 0.6, stat = "density", size = 1.3) + 174 | scale_x_continuous(trans="log2") + 175 | xlab("Coverage") + 176 | theme_bw() 177 | } 178 | } 179 | return(p1) 180 | } 181 | 182 | -------------------------------------------------------------------------------- /man/dmrseq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dmrseq.R 3 | \name{dmrseq} 4 | \alias{dmrseq} 5 | \title{Main function for detecting and evaluating significance of DMRs.} 6 | \usage{ 7 | dmrseq( 8 | bs, 9 | testCovariate, 10 | adjustCovariate = NULL, 11 | cutoff = 0.1, 12 | minNumRegion = 5, 13 | smooth = TRUE, 14 | bpSpan = 1000, 15 | minInSpan = 30, 16 | maxGapSmooth = 2500, 17 | maxGap = 1000, 18 | verbose = TRUE, 19 | maxPerms = 10, 20 | matchCovariate = NULL, 21 | BPPARAM = bpparam(), 22 | stat = "stat", 23 | block = FALSE, 24 | blockSize = 5000, 25 | chrsPerChunk = 1 26 | ) 27 | } 28 | \arguments{ 29 | \item{bs}{bsseq object containing the methylation values as well as the 30 | phenotype matrix that contains sample level covariates} 31 | 32 | \item{testCovariate}{Character value indicating which variable 33 | (column name) in \code{pData(bs)} to test 34 | for association of methylation levels. 35 | Can alternatively specify an integer value indicating 36 | which of column of 37 | \code{pData(bs)} to use. This is used to construct the 38 | design matrix for the test statistic calculation. To run using a 39 | continuous or categorial covariate with more than two groups, simply pass in 40 | the name of a column in `pData` that contains this covariate. A continuous 41 | covariate is assmued if the data type in the `testCovariate` slot is 42 | continuous, with the exception of if there are only two unique values 43 | (then a two group comparison is carried out).} 44 | 45 | \item{adjustCovariate}{an (optional) character value or vector 46 | indicating which variables (column names) in \code{pData(bs)} 47 | will be adjusted for when 48 | testing for the association of methylation value with the 49 | \code{testCovariate}. 50 | Can alternatively specify an 51 | integer value or vector indicating 52 | which of the columns of \code{pData(bs)} to adjust for. 53 | If not NULL (default), then this is also used to 54 | construct the design matrix for the test statistic calculation.} 55 | 56 | \item{cutoff}{scalar value that represents the absolute value (or a vector 57 | of two numbers representing a lower and upper bound) for the cutoff of 58 | the single CpG coefficient that is used to discover 59 | candidate regions. Default value is 0.10.} 60 | 61 | \item{minNumRegion}{positive integer that represents the minimum number of 62 | CpGs to consider for a candidate region. Default value is 5. 63 | Minimum value is 3.} 64 | 65 | \item{smooth}{logical value that indicates whether or not to smooth the 66 | CpG level signal when discovering candidate regions. 67 | Defaults to TRUE.} 68 | 69 | \item{bpSpan}{a positive integer that represents the length in basepairs 70 | of the smoothing span window if \code{smooth} is TRUE. Default value is 71 | 1000.} 72 | 73 | \item{minInSpan}{positive integer that represents the minimum number of 74 | CpGs in a smoothing span window if \code{smooth} is TRUE. 75 | Default value is 30.} 76 | 77 | \item{maxGapSmooth}{integer value representing maximum number of basepairs 78 | in between neighboring CpGs to be included in the same 79 | cluster when performing smoothing (should generally be larger than 80 | \code{maxGap})} 81 | 82 | \item{maxGap}{integer value representing maximum number of basepairs in 83 | between neighboring CpGs to be included in the same DMR.} 84 | 85 | \item{verbose}{logical value that indicates whether progress messages 86 | should be printed to stdout. Defaults value is TRUE.} 87 | 88 | \item{maxPerms}{a positive integer that represents the maximum number 89 | of permutations that will be used to generate the global null 90 | distribution of test statistics. Default value is 10.} 91 | 92 | \item{matchCovariate}{An (optional) character value 93 | indicating which variable (column name) of \code{pData(bs)} 94 | will be blocked for when 95 | constructing the permutations in order to 96 | test for the association of methylation value with the 97 | \code{testCovariate}, only to be used when \code{testCovariate} 98 | is a two-group factor and the number of permutations possible is less 99 | than 500000. 100 | Alternatively, you can specify an integer value indicating 101 | which column of \code{pData(bs)} to block for. 102 | Blocking means that only permutations with balanced 103 | composition of \code{testCovariate} values will be used (for example if 104 | you have samples from different gender and this is not your covariate of 105 | interest, 106 | it is recommended to use gender as a matching covariate to avoid one 107 | of the permutations testing entirely males versus females; this violates 108 | the null hypothesis and will decrease power). 109 | If not NULL (default), then no blocking is performed.} 110 | 111 | \item{BPPARAM}{a \code{BiocParallelParam} object to specify the parallel 112 | backend. The default 113 | option is \code{BiocParallel::bpparam()} which will automatically creates 114 | a cluster appropriate for the operating system.} 115 | 116 | \item{stat}{a character vector indicating the name of the column of the 117 | output to use as the region-level test statistic. Default value is 'stat' 118 | which is the region level-statistic designed to be comparable across the 119 | genome. 120 | It is not recommended to change this argument, but it can be done for 121 | experimental purposes. Possible values are: 'L' - the number of loci 122 | in the region, 'area' - the sum of the smoothed loci statistics, 123 | 'beta' - the effect size of the region, 'stat' - the test statistic for 124 | the region, or 'avg' - the average smoothed loci statistic.} 125 | 126 | \item{block}{logical indicating whether to search for large-scale (low 127 | resolution) blocks of differential methylation (default is FALSE, which 128 | means that local DMRs are desired). If TRUE, the parameters for 129 | \code{bpSpan}, \code{minInSpan}, and \code{maxGapSmooth} should be adjusted 130 | (increased) accordingly. This setting will also merge 131 | candidate regions that (1) are in the same direction and (2) are less than 132 | 1kb apart with no covered CpGs separating them. The region-level model used 133 | is also slightly modified - instead of a loci-specific intercept for each 134 | CpG in theregion, the intercept term is modeled as a natural spline with 135 | one interior knot per each 10kb of length (up to 10 interior knots).} 136 | 137 | \item{blockSize}{numeric value indicating the minimum number of basepairs 138 | to be considered a block (only used if \code{block}=TRUE). Default is 139 | 5000 basepairs.} 140 | 141 | \item{chrsPerChunk}{a positive integer value indicating the number of 142 | chromosomes per chunk. The default is 1, meaning that the data will be 143 | looped through one chromosome at a time. When pairing up multiple 144 | chromosomes per chunk, sizes (in terms of numbers of CpGs) will be taken 145 | into consideration to balance the sizes of each chunk.} 146 | } 147 | \value{ 148 | a \code{GRanges} object that contains the results of the inference. 149 | The object contains one row for each candidate region, sorted by q-value 150 | and then chromosome. The standard 151 | \code{GRanges} chr, start, and end are included, along with at least 152 | 7 metadata 153 | columns, in the following order: 154 | 1. L = the number of CpGs contained in the region, 155 | 2. area = the sum of the smoothed beta values 156 | 3. beta = the coefficient value for the condition difference (there 157 | will be more than one column here if a multi-group comparison 158 | was performed), 159 | 4. stat = the test statistic for the condition difference, 160 | 5. pval = the permutation p-value for the significance of the test 161 | statistic, and 162 | 6. qval = the q-value for the test statistic (adjustment 163 | for multiple comparisons to control false discovery rate). 164 | 7. index = an \code{IRanges} containing the indices of the region's 165 | first CpG to last CpG. 166 | } 167 | \description{ 168 | Performs a two-step approach that (1) detects candidate regions, and 169 | (2) scores candidate regions with an exchangeable (across the genome) 170 | statistic and evaluates statistical significance using a 171 | permuation test on the pooled null distribution of scores. 172 | } 173 | \examples{ 174 | 175 | # load example data 176 | data(BS.chr21) 177 | 178 | # the covariate of interest is the 'CellType' column of pData(BS.chr21) 179 | testCovariate <- 'CellType' 180 | 181 | # run dmrseq on a subset of the chromosome (10K CpGs) 182 | regions <- dmrseq(bs=BS.chr21[240001:250000,], 183 | cutoff = 0.05, 184 | testCovariate=testCovariate) 185 | 186 | } 187 | \keyword{inference} 188 | -------------------------------------------------------------------------------- /man/plotDMRs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotDMRs.R 3 | \name{plotDMRs} 4 | \alias{plotDMRs} 5 | \title{Plot Differentially Methylated Regions} 6 | \usage{ 7 | plotDMRs( 8 | BSseq, 9 | regions = NULL, 10 | testCovariate = NULL, 11 | extend = (end(regions) - start(regions) + 1)/2, 12 | main = "", 13 | addRegions = regions, 14 | annoTrack = NULL, 15 | col = NULL, 16 | lty = NULL, 17 | lwd = NULL, 18 | label = NULL, 19 | mainWithWidth = TRUE, 20 | regionCol = .alpha("#C77CFF", 0.2), 21 | addTicks = TRUE, 22 | addPoints = TRUE, 23 | pointsMinCov = 1, 24 | highlightMain = FALSE, 25 | qval = TRUE, 26 | stat = TRUE, 27 | verbose = TRUE, 28 | includeYlab = TRUE, 29 | compareTrack = NULL, 30 | labelCols = NULL, 31 | horizLegend = FALSE, 32 | addLines = TRUE, 33 | linesMinCov = 1 34 | ) 35 | } 36 | \arguments{ 37 | \item{BSseq}{An object of class BSseq.} 38 | 39 | \item{regions}{A data.frame containing the DMRs (output from the main 40 | \code{dmrseq}) function.} 41 | 42 | \item{testCovariate}{integer value or vector indicating which of columns of 43 | \code{pData(bs)} contains the covariate of interest. 44 | This is used to construct the sample labels and colors (unless this is 45 | over-ridden by specifying \code{label}).} 46 | 47 | \item{extend}{Describes how much the plotting region should be extended in 48 | either direction. The total width of the plot is equal to the width of the 49 | region plus twice extend.} 50 | 51 | \item{main}{The plot title. The default is to construct a title with 52 | information about which genomic region is being plotted.} 53 | 54 | \item{addRegions}{A set of additional regions to be highlighted on the 55 | plots. Same format as the \code{regions} argument.} 56 | 57 | \item{annoTrack}{a \code{GRangesList} object with two elements returned 58 | by \code{\link{getAnnot}}. The first 59 | contains CpG category information in the first element (optional) 60 | coding gene sequence information in the second element (optional). 61 | At least one of these elements needs to be non-null in order for 62 | any annotation to be plotted, but it is not necessary to contain 63 | both.} 64 | 65 | \item{col}{The color of the methylation estimates. It is recommended to 66 | leave this value as default (NULL), and specify a value of 67 | \code{testCovariate} to indicate which column of \code{pData(bs)} 68 | to use as a factor for coloring the points and lines of the plot. 69 | Alternatively, you can specify particular colors by 70 | passing this information through the \code{pData} slot of the 71 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 72 | so, place the color value for each sample in a column titled \code{col}, 73 | and leave this argument as its default value of NULL. Alternatively, 74 | you may specify a vector of color values (one for each sample), but 75 | you *must* make sure that this vector is in the same order as the samples 76 | are in the BSseq object. If NULL and no \code{col} column is found in 77 | \code{pData}, then estimates are plotted in black for all samples.} 78 | 79 | \item{lty}{The line type of the methylation estimates. It is recommended to 80 | pass this information through the \code{pData} slot of the 81 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 82 | so, place the line type value for each sample in a column titled \code{lty}, 83 | and leave this argument as its default value of NULL. Alternatively, 84 | you may specify a vector of line type values (one for each sample), but 85 | you *must* make sure that this vector is in the same order as the samples 86 | are in the BSseq object. If NULL and no \code{lty} column is found in 87 | \code{pData}, then estimates are plotted with \code{lty=1} for all samples.} 88 | 89 | \item{lwd}{The line width of the methylation estimates. It is recommended to 90 | pass this information through the \code{pData} slot of the 91 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 92 | so, place the line width value for each sample in a column titled \code{lwd}, 93 | and leave this argument as its default value of NULL. Alternatively, 94 | you may specify a vector of line width values (one for each sample), but 95 | you *must* make sure that this vector is in the same order as the samples 96 | are in the BSseq object. If NULL and no \code{lwd} column is found in 97 | \code{pData}, then estimates are plotted with \code{lwd=1} for all samples.} 98 | 99 | \item{label}{The condition/population labels for the plot legend. If NULL 100 | (default) this is taken from the \code{testCovariate} column of 101 | \code{pData}. Alternatively, you can pass in labels by 102 | adding this information through the \code{pData} slot of the 103 | object \code{BSseq} (a \code{data.frame} that houses metadata). To do 104 | so, place the labels for each sample in a column titled \code{label}, 105 | and leave this argument as its default value of NULL. 106 | You may instead specify an arbitrary vector of labels (one for each sample), 107 | but be aware that you *must* make sure that this vector is in the same order 108 | as the samples are in the BSseq object. If NULL, and \code{testCovariate} is 109 | also NULL and no \code{label} column is found in 110 | \code{pData}, then no legend is created.} 111 | 112 | \item{mainWithWidth}{logical value indicating whether the default title 113 | should include information about width of the plot region.} 114 | 115 | \item{regionCol}{The color used for highlighting the region.} 116 | 117 | \item{addTicks}{logical value indicating whether tick marks showing the 118 | location of methylation loci should be added. Default is TRUE.} 119 | 120 | \item{addPoints}{logical value indicating whether the individual 121 | methylation estimates be plotted as points.} 122 | 123 | \item{pointsMinCov}{The minimum coverage a methylation loci need in 124 | order for the raw methylation estimates to be plotted. Useful for filtering 125 | out low coverage loci. Only used if addPoints = TRUE. Default value is 1 126 | (no filtering).} 127 | 128 | \item{highlightMain}{logical value indicating whether the plot region 129 | should be highlighted.} 130 | 131 | \item{qval}{logical value indicating whether the region FDR estimate 132 | (q-value) should be displayed in the plot title. The value is extracted 133 | from the \code{regions} argument.} 134 | 135 | \item{stat}{logical value indicating whether the region statistic 136 | should be displayed in the plot title. The value is extracted from the 137 | \code{regions} argument.} 138 | 139 | \item{verbose}{logical value indicating whether progress messages 140 | should be printed to the screen.} 141 | 142 | \item{includeYlab}{a logical indicating whether to include the Y axis 143 | label 'Methylation' (useful to turn off if combining multiple region 144 | figures and you do not want to include redundant y axis label information)} 145 | 146 | \item{compareTrack}{a named GRangesList object that contains up to four 147 | custom tracks (GRanges objects) which will be plotted below the region. 148 | Only one of `compareTrack` or `annoTrack` can be specified since there is 149 | only for plotting either the built in GpG category and exon tracks, *or* a 150 | custom set of tracks.} 151 | 152 | \item{labelCols}{a character vector with names of the mcols slot of the 153 | GRanges items in `compareTrack'. Only used if plotting custom 154 | tracks using the `compareTrack' argument. If specified, the (first) value 155 | in that column is printed along with a label that includes the name of the 156 | list item. If NULL (default), just the name of the track is printed.} 157 | 158 | \item{horizLegend}{logical indicating whether the legend should be 159 | horizontal instead of vertical (default FALSE). This is useful if you need 160 | to plot many labels and want to preserve whitespace.} 161 | 162 | \item{addLines}{logical indicating whether to plot smooth lines between 163 | points. Default is true. Can be useful to turn this off for very small 164 | regions.} 165 | 166 | \item{linesMinCov}{The minimum coverage a methylation loci need in 167 | order to be used for plotting of smoothed lines. Useful for filtering 168 | out low coverage loci. Only used if addLines = TRUE. Default value is 1 169 | (no filtering).} 170 | } 171 | \value{ 172 | None (generates a plot) 173 | } 174 | \description{ 175 | Generates trace plots of methylation proportions by genomic position. 176 | } 177 | \details{ 178 | Creates aesthetially pleasing DMR plots. By default will plot individual 179 | points with size proportional to coverage, along with a smoothed line 180 | for each sample. Elements will be colored by biological condition 181 | (\code{label}). Also has functionality to add annotations below the main 182 | plot (CpG category, genes) if \code{annoTrack} is specified. 183 | } 184 | \examples{ 185 | 186 | # load the example data 187 | data(BS.chr21) 188 | 189 | # load example results (computed with dmrseq function) 190 | data(dmrs.ex) 191 | 192 | # get annotation information (using getAnnot function) 193 | # here we'll load the example annotation from chr21 194 | data(annot.chr21) 195 | 196 | # plot the 1st DMR 197 | plotDMRs(BS.chr21, regions=dmrs.ex[1,], testCovariate=1, 198 | annoTrack=annot.chr21) 199 | 200 | } 201 | -------------------------------------------------------------------------------- /R/simDMRs.R: -------------------------------------------------------------------------------- 1 | 2 | #' Simulate Differentially Methylated Regions 3 | #' 4 | #' Add simulated DMRs to observed control data. Control data will be split 5 | #' into two (artificial) populations. 6 | #' 7 | #' @param bs a BSseq object containing only control samples (from the same 8 | #' population) for which simulated DMRs will be added after dividing the 9 | #' population into two artificial groups. 10 | #' 11 | #' @param num.dmrs an integer specifying how many DMRs to add. 12 | #' 13 | #' @param delta.max0 a proportion value indicating the mode value for the 14 | #' difference in proportion of methylated CpGs in the simulated DMRs (the 15 | #' actual value will be drawn from a scaled Beta distribution centered at 16 | #' this value). Default value is 0.3. 17 | #' 18 | #' @return A named list object with 5 elements: (1) 19 | #' \code{gr.dmrs} is a \code{GRanges} object with \code{num.dmrs} 20 | #' ranges that represent the random DMRs added. (2) \code{dmr.mncov} is a 21 | #' numeric vector that contains the mean coverage in each simulated DMR. (3) 22 | #' \code{dmr.L} is a numeric vector that contains the number of CpGs in each 23 | #' simulated DMR. (4) \code{bs} is the BSseq object that contains the 24 | #' simulated DMRs. (5) \code{deltas} is a numeric vector that contains the 25 | #' effect size used for each DMR. 26 | #' 27 | #' @importFrom IRanges IRanges 28 | #' 29 | #' @export 30 | #' 31 | #' @examples 32 | #' 33 | #' # Add simulated DMRs to a BSseq dataset 34 | #' # This is just for illustrative purposes - ideally you would 35 | #' # add DMRs to a set of samples from the same condition (in our 36 | #' # example data, we have data from two different cell types) 37 | #' # In this case, we shuffle the samples by cell type to create 38 | #' # a null comparison. 39 | #' 40 | #' data(BS.chr21) 41 | #' 42 | #' BS.chr21.sim <- simDMRs(bs=BS.chr21[1:10000,c(1,3,2,4)], 43 | #' num.dmrs=50) 44 | #' 45 | #' # show the simulated DMRs GRanges object 46 | #' show(BS.chr21.sim$gr.dmrs) 47 | #' 48 | #' # show the updated BSseq object that includes the simulated DMRs 49 | #' show(BS.chr21.sim$bs) 50 | #' 51 | #' # examine effect sizes of the DMRs 52 | #' head(BS.chr21.sim$delta) 53 | #' 54 | simDMRs <- function(bs, num.dmrs = 3000, delta.max0 = 0.3) { 55 | # check that all loci have coverage in every sample 56 | zero.cov <- which(rowSums(as.matrix(getCoverage(bs, 57 | type="Cov")) == 0) > 0) 58 | if (length(zero.cov) > 0){ 59 | stop("Zero coverage found for at least one sample in ", length(zero.cov), 60 | " loci. Please filter for loci with coverage at least one in ", 61 | "all samples before passing to 'simDMRs'") 62 | } 63 | 64 | sampleSize <- floor(nrow(pData(bs))/2) 65 | 66 | # code to simulate DMRs if some number of simulated dmrs was specified 67 | message("Simulating DMRs for ", sampleSize, " vs ", ncol(bs) - sampleSize, 68 | " comparison") 69 | triwt <- function(x, amp = 1, base = 0, width = 1, center = 0, 70 | deg = 3, dir = 1) { 71 | y <- dir * (((width/2)^deg - abs(x - center)^deg)/ 72 | (width/2)^deg)^deg * amp + base 73 | y[abs(x - center) > ceiling(width/2)] <- base[abs(x - center) > 74 | ceiling(width/2)] 75 | return(y) 76 | } 77 | 78 | meth.mat <- as.matrix(getCoverage(bs, type = "M")) 79 | unmeth.mat <- as.matrix(getCoverage(bs, type = "Cov")) - meth.mat 80 | chr <- as.character(seqnames(bs)) 81 | pos <- start(bs) 82 | 83 | cluster <- bumphunter::clusterMaker(chr, pos, maxGap = 500) 84 | Indexes <- split(seq(along = cluster), cluster) 85 | lns <- lengths(Indexes) 86 | Indexes <- Indexes[lns >= 5 & lns <= 500] 87 | 88 | # sample regions with intermediate methylation values preferentially 89 | prop.mat <- rowMeans(meth.mat/(meth.mat + unmeth.mat)) 90 | prop.mat <- unlist(lapply(Indexes, function(x) median(prop.mat[x]))) 91 | 92 | dmrs.ind <- sample(seq_len(length(Indexes)), num.dmrs, replace = FALSE, 93 | prob = pmax(1 - sqrt(2) * 94 | abs(0.5 - prop.mat)^0.5, 0)) 95 | dmrs.ind <- Indexes[dmrs.ind] 96 | fnc <- function(index) { 97 | gr.dmr <- GRanges(seqnames = unique(as.character(seqnames(bs)[index])), 98 | IRanges(start = min(start(bs)[index]), 99 | end = max(start(bs)[index]))) 100 | return(gr.dmr) 101 | } 102 | ## GRanges Object for the Simulated DMRs 103 | gr.dmrs <- suppressWarnings(Reduce("c", lapply(dmrs.ind, fnc))) 104 | 105 | ## Generating the Methylated and Unmethylated Read Counts 106 | ## for the CpG sites in the 107 | ## DMRs and outside 108 | 109 | # set up null signal (smooth function of position) mcols(bs)$diff <- 0 110 | Diff <- Diff2 <- rep(0, length(bs)) 111 | 112 | dmr.mncov <- dmr.L <- deltas <- rep(NA, num.dmrs) 113 | 114 | for (u in seq_len(num.dmrs)) { 115 | # coin flip for up or down 116 | up <- 1 - 2 * (rbinom(1, 1, 0.5) == 1) 117 | 118 | # let effect size change randomly 119 | delta.max <- delta.max0 + (rbeta(1, 2, 2) - 0.5)/3 120 | deltas[u] <- delta.max 121 | 122 | # grab loci in the dmr 123 | dmr.L[u] <- length(dmrs.ind[[u]]) 124 | prop.mat <- meth.mat[dmrs.ind[[u]], ]/(meth.mat[dmrs.ind[[u]], ] + 125 | unmeth.mat[dmrs.ind[[u]], 126 | ]) 127 | 128 | # change direction if baseline mean is near boundary 129 | if (up == 1) { 130 | if (mean(prop.mat) > 1 - delta.max) { 131 | up <- -1 132 | } 133 | } else if (up == -1) { 134 | if (mean(prop.mat) < delta.max) { 135 | up <- 1 136 | } 137 | } 138 | 139 | # simulated mean as a smooth parabola added or subtracted 140 | # from baseline mean 141 | last <- max(pos[dmrs.ind[[u]]]) 142 | first <- min(pos[dmrs.ind[[u]]]) 143 | width <- last - first 144 | 145 | # widen out so that first and last CpGs don't have a 146 | # difference of zero 147 | last <- last + 0.2 * width 148 | first <- first - 0.2 * width 149 | width <- last - first 150 | 151 | mid <- round((last - first)/2 + first) 152 | 153 | # Diff.hit is the methylation percentage difference for each position in 154 | # spiked in DMR; restricted to between -1 and 1 (negative indicates that 155 | # difference is in the negative direction. 156 | 157 | Diff.hit <- round(triwt(pos[dmrs.ind[[u]]], amp = delta.max, 158 | base = Diff[dmrs.ind[[u]]], 159 | width = width, center = mid, deg = 4, dir = up), 4) 160 | 161 | # calculate the mean coverage in the DMR over all the samples and save 162 | # result in a vector dmr.mncov for exploring the characteristics of the 163 | # detected / missed DMRs in simulation results. 164 | mn.cov <- by(t(meth.mat[dmrs.ind[[u]], ] + unmeth.mat[dmrs.ind[[u]], ]), 165 | factor(paste0("Condition", c(rep(1, sampleSize), 166 | rep(2, ncol(bs) - sampleSize)))), 167 | colMeans) 168 | mn.cov <- rowMeans(cbind(mn.cov[[1]], mn.cov[[2]])) 169 | dmr.mncov[u] <- mean(mn.cov) 170 | 171 | 172 | # Conditional on the coverage for each site and sample combination, 173 | # sample the 174 | # number of methylated reads from a binomial distribution where the 175 | # probability 176 | # parameter is taken as the observed estimate plus the Diff/Diff2 177 | # (depending on 178 | # whether the sample is from condition 1 or condition 2 This will 179 | # induce sampling 180 | # error into the number of reads observed in the simulation and 181 | # thus will be more 182 | # realistic and a more practical evaluation of the methods 183 | cov <- meth.mat[dmrs.ind[[u]], ] + unmeth.mat[dmrs.ind[[u]], ] 184 | prop <- meth.mat[dmrs.ind[[u]], ]/cov 185 | grp <- runif(1) < 0.5 186 | ss <- ifelse(grp, sampleSize, ncol(bs) - sampleSize) 187 | for (samp in seq_len(ss)) { 188 | # randomly choose which condition is the one with the difference 189 | if (grp) { 190 | # first generate M counts for sample samp of condition 1 191 | meth.mat[dmrs.ind[[u]], samp] <- rbinom(n=length(dmrs.ind[[u]]), 192 | size = cov[, samp], 193 | prob = pmax(pmin(prop[, samp] + Diff.hit, 1), 194 | 0)) 195 | 196 | # next assign the other Cov - M counts to unmethylated matrix 197 | unmeth.mat[dmrs.ind[[u]], samp] <- cov[, samp] - 198 | meth.mat[dmrs.ind[[u]], samp] 199 | 200 | } else { 201 | # next generate M counts for sample samp of condition 2 202 | meth.mat[dmrs.ind[[u]], (sampleSize + samp)] <- 203 | rbinom(n = length(dmrs.ind[[u]]), 204 | size = cov[, (sampleSize + samp)], 205 | prob = pmax(pmin(prop[, (sampleSize + samp)] + 206 | Diff.hit, 1), 0)) 207 | 208 | # next assign the other Cov - M counts to unmethylated matrix 209 | unmeth.mat[dmrs.ind[[u]], (sampleSize + samp)] <- cov[, 210 | (sampleSize + samp)] - meth.mat[dmrs.ind[[u]], 211 | (sampleSize + samp)] 212 | } 213 | } 214 | } 215 | 216 | # get everything in order to run bumphunter functions 217 | sampnames <- paste0("Condition", c(rep(1, sampleSize), 218 | rep(2, ncol(bs) - sampleSize)), 219 | "_Rep", c(seq_len(sampleSize), 220 | seq_len(ncol(bs) - sampleSize))) 221 | colnames(meth.mat) <- colnames(unmeth.mat) <- sampnames 222 | bsNew <- BSseq(pos = pos, chr = chr, M = meth.mat, 223 | Cov = (meth.mat + unmeth.mat), 224 | sampleNames = sampnames) 225 | 226 | sim.dat.red <- list(gr.dmrs = gr.dmrs, dmr.mncov = dmr.mncov, dmr.L = dmr.L, 227 | bs = bsNew, delta = deltas) 228 | return(sim.dat.red) 229 | } 230 | 231 | -------------------------------------------------------------------------------- /R/plotDMRs.R: -------------------------------------------------------------------------------- 1 | #' Plot Differentially Methylated Regions 2 | #' 3 | #' Generates trace plots of methylation proportions by genomic position. 4 | #' 5 | #' Creates aesthetially pleasing DMR plots. By default will plot individual 6 | #' points with size proportional to coverage, along with a smoothed line 7 | #' for each sample. Elements will be colored by biological condition 8 | #' (\code{label}). Also has functionality to add annotations below the main 9 | #' plot (CpG category, genes) if \code{annoTrack} is specified. 10 | #' 11 | #' @param BSseq An object of class BSseq. 12 | #' 13 | #' @param regions A data.frame containing the DMRs (output from the main 14 | #' \code{dmrseq}) function. 15 | #' 16 | #' @param extend Describes how much the plotting region should be extended in 17 | #' either direction. The total width of the plot is equal to the width of the 18 | #' region plus twice extend. 19 | #' 20 | #' @param main The plot title. The default is to construct a title with 21 | #' information about which genomic region is being plotted. 22 | #' 23 | #' @param addRegions A set of additional regions to be highlighted on the 24 | #' plots. Same format as the \code{regions} argument. 25 | #' 26 | #' @param annoTrack a \code{GRangesList} object with two elements returned 27 | #' by \code{\link{getAnnot}}. The first 28 | #' contains CpG category information in the first element (optional) 29 | #' coding gene sequence information in the second element (optional). 30 | #' At least one of these elements needs to be non-null in order for 31 | #' any annotation to be plotted, but it is not necessary to contain 32 | #' both. 33 | #' 34 | #' @param col The color of the methylation estimates. It is recommended to 35 | #' leave this value as default (NULL), and specify a value of 36 | #' \code{testCovariate} to indicate which column of \code{pData(bs)} 37 | #' to use as a factor for coloring the points and lines of the plot. 38 | #' Alternatively, you can specify particular colors by 39 | #' passing this information through the \code{pData} slot of the 40 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 41 | #' so, place the color value for each sample in a column titled \code{col}, 42 | #' and leave this argument as its default value of NULL. Alternatively, 43 | #' you may specify a vector of color values (one for each sample), but 44 | #' you *must* make sure that this vector is in the same order as the samples 45 | #' are in the BSseq object. If NULL and no \code{col} column is found in 46 | #' \code{pData}, then estimates are plotted in black for all samples. 47 | #' 48 | #' @param lty The line type of the methylation estimates. It is recommended to 49 | #' pass this information through the \code{pData} slot of the 50 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 51 | #' so, place the line type value for each sample in a column titled \code{lty}, 52 | #' and leave this argument as its default value of NULL. Alternatively, 53 | #' you may specify a vector of line type values (one for each sample), but 54 | #' you *must* make sure that this vector is in the same order as the samples 55 | #' are in the BSseq object. If NULL and no \code{lty} column is found in 56 | #' \code{pData}, then estimates are plotted with \code{lty=1} for all samples. 57 | #' 58 | #' @param lwd The line width of the methylation estimates. It is recommended to 59 | #' pass this information through the \code{pData} slot of the 60 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 61 | #' so, place the line width value for each sample in a column titled \code{lwd}, 62 | #' and leave this argument as its default value of NULL. Alternatively, 63 | #' you may specify a vector of line width values (one for each sample), but 64 | #' you *must* make sure that this vector is in the same order as the samples 65 | #' are in the BSseq object. If NULL and no \code{lwd} column is found in 66 | #' \code{pData}, then estimates are plotted with \code{lwd=1} for all samples. 67 | #' 68 | #' @param label The condition/population labels for the plot legend. If NULL 69 | #' (default) this is taken from the \code{testCovariate} column of 70 | #' \code{pData}. Alternatively, you can pass in labels by 71 | #' adding this information through the \code{pData} slot of the 72 | #' object \code{BSseq} (a \code{data.frame} that houses metadata). To do 73 | #' so, place the labels for each sample in a column titled \code{label}, 74 | #' and leave this argument as its default value of NULL. 75 | #' You may instead specify an arbitrary vector of labels (one for each sample), 76 | #' but be aware that you *must* make sure that this vector is in the same order 77 | #' as the samples are in the BSseq object. If NULL, and \code{testCovariate} is 78 | #' also NULL and no \code{label} column is found in 79 | #' \code{pData}, then no legend is created. 80 | #' 81 | #' @param mainWithWidth logical value indicating whether the default title 82 | #' should include information about width of the plot region. 83 | #' 84 | #' @param regionCol The color used for highlighting the region. 85 | #' 86 | #' @param addTicks logical value indicating whether tick marks showing the 87 | #' location of methylation loci should be added. Default is TRUE. 88 | #' 89 | #' @param addPoints logical value indicating whether the individual 90 | #' methylation estimates be plotted as points. 91 | #' 92 | #' @param pointsMinCov The minimum coverage a methylation loci need in 93 | #' order for the raw methylation estimates to be plotted. Useful for filtering 94 | #' out low coverage loci. Only used if addPoints = TRUE. Default value is 1 95 | #' (no filtering). 96 | #' 97 | #' @param highlightMain logical value indicating whether the plot region 98 | #' should be highlighted. 99 | #' 100 | #' @param stat logical value indicating whether the region statistic 101 | #' should be displayed in the plot title. The value is extracted from the 102 | #' \code{regions} argument. 103 | #' 104 | #' @param qval logical value indicating whether the region FDR estimate 105 | #' (q-value) should be displayed in the plot title. The value is extracted 106 | #' from the \code{regions} argument. 107 | #' 108 | #' @param verbose logical value indicating whether progress messages 109 | #' should be printed to the screen. 110 | #' 111 | #' @param testCovariate integer value or vector indicating which of columns of 112 | #' \code{pData(bs)} contains the covariate of interest. 113 | #' This is used to construct the sample labels and colors (unless this is 114 | #' over-ridden by specifying \code{label}). 115 | #' 116 | #' @param includeYlab a logical indicating whether to include the Y axis 117 | #' label 'Methylation' (useful to turn off if combining multiple region 118 | #' figures and you do not want to include redundant y axis label information) 119 | #' 120 | #' @param compareTrack a named GRangesList object that contains up to four 121 | #' custom tracks (GRanges objects) which will be plotted below the region. 122 | #' Only one of `compareTrack` or `annoTrack` can be specified since there is 123 | #' only for plotting either the built in GpG category and exon tracks, *or* a 124 | #' custom set of tracks. 125 | #' 126 | #' @param labelCols a character vector with names of the mcols slot of the 127 | #' GRanges items in `compareTrack'. Only used if plotting custom 128 | #' tracks using the `compareTrack' argument. If specified, the (first) value 129 | #' in that column is printed along with a label that includes the name of the 130 | #' list item. If NULL (default), just the name of the track is printed. 131 | #' 132 | #' @param horizLegend logical indicating whether the legend should be 133 | #' horizontal instead of vertical (default FALSE). This is useful if you need 134 | #' to plot many labels and want to preserve whitespace. 135 | #' 136 | #' @param addLines logical indicating whether to plot smooth lines between 137 | #' points. Default is true. Can be useful to turn this off for very small 138 | #' regions. 139 | #' 140 | #' @param linesMinCov The minimum coverage a methylation loci need in 141 | #' order to be used for plotting of smoothed lines. Useful for filtering 142 | #' out low coverage loci. Only used if addLines = TRUE. Default value is 1 143 | #' (no filtering). 144 | #' 145 | #' @export 146 | #' 147 | #' @return None (generates a plot) 148 | #' 149 | #' @importFrom RColorBrewer brewer.pal 150 | #' @importFrom grDevices hcl rainbow 151 | #' @importFrom graphics arrows 152 | #' 153 | #' @examples 154 | #' 155 | #' # load the example data 156 | #' data(BS.chr21) 157 | #' 158 | #' # load example results (computed with dmrseq function) 159 | #' data(dmrs.ex) 160 | #' 161 | #' # get annotation information (using getAnnot function) 162 | #' # here we'll load the example annotation from chr21 163 | #' data(annot.chr21) 164 | #' 165 | #' # plot the 1st DMR 166 | #' plotDMRs(BS.chr21, regions=dmrs.ex[1,], testCovariate=1, 167 | #' annoTrack=annot.chr21) 168 | #' 169 | plotDMRs <- function(BSseq, regions = NULL, testCovariate = NULL, 170 | extend = (end(regions) - start(regions) + 1)/2, main = "", 171 | addRegions = regions, annoTrack = NULL, col = NULL, 172 | lty = NULL, lwd = NULL, label = NULL, mainWithWidth = TRUE, 173 | regionCol = .alpha("#C77CFF", 174 | 0.2), addTicks = TRUE, addPoints = TRUE, pointsMinCov = 1, 175 | highlightMain = FALSE, 176 | qval = TRUE, stat = TRUE, verbose = TRUE, includeYlab = TRUE, 177 | compareTrack = NULL, 178 | labelCols = NULL, horizLegend = FALSE, 179 | addLines = TRUE, linesMinCov = 1) { 180 | # adapted from plotManyRegions from bsseq plot to take 181 | # in a vector of qval values 182 | # (1 per region in regions argument) to be displayed in 183 | # the plot title. set 184 | # addPoints = TRUE to plot individual points sized by coverage 185 | # and one smooth 186 | # (loess) line per sample instead of a uniform-sized verbatim 187 | # line going through 188 | # each observation 189 | if (!addLines && !addPoints) 190 | stop("At least one of addLines or addPoints must be true") 191 | if (verbose) 192 | message("[plotDMRs] Plotting ", nrow(regions), " DMRs") 193 | if (!is.null(regions)) { 194 | if (is(regions, "data.frame")){ 195 | gr <- data.frame2GRanges(regions, keepColumns = FALSE) 196 | }else{ 197 | gr <- regions 198 | } 199 | if (!is(gr, "GRanges")) 200 | stop("'regions' needs to be either a 'data.frame' ", 201 | " or a 'GRanges' ") 202 | } else { 203 | gr <- granges(BSseq) 204 | } 205 | gr <- resize(gr, width = 2 * extend + width(gr), fix = "center") 206 | BSseq <- subsetByOverlaps(BSseq, gr) 207 | 208 | if (!is.null(annoTrack) && !is.null(compareTrack)) 209 | stop("Choose either annoTrack or compareTrack; can't plot both") 210 | 211 | if (length(start(BSseq)) == 0) 212 | stop("No overlap between BSseq data and regions") 213 | if (!is.null(main) && length(main) != length(gr)) 214 | main <- rep(main, length = length(gr)) 215 | 216 | if (length(extend) == 1) { 217 | extend <- rep(extend, length(gr)) 218 | } 219 | 220 | if (!is.null(testCovariate)) { 221 | coeff <- seq(2, (1 + length(testCovariate))) 222 | testCov <- as.character(pData(BSseq)[, testCovariate]) 223 | if (length(unique(testCov)) > 2 && !is.numeric(testCov) && length(coeff) == 1) 224 | coeff <- seq(coeff, coeff + length(unique(as.character(testCov))) - 2 ) 225 | 226 | design <- model.matrix(~testCov) 227 | 228 | if (is.null(col) && !("col" %in% names(pData(BSseq)))) { 229 | cov.unique <- unique(design[, coeff, drop = FALSE]) 230 | ncol <- nrow(cov.unique) 231 | 232 | colors <- gg_color_hue(ncol) 233 | if (ncol == 2) { 234 | colors <- c("mediumblue", "deeppink1") 235 | } 236 | colors <- cbind(cov.unique, 237 | colors[rank(as.numeric(rowSums(cov.unique)), 238 | ties.method = "first")]) 239 | colmat <- colors[, -ncol(colors), drop = FALSE] 240 | colmat <- apply(colmat, 2, as.numeric) 241 | z <- colors[,ncol(colors)][ 242 | match(data.frame(t(design[, coeff, drop = FALSE])), 243 | data.frame(t(colmat)))] 244 | 245 | pData(BSseq)$col <- as.character(z) 246 | } 247 | 248 | if (is.null(label) && !("label" %in% names(pData(BSseq)))) { 249 | pData(BSseq)$label <- paste0(pData(BSseq)[, testCovariate]) 250 | } 251 | } 252 | 253 | if (!is.null(label) || "label" %in% names(pData(BSseq))) { 254 | if(!is.null(label)){ 255 | labs <- label 256 | }else{ 257 | labs <- pData(BSseq)[["label"]] 258 | } 259 | if(horizLegend){ 260 | wiggle <- max(nchar(labs)) * 0.4 261 | }else{ 262 | wiggle <- length(unique(labs)) * 0.9 263 | } 264 | opar <- par(mar = c(0, 4.1, 0, wiggle), 265 | oma = c(0, 0, 2.5, 1)) 266 | } else { 267 | opar <- par(mar = c(0, 4.1, 0, 0), oma = c(0, 0, 2.5, 1)) 268 | } 269 | on.exit(par(opar)) 270 | 271 | for (ii in seq(along = gr)) { 272 | if (verbose && ii%%100 == 0) { 273 | cat(sprintf("..... Plotting region %d (out of %d)\n", ii, 274 | nrow(regions))) 275 | } 276 | 277 | .plotSingleDMR(BSseq = BSseq, region = regions[ii, ], 278 | extend = extend[ii], main = main[ii], col = col, lty = lty, 279 | lwd = lwd, label = label, addRegions = addRegions, 280 | regionCol = regionCol, mainWithWidth = mainWithWidth, 281 | annoTrack = annoTrack, addTicks = addTicks, addPoints = addPoints, 282 | pointsMinCov = pointsMinCov, highlightMain = highlightMain, 283 | qval = qval, stat = stat, includeYlab = includeYlab, 284 | compareTrack = compareTrack, labelCols = labelCols, 285 | horizLegend = horizLegend, addLines = addLines, linesMinCov = linesMinCov) 286 | } 287 | } 288 | 289 | 290 | -------------------------------------------------------------------------------- /R/internal_plotting_functions.R: -------------------------------------------------------------------------------- 1 | #' @title Add annotations to DMR plots 2 | #' 3 | #' @description Function to add visual representation of CpG categories 4 | #' and/or coding 5 | #' sequences to DMR plot 6 | #' 7 | #' @details An internal function that takes an annotation 8 | #' \code{SimpleGRangesList} 9 | #' object that 10 | #' contains CpG category information in the first element (optional) and / or 11 | #' coding gene sequence information in the second element (optional). If neither 12 | #' of these are present, then nothing will be plotted. 13 | #' 14 | #' @param gr a \code{GRanges} object that contains the DMRs to be 15 | #' plotted 16 | #' 17 | #' @param annoTrack a \code{SimpleGRangesList} object with two elements. 18 | #' The first contains CpG category information in the first element (optional) 19 | #' coding gene sequence information in the second element (optional). 20 | #' At least one of these elements needs to be non-null in order for 21 | #' any annotation to be plotted, but it is not necessary to contain 22 | #' both. 23 | #' 24 | #' @return None 25 | #' 26 | dmrPlotAnnotations <- function(gr, annoTrack) { 27 | # Code adapted from bsseq package 28 | 29 | ## check may need to be modified 30 | if (!is(annoTrack, "SimpleGRangesList")) 31 | stop("'annoTrack' needs to be a 'SimpleGRangesList'") 32 | plot(start(gr), 1, type = "n", xaxt = "n", yaxt = "n", bty = "n", 33 | ylim = c(0, length(annoTrack) + 0.5), xlim = c(start(gr), end(gr)), 34 | xlab = "", ylab = "") 35 | 36 | # add legend 37 | vars <- list(Island = "Island ", Shore = "Shore ", Shelf = "Shelf ", 38 | OpenSea1 = "Open ", OpenSea2 = "Sea") 39 | cols <- c("forestgreen", "goldenrod2", "dodgerblue", "blue3", "blue3") 40 | for (i in seq_len(length(vars))) { 41 | tmpvars <- vars 42 | tmpvars[-i] <- paste("phantom('", tmpvars[-i], "')", sep = "") 43 | expr <- paste(tmpvars, collapse = "*") 44 | text(start(gr), 1.8, parse(text = expr), col = cols[i], adj = c(0, 1), 45 | cex = 0.85) 46 | } 47 | 48 | lapply(seq(along = annoTrack), function(ii) { 49 | jj <- length(annoTrack) + 1 - ii 50 | ir <- subsetByOverlaps(annoTrack[[ii]], gr) 51 | start(ir) <- pmax(start(ir), start(gr)) 52 | end(ir) <- pmin(end(ir), end(gr)) 53 | 54 | if (ii == 2) { 55 | jj <- jj - 0.15 56 | } 57 | 58 | if (length(ir) > 0) { 59 | if (ii == 2) { 60 | colourCount <- length(unique(ir$symbol)) 61 | getPalette <- colorRampPalette(.alpha(brewer.pal(max( 62 | length(unique(ir$symbol)), 63 | 3), "Dark2"), 0.4)) 64 | color.pal <- getPalette(colourCount) 65 | names(color.pal) <- unique(ir$symbol) 66 | map <- match(ir$symbol, names(color.pal)) 67 | color <- color.pal[map] 68 | 69 | bord <- "black" 70 | } else if (ii == 1) { 71 | color <- ir$type 72 | colvec <- c("blue3", "dodgerblue", "goldenrod2", "forestgreen") 73 | names(colvec) <- c("inter", "shelves", "shores", "islands") 74 | 75 | for (ucol in unique(color)){ 76 | ix <- agrep(ucol, names(colvec)) 77 | if (length(ix) > 1) 78 | stop("Ambiguous CpG annotation labels") 79 | color[color == ucol] <- colvec[ix] 80 | } 81 | bord <- color 82 | 83 | rect(start(ir), jj - 0.06, end(ir), jj + 0.17, col = color, 84 | border = bord) 85 | } 86 | 87 | if (ii == 2) { 88 | lastPos <- rep(NA, length(unique(ir$symbol)) - 1) 89 | used <- NULL 90 | for (k in seq_len(length(unique(ir$symbol)))) { 91 | irk <- ir[ir$symbol == unique(ir$symbol)[k], ] 92 | rect(min(start(irk)), jj - 0.065, max(end(irk)), jj + 0.065, 93 | col = .alpha("black", 94 | 0.1), border = .alpha("black", 0.1)) 95 | if (!(unique(ir$symbol)[k] %in% used)) { 96 | rwidth <- end(gr) - start(gr) 97 | sg <- pmin(start(irk), end(irk)) 98 | eg <- pmax(start(irk), end(irk)) 99 | gwidth <- min(max(eg), end(gr)) - max(min(sg), start(gr)) 100 | textPos <- max(min(sg), start(gr)) + gwidth/2 101 | jj.orig <- jj 102 | if (sum(!is.na(lastPos)) > 0) { 103 | separation <- (textPos - lastPos[k - 1])/rwidth 104 | if (abs(separation) <= 0.2 && k < 3) { 105 | jj <- jj - 0.29 106 | } else { 107 | separation <- min(abs((textPos - lastPos)/rwidth), 108 | na.rm = TRUE) 109 | if (abs(separation) <= 0.2) { 110 | jj <- jj - 0.29 111 | } 112 | } 113 | } 114 | lastPos[k] <- textPos 115 | text(textPos, jj - 0.375, labels = unique(irk$symbol), 116 | cex = 0.85, 117 | col = unique(color)[k]) 118 | jj <- jj.orig 119 | used <- c(used, unique(ir$symbol)[k]) 120 | } 121 | rect(sg, jj - 0.11, eg, jj + 0.12, col = unique(color)[k], 122 | border = bord) 123 | } 124 | } 125 | 126 | } 127 | mtext(names(annoTrack)[ii], side = 2, at = jj, las = 1, line = 1) 128 | }) 129 | 130 | } 131 | 132 | .isColor <- function(x) 133 | { 134 | res <- try(col2rgb(x),silent=TRUE) 135 | return(!"try-error"%in%class(res)) 136 | } 137 | 138 | .dmrGetMeta <- function(object, col, lty, lwd, label) { 139 | ## Assumes that object has pData and sampleNames methods Code adapted from 140 | ## bsseq package 141 | 142 | ## extract col 143 | if (is.null(col)) { 144 | if ("col" %in% names(pData(object))) 145 | col <- pData(object)[["col"]] else col <- rep("black", 146 | nrow(pData(object))) 147 | }else if (length(col) == 1){ 148 | if (col %in% names(pData(object))) 149 | col <- pData(object)[[col]] else col <- rep("black", ncol(object)) 150 | if (!.isColor(col)) 151 | col <- rainbow(length(unique(col)))[as.numeric(as.factor(col))] 152 | } 153 | if (length(col) != ncol(object)) 154 | col <- rep(col, length.out = ncol(object)) 155 | if (is.null(names(col))) 156 | names(col) <- sampleNames(object) 157 | 158 | ## extract lty 159 | if (is.null(lty)) { 160 | if ("lty" %in% names(pData(object))) 161 | lty <- pData(object)[["lty"]] else lty <- rep(1, ncol(object)) 162 | }else if (length(lty) == 1){ 163 | if (lty %in% names(pData(object))) 164 | lty <- pData(object)[[lty]] else lty <- rep(1, ncol(object)) 165 | if (!is.numeric(lty)) 166 | lty <- as.numeric(as.factor(lty)) 167 | } 168 | if (length(lty) != ncol(object)) 169 | lty <- rep(lty, length.out = ncol(object)) 170 | if (is.null(names(lty))) 171 | names(lty) <- sampleNames(object) 172 | 173 | # extract lwd 174 | if (is.null(lwd)) { 175 | if ("lwd" %in% names(pData(object))) 176 | lwd <- pData(object)[["lwd"]] else lwd <- rep(1.5, 177 | nrow(pData(object))) 178 | }else if (length(lwd) == 1){ 179 | if (lwd %in% names(pData(object))) 180 | lwd <- pData(object)[[lwd]] else lwd <- rep(1, ncol(object)) 181 | if (!is.numeric(lwd)) 182 | lwd <- as.numeric(as.factor(lwd)) 183 | } 184 | if (length(lwd) != ncol(object)) 185 | lwd <- rep(lwd, length.out = ncol(object)) 186 | if (is.null(names(lwd))) 187 | names(lwd) <- sampleNames(object) 188 | 189 | ## extract label 190 | if (is.null(label)) { 191 | if ("label" %in% names(pData(object))) 192 | label <- pData(object)[["label"]] else label <- rep(NA, 193 | ncol(object)) 194 | }else if (length(label) == 1){ 195 | if (label %in% names(pData(object))) 196 | label <- pData(object)[[label]] else label <- rep(NA, ncol(object)) 197 | if (!is.character(label)) 198 | label <- as.character(label) 199 | } 200 | if (length(label) != ncol(object)) 201 | label <- rep(label, length.out = ncol(object)) 202 | if (is.null(names(label))) 203 | names(label) <- sampleNames(object) 204 | 205 | return(list(col = col, lty = lty, lwd = lwd, label = label)) 206 | } 207 | 208 | 209 | .dmrPlotTitle <- function(gr, extend, main, mainWithWidth, 210 | qval = NULL, stat = NULL) { 211 | # this function creates the main title for DMR plots Code adapted from bsseq 212 | # package 213 | if (is.data.frame(gr)) 214 | gr <- data.frame2GRanges(gr) 215 | if (length(gr) > 1) { 216 | warning("plotTitle: gr has more than one element") 217 | gr <- gr[1] 218 | } 219 | plotChr <- as.character(seqnames(gr)) 220 | plotRange <- c(start(gr), end(gr)) 221 | regionCoord <- sprintf("%s: %s - %s", plotChr, format(plotRange[1], 222 | big.mark = ",", 223 | scientific = FALSE), format(plotRange[2], big.mark = ",", 224 | scientific = FALSE)) 225 | if (mainWithWidth) { 226 | regionWidth <- sprintf("width = %s", format(width(gr), 227 | big.mark = ",", 228 | scientific = FALSE)) 229 | # add optional labels to plot titles 230 | if (!is.null(qval) && !is.null(stat)) { 231 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",", 232 | scientific = FALSE)) 233 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",", 234 | scientific = FALSE)) 235 | regionCoord <- sprintf(paste0("%s (%s)\n%s, %s"), regionCoord, 236 | regionWidth, regionStat, regionFDR) 237 | } else if (!is.null(stat)) { 238 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",", 239 | scientific = FALSE)) 240 | regionCoord <- sprintf(paste0("%s (%s)\n%s"), regionCoord, 241 | regionWidth, regionStat) 242 | } else if (!is.null(qval)) { 243 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",", 244 | scientific = FALSE)) 245 | regionCoord <- sprintf(paste0("%s (%s)\n%s"), regionCoord, 246 | regionWidth, regionFDR) 247 | } else { 248 | regionCoord <- sprintf("%s (%s)", regionCoord, regionWidth) 249 | } 250 | } else { 251 | # add optional labels to plot titles 252 | if (!is.null(qval) && !is.null(stat)) { 253 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",", 254 | scientific = FALSE)) 255 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",", 256 | scientific = FALSE)) 257 | regionCoord <- sprintf(paste0("%s\n%s, %s"), regionCoord, 258 | regionStat, regionFDR) 259 | } else if (!is.null(stat)) { 260 | regionStat <- sprintf("Stat: %s", format(stat, big.mark = ",", 261 | scientific = FALSE)) 262 | regionCoord <- sprintf(paste0("%s\n%s"), regionCoord, regionStat) 263 | } else if (!is.null(qval)) { 264 | regionFDR <- sprintf("FDR: %s", format(qval, big.mark = ",", 265 | scientific = FALSE)) 266 | regionCoord <- sprintf(paste0("%s\n%s"), regionCoord, regionFDR) 267 | } else { 268 | regionCoord <- sprintf("%s", regionCoord) 269 | } 270 | } 271 | if (main != "") { 272 | main <- sprintf("%s\n%s", main, regionCoord) 273 | } else { 274 | main <- regionCoord 275 | } 276 | main 277 | } 278 | 279 | .dmrPlotLegend <- function(plotRange, col, label, horizLegend) { 280 | # this function plots a legend to the right of the plot to indicate 281 | # whichcolor corresponds to which samples 282 | 283 | numUnique <- length(unique(paste0(col, label, sep = ""))) 284 | if (numUnique < length(col)) { 285 | col <- unique(col) 286 | label <- unique(label) 287 | } 288 | 289 | if (!horizLegend){ 290 | for (lg in seq_len(length(label))) { 291 | mtext(label[lg], side = 4, line = lg - 1, col = .darken(col[lg]), 292 | cex = 0.9, las = 0) 293 | } 294 | }else{ 295 | for (lg in seq_len(length(label))) { 296 | mtext(label[lg], side = 4, line = 0.5, col = .darken(col[lg]), 297 | cex = 0.9, las = 1, at = 1.02 - 0.08*(lg-1)) 298 | } 299 | } 300 | } 301 | 302 | 303 | # function to transform a given color specified by a character object to a 304 | # transparent version of that color 305 | .makeTransparent <- function(someColor, alpha = 130) { 306 | newColor <- col2rgb(someColor) 307 | apply(newColor, 2, function(curcoldata) { 308 | rgb(red = curcoldata[1], green = curcoldata[2], blue = curcoldata[3], 309 | alpha = alpha, maxColorValue = 255) 310 | }) 311 | } 312 | 313 | .alpha <- function(col, alpha = 1) { 314 | if (missing(col)) 315 | stop("Please provide a vector of colours.") 316 | apply(vapply(col, grDevices::col2rgb, matrix(rep(0,3)))/255, 2, 317 | function(x) grDevices::rgb(x[1], x[2], x[3], alpha = alpha)) 318 | } 319 | 320 | 321 | .dmrPlotPoints <- function(x, y, z, col, pointsMinCov, maxCov, 322 | regionWidth) { 323 | # modified from .bsPlotPoints in bsseq added functionality for point size 324 | # to vary with coverage 325 | 326 | lwd <- 1.5 327 | # make color of points semi-transparent so that overlapping points can 328 | # still be seen 329 | col.points <- .makeTransparent(col) 330 | 331 | # if there are a lot of CpGs to plot (the case for a block-level analysis 332 | # decrease the size of the plotted points since these can get very crowded 333 | c1 <- pmax(-0.25 * atan(3 * (length(x) - 80)/80 * pi)/atan(3 * 334 | (80)/80 * pi) + 0.75, 0.5) 335 | ptSize <- c1 * (sqrt(z)/sqrt(maxCov) + 0.25) 336 | 337 | points(x[z >= pointsMinCov], y[z >= pointsMinCov], col = col.points, pch = 16, 338 | cex = ptSize) 339 | } 340 | 341 | .darken <- function(color, factor = 1.4) { 342 | col <- col2rgb(color) 343 | col <- col/factor 344 | col <- rgb(t(col), maxColorValue = 255) 345 | col 346 | } 347 | 348 | 349 | 350 | .dmrPlotLines <- function(x, y, z, col, lwd, linesMinCov, maxCov, 351 | regionWidth, lty) { 352 | 353 | if (length(x) > 100 && !is.null(lwd)) { 354 | lwd <- lwd + 1 355 | } else if (length(x) > 100) { 356 | lwd <- 2 357 | } 358 | 359 | spn <- max(1 - (1/160)*sum(z >= linesMinCov), 0.75) 360 | 361 | y[y==1] <- 0.99 362 | y[y==0] <- 0.01 363 | logit <- function(p){ log(p/(1-p))} 364 | inv.logit <- function(l){ exp(l) / (1 + exp(l)) } 365 | 366 | # don't interpolate smooth lines if there are fewer than 10 cpgs 367 | if (length(x) >= 10) { 368 | loess_fit <- loess(logit(y[z >= linesMinCov]) ~ x[z >= linesMinCov], 369 | weights = z[z >= linesMinCov], span = spn) 370 | 371 | xl <- seq(min(x[z >= linesMinCov], na.rm=TRUE), 372 | max(x[z >= linesMinCov], na.rm=TRUE), 373 | (max(x[z >= linesMinCov], na.rm=TRUE) - 374 | min(x[z >= linesMinCov], na.rm=TRUE))/500) 375 | lines(xl, inv.logit(predict(loess_fit,xl)), 376 | col = .makeTransparent(.darken(col), 175), lwd = lwd, 377 | lty = lty) 378 | 379 | }else{ 380 | lines(x[z >= linesMinCov], y[z >= linesMinCov], 381 | col = .makeTransparent(.darken(col), 175), lwd = lwd, 382 | lty = lty) 383 | } 384 | } 385 | 386 | .dmrPlotSmoothData <- function(BSseq, region, extend, addRegions, col, lty, lwd, 387 | label, regionCol, addTicks, addPoints, pointsMinCov, highlightMain, 388 | includeYlab = TRUE, horizLegend, addLines=TRUE, linesMinCov) { 389 | # modified from .plotSmoothData in bsseq to allow non-smoothed regions 390 | 391 | gr <- bsseq.bsGetGr(BSseq, region, extend) 392 | BSseq <- subsetByOverlaps(BSseq, gr) 393 | BSseq2 <- subsetByOverlaps(BSseq, bsseq.bsGetGr(BSseq, region, extend = 0)) 394 | ## Extract basic information 395 | sampleNames <- sampleNames(BSseq) 396 | names(sampleNames) <- sampleNames 397 | positions <- start(BSseq) 398 | positions2 <- start(BSseq2) 399 | rawPs <- as.matrix(bsseq::getMeth(BSseq, type = "raw")) 400 | coverage <- as.matrix(bsseq::getCoverage(BSseq)) 401 | 402 | ## get col, lwd, lty these are extracted from the pData data.frame that is 403 | ## part of the bsseq object colEtc is a list object that contains col, lty, 404 | ## lwd and label which are used as plotting parameters label is a condition 405 | ##label to use for adding a legend with condition names 406 | 407 | colEtc <- .dmrGetMeta(object = BSseq, col = col, lty = lty, lwd = lwd, 408 | label = label) 409 | 410 | if (includeYlab) { 411 | yl <- "Methylation" 412 | } else { 413 | yl <- "" 414 | } 415 | 416 | ## The actual plotting starts here 417 | plot(positions[1], 0.5, type = "n", xaxt = "n", yaxt = "n", ylim = c(0, 1), 418 | xlim = c(start(gr), 419 | end(gr)), xlab = "", ylab = yl) 420 | axis(side = 2, at = c(0, 0.25, 0.5, 0.75, 1), las = 1) 421 | if (addTicks) 422 | rug(positions) 423 | 424 | if (is.list(addRegions) && !is.data.frame(addRegions)) { 425 | if (length(addRegions) > 2) { 426 | stop("Only two sets of regions can be highlighted") 427 | } 428 | if (length(regionCol) == 1) { 429 | regionCol <- c(regionCol, .alpha("blue", 0.2)) 430 | } 431 | bsseq.bsHighlightRegions(regions = addRegions[[1]], gr = gr, ylim = c(0, 432 | 1), regionCol = regionCol[1], highlightMain = highlightMain) 433 | bsseq.bsHighlightRegions(regions = addRegions[[2]], gr = gr, ylim = c(0, 434 | 1), regionCol = regionCol[2], highlightMain = highlightMain) 435 | 436 | } else { 437 | bsseq.bsHighlightRegions(regions = addRegions, gr = gr, ylim = c(0, 1), 438 | regionCol = regionCol, 439 | highlightMain = highlightMain) 440 | } 441 | 442 | # add points first to avoid lines getting hidden by plotting many cpg points 443 | if (addPoints) { 444 | for(sampIdx in seq_len(ncol(BSseq))){ 445 | .dmrPlotPoints(positions, rawPs[, sampIdx], coverage[, sampIdx], 446 | col = colEtc$col[sampIdx], 447 | pointsMinCov = pointsMinCov, 448 | maxCov = quantile(coverage, 0.95), 449 | regionWidth = end(gr) - 450 | start(gr)) 451 | } 452 | } 453 | 454 | if (addLines){ 455 | for(sampIdx in seq_len(ncol(BSseq))){ 456 | if (sum(coverage[, sampIdx] >= linesMinCov) > 1){ 457 | .dmrPlotLines(positions, rawPs[, sampIdx], coverage[, sampIdx], 458 | col = colEtc$col[sampIdx], 459 | lwd = colEtc$lwd[sampIdx], 460 | linesMinCov = linesMinCov, 461 | maxCov = quantile(coverage, 0.95), 462 | regionWidth = end(gr) - 463 | start(gr), 464 | lty = colEtc$lty[sampIdx]) 465 | } 466 | } 467 | } 468 | 469 | # if colEtc$label contains characters that are not null or missing, then 470 | # create a legend which houses the labels as well as the colors that 471 | # correspond to them -> pass in both colEtc$label as well as colEtc$col 472 | if (sum(!is.na(colEtc$label)) == length(colEtc$label)) { 473 | .dmrPlotLegend(plotRange = c(start(gr), end(gr)), 474 | colEtc$col, colEtc$label, horizLegend) 475 | } 476 | 477 | } 478 | 479 | # function doesn't need to be exported; not a user-level function since a single 480 | # DMR can be plotted just fine with plotDMRs. 481 | .plotSingleDMR <- function(BSseq, region = NULL, extend = 0, main = "", 482 | addRegions = NULL, annoTrack = NULL, col = NULL, lty = NULL, lwd = NULL, 483 | label = NULL, mainWithWidth = TRUE, regionCol = .alpha("orchid1", 0.2), 484 | addTicks = TRUE, addPoints = FALSE, pointsMinCov = 5, highlightMain = FALSE, 485 | qval = NULL, stat = NULL, includeYlab = TRUE, compareTrack = NULL, 486 | labelCols = NULL, horizLegend = FALSE, addLines = TRUE, linesMinCov = 1) { 487 | 488 | if(!is.null(annoTrack) || !is.null(compareTrack)){ 489 | layout(matrix(seq_len(2), ncol = 1), heights = c(2, 1.5)) 490 | }else{ 491 | layout(matrix(seq_len(2), ncol = 1), heights = c(2, 0.2)) 492 | } 493 | .dmrPlotSmoothData(BSseq = BSseq, region = region, extend = extend, 494 | addRegions = addRegions, 495 | col = col, lty = lty, lwd = lwd, label = label, 496 | regionCol = regionCol, addTicks = addTicks, 497 | addPoints = addPoints, pointsMinCov = pointsMinCov, 498 | highlightMain = highlightMain, 499 | includeYlab = includeYlab, 500 | horizLegend = horizLegend, 501 | addLines = addLines, 502 | linesMinCov = linesMinCov) 503 | gr <- bsseq.bsGetGr(BSseq, region, extend) 504 | 505 | if (!is.null(main)) { 506 | if (qval && stat) { 507 | qval <- round(region$qval, 4) 508 | stat <- round(region$stat, 3) 509 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main, 510 | mainWithWidth = mainWithWidth, 511 | qval = qval, stat = stat) 512 | } else if (stat) { 513 | stat <- round(region$stat, 3) 514 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main, 515 | mainWithWidth = mainWithWidth, stat = stat) 516 | } else if (qval) { 517 | qval <- round(region$qval, 4) 518 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main, 519 | mainWithWidth = mainWithWidth, qval = qval) 520 | } else { 521 | main <- .dmrPlotTitle(gr = region, extend = extend, main = main, 522 | mainWithWidth = mainWithWidth) 523 | } 524 | mtext(side = 3, text = main, outer = FALSE, cex = 0.8, line = 0) 525 | } 526 | 527 | if (!is.null(annoTrack)) { 528 | dmrPlotAnnotations(gr, annoTrack) 529 | } else if (!is.null(compareTrack)) { 530 | dmrPlotComparisons(gr, compareTrack, labelCols = labelCols) 531 | } 532 | } 533 | 534 | # pasting bsseq's .bsGetGr function since not exported 535 | bsseq.bsGetGr <- function(object, region, extend) { 536 | if (is.null(region)) { 537 | gr <- GRanges(seqnames = seqnames(object)[1], 538 | ranges = IRanges(start = min(start(object)), 539 | end = max(start(object)))) 540 | } else { 541 | if (is(region, "data.frame")){ 542 | gr <- data.frame2GRanges(region, keepColumns = FALSE) 543 | }else{ 544 | gr <- region 545 | } 546 | if (!is(gr, "GRanges") || length(gr) != 1) 547 | stop("'region' needs to be either a 'data.frame' ", 548 | "(with a single row) or a 'GRanges' (with a single element)") 549 | gr <- resize(gr, width = 2 * extend + width(gr), fix = "center") 550 | } 551 | gr 552 | } 553 | 554 | # pasting bsseq's .bsHighlightRegions function since not exported 555 | bsseq.bsHighlightRegions <- function(regions, gr, ylim, regionCol, 556 | highlightMain) { 557 | if (is.data.frame(regions)) 558 | regions <- data.frame2GRanges(regions) 559 | if (highlightMain) 560 | regions <- c(regions, gr) 561 | if (is.null(regions)) 562 | return(NULL) 563 | regions <- subsetByOverlaps(regions, gr) 564 | regions <- pintersect(regions, rep(gr, length(regions))) 565 | if (length(regions) == 0) 566 | return(NULL) 567 | rect(xleft = start(regions), xright = end(regions), ybottom = ylim[1], 568 | ytop = ylim[2], col = regionCol, border = NA) 569 | } 570 | 571 | gg_color_hue <- function(n) { 572 | hues <- seq(15, 375, length = n + 1) 573 | hcl(h = hues, l = 65, c = 100)[seq_len(n)] 574 | } 575 | 576 | # function to draw nonoverlapping comparison regions (up to 3) below the main 577 | # region plot (instead of annotations) 578 | dmrPlotComparisons <- function(gr, annoTrack, labelCols = NULL) { 579 | 580 | if (!is(annoTrack, "SimpleGRangesList")) 581 | stop("'annoTrack' needs to be a 'SimpleGRangesList'") 582 | 583 | if (length(annoTrack) > 4) 584 | stop("Can't plot more than 4 tracks") 585 | 586 | plot(start(gr), 1, type = "n", xaxt = "n", yaxt = "n", bty = "n", 587 | ylim = c(0, length(annoTrack) + 0.5), xlim = c(start(gr), end(gr)), 588 | xlab = "", ylab = "") 589 | 590 | colourCount <- length(annoTrack) 591 | getPalette <- colorRampPalette(.alpha(brewer.pal(max(length(annoTrack), 3), 592 | "Dark2"), 593 | 0.4)) 594 | color <- getPalette(colourCount) 595 | bord <- "black" 596 | 597 | lapply(seq(along = annoTrack), function(ii) { 598 | jj <- length(annoTrack) + 1 - ii 599 | ir <- subsetByOverlaps(annoTrack[[ii]], gr) 600 | start(ir) <- pmax(start(ir), start(gr)) 601 | end(ir) <- pmin(end(ir), end(gr)) 602 | 603 | top.pos <- 4.1 - (4 - length(annoTrack)) 604 | bot.pos <- 0 + (4 - length(annoTrack)) * 0.1 605 | jj.between <- (top.pos - bot.pos)/length(annoTrack) 606 | 607 | if (length(ir) > 0) { 608 | jj <- top.pos - (ii - 1) * jj.between 609 | 610 | arrows(start(ir), jj, end(ir), jj, code = 3, length = 0.05, 611 | angle = 90, col = .makeTransparent(color[ii], alpha = 185)) 612 | if (is.null(labelCols)) { 613 | text((end(gr) + start(gr))/2, jj - 0.4, names(annoTrack)[[ii]]) 614 | }else if(sum(labelCols %in% colnames(mcols(annoTrack[[ii]]))) > 0) { 615 | whichLabelCols <- match(labelCols, 616 | colnames(mcols(annoTrack[[ii]]))) 617 | notmiss <- !is.na(whichLabelCols) 618 | whichLabelCols <- whichLabelCols[notmiss] 619 | if (class(unlist(mcols(annoTrack[[ii]])[1, 620 | whichLabelCols, drop = TRUE])) == 621 | "numeric") { 622 | comps <- round(unlist(mcols(ir)[1, 623 | whichLabelCols, drop = TRUE]), 624 | 3) 625 | } else { 626 | comps <- unlist(mcols(ir)[1, whichLabelCols, drop = TRUE]) 627 | } 628 | if (sum(grepl("\\.", labelCols)) > 0) { 629 | labelCols <- gsub("\\.", " ", labelCols) 630 | } 631 | Label <- paste0(names(annoTrack)[ii], ": ", 632 | paste0(labelCols[notmiss], 633 | "=", comps, collapse = ", ")) 634 | text((end(gr) + start(gr))/2, jj - 0.4, Label, 635 | cex = 0.85, col = color[ii]) 636 | } else { 637 | text((end(gr) + start(gr))/2, jj - 0.4, names(annoTrack)[[ii]]) 638 | } 639 | } 640 | }) 641 | } 642 | -------------------------------------------------------------------------------- /vignettes/dmrseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analyzing Bisulfite-seq data with dmrseq" 3 | author: "Keegan Korthauer" 4 | date: "`r BiocStyle::doc_date()`" 5 | package: "`r BiocStyle::pkg_ver('dmrseq')`" 6 | bibliography: dmrseqBib.bib 7 | abstract: > 8 | A basic task in the analysis of count data from Whole Genome 9 | Bisulfite-Sequencing is the detection of differentially methylated regions. 10 | The count data consist of, for each sample, the number of methylated 11 | reads and the total number of reads covering CpG. 12 | An important analysis question is to detect regions (collections of 13 | neighboring CpGs) with systematic differences between conditions, 14 | as compared to within-condition variability. These so-called Differentially 15 | Methylated Regions (DMRs) are thought to be more informative than single CpGs 16 | in terms of of biological function. Although several methods exist 17 | to quantify and perform statistical inference on changes at the individual 18 | CpG level, detection of DMRs is still limited to aggregating signifiant 19 | CpGs without proper inference at the region level. The package **dmrseq** 20 | addresses this gap by providing a rigorous permutation-based approach to 21 | detect and perform inference for differential methylation by use of 22 | generalized least squares models that account for inter-individual and 23 | inter-CpG variability to generate region-level statistics that can be 24 | comparable across the genome. This allows the framework to perform well even 25 | on samples as small as two per group. This vignette explains the 26 | use of the package and demonstrates typical workflows. This vignette was 27 | generated with dmrseq package version `r packageVersion("dmrseq")` 28 | output: 29 | BiocStyle::html_document: 30 | highlight: pygments 31 | toc_float: true 32 | fig_width: 5 33 | vignette: > 34 | %\VignetteIndexEntry{Analyzing Bisulfite-seq data with dmrseq} 35 | %\VignetteEngine{knitr::rmarkdown} 36 | %\VignetteEncoding[utf8]{inputenc} 37 | --- 38 | 39 | 40 | 41 | 42 | 43 | ```{r setup, echo=FALSE, results="hide"} 44 | knitr::opts_chunk$set(tidy=FALSE, cache=FALSE, 45 | dev="png", 46 | message=FALSE, error=FALSE, warning=TRUE) 47 | ``` 48 | 49 | # Quick start 50 | 51 | **If you use dmrseq in published research, please cite:** 52 | 53 | > Korthauer, K., Chakraborty, S., Benjamini, Y., and Irizarry, R.A. 54 | > Detection and accurate False Discovery Rate control of differentially 55 | methylated regions from Whole Genome Bisulfite Sequencing 56 | > *Biostatistics*, 2018 (in press). 57 | 58 | This package builds upon the 59 | [bsseq](http://bioconductor.org/packages/bsseq) package [@Hansen2012], 60 | which provides efficient storage and manipulation of bisulfite 61 | sequencing data and inference for differentially methylated CpGs. 62 | The main goal of **dmrseq** [@Korthauer183210] 63 | is to provide inference for differentially methylated *regions*, or 64 | groups of CpGs. 65 | 66 | Here we show the most basic steps for a differential methylation 67 | analysis. There are a variety of steps upstream of **dmrseq** that result 68 | in the generation of counts of methylated reads and total reads covering each 69 | CpG for each sample, including mapping of sequencing reads to a reference 70 | genome with and without bisulfite conversion. You can use the software 71 | of your preference for this step (one option is 72 | [Bismark](https://www.bioinformatics.babraham.ac.uk/projects/bismark/)), as 73 | long as you are able to obtain counts of methylation and coverage (as 74 | opposed to solely methylation proportions, as discussed below). 75 | 76 | This package uses a specific data structure to store and manipulate 77 | bisulfite sequencing data introduced by the **bsseq** package. This data 78 | structure is a *class* called `BSseq`. Objects of the class `BSseq` contain 79 | all pertinent information for a bisulfite sequencing experiment, including 80 | the number of reads corresponding to methylation, and the total number 81 | of reads at each 82 | CpG site, the location of each CpG site, and experimental metadata on the 83 | samples. Note that here we focus on CpG methylation, since this is the 84 | most common form of methylation in humans and many other organisms; take 85 | care when applying this method to other types of methylation and make sure 86 | that it will 87 | be able to scale to the number of methylation sites, and that similar 88 | assumptions can be made regarding spatial correlation. Also note that 89 | the default settings for smoothing parameters and spacing/gap parameters 90 | are set to values that we found useful, but may need to be altered for 91 | datasets for other organisms. 92 | 93 | To store your data in a `BSseq` object, make sure you have the following 94 | neccessary components: 95 | 96 | 1. genomic positions, including chromosome and location, for methylation loci. 97 | 98 | 2. a (matrix) of M (Methylation) values, describing the number of reads 99 | supporting methylation covering a single loci. 100 | Each row in this matrix is a methylation loci and each column is a sample. 101 | 102 | 3. a (matrix) of Cov (Coverage) values, 103 | describing the total number of reads covering a single loci. 104 | Each row in this matrix is a methylation loci and each column is a sample. 105 | 106 | The following code chunk assumes that `chr` and `pos` are vectors of 107 | chromosome names and positions, respectively, for each CpG in the dataset. 108 | You can also provide a `GRanges` object instead of `chr` and `pos`. It 109 | also assumes that the matrices of methylation and coverage values (described 110 | above) are named `M` and `Cov`, respectively. Note, `M` and `Cov` can also 111 | be data stored on-disk (not in memory) using HDF5 files with the `HDF5Array` 112 | package or `DelayedMatrix` with the `DelayedArray` package. 113 | 114 | The `sampleNames` and `trt` objects are 115 | vectors with sample labels and condition labels for each sample. A condition 116 | label could be something like 117 | treatment or control, a tissue type, or a continous measurement. 118 | This is the covariate for which you wish to test for differences in 119 | methylation. Once the `BSseq` object is constructed and the sample covariate 120 | information is added, DMRs are obtained by running the `dmrseq` function. 121 | A continuous covariate is assumed if the data type of the `testCovariate` 122 | arugment in `dmrseq` is 123 | continuous, with the exception of if there are only two unique values 124 | (then a two group comparison is carried out). 125 | 126 | ```{r quickStart, eval=FALSE} 127 | bs <- BSseq(chr = chr, pos = pos, 128 | M = M, Cov = Cov, 129 | sampleNames = sampleNames) 130 | pData(bs)$Condition <- trt 131 | 132 | regions <- dmrseq(bs=bs, testCovariate="Condition") 133 | ``` 134 | 135 | For more information on constructing and manipulating `BSseq` objects, 136 | see the [bsseq](http://bioconductor.org/packages/bsseq) vignettes. 137 | 138 | * If you used *Bismark* to align your bisulfite sequencing data, 139 | you can use the `read.bismark` function to read bismark files 140 | into `BSseq` objects. See below for more details. 141 | 142 | # How to get help for dmrseq 143 | 144 | Please post **dmrseq** questions to the 145 | **Bioconductor support site**, which serves as a searchable knowledge 146 | base of questions and answers: 147 | 148 | 149 | 150 | Posting a question and tagging with "dmrseq" will automatically send 151 | an alert to the package authors to respond on the support site. See 152 | the first question in the list of [Frequently Asked Questions](#FAQ) 153 | (FAQ) for information about how to construct an informative post. 154 | 155 | # Input data 156 | 157 | ## Why counts instead of methylation proportions? 158 | 159 | As input, the **dmrseq** package expects count data as obtained, e.g., 160 | from Bisulfite-sequencing. The value in the *i*-th row and the *j*-th column of 161 | the `M` matrix tells how many methylated reads can be assigned to CpG *i* 162 | in sample *j*. Likewise, the value in the *i*-th row and the *j*-th column of 163 | the `Cov` matrix tells how many total reads can be assigned to CpG *i* 164 | in sample *j*. Although we might be tempted to combine these matrices into 165 | one matrix that contains the methylation *proportion* (`M`/`Cov`) at each CpG 166 | site, it is critical to notice that this would be throwing away a lot of 167 | information. For example, some sites have much higher coverage than others, 168 | and naturally, we have more confidence in those with many reads mapping to them. 169 | If we only kept the proportions, a CpG with 2 out of 2 reads methylated would 170 | be treated the same as a CpG with 30 out of 30 reads methylated. 171 | 172 | ## How many samples do I need? 173 | 174 | To use **dmrseq**, you need to have at least 2 samples in each condition. 175 | Without this replicates, it is impossible to distinguish between biological 176 | variability due to condition/covariate of interest, and inter-individual 177 | variability within condition. 178 | 179 | If your experiment contains additional samples, perhaps from other conditions 180 | that are not of interest in the current test, these should be filtered out 181 | prior to running **dmrseq**. Rather than creating a new filtered object, 182 | the filtering step can be included in the call to the main function `dmrseq`. 183 | For more details, see the 184 | [Filtering CpGs and samples Section](#filtering-cpgs-and-samples). 185 | 186 | ## Bismark input 187 | 188 | If you used Bismark for mapping and methylation level extraction, you can 189 | use the `read.bismark` function from the **bsseq** package to read the 190 | data directly into 191 | a `BSeq` object. 192 | 193 | The following example is from the help page of the function. After running 194 | Bismark's methylation extractor, you should have output files with names 195 | that end in `.bismark.cov.gz`. You can specify a vector of file names with 196 | the `file` argument, and a corresponding vector of `sampleNames`. It is 197 | recommended that you set `rmZeroCov` to TRUE in order to remove CpGs with 198 | no coverage in any of the samples, and set `strandCollapse` to TRUE in order 199 | to combine CpGs on opposite strands into one observation (since CpG methylation) 200 | is symmetric. 201 | 202 | ```{r bismarkinput} 203 | library(dmrseq) 204 | infile <- system.file("extdata/test_data.fastq_bismark.bismark.cov.gz", 205 | package = 'bsseq') 206 | bismarkBSseq <- read.bismark(files = infile, 207 | rmZeroCov = TRUE, 208 | strandCollapse = FALSE, 209 | verbose = TRUE) 210 | bismarkBSseq 211 | ``` 212 | 213 | See the [bsseq](http://bioconductor.org/packages/bsseq) help pages for 214 | more information on using this function. 215 | 216 | ## Count matrix input 217 | 218 | If you haven't used Bismark, but you have count data for number of methylated 219 | reads and total coverage for each CpG, along with their corresponding chromosome 220 | and position information, you can construct a `BSseq` object from scratch, 221 | like below. Notice that the `M` and `Cov` matrices have the same dimension, and 222 | `chr` and `pos` have the same number of elements as rows in the count matrices 223 | (which corresponds to the number of CpGs). Also note that the number of columns 224 | in the count matrices matches the number of elements in `sampleNames` and the 225 | condition variable 'celltype`. 226 | 227 | ```{r dissect, results="hide", echo=FALSE} 228 | data("BS.chr21") 229 | M <- getCoverage(BS.chr21, type="M") 230 | Cov <- getCoverage(BS.chr21, type="Cov") 231 | chr <- as.character(seqnames(BS.chr21)) 232 | pos <- start(BS.chr21) 233 | celltype <- pData(BS.chr21)$CellType 234 | sampleNames <- sampleNames(BS.chr21) 235 | ``` 236 | 237 | ```{r fromScratch} 238 | head(M) 239 | head(Cov) 240 | head(chr) 241 | head(pos) 242 | 243 | dim(M) 244 | dim(Cov) 245 | length(chr) 246 | length(pos) 247 | 248 | print(sampleNames) 249 | print(celltype) 250 | 251 | bs <- BSseq(chr = chr, pos = pos, 252 | M = M, Cov = Cov, 253 | sampleNames = sampleNames) 254 | show(bs) 255 | ``` 256 | ```{r cleanup, results="hide", echo=FALSE} 257 | rm(M, Cov, pos, chr, bismarkBSseq) 258 | ``` 259 | 260 | The example data contains CpGs from chromosome 21 for four samples 261 | from @Lister2009. To load this data directly (already in the `BSseq` format), 262 | simply type `data(BS.chr21)`. 263 | Two of the samples are replicates of the cell type 'imr90' 264 | and the other two are replicates of the cell type 'h1'. Now that we have the 265 | data loaded into a `BSseq` object, we can use **dmrseq** 266 | to find regions of the genome where these two cell types have significantly 267 | different methylation levels. But first, we need to add the sample metadata 268 | that indicates which samples are from which cell type (the `celltype` 269 | varialbe above). This information, which we call 'metadata', 270 | will be used by the `dmrseq` function to decide 271 | which samples to compare to one another. The next section shows how to add 272 | this information to the `BSseq` object. 273 | 274 | ## Sample metadata 275 | 276 | To add sample metadata, including the covariate of interest, you can add it 277 | to the 278 | `BSseq` object by adding columns to the `pData` slot. You must have at least 279 | one column of `pData`, which contains the covariate of interest. Additional 280 | columns are optional. 281 | 282 | ```{r meta} 283 | pData(bs)$CellType <- celltype 284 | pData(bs)$Replicate <- substr(sampleNames, 285 | nchar(sampleNames), nchar(sampleNames)) 286 | 287 | pData(bs) 288 | ``` 289 | 290 | We will then tell the `dmrseq` function which metadata variable to use 291 | for testing for methylation differences by setting the `testCovariate` 292 | parameter equal to its column name. 293 | 294 | ## Smoothing 295 | 296 | Note that unlike in **bsseq**, you do not need to carry out the smoothing step 297 | with a separate function. In addition, you should not use **bsseq**'s `BSmooth()` 298 | function to smooth the methylation levels, since **dmrseq** smooths in a very 299 | different way. Briefly, **dmrseq** smooths methylation *differences*, so it 300 | carries out the smoothing step once. This is automatically done with the main 301 | `dmrseq` function. **bsseq** on the other hand, smooths each sample 302 | independently, so smoothing needs to be carried out once per sample. 303 | 304 | ## Filtering CpGs and samples 305 | 306 | For pairwise comparisons, **dmrseq** analyzes all CpGs that have at least one 307 | read in at least one sample per group. 308 | Thus, if your dataset contains CpGs with zero reads in every sample within a 309 | group, you should filter them out prior to running `dmrseq`. Likewise, 310 | if your `bsseq` object contains extraneous samples that are part of the 311 | experiment but not the differential methylation testing of interest, these 312 | should be filtered out as well. 313 | 314 | Filtering `bsseq` objects is straightforward: 315 | 316 | * Subset rows to filter CpG loci 317 | * Subset columns to filter samples 318 | 319 | If we wish to remove all CpGs that have no coverage in at least one sample 320 | and only keep samples with a CellType of "imr90" or "h1", we would do so with: 321 | 322 | ```{r, filter} 323 | # which loci and sample indices to keep 324 | loci.idx <- which(DelayedMatrixStats::rowSums2(getCoverage(bs, type="Cov")==0) == 0) 325 | sample.idx <- which(pData(bs)$CellType %in% c("imr90", "h1")) 326 | 327 | bs.filtered <- bs[loci.idx, sample.idx] 328 | ``` 329 | ```{r, results="hide", echo=FALSE} 330 | rm(bs.filtered) 331 | ``` 332 | 333 | Note that this is a trivial example, since our toy example object `BS.chr21` 334 | already contains only loci with coverage at least one read in all samples as well 335 | as only samples from the "imr90" and "h1" conditions. 336 | 337 | Also note that instead of creating a separate object, the filtering step 338 | can be combined with the call to `dmrseq` by replacing the `bs` input with a 339 | filtered version `bs[loci.idx, sample.idx]`. 340 | 341 | ## Adjusting for covariates 342 | 343 | There are two ways to adjust for covariates in the dmrseq model. The first way 344 | is to specify the `adjustCovariate` parameter of the `dmrseq()` function as 345 | a column of the `pData()` slot that contains the covariate you 346 | would like to adjust for. This will include that covariate directly in the 347 | model. This is ideal if the adjustment covariate is continuous or has more 348 | than two groups. 349 | 350 | The second way is to specify the `matchCovariate` parameter of the `dmrseq` 351 | function as a column of the `pData()` slot that contains the covariate you 352 | would like to match on. This will restrict the permutations considered to only 353 | those where the `matchCovariate` is balanced. For example, the `matchCovariate` 354 | could represent the sex of each sample. In that case, a permutation that 355 | includes all males in one group and all females in another would not be 356 | considered (since there is a plausible biological difference that may induce 357 | the null distribution to resemble non-null). This matching adjustment is ideal 358 | for two-group comparisons. 359 | 360 | # Differentially Methylated Regions 361 | 362 | The standard differential expression analysis steps are wrapped 363 | into a single function, `dmrseq`. The estimation steps performed 364 | by this function are described briefly below, as well as in 365 | more detail in the **dmrseq** paper. Here we run the results for a subset 366 | of 20,000 CpGs in the interest of computation time. 367 | 368 | ```{r mainfunction, message=TRUE, warning=TRUE} 369 | testCovariate <- "CellType" 370 | regions <- dmrseq(bs=bs[240001:260000,], 371 | cutoff = 0.05, 372 | testCovariate=testCovariate) 373 | ``` 374 | 375 | Progress messages are printed to the console if `verbose` is TRUE. 376 | The text, `condition h1 vs imr90`, tells you that positive methylation 377 | differences mean h1 has higher methylation than imr90 (see below for 378 | more details). 379 | 380 | ## Output of dmrseq 381 | 382 | The results object is a `GRanges` object with the coordiates 383 | of each candidate region, and contains the following metadata columns (which 384 | can be extracted with the `$` operator: 385 | 386 | 1. `L` = the number of CpGs contained in the region, 387 | 2. `area` = the sum of the smoothed beta values 388 | 3. `beta` = the coefficient value for the condition difference (Note: if the 389 | test covariate is categorical with more than 2 groups, there will be 390 | more than one beta column), 391 | 4. `stat` = the test statistic for the condition difference, 392 | 5. `pval` = the permutation _p_-value for the significance of the test 393 | statistic, and 394 | 6. `qval` = the _q_-value for the test statistic (adjustment 395 | for multiple comparisons to control false discovery rate). 396 | 7. `index = an `IRanges` containing the indices of the region's 397 | first CpG to last CpG. 398 | 399 | ```{r, showresults} 400 | show(regions) 401 | ``` 402 | 403 | The above steps are carried out on a very small subset of data (20,000 CpGs). 404 | This package loads data into memory one chromosome at a 405 | time. For on human data, this means objects with a few million 406 | entries per sample (since there are roughly 28.2 million total CpGs in the human 407 | genome, and the largest chromosomes will have more than 2 million CpGs). 408 | This means that whole-genome `BSseq` objects for several samples can use up 409 | several GB of RAM. In order to improve speed, the package allows for easy 410 | parallel processing of chromosomes, but be aware that using more cores will 411 | also require the use of more RAM. 412 | 413 | To use more cores, use the `register` function of 414 | [BiocParallel](http://bioconductor.org/packages/BiocParallel). For example, 415 | the following chunk (not evaluated here), would register 4 cores, and 416 | then the functions above would 417 | split computation over these cores. 418 | 419 | ```{r parallel, eval=FALSE} 420 | library("BiocParallel") 421 | register(MulticoreParam(4)) 422 | ``` 423 | ## Steps of the dmrseq method 424 | 425 | **dmrseq** is a two-stage approach that first detects candidate regions and then 426 | explicitly evaluates statistical significance at the region level while 427 | accounting for known sources of variability. 428 | Candidate DMRs are defined by segmenting the genome into groups of CpGs 429 | that show consistent evidence of differential methylation. 430 | Because the methylation levels of neighboring CpGs are highly correlated, 431 | we first smooth the signal to combat loss of power due to low coverage as done 432 | in **bsseq**. 433 | 434 | In the second stage, we compute a statistic for each candidate 435 | DMR that takes into account variability between biological replicates 436 | and spatial correlation among neighboring loci. Significance of each 437 | region is assessed via a permutation procedure which uses a pooled null 438 | distribution that can be generated from as few as two biological replicates, 439 | and false discovery rate is controlled using the Benjamini-Hochberg 440 | procedure. 441 | 442 | For more details, refer to the **dmrseq** paper [@Korthauer183210]. 443 | 444 | ## Detecting large-scale methylation blocks 445 | 446 | The default smoothing parameters (`bpSpan`, `minInSpan`, and `maxGapSmooth`) 447 | are designed to focus on local DMRs, generally in the range of hundreds to 448 | thousands of bases. In some applications, such as cancer, it is of interest 449 | to effectively 'zoom out' in order to detect larger (lower-resolution) 450 | methylation blocks on the order of hundreds of thousands to millions of bases. 451 | To do so, you can 452 | set the `block` argument to true, which will only include candidate regions with 453 | at least `blockSize` basepairs (default = 5000). This setting will also merge 454 | candidate regions that (1) are in the same direction and (2) are less than 1kb 455 | apart with no covered CpGs separating them. The region-level model used is also 456 | slightly modified - instead of a loci-specific intercept for each CpG in the 457 | region, the intercept term is modeled as a natural spline with one interior 458 | knot per each 10kb of length (up to 10 interior knots). 459 | 460 | In addition, detecting large-scale blocks requires that 461 | the smoothing window be increased to minimize the impact of noisy local 462 | methylation measurements. To do so, the values of the 463 | smoothing parameters should be increased. For example, to use a smoothing window 464 | that captures at least 500 CpGs or 50,000 basepairs that are spaced apart by no 465 | more than 1e6 bases, use `minInSpan=500`, `bpSpan=5e4`, and `maxGapSmooth=1e6`. 466 | In addition, to avoid a block being broken up simply due to a gap with no 467 | covered CpGs, you can increase the `maxGap` parameter. 468 | 469 | ```{r blocks, message=TRUE, warning=TRUE} 470 | testCovariate <- "CellType" 471 | blocks <- dmrseq(bs=bs[120001:125000,], 472 | cutoff = 0.05, 473 | testCovariate=testCovariate, 474 | block = TRUE, 475 | minInSpan = 500, 476 | bpSpan = 5e4, 477 | maxGapSmooth = 1e6, 478 | maxGap = 5e3) 479 | head(blocks) 480 | ``` 481 | 482 | The top hit is `r signif(width(blocks)[1]/1e3, 3)` thousand basepairs wide. 483 | In general, it also might be advised to decrease the cutoff when detecting 484 | blocks, since a smaller methylation 485 | difference might be biologically significant if it is maintained 486 | over a large genomic region. Note that block-finding can be more computationally 487 | intensive since we are fitting region-level models to large numbers of CpGs at a 488 | time. In the toy example above we are only searching over 5,000 CpGs (which 489 | span 490 | `r signif((max(end(bs[120001:125000,])) - 491 | min(start(bs[120001:125000,])))/1e3,3)` 492 | thousand basepairs), so we do not find enough null 493 | candidate regions to carry out inference and obtain significance levels. 494 | 495 | # Exploring and exporting results 496 | 497 | ## Explore how many regions were significant 498 | 499 | How many regions were significant at the FDR (_q_-value) cutoff of 0.05? We 500 | can find this by counting how many values in the `qval` column of the `regions` 501 | object were less than 0.05. 502 | You can also subset the regions by an FDR cutoff. 503 | 504 | ```{r} 505 | sum(regions$qval < 0.05) 506 | 507 | # select just the regions below FDR 0.05 and place in a new data.frame 508 | sigRegions <- regions[regions$qval < 0.05,] 509 | ``` 510 | 511 | ## Hypo- or Hyper- methylation? 512 | 513 | You can determine the proportion of regions with hyper-methylation by counting 514 | how many had a positive direction of effect (positive statistic). 515 | 516 | ```{r hyper} 517 | sum(sigRegions$stat > 0) / length(sigRegions) 518 | ``` 519 | 520 | To interpret the direction of effect, note that for a two-group comparison 521 | **dmrseq** uses alphabetical order of the covariate of interest. 522 | The condition with a higher alphabetical rank will become the reference category. 523 | For example, if 524 | the two conditions are "A" and "B", the "A" group will be the reference category, 525 | so a positive direction of effect means that 526 | "B" is hyper-methylated relative to "A". Conversely, a negative direction of 527 | effect means that "B" is hypo-methylated relative to "A". 528 | 529 | ## Plot DMRs 530 | 531 | It can be useful to visualize individual DMRs, so we provide a plotting 532 | function that is based off of **bsseq**'s plotting functions. There is also 533 | functionality to add annotations using the 534 | [annotatr](http://bioconductor.org/packages/annotatr) package to 535 | see the nearby CpG categories (island, shore, shelf, open sea) and nearby 536 | coding sequences. 537 | 538 | To retrieve annotations for genomes supported by **annotatr**, use the 539 | helper function `getAnnot`, and pass this annotation object to the `plotDMRs` 540 | function as the `annoTrack` parameter. 541 | 542 | ```{r plot, out.width='\\textwidth', fig.height = 2.5, warning=FALSE} 543 | # get annotations for hg18 544 | annoTrack <- getAnnot("hg18") 545 | 546 | plotDMRs(bs, regions=regions[1,], testCovariate="CellType", 547 | annoTrack=annoTrack) 548 | ``` 549 | 550 | Here we also plot the top methylation block from the block analysis: 551 | 552 | ```{r plotblock, out.width='\\textwidth', fig.height = 2.5, warning=FALSE} 553 | plotDMRs(bs, regions=blocks[1,], testCovariate="CellType", 554 | annoTrack=annoTrack) 555 | ``` 556 | 557 | ## Plot distribution of methylation values and coverage 558 | 559 | It can also be helpful to visualize overall distributions of methylation values 560 | and / or coverage. The function `plotEmpiricalDistribution` will plot the 561 | methylation values of 562 | the covariate of interest (specified with `testCovariate`). 563 | 564 | ```{r plot2, fig.height=3} 565 | plotEmpiricalDistribution(bs, testCovariate="CellType") 566 | ``` 567 | 568 | By changing the `type` argument to `Cov`, it will also plot the distribution of 569 | coverage values. In addition, samples can be plotted separately by setting 570 | `bySample` to true. 571 | 572 | ```{r plot3, fig.height=3} 573 | plotEmpiricalDistribution(bs, testCovariate="CellType", 574 | type="Cov", bySample=TRUE) 575 | ``` 576 | 577 | ## Exporting results to CSV files 578 | 579 | A plain-text file of the results can be exported using the 580 | base R functions *write.csv* or *write.delim*. 581 | We suggest using a descriptive file name indicating the variable 582 | and levels which were tested. 583 | 584 | ```{r export, eval=FALSE} 585 | write.csv(as.data.frame(regions), 586 | file="h1_imr90_results.csv") 587 | ``` 588 | 589 | ## Extract raw mean methylation differences 590 | 591 | For a two-group comparison, it might be of interest to extract the raw mean 592 | methylation differences over the DMRs. This can be done with the helper function 593 | `meanDiff`. For example, we can extract the raw mean difference values for 594 | the regions at FDR level 0.05 (using the `sigRegions` object created 595 | in the section 596 | [Explore how many regions were significant](#explore-how-many-regions-were-significant)). 597 | 598 | ```{r, meandiff} 599 | rawDiff <- meanDiff(bs, dmrs=sigRegions, testCovariate="CellType") 600 | str(rawDiff) 601 | ``` 602 | 603 | # Simulating DMRs 604 | 605 | If you have multiple samples from the same condition (e.g. control samples), 606 | the function `simDMRS` will split these into two artificial sample groups 607 | and then add _in silico_ DMRs. This can then be used to assess sensitivity 608 | and specificity of DMR approaches, since we hope to be able to recover the 609 | DMRs that were spiked in, but not identify too many other differences (since 610 | we don't expect any biological difference between the two artificial sample 611 | groups). 612 | 613 | The use of this function is demonstrated below, although note that in this 614 | toy example, we do not have enough samples from the same biological condition to 615 | split into two groups, so instead we shuffle the cell types to create a null 616 | sample comparison. 617 | 618 | ```{r, sim} 619 | data(BS.chr21) 620 | 621 | # reorder samples to create a null comparison 622 | BS.null <- BS.chr21[1:20000,c(1,3,2,4)] 623 | 624 | # add 100 DMRs 625 | BS.chr21.sim <- simDMRs(bs=BS.null, num.dmrs=100) 626 | 627 | # bsseq object with original null + simulated DMRs 628 | show(BS.chr21.sim$bs) 629 | 630 | # coordinates of spiked-in DMRs 631 | show(BS.chr21.sim$gr.dmrs) 632 | 633 | # effect sizes 634 | head(BS.chr21.sim$delta) 635 | ``` 636 | 637 | The resulting object is a list with the following elements: 638 | 639 | * `gr.dmrs`: a `GRanges` object containing the coordinates of the true spiked 640 | in DMRs 641 | * `dmr.mncov`: a numeric vector containing the mean coverage of the 642 | simulated DMRs 643 | * `dmr.L`: a numeric vector containing the sizes (number of CpG loci) of the 644 | simulated DMRs 645 | * `bs`: the `BSSeq` object containing the original null data + simulated DMRs 646 | * `delta`: a numeric vector of effect sizes (proportion differences) of the 647 | simulated DMRs. 648 | 649 | # Session info 650 | 651 | ```{r sessionInfo} 652 | sessionInfo() 653 | ``` 654 | 655 | # References 656 | -------------------------------------------------------------------------------- /R/dmrseq.R: -------------------------------------------------------------------------------- 1 | #' Main function for detecting and evaluating significance of DMRs. 2 | #' 3 | #' Performs a two-step approach that (1) detects candidate regions, and 4 | #' (2) scores candidate regions with an exchangeable (across the genome) 5 | #' statistic and evaluates statistical significance using a 6 | #' permuation test on the pooled null distribution of scores. 7 | #' 8 | #' @param bs bsseq object containing the methylation values as well as the 9 | #' phenotype matrix that contains sample level covariates 10 | #' @param testCovariate Character value indicating which variable 11 | #' (column name) in \code{pData(bs)} to test 12 | #' for association of methylation levels. 13 | #' Can alternatively specify an integer value indicating 14 | #' which of column of 15 | #' \code{pData(bs)} to use. This is used to construct the 16 | #' design matrix for the test statistic calculation. To run using a 17 | #' continuous or categorial covariate with more than two groups, simply pass in 18 | #' the name of a column in `pData` that contains this covariate. A continuous 19 | #' covariate is assumued if the data type in the `testCovariate` slot is 20 | #' continuous, with the exception of if there are only two unique values 21 | #' (then a two group comparison is carried out). 22 | #' @param adjustCovariate an (optional) character value or vector 23 | #' indicating which variables (column names) in \code{pData(bs)} 24 | #' will be adjusted for when 25 | #' testing for the association of methylation value with the 26 | #' \code{testCovariate}. 27 | #' Can alternatively specify an 28 | #' integer value or vector indicating 29 | #' which of the columns of \code{pData(bs)} to adjust for. 30 | #' If not NULL (default), then this is also used to 31 | #' construct the design matrix for the test statistic calculation. 32 | #' @param matchCovariate An (optional) character value 33 | #' indicating which variable (column name) of \code{pData(bs)} 34 | #' will be blocked for when 35 | #' constructing the permutations in order to 36 | #' test for the association of methylation value with the 37 | #' \code{testCovariate}, only to be used when \code{testCovariate} 38 | #' is a two-group factor and the number of permutations possible is less 39 | #' than 500000. 40 | #' Alternatively, you can specify an integer value indicating 41 | #' which column of \code{pData(bs)} to block for. 42 | #' Blocking means that only permutations with balanced 43 | #' composition of \code{testCovariate} values will be used (for example if 44 | #' you have samples from different gender and this is not your covariate of 45 | #' interest, 46 | #' it is recommended to use gender as a matching covariate to avoid one 47 | #' of the permutations testing entirely males versus females; this violates 48 | #' the null hypothesis and will decrease power). 49 | #' If not NULL (default), then no blocking is performed. 50 | #' @param minInSpan positive integer that represents the minimum number of 51 | #' CpGs in a smoothing span window if \code{smooth} is TRUE. 52 | #' Default value is 30. 53 | #' @param minNumRegion positive integer that represents the minimum number of 54 | #' CpGs to consider for a candidate region. Default value is 5. 55 | #' Minimum value is 3. 56 | #' @param cutoff scalar value that represents the absolute value (or a vector 57 | #' of two numbers representing a lower and upper bound) for the cutoff of 58 | #' the single CpG coefficient that is used to discover 59 | #' candidate regions. Default value is 0.10. 60 | #' @param smooth logical value that indicates whether or not to smooth the 61 | #' CpG level signal when discovering candidate regions. 62 | #' Defaults to TRUE. 63 | #' @param bpSpan a positive integer that represents the length in basepairs 64 | #' of the smoothing span window if \code{smooth} is TRUE. Default value is 65 | #' 1000. 66 | #' @param verbose logical value that indicates whether progress messages 67 | #' should be printed to stdout. Defaults value is TRUE. 68 | #' @param BPPARAM a \code{BiocParallelParam} object to specify the parallel 69 | #' backend. The default 70 | #' option is \code{BiocParallel::bpparam()} which will automatically creates 71 | #' a cluster appropriate for the operating system. 72 | #' @param maxPerms a positive integer that represents the maximum number 73 | #' of permutations that will be used to generate the global null 74 | #' distribution of test statistics. Default value is 10. 75 | #' @param maxGap integer value representing maximum number of basepairs in 76 | #' between neighboring CpGs to be included in the same DMR. 77 | #' @param maxGapSmooth integer value representing maximum number of basepairs 78 | #' in between neighboring CpGs to be included in the same 79 | #' cluster when performing smoothing (should generally be larger than 80 | #' \code{maxGap}) 81 | #' @param stat a character vector indicating the name of the column of the 82 | #' output to use as the region-level test statistic. Default value is 'stat' 83 | #' which is the region level-statistic designed to be comparable across the 84 | #' genome. 85 | #' It is not recommended to change this argument, but it can be done for 86 | #' experimental purposes. Possible values are: 'L' - the number of loci 87 | #' in the region, 'area' - the sum of the smoothed loci statistics, 88 | #' 'beta' - the effect size of the region, 'stat' - the test statistic for 89 | #' the region, or 'avg' - the average smoothed loci statistic. 90 | #' @param block logical indicating whether to search for large-scale (low 91 | #' resolution) blocks of differential methylation (default is FALSE, which 92 | #' means that local DMRs are desired). If TRUE, the parameters for 93 | #' \code{bpSpan}, \code{minInSpan}, and \code{maxGapSmooth} should be adjusted 94 | #' (increased) accordingly. This setting will also merge 95 | #' candidate regions that (1) are in the same direction and (2) are less than 96 | #' 1kb apart with no covered CpGs separating them. The region-level model used 97 | #' is also slightly modified - instead of a loci-specific intercept for each 98 | #' CpG in theregion, the intercept term is modeled as a natural spline with 99 | #' one interior knot per each 10kb of length (up to 10 interior knots). 100 | #' @param blockSize numeric value indicating the minimum number of basepairs 101 | #' to be considered a block (only used if \code{block}=TRUE). Default is 102 | #' 5000 basepairs. 103 | #' @param chrsPerChunk a positive integer value indicating the number of 104 | #' chromosomes per chunk. The default is 1, meaning that the data will be 105 | #' looped through one chromosome at a time. When pairing up multiple 106 | #' chromosomes per chunk, sizes (in terms of numbers of CpGs) will be taken 107 | #' into consideration to balance the sizes of each chunk. 108 | #' @return a \code{GRanges} object that contains the results of the inference. 109 | #' The object contains one row for each candidate region, sorted by q-value 110 | #' and then chromosome. The standard 111 | #' \code{GRanges} chr, start, and end are included, along with at least 112 | #' 7 metadata 113 | #' columns, in the following order: 114 | #' 1. L = the number of CpGs contained in the region, 115 | #' 2. area = the sum of the smoothed beta values 116 | #' 3. beta = the coefficient value for the condition difference (there 117 | #' will be more than one column here if a multi-group comparison 118 | #' was performed), 119 | #' 4. stat = the test statistic for the condition difference, 120 | #' 5. pval = the permutation p-value for the significance of the test 121 | #' statistic, and 122 | #' 6. qval = the q-value for the test statistic (adjustment 123 | #' for multiple comparisons to control false discovery rate). 124 | #' 7. index = an \code{IRanges} containing the indices of the region's 125 | #' first CpG to last CpG. 126 | #' 127 | #' @keywords inference 128 | #' @importFrom outliers grubbs.test 129 | #' @importFrom bumphunter clusterMaker getSegments 130 | #' @importFrom DelayedMatrixStats colMedians rowMads rowSums2 rowMeans2 rowDiffs 131 | #' @importFrom matrixStats rowRanges 132 | #' @importFrom stats formula anova as.formula 133 | #' 134 | #' @importClassesFrom bsseq BSseq 135 | #' @importMethodsFrom bsseq pData seqnames sampleNames start width 136 | #' 137 | #' @importFrom grDevices col2rgb colorRampPalette dev.off pdf rgb 138 | #' @importFrom graphics axis layout legend lines mtext par 139 | #' plot points rect rug text 140 | #' @importFrom methods is 141 | #' @importFrom stats approxfun lm loess median model.matrix p.adjust 142 | #' predict preplot qt quantile rbeta rbinom runif 143 | #' @importFrom utils combn 144 | #' @importFrom BiocParallel bplapply register MulticoreParam bpparam 145 | #' @importFrom splines ns 146 | #' 147 | #' @import bsseq 148 | #' @import GenomicRanges 149 | #' @import nlme 150 | #' @import annotatr 151 | #' @import ggplot2 152 | #' @import S4Vectors 153 | #' 154 | #' @export 155 | #' 156 | #' @examples 157 | #' 158 | #' # load example data 159 | #' data(BS.chr21) 160 | #' 161 | #' # the covariate of interest is the 'CellType' column of pData(BS.chr21) 162 | #' testCovariate <- 'CellType' 163 | #' 164 | #' # run dmrseq on a subset of the chromosome (10K CpGs) 165 | #' regions <- dmrseq(bs=BS.chr21[240001:250000,], 166 | #' cutoff = 0.05, 167 | #' testCovariate=testCovariate) 168 | #' 169 | dmrseq <- function(bs, testCovariate, adjustCovariate = NULL, cutoff = 0.1, 170 | minNumRegion = 5, smooth = TRUE, bpSpan = 1000, 171 | minInSpan = 30, maxGapSmooth = 2500, maxGap = 1000, 172 | verbose = TRUE, 173 | maxPerms = 10, matchCovariate = NULL, 174 | BPPARAM = bpparam(), stat = "stat", 175 | block = FALSE, blockSize = 5000, 176 | chrsPerChunk = 1) { 177 | 178 | stopifnot(is(bs, "BSseq")) 179 | 180 | if (!(is.null(cutoff) || length(cutoff) %in% seq_len(2))) 181 | stop("'cutoff' has to be either NULL or a vector of length 1 or 2") 182 | if (length(cutoff) == 2) 183 | cutoff <- sort(cutoff) 184 | if (is.null(cutoff) | abs(cutoff) > 1 | abs(cutoff) == 0) 185 | stop("Must specify a value for cutoff between 0 and 1") 186 | subverbose <- max(as.integer(verbose) - 1L, 0) 187 | 188 | if(minNumRegion < 3){ 189 | stop("minNumRegion must be at least 3") 190 | } 191 | 192 | # check statistic name 193 | if (!(stat %in% c("L", "area", "beta", "stat", "avg"))) { 194 | stop("Specified '", stat, 195 | "' as the test statistic which is not ", 196 | "in the results. Please specify a valid name from one of ", 197 | "L, area, beta, stat, or avg") 198 | } 199 | 200 | # informative message about blocks if block=TRUE; check for increased 201 | # smoothing window 202 | if (block){ 203 | message("Searching for large scale blocks with at least ", 204 | blockSize, " basepairs.") 205 | 206 | if(minInSpan < 100 && bpSpan < 2000 && maxGapSmooth < 1e5){ 207 | warning("When block=TRUE, it is recommended to increase the values ", 208 | "of minInSpan, bpSpan, and maxGapSmooth in order to widen ", 209 | "the smoothing window") 210 | } 211 | } 212 | 213 | # convert covariates to column numbers if characters 214 | if (is.character(testCovariate)) { 215 | if(length(testCovariate) > 1) 216 | stop("Only one testCovariate can be specified") 217 | if(is.character(adjustCovariate)){ 218 | if(sum(testCovariate %in% adjustCovariate) > 0) 219 | stop("adjustCovariate can't contain testCovariate") 220 | } 221 | if(is.character(matchCovariate)){ 222 | if(sum(testCovariate %in% matchCovariate)) 223 | stop("matchCovariate can't contain testCovariate") 224 | } 225 | testCovariate <- which(colnames(pData(bs)) == testCovariate) 226 | if (length(testCovariate) == 0) { 227 | stop("testCovariate not found in pData(). ", 228 | "Please specify a valid testCovariate") 229 | } 230 | } 231 | 232 | if (is.character(adjustCovariate)) { 233 | if(is.character(matchCovariate)){ 234 | if(matchCovariate == adjustCovariate) 235 | stop("matchCovariate can't be identical to adjustCovariate") 236 | } 237 | adjustCovariate <- which(colnames(pData(bs)) %in% adjustCovariate) 238 | if (length(adjustCovariate) == 0) { 239 | stop("adjustCovariate not found in pData(). ", 240 | "Please specify a valid adjustCovariate") 241 | } 242 | } 243 | 244 | # check that chrsPerChunk value makes sense 245 | if (chrsPerChunk != 1){ 246 | if (chrsPerChunk%%1 != 0){ 247 | stop("chrsPerChunk must be an integer") 248 | }else if(chrsPerChunk < 1){ 249 | stop("chrsPerChunk must be strictly positive") 250 | }else if(chrsPerChunk > length(unique(seqnames(bs)))){ 251 | stop("chrsPerChunk can't be larger than the total", 252 | " number of chromosomes") 253 | } 254 | } 255 | 256 | # check that bs object is sorted since `bsseq::BSseq()` no longer 257 | # automatically sorts to ensure loci from same chr are indexed consecutively 258 | if (is.unsorted(bs)) { 259 | stop("'bs' must be sorted. Use 'sort(bs)'.") 260 | } 261 | 262 | # construct the design matrix using the pData of bs 263 | if (ncol(pData(bs)) < max(testCovariate, adjustCovariate)) { 264 | stop("Error: pData(bs) has too few columns. "," 265 | Please specify valid ", 266 | "covariates to use in the analysis") 267 | } 268 | 269 | coeff <- seq(2,(2 + length(testCovariate) - 1)) 270 | testCov <- pData(bs)[, testCovariate] 271 | if (is.factor(testCov)) # drop unused levels of test 272 | testCov <- droplevels(testCov) 273 | 274 | # check for missing values in testCov 275 | if (any(is.na(testCov))) { 276 | stop("Missing values found in the testCovariate ", 277 | "Please remove these samples from the analysis.") 278 | } 279 | 280 | 281 | fact <- TRUE 282 | sampleSize <- table(testCov)[names(table(testCov)) %in% pData(bs)[,testCovariate]] 283 | if (length(unique(testCov)) == 1) { 284 | message("Warning: only one unique value of the specified ", 285 | "covariate of interest. Assuming null comparison and ", 286 | "splitting sample group into two equal groups") 287 | testCov <- rep(1, length(testCov)) 288 | testCov[seq_len(round(length(testCov)/2))] <- 0 289 | }else if (length(unique(testCov)) > 2 && !is.numeric(testCov)) { 290 | message("Performing a global test of H0: no difference among ", 291 | length(unique(testCov)), " groups (assuming the test ", 292 | "covariate ", colnames(pData(bs))[testCovariate], 293 | " is a factor).") 294 | coeff <- seq(coeff, coeff + length(unique(testCov)) - 2) 295 | }else if (length(unique(testCov)) > 2 && is.numeric(testCov)) { 296 | message("Assuming the test ", 297 | "covariate ", colnames(pData(bs))[testCovariate], 298 | " is continuous.") 299 | fact <- FALSE 300 | }else{ 301 | message("Assuming the test ", 302 | "covariate ", colnames(pData(bs))[testCovariate], 303 | " is a factor.") 304 | if(min(sampleSize) < 2) 305 | stop("At least one group has only one sample! ", 306 | "Replicates are required to run dmrseq.") 307 | testCov <- as.factor(testCov) 308 | } 309 | 310 | if (!is.null(adjustCovariate)) { 311 | mmdat <- data.frame(testCov = testCov) 312 | adjustCov <- pData(bs)[, adjustCovariate, drop = FALSE] 313 | 314 | # check for missing values in adjustCovariate 315 | if (any(is.na(adjustCov))) { 316 | stop("Missing values found in the adjustCovariate. ", 317 | "Please remove these samples from the analysis.") 318 | } 319 | 320 | # check for number of unique values per adjust cov 321 | nunq <- apply(adjustCov, 2, function(x) length(unique(x))) 322 | if (any(nunq < 2)) 323 | stop("At least one adjust covariate is constant across samples.", 324 | " Please remove this covariate from the model and try again.") 325 | 326 | # remove any empty factor levels 327 | for (f in 1:ncol(adjustCov)){ 328 | if (is.factor(adjustCov[,f])) 329 | adjustCov[,f] <- droplevels(adjustCov[,f]) 330 | } 331 | 332 | mmdat <- cbind(mmdat, adjustCov) 333 | frm <- paste0("~", paste0(colnames(mmdat), collapse = " + ")) 334 | design <- model.matrix(as.formula(frm), data=mmdat) 335 | colnames(design)[coeff] <- colnames(pData(bs))[testCovariate] 336 | coeff.adj <- (max(coeff) + 1):(ncol(design)) 337 | } else { 338 | design <- model.matrix(~testCov) 339 | colnames(design)[coeff] <- colnames(pData(bs))[testCovariate] 340 | coeff.adj <- NULL 341 | } 342 | 343 | # check model matrix is full rank 344 | e <- eigen(crossprod(as.matrix(design)), symmetric = TRUE, only.values = TRUE)$values 345 | if (! (e[1] > 0 && abs(e[length(e)]/e[1]) > 1e-13)){ 346 | stop("Design matrix is not full rank") 347 | } 348 | 349 | # check for empty factor levels in design matrix 350 | if (sum(colSums(design) == 0) > 0){ 351 | which.empty <- which(colSums(design) == 0) 352 | design <- design[,-which.empty] 353 | } 354 | 355 | # check that p <= n 356 | if (ncol(bs) < ncol(design) + 1) 357 | stop("Not enough degrees of freedom to estimate ", ncol(design)-1, 358 | " covariates using ", ncol(bs), " samples. Please use a larger ", 359 | "number of samples, or specify fewer adjust covariates.") 360 | 361 | # check for incompatible args 362 | if (fact && !is.null(matchCovariate) && length(unique(testCov)) > 2) 363 | stop("matchCovariate can't be used when testCovariate is not a 2-group ", 364 | "factor. Perhaps you'd like to add an adjustCovariate instead?") 365 | 366 | if(!is.null(matchCovariate) && choose(nrow(design), min(sampleSize)) >= 5e5) 367 | stop("matchCovariate can't be used when the sample size is large enough ", 368 | "to yield more than 500000 possible permutations. ", 369 | "Perhaps you'd like to add an adjustCovariate instead?") 370 | 371 | # check for interaction terms (not yet supported) 372 | if (length(coeff) > 1 && any(rowSums(design[,coeff]) > 1)) 373 | stop("Interaction terms in testCovariate are not yet supported.") 374 | 375 | if (length(unique(testCov)) == 2) { 376 | message("Condition: ", 377 | unique(pData(bs)[, testCovariate][which(design[, coeff] == 1)]), 378 | " vs ", 379 | unique(pData(bs)[, testCovariate][which(design[, coeff] == 0)])) 380 | } 381 | if (!is.null(adjustCovariate)) { 382 | message("Adjusting for covariate (s): ", 383 | paste(colnames(pData(bs))[adjustCovariate], collapse = ", ")) 384 | } 385 | if (!is.null(matchCovariate)) { 386 | if (length(matchCovariate) > 1) 387 | stop("Covariate matching can only be carried out for one", 388 | " covariate") 389 | if (length(unique(testCov)) > 2) 390 | stop("Covariate matching can only be carried out for 2-group", 391 | " comparisons") 392 | if (is.character(matchCovariate)) { 393 | if (sum(grepl(matchCovariate, colnames(pData(bs)))) == 0) { 394 | stop("Error: no column in pData() found that matches ", 395 | "the matchCovariate") 396 | } else if (length(grep(matchCovariate, colnames(pData(bs)))) > 1) { 397 | stop("Error: matchCovariate matches more than one ", 398 | "column in pData()") 399 | } 400 | mC <- grep(matchCovariate, colnames(pData(bs))) 401 | } else { 402 | stopifnot(matchCovariate <= ncol(pData(bs))) 403 | } 404 | message("Matching permutations on covariate: ", 405 | colnames(pData(bs))[mC]) 406 | } 407 | 408 | # check for loci with missing data 409 | if (fact){ 410 | lev <- unique(pData(bs)[[testCovariate]]) 411 | filter <- NULL 412 | for (l in seq_along(lev)){ 413 | filter <- rbind(filter, 414 | 1*(DelayedMatrixStats::rowSums2(getCoverage(bs)[,pData(bs)[[testCovariate]] == 415 | lev[l], drop = FALSE]) == 0)) 416 | } 417 | filter <- which( apply(filter, 2, max) > 0 ) 418 | 419 | if (length(filter) > 0) { 420 | stop(length(filter), " loci have zero coverage in all samples ", 421 | "of at least one condition. Please remove these loci ", 422 | "before running dmrseq") 423 | } 424 | 425 | }else{ 426 | filter <- DelayedMatrixStats::rowSums2(getCoverage(bs)==0) >= ncol(bs) - 1 427 | if(sum(filter) > 0) 428 | stop(sum(filter), " loci have zero coverage in at least ", 429 | ncol(bs) - 1, " samples. Please remove these loci ", 430 | "before running dmrseq") 431 | } 432 | 433 | # register the parallel backend 434 | BiocParallel::register(BPPARAM) 435 | backend <- paste0("BiocParallel:", class(bpparam())[1]) 436 | 437 | if (bpparam()$workers == 1) { 438 | if (verbose) { 439 | mes <- "Using a single core (backend: %s)." 440 | message(sprintf(mes, backend)) 441 | } 442 | parallel <- FALSE 443 | } else { 444 | if (verbose) { 445 | mes <- paste0("Parallelizing using %s workers/cores ", 446 | "(backend: %s).") 447 | message(sprintf(mes, bpparam()$workers, backend)) 448 | } 449 | parallel <- TRUE 450 | } 451 | message("Computing on ", chrsPerChunk, 452 | " chromosome(s) at a time.\n") 453 | 454 | message("Detecting candidate regions with coefficient larger than ", 455 | unique(abs(cutoff)), 456 | " in magnitude.") 457 | OBS <- bumphunt(bs=bs, design = design, 458 | coeff = coeff, coeff.adj = coeff.adj, minInSpan = minInSpan, 459 | minNumRegion = minNumRegion, cutoff = cutoff, 460 | maxGap = maxGap, maxGapSmooth = maxGapSmooth, 461 | smooth = smooth, bpSpan = bpSpan, verbose = verbose, 462 | parallel = parallel, block = block, blockSize = blockSize, 463 | chrsPerChunk = chrsPerChunk, fact = fact, 464 | adjustCovariate = adjustCovariate) 465 | 466 | # check that at least one candidate region was found; if there were none 467 | # there is no need to go on to compute permutation tests... 468 | 469 | if (length(OBS) > 0) { 470 | message("* ", nrow(OBS), " candidates detected") 471 | FLIP <- NULL 472 | # configure the permutation matrix for two group comparisons 473 | if (length(unique(design[, coeff[1]])) == 2 && 474 | length(coeff) == 1 && 475 | choose(nrow(design), min(sampleSize)) < 5e5 ) { 476 | if (verbose) { 477 | message("Performing balanced permutations of ", 478 | "condition across samples ", 479 | "to generate a null distribution of region test statistics") 480 | } 481 | perms <- combn(seq(1, nrow(design)), min(sampleSize)) 482 | 483 | # Remove redundant permutations (if balanced) 484 | if (length(unique(table(design[,coeff]))) == 1){ 485 | perms <- perms[, seq_len(ncol(perms)/2)] 486 | } 487 | 488 | # restrict to unique permutations that don't include any 489 | # groups consisting of all identical conditions 490 | rmv <- NULL 491 | for (p in seq_len(ncol(perms))){ 492 | if (length(unique(design[perms[,p],coeff])) == 1){ 493 | rmv <- c(rmv, p) 494 | } 495 | } 496 | if (length(rmv) > 0 ) 497 | perms <- perms[,-rmv] 498 | 499 | # subsample permutations based on similarity to original partition 500 | # gives preference to those with the least similarity 501 | if (maxPerms < ncol(perms)) { 502 | similarity <- apply(perms, 2, function(x) { 503 | max(table(design[x,coeff])) 504 | }) 505 | perms.all <- perms 506 | perms <- NULL 507 | levs <- sort(unique(similarity)) 508 | l <- 1 509 | num <- 0 510 | while(!(num == maxPerms) && l <= length(levs)) { 511 | keep <- sample(which(similarity == levs[l]), 512 | min(maxPerms-num, sum(similarity == levs[l])) ) 513 | perms <- cbind(perms, perms.all[,keep]) 514 | l <- l + 1 515 | num <- ncol(perms) 516 | } 517 | } 518 | } else { 519 | # Next consider a multilevel, or continuous covariate where the 520 | # covariate will be permuted in an unrestricted manner 521 | if (verbose) { 522 | message("Performing unrestricted permutation of", 523 | " covariate of interest across samples ", 524 | "to generate a null distribution of region test statistics") 525 | } 526 | perms <- as.matrix(seq_len(nrow(design))) 527 | 528 | for (p in seq_len(maxPerms)) { 529 | tries <- 0 530 | candidate <- sample(seq_len(nrow(design)), nrow(design)) 531 | # check that the permutation is not a duplicate, and not 532 | # equal to the original 533 | while ((sum(apply(perms, 2, function(x) 534 | all.equal(x, candidate)) == TRUE) > 0 || 535 | sum(apply(perms, 2, function(x) 536 | all.equal(x, rev(candidate))) == TRUE) > 0) && 537 | tries <= 20) { 538 | candidate <- sample(seq(seq_len(nrow(design))), nrow(design)) 539 | tries <- tries + 1 540 | } 541 | # save the permutation to the permutation matrix 542 | if (tries <= 20){ 543 | perms <- cbind(perms, candidate) 544 | } 545 | } 546 | perms <- perms[,-1] # remove original 547 | } 548 | 549 | pData.orig <- pData(bs) 550 | levs <- unique(pData.orig[[testCovariate]]) 551 | # Now rerun on permuted designs and concatenate results 552 | for (j in seq_len(ncol(perms))) { 553 | if (verbose) { 554 | message("\nBeginning permutation ", j) 555 | } 556 | reorder <- perms[, j] 557 | designr <- design 558 | 559 | if (length(unique(design[, coeff[1]])) == 2 && 560 | length(coeff) == 1 && 561 | !nrow(perms) == nrow(designr)) { 562 | designr[, coeff] <- 0 563 | designr[reorder, coeff] <- 1 564 | pData(bs)[[testCovariate]] <- levs[1] 565 | pData(bs)[[testCovariate]][reorder] <- levs[2] 566 | 567 | if (!all(sort(pData.orig[[testCovariate]]) == 568 | sort(pData(bs)[[testCovariate]]))){ 569 | designr[, coeff] <- 1 570 | designr[reorder, coeff] <- 0 571 | pData(bs)[[testCovariate]] <- levs[2] 572 | pData(bs)[[testCovariate]][reorder] <- levs[1] 573 | } 574 | 575 | xr <- NULL 576 | for (rd in seq_len(nrow(pData.orig))) { 577 | match <- which(pData.orig[[testCovariate]] %in% 578 | pData(bs)[rd,][[testCovariate]]) 579 | taken <- which(match %in% xr) 580 | if (length(taken) > 0) 581 | match <- match[-taken] 582 | if (length(match) > 0) 583 | xr <- c(xr, match[1]) 584 | } 585 | if(length(coeff.adj) > 0){ 586 | pData(bs)[,adjustCovariate] <- 587 | pData.orig[xr,adjustCovariate] 588 | } 589 | } else { 590 | designr[, coeff] <- designr[reorder, coeff] 591 | pData(bs) <- pData.orig[reorder, , drop = FALSE] 592 | } 593 | 594 | # if matchCovariate is not null, restrict permutations such that 595 | # null comparisons are balanced for the values of 596 | # pData$matchCovariate this avoids comparison of, 597 | # say two different individuals in the null, that the comparison of 598 | # interest is tissue type. Not matching would mean the null is 599 | # really not null 600 | if (!is.null(matchCovariate)) { 601 | permLabel <- paste0(paste0(pData(bs)[designr[, coeff[1]] == 1, 602 | mC], collapse = "_"), 603 | "vs", paste0(pData(bs)[(1 - designr[, coeff[1]]) == 1, 604 | mC], collapse = "_")) 605 | 606 | c1 <- unlist(strsplit(permLabel, "vs"))[1] 607 | c2 <- unlist(strsplit(permLabel, "vs"))[2] 608 | 609 | c1 <- unlist(strsplit(c1, "_")) 610 | c2 <- unlist(strsplit(c2, "_")) 611 | 612 | keepPerm <- 1 * (sum(c1 %in% c2) > 0 && 613 | sum(c2 %in% c1) > 0) 614 | 615 | if (keepPerm == 0) { 616 | if (verbose) { 617 | message(paste0("Skipping permutation ", 618 | gsub("vs", " vs ", permLabel))) 619 | } 620 | next 621 | } 622 | } else { 623 | permLabel <- j 624 | } 625 | 626 | res.flip.p <- bumphunt(bs=bs, design = designr, 627 | coeff = coeff, 628 | coeff.adj = coeff.adj, 629 | minInSpan = minInSpan, 630 | minNumRegion = minNumRegion, cutoff = cutoff, 631 | maxGap = maxGap, maxGapSmooth = maxGapSmooth, 632 | smooth = smooth, bpSpan = bpSpan, 633 | verbose = verbose, parallel = parallel, 634 | block = block, blockSize = blockSize, 635 | chrsPerChunk = chrsPerChunk, fact = fact, 636 | adjustCovariate = adjustCovariate) 637 | 638 | if (verbose) { 639 | message("* ", j, " out of ", ncol(perms), 640 | " permutations completed (", 641 | nrow(res.flip.p), " null candidates)") 642 | } 643 | 644 | if (!is.null(res.flip.p)) { 645 | res.flip.p$permNum <- permLabel 646 | FLIP <- rbind(FLIP, res.flip.p) 647 | } 648 | } 649 | 650 | # restore original pData 651 | pData(bs) <- pData.orig 652 | 653 | # if no candidates were found in permutation 654 | # provide informative error message 655 | if (is.null(FLIP)){ 656 | warning("No candidate regions found in permutation, so inference ", 657 | "can't be carried out. ", 658 | "Try decreasing the cutoff, or running on a larger ", 659 | "dataset if you are currently using a subset.") 660 | OBS$pval <- NA 661 | OBS$qval <- NA 662 | }else if (nrow(FLIP) < 0.05*nrow(OBS)){ 663 | message("Note: Very few null candidate regions were found.", 664 | "For more accurate and sensitive inference, ", 665 | "try decreasing the cutoff, or running on a larger ", 666 | "dataset if you are currently using a subset.") 667 | } 668 | 669 | if (!is.null(FLIP)){ 670 | # if there are more than 1 million candidate null regions, 671 | # take a random sample 672 | # of 1 million of them 673 | if (nrow(FLIP) > 1e+06) { 674 | rs <- sample(seq_len(nrow(FLIP)), 1e+06, replace = FALSE) 675 | FLIP <- FLIP[rs, ] 676 | } 677 | 678 | # which column of results to use as test statistic ? 679 | # check statistic name 680 | if (!(stat %in% c(colnames(OBS), "avg"))) { 681 | stop("Specified '", stat, 682 | "' as the test statistic which is not ", 683 | "in the results. Please specify a valid name from one of ", 684 | "L, area, beta, or stat") 685 | } else if (stat == "avg") { 686 | OBS$avg <- OBS$area/OBS$L 687 | FLIP$avg <- FLIP$area/FLIP$L 688 | } 689 | 690 | whichStatO <- which(colnames(OBS) == stat) 691 | whichStatF <- which(colnames(FLIP) == stat) 692 | 693 | # Faster way to compute the p-values that doesn't use multiple cores 694 | # Step 1: sort the permuted statistics vector 695 | perm.ordered <- c(sort(abs(FLIP[, whichStatF]), 696 | method = "quick"), Inf) 697 | 698 | # Step 2: find the first instance in the sorted vector where the 699 | # permuted value is greater than the observed and use this to 700 | # determine the number of permuted values that are greater than or 701 | # equal to theobserved 702 | pval <- rep(NA, nrow(OBS)) 703 | pval[!is.na(OBS[, whichStatO])] <- (1 + 704 | vapply(abs(OBS[!is.na(OBS[, whichStatO]), whichStatO]), 705 | function(x) length(perm.ordered) - min(which(x <= perm.ordered)), 706 | numeric(1))) / (1 + sum(!is.na(FLIP[, whichStatF]))) 707 | 708 | # missing test statistics cause Inf for the p-value calculation 709 | # instead propagate the missing values 710 | pval[abs(pval) == Inf] <- NA 711 | 712 | pval <- data.frame(x = pval, y = p.adjust(pval, method = "BH")) 713 | 714 | OBS$pval <- pval$x 715 | OBS$qval <- pval$y 716 | } 717 | 718 | # convert output into GRanges, with indexStart/indexEnd as IRanges 719 | indexIR <- IRanges(OBS$indexStart, OBS$indexEnd) 720 | OBS.gr <- makeGRangesFromDataFrame(OBS[,-c(4:5)], 721 | keep.extra.columns = TRUE) 722 | OBS.gr$index <- indexIR 723 | names(OBS.gr) <- NULL 724 | 725 | # sort on pval overall (currently sorted within chromsome) 726 | OBS.gr <- OBS.gr[order(OBS.gr$pval, -abs(OBS.gr$stat)),] 727 | 728 | return(OBS.gr) 729 | } else { 730 | message("No candidate regions pass the cutoff of ", unique(abs(cutoff))) 731 | return(NULL) 732 | } 733 | } 734 | --------------------------------------------------------------------------------